From cf00a2f4423ccbfb0897eb0a86af4a69a7f0ae77 Mon Sep 17 00:00:00 2001
From: Vladimir Paramuzov
Date: Fri, 30 Oct 2020 15:19:02 +0300
Subject: [PATCH] [IE CLDNN] Added int8 output support into bfyx_to_fsv16 fp
 kernel (#2906)

---
 .../convolution_kernel_bfyx_to_b_fs_yx_fsv16.cpp   |  5 ++-
 .../cl_kernels/convolution_gpu_bfyx_to_bfyx_f16.cl | 45 ++++++++++------------
 .../clDNN/tests/test_cases/fusings_gpu_test.cpp    | 26 +++++++++++++
 3 files changed, 51 insertions(+), 25 deletions(-)

diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_to_b_fs_yx_fsv16.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_to_b_fs_yx_fsv16.cpp
index a553b67..fe397f4 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_to_b_fs_yx_fsv16.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_to_b_fs_yx_fsv16.cpp
@@ -50,6 +50,8 @@ ParamsKey ConvolutionKernel_bfyx_to_bfyx_f16::GetSupportedKey() const {
     k.EnableInputDataType(Datatype::F32);
     k.EnableOutputDataType(Datatype::F16);
     k.EnableOutputDataType(Datatype::F32);
+    k.EnableOutputDataType(Datatype::UINT8);
+    k.EnableOutputDataType(Datatype::INT8);
     k.EnableInputWeightsType(WeightsType::F16);
     k.EnableInputWeightsType(WeightsType::F32);
     k.EnableInputLayout(DataLayout::bfyx);
@@ -67,6 +69,7 @@ ParamsKey ConvolutionKernel_bfyx_to_bfyx_f16::GetSupportedKey() const {
     k.EnableBatching();
     k.EnableSubGroup();
     k.EnableSubGroupShort();
+    k.EnableDifferentTypes();
     return k;
 }
 
@@ -132,7 +135,7 @@ JitConstants ConvolutionKernel_bfyx_to_bfyx_f16::GetJitConstants(const convoluti
     auto blockWidth = dispatchData.cldnnStyle.blockWidth;
 
     if (!params.fused_ops.empty()) {
-        auto input_dt = GetUnitType(params);
+        auto input_dt = GetActivationType(params);
         FusedOpsConfiguration conf_vec = { "_VEC",
                                            {"b", "(f_block*16)", "y", "x"},
                                            "dst",
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_to_bfyx_f16.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_to_bfyx_f16.cl
index 400aa49..5206853 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_to_bfyx_f16.cl
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_to_bfyx_f16.cl
@@ -1,4 +1,4 @@
-// Copyright (c) 2016-2019 Intel Corporation
+// Copyright (c) 2016-2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -13,13 +13,11 @@
 // limitations under the License.
 
#include "include/include_all.cl" -#include "include/unit_type.cl" #include "include/mmad.cl" #define FEATURE_SLICE_SIZE 16 -// OUTPUT_X_BLOCK_SIZE is one of 2, 4, 8 -#define UNIT_BLOCK_WRITEN(ptr, offset, val) CAT(UNIT_BLOCK_WRITE, OUTPUT_X_BLOCK_SIZE)(ptr, offset, val) +#define DT_OUTPUT_BLOCK_WRITEN(ptr, offset, val) BLOCK_WRITEN(OUTPUT_TYPE, OUTPUT_X_BLOCK_SIZE, ptr, offset, val) __attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE))) __attribute__((reqd_work_group_size(1, SUB_GROUP_SIZE, 1))) @@ -43,9 +41,6 @@ KERNEL(convolution_bfyx_to_bfyx_f16)( const int x = (xy % X_BLOCKS) * OUTPUT_X_BLOCK_SIZE; const int y = (xy / X_BLOCKS); - typedef MAKE_VECTOR_TYPE(UNIT_TYPE, OUTPUT_X_BLOCK_SIZE) vec_t; - typedef MAKE_VECTOR_TYPE(UNIT_TYPE, 8) wei_t; - const int input_x = x * STRIDE_SIZE_X - PADDING_SIZE_X; const int input_y = y * STRIDE_SIZE_Y - PADDING_SIZE_Y; @@ -104,12 +99,12 @@ KERNEL(convolution_bfyx_to_bfyx_f16)( bias_offset += split_idx * BIAS_LENGTH; # endif - vec_t dst = (vec_t)(UNIT_BLOCK_READ(biases, bias_offset)); + MAKE_VECTOR_TYPE(INPUT0_TYPE, OUTPUT_X_BLOCK_SIZE) dst = (MAKE_VECTOR_TYPE(INPUT0_TYPE, OUTPUT_X_BLOCK_SIZE))(DT_INPUT_BLOCK_READ(biases, bias_offset)); #else - vec_t dst = UNIT_VAL_ZERO; + MAKE_VECTOR_TYPE(INPUT0_TYPE, OUTPUT_X_BLOCK_SIZE) dst = INPUT0_VAL_ZERO; #endif - UNIT_TYPE line_cache[INPUT0_FEATURE_NUM * INPUT_BLOCK_SIZE]; + INPUT0_TYPE line_cache[INPUT0_FEATURE_NUM * INPUT_BLOCK_SIZE]; for (int ic = 0; ic < INPUT0_FEATURE_NUM; ic++) { __attribute__((opencl_unroll_hint(INPUT_BLOCK_SIZE))) @@ -125,11 +120,10 @@ KERNEL(convolution_bfyx_to_bfyx_f16)( xb * input_x_pitch + yb * input_y_pitch]; else - line_cache[ic * INPUT_BLOCK_SIZE + i] = UNIT_VAL_ZERO; + line_cache[ic * INPUT_BLOCK_SIZE + i] = INPUT0_VAL_ZERO; } } - __attribute__((opencl_unroll_hint(FILTER_SIZE_Y))) for (int kh = 0; kh < FILTER_SIZE_Y; kh++) { @@ -138,10 +132,10 @@ KERNEL(convolution_bfyx_to_bfyx_f16)( { uint offset = filter_offset + kh * filter_y_pitch + kw * filter_x_pitch; - UNIT_TYPE wei[INPUT0_FEATURE_NUM]; + FILTER_TYPE wei[INPUT0_FEATURE_NUM]; __attribute__((opencl_unroll_hint(INPUT0_FEATURE_NUM))) for (int ic = 0; ic < INPUT0_FEATURE_NUM; ic++) - wei[ic] = UNIT_BLOCK_READ(weights, offset + ic * filter_isv_pitch); + wei[ic] = DT_FILTER_BLOCK_READ(weights, offset + ic * filter_isv_pitch); __attribute__((opencl_unroll_hint(OUTPUT_X_BLOCK_SIZE))) for (int i = 0; i < OUTPUT_X_BLOCK_SIZE; i++) @@ -149,7 +143,7 @@ KERNEL(convolution_bfyx_to_bfyx_f16)( const uint buf_offset = (kw*DILATION_SIZE_X + STRIDE_SIZE_X * i + (kh) * INPUT_LINE_SIZE) / SUB_GROUP_SIZE; const uint buf_group = (kw*DILATION_SIZE_X + STRIDE_SIZE_X * i + (kh) * INPUT_LINE_SIZE) % SUB_GROUP_SIZE; - UNIT_TYPE src[INPUT0_FEATURE_NUM]; + INPUT0_TYPE src[INPUT0_FEATURE_NUM]; __attribute__((opencl_unroll_hint(INPUT0_FEATURE_NUM))) for (int ic = 0; ic < INPUT0_FEATURE_NUM; ic++) { src[ic] = intel_sub_group_shuffle(line_cache[ic * INPUT_BLOCK_SIZE + buf_offset], buf_group); @@ -159,17 +153,20 @@ KERNEL(convolution_bfyx_to_bfyx_f16)( } } - dst = ACTIVATION(dst, ACTIVATION_PARAMS); + MAKE_VECTOR_TYPE(OUTPUT_TYPE, OUTPUT_X_BLOCK_SIZE) res; +#ifndef HAS_FUSED_OPS + res = ACTIVATION(dst, ACTIVATION_PARAMS); +#endif #if OUTPUT_LEFTOVERS if ((f_block+1)*FEATURE_SLICE_SIZE >= OUTPUT_FEATURE_NUM) { for (int i = 0; i < OUTPUT_X_BLOCK_SIZE; i++) { #if HAS_FUSED_OPS FUSED_OPS_SCALAR; - dst[i] = FUSED_OPS_RESULT_SCALAR; + res[i] = FUSED_OPS_RESULT_SCALAR; #endif if ((f_block*FEATURE_SLICE_SIZE + lid < OUTPUT_FEATURE_NUM) && (x + i) < 
OUTPUT_SIZE_X) - output[output_offset + i * output_x_pitch + lid] = dst[i]; + output[output_offset + i * output_x_pitch + lid] = res[i]; } } else @@ -178,17 +175,17 @@ KERNEL(convolution_bfyx_to_bfyx_f16)( if (x + OUTPUT_X_BLOCK_SIZE <= OUTPUT_SIZE_X) { #if HAS_FUSED_OPS FUSED_OPS_VEC; - dst = FUSED_OPS_RESULT_VEC; + res = FUSED_OPS_RESULT_VEC; #endif - UNIT_BLOCK_WRITEN(output, output_offset, dst); + DT_OUTPUT_BLOCK_WRITEN(output, output_offset, res); } else { - const int x_tail = OUTPUT_SIZE_X - x; + const int x_tail = OUTPUT_SIZE_X % OUTPUT_X_BLOCK_SIZE; for (int i = 0; i < x_tail; i++) { #if HAS_FUSED_OPS - FUSED_OPS_SCALAR; - dst[i] = FUSED_OPS_RESULT_SCALAR; + FUSED_OPS_SCALAR; + res[i] = FUSED_OPS_RESULT_SCALAR; #endif - UNIT_BLOCK_WRITE(output, output_offset + i * output_x_pitch, dst[i]); + DT_OUTPUT_BLOCK_WRITE(output, output_offset + i * output_x_pitch, res[i]); } } } diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/fusings_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/fusings_gpu_test.cpp index 9bf1774..2404cd5 100644 --- a/inference-engine/thirdparty/clDNN/tests/test_cases/fusings_gpu_test.cpp +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/fusings_gpu_test.cpp @@ -472,6 +472,7 @@ public: #define CASE_CONV_FP32_11 {1, 32, 4, 5, 4}, {1, 16, 2, 3, 2}, {1, 1, 3, 3, 3}, tensor{1}, tensor{0}, tensor{1}, 2, data_types::f32, format::b_fs_zyx_fsv16, data_types::f32, format::os_is_zyx_isv16_osv16, data_types::f32, format::bfzyx #define CASE_CONV_FP32_12 {1, 16, 4, 5, 4}, {1, 16, 2, 3, 2}, {1, 1, 3, 3, 3}, tensor{1}, tensor{0}, tensor{1}, 2, data_types::f32, format::b_fs_zyx_fsv16, data_types::f32, format::os_is_zyx_isv16_osv16, data_types::f32, format::bfzyx #define CASE_CONV_FP32_13 {1, 16, 18, 5, 4}, {1, 16, 16, 3, 2}, {1, 1, 3, 3, 3}, tensor{1}, tensor{0}, tensor{1}, 2, data_types::f32, format::b_fs_zyx_fsv16, data_types::f32, format::os_is_zyx_isv16_osv16, data_types::f32, format::bfzyx +#define CASE_CONV_FP32_14 {1, 3, 4, 5}, {1, 30, 2, 3}, {1, 1, 3, 3}, tensor{1}, tensor{0}, tensor{1}, 1, data_types::f32, format::bfyx, data_types::f32, format::bfyx, data_types::f32, format::bfyx #define CASE_CONV_FP16_1 {1, 15, 4, 5}, {1, 30, 2, 3}, {1, 1, 3, 3}, tensor{1}, tensor{0}, tensor{1}, 1, data_types::f16, format::bfyx, data_types::f16, format::bfyx, data_types::f16, format::bfyx #define CASE_CONV_FP16_2 {1, 16, 4, 5}, {1, 32, 2, 3}, {1, 1, 3, 3}, tensor{1}, tensor{0}, tensor{1}, 1, data_types::f16, format::b_fs_yx_fsv16, data_types::f16, format::os_is_yx_isv16_osv16, data_types::f16, format::bfyx @@ -853,6 +854,31 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, conv_fp32_eltwise_b_fs_zyx_fsv16, bc_test_params{CASE_CONV_FP16_12, 2, 3}, }), ); +class conv_fp32_quantize_u8_first_conv : public ConvFusingTest {}; +TEST_P(conv_fp32_quantize_u8_first_conv, basic) { + auto p = GetParam(); + create_topologies(input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), + data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), + data("out_lo", get_mem(get_single_element_layout(p), 0)), + data("out_hi", get_mem(get_single_element_layout(p), 255)), + reorder("reordered_input", "input", format::b_fs_yx_fsv16, p.data_type), + convolution("conv_prim", "reordered_input", {"weights"}, {"bias"}, p.groups, p.stride, p.pad, p.dilation), + quantize("quantize", "conv_prim", "in_lo", "in_hi", "out_lo", "out_hi", 256, 
data_types::u8), + reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32) + ); + + tolerance = 1.0f; + execute(p); +} + +INSTANTIATE_TEST_CASE_P(fusings_gpu, conv_fp32_quantize_u8_first_conv, + ::testing::ValuesIn(std::vector{ + bc_test_params{CASE_CONV_FP32_14, 2, 3}, + }), ); + class conv_fp32_quantize_u8 : public ConvFusingTest {}; TEST_P(conv_fp32_quantize_u8, basic) { auto p = GetParam(); -- 2.7.4