From 34ba00174ea671ca4e8b22a334db4ce1153f148f Mon Sep 17 00:00:00 2001
From: Vladimir Paramuzov
Date: Thu, 27 Aug 2020 23:06:20 +0300
Subject: [PATCH] [IE CLDNN] Cleanup part 2 (#1865)

* [IE CLDNN] Removed some unused kernels and layouts
* [IE CLDNN] Removed bsv4_fsv32 layout
* [IE CLDNN] Removed remaining BF8_XY16 usages. Removed definitions.cl
---
 inference-engine/thirdparty/clDNN/api/layout.hpp | 18 +-
 inference-engine/thirdparty/clDNN/api/tensor.hpp | 27 +--
 .../clDNN/kernel_selector/common/tensor_type.cpp | 32 ---
 .../clDNN/kernel_selector/common/tensor_type.h | 4 -
 .../concatenation/concatenation_kernel_ref.cpp | 2 -
 .../convolution/convolution_kernel_bfyx_1x1.cpp | 2 -
 .../convolution/convolution_kernel_imad.cpp | 3 -
 .../convolution_kernel_imad_byxf_af32_1x1.cpp | 138 -----------
 .../convolution_kernel_imad_byxf_af32_1x1.h | 45 ----
 ...convolution_kernel_imad_byxf_af32_depthwise.cpp | 143 -----------
 .../convolution_kernel_imad_byxf_af32_depthwise.h | 46 ----
 .../convolution/convolution_kernel_mmad.cpp | 93 -------
 .../convolution/convolution_kernel_mmad.h | 45 ----
 .../convolution_kernel_mmad_batched.cpp | 95 --------
 .../convolution/convolution_kernel_mmad_batched.h | 39 ---
 .../convolution_kernel_mmad_batched_block.cpp | 165 -------------
 .../convolution_kernel_mmad_batched_block.h | 38 ---
 .../convolution_kernel_mmad_batched_block_1x1.cpp | 171 -------------
 .../convolution_kernel_mmad_batched_block_1x1.h | 38 ---
 ...onvolution_kernel_mmad_bfyx_to_b_fs_yx_fsv4.cpp | 1 -
 .../convolution/convolution_kernel_mmad_blocks.cpp | 270 ---------------------
 .../convolution/convolution_kernel_mmad_blocks.h | 59 -----
 .../convolution_kernel_mmad_slm_2x14_rep4.cpp | 119 ---------
 .../convolution_kernel_mmad_slm_2x14_rep4.h | 39 ---
 .../convolution_kernel_mmad_slm_7x7_rep4.cpp | 128 ----------
 .../convolution_kernel_mmad_slm_7x7_rep4.h | 39 ---
 .../convolution/convolution_kernel_selector.cpp | 10 -
 .../deconvolution_kernel_imad_along_f_tile_bfx.cpp | 9 +-
 .../deconvolution_kernel_imad_ref.cpp | 1 -
 .../eltwise/eltwise_kernel_vload8.cpp | 6 +-
 .../fully_connected_kernel_mmad.cpp | 3 +-
 .../actual_kernels/pooling/pooling_kernel_base.cpp | 2 +-
 .../pooling/pooling_kernel_gpu_b_fs_yx_fsv4.cpp | 1 -
 .../pooling/pooling_kernel_gpu_byxf_af32.cpp | 89 -------
 .../pooling/pooling_kernel_gpu_byxf_af32.h | 37 ---
 .../pooling/pooling_kernel_gpu_int8_ref.cpp | 2 -
 .../pooling/pooling_kernel_selector.cpp | 2 -
 .../quantize/quantize_kernel_scale_shift_opt.cpp | 2 -
 .../reorder_kernel_byxf_f32_to_byx8_f4_i8.cpp | 87 -------
 .../reorder_kernel_byxf_f32_to_byx8_f4_i8.h | 32 ---
 .../reorder/reorder_kernel_fast_b1.cpp | 2 -
 .../reorder/reorder_kernel_selector.cpp | 2 -
 .../resample/resample_kernel_ref.cpp | 2 -
 .../core/cl_kernels/convolution_gpu_bfyx_1x1.cl | 16 +-
 .../convolution_gpu_byxf_af32_depthwise.cl | 214 ----------------
 .../core/cl_kernels/convolution_gpu_mmad.cl | 124 ----------
 .../core/cl_kernels/convolution_gpu_mmad_blocks.cl | 158 ------------
 .../fused_conv_eltwise_gpu_af32_imad_1x1.cl | 163 -------------
 .../core/cl_kernels/fused_conv_eltwise_gpu_imad.cl | 5 +-
 .../core/cl_kernels/include/fetch.cl | 101 --------
 .../core/cl_kernels/pooling_gpu_b_fs_yx_fsv4.cl | 2 +-
 .../core/cl_kernels/pooling_gpu_byxf_af32.cl | 189 ---------------
 .../core/cl_kernels/pooling_gpu_int8_ref.cl | 2 +-
 .../core/cl_kernels/pooling_gpu_ref.cl | 2 +-
 .../core/cl_kernels/reorder_biplanar_nv12.cl | 8 -
 .../reorder_data_byxf_f32_to_byx8_f4_i8.cl | 130 ----------
 .../core/cl_kernels/reorder_data_fast_b1.cl | 4 -
.../cl_kernels/reorder_data_to_yxfb_batched.cl | 4 - .../clDNN/kernel_selector/core/common/jitter.cpp | 2 - .../core/kernel_selector_common.cpp | 4 - .../thirdparty/clDNN/src/convolution.cpp | 34 --- .../thirdparty/clDNN/src/fused_conv_eltwise.cpp | 7 - .../thirdparty/clDNN/src/gpu/concatenation_gpu.cpp | 2 - .../thirdparty/clDNN/src/gpu/convolution_gpu.cpp | 8 - .../thirdparty/clDNN/src/gpu/definitions.cl | 192 --------------- .../thirdparty/clDNN/src/gpu/eltwise_gpu.cpp | 3 - .../clDNN/src/gpu/fully_connected_gpu.cpp | 2 - .../clDNN/src/gpu/fused_conv_eltwise_gpu.cpp | 16 +- .../thirdparty/clDNN/src/gpu/pooling_gpu.cpp | 3 - .../thirdparty/clDNN/src/gpu/quantize_gpu.cpp | 5 - .../thirdparty/clDNN/src/gpu/resample_gpu.cpp | 6 +- .../src/graph_optimizer/prepare_buffer_fusing.cpp | 4 - .../clDNN/src/graph_optimizer/prepare_padding.cpp | 3 - .../graph_optimizer/prepare_primitive_fusing.cpp | 5 +- .../clDNN/src/graph_optimizer/reorder_inputs.cpp | 64 ----- .../thirdparty/clDNN/src/include/to_string_utils.h | 8 - .../clDNN/src/kernel_selector_helper.cpp | 27 --- .../thirdparty/clDNN/src/layout_optimizer.cpp | 30 +-- .../thirdparty/clDNN/src/memory_pool.cpp | 3 +- .../thirdparty/clDNN/src/program_helpers.cpp | 6 +- .../tests/test_cases/convolution_gpu_test.cpp | 6 +- .../tests/test_cases/fully_connected_gpu_test.cpp | 12 +- .../test_cases/fused_conv_eltwise_gpu_test.cpp | 2 - .../clDNN/tests/test_cases/fusings_gpu_test.cpp | 99 +------- .../clDNN/tests/test_cases/pooling_gpu_test.cpp | 4 +- .../clDNN/tests/test_cases/resample_gpu_test.cpp | 2 - .../clDNN/tests/test_cases/tensor_test.cpp | 12 - 87 files changed, 39 insertions(+), 3742 deletions(-) delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_imad_byxf_af32_1x1.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_imad_byxf_af32_1x1.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_imad_byxf_af32_depthwise.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_imad_byxf_af32_depthwise.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_batched.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_batched.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_batched_block.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_batched_block.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_batched_block_1x1.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_batched_block_1x1.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_blocks.cpp delete mode 100644 
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_blocks.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_slm_2x14_rep4.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_slm_2x14_rep4.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_slm_7x7_rep4.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_slm_7x7_rep4.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_byxf_af32.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_byxf_af32.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_byxf_f32_to_byx8_f4_i8.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_byxf_f32_to_byx8_f4_i8.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_byxf_af32_depthwise.cl delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad.cl delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_blocks.cl delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_af32_imad_1x1.cl delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_byxf_af32.cl delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/reorder_data_byxf_f32_to_byx8_f4_i8.cl delete mode 100644 inference-engine/thirdparty/clDNN/src/gpu/definitions.cl diff --git a/inference-engine/thirdparty/clDNN/api/layout.hpp b/inference-engine/thirdparty/clDNN/api/layout.hpp index af2c407..ee2ac02 100644 --- a/inference-engine/thirdparty/clDNN/api/layout.hpp +++ b/inference-engine/thirdparty/clDNN/api/layout.hpp @@ -344,14 +344,6 @@ struct layout { tensor get_pitches() const { auto sizes = get_buffer_size().sizes(format); - if (format == format::byxf_af32) { - sizes[3] = align_to(sizes[3], 32); - } - - if (format == format::byx8_f4) { - sizes[3] = align_to(sizes[3], 4); - sizes[2] = align_to(sizes[2], 8); - } std::vector pitches(sizes.size(), tensor::value_type(1)); std::partial_sum(sizes.rbegin(), sizes.rend() - 1, pitches.rbegin() + 1, std::multiplies()); return {format, pitches}; @@ -394,15 +386,7 @@ struct layout { sizes[block_axis] = align_to(sizes[block_axis], block_size); } - if (this->format == cldnn::format::bf8_xy16 && !(is_aligned_to(sizes[1], 8) && is_aligned_to(sizes[2] * sizes[3], 16))) { - sizes[3] = align_to(sizes[2] * sizes[3], 16); - sizes[2] = 1; - } else if (this->format == cldnn::format::byxf_af32 && !(is_aligned_to(sizes[1], 32))) { - sizes[1] = align_to(sizes[1], 32); - } else if (this->format == cldnn::format::byx8_f4 && (!is_aligned_to(sizes[1], 4) || !is_aligned_to(sizes[2], 8))) { - sizes[1] = align_to(sizes[1], 4); - sizes[2] = align_to(sizes[2], 8); - } else if (this->format == cldnn::format::os_is_yx_isa8_osv8_isv4 && !(is_aligned_to(sizes[0], 8)) && !(is_aligned_to(sizes[1], 32))) { + if (this->format == cldnn::format::os_is_yx_isa8_osv8_isv4 && 
!(is_aligned_to(sizes[0], 8)) && !(is_aligned_to(sizes[1], 32))) { sizes[0] = align_to(sizes[0], 8); sizes[1] = align_to(sizes[1], 32); } else if (this->format == cldnn::format::os_is_yx_isa8_osv8_isv4_swizzled_by_4 && !(is_aligned_to(sizes[0], 32)) && !(is_aligned_to(sizes[1], 32))) { diff --git a/inference-engine/thirdparty/clDNN/api/tensor.hpp b/inference-engine/thirdparty/clDNN/api/tensor.hpp index 4c89063..71c7c3e 100644 --- a/inference-engine/thirdparty/clDNN/api/tensor.hpp +++ b/inference-engine/thirdparty/clDNN/api/tensor.hpp @@ -105,7 +105,6 @@ struct format { bs_fs_zyx_bsv16_fsv16, ///< format used for 3D blocked convolution (batch and features blocked by 16) bs_fs_yx_bsv16_fsv16, ///< format used for 2D blocked convolution (batch and features blocked by 16) fs_b_yx_fsv32, ///< format for input for fp16 primitives - fs_bs_yx_bsv4_fsv32, ///< format for batched input for primitives using MMAD b_fs_yx_fsv4, ///< format for input for IMAD convolutions bs_xs_xsv8_bsv8, ///< format used only for fully connected weights: bs - batch slice, ///< xs - x slice, bsv8 - 8 values of single slice. @@ -114,10 +113,6 @@ struct format { bs_x_bsv16, ///< format used only for fully connected weights fp16 batch=1 : bs - batch slice ///< (responses slice), bsv16 - 16 values of single batch slice, x - flattened plane of (fyx) ///< \n \image html bs_x_bsv16.jpg - byxf_af32, ///< format for input for primitives using MMAD - byx8_f4, ///< format for input for MMAD convolutions - bf8_xy16, ///< format used only for convolution 1x1 input, xy aligned to 16, f aligned to 8 - ///< \n \image html bf8_xy16.jpg b_fs_yx_32fp, ///< format for data for binary convolutions winograd_2x3_s1_data, ///< format used for input for winograd convolution, F(2,3) -- filter 3x3 with stride 1 nv12, ///< format for media nv12 input @@ -227,11 +222,7 @@ struct format { { bs_xs_xsv8_bsv8, { 1, 1, 1, 0, 0, "bx", "b?x??", {{2, 8}, {0, 8}}}}, { bs_xs_xsv8_bsv16, { 1, 1, 1, 0, 0, "bx", "b?x??", {{2, 8}, {0, 16}}}}, { bs_x_bsv16, { 1, 1, 1, 0, 0, "bx", "b?x??", {{0, 16}}}}, - { bf8_xy16, { 1, 1, 2, 0, 0, "bfyx", "bfxy?", {{1, 8}}}}, { winograd_2x3_s1_data, { 1, 1, 2, 0, 0, "bxyf", "bfxy?", {}}}, - { byxf_af32, { 1, 1, 2, 0, 0, "byxf", "bfxy?", {}}}, - { byx8_f4 , { 1, 1, 2, 0, 0, "byxf", "bfxy?", {}}}, - { fs_bs_yx_bsv4_fsv32, { 1, 1, 2, 0, 0, "fbyx", "bfxy?", {{0, 4}, {1, 32}}}}, { b_fs_yx_fsv4, { 1, 1, 2, 0, 0, "bfyx", "bfxy?", {{1, 4}}}}, { bfzyx, { 1, 1, 3, 0, 0, "bfzyx", "bfxyz", {}}}, { bfwzyx, { 1, 1, 4, 0, 0, "bfwzyx", "bfxyzw", {}}}, @@ -943,23 +934,7 @@ public: adjusted_coords[external_axis] /= block_size; } - if (fmt == cldnn::format::byxf_af32 && !(is_aligned_to(my_sizes[3], 32))) { - my_sizes[3] = align_to(my_sizes[3], 32); - } else if (fmt == cldnn::format::byx8_f4 && (!(is_aligned_to(my_sizes[3], 4)) || !(is_aligned_to(my_sizes[2], 8)))) { - my_sizes[3] = align_to(my_sizes[3], 4); - my_sizes[2] = align_to(my_sizes[2], 8); - } else if (fmt == cldnn::format::bf8_xy16) { - // Special case of blocked format, where xy is treated as one flattened dimension - auto flat_xy = adjusted_coords[3] + adjusted_coords[2] * my_sizes[3]; - - my_sizes.push_back(16); - my_sizes[3] = ceil_div(my_sizes[2] * my_sizes[3], 16); - my_sizes[2] = 1; - - adjusted_coords.push_back(flat_xy % 16); - adjusted_coords[3] = flat_xy / 16; - adjusted_coords[2] = 0; - } else if (fmt == cldnn::format::os_is_yx_isa8_osv8_isv4 && // TODO Fix offsets calculation for formats below + if (fmt == cldnn::format::os_is_yx_isa8_osv8_isv4 && // TODO Fix offsets 
calculation for formats below !(is_aligned_to(my_sizes[0], 8)) && !(is_aligned_to(my_sizes[1], 32))) { my_sizes[0] = align_to(my_sizes[0], 8); diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/common/tensor_type.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/common/tensor_type.cpp index 217b11f..e8f3b67 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/common/tensor_type.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/common/tensor_type.cpp @@ -42,11 +42,7 @@ DataTensor::DataChannelArray DataTensor::dataChannelArray {{ { DataLayout::bs_fs_yx_bsv16_fsv16, { 0, 1, -1, -1, 2, 3 } }, { DataLayout::bs_f_bsv8__af8, { -1, -1, -1, -1, 0, 1 } }, { DataLayout::bs_f_bsv16__af8, { -1, -1, -1, -1, 0, 1 } }, - { DataLayout::bf8_xy16, { 0, 1, -1, -1, 2, 3 } }, { DataLayout::winograd_2x3_s1_data, { 2, 1, -1, -1, 0, 3 } }, - { DataLayout::byxf_af32, { 1, 2, -1, -1, 0, 3 } }, - { DataLayout::byx8_f4, { 1, 2, -1, -1, 0, 3 } }, - { DataLayout::fs_bs_yx_bsv4_fsv32, { 0, 1, -1, -1, 3, 2 } }, { DataLayout::b_fs_yx_fsv4, { 0, 1, -1, -1, 2, 3 } }, { DataLayout::bfzyx, { 0, 1, 2, -1, 3, 4 } }, { DataLayout::fs_b_yx_fsv32, { 0, 1, -1, -1, 3, 2 } }, @@ -167,26 +163,6 @@ NDims DataTensor::GetSimpleDims(const std::vector& d, DataLayout l) { assert(newDims.size() == 5); newDims[3] = RoundUp(newDims[3], 32); break; - case bf8_xy16: - assert(newDims.size() == 4); - newDims[1] = RoundUp(newDims[1], 8); - newDims[3] = RoundUp(newDims[2] * newDims[3], 16); - newDims[2] = 1; - break; - case byxf_af32: - assert(newDims.size() == 4); - newDims[0] = RoundUp(newDims[0], 32); - break; - case byx8_f4: - assert(newDims.size() == 4); - newDims[0] = RoundUp(newDims[0], 4); - newDims[1] = RoundUp(newDims[1], 8); - break; - case fs_bs_yx_bsv4_fsv32: - assert(newDims.size() == 4); - newDims[3] = RoundUp(newDims[3], 32); - newDims[2] = RoundUp(newDims[2], 4); - break; case b_fs_yx_32fp: assert(newDims.size() == 4); newDims[3] = RoundUp(newDims[3], 32); @@ -222,14 +198,6 @@ NDims DataTensor::GetSimpleDims(const std::vector& d, DataLayout l) { pitch *= newDims[i]; } - if (l == byxf_af32 || l == fs_bs_yx_bsv4_fsv32 || l == byx8_f4) { - ret[0].pitch = 1; - ret[1].pitch = ret[0].pitch * newDims[0]; - ret[2].pitch = ret[1].pitch * newDims[1]; - ret[3].pitch = ret[2].pitch * newDims[2]; - ret[4].pitch = ret[3].pitch * newDims[3]; - } - return ret; } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/common/tensor_type.h b/inference-engine/thirdparty/clDNN/kernel_selector/common/tensor_type.h index 3a3ff41..af5c8a5 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/common/tensor_type.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/common/tensor_type.h @@ -51,11 +51,7 @@ enum DataLayout { bs_fs_zyx_bsv16_fsv16, // batch, feature, 3D spatial. 
Blocks of 16 batch and channels bs_f_bsv8__af8, // for optimized FC bs_f_bsv16__af8, // for optimized FC - bf8_xy16, // for optimized conv1x1 winograd_2x3_s1_data, // winograd convolution input, F(2,3) -- filter 3x3 with stride 1 - byxf_af32, // for MMAD convolution - byx8_f4, // for MMAD convolution - fs_bs_yx_bsv4_fsv32, // for batched MMAD b_fs_yx_fsv4, // reordering format for swizzled input for convolution using IMAD bfzyx, // batch+feature+3D spatial fs_b_yx_fsv32, // for FP16 kernels, 32 features to avoid partial writes diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/concatenation/concatenation_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/concatenation/concatenation_kernel_ref.cpp index 77dc1f1..fa5cd64 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/concatenation/concatenation_kernel_ref.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/concatenation/concatenation_kernel_ref.cpp @@ -41,7 +41,6 @@ ParamsKey ConcatenationKernelRef::GetSupportedKey() const { k.EnableInputLayout(DataLayout::byxf); k.EnableInputLayout(DataLayout::fyxb); k.EnableInputLayout(DataLayout::b_fs_yx_fsv16); - k.EnableInputLayout(DataLayout::byxf_af32); k.EnableInputLayout(DataLayout::b_fs_yx_fsv4); k.EnableInputLayout(DataLayout::b_fs_yx_fsv32); k.EnableOutputLayout(DataLayout::bf); @@ -51,7 +50,6 @@ ParamsKey ConcatenationKernelRef::GetSupportedKey() const { k.EnableOutputLayout(DataLayout::byxf); k.EnableOutputLayout(DataLayout::fyxb); k.EnableOutputLayout(DataLayout::b_fs_yx_fsv16); - k.EnableOutputLayout(DataLayout::byxf_af32); k.EnableOutputLayout(DataLayout::b_fs_yx_fsv4); k.EnableOutputLayout(DataLayout::b_fs_yx_fsv32); k.EnableTensorOffset(); diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_1x1.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_1x1.cpp index 7874629..b016fe7 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_1x1.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_1x1.cpp @@ -25,11 +25,9 @@ ParamsKey ConvolutionKernel_bfyx_1x1::GetSupportedKey() const { k.EnableOutputDataType(Datatype::F32); k.EnableInputWeightsType(WeightsType::F16); k.EnableInputWeightsType(WeightsType::F32); - k.EnableInputLayout(DataLayout::bf8_xy16); k.EnableInputLayout(DataLayout::bfyx); k.EnableOutputLayout(DataLayout::bfyx); k.EnableOutputLayout(DataLayout::yxfb); - k.EnableOutputLayout(DataLayout::bf8_xy16); k.EnableTensorOffset(); k.EnableTensorPitches(); k.EnableDilation(); diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_imad.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_imad.cpp index cb1d2d3..dd193c2 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_imad.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_imad.cpp @@ -82,7 +82,6 @@ ParamsKey ConvolutionKernel_imad::GetSupportedKey() const { k.EnableInputLayout(DataLayout::b_fs_yx_fsv4); k.EnableOutputLayout(DataLayout::b_fs_yx_fsv4); - k.EnableOutputLayout(DataLayout::byxf_af32); 
k.EnableOutputLayout(DataLayout::b_fs_yx_fsv16); k.EnableOutputLayout(DataLayout::bs_fs_yx_bsv16_fsv16); @@ -116,8 +115,6 @@ JitConstants ConvolutionKernel_imad::GetJitConstants(const convolution_params& p in_fsv = 4; else if (params.inputs[0].GetLayout() == DataLayout::b_fs_yx_fsv16) in_fsv = 16; - else if (params.inputs[0].GetLayout() == DataLayout::byxf_af32) - in_fsv = 32; mem_consts.AddConstants({ MakeJitConstant("_ID", RoundUp(input.Feature().v, in_fsv)), diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_imad_byxf_af32_1x1.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_imad_byxf_af32_1x1.cpp deleted file mode 100644 index 6fa7131..0000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_imad_byxf_af32_1x1.cpp +++ /dev/null @@ -1,138 +0,0 @@ -/* -// Copyright (c) 2019-2020 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -*/ - -#include "convolution_kernel_imad_byxf_af32_1x1.h" - -static size_t GetTileLength(size_t out_xy, size_t out_f, size_t min_threads) { - for (int tile_len = 14; tile_len > 0; tile_len--) { - // Kernel writes 32 output features per HW thread - size_t threads = (out_xy / tile_len) * out_xy * out_f / 32; - // Chose largest valid tile with enough HW threads - if ((out_xy % tile_len == 0) && (threads >= min_threads)) { - return tile_len; - } - } - return 1; -} - -namespace kernel_selector { - -ParamsKey ConvolutionKernel_imad_byxf_af32_1x1::GetSupportedKey() const { - ParamsKey k; - k.EnableInputDataType(Datatype::INT8); - k.EnableOutputDataType(Datatype::INT8); - k.EnableInputDataType(Datatype::UINT8); - k.EnableOutputDataType(Datatype::UINT8); - k.EnableInputWeightsType(WeightsType::INT8); - k.EnableInputLayout(DataLayout::byxf_af32); - k.EnableOutputLayout(DataLayout::byxf_af32); - k.EnableDifferentTypes(); - k.EnableDifferentInputWeightsTypes(); - k.EnableTensorOffset(); - k.EnableTensorPitches(); - k.EnableDilation(); - k.EnableBiasPerFeature(); - k.EnableBiasPerOutput(); - k.EnableNonBiasTerm(); - k.EnableBatching(); - k.EnableQuantization(QuantizationType::SYMMETRIC); - k.DisableTuning(); - return k; -} - -bool ConvolutionKernel_imad_byxf_af32_1x1::Validate(const Params& p, const optional_params& o) const { - if (!Parent::Validate(p, o)) { - return false; - } - - const auto& params = static_cast(p); - - if (params.filterSize.x != 1 || params.filterSize.y != 1) - return false; - - if (params.padding.x != 0 || params.padding.y != 0) - return false; - - if (params.output.Feature().v % 32 != 0) - return false; - - const auto& input = params.inputs[0]; - - // we do not support padded input - if (input.X().pad.Total() != 0 || input.Y().pad.Total() != 0) - return false; - - if (params.split != 1) - return false; - - if (params.groups != 1) - return false; - - return true; -} - -ConvolutionKernelBase::DispatchData 
ConvolutionKernel_imad_byxf_af32_1x1::SetDefault(const convolution_params& arg, - int) const { - DispatchData runInfo = Parent::SetDefault(arg); - - // Sub-group size - constexpr size_t sub_group_size = 8; - - const auto of_maps = arg.output.Feature().v; - const size_t of_maps_per_batch = RoundUp(of_maps, 32); - const size_t of_maps_total = of_maps_per_batch * arg.output.Batch().v; - - // Need to have at least 4 HW threads per EU - const size_t tile_length = GetTileLength(arg.output.X().v, of_maps_total, arg.engineInfo.computeUnitsCount * 4); - runInfo.cldnnStyle.blockWidth = tile_length; - - runInfo.efficiency = FORCE_PRIORITY_1; - - runInfo.gws0 = arg.output.X().v * arg.output.Y().v / tile_length; - runInfo.gws1 = of_maps_total / 4; // TILE_DEPTH==4 - runInfo.gws2 = 1; - - runInfo.lws0 = 1; - runInfo.lws1 = sub_group_size; - runInfo.lws2 = 1; - - return runInfo; -} - -JitConstants ConvolutionKernel_imad_byxf_af32_1x1::GetJitConstants(const convolution_params& params, - const DispatchData& runInfo) const { - auto jit = Parent::GetJitConstants(params, runInfo); - - jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", runInfo.lws1)); - jit.AddConstant(MakeJitConstant("TILE_LENGTH", runInfo.cldnnStyle.blockWidth)); - jit.AddConstant(MakeJitConstant("TILE_DEPTH", 4)); - - jit.Merge(MakeTypeJitConstants(GetPackedInputType(params), "PACKED")); - - if (!params.fused_ops.empty()) { - auto input_dt = GetActivationType(params); - FusedOpsConfiguration conf_scalar = {"", {"b", "f2", "y", "(x+i)"}, "res", input_dt, 1 }; - jit.Merge(MakeFusedOpsJitConstants(params, {conf_scalar})); - } - - return jit; -} - -KernelsData ConvolutionKernel_imad_byxf_af32_1x1::GetKernelsData(const Params& params, - const optional_params& options) const { - return GetTunedKernelsDataByIndex(params, options); -} -} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_imad_byxf_af32_1x1.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_imad_byxf_af32_1x1.h deleted file mode 100644 index 48911ad..0000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_imad_byxf_af32_1x1.h +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright (c) 2019 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include "convolution_kernel_base.h" -#include - -namespace kernel_selector { - -class ConvolutionKernel_imad_byxf_af32_1x1 : public ConvolutionKernelBase { -public: - using Parent = ConvolutionKernelBase; - ConvolutionKernel_imad_byxf_af32_1x1() : ConvolutionKernelBase("fused_conv_eltwise_gpu_af32_imad_1x1") {} - virtual ~ConvolutionKernel_imad_byxf_af32_1x1() {} - - KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - ParamsKey GetSupportedKey() const override; - -protected: - JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override; - DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override; - bool Validate(const Params& p, const optional_params& o) const override; - WeightsLayout GetPreferredWeightsLayout(const convolution_params &) const override { - return WeightsLayout::os_is_osv32_isv32_swizzled_by_4; - } - std::vector GetSupportedFusedOps() const override { - return { FusedOpType::ELTWISE, - FusedOpType::QUANTIZE, - FusedOpType::SCALE, - FusedOpType::ACTIVATION }; - } -}; -} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_imad_byxf_af32_depthwise.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_imad_byxf_af32_depthwise.cpp deleted file mode 100644 index 002c54d..0000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_imad_byxf_af32_depthwise.cpp +++ /dev/null @@ -1,143 +0,0 @@ -/* -// Copyright (c) 2016-2020 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-*/ - -#include "convolution_kernel_imad_byxf_af32_depthwise.h" - -#define SIMD_SIZE 16 - -namespace kernel_selector { - -ParamsKey ConvolutionKernel_imad_byxf_af32_depthiwise::GetSupportedKey() const { - ParamsKey k; - k.EnableInputDataType(Datatype::INT8); - k.EnableInputDataType(Datatype::UINT8); - k.EnableOutputDataType(Datatype::INT8); - k.EnableOutputDataType(Datatype::UINT8); - k.EnableOutputDataType(Datatype::F32); - k.EnableOutputDataType(Datatype::F16); - k.EnableInputWeightsType(WeightsType::INT8); - k.EnableInputLayout(DataLayout::byxf_af32); - k.EnableOutputLayout(DataLayout::byxf_af32); - k.EnableOutputLayout(DataLayout::b_fs_yx_fsv4); - k.EnableTensorOffset(); - k.EnableTensorPitches(); - k.EnableDilation(); - k.EnableBiasPerFeature(); - k.EnableBiasPerOutput(); - k.EnableNonBiasTerm(); - k.EnableBatching(); - k.EnableSplitSupport(); - k.EnableQuantization(QuantizationType::SYMMETRIC); - k.EnableDepthwiseSeparableOpt(); - k.EnableDifferentTypes(); - k.EnableDifferentInputWeightsTypes(); - k.DisableTuning(); - k.EnableGroupedConvolution(); - return k; -} - -static size_t GetTileLength(size_t out_x) { - for (int i = 20; i >= 1; i--) { - if (out_x % i == 0) - return i; - } - return 1; -} - -static int GetSplit(size_t out_x, int stride) { - if (out_x >= 75) { - if (stride > 1) - return 1; - else - return 3; - } - - if (out_x == 38 && stride == 2) - return 2; - - if (out_x < 75) { - if (stride > 1) - return 1; - else if (out_x % 2 == 0) - return 2; - } - return 1; -} - -bool ConvolutionKernel_imad_byxf_af32_depthiwise::Validate(const Params& p, const optional_params& o) const { - if (!Parent::Validate(p, o)) { - return false; - } - - const convolution_params& cp = static_cast(p); - if (cp.inputs[0].Feature().v != cp.groups || cp.output.Feature().v != cp.groups || cp.groups == 1) { - return false; - } - - return true; -} - -ConvolutionKernelBase::DispatchData ConvolutionKernel_imad_byxf_af32_depthiwise::SetDefault(const convolution_params& arg, - int) const { - DispatchData runInfo = Parent::SetDefault(arg); - - runInfo.efficiency = FORCE_PRIORITY_1; - - runInfo.gws0 = Align(arg.output.Feature().v, SIMD_SIZE) * arg.output.Batch().v; - runInfo.gws1 = arg.output.X().v / GetTileLength(arg.output.X().v); - runInfo.gws2 = CeilDiv(arg.output.Y().v, GetSplit(arg.output.Y().v, arg.stride.y)); - - std::vector local = { SIMD_SIZE, 1, 1 }; - - runInfo.lws0 = local[0]; - runInfo.lws1 = local[1]; - runInfo.lws2 = local[2]; - - return runInfo; -} - -JitConstants ConvolutionKernel_imad_byxf_af32_depthiwise::GetJitConstants(const convolution_params& params, - const DispatchData& runInfo) const { - auto jit = Parent::GetJitConstants(params, runInfo); - - jit.AddConstant(MakeJitConstant("ALIGNED_OFM", Align(params.output.Feature().v, SIMD_SIZE))); - jit.AddConstant(MakeJitConstant("OUT_BLOCK_WIDTH", GetTileLength(params.output.X().v))); - jit.AddConstant(MakeJitConstant("SPLIT_Y", GetSplit(params.output.Y().v, params.stride.y))); - jit.AddConstant(MakeJitConstant("SIMD_SIZE", SIMD_SIZE)); - - if (params.output.Y().v % GetSplit(params.output.Y().v, params.stride.y) != 0) - jit.AddConstant(MakeJitConstant("SPLIT_LEFTOVERS", params.output.Y().v % GetSplit(params.output.Y().v, params.stride.y))); - - if (!params.fused_ops.empty()) { - auto input_dt = GetActivationType(params); - FusedOpsConfiguration conf_scalar = {"", {"b", "of", "(y+m)", "(x+l)"}, "res", input_dt, 1 }; - conf_scalar.SetLoopAxes({Tensor::DataChannelName::Y, Tensor::DataChannelName::X}); - jit.Merge(MakeFusedOpsJitConstants(params, 
{conf_scalar})); - } - - return jit; -} - - -KernelsData ConvolutionKernel_imad_byxf_af32_depthiwise::GetKernelsData(const Params& params, - const optional_params& options) const { - KernelsData kd = GetTunedKernelsDataByIndex(params, options); - if (!kd.empty()) - kd[0].estimatedTime = FORCE_PRIORITY_1; - return kd; -} - -} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_imad_byxf_af32_depthwise.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_imad_byxf_af32_depthwise.h deleted file mode 100644 index 0e70c95..0000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_imad_byxf_af32_depthwise.h +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright (c) 2016 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#pragma once - -#include "convolution_kernel_base.h" -#include - -namespace kernel_selector { - -class ConvolutionKernel_imad_byxf_af32_depthiwise : public ConvolutionKernelBase { -public: - using Parent = ConvolutionKernelBase; - ConvolutionKernel_imad_byxf_af32_depthiwise() : ConvolutionKernelBase("convolution_gpu_byxf_af32_depthwise") {} - virtual ~ConvolutionKernel_imad_byxf_af32_depthiwise() {} - - KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - ParamsKey GetSupportedKey() const override; - -protected: - JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override; - DispatchData SetDefault(const convolution_params& params, int autoTuneIndex = -1) const override; - bool Validate(const Params& p, const optional_params& o) const override; - WeightsLayout GetPreferredWeightsLayout(const convolution_params &) const override { - return WeightsLayout::goiyx; - } - std::vector GetSupportedFusedOps() const override { - return { FusedOpType::ELTWISE, - FusedOpType::QUANTIZE, - FusedOpType::SCALE, - FusedOpType::ACTIVATION }; - } -}; -} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad.cpp deleted file mode 100644 index 5d96d9f..0000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad.cpp +++ /dev/null @@ -1,93 +0,0 @@ -/* -// Copyright (c) 2016-2020 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -*/ - -#include "convolution_kernel_mmad.h" - -namespace kernel_selector { - -ParamsKey ConvolutionKernel_mmad::GetSupportedKey() const { - ParamsKey k; - k.EnableInputDataType(Datatype::INT8); - k.EnableInputDataType(Datatype::UINT8); - k.EnableOutputDataType(Datatype::INT8); - k.EnableOutputDataType(Datatype::UINT8); - k.EnableInputWeightsType(WeightsType::INT8); - k.EnableInputLayout(DataLayout::byxf_af32); - k.EnableOutputLayout(DataLayout::byxf_af32); - k.EnableTensorOffset(); - k.EnableTensorPitches(); - k.EnableDilation(); - k.EnableBiasPerFeature(); - k.EnableBiasPerOutput(); - k.EnableNonBiasTerm(); - k.EnableBatching(); - k.EnableSplitSupport(); - k.EnableQuantization(QuantizationType::SYMMETRIC); - k.EnableDifferentInputWeightsTypes(); - k.DisableTuning(); - k.EnableDifferentTypes(); - return k; -} - -ConvolutionKernelBase::DispatchData ConvolutionKernel_mmad::SetDefault(const convolution_params& arg, int) const { - DispatchData runInfo = ConvolutionKernelBase::SetDefault(arg); - - constexpr size_t sub_group_size = 8; - - const auto of_maps = arg.output.Feature().v; - const size_t of_threads_per_batch = RoundUp(of_maps, sub_group_size); - - runInfo.efficiency = FORCE_PRIORITY_4; - - runInfo.gws0 = arg.output.X().v; - runInfo.gws1 = arg.output.Y().v; - runInfo.gws2 = of_threads_per_batch * arg.output.Batch().v; - - runInfo.lws0 = 1; - runInfo.lws1 = 1; - runInfo.lws2 = sub_group_size; - - return runInfo; -} - -JitConstants ConvolutionKernel_mmad::GetJitConstants(const convolution_params& params, - const DispatchData& runInfo) const { - auto jit = Parent::GetJitConstants(params, runInfo); - - jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", runInfo.lws2)); - - // pitch for special block format used in this kernel - const size_t ifm_32_aligned = Align(params.weights.IFM().v, 32); - const size_t filter_ofm_block_pitch = - (ifm_32_aligned / 32) * params.weights.X().v * params.weights.Y().v * 4 * 8 * 8; - jit.AddConstant(MakeJitConstant("FILTER_OFM_BLOCK_PITCH", filter_ofm_block_pitch)); - - jit.Merge(MakeTypeJitConstants(GetPackedInputType(params), "PACKED")); - if (!params.fused_ops.empty()) { - auto input_dt = GetActivationType(params); - FusedOpsConfiguration conf_scalar = {"", {"b", "f", "y", "x"}, "res", input_dt, 1 }; - jit.Merge(MakeFusedOpsJitConstants(params, {conf_scalar})); - } - return jit; -} - -KernelsData ConvolutionKernel_mmad::GetKernelsData(const Params& params, const optional_params& options) const { - KernelsData kd = GetTunedKernelsDataByIndex(params, options); - if (!kd.empty()) - kd[0].estimatedTime = FORCE_PRIORITY_4; - return kd; -} -} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad.h deleted file mode 100644 index b5479ba..0000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad.h +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright (c) 2016 Intel Corporation -// -// Licensed 
under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#pragma once - -#include "convolution_kernel_base.h" -#include - -namespace kernel_selector { - -class ConvolutionKernel_mmad : public ConvolutionKernelBase { -public: - using Parent = ConvolutionKernelBase; - ConvolutionKernel_mmad() : ConvolutionKernelBase("convolution_gpu_mmad") {} - virtual ~ConvolutionKernel_mmad() {} - - KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - ParamsKey GetSupportedKey() const override; - -protected: - JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override; - DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override; - WeightsLayout GetPreferredWeightsLayout(const convolution_params &) const override { - return WeightsLayout::os_is_yx_isa8_osv8_isv4; - } - std::vector GetSupportedFusedOps() const override { - return { FusedOpType::ELTWISE, - FusedOpType::QUANTIZE, - FusedOpType::SCALE, - FusedOpType::ACTIVATION }; - } -}; -} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_batched.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_batched.cpp deleted file mode 100644 index aa41576..0000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_batched.cpp +++ /dev/null @@ -1,95 +0,0 @@ -/* -// Copyright (c) 2018-2020 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-*/ - -#include "convolution_kernel_mmad_batched.h" - -namespace kernel_selector { - -ParamsKey ConvolutionKernel_mmad_batched::GetSupportedKey() const { - ParamsKey k; - k.EnableInputDataType(Datatype::INT8); - k.EnableOutputDataType(Datatype::INT8); - k.EnableInputWeightsType(WeightsType::INT8); - k.EnableInputLayout(DataLayout::fs_bs_yx_bsv4_fsv32); - k.EnableOutputLayout(DataLayout::fs_bs_yx_bsv4_fsv32); - k.EnableTensorOffset(); - k.EnableTensorPitches(); - k.EnableDilation(); - k.EnableBiasPerFeature(); - k.EnableNonBiasTerm(); - k.EnableBatching(); - k.EnableSplitSupport(); - k.EnableQuantization(QuantizationType::SYMMETRIC); - k.DisableTuning(); - return k; -} - -ConvolutionKernelBase::DispatchData ConvolutionKernel_mmad_batched::SetDefault(const convolution_params& arg, - int) const { - DispatchData runInfo = ConvolutionKernelBase::SetDefault(arg); - - constexpr size_t sub_group_size = 8; - - const auto of_maps = arg.output.Feature().v; - const size_t of_threads_per_batch = RoundUp(of_maps, sub_group_size); - - runInfo.efficiency = FORCE_PRIORITY_6; - - runInfo.gws0 = arg.output.X().v; - runInfo.gws1 = arg.output.Y().v; - runInfo.gws2 = of_threads_per_batch * ((arg.output.Batch().v + 3) / 4); - - runInfo.lws0 = 1; - runInfo.lws1 = 1; - runInfo.lws2 = sub_group_size; - - return runInfo; -} - -JitConstants ConvolutionKernel_mmad_batched::GetJitConstants(const convolution_params& params, - const DispatchData& runInfo) const { - auto jit = Parent::GetJitConstants(params, runInfo); - - jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", runInfo.lws2)); - - // pitch for special block format used in this kernel - const size_t ifm_32_aligned = Align(params.weights.IFM().v, 32); - const size_t filter_ofm_block_pitch = - (ifm_32_aligned / 32) * params.weights.X().v * params.weights.Y().v * 4 * 8 * 8; - jit.AddConstant(MakeJitConstant("FILTER_OFM_BLOCK_PITCH", filter_ofm_block_pitch)); - - const size_t in_x_pitch = 32 * 4; - const size_t in_y_pitch = 32 * 4 * params.inputs[0].X().LogicalDimPadded(); - const size_t in_b_block_pitch = in_y_pitch * params.inputs[0].Y().LogicalDimPadded(); - const size_t in_f_block_pitch = in_b_block_pitch * ((params.inputs[0].Batch().v + 3) / 4); - const size_t in_offset = - in_x_pitch * params.inputs[0].X().pad.before + in_y_pitch * params.inputs[0].Y().pad.before; - - jit.AddConstant(MakeJitConstant("IN_X_PITCH", in_x_pitch)); - jit.AddConstant(MakeJitConstant("IN_Y_PITCH", in_y_pitch)); - jit.AddConstant(MakeJitConstant("IN_B_BLOCK_PITCH", in_b_block_pitch)); - jit.AddConstant(MakeJitConstant("IN_F_BLOCK_PITCH", in_f_block_pitch)); - jit.AddConstant(MakeJitConstant("IN_OFFSET", in_offset)); - return jit; -} - -KernelsData ConvolutionKernel_mmad_batched::GetKernelsData(const Params& params, const optional_params& options) const { - KernelsData kd = GetTunedKernelsDataByIndex(params, options); - if (!kd.empty()) - kd[0].estimatedTime = FORCE_PRIORITY_6; - return kd; -} -} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_batched.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_batched.h deleted file mode 100644 index d0dd79d..0000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_batched.h +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may 
not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#pragma once - -#include "convolution_kernel_base.h" -#include - -namespace kernel_selector { - -class ConvolutionKernel_mmad_batched : public ConvolutionKernelBase { -public: - using Parent = ConvolutionKernelBase; - ConvolutionKernel_mmad_batched() : ConvolutionKernelBase("convolution_gpu_mmad_batched") {} - virtual ~ConvolutionKernel_mmad_batched() {} - - KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - ParamsKey GetSupportedKey() const override; - -protected: - JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override; - DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override; - WeightsLayout GetPreferredWeightsLayout(const convolution_params &) const override { - return WeightsLayout::os_is_yx_isa8_osv8_isv4; - } -}; -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_batched_block.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_batched_block.cpp deleted file mode 100644 index 60d7c23..0000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_batched_block.cpp +++ /dev/null @@ -1,165 +0,0 @@ -/* -// Copyright (c) 2018-2020 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-*/ - -#include "convolution_kernel_mmad_batched_block.h" -#include "kernel_selector_utils.h" -#include - -namespace kernel_selector { - -ParamsKey ConvolutionKernel_mmad_batched_block::GetSupportedKey() const { - ParamsKey k; - k.EnableInputDataType(Datatype::INT8); - k.EnableOutputDataType(Datatype::INT8); - k.EnableInputWeightsType(WeightsType::INT8); - k.EnableInputLayout(DataLayout::fs_bs_yx_bsv4_fsv32); - k.EnableOutputLayout(DataLayout::fs_bs_yx_bsv4_fsv32); - k.EnableTensorOffset(); - k.EnableTensorPitches(); - k.EnableBiasPerFeature(); - k.EnableBatching(); - k.EnableQuantization(QuantizationType::SYMMETRIC); - k.DisableTuning(); - return k; -} - -struct block_params { - int32_t out_width; - int32_t out_height; - int32_t out_depth; -}; - -static block_params get_out_block_size(const convolution_params& p) { - if (p.filterSize.x == 3 && p.filterSize.y == 3) { - if (p.output.X().v == 7) - return {7, 1, 4}; - else if (p.output.X().v == 14) - return {7, 1, 4}; - else if (p.output.X().v == 28) - return {7, 1, 4}; - else if (p.output.X().v == 56) - return {8, 1, 4}; - } - - return {1, 1, 1}; -} - -WeightsLayout ConvolutionKernel_mmad_batched_block::GetPreferredWeightsLayout( - const convolution_params &cp) const { - auto block = get_out_block_size(cp); - if (block.out_depth == 4) - return WeightsLayout::os_is_yx_isa8_osv8_isv4_swizzled_by_4; - else - return WeightsLayout::os_is_yx_isa8_osv8_isv4; -} - -bool ConvolutionKernel_mmad_batched_block::Validate(const Params& p, const optional_params& o) const { - if (!ConvolutionKernelBase::Validate(p, o) || !CovolutionCheckInput(p, o)) { - return false; - } - const convolution_params& cp = static_cast(p); - - // if block sizes are 1x1, then this algorithm is probably not the best - auto block = get_out_block_size(cp); - if (block.out_width == 1 && block.out_height == 1) - return false; - - if (cp.output.X().v % block.out_width != 0) - return false; - if (cp.output.Y().v % block.out_height != 0) - return false; - - if (cp.filterSize.x == 1) - return false; - - return true; -} - -size_t static get_wg_batch_count(const convolution_params& params) { - if (params.inputs[0].Batch().v % 64 == 0) - return 16; // because we process 4 batches per SIMD - return 1; -} - -ConvolutionKernelBase::DispatchData ConvolutionKernel_mmad_batched_block::SetDefault(const convolution_params& arg, - int) const { - DispatchData runInfo = ConvolutionKernelBase::SetDefault(arg); - - constexpr size_t sub_group_size = 8; - - runInfo.efficiency = FORCE_PRIORITY_5; - - auto block = get_out_block_size(arg); - - runInfo.gws0 = arg.output.X().v / block.out_width; - runInfo.gws1 = arg.output.Y().v / block.out_height; - runInfo.gws2 = (arg.output.Feature().v) * ((arg.output.Batch().v + 3) / 4) / - block.out_depth; // process 4 output channels per Workitem - - runInfo.lws0 = 1; - runInfo.lws1 = 1; - runInfo.lws2 = sub_group_size * get_wg_batch_count(arg); - - return runInfo; -} - -JitConstants ConvolutionKernel_mmad_batched_block::GetJitConstants(const convolution_params& params, - const DispatchData& runInfo) const { - auto jit = Parent::GetJitConstants(params, runInfo); - - const int sub_group_size = 8; - jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", sub_group_size)); - - // pitch for special block format used in this kernel - const size_t ifm_32_aligned = Align(params.weights.IFM().v, 32); - const size_t filter_ofm_block_pitch = - (ifm_32_aligned / 32) * params.weights.X().v * params.weights.Y().v * 4 * 8 * 8; - jit.AddConstant(MakeJitConstant("FILTER_OFM_BLOCK_PITCH", 
filter_ofm_block_pitch)); - - const size_t in_x_pitch = 32 * 4; - const size_t in_y_pitch = 32 * 4 * params.inputs[0].X().LogicalDimPadded(); - const size_t in_b_block_pitch = in_y_pitch * params.inputs[0].Y().LogicalDimPadded(); - const size_t in_f_block_pitch = in_b_block_pitch * ((params.inputs[0].Batch().v + 3) / 4); - const size_t in_offset = - in_x_pitch * params.inputs[0].X().pad.before + in_y_pitch * params.inputs[0].Y().pad.before; - - jit.AddConstant(MakeJitConstant("IN_X_PITCH", in_x_pitch)); - jit.AddConstant(MakeJitConstant("IN_Y_PITCH", in_y_pitch)); - jit.AddConstant(MakeJitConstant("IN_B_BLOCK_PITCH", in_b_block_pitch)); - jit.AddConstant(MakeJitConstant("IN_F_BLOCK_PITCH", in_f_block_pitch)); - jit.AddConstant(MakeJitConstant("IN_OFFSET", in_offset)); - - const size_t out_x_pitch = 32 * 4; - jit.AddConstant(MakeJitConstant("OUT_X_PITCH", out_x_pitch)); - - auto block = get_out_block_size(params); - jit.AddConstant(MakeJitConstant("OUT_BLOCK_WIDTH", block.out_width)); - jit.AddConstant(MakeJitConstant("OUT_BLOCK_HEIGHT", block.out_height)); - jit.AddConstant(MakeJitConstant("WEIGHTS_PER_WORKITEM", block.out_depth)); - - jit.AddConstant(MakeJitConstant("WG_BATCH_COUNT", get_wg_batch_count(params))); - - return jit; -} - -KernelsData ConvolutionKernel_mmad_batched_block::GetKernelsData(const Params& params, - const optional_params& options) const { - KernelsData kd = GetCommonKernelsData(params, options); - if (!kd.empty()) - kd[0].estimatedTime = FORCE_PRIORITY_5; - return kd; -} -} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_batched_block.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_batched_block.h deleted file mode 100644 index abc58ad..0000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_batched_block.h +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- - -#pragma once - -#include "convolution_kernel_base.h" -#include - -namespace kernel_selector { - -class ConvolutionKernel_mmad_batched_block : public ConvolutionKernelBase { -public: - using Parent = ConvolutionKernelBase; - ConvolutionKernel_mmad_batched_block() : ConvolutionKernelBase("convolution_gpu_mmad_batched_block") {} - virtual ~ConvolutionKernel_mmad_batched_block() {} - - KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - ParamsKey GetSupportedKey() const override; - -protected: - bool Validate(const Params& p, const optional_params& o) const override; - JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override; - DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override; - WeightsLayout GetPreferredWeightsLayout(const convolution_params &) const override; -}; -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_batched_block_1x1.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_batched_block_1x1.cpp deleted file mode 100644 index 5b271b0..0000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_batched_block_1x1.cpp +++ /dev/null @@ -1,171 +0,0 @@ -/* -// Copyright (c) 2018-2020 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-*/ - -#include "convolution_kernel_mmad_batched_block_1x1.h" -#include "kernel_selector_utils.h" -#include - -namespace kernel_selector { - -ParamsKey ConvolutionKernel_mmad_batched_block_1x1::GetSupportedKey() const { - ParamsKey k; - k.EnableInputDataType(Datatype::INT8); - k.EnableOutputDataType(Datatype::INT8); - k.EnableInputWeightsType(WeightsType::INT8); - k.EnableInputLayout(DataLayout::fs_bs_yx_bsv4_fsv32); - k.EnableOutputLayout(DataLayout::fs_bs_yx_bsv4_fsv32); - k.EnableTensorOffset(); - k.EnableTensorPitches(); - k.EnableBiasPerFeature(); - k.EnableBatching(); - k.EnableQuantization(QuantizationType::SYMMETRIC); - k.DisableTuning(); - return k; -} - -struct block_params { - int32_t out_width; - int32_t out_height; - int32_t out_depth; -}; - -static block_params get_out_block_size(const convolution_params& p) { - if (p.output.X().v == 7) - return {7, 1, 4}; - else if (p.output.X().v == 14) - return {7, 1, 4}; - else if (p.output.X().v == 28) - return {4, 2, 4}; - else if (p.output.X().v == 56) - return {8, 1, 4}; - - return {1, 1, 1}; -} - -WeightsLayout ConvolutionKernel_mmad_batched_block_1x1::GetPreferredWeightsLayout( - const convolution_params &cp) const { - auto block = get_out_block_size(cp); - if (block.out_depth == 4) - return WeightsLayout::os_is_yx_isa8_osv8_isv4_swizzled_by_4; - else - return WeightsLayout::os_is_yx_isa8_osv8_isv4; -} - -bool ConvolutionKernel_mmad_batched_block_1x1::Validate(const Params& p, const optional_params& o) const { - if (!ConvolutionKernelBase::Validate(p, o) || !CovolutionCheckInput(p, o)) { - return false; - } - const convolution_params& cp = static_cast(p); - - // only for conv 1x1 - if (cp.filterSize.x != 1 || cp.filterSize.y != 1) - return false; - - // only for stride 1x1 - if (cp.stride.x != 1 || cp.stride.y != 1) - return false; - - // if block sizes are 1x1, then this algorithm is probably not the best - auto block = get_out_block_size(cp); - if (block.out_depth != 4) - return false; - - if (cp.output.X().v % block.out_width != 0) - return false; - if (cp.output.Y().v % block.out_height != 0) - return false; - - return true; -} - -size_t static get_wg_batch_count(const convolution_params& params) { - if (params.inputs[0].Batch().v % 64 == 0) - return 16; // because we process 4 batches per SIMD - return 1; -} - -ConvolutionKernelBase::DispatchData ConvolutionKernel_mmad_batched_block_1x1::SetDefault(const convolution_params& arg, - int) const { - DispatchData runInfo = ConvolutionKernelBase::SetDefault(arg); - - constexpr size_t sub_group_size = 8; - - runInfo.efficiency = FORCE_PRIORITY_3; - - auto block = get_out_block_size(arg); - - runInfo.gws0 = arg.output.X().v / block.out_width; - runInfo.gws1 = arg.output.Y().v / block.out_height; - runInfo.gws2 = (arg.output.Feature().v) * ((arg.output.Batch().v + 3) / 4) / - block.out_depth; // process 4 output channels per Workitem - - runInfo.lws0 = 1; - runInfo.lws1 = 1; - runInfo.lws2 = sub_group_size * get_wg_batch_count(arg); - - return runInfo; -} - -JitConstants ConvolutionKernel_mmad_batched_block_1x1::GetJitConstants(const convolution_params& params, - const DispatchData& runInfo) const { - auto jit = Parent::GetJitConstants(params, runInfo); - - const int sub_group_size = 8; - jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", sub_group_size)); - - // pitch for special block format used in this kernel - const size_t ifm_32_aligned = Align(params.weights.IFM().v, 32); - const size_t filter_ofm_block_pitch = - (ifm_32_aligned / 32) * params.weights.X().v * params.weights.Y().v 
* 4 * 8 * 8; - jit.AddConstant(MakeJitConstant("FILTER_OFM_BLOCK_PITCH", filter_ofm_block_pitch)); - - const size_t in_x_pitch = 32 * 4; - const size_t in_y_pitch = 32 * 4 * params.inputs[0].X().LogicalDimPadded(); - const size_t in_b_block_pitch = in_y_pitch * params.inputs[0].Y().LogicalDimPadded(); - const size_t in_f_block_pitch = in_b_block_pitch * ((params.inputs[0].Batch().v + 3) / 4); - const size_t in_offset = - in_x_pitch * params.inputs[0].X().pad.before + in_y_pitch * params.inputs[0].Y().pad.before; - - jit.AddConstant(MakeJitConstant("IN_X_PITCH", in_x_pitch)); - jit.AddConstant(MakeJitConstant("IN_Y_PITCH", in_y_pitch)); - jit.AddConstant(MakeJitConstant("IN_B_BLOCK_PITCH", in_b_block_pitch)); - jit.AddConstant(MakeJitConstant("IN_F_BLOCK_PITCH", in_f_block_pitch)); - jit.AddConstant(MakeJitConstant("IN_OFFSET", in_offset)); - - const size_t out_x_pitch = 32 * 4; - const size_t out_y_pitch = 32 * 4 * params.output.X().LogicalDimPadded(); - - jit.AddConstant(MakeJitConstant("OUT_X_PITCH", out_x_pitch)); - jit.AddConstant(MakeJitConstant("OUT_Y_PITCH", out_y_pitch)); - - auto block = get_out_block_size(params); - jit.AddConstant(MakeJitConstant("OUT_BLOCK_WIDTH", block.out_width)); - jit.AddConstant(MakeJitConstant("OUT_BLOCK_HEIGHT", block.out_height)); - jit.AddConstant(MakeJitConstant("WEIGHTS_PER_WORKITEM", block.out_depth)); - - jit.AddConstant(MakeJitConstant("WG_BATCH_COUNT", get_wg_batch_count(params))); - - return jit; -} - -KernelsData ConvolutionKernel_mmad_batched_block_1x1::GetKernelsData(const Params& params, - const optional_params& options) const { - KernelsData kd = GetCommonKernelsData(params, options, " -Dcl_intel_subgroups_char"); - if (!kd.empty()) - kd[0].estimatedTime = FORCE_PRIORITY_3; - return kd; -} -} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_batched_block_1x1.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_batched_block_1x1.h deleted file mode 100644 index 78db7d9..0000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_batched_block_1x1.h +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- - -#pragma once - -#include "convolution_kernel_base.h" -#include - -namespace kernel_selector { - -class ConvolutionKernel_mmad_batched_block_1x1 : public ConvolutionKernelBase { -public: - using Parent = ConvolutionKernelBase; - ConvolutionKernel_mmad_batched_block_1x1() : ConvolutionKernelBase("convolution_gpu_mmad_batched_block_1x1") {} - virtual ~ConvolutionKernel_mmad_batched_block_1x1() {} - - KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - ParamsKey GetSupportedKey() const override; - -protected: - bool Validate(const Params& p, const optional_params& o) const override; - JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override; - DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override; - WeightsLayout GetPreferredWeightsLayout(const convolution_params &) const override; -}; -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_bfyx_to_b_fs_yx_fsv4.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_bfyx_to_b_fs_yx_fsv4.cpp index 2aef384..f9110ac 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_bfyx_to_b_fs_yx_fsv4.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_bfyx_to_b_fs_yx_fsv4.cpp @@ -35,7 +35,6 @@ ParamsKey ConvolutionKernel_mmad_bfyx_to_b_fs_yx_fsv4::GetSupportedKey() const { k.EnableInputLayout(DataLayout::bfyx); k.EnableOutputLayout(DataLayout::b_fs_yx_fsv4); - k.EnableOutputLayout(DataLayout::byxf_af32); k.EnableTensorOffset(); k.EnableTensorPitches(); k.EnableDilation(); diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_blocks.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_blocks.cpp deleted file mode 100644 index 90ff761..0000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_blocks.cpp +++ /dev/null @@ -1,270 +0,0 @@ -// Copyright (c) 2016-2020 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "convolution_kernel_mmad_blocks.h" -#include -#include -#include -#include - -namespace kernel_selector { -ConvolutionKernel_mmad_blocks::ConvolutionKernel_mmad_blocks() : ConvolutionKernelBase("convolution_gpu_mmad_blocks") { - // Generate the dispatch options to the auto-tuner. 
- std::vector<size_t> blockWidthSizes = {1, 2, 4, 5, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32}; - std::vector<size_t> blockHeightSizes = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; - std::vector<size_t> prefetchSizes = {1, 2, 3, 4, 5, 6, 8, 10}; - std::vector<std::string> executionModes = ConvolutionKernelBase::autoTuneOptions; - const size_t maxBlockSize = 240; - for (auto executionMode : executionModes) { - for (auto blockWidth : blockWidthSizes) { - for (auto blockHeight : blockHeightSizes) { - for (auto prefetch : prefetchSizes) { - if (blockWidth * blockHeight <= maxBlockSize) { - autoTuneOptions.emplace_back(AutoTuneOption{blockWidth, blockHeight, prefetch, executionMode}); - } - } - } - } - } -} - -ParamsKey ConvolutionKernel_mmad_blocks::GetSupportedKey() const { - ParamsKey k; - k.EnableInputDataType(Datatype::INT8); - k.EnableInputDataType(Datatype::UINT8); - - k.EnableOutputDataType(Datatype::INT8); - k.EnableOutputDataType(Datatype::UINT8); - k.EnableOutputDataType(Datatype::F32); - k.EnableOutputDataType(Datatype::F16); - - k.EnableInputWeightsType(WeightsType::INT8); - - k.EnableInputLayout(DataLayout::byxf_af32); - k.EnableOutputLayout(DataLayout::byxf_af32); - k.EnableTensorOffset(); - k.EnableTensorPitches(); -// k.EnableDilation(); TODO: Add dilation support - k.EnableBiasPerFeature(); - k.EnableBiasPerOutput(); - k.EnableNonBiasTerm(); - k.EnableBatching(); - k.EnableSplitSupport(); - k.EnableQuantization(QuantizationType::SYMMETRIC); - k.EnableDifferentTypes(); - k.EnableDifferentInputWeightsTypes(); - k.DisableTuning(); - return k; -} - -bool ConvolutionKernel_mmad_blocks::Validate(const Params& p, const optional_params& o) const { - if (!Parent::Validate(p, o)) { - return false; - } - - return true; -} - -static void shrink_blocks_to_output_size(size_t output_x, size_t output_y, size_t& block_x, size_t& block_y) { - // how many elements we will compute in each dimension - size_t computed_x = Align(output_x, block_x); - size_t computed_y = Align(output_y, block_y); - // how many simds we need in each dimension - size_t simds_x = computed_x / block_x; - size_t simds_y = computed_y / block_y; - // how many unused values we have in each dimension - size_t unused_x = computed_x - output_x; - size_t unused_y = computed_y - output_y; - - block_x -= unused_x / simds_x; - block_y -= unused_y / simds_y; -} - -ConvolutionKernel_mmad_blocks::AutoTuneOption ConvolutionKernel_mmad_blocks::GetAutoTuneOptions( - const Params& p, - int autoTuneIndex) const { - if ((autoTuneIndex >= 0) && (autoTuneIndex < static_cast<int>(autoTuneOptions.size()))) { - return autoTuneOptions[autoTuneIndex]; - } - - // Sub-group size used by "convolution_gpu_mmad_blocks" kernel.
- constexpr size_t sub_group_size = 16; - - AutoTuneOption option = {0, 0, 0, DEFAULT}; - - const convolution_params& cp = static_cast<const convolution_params&>(p); - - if (cp.stride.x == 1 && cp.stride.y == 1) { - if (cp.filterSize.x == 1 && cp.filterSize.y == 1) { - option.blockWidth = 16; - option.blockHeight = 1; - option.prefetch = 4; - // if less than 16 values is required to compute one single row of output - // then each WI shall compute one single row to maximize reuse within SIMD subgroup (this gives very nice - // performance results) - } else if (cp.output.X().v + (cp.filterSize.x - 1) * cp.dilation.x < sub_group_size) { - option.blockWidth = cp.output.X().v; - option.blockHeight = 1; - option.prefetch = 4; - } else if (cp.filterSize.x < 5 && cp.filterSize.y < 5) { - option.blockWidth = sub_group_size - cp.filterSize.x + 1; - option.blockHeight = 2; - option.prefetch = 4; - } else { - option.blockWidth = 4; - option.blockHeight = 3; - option.prefetch = 4; - } - } else if (cp.stride.x == 2 && cp.stride.y == 2) { - option.blockWidth = 5; - option.blockHeight = 4; - option.prefetch = 4; - } else { - option.blockWidth = 4; - option.blockHeight = 3; - option.prefetch = 5; - // run_info.efficiency = FORCE_PRIORITY_7; // GEMM is better - } - - // if this is not 1x1 batch1 case then shrink filters, other way we're memory bound and it's best to use 16x1 block - // sizes - if (cp.filterSize.x != 1 || cp.filterSize.y != 1 || cp.output.Batch().v != 1) { - shrink_blocks_to_output_size(cp.output.X().v, cp.output.Y().v, option.blockWidth, option.blockHeight); - } - - return option; -} - -static std::pair<size_t, size_t> get_byxf_af32_req_input_block_dims(size_t output_block_width, - size_t output_block_height, - const uSize& filter_size, - const uSize& stride, - const uSize& dilation, - size_t sub_group_size = 8, - size_t read_chunk_size = 8, - size_t min_read_size = 8) { - assert(output_block_width > 0 && output_block_height > 0); - assert(stride.x > 0 && stride.y > 0); - assert(filter_size.x > 0 && filter_size.y > 0); - - // Number of elements in X dimension needed from input to compute output block without re-reading input. - size_t input_block_req_width = (output_block_width - 1) * stride.x + (filter_size.x - 1) * dilation.x + 1; - // Number of elements in Y dimension needed from input to compute output block without re-reading input. - size_t input_block_req_height = (output_block_height - 1) * stride.y + (filter_size.y - 1) * dilation.y + 1; - - // Required number of elements in X dimension rounded to nearest >= read chunk size. - size_t input_block_read_width = std::max(RoundUp(input_block_req_width, read_chunk_size), min_read_size); - // Number of sub-group-sized vectors of unit type needed to store input block. - size_t input_block_array_size = CeilDiv(input_block_req_height * input_block_read_width, sub_group_size); - - // size of our array per workitem - input_block_array_size = input_block_req_height * input_block_read_width; - return std::make_pair(input_block_array_size, input_block_read_width); -} - -ConvolutionKernelBase::DispatchData ConvolutionKernel_mmad_blocks::SetDefault(const convolution_params& cp, - int autoTuneIndex) const { - // Sub-group size used by "convolution_gpu_mmad_blocks" kernel.
- constexpr size_t sub_group_size = 8; - - DispatchData runInfo = ConvolutionKernelBase::SetDefault(cp); - - auto tuneOptions = GetAutoTuneOptions(cp, autoTuneIndex); - runInfo.cldnnStyle.blockWidth = tuneOptions.blockWidth; - runInfo.cldnnStyle.blockHeight = tuneOptions.blockHeight; - runInfo.cldnnStyle.prefetch = tuneOptions.prefetch; - - auto input_block_dims = - get_byxf_af32_req_input_block_dims(runInfo.cldnnStyle.blockWidth, - runInfo.cldnnStyle.blockHeight, - cp.filterSize, - cp.stride, - cp.dilation, - sub_group_size, - runInfo.fp16UnitUsed ? sub_group_size : sub_group_size / 2, - sub_group_size); - runInfo.cldnnStyle.inputBlockArraySize = input_block_dims.first; - runInfo.cldnnStyle.inputBlockWidth = input_block_dims.second; - - const auto of_maps = cp.output.Feature().v; - const size_t of_threads_per_batch = RoundUp(of_maps, sub_group_size); - - runInfo.efficiency = FORCE_PRIORITY_3; - - runInfo.gws0 = CeilDiv(cp.output.X().v, runInfo.cldnnStyle.blockWidth); - runInfo.gws1 = CeilDiv(cp.output.Y().v, runInfo.cldnnStyle.blockHeight); - runInfo.gws2 = of_threads_per_batch * cp.output.Batch().v; - - runInfo.lws0 = 1; - runInfo.lws1 = 1; - runInfo.lws2 = sub_group_size; - - return runInfo; -} - -JitConstants ConvolutionKernel_mmad_blocks::GetJitConstants(const convolution_params& params, - const DispatchData& runInfo) const { - auto jit = Parent::GetJitConstants(params, runInfo); - - jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", runInfo.lws2)); - jit.AddConstant(MakeJitConstant("OUTPUT_BLOCK_WIDTH", runInfo.cldnnStyle.blockWidth)); - jit.AddConstant(MakeJitConstant("OUTPUT_BLOCK_HEIGHT", runInfo.cldnnStyle.blockHeight)); - jit.AddConstant(MakeJitConstant("IN_BLOCK_ARRAY_SIZE", runInfo.cldnnStyle.inputBlockArraySize)); - jit.AddConstant(MakeJitConstant("IN_BLOCK_WIDTH", runInfo.cldnnStyle.inputBlockWidth)); - jit.AddConstant(MakeJitConstant("PREFETCH", runInfo.cldnnStyle.prefetch)); - - jit.Merge(MakeTypeJitConstants(GetPackedInputType(params), "PACKED")); - - // pitch for special block format used in this kernel - const size_t ifm_32_aligned = Align(params.weights.IFM().v, 32); - const size_t filter_ofm_block_pitch = - (ifm_32_aligned / 32) * params.weights.X().v * params.weights.Y().v * 4 * 8 * 8; - jit.AddConstant(MakeJitConstant("FILTER_OFM_BLOCK_PITCH", filter_ofm_block_pitch)); - - if (!params.fused_ops.empty()) { - auto input_dt = GetActivationType(params); - FusedOpsConfiguration conf_scalar = {"", {"b", "f", "(y+br)", "(x+bc)"}, "res", input_dt, 1 }; - jit.Merge(MakeFusedOpsJitConstants(params, {conf_scalar})); - } - - return jit; -} - -KernelsData ConvolutionKernel_mmad_blocks::GetKernelsData(const Params& params, const optional_params& options) const { - KernelsData kd = GetTunedKernelsDataByIndex(params, options); - if (!kd.empty()) - kd[0].estimatedTime = FORCE_PRIORITY_2; - - return kd; -} - -KernelsData ConvolutionKernel_mmad_blocks::GetKernelsDataForAutoTune(const Params& params, - const optional_params& options) const { - if (!Validate(params, options)) { - return {}; - } - - KernelsData res = {}; - - for (size_t i = 0; i < autoTuneOptions.size(); i++) { - KernelsData kd = GetTunedKernelsDataByIndex(params, options, static_cast<int>(i)); - if (!kd.empty()) { - res.emplace_back(kd[0]); - } - } - - return res; -} - -} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_blocks.h
b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_blocks.h deleted file mode 100644 index 04287f9..0000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_blocks.h +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright (c) 2016 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#pragma once - -#include "convolution_kernel_base.h" -#include -#include - -namespace kernel_selector { - -class ConvolutionKernel_mmad_blocks : public ConvolutionKernelBase { -public: - using Parent = ConvolutionKernelBase; - ConvolutionKernel_mmad_blocks(); - virtual ~ConvolutionKernel_mmad_blocks() {} - - KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - KernelsData GetKernelsDataForAutoTune(const Params& params, const optional_params& options) const override; - ParamsKey GetSupportedKey() const override; - -protected: - bool Validate(const Params& p, const optional_params& o) const override; - JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override; - DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override; - WeightsLayout GetPreferredWeightsLayout(const convolution_params &) const override { - return WeightsLayout::os_is_yx_isa8_osv8_isv4; - } - std::vector<FusedOpType> GetSupportedFusedOps() const override { - return { FusedOpType::ELTWISE, - FusedOpType::QUANTIZE, - FusedOpType::SCALE, - FusedOpType::ACTIVATION }; - } - -private: - struct AutoTuneOption { - size_t blockWidth; - size_t blockHeight; - size_t prefetch; - std::string exeMode; - }; - - AutoTuneOption GetAutoTuneOptions(const Params& arg, int autoTuneIndex) const; - std::vector<AutoTuneOption> autoTuneOptions = {}; -}; -} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_slm_2x14_rep4.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_slm_2x14_rep4.cpp deleted file mode 100644 index 6eb2dad..0000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_slm_2x14_rep4.cpp +++ /dev/null @@ -1,119 +0,0 @@ -/* -// Copyright (c) 2016-2020 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License.
-*/ - -#include "convolution_kernel_mmad_slm_2x14_rep4.h" -#include "kernel_selector_utils.h" - -namespace kernel_selector { - -ParamsKey ConvolutionKernel_mmad_slm_2x14_rep4::GetSupportedKey() const { - ParamsKey k; - k.EnableInputDataType(Datatype::INT8); - k.EnableOutputDataType(Datatype::INT8); - k.EnableInputWeightsType(WeightsType::INT8); - k.EnableInputLayout(DataLayout::fs_bs_yx_bsv4_fsv32); - k.EnableOutputLayout(DataLayout::fs_bs_yx_bsv4_fsv32); - k.EnableTensorOffset(); - k.EnableTensorPitches(); - k.EnableBiasPerFeature(); - k.EnableBiasPerOutput(); - k.EnableNonBiasTerm(); - k.EnableBatching(); - k.EnableQuantization(QuantizationType::SYMMETRIC); - k.DisableTuning(); - return k; -} - -bool ConvolutionKernel_mmad_slm_2x14_rep4::Validate(const Params& p, const optional_params& o) const { - if (!ConvolutionKernelBase::Validate(p, o) || !CovolutionCheckInput(p, o)) { - return false; - } - - const convolution_params& cp = static_cast(p); - - if (cp.filterSize.x != 3 || cp.filterSize.y != 3) - return false; - - if (cp.inputs[0].X().v != 56 || cp.inputs[0].Y().v != 56) - return false; - - if (cp.stride.x != 1 || cp.stride.y != 1) - return false; - - return true; -} - -ConvolutionKernelBase::DispatchData ConvolutionKernel_mmad_slm_2x14_rep4::SetDefault(const convolution_params& arg, - int) const { - DispatchData runInfo = ConvolutionKernelBase::SetDefault(arg); - - runInfo.efficiency = FORCE_PRIORITY_1; - - const size_t rep_count = 4; - const size_t batch_per_wi = 1; - const size_t out_block_width = 14; - const size_t out_block_height = 2; - runInfo.gws0 = arg.output.Feature().v * - (arg.output.Batch().v / (rep_count * batch_per_wi)); // number of tiles needed to cover output width - runInfo.gws1 = ((arg.inputs[0].X().v / arg.stride.x) + (out_block_width - 1)) / out_block_width; - runInfo.gws2 = ((arg.inputs[0].Y().v / arg.stride.y) + (out_block_height - 1)) / out_block_height; - - runInfo.lws0 = 32; // depth - runInfo.lws1 = 1; // width - runInfo.lws2 = 4; // height - - return runInfo; -} - -JitConstants ConvolutionKernel_mmad_slm_2x14_rep4::GetJitConstants(const convolution_params& params, - const DispatchData& runInfo) const { - auto jit = ConvolutionKernelBase::GetJitConstants(params, runInfo); - - jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", 8)); - - // pitch for special block format used in this kernel - const size_t ifm_32_aligned = Align(params.weights.IFM().v, 32); - const size_t filter_ofm_block_pitch = - (ifm_32_aligned / 32) * params.weights.X().v * params.weights.Y().v * 4 * 8 * 8; - jit.AddConstant(MakeJitConstant("FILTER_OFM_BLOCK_PITCH", filter_ofm_block_pitch)); - - const size_t in_x_pitch = 32 * 4; - const size_t in_y_pitch = 32 * 4 * params.inputs[0].X().LogicalDimPadded(); - const size_t in_b_block_pitch = in_y_pitch * params.inputs[0].Y().LogicalDimPadded(); - const size_t in_f_block_pitch = in_b_block_pitch * ((params.inputs[0].Batch().v + 3) / 4); - const size_t in_offset = - in_x_pitch * params.inputs[0].X().pad.before + in_y_pitch * params.inputs[0].Y().pad.before; - - jit.AddConstant(MakeJitConstant("IN_X_PITCH", in_x_pitch)); - jit.AddConstant(MakeJitConstant("IN_Y_PITCH", in_y_pitch)); - jit.AddConstant(MakeJitConstant("IN_B_BLOCK_PITCH", in_b_block_pitch)); - jit.AddConstant(MakeJitConstant("IN_F_BLOCK_PITCH", in_f_block_pitch)); - jit.AddConstant(MakeJitConstant("IN_OFFSET", in_offset)); - - jit.AddConstant(MakeJitConstant("OUT_BLOCK_WIDTH", 14)); - jit.AddConstant(MakeJitConstant("OUT_BLOCK_HEIGHT", 2)); - 
jit.AddConstant(MakeJitConstant("LOCAL_SIZE_X", runInfo.lws0)); - jit.AddConstant(MakeJitConstant("LOCAL_SIZE_Y", runInfo.lws1)); - jit.AddConstant(MakeJitConstant("LOCAL_SIZE_Z", runInfo.lws2)); - - return jit; -} - -KernelsData ConvolutionKernel_mmad_slm_2x14_rep4::GetKernelsData(const Params& params, - const optional_params& options) const { - return GetCommonKernelsData(params, options, " -Dcl_intel_subgroups_char"); -} -} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_slm_2x14_rep4.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_slm_2x14_rep4.h deleted file mode 100644 index a8fdfc3..0000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_slm_2x14_rep4.h +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright (c) 2016 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#pragma once - -#include "convolution_kernel_base.h" -#include - -namespace kernel_selector { - -class ConvolutionKernel_mmad_slm_2x14_rep4 : public ConvolutionKernelBase { -public: - ConvolutionKernel_mmad_slm_2x14_rep4() : ConvolutionKernelBase("convolution_gpu_mmad_slm_2x14_rep4") {} - virtual ~ConvolutionKernel_mmad_slm_2x14_rep4() {} - - KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - ParamsKey GetSupportedKey() const override; - -protected: - JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override; - DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override; - bool Validate(const Params& p, const optional_params& o) const override; - WeightsLayout GetPreferredWeightsLayout(const convolution_params &) const override { - return WeightsLayout::os_is_yx_isa8_osv8_isv4; - } -}; -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_slm_7x7_rep4.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_slm_7x7_rep4.cpp deleted file mode 100644 index ca4cf6e..0000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_slm_7x7_rep4.cpp +++ /dev/null @@ -1,128 +0,0 @@ -/* -// Copyright (c) 2016-2020 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -*/ - -#include "convolution_kernel_mmad_slm_7x7_rep4.h" -#include "kernel_selector_utils.h" - -namespace kernel_selector { - -ParamsKey ConvolutionKernel_mmad_slm_7x7_rep4::GetSupportedKey() const { - ParamsKey k; - k.EnableInputDataType(Datatype::INT8); - k.EnableOutputDataType(Datatype::INT8); - k.EnableInputWeightsType(WeightsType::INT8); - k.EnableInputLayout(DataLayout::fs_bs_yx_bsv4_fsv32); - k.EnableOutputLayout(DataLayout::fs_bs_yx_bsv4_fsv32); - k.EnableTensorOffset(); - k.EnableTensorPitches(); - k.EnableBiasPerFeature(); - k.EnableBiasPerOutput(); - k.EnableNonBiasTerm(); - k.EnableBatching(); - k.EnableQuantization(QuantizationType::SYMMETRIC); - k.DisableTuning(); - return k; -} - -bool ConvolutionKernel_mmad_slm_7x7_rep4::Validate(const Params& p, const optional_params& o) const { - if (!ConvolutionKernelBase::Validate(p, o) || !CovolutionCheckInput(p, o)) { - return false; - } - - const convolution_params& cp = static_cast(p); - - if (cp.filterSize.x != 3 || cp.filterSize.y != 3) - return false; - - if (cp.stride.x != 1 || cp.stride.y != 1) - return false; - - if (cp.inputs[0].X().v == 7 && cp.inputs[0].Y().v == 7) - return true; - - if (cp.inputs[0].X().v == 14 && cp.inputs[0].Y().v == 14) - return true; - - return false; -} - -ConvolutionKernelBase::DispatchData ConvolutionKernel_mmad_slm_7x7_rep4::SetDefault(const convolution_params& arg, - int) const { - DispatchData runInfo = ConvolutionKernelBase::SetDefault(arg); - - runInfo.efficiency = FORCE_PRIORITY_1; - - const size_t rep_count = 4; - const size_t batch_per_wi = 4; - const size_t out_block_width = 7; - // const size_t out_block_height = 1; - runInfo.gws0 = (arg.output.Feature().v * arg.output.Batch().v) / - (rep_count * batch_per_wi); // number of tiles needed to cover output width - runInfo.gws1 = ((arg.inputs[0].X().v / arg.stride.x) + (out_block_width - 1)) / out_block_width; - // since this kernel only apply to 7x7 sizes we need to manually set gws2 to 8 - runInfo.gws2 = Align(arg.inputs[0].Y().v, - 8); // 8;//((arg.inputs[0].Y().v / arg.stride.y) + (out_block_height - 1)) / out_block_height; - - runInfo.lws0 = 16; // depth - runInfo.lws1 = 1; // width - runInfo.lws2 = 8; // height - - return runInfo; -} - -JitConstants ConvolutionKernel_mmad_slm_7x7_rep4::GetJitConstants(const convolution_params& params, - const DispatchData& runInfo) const { - auto jit = ConvolutionKernelBase::GetJitConstants(params, runInfo); - - jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", 8)); - - // pitch for special block format used in this kernel - const size_t ifm_32_aligned = Align(params.weights.IFM().v, 32); - const size_t filter_ofm_block_pitch = - (ifm_32_aligned / 32) * params.weights.X().v * params.weights.Y().v * 4 * 8 * 8; - jit.AddConstant(MakeJitConstant("FILTER_OFM_BLOCK_PITCH", filter_ofm_block_pitch)); - - const size_t in_x_pitch = 32 * 4; - const size_t in_y_pitch = 32 * 4 * params.inputs[0].X().LogicalDimPadded(); - const size_t in_b_block_pitch = in_y_pitch * params.inputs[0].Y().LogicalDimPadded(); - const size_t in_f_block_pitch = in_b_block_pitch * ((params.inputs[0].Batch().v + 3) / 4); - const size_t in_offset = - in_x_pitch * params.inputs[0].X().pad.before + in_y_pitch * params.inputs[0].Y().pad.before; - - const size_t out_y_pitch = 32 * 4 * params.output.X().LogicalDimPadded(); - - jit.AddConstant(MakeJitConstant("IN_X_PITCH", in_x_pitch)); - jit.AddConstant(MakeJitConstant("IN_Y_PITCH", 
in_y_pitch)); - jit.AddConstant(MakeJitConstant("IN_B_BLOCK_PITCH", in_b_block_pitch)); - jit.AddConstant(MakeJitConstant("IN_F_BLOCK_PITCH", in_f_block_pitch)); - jit.AddConstant(MakeJitConstant("IN_OFFSET", in_offset)); - - jit.AddConstant(MakeJitConstant("OUT_X_PITCH", in_x_pitch)); - jit.AddConstant(MakeJitConstant("OUT_Y_PITCH", out_y_pitch)); - jit.AddConstant(MakeJitConstant("OUT_BLOCK_WIDTH", 7)); - jit.AddConstant(MakeJitConstant("OUT_BLOCK_HEIGHT", 1)); - jit.AddConstant(MakeJitConstant("LOCAL_SIZE_X", runInfo.lws0)); - jit.AddConstant(MakeJitConstant("LOCAL_SIZE_Y", runInfo.lws1)); - jit.AddConstant(MakeJitConstant("LOCAL_SIZE_Z", 7)); // must be 7 since we process 7 in Y per workgroup - - return jit; -} - -KernelsData ConvolutionKernel_mmad_slm_7x7_rep4::GetKernelsData(const Params& params, - const optional_params& options) const { - return GetCommonKernelsData(params, options, " -Dcl_intel_subgroups_char"); -} -} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_slm_7x7_rep4.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_slm_7x7_rep4.h deleted file mode 100644 index 5f766c7..0000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_slm_7x7_rep4.h +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright (c) 2016 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- - -#pragma once - -#include "convolution_kernel_base.h" -#include - -namespace kernel_selector { - -class ConvolutionKernel_mmad_slm_7x7_rep4 : public ConvolutionKernelBase { -public: - ConvolutionKernel_mmad_slm_7x7_rep4() : ConvolutionKernelBase("convolution_gpu_mmad_slm_7x7_rep4") {} - virtual ~ConvolutionKernel_mmad_slm_7x7_rep4() {} - - KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - ParamsKey GetSupportedKey() const override; - -protected: - JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override; - DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override; - bool Validate(const Params& p, const optional_params& o) const override; - WeightsLayout GetPreferredWeightsLayout(const convolution_params &) const override { - return WeightsLayout::os_is_yx_isa8_osv8_isv4; - } -}; -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_selector.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_selector.cpp index e6a954d..e002c9f 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_selector.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_selector.cpp @@ -33,9 +33,6 @@ #include "convolution_kernel_bfyx_1x1_gemm_buf.h" #include "convolution_kernel_winograd_2x3_s1_fused.h" #include "convolution_kernel_winograd_6x3_s1_fused.h" -#include "convolution_kernel_mmad.h" -#include "convolution_kernel_mmad_blocks.h" -#include "convolution_kernel_imad_byxf_af32_depthwise.h" #include "convolution_kernel_bfyx_depthwise_weights_lwg.h" #include "convolution_kernel_imad.h" #include "convolution_kernel_fs_byx_fsv32.h" @@ -51,7 +48,6 @@ #include "deformable_convolution_kernel_bfyx_interp.h" #include "convolution_kernel_b_fs_zyx_fsv16_fp32.h" #include "convolution_kernel_b_fs_zyx_fsv16_fp16.h" -#include "convolution_kernel_imad_byxf_af32_1x1.h" #include "convolution_kernel_imad_b_fs_yx_fsv4_1x1.h" #include "convolution_kernel_imad_b_fs_yx_fsv4_dw.hpp" #include "convolution_kernel_mmad_bfyx_to_b_fs_yx_fsv4.h" @@ -119,12 +115,6 @@ convolution_kernel_selector::convolution_kernel_selector() { Attach(); Attach(); - // byxf_af32 int8 - Attach(); - Attach(); - Attach(); - Attach(); - // b_fs_yx_fsv4 kernels Attach(); Attach(); diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/deconvolution/deconvolution_kernel_imad_along_f_tile_bfx.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/deconvolution/deconvolution_kernel_imad_along_f_tile_bfx.cpp index 8eae48e..e7e49ab 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/deconvolution/deconvolution_kernel_imad_along_f_tile_bfx.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/deconvolution/deconvolution_kernel_imad_along_f_tile_bfx.cpp @@ -55,9 +55,6 @@ ParamsKey DeconvolutionKernel_imad_along_f_tile_bfx::GetSupportedKey() const { k.EnableInputLayout(DataLayout::bs_fs_zyx_bsv16_fsv16); k.EnableOutputLayout(DataLayout::bs_fs_zyx_bsv16_fsv16); - k.EnableInputLayout(DataLayout::byxf_af32); - k.EnableOutputLayout(DataLayout::byxf_af32); - k.EnableDifferentTypes(); k.EnableDifferentInputWeightsTypes(); k.EnableBatching(); @@ -179,9 +176,6 
@@ JitConstants DeconvolutionKernel_imad_along_f_tile_bfx::GetJitConstants(const de input_tile_ifm_pitch = zyx_pitch_factor * 16 * 16; } input_in_tile_batch_pitch = 16; - } else if (in_layout == DataLayout::byxf_af32) { - input_tile_ifm_pitch = tile_ifm; - input_in_tile_batch_pitch = zyx_pitch_factor * Align(in.Feature().LogicalDimPadded(), 32); } jit.AddConstant(MakeJitConstant("INPUT_VALID_TILE_IFM_PITCH", input_tile_ifm_pitch != 0)); @@ -242,8 +236,7 @@ size_t DeconvolutionKernel_imad_along_f_tile_bfx::GetTileIFM(const deconvolution fsv = 16; } if (params.inputs[0].GetLayout() == DataLayout::b_fs_yx_fsv32 - || params.inputs[0].GetLayout() == DataLayout::b_fs_zyx_fsv32 - || params.inputs[0].GetLayout() == DataLayout::byxf_af32) { + || params.inputs[0].GetLayout() == DataLayout::b_fs_zyx_fsv32) { fsv = 32; } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/deconvolution/deconvolution_kernel_imad_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/deconvolution/deconvolution_kernel_imad_ref.cpp index 840d83a..da9b46f 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/deconvolution/deconvolution_kernel_imad_ref.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/deconvolution/deconvolution_kernel_imad_ref.cpp @@ -41,7 +41,6 @@ ParamsKey DeconvolutionKernel_imad_ref::GetSupportedKey() const { k.EnableInputLayout(DataLayout::bs_fs_zyx_bsv16_fsv16); k.EnableInputLayout(DataLayout::b_fs_yx_fsv32); k.EnableInputLayout(DataLayout::b_fs_zyx_fsv32); - k.EnableInputLayout(DataLayout::byxf_af32); k.EnableAllOutputLayout(); k.EnableDifferentTypes(); diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_vload8.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_vload8.cpp index 52a3790..63021ed 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_vload8.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_vload8.cpp @@ -43,14 +43,12 @@ bool EltwiseKernel_vload8::Validate(const Params& params, const optional_params& const auto& ewParams = static_cast(params); for (size_t i = 0; i < ewParams.inputs.size(); i++) { - if (ewParams.inputs[i].GetLayout() == DataLayout::fs_bs_yx_bsv4_fsv32 || - (ewParams.inputs[i].GetLayout() == DataLayout::b_fs_yx_fsv16 && ewParams.inputs[i].Feature().v % 16 != 0) || + if ((ewParams.inputs[i].GetLayout() == DataLayout::b_fs_yx_fsv16 && ewParams.inputs[i].Feature().v % 16 != 0) || (ewParams.inputs[i].GetLayout() == DataLayout::b_fs_zyx_fsv16 && ewParams.inputs[i].Feature().v % 16 != 0) || ewParams.inputs[i].GetLayout() == DataLayout::fs_b_yx_fsv32) return false; } - if (ewParams.output.GetLayout() == DataLayout::fs_bs_yx_bsv4_fsv32 || - (ewParams.output.GetLayout() == DataLayout::b_fs_yx_fsv16 && ewParams.output.Feature().v % 16 != 0) || + if ((ewParams.output.GetLayout() == DataLayout::b_fs_yx_fsv16 && ewParams.output.Feature().v % 16 != 0) || (ewParams.output.GetLayout() == DataLayout::b_fs_zyx_fsv16 && ewParams.output.Feature().v % 16 != 0) || ewParams.output.GetLayout() == DataLayout::fs_b_yx_fsv32) return false; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_mmad.cpp 
b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_mmad.cpp index 441a3a4..ceb6dc1 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_mmad.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_mmad.cpp @@ -37,7 +37,6 @@ ParamsKey FullyConnectedKernelMMAD::GetSupportedKey() const { k.EnableDifferentTypes(); k.EnableInputLayout(DataLayout::bfyx); - k.EnableInputLayout(DataLayout::byxf_af32); k.EnableInputLayout(DataLayout::b_fs_yx_fsv32); k.EnableInputLayout(DataLayout::b_fs_zyx_fsv32); k.EnableOutputLayout(DataLayout::bf); @@ -129,7 +128,7 @@ JitConstants FullyConnectedKernelMMAD::GetJitConstants(const fully_connected_par size_t input_y_pitch = input.Y().pitch; size_t input_z_pitch = input.Z().pitch; - if (input.GetLayout() == DataLayout::byxf_af32 || input.GetLayout() == DataLayout::bfyx) { + if (input.GetLayout() == DataLayout::bfyx) { jit.AddConstant(MakeJitConstant("MMAD_INPUT_FBLOCK_PITCH", 32)); } else if (input.GetLayout() == DataLayout::b_fs_yx_fsv32 || input.GetLayout() == DataLayout::b_fs_zyx_fsv32) { input_x_pitch = 32; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_base.cpp index c5fa8ca..d7821a6 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_base.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_base.cpp @@ -136,7 +136,7 @@ PoolingKernelBase::DispatchData PoolingKernelBase::SetDefault(const pooling_para kd.fp16UnitUsed = params.inputs[0].GetDType() == Datatype::F16; if (output.GetLayout() == DataLayout::bfyx || output.GetLayout() == DataLayout::b_fs_yx_fsv4 || - output.GetLayout() == DataLayout::byxf || output.GetLayout() == DataLayout::byxf_af32 || + output.GetLayout() == DataLayout::byxf || output.GetLayout() == DataLayout::bfzyx || output.GetLayout() == DataLayout::b_fs_zyx_fsv16 || output.GetLayout() == DataLayout::bs_fs_zyx_bsv16_fsv16) { // Determine global work sizes. 
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_b_fs_yx_fsv4.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_b_fs_yx_fsv4.cpp index 6060236..6375f73 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_b_fs_yx_fsv4.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_b_fs_yx_fsv4.cpp @@ -27,7 +27,6 @@ ParamsKey PoolingKerneGPU_b_fs_yx_fsv4::GetSupportedKey() const { k.EnableInputLayout(DataLayout::b_fs_yx_fsv4); k.EnableOutputLayout(DataLayout::b_fs_yx_fsv4); k.EnableOutputLayout(DataLayout::bfyx); - k.EnableOutputLayout(DataLayout::byxf_af32); k.EnableTensorOffset(); k.EnableTensorPitches(); k.EnableBatching(); diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_byxf_af32.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_byxf_af32.cpp deleted file mode 100644 index 70d1d65..0000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_byxf_af32.cpp +++ /dev/null @@ -1,89 +0,0 @@ -// Copyright (c) 2020 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- - -#include "pooling_kernel_gpu_byxf_af32.h" - -namespace kernel_selector { -ParamsKey PoolingKerneGPU_byxf_af32::GetSupportedKey() const { - ParamsKey k; - k.EnableInputDataType(Datatype::INT8); - k.EnableInputDataType(Datatype::UINT8); - k.EnableOutputDataType(Datatype::INT8); - k.EnableOutputDataType(Datatype::UINT8); - k.EnableOutputDataType(Datatype::F16); - k.EnableOutputDataType(Datatype::F32); - k.EnableInputLayout(DataLayout::byxf_af32); - k.EnableOutputLayout(DataLayout::byxf_af32); - k.EnableTensorOffset(); - k.EnableTensorPitches(); - k.EnableBatching(); - k.EnablePoolType(PoolType::MAX); - k.EnablePoolType(PoolType::AVG); - k.EnablePoolRemainder(PoolRemainder::FLOOR); - k.EnablePoolRemainder(PoolRemainder::CEIL); - k.EnablePoolKernelDividerMode(KernelDividerMode::FIXED); - k.EnablePoolKernelDividerMode(KernelDividerMode::DYNAMIC); - k.EnablePoolKernelDividerMode(KernelDividerMode::DYNAMIC_WITH_PADDING); - k.EnableDifferentTypes(); - return k; -} - -PoolingKernelBase::DispatchData PoolingKerneGPU_byxf_af32::SetDefault(const pooling_params& params) const { - constexpr int simdSize = 8; - - DispatchData runInfo = PoolingKernelBase::SetDefault(params); - - runInfo.gws0 = params.output.X().v; - runInfo.gws1 = params.output.Y().v; - // we got byxf_af32 format, so if we process 4 features per workitem, that means we process 32 per simd, so divide - // by 4 and we end up with 8 - runInfo.gws2 = (RoundUp(params.output.Feature().v, 32) * params.output.Batch().v) / 4; - - runInfo.lws0 = 1; - runInfo.lws1 = 1; - runInfo.lws2 = simdSize; - - return runInfo; -} - -JitConstants PoolingKerneGPU_byxf_af32::GetJitConstants(const pooling_params& params, DispatchData kd) const { - JitConstants jit = PoolingKernelBase::GetJitConstants(params, kd); - - jit.AddConstant(MakeJitConstant("AS_INPUT_TYPE(val)", "as_" + toCLType(params.inputs[0].GetDType()) + "4(val)")); - jit.Merge(MakeTypeJitConstants(GetActivationType(params), "ACTIVATION")); - jit.Merge(MakeTypeJitConstants(GetAccumulatorType(params), "ACCUMULATOR")); - - if (!params.fused_ops.empty()) { - auto input_dt = GetActivationType(params); - FusedOpsConfiguration conf = {"", - {"b", "f", "y", "x"}, - "fused_pool_result", - input_dt, - 4, - LoadType::LT_UNALIGNED, - BoundaryCheck::ENABLED, - IndexType::TENSOR_COORD, - Tensor::DataChannelName::FEATURE}; - jit.Merge(MakeFusedOpsJitConstants(params, { conf })); - } - - return jit; -} - - -KernelsData PoolingKerneGPU_byxf_af32::GetKernelsData(const Params& params, const optional_params& options) const { - return GetCommonKernelsData(params, options, FORCE_PRIORITY_1); -} -} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_byxf_af32.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_byxf_af32.h deleted file mode 100644 index 1ffc94b..0000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_byxf_af32.h +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright (c) 2020 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "pooling_kernel_base.h" -#include - -namespace kernel_selector { -class PoolingKerneGPU_byxf_af32 : public PoolingKernelBase { -public: - PoolingKerneGPU_byxf_af32() : PoolingKernelBase("pooling_gpu_byxf_af32") {} - virtual ~PoolingKerneGPU_byxf_af32() {} - - KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - ParamsKey GetSupportedKey() const override; - JitConstants GetJitConstants(const pooling_params& params, DispatchData kd) const override; - DispatchData SetDefault(const pooling_params& params) const override; - std::vector GetSupportedFusedOps() const override { - return { FusedOpType::ELTWISE, - FusedOpType::QUANTIZE, - FusedOpType::SCALE, - FusedOpType::ACTIVATION }; - } -}; -} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_int8_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_int8_ref.cpp index beedfe9..9df0eba 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_int8_ref.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_int8_ref.cpp @@ -29,7 +29,6 @@ ParamsKey PoolingKernelGPUInt8Ref::GetSupportedKey() const { k.EnableInputLayout(DataLayout::bfzyx); k.EnableInputLayout(DataLayout::yxfb); k.EnableInputLayout(DataLayout::byxf); - k.EnableInputLayout(DataLayout::byxf_af32); k.EnableInputLayout(DataLayout::b_fs_yx_fsv4); k.EnableInputLayout(DataLayout::b_fs_yx_fsv32); k.EnableInputLayout(DataLayout::b_fs_zyx_fsv32); @@ -38,7 +37,6 @@ ParamsKey PoolingKernelGPUInt8Ref::GetSupportedKey() const { k.EnableOutputLayout(DataLayout::bfzyx); k.EnableOutputLayout(DataLayout::yxfb); k.EnableOutputLayout(DataLayout::byxf); - k.EnableOutputLayout(DataLayout::byxf_af32); k.EnableOutputLayout(DataLayout::b_fs_yx_fsv4); k.EnableOutputLayout(DataLayout::b_fs_yx_fsv32); k.EnableOutputLayout(DataLayout::b_fs_zyx_fsv32); diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_selector.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_selector.cpp index 27305ac..263f63a 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_selector.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_selector.cpp @@ -18,7 +18,6 @@ #include "pooling_kernel_gpu_byxf_opt.h" #include "pooling_kernel_gpu_bfyx_block_opt.h" #include "pooling_kernel_gpu_byxf_padding_opt.h" -#include "pooling_kernel_gpu_byxf_af32.h" #include "pooling_kernel_gpu_int8_ref.h" #include "pooling_kernel_gpu_b_fs_yx_fsv4.h" #include "pooling_kernel_gpu_fs_b_yx_fsv32.h" @@ -35,7 +34,6 @@ pooling_kernel_selector::pooling_kernel_selector() { Attach(); Attach(); Attach(); - Attach(); Attach(); Attach(); Attach(); diff --git 
a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/quantize/quantize_kernel_scale_shift_opt.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/quantize/quantize_kernel_scale_shift_opt.cpp index 51a9600..6b52142 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/quantize/quantize_kernel_scale_shift_opt.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/quantize/quantize_kernel_scale_shift_opt.cpp @@ -40,14 +40,12 @@ ParamsKey QuantizeKernelScaleShift::GetSupportedKey() const { k.EnableInputLayout(DataLayout::b_fs_yx_fsv4); k.EnableInputLayout(DataLayout::b_fs_yx_fsv32); k.EnableInputLayout(DataLayout::b_fs_zyx_fsv32); - k.EnableInputLayout(DataLayout::byxf_af32); k.EnableInputLayout(DataLayout::bs_fs_yx_bsv16_fsv16); k.EnableOutputLayout(DataLayout::bfyx); k.EnableOutputLayout(DataLayout::yxfb); k.EnableOutputLayout(DataLayout::bfzyx); k.EnableOutputLayout(DataLayout::b_fs_yx_fsv16); k.EnableOutputLayout(DataLayout::b_fs_zyx_fsv16); - k.EnableOutputLayout(DataLayout::byxf_af32); k.EnableOutputLayout(DataLayout::b_fs_yx_fsv4); k.EnableOutputLayout(DataLayout::b_fs_yx_fsv32); k.EnableOutputLayout(DataLayout::b_fs_zyx_fsv32); diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_byxf_f32_to_byx8_f4_i8.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_byxf_f32_to_byx8_f4_i8.cpp deleted file mode 100644 index b97e7e2..0000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_byxf_f32_to_byx8_f4_i8.cpp +++ /dev/null @@ -1,87 +0,0 @@ -// Copyright (c) 2016 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- - -#include "reorder_kernel_byxf_f32_to_byx8_f4_i8.h" -#include "kernel_selector_utils.h" - -namespace kernel_selector { -ParamsKey reorder_kernel_byxf_f32_to_byx8_f4_i8::GetSupportedKey() const { - ParamsKey k; - k.EnableInputDataType(Datatype::F32); - k.EnableOutputDataType(Datatype::INT8); - k.EnableDifferentTypes(); - k.EnableInputLayout(DataLayout::byxf); - k.EnableOutputLayout(DataLayout::byx8_f4); - k.EnableTensorOffset(); - k.EnableTensorPitches(); - k.EnableBatching(); - return k; -} - -bool reorder_kernel_byxf_f32_to_byx8_f4_i8::Validate(const Params& p, const optional_params& o) const { - if (!ReorderKernelBase::Validate(p, o)) { - return false; - } - - const reorder_params& params = static_cast(p); - - if (params.output.X().v % 16 != 0) - return false; - - if (params.inputs[0].Feature().v != 3) - return false; - - if (params.mode == MeanSubtractMode::IN_BUFFER && params.mean.LogicalSize() != params.inputs[0].Feature().v) - return false; - - return true; -} - -size_t static get_wg_batch_size(const reorder_params& params) { - if (params.inputs[0].Batch().v % 16 == 0) - return 16; - return 1; -} - -reorder_kernel_byxf_f32_to_byx8_f4_i8::DispatchData reorder_kernel_byxf_f32_to_byx8_f4_i8::SetDefault( - const reorder_params& params) const { - DispatchData kd; - - const auto& input = params.inputs[0]; - - kd.gws0 = input.X().v; - kd.gws1 = input.Y().v; - kd.gws2 = input.Batch().v; - - kd.lws0 = 16; - kd.lws1 = 1; - kd.lws2 = get_wg_batch_size(params); - - return kd; -} - -JitConstants reorder_kernel_byxf_f32_to_byx8_f4_i8::GetJitConstants(const reorder_params& params) const { - auto jit = ReorderKernelBase::GetJitConstants(params); - jit.Merge(GetTensorFriendlyWorkGroupsJit(params.inputs[0])); - jit.AddConstant(MakeJitConstant("WG_BATCH_SIZE", get_wg_batch_size(params))); - return jit; -} - -KernelsData reorder_kernel_byxf_f32_to_byx8_f4_i8::GetKernelsData(const Params& params, - const optional_params& options) const { - const reorder_params& orgParams = static_cast(params); - return GetCommonKernelsData(orgParams, options, FORCE_PRIORITY_5); -} -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_byxf_f32_to_byx8_f4_i8.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_byxf_f32_to_byx8_f4_i8.h deleted file mode 100644 index 032ff8f..0000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_byxf_f32_to_byx8_f4_i8.h +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright (c) 2016 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- - -#pragma once - -#include "reorder_kernel_base.h" - -namespace kernel_selector { -class reorder_kernel_byxf_f32_to_byx8_f4_i8 : public ReorderKernelBase { -public: - reorder_kernel_byxf_f32_to_byx8_f4_i8() : ReorderKernelBase("reorder_data_byxf_f32_to_byx8_f4_i8") {} - virtual ~reorder_kernel_byxf_f32_to_byx8_f4_i8() {} - - bool Validate(const Params& p, const optional_params& o) const override; - DispatchData SetDefault(const reorder_params& params) const override; - KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - ParamsKey GetSupportedKey() const override; - JitConstants GetJitConstants(const reorder_params& params) const override; -}; -} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_fast_b1.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_fast_b1.cpp index 376f591..9bdc21c 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_fast_b1.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_fast_b1.cpp @@ -33,7 +33,6 @@ ParamsKey ReorderKernelFastBatch1::GetSupportedKey() const { k.EnableInputLayout(DataLayout::bfwzyx); k.EnableInputLayout(DataLayout::bs_f_bsv8__af8); k.EnableInputLayout(DataLayout::bs_f_bsv16__af8); - k.EnableInputLayout(DataLayout::bf8_xy16); k.EnableInputLayout(DataLayout::b_fs_yx_fsv16); k.EnableInputLayout(DataLayout::b_fs_zyx_fsv16); @@ -45,7 +44,6 @@ ParamsKey ReorderKernelFastBatch1::GetSupportedKey() const { k.EnableOutputLayout(DataLayout::bfwzyx); k.EnableOutputLayout(DataLayout::bs_f_bsv8__af8); k.EnableOutputLayout(DataLayout::bs_f_bsv16__af8); - k.EnableOutputLayout(DataLayout::bf8_xy16); k.EnableOutputLayout(DataLayout::b_fs_yx_fsv16); k.EnableOutputLayout(DataLayout::b_fs_zyx_fsv16); diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_selector.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_selector.cpp index fd09861..c3ffb20 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_selector.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_selector.cpp @@ -19,7 +19,6 @@ #include "reorder_from_winograd_2x3_kernel.h" #include "reorder_to_winograd_2x3_kernel.h" #include "reorder_kernel_to_yxfb_batched.h" -#include "reorder_kernel_byxf_f32_to_byx8_f4_i8.h" #include "reorder_kernel_binary.h" #include "reorder_biplanar_nv12.h" #include "reorder_kernel_fs_b_yx_fsv32_to_bfyx.h" @@ -33,7 +32,6 @@ reorder_kernel_selector::reorder_kernel_selector() { Attach(); Attach(); Attach(); - Attach(); Attach(); Attach(); } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/resample/resample_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/resample/resample_kernel_ref.cpp index 2ee687f..d7d7484 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/resample/resample_kernel_ref.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/resample/resample_kernel_ref.cpp @@ -61,8 +61,6 @@ static size_t packing_factor(const resample_params& params) { return 16; case DataLayout::b_fs_yx_fsv4: return 4; - case DataLayout::byxf_af32: - return 16; default: break; } diff --git 
a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_1x1.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_1x1.cl index 53ac874..5e7f52a 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_1x1.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_1x1.cl @@ -17,7 +17,7 @@ #if FP16_UNIT_USED #define ALIGNED_BLOCK_READ8(ptr, byte_offset) as_half8(intel_sub_group_block_read_us8((const __global ushort*)(ptr) + (byte_offset))) - + #define MULTIPLY_BLOCKS_16x8_8x16(_result, _blockA, _blockB) \ { \ const half16 acol0 = TRANSPOSE_BLOCK_16_FP16_HALF_TYPE( _blockA.s0 ); \ @@ -64,9 +64,9 @@ __attribute__((intel_reqd_sub_group_size(16))) KERNEL(convolution_bfyx_1x1)( - __global INPUT0_TYPE* input, - __global OUTPUT_TYPE* output, - __global FILTER_TYPE* weights, + __global INPUT0_TYPE* input, + __global OUTPUT_TYPE* output, + __global FILTER_TYPE* weights, #if BIAS_TERM __global BIAS_TYPE* biases, #endif @@ -107,10 +107,10 @@ KERNEL(convolution_bfyx_1x1)( { MAKE_VECTOR_TYPE(UNIT_TYPE, 8) blockA00; MAKE_VECTOR_TYPE(UNIT_TYPE, 8) blockB00; - + uint input_idx = input_offset + k * 8 * xy_block_num * 16; uint filter_idx = filter_offset + k * 8 * 16; - + blockA00 = ALIGNED_BLOCK_READ8(input, input_idx); blockB00 = ALIGNED_BLOCK_READ8(weights, filter_idx); @@ -124,11 +124,7 @@ KERNEL(convolution_bfyx_1x1)( for(uint i = 0; i < 16; i++) { - #if OUTPUT_LAYOUT_BF8_XY16 - const uint dst_index = GET_DATA_BF8_XY16_INDEX(OUTPUT, b, group_f+i, y, x) + out_split_offset; - #else const uint dst_index = GET_DATA_INDEX(OUTPUT, b, group_f+i, y, x) + out_split_offset; - #endif #if LEFTOVERS if(group_f+i < OUTPUT_FEATURE_NUM) #endif diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_byxf_af32_depthwise.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_byxf_af32_depthwise.cl deleted file mode 100644 index 3c94e5b..0000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_byxf_af32_depthwise.cl +++ /dev/null @@ -1,214 +0,0 @@ -// Copyright (c) 2016-2020 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "include/common.cl" - -#include "include/data_types.cl" -#include "include/fetch.cl" -#include "include/mmad.cl" - -#if STRIDE_SIZE_Y == DILATION_SIZE_Y - #define BLOCK_Y_SIZE (FILTER_SIZE_Y + (SPLIT_Y - 1)) - #define LOAD_Y_WITH_STRIDES -#else - #define BLOCK_Y_SIZE ((SPLIT_Y - 1) * STRIDE_SIZE_Y + (FILTER_SIZE_Y - 1) * (DILATION_SIZE_Y - 1) + FILTER_SIZE_Y) -#endif - -#if STRIDE_SIZE_X == DILATION_SIZE_X - #define FILTER_SIZE_X_PRELOAD FILTER_SIZE_X - #define LOAD_X_WITH_STRIDES -#else - #define FILTER_SIZE_X_PRELOAD FILTER_SIZE_X - #define LOAD_X_WITH_STRIDES - #define DONT_USE_X_SHIFTS -#endif - -__attribute__((intel_reqd_sub_group_size(SIMD_SIZE))) -KERNEL(convolution_gpu_byxf_af32_depthwise)( - __global INPUT0_TYPE* input, - __global OUTPUT_TYPE* output, - __global FILTER_TYPE* weights, -#if BIAS_TERM - __global BIAS_TYPE* biases, -#endif -#if HAS_FUSED_OPS_DECLS - FUSED_OPS_DECLS, -#endif - uint split_idx) -{ - const uint x = get_global_id(1) * OUT_BLOCK_WIDTH; - const uint y = get_global_id(2) * SPLIT_Y; -#if OUTPUT_BATCH_NUM == 1 - const uint of = get_global_id(0); - const uint b = 0; -#else - const uint of = (uint)get_global_id(0) % ALIGNED_OFM; - const uint b = (uint)get_global_id(0) / ALIGNED_OFM; -#endif - const uint g = of; - - if (of >= OUTPUT_FEATURE_NUM) - return; - - int dotProd[SPLIT_Y] = {0}; - OUTPUT_TYPE out[SPLIT_Y]; - const int input_x = x * STRIDE_SIZE_X - PADDING_SIZE_X; - const int input_y = y * STRIDE_SIZE_Y - PADDING_SIZE_Y; - - const uint filter_offset = g*FILTER_GROUPS_PITCH; - const uint input_offset = b*INPUT0_BATCH_PITCH + INPUT0_OFFSET + g*FILTER_IFM_NUM; - - // read all weights - FILTER_TYPE w[FILTER_IFM_PITCH]; - __attribute__((opencl_unroll_hint(FILTER_SIZE_Y))) - for (int j = 0; j < FILTER_SIZE_Y; j++) { - __attribute__((opencl_unroll_hint(FILTER_SIZE_X))) - for (int i = 0; i < FILTER_SIZE_X; i++) { - w[j * FILTER_SIZE_X + i] = weights[filter_offset + j * FILTER_Y_PITCH + i * FILTER_X_PITCH]; - } - } - - // initial input read - INPUT0_TYPE in[FILTER_SIZE_X_PRELOAD * BLOCK_Y_SIZE]; - __attribute__((opencl_unroll_hint(BLOCK_Y_SIZE))) - for (int i = 0; i < BLOCK_Y_SIZE; i++) { - __attribute__((opencl_unroll_hint(FILTER_SIZE_X_PRELOAD))) - for (int j = 0; j < FILTER_SIZE_X_PRELOAD; j++) { -#ifdef LOAD_Y_WITH_STRIDES - int input_offset_y = input_y + i * DILATION_SIZE_Y; -#else - int input_offset_y = input_y + i; -#endif -#ifdef LOAD_X_WITH_STRIDES - int input_offset_x = input_x + j * DILATION_SIZE_X; -#else - int input_offset_x = input_x + j; -#endif - uint input_idx = input_offset + (uint)input_offset_x * INPUT0_X_PITCH + (uint)input_offset_y * INPUT0_Y_PITCH; - in[i * FILTER_SIZE_X_PRELOAD + j] = input[input_idx]; - } - } - -#if HAS_FUSED_OPS && FUSED_OPS_CAN_USE_PRELOAD - FUSED_OPS_PRELOAD; -#endif - - for (int l = 0; l < OUT_BLOCK_WIDTH; l++) { - //calculate dotproduct - __attribute__((opencl_unroll_hint(SPLIT_Y))) - for (int i = 0; i < SPLIT_Y; i++) { - __attribute__((opencl_unroll_hint(FILTER_IFM_PITCH))) - for (int j = 0; j < FILTER_IFM_PITCH; j++) { -#if defined(LOAD_X_WITH_STRIDES) && defined(LOAD_Y_WITH_STRIDES) - const uint start_pos_y = i * FILTER_SIZE_X_PRELOAD; - dotProd[i] += (int)in[start_pos_y + j] * (int)w[j]; -#elif defined(LOAD_X_WITH_STRIDES) && !defined(LOAD_Y_WITH_STRIDES) - const uint start_pos_y = i * STRIDE_SIZE_Y * FILTER_SIZE_X_PRELOAD; - const uint pos_y = start_pos_y + (j / FILTER_SIZE_X) * DILATION_SIZE_Y * FILTER_SIZE_X_PRELOAD; - const uint pos_x = (j % FILTER_SIZE_X); - dotProd[i] += (int)in[pos_y + pos_x] * 
(int)w[j]; -#elif defined(LOAD_Y_WITH_STRIDES) && !defined(LOAD_X_WITH_STRIDES) - const uint start_pos_y = i * FILTER_SIZE_X_PRELOAD; - const uint pos_y = start_pos_y + (j / FILTER_SIZE_X) * FILTER_SIZE_X_PRELOAD; - const uint pos_x = (j % FILTER_SIZE_X) * DILATION_SIZE_X; - dotProd[i] += (int)in[pos_y + pos_x] * (int)w[j]; -#else - const uint start_pos_y = i * STRIDE_SIZE_Y * FILTER_SIZE_X_PRELOAD; - const uint pos_y = start_pos_y + (j / FILTER_SIZE_X) * DILATION_SIZE_Y * FILTER_SIZE_X_PRELOAD; - const uint pos_x = (j % FILTER_SIZE_X) * DILATION_SIZE_X; - dotProd[i] += (int)in[pos_y + pos_x] * (int)w[j]; -#endif // defined(LOAD_X_WITH_STRIDES) && defined(LOAD_Y_WITH_STRIDES) - } - } - - __attribute__((opencl_unroll_hint(BLOCK_Y_SIZE))) - for (int i = 0; i < BLOCK_Y_SIZE; i++) { - // inputs shift -#ifndef DONT_USE_X_SHIFTS -#if (FILTER_SIZE_X_PRELOAD - STRIDE_SIZE_X) > 0 - __attribute__((opencl_unroll_hint(FILTER_SIZE_X_PRELOAD - STRIDE_SIZE_X))) -#endif - for (int j = 0; j < FILTER_SIZE_X_PRELOAD - STRIDE_SIZE_X; j++) { - in[i * FILTER_SIZE_X_PRELOAD + j] = in[i * FILTER_SIZE_X_PRELOAD + j + STRIDE_SIZE_X]; - } -#endif - - // read additional inputs -#ifdef LOAD_Y_WITH_STRIDES - int input_offset_y = input_y + i * DILATION_SIZE_Y; -#else - int input_offset_y = input_y + i; -#endif // LOAD_Y_WITH_STRIDES - -#if defined(DONT_USE_X_SHIFTS) - __attribute__((opencl_unroll_hint(FILTER_SIZE_X_PRELOAD))) - for (int j = 0; j < FILTER_SIZE_X_PRELOAD; j++) { - int input_offset_x = input_x + ((l + 1) * STRIDE_SIZE_X) + j * DILATION_SIZE_X; - uint input_idx = input_offset + (uint)input_offset_x * INPUT0_X_PITCH + (uint)input_offset_y * INPUT0_Y_PITCH; - in[i * FILTER_SIZE_X_PRELOAD + j] = input[input_idx]; - } - -#else - { - int input_offset_x = input_x + ((l + 1) * STRIDE_SIZE_X) + (FILTER_SIZE_X - 1) * DILATION_SIZE_X; - uint input_idx = input_offset + (uint)input_offset_x * INPUT0_X_PITCH + (uint)input_offset_y * INPUT0_Y_PITCH; - in[i * FILTER_SIZE_X_PRELOAD + FILTER_SIZE_X_PRELOAD - 1] = input[input_idx]; - } -#endif // defined(DONT_USE_X_SHIFTS) - } - - __attribute__((opencl_unroll_hint(SPLIT_Y))) - for (int m = 0; m < SPLIT_Y; m++) { -#if BIAS_TERM - #if BIAS_PER_OUTPUT - #if OUTPUT_LAYOUT_BYXF_AF32 == 1 - const uint bias_index = GET_DATA_INDEX(BIAS, b, of, y + m, x + l); - #elif OUTPUT_LAYOUT_B_FS_YX_FSV4 == 1 - const uint bias_index = GET_DATA_B_FS_YX_FSV4_INDEX(BIAS, b, of, y + m, x + l); - #else - #error "Incorrect output layout" - #endif -#elif BIAS_PER_OFM - const uint bias_index = of; -#endif - // TODO: Maybe half should be supported as well. 
- float res = (float)dotProd[m] + biases[bias_index]; -#else - float res = (float)dotProd[m]; -#endif - dotProd[m] = 0; - -#if HAS_FUSED_OPS -#if FUSED_OPS_CAN_USE_PRELOAD - FUSED_OPS_CALC; -#else - FUSED_OPS; -#endif - out[m] = FUSED_OPS_RESULT; -#else - out[m] = TO_OUTPUT_TYPE(res); -#endif - } - - __attribute__((opencl_unroll_hint(SPLIT_Y))) - for (int m = 0; m < SPLIT_Y; m++) { -#ifdef SPLIT_LEFTOVERS - if (y + m >= OUTPUT_SIZE_Y) - continue; -#endif - const uint dst_index = OUTPUT_GET_INDEX(b, of, y + m, x + l); - output[dst_index] = ACTIVATION(out[m], ACTIVATION_PARAMS); - } - } // OUT_BLOCK_WIDTH -} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad.cl deleted file mode 100644 index e343f22..0000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad.cl +++ /dev/null @@ -1,124 +0,0 @@ -// Copyright (c) 2019 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "include/common.cl" - -#include "include/data_types.cl" -#include "include/fetch.cl" -#include "include/mmad.cl" - -#define FILTER_IFM_MMAD_NUM ((FILTER_IFM_NUM + 31) / 32) -#define FILTER_OFM_MMAD_NUM ((FILTER_OFM_NUM + 7) / 8) -#define FILTER_IFM_ALIGNED (FILTER_IFM_MMAD_NUM * 32) -#define FILTER_OFM_ALIGNED (FILTER_OFM_MMAD_NUM * 8) - -__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE))) -KERNEL(convolution_MMAD)( - __global INPUT0_TYPE* input, - __global OUTPUT_TYPE* output, - __global FILTER_TYPE* weights, -#if BIAS_TERM - __global BIAS_TYPE* biases, -#endif -#if HAS_FUSED_OPS_DECLS - FUSED_OPS_DECLS, -#endif - uint split_idx) -{ - const uint x = get_global_id(0); - const uint y = get_global_id(1); -#if OUTPUT_BATCH_NUM == 1 - const uint f = get_global_id(2); - const uint b = 0; -#else - const uint f = (uint)get_global_id(2) % FILTER_OFM_ALIGNED; - const uint b = (uint)get_global_id(2) / FILTER_OFM_ALIGNED; -#endif - - int dotProd = 0; - - const int input_x = x * STRIDE_SIZE_X - PADDING_SIZE_X; - const int input_y = y * STRIDE_SIZE_Y - PADDING_SIZE_Y; - - const uint in_split_offset = split_idx * INPUT0_FEATURE_PITCH * FILTER_IFM_NUM; - - const uint filter_offset = ((uint)get_group_id(2) % FILTER_OFM_MMAD_NUM) * FILTER_OFM_BLOCK_PITCH; - const uint input_offset = b*INPUT0_BATCH_PITCH + INPUT0_OFFSET + in_split_offset; - - for (uint k = 0; k < FILTER_IFM_MMAD_NUM; ++k) - { - for (uint j = 0; j < FILTER_SIZE_Y ; ++j) - { - const int input_offset_y = input_y + j * DILATION_SIZE_Y; - const bool zero_y = input_offset_y >= INPUT0_SIZE_Y || input_offset_y < 0; - - if(!zero_y) - { - for (uint i = 0; i < FILTER_SIZE_X ; ++i) - { - const int input_offset_x = input_x + i * DILATION_SIZE_X; - const bool zero_x = input_offset_x >= INPUT0_SIZE_X || input_offset_x < 0; - - if(!zero_x) - { - uint input_idx = input_offset + (uint)input_offset_x*INPUT0_X_PITCH + (uint)input_offset_y*INPUT0_Y_PITCH + k*32; - uint filter_idx = 
filter_offset + k*FILTER_Y_PITCH * FILTER_SIZE_Y + j*FILTER_Y_PITCH + i*FILTER_X_PITCH; - - PACKED_TYPE input_data = AS_PACKED_TYPE(intel_sub_group_block_read((const __global uint*)(input + input_idx))); - MAKE_VECTOR_TYPE(PACKED_TYPE, 8) activations; //activations of all lanes - activations.s0 = sub_group_broadcast(input_data, 0); - activations.s1 = sub_group_broadcast(input_data, 1); - activations.s2 = sub_group_broadcast(input_data, 2); - activations.s3 = sub_group_broadcast(input_data, 3); - activations.s4 = sub_group_broadcast(input_data, 4); - activations.s5 = sub_group_broadcast(input_data, 5); - activations.s6 = sub_group_broadcast(input_data, 6); - activations.s7 = sub_group_broadcast(input_data, 7); - - int8 weights_data = as_int8(intel_sub_group_block_read8((const __global uint*)(weights + filter_idx))); - - dotProd = MMAD_8(activations, weights_data, dotProd); - } - } - } - } - } - -#if BIAS_TERM -#if BIAS_PER_OUTPUT - const uint bias_index = GET_DATA_INDEX(BIAS, b, f, y, x); -#elif BIAS_PER_OFM - const uint bias_index = f; -#endif - float res = (float)dotProd + biases[bias_index]; -#else - float res = (float)dotProd; -#endif // BIAS_TERM - -#if HAS_FUSED_OPS - FUSED_OPS; - OUTPUT_TYPE result = FUSED_OPS_RESULT; -#else - OUTPUT_TYPE result = TO_OUTPUT_TYPE(res); -#endif - - const uint out_split_offset = split_idx * OUTPUT_FEATURE_PITCH * OUTPUT_FEATURE_NUM; - const uint dst_index = OUTPUT_GET_INDEX(b, f, y, x) + out_split_offset; - output[dst_index] = result; -} - -#undef FILTER_IFM_MMAD_NUM -#undef FILTER_OFM_MMAD_NUM -#undef FILTER_IFM_ALIGNED -#undef FILTER_OFM_ALIGNED diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_blocks.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_blocks.cl deleted file mode 100644 index d5acf18..0000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_blocks.cl +++ /dev/null @@ -1,158 +0,0 @@ -// Copyright (c) 2016-2017 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "include/common.cl" - -#include "include/data_types.cl" -#include "include/fetch.cl" -#include "include/mmad.cl" - -#define FILTER_IFM_MMAD_NUM ((FILTER_IFM_NUM + 31) / 32) -#define FILTER_OFM_MMAD_NUM ((FILTER_OFM_NUM + 7) / 8) -#define FILTER_IFM_ALIGNED (FILTER_IFM_MMAD_NUM * 32) -#define FILTER_OFM_ALIGNED (FILTER_OFM_MMAD_NUM * 8) - -__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE))) -KERNEL(convolution_MMAD_blocks)( - __global INPUT0_TYPE* input, - __global OUTPUT_TYPE* output, - __global FILTER_TYPE* weights, -#if BIAS_TERM - __global BIAS_TYPE* biases, -#endif -#if HAS_FUSED_OPS_DECLS - FUSED_OPS_DECLS, -#endif - uint split_idx) -{ - const uint x = (uint)get_global_id(0) * OUTPUT_BLOCK_WIDTH; - const uint y = (uint)get_global_id(1) * OUTPUT_BLOCK_HEIGHT; -#if OUTPUT_BATCH_NUM == 1 - const uint f = (uint)get_global_id(2); - const uint b = 0; -#else - const uint f = (uint)get_global_id(2) % FILTER_OFM_ALIGNED; - const uint b = (uint)get_global_id(2) / FILTER_OFM_ALIGNED; -#endif - - int acc[OUTPUT_BLOCK_WIDTH * OUTPUT_BLOCK_HEIGHT] = { 0 }; - PACKED_TYPE in[IN_BLOCK_ARRAY_SIZE]; - - const int input_x = x * STRIDE_SIZE_X - PADDING_SIZE_X; - const int input_y = y * STRIDE_SIZE_Y - PADDING_SIZE_Y; - - const uint in_split_offset = split_idx * INPUT0_FEATURE_PITCH * FILTER_IFM_NUM; - - const uint filter_offset = ((uint)get_group_id(2) % FILTER_OFM_MMAD_NUM) * FILTER_OFM_BLOCK_PITCH; - const uint input_offset = b*INPUT0_BATCH_PITCH + INPUT0_OFFSET + in_split_offset; - - uint in_addr = input_offset + input_x * INPUT0_X_PITCH + input_y * INPUT0_Y_PITCH; - uint filter_idx = filter_offset; - - __attribute__((opencl_unroll_hint(1))) - for (uint k = 0; k < FILTER_IFM_MMAD_NUM; ++k) - { - // preload input data - for(uint in_block_pos = 0; in_block_pos < IN_BLOCK_ARRAY_SIZE; in_block_pos++) - { - uint block_x = in_block_pos % IN_BLOCK_WIDTH; - uint block_y = in_block_pos / IN_BLOCK_WIDTH; - uint input_idx = in_addr + block_x * INPUT0_X_PITCH + block_y * INPUT0_Y_PITCH; - in[in_block_pos] = AS_PACKED_TYPE(intel_sub_group_block_read((const __global uint*)(input + input_idx))); - } - // end of preloading input data - - __attribute__((opencl_unroll_hint(FILTER_SIZE_Y))) - for (uint j = 0; j < FILTER_SIZE_Y ; ++j) - { - __attribute__((opencl_unroll_hint(FILTER_SIZE_X))) - for (uint i = 0; i < FILTER_SIZE_X ; ++i) - { - int8 weights_data = as_int8(intel_sub_group_block_read8((const __global uint*)(weights + filter_idx))); - - __attribute__((opencl_unroll_hint(OUTPUT_BLOCK_HEIGHT))) - for(uint br = 0; br < OUTPUT_BLOCK_HEIGHT; br++) - { - __attribute__((opencl_unroll_hint(OUTPUT_BLOCK_WIDTH))) - for(uint bc = 0; bc < OUTPUT_BLOCK_WIDTH; bc++) - { - PACKED_TYPE input_data = in[(br * STRIDE_SIZE_Y + j) * IN_BLOCK_WIDTH + bc * STRIDE_SIZE_X + i]; - MAKE_VECTOR_TYPE(PACKED_TYPE, 8) activations; //activations of all lanes - activations.s0 = sub_group_broadcast(input_data, 0); - activations.s1 = sub_group_broadcast(input_data, 1); - activations.s2 = sub_group_broadcast(input_data, 2); - activations.s3 = sub_group_broadcast(input_data, 3); - activations.s4 = sub_group_broadcast(input_data, 4); - activations.s5 = sub_group_broadcast(input_data, 5); - activations.s6 = sub_group_broadcast(input_data, 6); - activations.s7 = sub_group_broadcast(input_data, 7); - - acc[br * OUTPUT_BLOCK_WIDTH + bc] = MMAD_8(activations, weights_data, acc[br * OUTPUT_BLOCK_WIDTH + bc]); - } - } - filter_idx += 32*8; // 32 features per channel * 8 output features per SIMD channel - } - } - in_addr += 32; // 4 
features per channel * 8 SIMD channels - } - -#if BIAS_TERM -#if BIAS_PER_OUTPUT - const uint bias_index = GET_DATA_INDEX(BIAS, b, f, y, x); -#elif BIAS_PER_OFM - const uint bias_index = f; -#endif -#endif // BIAS_TERM - - OUTPUT_TYPE out[OUTPUT_BLOCK_WIDTH * OUTPUT_BLOCK_HEIGHT] = { 0 }; - for(uint br = 0; br < OUTPUT_BLOCK_HEIGHT; br++) - { - for(uint bc = 0; bc < OUTPUT_BLOCK_WIDTH; bc++) - { -#if BIAS_TERM - // TODO: Maybe half should be supported as well. - float res = (float)acc[br * OUTPUT_BLOCK_WIDTH + bc] + biases[bias_index]; -#else - float res = (float)acc[br * OUTPUT_BLOCK_WIDTH + bc]; -#endif -#if HAS_FUSED_OPS - FUSED_OPS; - out[br * OUTPUT_BLOCK_WIDTH + bc] = FUSED_OPS_RESULT; -#else - out[br * OUTPUT_BLOCK_WIDTH + bc] = TO_OUTPUT_TYPE(res); -#endif - } - } - - const uint out_split_offset = split_idx * OUTPUT_FEATURE_PITCH * OUTPUT_FEATURE_NUM; - for(uint br = 0; br < OUTPUT_BLOCK_HEIGHT; br++) - { - if(y + br < OUTPUT_SIZE_Y) - { - for(uint bc = 0; bc < OUTPUT_BLOCK_WIDTH; bc++) - { - if(x + bc < OUTPUT_SIZE_X) - { - const uint dst_index = OUTPUT_GET_INDEX(b, f, y+br, x+bc) + out_split_offset; - output[dst_index] = out[br * OUTPUT_BLOCK_WIDTH + bc]; - } - } - } - } -} - -#undef FILTER_IFM_MMAD_NUM -#undef FILTER_OFM_MMAD_NUM -#undef FILTER_IFM_ALIGNED -#undef FILTER_OFM_ALIGNED diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_af32_imad_1x1.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_af32_imad_1x1.cl deleted file mode 100644 index 78da56d..0000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_af32_imad_1x1.cl +++ /dev/null @@ -1,163 +0,0 @@ -// Copyright (c) 2019 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "include/common.cl" - -#include "include/data_types.cl" -#include "include/fetch.cl" -#include "include/imad.cl" - -#if defined(ACCUMULATOR_TYPE) -#undef ACCUMULATOR_TYPE -#endif - -#if QUANTIZATION_TERM -# define ACCUMULATOR_TYPE int -# define ACTIVATION_TYPE float -# define TO_ACTIVATION_TYPE(x) convert_float(x) -#else -# define ACCUMULATOR_TYPE INPUT0_TYPE -# define ACTIVATION_TYPE INPUT0_TYPE -# define TO_ACTIVATION_TYPE(x) TO_INPUT0_TYPE(x) -#endif - - -#define FILTER_IFM_SLICES_NUM ((FILTER_IFM_NUM + 31) / 32) -#define FILTER_OFM_NUM_ALIGNED ((FILTER_OFM_NUM + SUB_GROUP_SIZE - 1) / SUB_GROUP_SIZE * SUB_GROUP_SIZE) - -// we are packing 4 8bit activations per 32 bit -#define PACK 4 - -#define AS_TYPE_N_(type, n, x) as_##type##n(x) -#define AS_TYPE_N(type, n, x) AS_TYPE_N_(type, n, x) -#define AS_INPUT0_TYPE_4(x) AS_TYPE_N(INPUT0_TYPE, 4, x) - -__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE))) -KERNEL(fused_conv_eltwise_gpu_af32_imad_1x1)( - const __global PACKED_TYPE* input, - __global OUTPUT_TYPE* restrict output, - const __global uint* weights, -#if BIAS_TERM - __global BIAS_TYPE* biases, -#endif -#if HAS_FUSED_OPS_DECLS - FUSED_OPS_DECLS, -#endif - uint split_idx) -{ - const uint x = (uint)get_global_id(0) * TILE_LENGTH % OUTPUT_SIZE_X; - const uint y = (uint)get_global_id(0) * TILE_LENGTH / OUTPUT_SIZE_X; - const uint f = (((uint)get_global_id(1) * TILE_DEPTH) % FILTER_OFM_NUM_ALIGNED) / (TILE_DEPTH * SUB_GROUP_SIZE) * (TILE_DEPTH * SUB_GROUP_SIZE); - const uint b = ((uint)get_global_id(1) * TILE_DEPTH) / FILTER_OFM_NUM_ALIGNED; - const uint lid = get_sub_group_local_id(); - - const int input_x = x * STRIDE_SIZE_X - PADDING_SIZE_X; - const int input_y = y * STRIDE_SIZE_Y - PADDING_SIZE_Y; - - PACKED_TYPE input_slice[TILE_LENGTH]; - int8 weights_slice; - ACCUMULATOR_TYPE accu[TILE_LENGTH][TILE_DEPTH] = {0}; - - uint filter_idx = f * FILTER_IFM_SLICES_NUM * 32 / PACK; - uint in_addr = (INPUT0_GET_INDEX(b, 0, input_y, input_x)) / PACK; - - __attribute__((opencl_unroll_hint(1))) - for (uint k = 0; k < FILTER_IFM_SLICES_NUM; ++k) - { - // Read 32 input features for each pixel in the tile. 4 features in each int, 8 ints across SIMD - __attribute__((opencl_unroll_hint(TILE_LENGTH))) - for (uint i = 0; i < TILE_LENGTH; ++i) - { - uint tmp_addr = in_addr + i * INPUT0_X_PITCH * STRIDE_SIZE_X / PACK; - input_slice[i] = AS_PACKED_TYPE(intel_sub_group_block_read((const __global uint*)input + tmp_addr)); - } - - // Loop through TILE_DEPTH output features - __attribute__((opencl_unroll_hint(TILE_DEPTH))) - for (uint of = 0; of < TILE_DEPTH; ++of) - { - // Read 32 weights. 
8 ints, 4 weights in each int, each SIMD lane has own weghts - weights_slice = as_int8(intel_sub_group_block_read8(weights + filter_idx)); - - __attribute__((opencl_unroll_hint(TILE_LENGTH))) - for (uint i = 0; i < TILE_LENGTH; ++i) - { - PACKED_TYPE A_scalar; - A_scalar = sub_group_broadcast(input_slice[i], 0); accu[i][of] = IMAD(accu[i][of], AS_INPUT0_TYPE_4(A_scalar), as_char4(weights_slice.s0)); - A_scalar = sub_group_broadcast(input_slice[i], 1); accu[i][of] = IMAD(accu[i][of], AS_INPUT0_TYPE_4(A_scalar), as_char4(weights_slice.s1)); - A_scalar = sub_group_broadcast(input_slice[i], 2); accu[i][of] = IMAD(accu[i][of], AS_INPUT0_TYPE_4(A_scalar), as_char4(weights_slice.s2)); - A_scalar = sub_group_broadcast(input_slice[i], 3); accu[i][of] = IMAD(accu[i][of], AS_INPUT0_TYPE_4(A_scalar), as_char4(weights_slice.s3)); - A_scalar = sub_group_broadcast(input_slice[i], 4); accu[i][of] = IMAD(accu[i][of], AS_INPUT0_TYPE_4(A_scalar), as_char4(weights_slice.s4)); - A_scalar = sub_group_broadcast(input_slice[i], 5); accu[i][of] = IMAD(accu[i][of], AS_INPUT0_TYPE_4(A_scalar), as_char4(weights_slice.s5)); - A_scalar = sub_group_broadcast(input_slice[i], 6); accu[i][of] = IMAD(accu[i][of], AS_INPUT0_TYPE_4(A_scalar), as_char4(weights_slice.s6)); - A_scalar = sub_group_broadcast(input_slice[i], 7); accu[i][of] = IMAD(accu[i][of], AS_INPUT0_TYPE_4(A_scalar), as_char4(weights_slice.s7)); - } - - filter_idx += 32 * 8 / 4; // 32 features per channel * 8 SIMD channels / sizeof(int) - } - in_addr += 4 * 8 / 4; // 4 features per channel * 8 SIMD channels / sizeof(int) -> next 32 input features - } - -#if TILE_DEPTH == 8 - MAKE_VECTOR_TYPE(OUTPUT_TYPE, 8) result[TILE_LENGTH]; -#elif TILE_DEPTH == 4 - MAKE_VECTOR_TYPE(OUTPUT_TYPE, 4) result[TILE_LENGTH]; -#endif - - uint dst_index = (OUTPUT_GET_INDEX(b, f, y, x)) / PACK; - - __attribute__((opencl_unroll_hint(TILE_LENGTH))) - for (uint i = 0; i < TILE_LENGTH; ++i) - { - - __attribute__((opencl_unroll_hint(TILE_DEPTH))) - for (uint j = 0; j < TILE_DEPTH; ++j) - { - const uint f2 = f + lid * 4 + (j % 4) + (j / 4 * 32); - ACCUMULATOR_TYPE dotProd = accu[i][j]; -#if BIAS_TERM - #if BIAS_PER_OUTPUT - const uint bias_index = GET_DATA_INDEX(BIAS, b, f, y, x); - #elif BIAS_PER_OFM - const uint bias_index = f2; - #endif - ACTIVATION_TYPE res = TO_ACTIVATION_TYPE(dotProd) + TO_ACTIVATION_TYPE(biases[bias_index]); -#else - ACTIVATION_TYPE res = TO_ACTIVATION_TYPE(dotProd); -#endif //BIAS_TERM - - #if HAS_FUSED_OPS - FUSED_OPS; - result[i][j] = FUSED_OPS_RESULT; - #else - result[i][j] = TO_OUTPUT_TYPE(res); - #endif - } - } - - __attribute__((opencl_unroll_hint(TILE_LENGTH))) - for (uint i = 0; i < TILE_LENGTH; ++i) - { -#if TILE_DEPTH == 8 - intel_sub_group_block_write2((__global uint*)output + dst_index + i * OUTPUT_X_PITCH / PACK, as_uint2(result[i])); -#elif TILE_DEPTH == 4 - intel_sub_group_block_write((__global uint*)output + dst_index + i * OUTPUT_X_PITCH / PACK, as_uint(result[i])); -#endif - } -} -#undef FILTER_IFM_SLICES_NUM -#undef FILTER_OFM_NUM_ALIGNED -#undef ACCUMULATOR_TYPE -#undef ACTIVATION_TYPE -#undef TO_ACTIVATION_TYPE diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_imad.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_imad.cl index f22ddde..f470f17 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_imad.cl +++ 
b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_imad.cl @@ -129,7 +129,6 @@ KERNEL (fused_convolution_eltwise_gpu_imad)( #else in[reg] = AS_PACKED_TYPE(conv_input[in_addr]);// read SIMD_SIZE elements wide #endif - // TODO This will cause errors for byxf_af32 format on input in_addr += (INPUT0_SIZE_X + IWPAD); // move to next row down #endif } @@ -191,9 +190,7 @@ KERNEL (fused_convolution_eltwise_gpu_imad)( if(!zero_c) #endif { - #if OUTPUT_LAYOUT_BYXF_AF32 == 1 - uint out_idx = OUTPUT_GET_INDEX(batch, f, or + r, oc + c); - #elif OUTPUT_LAYOUT_B_FS_YX_FSV4 == 1 + #if OUTPUT_LAYOUT_B_FS_YX_FSV4 == 1 uint out_idx = output_idx_offset + r * output_row_size_bytes + (c*PACK); #elif OUTPUT_LAYOUT_B_FS_YX_FSV16 == 1 || OUTPUT_LAYOUT_BS_FS_YX_BSV16_FSV16 == 1 uint out_idx = OUTPUT_GET_INDEX(batch, f, or + r, oc + c); diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/include/fetch.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/include/fetch.cl index bd56709..a6af7a2 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/include/fetch.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/include/fetch.cl @@ -99,20 +99,6 @@ ((b) / (sub_group_size))*CAT(prefix, _BATCH_PITCH) \ ) -inline uint FUNC(get_bf8_xy16_index)(uint b, uint f, uint y, uint x, uint x_size, uint y_size, uint f_size, uint offset) -{ - const uint xy_idx = x + y * x_size; - const uint xy_offset = (xy_idx % 16) + (xy_idx / 16) * 16 * 8; - const uint xy_block_num = (x_size * y_size + 16 - 1) / 16; - const uint f_offset = (f % 8) * 16 + (f / 8) * xy_block_num * 16 * 8; - const uint f_block_num = (f_size + 8 - 1) / 8; - const uint b_offset = b * f_block_num * xy_block_num * 128; - - const size_t idx = offset + xy_offset + f_offset + b_offset; - - return idx; -} - inline uint FUNC(get_b_fs_yx_fsv_index)(uint b, uint f, uint y, uint x, uint x_size, uint y_size, uint f_size, uint b_size, uint b_pad_before, uint b_pad_after, @@ -495,93 +481,6 @@ inline uint FUNC(get_os_zyxi_osv16_index)(uint o, uint i, uint z, uint y, uint x CAT(prefix, _SIZE_Y), \ CAT(prefix, _SIZE_Z)) -inline uint FUNC(get_byxf_af32_index)(uint b, uint f, uint y, uint x, uint y_pitch, uint b_pitch, uint f_size, uint f_pad_before, uint f_pad_after, uint offset) -{ - const uint f_aligned_to_32 = ((f_size + 31) / 32) * 32; - const uint x_pitch = f_pad_before + f_aligned_to_32 + f_pad_after; - const uint b_offset = b * b_pitch; - const uint xy_offset = x_pitch * x + y_pitch * y; - const uint f_offset = f; - const size_t idx = offset + xy_offset + b_offset + f_offset; - return idx; -} - -#define GET_DATA_BYXF_AF32_INDEX(prefix, b, f, y, x) \ - FUNC_CALL(get_byxf_af32_index)( \ - b, f, y, x, CAT(prefix, _Y_PITCH), \ - CAT(prefix, _BATCH_PITCH), \ - CAT(prefix, _FEATURE_NUM), \ - CAT(prefix, _PAD_BEFORE_FEATURE_NUM), \ - CAT(prefix, _PAD_AFTER_FEATURE_NUM), \ - CAT(prefix, _OFFSET)) - -inline uint FUNC(get_byx8_f4_index)(uint b, uint f, uint y, uint x, - uint x_pitch, uint y_pitch, uint b_pitch, uint f_size, uint x_size, uint offset) -{ - const uint f_aligned_to_4 = ((f_size + 3) / 4) * 4; - const uint x_aligned_to_8 = ((x_size + 7) / 8) * 8; - const uint b_offset = b * b_pitch; - const uint xy_offset = x * x_pitch + y * y_pitch; - const uint f_offset = f; - const size_t idx = offset + xy_offset + b_offset + f_offset; - return idx; -} - -#define GET_DATA_BYX8_F4_INDEX(prefix, b, f, y, x) \ - FUNC_CALL(get_byx8_f4_index)( \ - b, f, y, x, 
CAT(prefix, _X_PITCH), \ - CAT(prefix, _Y_PITCH), \ - CAT(prefix, _BATCH_PITCH), \ - CAT(prefix, _FEATURE_NUM), \ - CAT(prefix, _SIZE_X), \ - CAT(prefix, _OFFSET)) - -#define GET_DATA_BF8_XY16_INDEX(prefix, b, f, y, x) \ - FUNC_CALL(get_bf8_xy16_index)( \ - b, f, y, x, CAT(prefix, _SIZE_X ), \ - CAT(prefix, _SIZE_Y), \ - CAT(prefix, _FEATURE_NUM), \ - CAT(prefix, _OFFSET)) - -inline uint FUNC(get_fs_bs_yx_bsv4_fsv32_index)(uint b, uint f, uint y, uint x, - uint x_pad_before, uint x_size, uint x_pad_after, - uint y_pad_before, uint y_size, uint y_pad_after, - uint size_f, uint size_b) -{ - const uint f_32_aligned = ((size_f + 31)/32) * 32; - const uint b_4_aligned = ((size_b + 3)/4) * 4; - const uint fsv_idx = f % 32; - const uint bsv_idx = b % 4; - const uint fs_idx = f / 32; - const uint bs_idx = b / 4; - - const uint x_pitch = 32 * 4; - const uint y_pitch = 32 * 4 * (x_pad_before + x_size + x_pad_after); - const uint bs_pitch = y_pitch * (y_pad_before + y_size + y_pad_after); - const uint fs_pitch = bs_pitch * (b_4_aligned / 4); - uint offset = x_pitch * x_pad_before + y_pitch * y_pad_before; - - size_t idx = offset + fsv_idx + bsv_idx * 32; - idx += 32*4 * x; - idx += y * y_pitch; - idx += bs_idx * bs_pitch; - idx += fs_idx * fs_pitch; - - return idx; -} - -#define GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(prefix, b, f, y, x) \ - FUNC_CALL(get_fs_bs_yx_bsv4_fsv32_index)( \ - b, f, y, x, \ - CAT(prefix, _PAD_BEFORE_SIZE_X), \ - CAT(prefix, _SIZE_X), \ - CAT(prefix, _PAD_AFTER_SIZE_X), \ - CAT(prefix, _PAD_BEFORE_SIZE_Y), \ - CAT(prefix, _SIZE_Y), \ - CAT(prefix, _PAD_AFTER_SIZE_Y), \ - CAT(prefix, _FEATURE_NUM), \ - CAT(prefix, _BATCH_NUM)) - #define GET_FILTER_GOIYX(prefix, g, o, i, y, x) \ CAT(prefix, _OFFSET) + \ (x)*CAT(prefix, _X_PITCH) + \ diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_b_fs_yx_fsv4.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_b_fs_yx_fsv4.cl index 08c4bf3..003556b 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_b_fs_yx_fsv4.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_b_fs_yx_fsv4.cl @@ -172,7 +172,7 @@ KERNEL(pooling_gpu_b_fs_yx_fsv4)( } #endif -#if OUTPUT_LAYOUT_B_FS_YX_FSV4 || OUTPUT_LAYOUT_BYXF_AF32 +#if OUTPUT_LAYOUT_B_FS_YX_FSV4 const uint output_pos = OUTPUT_GET_INDEX(b, f, y, x); #if OUTPUT_FEATURE_NUM % 4 == 0 *((__global OUTPUT_VEC4*)(output + output_pos)) = final_result; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_byxf_af32.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_byxf_af32.cl deleted file mode 100644 index b3829ec..0000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_byxf_af32.cl +++ /dev/null @@ -1,189 +0,0 @@ -// Copyright (c) 2016-2020 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- - -#include "include/include_all.cl" - -#define ACTIVATION_VEC4 MAKE_VECTOR_TYPE(ACTIVATION_TYPE, 4) -#define TO_ACTIVATION_VEC4 CAT(convert_, ACTIVATION_VEC4) - -#define ACCUMULATOR_VEC4 MAKE_VECTOR_TYPE(ACCUMULATOR_TYPE, 4) - -#define OUTPUT_VEC4 MAKE_VECTOR_TYPE(OUTPUT_TYPE, 4) -#define TO_OUTPUT_VEC4 CAT(convert_, OUTPUT_VEC4) - -#if MAX_POOLING - #define INIT_VAL ACCUMULATOR_VAL_MIN -#elif AVG_POOLING - #define INIT_VAL ACCUMULATOR_VAL_ZERO -#else - #error -#endif - -inline ACCUMULATOR_TYPE FUNC(apply_pooling)(ACCUMULATOR_TYPE tmp, ACCUMULATOR_TYPE in) -{ -#if MAX_POOLING - return ACCUMULATOR_MAX_FUNC(tmp, in); -#elif AVG_POOLING - return tmp + in; -#endif -} - -KERNEL(pooling_gpu_byxf_af32)( - const __global INPUT0_TYPE* input, - __global OUTPUT_TYPE* output -#if HAS_FUSED_OPS_DECLS - , FUSED_OPS_DECLS -#endif -) -{ - const uint x = (uint)get_global_id(0); - const uint y = (uint)get_global_id(1); - const uint bf = (uint)get_global_id(2); - // we process 4 features per workitem that's why we need to divide it - const uint aligned32_features = ((INPUT0_FEATURE_NUM + 31) / 32) * 32; - const uint f = 4 * (bf % (aligned32_features / 4)); - const uint b = bf / (aligned32_features / 4); - - typedef MAKE_VECTOR_TYPE(INPUT0_TYPE, 4) input_t; - if (x >= OUTPUT_SIZE_X) - { - return; - } - - const int offset_x = (int)x*STRIDE_SIZE_X - PADDING_SIZE_X; - const int offset_y = (int)y*STRIDE_SIZE_Y - PADDING_SIZE_Y; - - ACCUMULATOR_VEC4 result = INIT_VAL; - -#ifdef CHECK_BOUNDRY - if (offset_x + POOL_SIZE_X < 0 || offset_x >= INPUT0_SIZE_X || - offset_y + POOL_SIZE_Y < 0 || offset_y >= INPUT0_SIZE_Y) - { - return; - } - -#ifdef DYNAMIC_KERNEL_DIVIDER - uint num_elementes = 0; -#endif - - const uint batch_and_feature_offset = GET_DATA_INDEX(INPUT0, b, f, 0, 0); - for(uint j = 0; j < POOL_SIZE_Y; j++) - { - int input_offset_y = offset_y + j; - bool zero_y = input_offset_y >= INPUT0_SIZE_Y || input_offset_y < 0; - if(!zero_y) - { - for(uint i = 0; i < POOL_SIZE_X; i++) - { - int input_offset_x = offset_x + i; - bool zero = input_offset_x >= INPUT0_SIZE_X || input_offset_x < 0; - if(!zero) - { - const uint input_idx = batch_and_feature_offset + input_offset_y*INPUT0_Y_PITCH + input_offset_x*INPUT0_X_PITCH; - - input_t input_data = AS_INPUT_TYPE(intel_sub_group_block_read((const __global uint*)(input + input_idx))); - result[0] = FUNC_CALL(apply_pooling)(result[0], TO_ACCUMULATOR_TYPE(input_data[0])); - result[1] = FUNC_CALL(apply_pooling)(result[1], TO_ACCUMULATOR_TYPE(input_data[1])); - result[2] = FUNC_CALL(apply_pooling)(result[2], TO_ACCUMULATOR_TYPE(input_data[2])); - result[3] = FUNC_CALL(apply_pooling)(result[3], TO_ACCUMULATOR_TYPE(input_data[3])); - -#ifdef DYNAMIC_KERNEL_DIVIDER - num_elementes++; -#endif - } - } - } - } -#ifdef DYNAMIC_WITH_PADDING_KERNEL_DIVIDER - const int hend = min(offset_y + POOL_SIZE_Y, INPUT0_SIZE_Y + PADDING_SIZE_Y); - const int wend = min(offset_x + POOL_SIZE_X, INPUT0_SIZE_X + PADDING_SIZE_X); - const uint num_elementes = (hend - offset_y) * (wend - offset_x); -#endif -#else - uint input_idx = GET_DATA_INDEX(INPUT0, b, f, offset_y, offset_x); - - for(uint j = 0; j < POOL_SIZE_Y; j++) - { - for(uint i = 0; i < POOL_SIZE_X; i++) - { - input_t input_data = AS_INPUT_TYPE(intel_sub_group_block_read((const __global uint*)(input + input_idx))); - result[0] = FUNC_CALL(apply_pooling)(result[0], TO_ACCUMULATOR_TYPE(input_data[0])); - result[1] = FUNC_CALL(apply_pooling)(result[1], TO_ACCUMULATOR_TYPE(input_data[1])); - result[2] = FUNC_CALL(apply_pooling)(result[2], 
TO_ACCUMULATOR_TYPE(input_data[2])); - result[3] = FUNC_CALL(apply_pooling)(result[3], TO_ACCUMULATOR_TYPE(input_data[3])); - - input_idx += INPUT0_X_PITCH; - } - input_idx += (INPUT0_Y_PITCH - POOL_SIZE_X*INPUT0_X_PITCH); - } - -#if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER) - const uint num_elementes = POOL_SIZE_X*POOL_SIZE_Y; -#endif -#endif - -#if defined AVG_POOLING -#if ENABLE_ROUND - int4 not_fused_result; - for (uint i = 0; i < 4; ++i) { - #if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER) - not_fused_result[i] = convert_int(round(((float)result[i] / max(num_elementes, (uint)1))); - #else - not_fused_result[i] = convert_int(round((float)result[i] / (int)(POOL_SIZE_Y * POOL_SIZE_X))); - #endif - } -#else // ENABLE_ROUND - float4 not_fused_result; - for (uint i = 0; i < 4; ++i) { - #if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER) - not_fused_result[i] = (float)result[i] / max(num_elementes, (uint)1); - #else - not_fused_result[i] = (float)result[i] / (int)(POOL_SIZE_Y * POOL_SIZE_X); - #endif - } -#endif // ENABLE_ROUND -#else // AVG_POOLING - float4 not_fused_result = convert_float4(result); -#endif // AVG_POOLING - - OUTPUT_VEC4 final_result; -#if HAS_FUSED_OPS - ACTIVATION_VEC4 fused_pool_result = TO_ACTIVATION_VEC4(not_fused_result); - FUSED_OPS; - final_result = FUSED_OPS_RESULT; - for(uint op = 0; op < 4; op++) - { - const uint output_pos = GET_DATA_INDEX(OUTPUT, b, f+op, y, x); - output[output_pos] = final_result[op]; - } -#else - final_result = TO_OUTPUT_VEC4(not_fused_result); - for(uint op = 0; op < 4; op++) - { - const uint output_pos = GET_DATA_INDEX(OUTPUT, b, f+op, y, x); - final_result[op] = TO_OUTPUT_TYPE(ACTIVATION(not_fused_result[op], ACTIVATION_PARAMS)); - output[output_pos] = final_result[op]; - } -#endif -} - -#undef INIT_VAL -#undef ACCUMULATOR_VEC4 - -#undef ACTIVATION_VEC4 -#undef TO_ACTIVATION_VEC4 - -#undef OUTPUT_VEC4 -#undef TO_OUTPUT_VEC4 diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_int8_ref.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_int8_ref.cl index 244d32f..572b29a 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_int8_ref.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_int8_ref.cl @@ -40,7 +40,7 @@ KERNEL(pooling_gpu_int8_ref)( #endif ) { -#if OUTPUT_LAYOUT_BFYX || OUTPUT_LAYOUT_BYXF || OUTPUT_LAYOUT_BYXF_AF32 || OUTPUT_LAYOUT_B_FS_YX_FSV4 || OUTPUT_LAYOUT_BFZYX +#if OUTPUT_LAYOUT_BFYX || OUTPUT_LAYOUT_BYXF || OUTPUT_LAYOUT_B_FS_YX_FSV4 || OUTPUT_LAYOUT_BFZYX const uint x = (uint)get_global_id(0); const uint yz = (uint)get_global_id(1); #if OUTPUT_DIMS == 5 diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_ref.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_ref.cl index 3b68dd2..098d3b5 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_ref.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_ref.cl @@ -44,7 +44,7 @@ KERNEL(pooling_gpu)( ) { #if OUTPUT_LAYOUT_BFYX || OUTPUT_LAYOUT_BYXF || OUTPUT_LAYOUT_BFZYX ||\ - OUTPUT_LAYOUT_B_FS_ZYX_FSV16 || OUTPUT_LAYOUT_BS_FS_ZYX_BSV16_FSV16 || OUTPUT_LAYOUT_B_FS_YX_FSV4 || OUTPUT_LAYOUT_BYXF_AF32 + OUTPUT_LAYOUT_B_FS_ZYX_FSV16 || OUTPUT_LAYOUT_BS_FS_ZYX_BSV16_FSV16 || OUTPUT_LAYOUT_B_FS_YX_FSV4 
const uint x = (uint)get_global_id(0); #if OUTPUT_DIMS == 5 const uint y = (uint)get_global_id(1) % OUTPUT_SIZE_Y; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/reorder_biplanar_nv12.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/reorder_biplanar_nv12.cl index cb4014e..81c33bb 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/reorder_biplanar_nv12.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/reorder_biplanar_nv12.cl @@ -27,16 +27,8 @@ inline uint FUNC(get_output_index)(uint b, uint f, uint w, uint z, uint y, uint #elif defined OUTPUT_LAYOUT_BS_F_BSV8__AF8 || \ defined OUTPUT_LAYOUT_BS_F_BSV16__AF8 return GET_DATA_BS_FYX_BSV8_INDEX(OUTPUT, b, f, y, x, SUB_GROUP_SIZE); -#elif defined OUTPUT_LAYOUT_BF8_XY16 - return GET_DATA_BF8_XY16_INDEX(OUTPUT, b, f, y, x); #elif defined OUTPUT_LAYOUT_B_FS_YX_FSV16 return GET_DATA_B_FS_YX_FSV16_INDEX(OUTPUT, b, f, y, x); -#elif defined OUTPUT_LAYOUT_BYXF_AF32 - return GET_DATA_BYXF_AF32_INDEX(OUTPUT, b, f, y, x); -#elif defined OUTPUT_LAYOUT_BYX8_F4 - return GET_DATA_BYX8_F4_INDEX(OUTPUT, b, f, y, x); -#elif defined OUTPUT_LAYOUT_FS_BS_YX_BSV4_FSV32 - return GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(OUTPUT, b, f, y, x); #elif defined OUTPUT_LAYOUT_B_FS_YX_FSV4 return GET_DATA_B_FS_YX_FSV4_INDEX(OUTPUT, b, f, y, x); #elif defined OUTPUT_LAYOUT_FS_B_YX_FSV32 diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/reorder_data_byxf_f32_to_byx8_f4_i8.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/reorder_data_byxf_f32_to_byx8_f4_i8.cl deleted file mode 100644 index f385f07..0000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/reorder_data_byxf_f32_to_byx8_f4_i8.cl +++ /dev/null @@ -1,130 +0,0 @@ -// Copyright (c) 2016-2017 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- - -#include "include/reshape_dims.cl" -#include "include/fetch.cl" - -#include "include/data_types.cl" - -///////////////////////// Input Index ///////////////////////// -inline uint FUNC(get_input_index)(uint b, uint f, uint y, uint x) -{ -#if INPUT0_SIMPLE - return GET_DATA_INDEX(INPUT0, b, f, y, x); -#elif defined INPUT0_LAYOUT_BS_F_BSV8__AF8 || \ - defined INPUT0_LAYOUT_BS_F_BSV16__AF8 - return GET_DATA_BS_FYX_BSV8_INDEX(INPUT0, b, f, y, x, SUB_GROUP_SIZE); -#elif defined INPUT0_LAYOUT_BF8_XY16 - return GET_DATA_BF8_XY16_INDEX(INPUT0, b, f, y, x); -#elif defined INPUT0_LAYOUT_BYXF_AF32 - return GET_DATA_BYXF_AF32_INDEX(INPUT0, b, f, y, x); -#elif defined INPUT0_LAYOUT_BYX8_F4 - return GET_DATA_BYX8_F4_INDEX(INPUT0, b, f, y, x); -#elif defined INPUT0_LAYOUT_FS_BS_YX_BSV4_FSV32 - return GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(INPUT0, b, f, y, x); -#elif defined INPUT0_LAYOUT_B_FS_YX_FSV4 - return GET_DATA_B_FS_YX_FSV4_INDEX(INPUT0, b, f, y, x); -#else -#error reorder_data.cl: input format - not supported -#endif -} - -///////////////////////// Output Index ///////////////////////// - -inline uint FUNC(get_output_index)(uint b, uint f, uint y, uint x) -{ -#if OUTPUT_SIMPLE - return GET_DATA_INDEX(OUTPUT, b, f, y, x); -#elif defined OUTPUT_LAYOUT_BS_F_BSV8__AF8 || \ - defined OUTPUT_LAYOUT_BS_F_BSV16__AF8 - return GET_DATA_BS_FYX_BSV8_INDEX(OUTPUT, b, f, y, x, SUB_GROUP_SIZE); -#elif defined OUTPUT_LAYOUT_BF8_XY16 - return GET_DATA_BF8_XY16_INDEX(OUTPUT, b, f, y, x); -#elif defined OUTPUT_LAYOUT_BYXF_AF32 - return GET_DATA_BYXF_AF32_INDEX(OUTPUT, b, f, y, x); -#elif defined OUTPUT_LAYOUT_BYX8_F4 - return GET_DATA_BYX8_F4_INDEX(OUTPUT, b, f, y, x); -#elif defined OUTPUT_LAYOUT_FS_BS_YX_BSV4_FSV32 - return GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(OUTPUT, b, f, y, x); -#elif defined OUTPUT_LAYOUT_B_FS_YX_FSV4 - return GET_DATA_B_FS_YX_FSV4_INDEX(OUTPUT, b, f, y, x); -#else -#error reorder_data.cl: output format - not supported -#endif -} - -__attribute__((intel_reqd_sub_group_size(16))) -KERNEL (reorder_data_byxf_f32_to_byx8_f4_i8)( - const __global INPUT_REORDER_TYPE* input, - __global OUTPUT_REORDER_TYPE* output -#ifdef MEAN_SUBTRACT_IN_BUFFER - , __global MEAN_SUBTRACT_TYPE* mean_subtract -#endif - ) -{ - const uint x = get_global_id(0); - const uint y = get_group_id(1); - const uint b = (uint)get_group_id(2) * WG_BATCH_SIZE + (uint)get_sub_group_id(); - - const uint input_idx = FUNC_CALL(get_input_index)(b, 0, y, x); - const uint output_idx = FUNC_CALL(get_output_index)(b, 0, y, x); - -#if defined MEAN_SUBTRACT_INSIDE_PARAMS - float4 res; - res.s0 = TO_MEAN_TYPE(input[input_idx]); - res.s0 = MEAN_OP(res.s0, VALUE_TO_SUBTRACT[0 % VALUE_TO_SUBTRACT_SIZE]); - res.s1 = TO_MEAN_TYPE(input[input_idx+1]); - res.s1 = MEAN_OP(res.s1, VALUE_TO_SUBTRACT[1 % VALUE_TO_SUBTRACT_SIZE]); - res.s2 = TO_MEAN_TYPE(input[input_idx+2]); - res.s2 = MEAN_OP(res.s2, VALUE_TO_SUBTRACT[2 % VALUE_TO_SUBTRACT_SIZE]); - res.s3 = 0; -#elif defined MEAN_SUBTRACT_IN_BUFFER -#if defined MEAN_PER_FEATURE - MAKE_VECTOR_TYPE(MEAN_SUBTRACT_TYPE, 4) res; - res.s0 = TO_MEAN_TYPE(input[input_idx]); - res.s0 = MEAN_OP(res.s0, mean_subtract[0]); - res.s1 = TO_MEAN_TYPE(input[input_idx+1]); - res.s1 = MEAN_OP(res.s1, mean_subtract[1]); - res.s2 = TO_MEAN_TYPE(input[input_idx+2]); - res.s2 = MEAN_OP(res.s2, mean_subtract[2]); - res.s3 = 0 -#else - MAKE_VECTOR_TYPE(MEAN_SUBTRACT_TYPE, 4) res; - res.s0 = TO_MEAN_TYPE(input[input_idx]); - res.s1 = TO_MEAN_TYPE(input[input_idx+1]); - res.s2 = TO_MEAN_TYPE(input[input_idx+2]); - res.s3 = 0; - - 
res.s0 = MEAN_OP(res.s0, mean_subtract[0]); - res.s1 = MEAN_OP(res.s1, mean_subtract[1]); - res.s2 = MEAN_OP(res.s2, mean_subtract[2]); -#endif -#else - MAKE_VECTOR_TYPE(CALC_TYPE, 4) res; - res.s0 = TO_CALC_TYPE(input[input_idx]); - res.s1 = TO_CALC_TYPE(input[input_idx+1]); - res.s2 = TO_CALC_TYPE(input[input_idx+2]); - res.s3 = 0; -#endif - - char4 out_vals; - out_vals.s0 = ACTIVATION_TYPED(OUTPUT_REORDER, TO_OUTPUT_REORDER_TYPE_SAT(res.s0), ACTIVATION_PARAMS_TYPED); - out_vals.s1 = ACTIVATION_TYPED(OUTPUT_REORDER, TO_OUTPUT_REORDER_TYPE_SAT(res.s1), ACTIVATION_PARAMS_TYPED); - out_vals.s2 = ACTIVATION_TYPED(OUTPUT_REORDER, TO_OUTPUT_REORDER_TYPE_SAT(res.s2), ACTIVATION_PARAMS_TYPED); - out_vals.s3 = 0; - - __global uint* dst = (__global uint*)output; - dst[output_idx/4] = as_uint(out_vals); -} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/reorder_data_fast_b1.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/reorder_data_fast_b1.cl index 8c1360a..45c1ff6 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/reorder_data_fast_b1.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/reorder_data_fast_b1.cl @@ -30,8 +30,6 @@ inline uint FUNC(get_input_index)(uint b, uint f, uint w, uint z, uint y, uint x #elif defined INPUT0_LAYOUT_BS_F_BSV8__AF8 || \ defined INPUT0_LAYOUT_BS_F_BSV16__AF8 return GET_DATA_BS_FYX_BSV8_INDEX(INPUT0, b, f, y, x, SUB_GROUP_SIZE); -#elif defined INPUT0_LAYOUT_BF8_XY16 - return GET_DATA_BF8_XY16_INDEX(INPUT0, b, f, y, x); #elif defined INPUT0_LAYOUT_B_FS_YX_FSV16 return GET_DATA_B_FS_YX_FSV16_INDEX(INPUT0, b, f, y, x); #elif defined INPUT0_LAYOUT_B_FS_ZYX_FSV16 @@ -54,8 +52,6 @@ inline uint FUNC(get_output_index)(uint b, uint f, uint w, uint z, uint y, uint #elif defined OUTPUT_LAYOUT_BS_F_BSV8__AF8 || \ defined OUTPUT_LAYOUT_BS_F_BSV16__AF8 return GET_DATA_BS_FYX_BSV8_INDEX(OUTPUT, b, f, y, x, SUB_GROUP_SIZE); -#elif defined OUTPUT_LAYOUT_BF8_XY16 - return GET_DATA_BF8_XY16_INDEX(OUTPUT, b, f, y, x); #elif defined OUTPUT_LAYOUT_B_FS_YX_FSV16 return GET_DATA_B_FS_YX_FSV16_INDEX(OUTPUT, b, f, y, x); #elif defined OUTPUT_LAYOUT_B_FS_ZYX_FSV16 diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/reorder_data_to_yxfb_batched.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/reorder_data_to_yxfb_batched.cl index 7ed82f3..f6014c6 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/reorder_data_to_yxfb_batched.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/reorder_data_to_yxfb_batched.cl @@ -26,10 +26,6 @@ inline uint FUNC(get_input_index)(uint b, uint f, uint y, uint x) #elif defined INPUT0_LAYOUT_BS_F_BSV8__AF8 || \ defined INPUT0_LAYOUT_BS_F_BSV16__AF8 return GET_DATA_BS_FYX_BSV8_INDEX(INPUT0, b, f, y, x, SUB_GROUP_SIZE); -#elif defined INPUT0_LAYOUT_BF8_XY16 - return GET_DATA_BF8_XY16_INDEX(INPUT0, b, f, y, x); -#elif defined INPUT0_LAYOUT_BYXF_AF32 - return GET_DATA_BYXF_AF32_INDEX(INPUT0, b, f, y, x); #elif defined INPUT0_LAYOUT_B_FS_YX_FSV16 return GET_DATA_B_FS_YX_FSV16_INDEX(INPUT0, b, f, y, x); #elif defined INPUT0_LAYOUT_FS_B_YX_FSV32 diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/common/jitter.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/common/jitter.cpp index e0d7048..4b5176c 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/common/jitter.cpp +++ 
b/inference-engine/thirdparty/clDNN/kernel_selector/core/common/jitter.cpp @@ -330,8 +330,6 @@ JitDefinitions DataTensorJitConstant::GetDefinitions() const { raw_index_func_val = "GET_DATA_INDEX_RAW(" + _name + ", b, f, y, x)"; } else if (layout == DataLayout::b_fs_yx_fsv16 || layout == DataLayout::b_fs_yx_fsv32 || - layout == DataLayout::byxf_af32 || - layout == DataLayout::fs_bs_yx_bsv4_fsv32 || layout == DataLayout::b_fs_yx_fsv4 || layout == DataLayout::fs_b_yx_fsv32 || layout == DataLayout::bs_fs_yx_bsv16_fsv16) { diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_selector_common.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_selector_common.cpp index b175572..d8aa3c5 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_selector_common.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_selector_common.cpp @@ -103,11 +103,7 @@ std::string toString(DataLayout l) { case kernel_selector::DataLayout::b_fs_zyx_fsv32: return "B_FS_ZYX_FSV32"; case kernel_selector::DataLayout::bs_f_bsv8__af8: return "BS_F_BSV8__AF8"; case kernel_selector::DataLayout::bs_f_bsv16__af8: return "BS_F_BSV16__AF8"; - case kernel_selector::DataLayout::bf8_xy16: return "BF8_XY16"; case kernel_selector::DataLayout::winograd_2x3_s1_data: return "WINOGRAD_2x3_S1_DATA"; - case kernel_selector::DataLayout::byxf_af32: return "BYXF_AF32"; - case kernel_selector::DataLayout::byx8_f4: return "BYX8_F4"; - case kernel_selector::DataLayout::fs_bs_yx_bsv4_fsv32: return "FS_BS_YX_BSV4_FSV32"; case kernel_selector::DataLayout::b_fs_yx_fsv4: return "B_FS_YX_FSV4"; case kernel_selector::DataLayout::b_fs_yx_32fp: return "B_FS_YX_32FP"; case kernel_selector::DataLayout::bfzyx: return "BFZYX"; diff --git a/inference-engine/thirdparty/clDNN/src/convolution.cpp b/inference-engine/thirdparty/clDNN/src/convolution.cpp index c382842..03e9227 100644 --- a/inference-engine/thirdparty/clDNN/src/convolution.cpp +++ b/inference-engine/thirdparty/clDNN/src/convolution.cpp @@ -272,40 +272,6 @@ layout convolution_inst::calc_output_layout(convolution_node const& node) { return {output_type, format::b_fs_yx_32fp, output_size}; } - // due to performance reason for using fs_bs_yx_bsv4_fsv32 first convolution have 3 features, so first conv layer - // will take byxf and return fs_bs_yx_bsv4_fsv32 - if (input_layout.data_type == data_types::i8 && input_layout.format == format::byx8_f4 && - input_layout.size.batch[0] % 4 == 0 && input_layout.size.feature[0] == 3) { - return layout{output_type, cldnn::format::fs_bs_yx_bsv4_fsv32, output_size}; - } - - auto users = node.get_users(); - if (users.size() == 1 && users.front()->is_type<convolution>()) { - auto conv_split = users.front()->as<convolution>().get_split(); - auto conv_groups = (int32_t)users.front()->as<convolution>().get_groups(); - - bool next_is_dw = ((conv_split > 1 && conv_split == output_size.feature[0]) || - (conv_groups > 1 && conv_groups == output_size.feature[0])); - - if (input_layout.data_type == data_types::i8 && input_layout.format == format::b_fs_yx_fsv4 && next_is_dw) { - return layout{output_type, cldnn::format::byxf_af32, output_size}; - } - - auto prev_node = node.get_dependencies().front(); - if (prev_node->is_type()) - prev_node = prev_node->get_dependencies().front(); - - auto prev_is_convo = prev_node->is_type<convolution>(); - if (prev_is_convo) { - auto prev2_node = prev_node->get_dependencies().front(); - auto prev_input_format = prev2_node->get_output_layout().format; - - if (input_layout.data_type == data_types::i8 && input_layout.format
== format::byxf_af32 && !next_is_dw && - prev_input_format == format::b_fs_yx_fsv4) { - return layout{output_type, cldnn::format::b_fs_yx_fsv4, output_size}; - } - } - } return {output_type, input_layout.format, output_size}; } diff --git a/inference-engine/thirdparty/clDNN/src/fused_conv_eltwise.cpp b/inference-engine/thirdparty/clDNN/src/fused_conv_eltwise.cpp index a000e9a..5d8de98 100644 --- a/inference-engine/thirdparty/clDNN/src/fused_conv_eltwise.cpp +++ b/inference-engine/thirdparty/clDNN/src/fused_conv_eltwise.cpp @@ -230,13 +230,6 @@ layout fused_conv_eltwise_inst::calc_output_layout(fused_conv_eltwise_node const output_range.spatial[1], output_range.spatial[2]); - // due to performance reason for using fs_bs_yx_bsv4_fsv32 first convolution have 3 features, so first conv layer - // will take byxf and return fs_bs_yx_bsv4_fsv32 - if (input_layout.data_type == data_types::i8 && input_layout.format == format::byx8_f4 && - input_layout.size.batch[0] % 4 == 0 && input_layout.size.feature[0] == 3) { - return layout{output_type, cldnn::format::fs_bs_yx_bsv4_fsv32, output_size}; - } - return {output_type, input_layout.format, output_size}; } diff --git a/inference-engine/thirdparty/clDNN/src/gpu/concatenation_gpu.cpp b/inference-engine/thirdparty/clDNN/src/gpu/concatenation_gpu.cpp index dedf1b3..3d4cfc2 100644 --- a/inference-engine/thirdparty/clDNN/src/gpu/concatenation_gpu.cpp +++ b/inference-engine/thirdparty/clDNN/src/gpu/concatenation_gpu.cpp @@ -150,8 +150,6 @@ attach_concatenation_gpu::attach_concatenation_gpu() { {std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv16), concatenation_gpu::create}, {std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv16), concatenation_gpu::create}, // MMAD - {std::make_tuple(engine_types::ocl, data_types::i8, format::byxf_af32), concatenation_gpu::create}, - {std::make_tuple(engine_types::ocl, data_types::u8, format::byxf_af32), concatenation_gpu::create}, {std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv4), concatenation_gpu::create}, {std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv4), concatenation_gpu::create}, {std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv32), concatenation_gpu::create}, diff --git a/inference-engine/thirdparty/clDNN/src/gpu/convolution_gpu.cpp b/inference-engine/thirdparty/clDNN/src/gpu/convolution_gpu.cpp index 6f7ce89..dcba536 100644 --- a/inference-engine/thirdparty/clDNN/src/gpu/convolution_gpu.cpp +++ b/inference-engine/thirdparty/clDNN/src/gpu/convolution_gpu.cpp @@ -189,8 +189,6 @@ attach_convolution_gpu::attach_convolution_gpu() { implementation_map::add(std::make_tuple(engine_types::ocl, data_types::u8, format::bfzyx), val_fw); implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f32, format::winograd_2x3_s1_data), val_fw); implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f16, format::winograd_2x3_s1_data), val_fw); - implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bf8_xy16), val_fw); - implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bf8_xy16), val_fw); implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f32, format::byxf), val_fw); implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f16, format::byxf), val_fw); // block f16 format @@ -202,11 +200,6 @@ attach_convolution_gpu::attach_convolution_gpu() { implementation_map::add(std::make_tuple(engine_types::ocl, 
data_types::i8, format::b_fs_zyx_fsv16), val_fw); implementation_map::add(std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_zyx_fsv16), val_fw); // MMAD - implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f32, format::byxf_af32), val_fw); - implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f16, format::byxf_af32), val_fw); - implementation_map::add(std::make_tuple(engine_types::ocl, data_types::u8, format::byxf_af32), val_fw); - implementation_map::add(std::make_tuple(engine_types::ocl, data_types::i8, format::byxf_af32), val_fw); - implementation_map::add(std::make_tuple(engine_types::ocl, data_types::i8, format::byx8_f4), val_fw); implementation_map::add(std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv4), val_fw); implementation_map::add(std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv4), val_fw); @@ -216,7 +209,6 @@ attach_convolution_gpu::attach_convolution_gpu() { implementation_map::add(std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_zyx_fsv32), val_fw); implementation_map::add(std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_zyx_fsv32), val_fw); - implementation_map::add(std::make_tuple(engine_types::ocl, data_types::i8, format::fs_bs_yx_bsv4_fsv32), val_fw); implementation_map::add(std::make_tuple(engine_types::ocl, data_types::i8, format::byxf), val_fw); implementation_map::add(std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv4), val_fw); implementation_map::add(std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv4), val_fw); diff --git a/inference-engine/thirdparty/clDNN/src/gpu/definitions.cl b/inference-engine/thirdparty/clDNN/src/gpu/definitions.cl deleted file mode 100644 index 9f719e6..0000000 --- a/inference-engine/thirdparty/clDNN/src/gpu/definitions.cl +++ /dev/null @@ -1,192 +0,0 @@ -/* -// Copyright (c) 2016 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-*/ - -#ifdef CODE_PREFIX -#define CODE_BEGIN CODE_PREFIX -#define CODE_END CODE_POSTFIX -#else -#define CODE_BEGIN -#define CODE_END -#endif - -CODE_BEGIN -enum neural_memory_format { - x_f32, - xb_f32, // 1D+batch, float32 - bx_f32, // 1D+batch, float32 - yxfb_f32, // 3D+batch, float32 - byxf_f32, // for convolution_cpu_jit_batch1 - bfyx_f32, // used in Caffe - fyxb_f32, // used in Caffe - oiyx_f32, // format used only for weights: o - output feature maps, i - input feature maps - byxf_b24_f32, // for convolution_cpu_generic - yxoi_o4_f32, // for convolution_cpu_generic - os_yxi_sv16_f32, // format used only for weights: os - output slice, i - input feature maps, sv16 - 16 values of single slice - bs_yxf_bv24_f32, - any=-1 -}; - -#pragma pack(push, 4) -typedef struct _neural_memory_tag { - uint format; - uint feature_offset; - uint spatial_offset; - uint vector_size; - uint data_offset; - uint data[1]; -} neural_memory; - -typedef struct _neural_vector_tag { - uint feature_offset; - uint spatial_offset; - uint raw_size; - uint data[1]; -} neural_vector; -#pragma pack(pop) - -// neural_memory accessors -__attribute__((overloadable)) __global uint* get_raw(__global neural_memory* mem) { return &(mem->data[0]); } -__attribute__((overloadable)) const __global uint* get_raw(const __global neural_memory* mem) { return &(mem->data[0]); } -__attribute__((overloadable)) uint get_raw_size(const __global neural_memory* mem) { return mem->vector_size; } - -__attribute__((overloadable)) __global uint* get_batch(__global neural_memory* mem) { return get_raw(mem); } -__attribute__((overloadable)) const __global uint* get_batch(const __global neural_memory* mem) { return get_raw(mem); } -__attribute__((overloadable)) uint get_batch_size(const __global neural_memory* mem) { return mem->feature_offset; } - -__attribute__((overloadable)) __global uint* get_feature(__global neural_memory* mem) { return &(mem->data[mem->feature_offset]); } -__attribute__((overloadable)) const __global uint* get_feature(const __global neural_memory* mem) { return &(mem->data[mem->feature_offset]); } -__attribute__((overloadable)) uint get_feature_size(const __global neural_memory* mem) { return mem->spatial_offset - mem->feature_offset; } - -__attribute__((overloadable)) __global uint* get_spatial(__global neural_memory* mem) { return &(mem->data[mem->spatial_offset]); } -__attribute__((overloadable)) const __global uint* get_spatial(const __global neural_memory* mem) { return &(mem->data[mem->spatial_offset]); } -__attribute__((overloadable)) uint get_spatial_size(const __global neural_memory* mem) { return get_raw_size(mem) - mem->spatial_offset; } - -__attribute__((overloadable)) __global void* get_data(__global neural_memory* mem) { return &(mem->data[mem->data_offset]); } -__attribute__((overloadable)) const __global void* get_data(const __global neural_memory* mem) { return &(mem->data[mem->data_offset]); } -__attribute__((overloadable)) size_t get_element_size(const __global neural_memory* mem) { return sizeof(float); } - -__attribute__((overloadable)) size_t get_data_size(const __global neural_memory* mem) { - size_t result = get_element_size(mem); - - const __global uint* raw = get_raw(mem); - uint raw_size = get_raw_size(mem); - - for(uint i = 0; i < raw_size; i++) { - result *= raw[i]; - } - return result; -} - -// neural_vector accessors -// TODO NOTE: non-const accessors are disabled now, because read-only neural_vector argument is only supported now - -//__attribute__((overloadable)) __global uint* 
get_raw(__global neural_vector* v) { return &(v->data[0]); } -__attribute__((overloadable)) const __global uint* get_raw(const __global neural_vector* v) { return &(v->data[0]); } -__attribute__((overloadable)) uint get_raw_size(const __global neural_vector* v) { return v->raw_size; } - -//__attribute__((overloadable)) __global uint* get_batch(__global neural_vector* v) { return get_raw(v); } -__attribute__((overloadable)) const __global uint* get_batch(const __global neural_vector* v) { return get_raw(v); } -__attribute__((overloadable)) uint get_batch_size(const __global neural_vector* v) { return v->feature_offset; } - -//__attribute__((overloadable)) __global uint* get_feature(__global neural_vector* v) { return &(v->data[v->feature_offset]); } -__attribute__((overloadable)) const __global uint* get_feature(const __global neural_vector* v) { return &(v->data[v->feature_offset]); } -__attribute__((overloadable)) uint get_feature_size(const __global neural_vector* v) { return v->spatial_offset - v->feature_offset; } - -//__attribute__((overloadable)) __global uint* get_spatial(__global neural_vector* v) { return &(v->data[v->spatial_offset]); } -__attribute__((overloadable)) const __global uint* get_spatial(const __global neural_vector* v) { return &(v->data[v->spatial_offset]); } -__attribute__((overloadable)) uint get_spatial_size(const __global neural_vector* v) { return get_raw_size(v) - v->spatial_offset; } - -CODE_END - -/* -KERNEL(Fully_Connected_GPU) -DECALRE_CONSTANT() -BEGIN_ARGUMENTS_DECLARATION -DECLARE_INPUT_MEMORY_ARGUMENT(input_mem) -DECLARE_INPUT_MEMORY_ARGUMENT(weights_mem) -DECLARE_INPUT_MEMORY_ARGUMENT(bias_mem) -DECLARE_OUTPUT_MEMORY_ARGUMENT(dst_mem) -END_ARGUMENTS_DECLARATION -CODE_BEGIN -#define WEIGHTS { 1.0, 3.2, 4.5, 6.7 } -#define WEIGHTS_SIZE { 2, 2 } -#define WEIGHTS_DIM 2 -*/ -__kernel void Fully_Connected_GPU(__global neural_memory* input_mem, __global neural_memory* weights_mem, __global neural_memory* bias_mem, __global neural_memory* dst_mem) -{ - __global uint* input_size = get_raw(input_mem); - __global uint* weights_size = get_raw(weights_mem); - __global float* input = (__global float*)get_data(input_mem); - __global float* weights = (__global float*)get_data(weights_mem); - __global float* bias = (__global float*)get_data(bias_mem); - __global float* pDst = (__global float*)get_data(dst_mem); - - const int x = get_global_id(0); - - pDst[x] = 0; - uint outXIdx = x / input_size[0]; - uint inputBatchIdx = x % input_size[0]; - uint weightYIdx = outXIdx * weights_size[0]; - for (uint i = 0; i < input_size[2]; i++) - { - pDst[x] += input[i * input_size[0] + inputBatchIdx] * weights[weightYIdx + i]; - } - pDst[x] += bias[outXIdx]; -} -CODE_END - -CODE_BEGIN -__kernel void Convolution_GPU( - const __global neural_memory* input_mem, - const __global neural_memory* filter_mem, - float bias, - __global neural_memory* dst_mem, - const __global neural_vector* spatial_stride) -{ - -// - const __global uint* input_size = get_raw(input_mem); - const __global uint* filter_size = get_raw(filter_mem); - const __global uint* dst_size = get_raw(dst_mem); - const __global float* input = (const __global float*)get_data(input_mem); - const __global float* filter = (const __global float*)get_data(filter_mem); - __global float* pDst = (__global float*)get_data(dst_mem); -// - - int global_id = get_global_id(0); - const int batch_num = dst_size[0]; - const int batch_offset = global_id % dst_size[0]; - - const int idx = global_id / batch_num; - - const int x = (idx % 
input_size[2]) * get_spatial(spatial_stride)[0]; - const int y = (idx * get_spatial(spatial_stride)[1]) / input_size[2]; - - const int out_offset = idx * batch_num + batch_offset; - - pDst[out_offset] = 0; - for (uint i = 0; i < filter_size[4]; i++) - { - for (uint j = 0; j < filter_size[3]; j++) - { - int input_idx = (x + j + ((y + i) * input_size[2])) * batch_num + batch_offset; - int filter_idx = i * filter_size[3] + j; - pDst[out_offset] += input[input_idx] * filter[filter_idx]; - } - } - pDst[out_offset] += bias; -} -CODE_END diff --git a/inference-engine/thirdparty/clDNN/src/gpu/eltwise_gpu.cpp b/inference-engine/thirdparty/clDNN/src/gpu/eltwise_gpu.cpp index 1b44664..42bd969 100644 --- a/inference-engine/thirdparty/clDNN/src/gpu/eltwise_gpu.cpp +++ b/inference-engine/thirdparty/clDNN/src/gpu/eltwise_gpu.cpp @@ -182,9 +182,6 @@ attach_eltwise_gpu::attach_eltwise_gpu() { { std::make_tuple(engine_types::ocl, data_types::f32, format::bs_fs_yx_bsv16_fsv16), eltwise_gpu::create }, { std::make_tuple(engine_types::ocl, data_types::f16, format::bs_fs_yx_bsv16_fsv16), eltwise_gpu::create }, // MMAD - { std::make_tuple(engine_types::ocl, data_types::i8, format::byxf_af32), eltwise_gpu::create }, - { std::make_tuple(engine_types::ocl, data_types::u8, format::byxf_af32), eltwise_gpu::create }, - { std::make_tuple(engine_types::ocl, data_types::i8, format::fs_bs_yx_bsv4_fsv32), eltwise_gpu::create }, { std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv4), eltwise_gpu::create }, { std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv4), eltwise_gpu::create }, { std::make_tuple(engine_types::ocl, data_types::f32, format::b_fs_yx_fsv4), eltwise_gpu::create }, diff --git a/inference-engine/thirdparty/clDNN/src/gpu/fully_connected_gpu.cpp b/inference-engine/thirdparty/clDNN/src/gpu/fully_connected_gpu.cpp index 0afe4b3..069501d 100644 --- a/inference-engine/thirdparty/clDNN/src/gpu/fully_connected_gpu.cpp +++ b/inference-engine/thirdparty/clDNN/src/gpu/fully_connected_gpu.cpp @@ -100,8 +100,6 @@ attach_fully_connected_gpu::attach_fully_connected_gpu() { {std::make_tuple(engine_types::ocl, data_types::i8, format::bfyx), val_fw}, {std::make_tuple(engine_types::ocl, data_types::u8, format::bfyx), val_fw}, // MMAD - {std::make_tuple(engine_types::ocl, data_types::i8, format::byxf_af32), val_fw}, - {std::make_tuple(engine_types::ocl, data_types::i8, format::fs_bs_yx_bsv4_fsv32), val_fw}, {std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv32), val_fw}, {std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv32), val_fw}, // IMAD diff --git a/inference-engine/thirdparty/clDNN/src/gpu/fused_conv_eltwise_gpu.cpp b/inference-engine/thirdparty/clDNN/src/gpu/fused_conv_eltwise_gpu.cpp index c652514..1e552b9 100644 --- a/inference-engine/thirdparty/clDNN/src/gpu/fused_conv_eltwise_gpu.cpp +++ b/inference-engine/thirdparty/clDNN/src/gpu/fused_conv_eltwise_gpu.cpp @@ -183,23 +183,13 @@ attach_fused_conv_eltwise_gpu::attach_fused_conv_eltwise_gpu() { fused_conv_eltwise_gpu::create); implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bs_fs_zyx_bsv16_fsv16), fused_conv_eltwise_gpu::create); - // MMAD - implementation_map::add( - std::make_tuple(engine_types::ocl, data_types::i8, format::fs_bs_yx_bsv4_fsv32), - fused_conv_eltwise_gpu::create); // IMAD - implementation_map::add( - std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv4), - fused_conv_eltwise_gpu::create); - implementation_map::add( - 
std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv4), - fused_conv_eltwise_gpu::create); - implementation_map::add(std::make_tuple(engine_types::ocl, data_types::i8, format::byxf_af32), + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv4), fused_conv_eltwise_gpu::create); - implementation_map::add(std::make_tuple(engine_types::ocl, data_types::u8, format::byxf_af32), + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv4), fused_conv_eltwise_gpu::create); implementation_map::add(std::make_tuple(engine_types::ocl, data_types::u8, format::image_2d_rgba), - fused_conv_eltwise_gpu::create); + fused_conv_eltwise_gpu::create); } } // namespace detail diff --git a/inference-engine/thirdparty/clDNN/src/gpu/pooling_gpu.cpp b/inference-engine/thirdparty/clDNN/src/gpu/pooling_gpu.cpp index 86a8322..0f38da3 100644 --- a/inference-engine/thirdparty/clDNN/src/gpu/pooling_gpu.cpp +++ b/inference-engine/thirdparty/clDNN/src/gpu/pooling_gpu.cpp @@ -206,9 +206,6 @@ attach_pooling_gpu::attach_pooling_gpu() { implementation_map::add(std::make_tuple(engine_types::ocl, data_types::i8, format::bs_fs_zyx_bsv16_fsv16), pooling_gpu::create); implementation_map::add(std::make_tuple(engine_types::ocl, data_types::u8, format::bs_fs_zyx_bsv16_fsv16), pooling_gpu::create); - implementation_map::add(std::make_tuple(engine_types::ocl, data_types::i8, format::byxf_af32), pooling_gpu::create); - implementation_map::add(std::make_tuple(engine_types::ocl, data_types::u8, format::byxf_af32), pooling_gpu::create); - implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f32, format::b_fs_yx_fsv4), pooling_gpu::create); implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f16, format::b_fs_yx_fsv4), pooling_gpu::create); implementation_map::add(std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv4), pooling_gpu::create); diff --git a/inference-engine/thirdparty/clDNN/src/gpu/quantize_gpu.cpp b/inference-engine/thirdparty/clDNN/src/gpu/quantize_gpu.cpp index 9445834..c9eddf7 100644 --- a/inference-engine/thirdparty/clDNN/src/gpu/quantize_gpu.cpp +++ b/inference-engine/thirdparty/clDNN/src/gpu/quantize_gpu.cpp @@ -113,11 +113,6 @@ attach_quantize_gpu::attach_quantize_gpu() { implementation_map::add(std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv16), val_fw); implementation_map::add(std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv16), val_fw); - implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f32, format::byxf_af32), val_fw); - implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f16, format::byxf_af32), val_fw); - implementation_map::add(std::make_tuple(engine_types::ocl, data_types::u8, format::byxf_af32), val_fw); - implementation_map::add(std::make_tuple(engine_types::ocl, data_types::i8, format::byxf_af32), val_fw); - implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f32, format::byxf), val_fw); implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f16, format::byxf), val_fw); implementation_map::add(std::make_tuple(engine_types::ocl, data_types::u8, format::byxf), val_fw); diff --git a/inference-engine/thirdparty/clDNN/src/gpu/resample_gpu.cpp b/inference-engine/thirdparty/clDNN/src/gpu/resample_gpu.cpp index 6b27364..7409bd4 100644 --- a/inference-engine/thirdparty/clDNN/src/gpu/resample_gpu.cpp +++ 
b/inference-engine/thirdparty/clDNN/src/gpu/resample_gpu.cpp @@ -99,11 +99,7 @@ attach_resample_gpu::attach_resample_gpu() { {std::make_tuple(engine_types::ocl, data_types::f32, format::b_fs_yx_fsv4), resample_gpu::create}, {std::make_tuple(engine_types::ocl, data_types::f16, format::b_fs_yx_fsv4), resample_gpu::create}, {std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv4), resample_gpu::create}, - {std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv4), resample_gpu::create}, - {std::make_tuple(engine_types::ocl, data_types::f32, format::byxf_af32), resample_gpu::create}, - {std::make_tuple(engine_types::ocl, data_types::f16, format::byxf_af32), resample_gpu::create}, - {std::make_tuple(engine_types::ocl, data_types::i8, format::byxf_af32), resample_gpu::create}, - {std::make_tuple(engine_types::ocl, data_types::u8, format::byxf_af32), resample_gpu::create}}); + {std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv4), resample_gpu::create}}); } } // namespace detail diff --git a/inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_buffer_fusing.cpp b/inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_buffer_fusing.cpp index 81db4d5..8b46567 100644 --- a/inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_buffer_fusing.cpp +++ b/inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_buffer_fusing.cpp @@ -127,10 +127,6 @@ bool concat_in_place_optimization::match(concatenation_node& node) { (l.size.feature[0] % 32 != 0 || node.get_primitive()->axis != concatenation::along_f)) return false; - // TODO: If we replace byxf_af32 with byxf we can probably do this optimization, but support in kernels is required - if (l.format == format::byxf_af32 && (l.size.feature[0] % 32 != 0 || node.get_primitive()->axis != concatenation::along_f)) - return false; - if (l.format == format::bs_fs_yx_bsv16_fsv16) return false; diff --git a/inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_padding.cpp b/inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_padding.cpp index 092b931..8c1f62d 100644 --- a/inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_padding.cpp +++ b/inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_padding.cpp @@ -121,12 +121,9 @@ void prepare_padding::run(program_impl& p) { // right now output padding optimization is only available for bfyx format and data type = float32 if (conv_layout.format != cldnn::format::bfyx && - conv_layout.format != cldnn::format::bf8_xy16 && conv_layout.format != cldnn::format::b_fs_yx_fsv16 && conv_layout.format != cldnn::format::b_fs_zyx_fsv16 && conv_layout.format != cldnn::format::bs_fs_yx_bsv16_fsv16 && - conv_layout.format != cldnn::format::byxf_af32 && - conv_layout.format != cldnn::format::fs_bs_yx_bsv4_fsv32 && conv_layout.format != cldnn::format::b_fs_yx_fsv4 && conv_layout.format != cldnn::format::fs_b_yx_fsv32 && conv_layout.format != cldnn::format::b_fs_yx_32fp) { diff --git a/inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_primitive_fusing.cpp b/inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_primitive_fusing.cpp index 4e235e9..6b2e6bd 100644 --- a/inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_primitive_fusing.cpp +++ b/inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_primitive_fusing.cpp @@ -760,11 +760,8 @@ void prepare_conv_eltw_fusing::fuse_conv_eltwise(program_impl& p, program_node* for (auto& dep : eltw_node->get_dependencies()) { format fmt = 
dep->get_output_layout().format; data_types dep_dt = dep->get_output_layout().data_type; - if ((fmt != format::fs_bs_yx_bsv4_fsv32 || dep_dt != data_types::i8) && - (fmt != format::b_fs_yx_fsv4 || dep_dt != data_types::i8) && + if ((fmt != format::b_fs_yx_fsv4 || dep_dt != data_types::i8) && (fmt != format::b_fs_yx_fsv4 || dep_dt != data_types::u8) && - (fmt != format::byxf_af32 || dep_dt != data_types::i8) && - (fmt != format::byxf_af32 || dep_dt != data_types::u8) && (fmt != format::bfyx || dep_dt != data_types::f32) && (fmt != format::bfyx || dep_dt != data_types::u8) && (fmt != format::bfyx || dep_dt != data_types::i8) && (fmt != format::yxfb || dep_dt != data_types::f16) && (fmt != format::bfyx || dep_dt != data_types::f16 || !if_already_depth_to_space_fused)) diff --git a/inference-engine/thirdparty/clDNN/src/graph_optimizer/reorder_inputs.cpp b/inference-engine/thirdparty/clDNN/src/graph_optimizer/reorder_inputs.cpp index ad04629..c3fbc04 100644 --- a/inference-engine/thirdparty/clDNN/src/graph_optimizer/reorder_inputs.cpp +++ b/inference-engine/thirdparty/clDNN/src/graph_optimizer/reorder_inputs.cpp @@ -372,70 +372,6 @@ void reorder_inputs::run(program_impl& p, layout_optimizer& lo, reorder_factory& auto fmt_map = get_preferred_formats(p, lo); propagate_formats(p, fmt_map, lo); minimize_local_reorders(p, fmt_map, lo); - - // WA START ============================================================================================================ - if (lo.get_optimization_attributes().b_fs_yx_fsv16_network) { - // This is a temprorary work-around for known bad case until byxf_af32 handling will be corrected in layout_optimizer. - // - // Find pattern: - // mvn(int8, b_fs_yx_fsv16, [x,16,1280,720]) -> conv(int8, byxf_af32, [x,3,1280,720]) -> mvn(*, bfyx) -> - // Replace with: - // mvn(b_fs_yx_fsv16) -> conv(b_fs_yx_fsv16) -> mvn(b_fs_yx_fsv16) -> - // - // Generally for such convolution b_fs_yx_fsv16 will always perform better than byxf_af32, - // but to avoid unvalidated int8 b_fs_yx_fsv16 networks and potential regressions this WA is needed. - // Additionally reorder from af32 -> bfyx will take ~9 times longer than actual convolution. 
- for (auto& node_ptr : p.get_processing_order()) { - if (!node_ptr->is_in_data_flow() || !node_ptr->is_type() || fmt_map.at(node_ptr) != format::byxf_af32) - continue; - - auto& conv_node = node_ptr->as(); - - bool input_path = - conv_node.input().get_output_layout().data_type == data_types::i8 && - conv_node.input().is_type() && - fmt_map.at(&conv_node.input()) == format::b_fs_yx_fsv16; - bool output_path = - conv_node.get_users().size() == 1 && - conv_node.get_users().front()->is_type() && - fmt_map.at(conv_node.get_users().front()) == format::bfyx && - conv_node.get_users().front()->get_users().size() == 1 && - !conv_node.get_users().front()->as().get_primitive()->across_channels; - - if (!input_path || !output_path) - continue; - - auto in_lay = conv_node.input().get_output_layout(); - auto out_lay = conv_node.get_output_layout(); - auto wei_lay = conv_node.weights().get_output_layout(); - bool correct_layouts = - // weights - wei_lay.data_type == data_types::i8 && - wei_lay.size.spatial[0] == 3 && wei_lay.size.spatial[1] == 3 && - // input/output - in_lay.data_type == data_types::i8 && out_lay.data_type == data_types::i8 && - in_lay.size.feature[0] == 16 && out_lay.size.feature[0] == 3 && - in_lay.size.spatial[0] == 1280 && out_lay.size.spatial[0] == 1280 && - in_lay.size.spatial[1] == 720 && out_lay.size.spatial[1] == 720; - - if (!correct_layouts) - continue; - - bool correct_conv = - conv_node.get_groups() == 1 && conv_node.get_split() == 1 && conv_node.get_deformable_groups() == 1 && - !conv_node.get_depthwise_sep_opt() && !conv_node.get_transposed() && - !conv_node.activations_zero_points_term() && !conv_node.weights_zero_points_term() && !conv_node.compensation_term() && - conv_node.get_primitive()->dilation == tensor(1); - - if (!correct_conv) - continue; - - fmt_map.at(node_ptr) = format::b_fs_yx_fsv16; - fmt_map.at(conv_node.get_users().front()) = format::b_fs_yx_fsv16; - } - } - // WA END ============================================================================================================== - insert_reorders(p, fmt_map, rf); for (auto n : p.get_processing_order()) { diff --git a/inference-engine/thirdparty/clDNN/src/include/to_string_utils.h b/inference-engine/thirdparty/clDNN/src/include/to_string_utils.h index b52fbd3..561fae4 100644 --- a/inference-engine/thirdparty/clDNN/src/include/to_string_utils.h +++ b/inference-engine/thirdparty/clDNN/src/include/to_string_utils.h @@ -85,16 +85,8 @@ inline std::string fmt_to_str(format fmt) { return "bs_xs_xsv8_bsv16"; case format::bs_x_bsv16: return "bs_x_bsv16"; - case format::bf8_xy16: - return "bf8_xy16"; case format::winograd_2x3_s1_data: return "winograd_2x3_s1_data"; - case format::byxf_af32: - return "byxf_af32"; - case format::byx8_f4: - return "byx8_f4"; - case format::fs_bs_yx_bsv4_fsv32: - return "fs_bs_yx_bsv4_fsv32"; case format::b_fs_yx_fsv4: return "b_fs_yx_fsv4"; case format::b_fs_yx_32fp: diff --git a/inference-engine/thirdparty/clDNN/src/kernel_selector_helper.cpp b/inference-engine/thirdparty/clDNN/src/kernel_selector_helper.cpp index 6dddbca..c4f9daf 100644 --- a/inference-engine/thirdparty/clDNN/src/kernel_selector_helper.cpp +++ b/inference-engine/thirdparty/clDNN/src/kernel_selector_helper.cpp @@ -126,16 +126,8 @@ kernel_selector::data_layout to_data_layout(format f) { return kernel_selector::data_layout::bs_f_bsv8__af8; case format::bs_xs_xsv8_bsv16: return kernel_selector::data_layout::bs_f_bsv16__af8; - case format::bf8_xy16: - return kernel_selector::data_layout::bf8_xy16; case 
format::winograd_2x3_s1_data: return kernel_selector::data_layout::winograd_2x3_s1_data; - case format::byxf_af32: - return kernel_selector::data_layout::byxf_af32; - case format::byx8_f4: - return kernel_selector::data_layout::byx8_f4; - case format::fs_bs_yx_bsv4_fsv32: - return kernel_selector::data_layout::fs_bs_yx_bsv4_fsv32; case format::b_fs_yx_fsv4: return kernel_selector::data_layout::b_fs_yx_fsv4; case format::b_fs_yx_32fp: @@ -185,16 +177,8 @@ cldnn::format from_data_layout(kernel_selector::data_layout l) { return cldnn::format::bs_xs_xsv8_bsv8; case kernel_selector::data_layout::bs_f_bsv16__af8: return cldnn::format::bs_x_bsv16; - case kernel_selector::data_layout::bf8_xy16: - return cldnn::format::bf8_xy16; case kernel_selector::data_layout::winograd_2x3_s1_data: return cldnn::format::winograd_2x3_s1_data; - case kernel_selector::data_layout::byxf_af32: - return cldnn::format::byxf_af32; - case kernel_selector::data_layout::byx8_f4: - return cldnn::format::byx8_f4; - case kernel_selector::data_layout::fs_bs_yx_bsv4_fsv32: - return cldnn::format::fs_bs_yx_bsv4_fsv32; case kernel_selector::data_layout::b_fs_yx_32fp: return cldnn::format::b_fs_yx_32fp; case kernel_selector::data_layout::bfzyx: @@ -510,20 +494,9 @@ kernel_selector::data_tensor convert_data_tensor(const layout& l, uint32_t split size_t pitch = 1; auto new_vals = vals; - if (ks_layout == kernel_selector::Tensor::byxf_af32) { - new_vals[3] = align_to(vals[3], 32); - } if (ks_layout == kernel_selector::Tensor::b_fs_yx_fsv32) { new_vals[1] = align_to(vals[1], 32); } - if (ks_layout == kernel_selector::Tensor::fs_bs_yx_bsv4_fsv32) { - new_vals[3] = align_to(vals[3], 32); - new_vals[2] = align_to(vals[2], 4); - } - if (ks_layout == kernel_selector::Tensor::byx8_f4) { - new_vals[3] = align_to(vals[3], 4); - new_vals[2] = align_to(vals[2], 8); - } if (ks_layout == kernel_selector::Tensor::bs_fs_yx_bsv16_fsv16) { new_vals[0] = align_to(vals[0], 16); new_vals[1] = align_to(vals[1], 16); diff --git a/inference-engine/thirdparty/clDNN/src/layout_optimizer.cpp b/inference-engine/thirdparty/clDNN/src/layout_optimizer.cpp index 04bb824..a44e076 100644 --- a/inference-engine/thirdparty/clDNN/src/layout_optimizer.cpp +++ b/inference-engine/thirdparty/clDNN/src/layout_optimizer.cpp @@ -183,22 +183,13 @@ bool layout_optimizer::can_fuse_reorder(program_node& prev, program_node& next, if (next.is_type() && (fmt_prev == format::bfyx || fmt_prev == format::yxfb || fmt_prev == format::b_fs_yx_fsv16 || fmt_prev == format::fs_b_yx_fsv32 || - fmt_prev == format::byxf_af32 || fmt_prev == format::b_fs_yx_fsv32 || + fmt_prev == format::b_fs_yx_fsv32 || (fmt_prev == format::b_fs_yx_fsv4 && prev_output_layout.size.feature[0] % 32 == 0 && prev_output_layout.size.spatial[0] == 1 && prev_output_layout.size.spatial[1] == 1))) return true; - if (next.is_type() && fmt_prev == format::byxf_af32 && fmt_next == format::b_fs_yx_fsv4 && next.as().get_groups() != 1) - return true; - - if (next.is_type() && fmt_prev == format::byxf_af32 && fmt_next == format::bfyx) - return true; - - if (next.is_type() && fmt_prev == format::b_fs_yx_fsv4 && fmt_next == format::byxf_af32 && next.as().get_groups() == 1) - return true; - if (next.is_type() && fmt_prev == format::b_fs_yx_fsv16 && fmt_next == format::b_fs_yx_fsv4 && is_input_idx(0)) return true; @@ -232,7 +223,7 @@ bool layout_optimizer::can_fuse_reorder(program_node& prev, program_node& next, if (next.is_type() && (fmt_prev == format::b_fs_yx_fsv4 || fmt_prev == format::bfyx) && prev_output_layout.size.feature[0] 
== 3 && - (fmt_next == format::b_fs_yx_fsv4 || fmt_next == format::byxf_af32 || + (fmt_next == format::b_fs_yx_fsv4 || fmt_next == format::bs_fs_yx_bsv16_fsv16)) return true; @@ -727,23 +718,6 @@ layout layout_optimizer::get_expected_layout(layout const& current_layout, // nothing to do, just go out from here. } else if (layout_optimizer::convolution_bfyx_opt(current_layout, output_or_weights_layout, prim) || (_output_size_handling_enabled && prim->with_output_size) || node.get_transposed()) { - // commented out due to performance reasons, maybe enable in future - /*if (current_layout.data_type == data_types::f32 && - current_layout.size.batch[0] % 16 == 0 && - current_layout.format == format::bfyx && - output_or_weights_layout.size.spatial[0] == 1 && output_or_weights_layout.size.spatial[1] == 1 && - prim->stride.spatial[0] == 1 && prim->stride.spatial[1] == 1 && - prim->input_offset.spatial[0] == 0 && prim->input_offset.spatial[1] == 0 && - !node.get_transposed()) - { - if (!((current_layout.size.feature[0] % 8) == 0 && (current_layout.size.spatial[0] * - current_layout.size.spatial[1]) == 16 && current_layout.data_padding == padding{ { 0,0,0,0 }, 0 })) - { - expected_tensor = current_layout.size.transform(cldnn::format::bf8_xy16, 1); - expected_format = cldnn::format::bf8_xy16; - } - } - else*/ { expected_tensor = current_layout.size; if (current_layout.format == format::b_fs_zyx_fsv16 || current_layout.format == format::bs_fs_zyx_bsv16_fsv16) diff --git a/inference-engine/thirdparty/clDNN/src/memory_pool.cpp b/inference-engine/thirdparty/clDNN/src/memory_pool.cpp index fc6cec3..290c5e2 100644 --- a/inference-engine/thirdparty/clDNN/src/memory_pool.cpp +++ b/inference-engine/thirdparty/clDNN/src/memory_pool.cpp @@ -254,8 +254,7 @@ memory_impl::ptr memory_pool::get_from_padded_pool(const layout& layout, ((layout.format != format::b_fs_yx_fsv32 && layout.format != format::b_fs_zyx_fsv32) || (layout.size.feature[0] % 32 == 0)) && // TODO: check if this condition always correct - ((layout.format == format::byxf_af32 && layout.size.feature[0] == rec_list._memory->get_layout().size.feature[0]) || - (layout.format != format::byxf_af32 && layout.size.feature[0] <= rec_list._memory->get_layout().size.feature[0])) && + layout.size.feature[0] <= rec_list._memory->get_layout().size.feature[0] && layout.size.batch[0] <= rec_list._memory->get_layout().size.batch[0] && rec_list._memory->get_layout().format != format::fs_b_yx_fsv32 && layout.format != format::fs_b_yx_fsv32 && diff --git a/inference-engine/thirdparty/clDNN/src/program_helpers.cpp b/inference-engine/thirdparty/clDNN/src/program_helpers.cpp index 2ff72f9..a99f1c8 100644 --- a/inference-engine/thirdparty/clDNN/src/program_helpers.cpp +++ b/inference-engine/thirdparty/clDNN/src/program_helpers.cpp @@ -1,5 +1,5 @@ /* -// Copyright (c) 2018-2019 Intel Corporation +// Copyright (c) 2018-2020 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -142,9 +142,7 @@ std::pair program_helpers::are_layouts_identical(layout const& l1, l return {false, false}; if (l1.get_linear_size() != l2.get_linear_size()) return {false, false}; - if ((l1.format == format::bf8_xy16 && l2.format != format::bf8_xy16) || - (l2.format == format::bf8_xy16 && l1.format != format::bf8_xy16) || - (l1.format == format::b_fs_yx_fsv4 && l2.format != format::b_fs_yx_fsv4) || + if ((l1.format == format::b_fs_yx_fsv4 && l2.format != format::b_fs_yx_fsv4) || (l2.format == format::b_fs_yx_fsv4 && l1.format != format::b_fs_yx_fsv4) || (l1.format == format::fs_b_yx_fsv32 && l2.format != format::fs_b_yx_fsv32) || (l2.format == format::fs_b_yx_fsv32 && l1.format != format::fs_b_yx_fsv32) || diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/convolution_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/convolution_gpu_test.cpp index 440f2c1..f245b05 100644 --- a/inference-engine/thirdparty/clDNN/tests/test_cases/convolution_gpu_test.cpp +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/convolution_gpu_test.cpp @@ -6873,7 +6873,7 @@ TEST(convolution_depthwise_gpu_fsv16, depthwise_conv_b_fs_yx_fsv16_in_feature_pa build_options options; options.set_option(build_option::optimize_data(true)); implementation_desc conv_impl = { format::b_fs_yx_fsv16, "" }; - options.set_option(build_option::force_implementations({ {"conv", conv_impl} })); + options.set_option(build_option::force_implementations({ {"conv", conv_impl} })); network network(engine, topology, options); network.set_input_data("input", input); @@ -6893,7 +6893,7 @@ TEST(convolution_depthwise_gpu_fsv16, depthwise_conv_b_fs_yx_fsv16_in_feature_pa EXPECT_EQ(output_layout.format, format::bfyx); - EXPECT_EQ(y_size, output_size.spatial[1]); + EXPECT_EQ(y_size, output_size.spatial[1]); EXPECT_EQ(x_size, output_size.spatial[0]); EXPECT_EQ(f_size, output_size.feature[0]); EXPECT_EQ(b_size, output_size.batch[0]); @@ -7945,8 +7945,6 @@ INSTANTIATE_TEST_CASE_P( .all_test_params(format::bfyx, false, true) .all_test_params(format::bfyx, true, false) .all_test_params(format::b_fs_yx_fsv4) - // byxf_af32 - depthwise broken for batch > 1 - // .smoke_test_params(format::byxf_af32) .all_test_params(format::b_fs_yx_fsv32) .all_test_params(format::b_fs_yx_fsv32, true, true) .all_test_params(format::b_fs_yx_fsv32, false, true) diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/fully_connected_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/fully_connected_gpu_test.cpp index 40b9fd6..5e0310b 100644 --- a/inference-engine/thirdparty/clDNN/tests/test_cases/fully_connected_gpu_test.cpp +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/fully_connected_gpu_test.cpp @@ -1336,7 +1336,7 @@ INSTANTIATE_TEST_CASE_P( testing::Values(1, 3), testing::Values(1, 3), testing::Values(3, 32), - testing::Values(format::bfyx, format::b_fs_yx_fsv4, format::b_fs_yx_fsv32, format::byxf_af32) + testing::Values(format::bfyx, format::b_fs_yx_fsv4, format::b_fs_yx_fsv32) ), fully_connected_i8_i8_test::PrintToStringParamName ); @@ -1350,7 +1350,7 @@ INSTANTIATE_TEST_CASE_P( testing::Values(1, 3), testing::Values(1, 3), testing::Values(3, 32), - testing::Values(format::bfyx, format::b_fs_yx_fsv4, format::b_fs_yx_fsv32, format::byxf_af32) + testing::Values(format::bfyx, format::b_fs_yx_fsv4, format::b_fs_yx_fsv32) ), fully_connected_i8_u8_test::PrintToStringParamName ); @@ -1364,7 +1364,7 @@ INSTANTIATE_TEST_CASE_P( testing::Values(1, 3), testing::Values(1, 3), testing::Values(3, 32), - 
testing::Values(format::bfyx, format::b_fs_yx_fsv4, format::b_fs_yx_fsv32, format::byxf_af32) + testing::Values(format::bfyx, format::b_fs_yx_fsv4, format::b_fs_yx_fsv32) ), fully_connected_i8_f32_test::PrintToStringParamName ); @@ -1378,7 +1378,7 @@ INSTANTIATE_TEST_CASE_P( testing::Values(1, 3), testing::Values(1, 3), testing::Values(3, 32), - testing::Values(format::bfyx, format::b_fs_yx_fsv4, format::b_fs_yx_fsv32, format::byxf_af32) + testing::Values(format::bfyx, format::b_fs_yx_fsv4, format::b_fs_yx_fsv32) ), fully_connected_u8_i8_test::PrintToStringParamName ); @@ -1392,7 +1392,7 @@ INSTANTIATE_TEST_CASE_P( testing::Values(1, 3), testing::Values(1, 3), testing::Values(3, 32), - testing::Values(format::bfyx, format::b_fs_yx_fsv4, format::b_fs_yx_fsv32, format::byxf_af32) + testing::Values(format::bfyx, format::b_fs_yx_fsv4, format::b_fs_yx_fsv32) ), fully_connected_u8_u8_test::PrintToStringParamName ); @@ -1406,7 +1406,7 @@ INSTANTIATE_TEST_CASE_P( testing::Values(1, 3), testing::Values(1, 3), testing::Values(3, 32), - testing::Values(format::bfyx, format::b_fs_yx_fsv4, format::b_fs_yx_fsv32, format::byxf_af32) + testing::Values(format::bfyx, format::b_fs_yx_fsv4, format::b_fs_yx_fsv32) ), fully_connected_u8_f32_test::PrintToStringParamName ); diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/fused_conv_eltwise_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/fused_conv_eltwise_gpu_test.cpp index 91454f1..1ffbf11 100644 --- a/inference-engine/thirdparty/clDNN/tests/test_cases/fused_conv_eltwise_gpu_test.cpp +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/fused_conv_eltwise_gpu_test.cpp @@ -117,7 +117,6 @@ TEST(fused_conv_eltwise, basic_image2d) EXPECT_EQ(outputs_act.begin()->first, "eltwise"); auto output_act = outputs_act.begin()->second.get_memory(); - auto&& out_act_layout = output_act.get_layout(); auto out_act_ptr = output_act.pointer(); topology topology_ref( @@ -140,7 +139,6 @@ TEST(fused_conv_eltwise, basic_image2d) EXPECT_EQ(outputs_ref.begin()->first, "out"); auto output_ref = outputs_ref.begin()->second.get_memory(); - auto&& out_ref_layout = output_ref.get_layout(); auto out_ref_ptr = output_ref.pointer(); for (int i = 0;i < 3 * 256 * 4;i++) { diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/fusings_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/fusings_gpu_test.cpp index a436d31..1a62fab 100644 --- a/inference-engine/thirdparty/clDNN/tests/test_cases/fusings_gpu_test.cpp +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/fusings_gpu_test.cpp @@ -1373,58 +1373,6 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, conv_int8_scale_shift_swish, bc_test_params{CASE_CONV3D_S8S8_4, 2, 6}, }), ); - -class conv_int8_byxf_af32 : public ConvFusingTest {}; -TEST_P(conv_int8_byxf_af32, per_channel_coeffs) { - auto p = GetParam(); - create_topologies(input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - data("bias", get_mem(get_bias_layout(p))), - data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel.count()/255)), - convolution("conv_prim", "input", {"weights"}, {"bias"}, p.groups, p.stride, p.pad, p.dilation), - scale("scale", "conv_prim", "scale_data"), - reorder("reorder_bfyx", "scale", p.default_format, data_types::f32) - ); - - implementation_desc conv_impl = { format::byxf_af32, "" }; - bo_fused.set_option(build_option::force_implementations({ {"conv_prim", conv_impl} })); - - tolerance = 1e-5f; - execute(p); -} - -TEST_P(conv_int8_byxf_af32, 
per_element_coeffs) { - auto p = GetParam(); - create_topologies(input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - data("bias", get_mem(get_bias_layout(p))), - data("eltwise_data", get_mem(get_output_layout(p))), - convolution("conv_prim", "input", {"weights"}, {"bias"}, p.groups, p.stride, p.pad, p.dilation), - eltwise("eltwise", "conv_prim", "eltwise_data", eltwise_mode::sum), - reorder("reorder_bfyx", "eltwise", p.default_format, data_types::f32) - ); - - implementation_desc conv_impl = { format::byxf_af32, "" }; - bo_fused.set_option(build_option::force_implementations({ {"conv_prim", conv_impl} })); - - tolerance = 1e-5f; - execute(p); -} - -INSTANTIATE_TEST_CASE_P(fusings_gpu, conv_int8_byxf_af32, - ::testing::ValuesIn(std::vector{ - bc_test_params{CASE_CONV_U8S8_1, 2, 3}, - bc_test_params{CASE_CONV_U8S8_2, 2, 3}, - bc_test_params{CASE_CONV_U8S8_3, 2, 3}, - bc_test_params{CASE_CONV_U8S8_4, 2, 3}, - bc_test_params{CASE_CONV_U8S8_6, 2, 3}, - bc_test_params{CASE_CONV_S8S8_1, 2, 3}, - bc_test_params{CASE_CONV_S8S8_2, 2, 3}, - bc_test_params{CASE_CONV_S8S8_3, 2, 3}, - bc_test_params{CASE_CONV_S8S8_4, 2, 3}, - bc_test_params{CASE_CONV_S8S8_6, 2, 3}, - }), ); - class conv_int8_prelu_eltwise : public ConvFusingTest {}; TEST_P(conv_int8_prelu_eltwise, basic) { auto p = GetParam(); @@ -3452,7 +3400,7 @@ struct activation_test_params { #define CASE_ACTIVATION_F32_2 {7, 3, 7, 7}, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx #define CASE_ACTIVATION_F32_3 {1, 14, 8, 8}, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx #define CASE_ACTIVATION_F32_4 {1, 17, 31, 29}, data_types::f32, format::yxfb, data_types::f32, format::bfyx -#define CASE_ACTIVATION_F32_5 {1, 17, 31, 29}, data_types::f32, format::byxf_af32, data_types::f32, format::bfyx +#define CASE_ACTIVATION_F32_5 {1, 17, 31, 29}, data_types::f32, format::b_fs_yx_fsv4, data_types::f32, format::bfyx #define CASE_ACTIVATION_F32_6 {1, 17, 31, 29}, data_types::f32, format::b_fs_yx_fsv32, data_types::f32, format::bfyx #define CASE_ACTIVATION_F32_7 {1, 17, 31, 29}, data_types::f32, format::fyxb, data_types::f32, format::bfyx #define CASE_ACTIVATION_3D_F32_0 {3, 16, 13, 13, 13}, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx @@ -3461,14 +3409,13 @@ struct activation_test_params { #define CASE_ACTIVATION_3D_F32_3 {1, 17, 7, 7, 7}, data_types::f32, format::b_fs_zyx_fsv32, data_types::f32, format::bfzyx #define CASE_ACTIVATION_3D_F32_4 {1, 17, 7, 7, 7}, data_types::f32, format::bs_fs_yx_bsv16_fsv16, data_types::f32, format::bfzyx #define CASE_ACTIVATION_3D_F32_5 {1, 17, 7, 7, 7}, data_types::f32, format::fs_b_yx_fsv32, data_types::f32, format::bfzyx -#define CASE_ACTIVATION_3D_F32_6 {1, 17, 7, 7, 7}, data_types::f32, format::fs_bs_yx_bsv4_fsv32, data_types::f32, format::bfzyx #define CASE_ACTIVATION_F16_0 {7, 32, 5, 5}, data_types::f16, format::bfyx, data_types::f32, format::bfyx #define CASE_ACTIVATION_F16_1 {1, 16, 8, 8}, data_types::f16, format::bfyx, data_types::f32, format::bfyx #define CASE_ACTIVATION_F16_2 {7, 16, 7, 7}, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::bfyx #define CASE_ACTIVATION_F16_3 {1, 14, 8, 8}, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::bfyx #define CASE_ACTIVATION_F16_4 {1, 17, 31, 29}, data_types::f16, format::yxfb, data_types::f32, format::bfyx -#define CASE_ACTIVATION_F16_5 {1, 17, 31, 29}, data_types::f16, format::byxf_af32, data_types::f32, format::bfyx +#define 
CASE_ACTIVATION_F16_5 {1, 17, 31, 29}, data_types::f16, format::b_fs_yx_fsv4, data_types::f32, format::bfyx #define CASE_ACTIVATION_F16_6 {1, 17, 31, 29}, data_types::f16, format::b_fs_yx_fsv32, data_types::f32, format::bfyx #define CASE_ACTIVATION_F16_7 {1, 17, 31, 29}, data_types::f16, format::fyxb, data_types::f32, format::bfyx #define CASE_ACTIVATION_3D_F16_0 {3, 16, 13, 13, 13}, data_types::f16, format::bfzyx, data_types::f32, format::bfzyx @@ -3477,7 +3424,6 @@ struct activation_test_params { #define CASE_ACTIVATION_3D_F16_3 {1, 17, 7, 7, 7}, data_types::f16, format::b_fs_zyx_fsv32, data_types::f32, format::bfzyx #define CASE_ACTIVATION_3D_F16_4 {1, 17, 7, 7, 7}, data_types::f16, format::bs_fs_yx_bsv16_fsv16, data_types::f32, format::bfzyx #define CASE_ACTIVATION_3D_F16_5 {1, 17, 7, 7, 7}, data_types::f16, format::fs_b_yx_fsv32, data_types::f32, format::bfzyx -#define CASE_ACTIVATION_3D_F16_6 {1, 17, 7, 7, 7}, data_types::f16, format::fs_bs_yx_bsv4_fsv32, data_types::f32, format::bfzyx #define CASE_ACTIVATION_U8_1 {1, 16, 8, 8}, data_types::u8, format::bfyx, data_types::f32, format::bfyx #define CASE_ACTIVATION_U8_2 {1, 12, 8, 8}, data_types::u8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx @@ -3572,7 +3518,6 @@ INSTANTIATE_TEST_CASE_P( activation_test_params{CASE_ACTIVATION_F32_7, 2, 3, "activation_ref"}, // FIXME - accuracy bug activation_test_params{CASE_ACTIVATION_3D_F32_3, 2, 3, "activation_ref"}, // FIXME - accuracy bug activation_test_params{CASE_ACTIVATION_3D_F32_5, 2, 3, "activation_ref"}, // FIXME - accuracy bug - activation_test_params{CASE_ACTIVATION_3D_F32_6, 2, 3, "activation_ref"}, // FIXME - accuracy bug }), ); class activation_scale_activation_quantize_u8 : public ActivationFusingTest {}; @@ -3640,7 +3585,6 @@ INSTANTIATE_TEST_CASE_P( activation_scale_activation_quantize_u8, ::testing::ValuesIn(std::vector{ activation_test_params{CASE_ACTIVATION_3D_F32_5, 2, 5, "activation_ref"}, // FIXME - accuracy bug - activation_test_params{CASE_ACTIVATION_3D_F32_6, 2, 5, "activation_ref"}, // FIXME - accuracy bug }), ); class activation_scale_activation : public ActivationFusingTest {}; @@ -3697,8 +3641,7 @@ INSTANTIATE_TEST_CASE_P( activation_test_params{CASE_ACTIVATION_3D_F16_1, 2, 4, "activation_ref"}, activation_test_params{CASE_ACTIVATION_3D_F16_2, 2, 4, "activation_ref"}, activation_test_params{CASE_ACTIVATION_3D_F16_3, 2, 4, "activation_ref"}, - activation_test_params{CASE_ACTIVATION_3D_F16_4, 2, 4, "activation_ref"}, - activation_test_params{CASE_ACTIVATION_3D_F16_5, 2, 4, "activation_ref"}, + activation_test_params{CASE_ACTIVATION_3D_F16_4, 2, 4, "activation_ref"}, // InputDataType = UINT8 activation_test_params{CASE_ACTIVATION_U8_1, 2, 4, "activation_ref"}, @@ -3719,8 +3662,6 @@ INSTANTIATE_TEST_CASE_P( ::testing::ValuesIn(std::vector{ activation_test_params{CASE_ACTIVATION_3D_F32_4, 2, 4, "activation_ref"}, // FIXME - accuracy bug activation_test_params{CASE_ACTIVATION_3D_F32_5, 2, 4, "activation_ref"}, // FIXME - accuracy bug - activation_test_params{CASE_ACTIVATION_3D_F32_6, 2, 4, "activation_ref"}, // FIXME - accuracy bug - activation_test_params{CASE_ACTIVATION_3D_F16_6, 2, 4, "activation_ref"}, // FIXME - accuracy bug }), ); /* ----------------------------------------------------------------------------------------------------- */ @@ -4450,30 +4391,21 @@ struct pooling_test_params { #define CASE_POOLING_U8_1 {1, 16, 8, 8}, data_types::u8, format::bfyx, data_types::f32, format::bfyx #define CASE_POOLING_U8_2 {2, 16, 8, 8}, data_types::u8, 
format::b_fs_yx_fsv16, data_types::f32, format::bfyx #define CASE_POOLING_U8_3 {1, 32, 10, 10}, data_types::u8, format::b_fs_yx_fsv4, data_types::f32, format::b_fs_yx_fsv4 -#define CASE_POOLING_U8_4 {1, 32, 10, 10}, data_types::u8, format::byxf_af32, data_types::f32, format::bfyx #define CASE_POOLING_U8_5 {16, 32, 10, 10, 10}, data_types::u8, format::b_fs_zyx_fsv32, data_types::f32, format::bfyx #define CASE_POOLING_U8_6 {16, 32, 10, 10, 10}, data_types::u8, format::b_fs_zyx_fsv32, data_types::f32, format::bfyx #define CASE_POOLING_U8_FP16_3 {1, 32, 10, 10}, data_types::u8, format::b_fs_yx_fsv4, data_types::f16, format::b_fs_yx_fsv4 -#define CASE_POOLING_U8_FP16_4 {1, 32, 10, 10}, data_types::u8, format::byxf_af32, data_types::f16, format::bfyx #define CASE_POOLING_U8_FP16_5 {16, 32, 10, 10, 10}, data_types::u8, format::b_fs_zyx_fsv32, data_types::f16, format::bfyx #define CASE_POOLING_U8_FP16_6 {16, 32, 10, 10, 10}, data_types::u8, format::b_fs_zyx_fsv32, data_types::f16, format::bfyx #define CASE_POOLING_I8_1 {1, 16, 8, 8}, data_types::i8, format::bfyx, data_types::f32, format::bfyx #define CASE_POOLING_I8_2 {2, 16, 8, 8}, data_types::i8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx -#define CASE_POOLING_I8_4 {1, 32, 10, 10}, data_types::i8, format::byxf_af32, data_types::f32, format::bfyx #define CASE_POOLING_I8_5 {1, 32, 10, 10}, data_types::i8, format::b_fs_yx_fsv4, data_types::f32, format::b_fs_yx_fsv4 #define CASE_POOLING_I8_6 {16, 32, 10, 10, 10}, data_types::i8, format::b_fs_zyx_fsv32, data_types::f32, format::bfyx -#define CASE_POOLING_I8_FP16_4 {1, 32, 10, 10}, data_types::i8, format::byxf_af32, data_types::f16, format::bfyx #define CASE_POOLING_I8_FP16_5 {1, 32, 10, 10}, data_types::i8, format::b_fs_yx_fsv4, data_types::f16, format::b_fs_yx_fsv4 #define CASE_POOLING_I8_FP16_6 {16, 32, 10, 10, 10}, data_types::i8, format::b_fs_zyx_fsv32, data_types::f16, format::bfyx -// Disabled -#define CASE_POOLING_I8_3 {4, 32, 10, 10}, data_types::i8, format::fs_bs_yx_bsv4_fsv32, data_types::f32, format::bfyx -#define CASE_POOLING_I8_FP16_3 {4, 32, 10, 10}, data_types::i8, format::fs_bs_yx_bsv4_fsv32, data_types::f16, format::bfyx -#define CASE_POOLING_I8_FP16_3 {4, 32, 10, 10}, data_types::i8, format::fs_bs_yx_bsv4_fsv32, data_types::f16, format::bfyx - class PoolingFusingTest : public ::BaseFusingTest { public: void execute(pooling_test_params& p) { @@ -4674,8 +4606,6 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, pooling_test_params{CASE_POOLING_F32_10, 2, 5, pooling_mode::max, "pooling_gpu_bsv16_fsv16"}, // Input type: INT8 - pooling_test_params{CASE_POOLING_I8_4, 2, 5, pooling_mode::average, "pooling_gpu_byxf_af32"}, - pooling_test_params{CASE_POOLING_I8_4, 2, 5, pooling_mode::max, "pooling_gpu_byxf_af32"}, pooling_test_params{CASE_POOLING_I8_5, 2, 5, pooling_mode::average, "pooling_gpu_b_fs_yx_fsv4"}, pooling_test_params{CASE_POOLING_I8_5, 2, 5, pooling_mode::max, "pooling_gpu_b_fs_yx_fsv4"}, pooling_test_params{CASE_POOLING_I8_6, 2, 5, pooling_mode::average, "pooling_gpu_int8_ref"}, @@ -4688,8 +4618,6 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, pooling_test_params{CASE_POOLING_U8_3, 2, 5, pooling_mode::max, "pooling_gpu_b_fs_yx_fsv4"}, pooling_test_params{CASE_POOLING_U8_5, 2, 5, pooling_mode::average, "pooling_gpu_int8_ref"}, pooling_test_params{CASE_POOLING_U8_5, 2, 5, pooling_mode::max, "pooling_gpu_int8_ref"}, - pooling_test_params{CASE_POOLING_U8_4, 2, 5, pooling_mode::average, "pooling_gpu_byxf_af32"}, - pooling_test_params{CASE_POOLING_U8_4, 2, 5, pooling_mode::max, 
"pooling_gpu_byxf_af32"}, pooling_test_params{CASE_POOLING_U8_6, 2, 5, pooling_mode::average, "pooling_gpu_int8_ref"}, pooling_test_params{CASE_POOLING_U8_6, 2, 5, pooling_mode::max, "pooling_gpu_int8_ref"}, }), ); @@ -4697,9 +4625,6 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, INSTANTIATE_TEST_CASE_P(DISABLED_fusings_gpu, pooling_scale_activation_quantize, ::testing::ValuesIn(std::vector{ - pooling_test_params{CASE_POOLING_I8_3, 2, 5, pooling_mode::max, "pooling_gpu_fs_bs_yx_bsv4_fsv32_simd32"}, - pooling_test_params{CASE_POOLING_I8_3, 2, 5, pooling_mode::max, "pooling_gpu_fs_bs_yx_bsv4_fsv32"}, - pooling_test_params{CASE_POOLING_I8_3, 2, 5, pooling_mode::average, "pooling_gpu_fs_bs_yx_bsv4_fsv32"}, pooling_test_params{CASE_POOLING_F32_3, 2, 5, pooling_mode::average, "pooling_gpu_average_opt"}, //currently not enabled, fusing not upported }), ); @@ -4742,8 +4667,6 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, pooling_test_params{CASE_POOLING_F32_10, 2, 4, pooling_mode::max, "pooling_gpu_bsv16_fsv16"}, // Input type: INT8 - pooling_test_params{CASE_POOLING_I8_4, 2, 4, pooling_mode::average, "pooling_gpu_byxf_af32"}, - pooling_test_params{CASE_POOLING_I8_4, 2, 4, pooling_mode::max, "pooling_gpu_byxf_af32"}, pooling_test_params{CASE_POOLING_I8_5, 2, 4, pooling_mode::average, "pooling_gpu_b_fs_yx_fsv4"}, pooling_test_params{CASE_POOLING_I8_5, 2, 4, pooling_mode::max, "pooling_gpu_b_fs_yx_fsv4"}, pooling_test_params{CASE_POOLING_I8_6, 2, 4, pooling_mode::average, "pooling_gpu_int8_ref"}, @@ -4754,8 +4677,6 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, pooling_test_params{CASE_POOLING_U8_3, 2, 4, pooling_mode::max, "pooling_gpu_int8_ref"}, pooling_test_params{CASE_POOLING_U8_3, 2, 4, pooling_mode::average, "pooling_gpu_b_fs_yx_fsv4"}, pooling_test_params{CASE_POOLING_U8_3, 2, 4, pooling_mode::max, "pooling_gpu_b_fs_yx_fsv4"}, - pooling_test_params{CASE_POOLING_U8_4, 2, 4, pooling_mode::average, "pooling_gpu_byxf_af32"}, - pooling_test_params{CASE_POOLING_U8_4, 2, 4, pooling_mode::max, "pooling_gpu_byxf_af32"}, pooling_test_params{CASE_POOLING_U8_5, 2, 4, pooling_mode::average, "pooling_gpu_int8_ref"}, pooling_test_params{CASE_POOLING_U8_5, 2, 4, pooling_mode::max, "pooling_gpu_int8_ref"}, pooling_test_params{CASE_POOLING_U8_6, 2, 4, pooling_mode::average, "pooling_gpu_int8_ref"}, @@ -4822,8 +4743,6 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, pooling_test_params{CASE_POOLING_F32_F16_10, 2, 4, pooling_mode::max, "pooling_gpu_bsv16_fsv16"}, // Input type: INT8 - pooling_test_params{CASE_POOLING_I8_FP16_4, 2, 4, pooling_mode::average, "pooling_gpu_byxf_af32"}, - pooling_test_params{CASE_POOLING_I8_FP16_4, 2, 4, pooling_mode::max, "pooling_gpu_byxf_af32"}, pooling_test_params{CASE_POOLING_I8_FP16_5, 2, 4, pooling_mode::average, "pooling_gpu_b_fs_yx_fsv4"}, pooling_test_params{CASE_POOLING_I8_FP16_5, 2, 4, pooling_mode::max, "pooling_gpu_b_fs_yx_fsv4"}, pooling_test_params{CASE_POOLING_I8_FP16_6, 2, 4, pooling_mode::average, "pooling_gpu_int8_ref"}, @@ -4834,24 +4753,12 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, pooling_test_params{CASE_POOLING_U8_FP16_3, 2, 4, pooling_mode::average, "pooling_gpu_int8_ref"}, pooling_test_params{CASE_POOLING_U8_FP16_3, 2, 4, pooling_mode::average, "pooling_gpu_b_fs_yx_fsv4"}, pooling_test_params{CASE_POOLING_U8_FP16_3, 2, 4, pooling_mode::max, "pooling_gpu_b_fs_yx_fsv4"}, - pooling_test_params{CASE_POOLING_U8_FP16_4, 2, 4, pooling_mode::average, "pooling_gpu_byxf_af32"}, - pooling_test_params{CASE_POOLING_U8_FP16_4, 2, 4, pooling_mode::max, "pooling_gpu_byxf_af32"}, 
pooling_test_params{CASE_POOLING_U8_FP16_5, 2, 4, pooling_mode::average, "pooling_gpu_int8_ref"}, pooling_test_params{CASE_POOLING_U8_FP16_5, 2, 4, pooling_mode::max, "pooling_gpu_int8_ref"}, pooling_test_params{CASE_POOLING_U8_FP16_6, 2, 4, pooling_mode::average, "pooling_gpu_int8_ref"}, pooling_test_params{CASE_POOLING_U8_FP16_6, 2, 4, pooling_mode::max, "pooling_gpu_int8_ref"}, }), ); -INSTANTIATE_TEST_CASE_P(DISABLED_fusings_gpu, - pooling_scale_activation, - ::testing::ValuesIn(std::vector{ - pooling_test_params{CASE_POOLING_I8_FP16_3, 2, 4, pooling_mode::max, "pooling_gpu_fs_bs_yx_bsv4_fsv32_simd32"}, - pooling_test_params{CASE_POOLING_I8_FP16_3, 2, 4, pooling_mode::max, "pooling_gpu_fs_bs_yx_bsv4_fsv32"}, - pooling_test_params{CASE_POOLING_I8_3, 2, 4, pooling_mode::max, "pooling_gpu_fs_bs_yx_bsv4_fsv32_simd32"}, - pooling_test_params{CASE_POOLING_I8_3, 2, 4, pooling_mode::max, "pooling_gpu_fs_bs_yx_bsv4_fsv32"}, - pooling_test_params{CASE_POOLING_I8_3, 2, 4, pooling_mode::average, "pooling_gpu_fs_bs_yx_bsv4_fsv32"}, - }), ); - /* ----------------------------------------------------------------------------------------------------- */ /* -------------------------------- DepthToSpace cases ------------------------------------------------- */ /* ----------------------------------------------------------------------------------------------------- */ diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/pooling_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/pooling_gpu_test.cpp index 2ac9813..4712bc3 100644 --- a/inference-engine/thirdparty/clDNN/tests/test_cases/pooling_gpu_test.cpp +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/pooling_gpu_test.cpp @@ -2619,7 +2619,6 @@ INSTANTIATE_TEST_CASE_P( testing::Values(std::tuple(0, 0, 0)), testing::Values(format::yxfb, format::bfyx, - format::byxf_af32, format::b_fs_yx_fsv4, format::b_fs_yx_fsv16, format::b_fs_yx_fsv32)), @@ -2737,8 +2736,7 @@ INSTANTIATE_TEST_CASE_P( format::b_fs_yx_fsv16, format::fs_b_yx_fsv32, format::b_fs_yx_fsv32, - format::b_fs_yx_fsv4, - format::fs_bs_yx_bsv4_fsv32)), + format::b_fs_yx_fsv4)), testing::internal::DefaultParamName); TEST(pooling_forward_gpu, bsv16_fsv16_max_16x16x8x8_input_2x2_pool_2x2_stride) diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/resample_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/resample_gpu_test.cpp index d06f938..f6c6dd4 100644 --- a/inference-engine/thirdparty/clDNN/tests/test_cases/resample_gpu_test.cpp +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/resample_gpu_test.cpp @@ -766,8 +766,6 @@ INSTANTIATE_TEST_CASE_P(smoke, resample_random_test, testing::ValuesIn( resample_random_test_param_generator() - .smoke_params(data_types::i8, format::byxf_af32, format::byxf_af32) - .smoke_params(data_types::u8, format::byxf_af32, format::byxf_af32) .smoke_params(data_types::i8, format::b_fs_yx_fsv4, format::b_fs_yx_fsv4) .smoke_params(data_types::u8, format::b_fs_yx_fsv4, format::b_fs_yx_fsv4) .smoke_params(data_types::i8, format::b_fs_yx_fsv16, format::b_fs_yx_fsv16) diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/tensor_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/tensor_test.cpp index 307baa9..8f6200b 100644 --- a/inference-engine/thirdparty/clDNN/tests/test_cases/tensor_test.cpp +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/tensor_test.cpp @@ -105,16 +105,4 @@ TEST(tensor_api, linear_offsets) { test_tensor_offset({ 2, 19, 4, 3 }, { 1, 18, 3, 2 }, 
cldnn::format::b_fs_yx_fsv16, 754); test_tensor_offset({ 2, 5, 4, 3 }, { 1, 3, 1, 2 }, cldnn::format::fs_b_yx_fsv32, 675); test_tensor_offset({ 2, 37, 4, 3 }, { 1, 35, 3, 2 }, cldnn::format::fs_b_yx_fsv32, 1507); - - // Formats with alignment: - test_tensor_offset({ 2, 5, 4, 3 }, { 1, 3, 1, 2 }, cldnn::format::byxf_af32, 675); - test_tensor_offset({ 2, 37, 4, 3 }, { 1, 35, 3, 2 }, cldnn::format::byxf_af32, 1507); - test_tensor_offset({ 2, 5, 4, 3 }, { 1, 3, 1, 2 }, cldnn::format::byx8_f4, 331); - test_tensor_offset({ 2, 37, 4, 3 }, { 1, 35, 3, 2 }, cldnn::format::byx8_f4, 1755); - - // Non-standard blocked formats: - // bf8_xy16 - b_fs_es_fsv8_esv16, where e is flattened yx := x + y * size_x - test_tensor_offset({ 2, 5, 4, 3 }, { 1, 3, 1, 2 }, cldnn::format::bf8_xy16, 185); - test_tensor_offset({ 2, 19, 7, 3 }, { 1, 18, 3, 2 }, cldnn::format::bf8_xy16, 1441); - } -- 2.7.4
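
Illustrative sketch (not part of the patch above): the layouts removed here (byxf_af32, byx8_f4, bf8_xy16, fs_bs_yx_bsv4_fsv32) were alignment- or block-based formats, while the formats that remain (b_fs_yx_fsv4/16/32) store features in fixed-size slices with the in-slice index innermost. The small, hypothetical C++ helper below reproduces the offset arithmetic that the retained tensor_test case asserts (offset 754 for size { 2, 19, 4, 3 } and coordinate { 1, 18, 3, 2 } in b_fs_yx_fsv16). The function name and structure are the editor's own sketch; only the expected value and the {b, f, x, y} argument order come from the test case kept in tensor_test.cpp.

#include <cstddef>
#include <iostream>

// Linear offset in a b_fs_yx_fsv16 tensor: batch outermost, then feature
// slices of 16, then y, then x, with the position inside the slice innermost.
// Sizes and coordinates follow the {b, f, x, y} order used by the test.
static size_t offset_b_fs_yx_fsv16(size_t F, size_t X, size_t Y,
                                   size_t b, size_t f, size_t x, size_t y) {
    const size_t fsv = 16;
    const size_t f_blocks = (F + fsv - 1) / fsv;  // feature dim padded to a multiple of 16
    size_t off = b;
    off = off * f_blocks + f / fsv;   // feature slice index
    off = off * Y + y;
    off = off * X + x;
    off = off * fsv + f % fsv;        // position inside the 16-wide slice
    return off;
}

int main() {
    // Mirrors the retained check:
    // test_tensor_offset({ 2, 19, 4, 3 }, { 1, 18, 3, 2 }, cldnn::format::b_fs_yx_fsv16, 754);
    std::cout << offset_b_fs_yx_fsv16(19, 4, 3, 1, 18, 3, 2) << std::endl;  // prints 754
    return 0;
}

For comparison, the deleted byxf_af32 handling kept byxf element order and only padded the feature dimension to a multiple of 32 (the align_to branch removed from kernel_selector_helper.cpp), which is why its special cases in the memory pool, concatenation and padded-pool paths could be dropped together with the layout itself.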