[IE CLDNN] Cleanup part 2 (#1865)
author Vladimir Paramuzov <vladimir.paramuzov@intel.com>
Thu, 27 Aug 2020 20:06:20 +0000 (23:06 +0300)
committer GitHub <noreply@github.com>
Thu, 27 Aug 2020 20:06:20 +0000 (23:06 +0300)
* [IE CLDNN] Removed some unused kernels and layouts

* [IE CLDNN] Removed bsv4_fsv32 layout

* [IE CLDNN] Removed remaining BF8_XY16 usages. Removed definitions.cl

87 files changed:
inference-engine/thirdparty/clDNN/api/layout.hpp
inference-engine/thirdparty/clDNN/api/tensor.hpp
inference-engine/thirdparty/clDNN/kernel_selector/common/tensor_type.cpp
inference-engine/thirdparty/clDNN/kernel_selector/common/tensor_type.h
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/concatenation/concatenation_kernel_ref.cpp
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_1x1.cpp
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_imad.cpp
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_imad_byxf_af32_1x1.cpp [deleted file]
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_imad_byxf_af32_1x1.h [deleted file]
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_imad_byxf_af32_depthwise.cpp [deleted file]
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_imad_byxf_af32_depthwise.h [deleted file]
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad.cpp [deleted file]
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad.h [deleted file]
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_batched.cpp [deleted file]
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_batched.h [deleted file]
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_batched_block.cpp [deleted file]
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_batched_block.h [deleted file]
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_batched_block_1x1.cpp [deleted file]
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_batched_block_1x1.h [deleted file]
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_bfyx_to_b_fs_yx_fsv4.cpp
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_blocks.cpp [deleted file]
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_blocks.h [deleted file]
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_slm_2x14_rep4.cpp [deleted file]
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_slm_2x14_rep4.h [deleted file]
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_slm_7x7_rep4.cpp [deleted file]
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_slm_7x7_rep4.h [deleted file]
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_selector.cpp
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/deconvolution/deconvolution_kernel_imad_along_f_tile_bfx.cpp
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/deconvolution/deconvolution_kernel_imad_ref.cpp
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_vload8.cpp
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_mmad.cpp
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_base.cpp
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_b_fs_yx_fsv4.cpp
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_byxf_af32.cpp [deleted file]
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_byxf_af32.h [deleted file]
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_int8_ref.cpp
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_selector.cpp
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/quantize/quantize_kernel_scale_shift_opt.cpp
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_byxf_f32_to_byx8_f4_i8.cpp [deleted file]
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_byxf_f32_to_byx8_f4_i8.h [deleted file]
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_fast_b1.cpp
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_selector.cpp
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/resample/resample_kernel_ref.cpp
inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_1x1.cl
inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_byxf_af32_depthwise.cl [deleted file]
inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad.cl [deleted file]
inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_blocks.cl [deleted file]
inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_af32_imad_1x1.cl [deleted file]
inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_imad.cl
inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/include/fetch.cl
inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_b_fs_yx_fsv4.cl
inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_byxf_af32.cl [deleted file]
inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_int8_ref.cl
inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_ref.cl
inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/reorder_biplanar_nv12.cl
inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/reorder_data_byxf_f32_to_byx8_f4_i8.cl [deleted file]
inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/reorder_data_fast_b1.cl
inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/reorder_data_to_yxfb_batched.cl
inference-engine/thirdparty/clDNN/kernel_selector/core/common/jitter.cpp
inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_selector_common.cpp
inference-engine/thirdparty/clDNN/src/convolution.cpp
inference-engine/thirdparty/clDNN/src/fused_conv_eltwise.cpp
inference-engine/thirdparty/clDNN/src/gpu/concatenation_gpu.cpp
inference-engine/thirdparty/clDNN/src/gpu/convolution_gpu.cpp
inference-engine/thirdparty/clDNN/src/gpu/definitions.cl [deleted file]
inference-engine/thirdparty/clDNN/src/gpu/eltwise_gpu.cpp
inference-engine/thirdparty/clDNN/src/gpu/fully_connected_gpu.cpp
inference-engine/thirdparty/clDNN/src/gpu/fused_conv_eltwise_gpu.cpp
inference-engine/thirdparty/clDNN/src/gpu/pooling_gpu.cpp
inference-engine/thirdparty/clDNN/src/gpu/quantize_gpu.cpp
inference-engine/thirdparty/clDNN/src/gpu/resample_gpu.cpp
inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_buffer_fusing.cpp
inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_padding.cpp
inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_primitive_fusing.cpp
inference-engine/thirdparty/clDNN/src/graph_optimizer/reorder_inputs.cpp
inference-engine/thirdparty/clDNN/src/include/to_string_utils.h
inference-engine/thirdparty/clDNN/src/kernel_selector_helper.cpp
inference-engine/thirdparty/clDNN/src/layout_optimizer.cpp
inference-engine/thirdparty/clDNN/src/memory_pool.cpp
inference-engine/thirdparty/clDNN/src/program_helpers.cpp
inference-engine/thirdparty/clDNN/tests/test_cases/convolution_gpu_test.cpp
inference-engine/thirdparty/clDNN/tests/test_cases/fully_connected_gpu_test.cpp
inference-engine/thirdparty/clDNN/tests/test_cases/fused_conv_eltwise_gpu_test.cpp
inference-engine/thirdparty/clDNN/tests/test_cases/fusings_gpu_test.cpp
inference-engine/thirdparty/clDNN/tests/test_cases/pooling_gpu_test.cpp
inference-engine/thirdparty/clDNN/tests/test_cases/resample_gpu_test.cpp
inference-engine/thirdparty/clDNN/tests/test_cases/tensor_test.cpp

index af2c407..ee2ac02 100644 (file)
@@ -344,14 +344,6 @@ struct layout {
     tensor get_pitches() const {
         auto sizes = get_buffer_size().sizes(format);
 
-        if (format == format::byxf_af32) {
-            sizes[3] = align_to(sizes[3], 32);
-        }
-
-        if (format == format::byx8_f4) {
-            sizes[3] = align_to(sizes[3], 4);
-            sizes[2] = align_to(sizes[2], 8);
-        }
         std::vector<tensor::value_type> pitches(sizes.size(), tensor::value_type(1));
         std::partial_sum(sizes.rbegin(), sizes.rend() - 1, pitches.rbegin() + 1, std::multiplies<tensor::value_type>());
         return {format, pitches};
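Note: with the format-specific branches gone, get_pitches() reduces to a plain suffix product over the buffer sizes. A minimal standalone sketch of that computation, with hypothetical sizes, mirroring the retained partial_sum call:

    #include <functional>
    #include <numeric>
    #include <vector>

    int main() {
        // Sizes in dimension order, outermost first (e.g. b, f, y, x).
        std::vector<int> sizes{2, 3, 4, 5};
        // pitch[i] = product of all sizes to the right of i; innermost pitch is 1.
        std::vector<int> pitches(sizes.size(), 1);
        std::partial_sum(sizes.rbegin(), sizes.rend() - 1,
                         pitches.rbegin() + 1, std::multiplies<int>());
        // pitches == {60, 20, 5, 1}: element (b, f, y, x) sits at
        // b*60 + f*20 + y*5 + x*1 in the flat buffer.
        return 0;
    }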
@@ -394,15 +386,7 @@ struct layout {
             sizes[block_axis] = align_to(sizes[block_axis], block_size);
         }
 
-        if (this->format == cldnn::format::bf8_xy16 && !(is_aligned_to(sizes[1], 8) && is_aligned_to(sizes[2] * sizes[3], 16))) {
-            sizes[3] = align_to(sizes[2] * sizes[3], 16);
-            sizes[2] = 1;
-        } else if (this->format == cldnn::format::byxf_af32 && !(is_aligned_to(sizes[1], 32))) {
-            sizes[1] = align_to(sizes[1], 32);
-        } else if (this->format == cldnn::format::byx8_f4 && (!is_aligned_to(sizes[1], 4) || !is_aligned_to(sizes[2], 8))) {
-            sizes[1] = align_to(sizes[1], 4);
-            sizes[2] = align_to(sizes[2], 8);
-        } else if (this->format == cldnn::format::os_is_yx_isa8_osv8_isv4 && !(is_aligned_to(sizes[0], 8)) && !(is_aligned_to(sizes[1], 32))) {
+        if (this->format == cldnn::format::os_is_yx_isa8_osv8_isv4 && !(is_aligned_to(sizes[0], 8)) && !(is_aligned_to(sizes[1], 32))) {
             sizes[0] = align_to(sizes[0], 8);
             sizes[1] = align_to(sizes[1], 32);
         } else if (this->format == cldnn::format::os_is_yx_isa8_osv8_isv4_swizzled_by_4 && !(is_aligned_to(sizes[0], 32)) && !(is_aligned_to(sizes[1], 32))) {
index 4c89063..71c7c3e 100644 (file)
@@ -105,7 +105,6 @@ struct format {
         bs_fs_zyx_bsv16_fsv16,                  ///< format used for 3D blocked convolution (batch and features blocked by 16)
         bs_fs_yx_bsv16_fsv16,                   ///< format used for 2D blocked convolution (batch and features blocked by 16)
         fs_b_yx_fsv32,                          ///< format for input for fp16 primitives
-        fs_bs_yx_bsv4_fsv32,                    ///< format for batched input for primitives using MMAD
         b_fs_yx_fsv4,                           ///< format for input for IMAD convolutions
         bs_xs_xsv8_bsv8,                        ///< format used only for fully connected weights: bs - batch slice,
                                                 ///< xs - x slice, bsv8 - 8 values of single slice.
@@ -114,10 +113,6 @@ struct format {
         bs_x_bsv16,                             ///< format used only for fully connected weights fp16 batch=1 : bs - batch slice
                                                 ///< (responses slice), bsv16 - 16 values of single batch slice, x - flattened plane of (fyx)
                                                 ///< \n \image html bs_x_bsv16.jpg
-        byxf_af32,                              ///< format for input for primitives using MMAD
-        byx8_f4,                                ///< format for input for MMAD convolutions
-        bf8_xy16,                               ///< format used only for convolution 1x1 input, xy aligned to 16, f aligned to 8
-                                                ///< \n \image html bf8_xy16.jpg
         b_fs_yx_32fp,                           ///< format for data for binary convolutions
         winograd_2x3_s1_data,                   ///< format used for input for winograd convolution, F(2,3) -- filter 3x3 with stride 1
         nv12,                                   ///< format for media nv12 input
@@ -227,11 +222,7 @@ struct format {
                 { bs_xs_xsv8_bsv8,       { 1, 1, 1, 0, 0, "bx",     "b?x??",  {{2, 8}, {0, 8}}}},
                 { bs_xs_xsv8_bsv16,      { 1, 1, 1, 0, 0, "bx",     "b?x??",  {{2, 8}, {0, 16}}}},
                 { bs_x_bsv16,            { 1, 1, 1, 0, 0, "bx",     "b?x??",  {{0, 16}}}},
-                { bf8_xy16,              { 1, 1, 2, 0, 0, "bfyx",   "bfxy?",  {{1, 8}}}},
                 { winograd_2x3_s1_data,  { 1, 1, 2, 0, 0, "bxyf",   "bfxy?",  {}}},
-                { byxf_af32,             { 1, 1, 2, 0, 0, "byxf",   "bfxy?",  {}}},
-                { byx8_f4 ,              { 1, 1, 2, 0, 0, "byxf",   "bfxy?",  {}}},
-                { fs_bs_yx_bsv4_fsv32,   { 1, 1, 2, 0, 0, "fbyx",   "bfxy?",  {{0, 4}, {1, 32}}}},
                 { b_fs_yx_fsv4,          { 1, 1, 2, 0, 0, "bfyx",   "bfxy?",  {{1, 4}}}},
                 { bfzyx,                 { 1, 1, 3, 0, 0, "bfzyx",  "bfxyz",  {}}},
                 { bfwzyx,                { 1, 1, 4, 0, 0, "bfwzyx", "bfxyzw", {}}},
@@ -943,23 +934,7 @@ public:
             adjusted_coords[external_axis] /= block_size;
         }
 
-        if (fmt == cldnn::format::byxf_af32 && !(is_aligned_to(my_sizes[3], 32))) {
-            my_sizes[3] = align_to(my_sizes[3], 32);
-        } else if (fmt == cldnn::format::byx8_f4 && (!(is_aligned_to(my_sizes[3], 4)) || !(is_aligned_to(my_sizes[2], 8)))) {
-            my_sizes[3] = align_to(my_sizes[3], 4);
-            my_sizes[2] = align_to(my_sizes[2], 8);
-        } else if (fmt == cldnn::format::bf8_xy16) {
-            // Special case of blocked format, where xy is treated as one flattened dimension
-            auto flat_xy = adjusted_coords[3] + adjusted_coords[2] * my_sizes[3];
-
-            my_sizes.push_back(16);
-            my_sizes[3] = ceil_div(my_sizes[2] * my_sizes[3], 16);
-            my_sizes[2] = 1;
-
-            adjusted_coords.push_back(flat_xy % 16);
-            adjusted_coords[3] = flat_xy / 16;
-            adjusted_coords[2] = 0;
-        } else if (fmt == cldnn::format::os_is_yx_isa8_osv8_isv4 &&  // TODO Fix offsets calculation for formats below
+        if (fmt == cldnn::format::os_is_yx_isa8_osv8_isv4 &&  // TODO Fix offsets calculation for formats below
                    !(is_aligned_to(my_sizes[0], 8)) &&
                    !(is_aligned_to(my_sizes[1], 32))) {
             my_sizes[0] = align_to(my_sizes[0], 8);
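For reference, the removed bf8_xy16 branch was the only case that reshaped the coordinate space rather than merely padding it: x and y were flattened into a single dimension and re-split into blocks of 16. A standalone sketch of that mapping, with hypothetical coordinates, following the deleted flat_xy arithmetic above:

    #include <cstddef>

    struct XYBlock { std::size_t outer, inner; };

    // Flatten (y, x) row-major, then split into a block index and an
    // intra-block offset, as the deleted bf8_xy16 path did.
    XYBlock bf8_xy16_split(std::size_t x, std::size_t y, std::size_t size_x) {
        std::size_t flat_xy = x + y * size_x;
        return {flat_xy / 16, flat_xy % 16};
    }

    int main() {
        // On a 7-wide plane, (y=3, x=5) -> flat 26 -> block 1, offset 10.
        XYBlock b = bf8_xy16_split(5, 3, 7);
        return (b.outer == 1 && b.inner == 10) ? 0 : 1;
    }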
index 217b11f..e8f3b67 100644 (file)
@@ -42,11 +42,7 @@ DataTensor::DataChannelArray DataTensor::dataChannelArray {{
     { DataLayout::bs_fs_yx_bsv16_fsv16,  {  0,  1, -1, -1,  2,  3 } },
     { DataLayout::bs_f_bsv8__af8,        { -1, -1, -1, -1,  0,  1 } },
     { DataLayout::bs_f_bsv16__af8,       { -1, -1, -1, -1,  0,  1 } },
-    { DataLayout::bf8_xy16,              {  0,  1, -1, -1,  2,  3 } },
     { DataLayout::winograd_2x3_s1_data,  {  2,  1, -1, -1,  0,  3 } },
-    { DataLayout::byxf_af32,             {  1,  2, -1, -1,  0,  3 } },
-    { DataLayout::byx8_f4,               {  1,  2, -1, -1,  0,  3 } },
-    { DataLayout::fs_bs_yx_bsv4_fsv32,   {  0,  1, -1, -1,  3,  2 } },
     { DataLayout::b_fs_yx_fsv4,          {  0,  1, -1, -1,  2,  3 } },
     { DataLayout::bfzyx,                 {  0,  1,  2, -1,  3,  4 } },
     { DataLayout::fs_b_yx_fsv32,         {  0,  1, -1, -1,  3,  2 } },
@@ -167,26 +163,6 @@ NDims DataTensor::GetSimpleDims(const std::vector<size_t>& d, DataLayout l) {
             assert(newDims.size() == 5);
             newDims[3] = RoundUp(newDims[3], 32);
             break;
-        case bf8_xy16:
-            assert(newDims.size() == 4);
-            newDims[1] = RoundUp(newDims[1], 8);
-            newDims[3] = RoundUp(newDims[2] * newDims[3], 16);
-            newDims[2] = 1;
-            break;
-        case byxf_af32:
-            assert(newDims.size() == 4);
-            newDims[0] = RoundUp(newDims[0], 32);
-            break;
-        case byx8_f4:
-            assert(newDims.size() == 4);
-            newDims[0] = RoundUp(newDims[0], 4);
-            newDims[1] = RoundUp(newDims[1], 8);
-            break;
-        case fs_bs_yx_bsv4_fsv32:
-            assert(newDims.size() == 4);
-            newDims[3] = RoundUp(newDims[3], 32);
-            newDims[2] = RoundUp(newDims[2], 4);
-            break;
         case b_fs_yx_32fp:
             assert(newDims.size() == 4);
             newDims[3] = RoundUp(newDims[3], 32);
@@ -222,14 +198,6 @@ NDims DataTensor::GetSimpleDims(const std::vector<size_t>& d, DataLayout l) {
         pitch *= newDims[i];
     }
 
-    if (l == byxf_af32 || l == fs_bs_yx_bsv4_fsv32 || l == byx8_f4) {
-        ret[0].pitch = 1;
-        ret[1].pitch = ret[0].pitch * newDims[0];
-        ret[2].pitch = ret[1].pitch * newDims[1];
-        ret[3].pitch = ret[2].pitch * newDims[2];
-        ret[4].pitch = ret[3].pitch * newDims[3];
-    }
-
     return ret;
 }
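The RoundUp calls in this function pad a logical dimension up to the layout's block size (e.g. features to 32 for b_fs_yx_32fp). A short sketch of the helper's semantics, assuming the usual integer-ceiling definition:

    #include <cassert>
    #include <cstddef>

    // RoundUp(v, m): smallest multiple of m that is >= v (assumed semantics).
    std::size_t RoundUp(std::size_t v, std::size_t m) {
        return ((v + m - 1) / m) * m;
    }

    int main() {
        assert(RoundUp(3, 4) == 4);     // 3 features padded to one fsv4 slice
        assert(RoundUp(33, 32) == 64);  // 33 features need two 32-wide slices
        return 0;
    }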
 
index 3a3ff41..af5c8a5 100644 (file)
@@ -51,11 +51,7 @@ enum DataLayout {
     bs_fs_zyx_bsv16_fsv16,  // batch, feature, 3D spatial. Blocks of 16 batch and channels
     bs_f_bsv8__af8,         // for optimized FC
     bs_f_bsv16__af8,        // for optimized FC
-    bf8_xy16,               // for optimized conv1x1
     winograd_2x3_s1_data,   // winograd convolution input, F(2,3) -- filter 3x3 with stride 1
-    byxf_af32,              // for MMAD convolution
-    byx8_f4,                // for MMAD convolution
-    fs_bs_yx_bsv4_fsv32,    // for batched MMAD
     b_fs_yx_fsv4,           // reordering format for swizzled input for convolution using IMAD
     bfzyx,                  // batch+feature+3D spatial
     fs_b_yx_fsv32,          // for FP16 kernels, 32 features to avoid partial writes
index 77dc1f1..fa5cd64 100644 (file)
@@ -41,7 +41,6 @@ ParamsKey ConcatenationKernelRef::GetSupportedKey() const {
     k.EnableInputLayout(DataLayout::byxf);
     k.EnableInputLayout(DataLayout::fyxb);
     k.EnableInputLayout(DataLayout::b_fs_yx_fsv16);
-    k.EnableInputLayout(DataLayout::byxf_af32);
     k.EnableInputLayout(DataLayout::b_fs_yx_fsv4);
     k.EnableInputLayout(DataLayout::b_fs_yx_fsv32);
     k.EnableOutputLayout(DataLayout::bf);
@@ -51,7 +50,6 @@ ParamsKey ConcatenationKernelRef::GetSupportedKey() const {
     k.EnableOutputLayout(DataLayout::byxf);
     k.EnableOutputLayout(DataLayout::fyxb);
     k.EnableOutputLayout(DataLayout::b_fs_yx_fsv16);
-    k.EnableOutputLayout(DataLayout::byxf_af32);
     k.EnableOutputLayout(DataLayout::b_fs_yx_fsv4);
     k.EnableOutputLayout(DataLayout::b_fs_yx_fsv32);
     k.EnableTensorOffset();
index 7874629..b016fe7 100644 (file)
@@ -25,11 +25,9 @@ ParamsKey ConvolutionKernel_bfyx_1x1::GetSupportedKey() const {
     k.EnableOutputDataType(Datatype::F32);
     k.EnableInputWeightsType(WeightsType::F16);
     k.EnableInputWeightsType(WeightsType::F32);
-    k.EnableInputLayout(DataLayout::bf8_xy16);
     k.EnableInputLayout(DataLayout::bfyx);
     k.EnableOutputLayout(DataLayout::bfyx);
     k.EnableOutputLayout(DataLayout::yxfb);
-    k.EnableOutputLayout(DataLayout::bf8_xy16);
     k.EnableTensorOffset();
     k.EnableTensorPitches();
     k.EnableDilation();
index cb1d2d3..dd193c2 100644 (file)
@@ -82,7 +82,6 @@ ParamsKey ConvolutionKernel_imad::GetSupportedKey() const {
     k.EnableInputLayout(DataLayout::b_fs_yx_fsv4);
 
     k.EnableOutputLayout(DataLayout::b_fs_yx_fsv4);
-    k.EnableOutputLayout(DataLayout::byxf_af32);
     k.EnableOutputLayout(DataLayout::b_fs_yx_fsv16);
     k.EnableOutputLayout(DataLayout::bs_fs_yx_bsv16_fsv16);
 
@@ -116,8 +115,6 @@ JitConstants ConvolutionKernel_imad::GetJitConstants(const convolution_params& p
         in_fsv = 4;
     else if (params.inputs[0].GetLayout() == DataLayout::b_fs_yx_fsv16)
         in_fsv = 16;
-    else if (params.inputs[0].GetLayout() == DataLayout::byxf_af32)
-        in_fsv = 32;
 
     mem_consts.AddConstants({
         MakeJitConstant("_ID", RoundUp(input.Feature().v, in_fsv)),
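Worked example of the retained padding: with b_fs_yx_fsv4 input (in_fsv = 4) and 3 input feature maps, _ID becomes RoundUp(3, 4) = 4, so the kernel always reads whole 4-feature slices; with b_fs_yx_fsv16 and 20 features it becomes RoundUp(20, 16) = 32.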
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_imad_byxf_af32_1x1.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_imad_byxf_af32_1x1.cpp
deleted file mode 100644 (file)
index 6fa7131..0000000
+++ /dev/null
@@ -1,138 +0,0 @@
-/*
-// Copyright (c) 2019-2020 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-*/
-
-#include "convolution_kernel_imad_byxf_af32_1x1.h"
-
-static size_t GetTileLength(size_t out_xy, size_t out_f, size_t min_threads) {
-    for (int tile_len = 14; tile_len > 0; tile_len--) {
-        // Kernel writes 32 output features per HW thread
-        size_t threads = (out_xy / tile_len) * out_xy * out_f / 32;
-        // Chose largest valid tile with enough HW threads
-        if ((out_xy % tile_len == 0) && (threads >= min_threads)) {
-            return tile_len;
-        }
-    }
-    return 1;
-}
-
-namespace kernel_selector {
-
-ParamsKey ConvolutionKernel_imad_byxf_af32_1x1::GetSupportedKey() const {
-    ParamsKey k;
-    k.EnableInputDataType(Datatype::INT8);
-    k.EnableOutputDataType(Datatype::INT8);
-    k.EnableInputDataType(Datatype::UINT8);
-    k.EnableOutputDataType(Datatype::UINT8);
-    k.EnableInputWeightsType(WeightsType::INT8);
-    k.EnableInputLayout(DataLayout::byxf_af32);
-    k.EnableOutputLayout(DataLayout::byxf_af32);
-    k.EnableDifferentTypes();
-    k.EnableDifferentInputWeightsTypes();
-    k.EnableTensorOffset();
-    k.EnableTensorPitches();
-    k.EnableDilation();
-    k.EnableBiasPerFeature();
-    k.EnableBiasPerOutput();
-    k.EnableNonBiasTerm();
-    k.EnableBatching();
-    k.EnableQuantization(QuantizationType::SYMMETRIC);
-    k.DisableTuning();
-    return k;
-}
-
-bool ConvolutionKernel_imad_byxf_af32_1x1::Validate(const Params& p, const optional_params& o) const {
-    if (!Parent::Validate(p, o)) {
-        return false;
-    }
-
-    const auto& params = static_cast<const convolution_params&>(p);
-
-    if (params.filterSize.x != 1 || params.filterSize.y != 1)
-        return false;
-
-    if (params.padding.x != 0 || params.padding.y != 0)
-        return false;
-
-    if (params.output.Feature().v % 32 != 0)
-        return false;
-
-    const auto& input = params.inputs[0];
-
-    // we do not support padded input
-    if (input.X().pad.Total() != 0 || input.Y().pad.Total() != 0)
-        return false;
-
-    if (params.split != 1)
-        return false;
-
-    if (params.groups != 1)
-        return false;
-
-    return true;
-}
-
-ConvolutionKernelBase::DispatchData ConvolutionKernel_imad_byxf_af32_1x1::SetDefault(const convolution_params& arg,
-                                                                                int) const {
-    DispatchData runInfo = Parent::SetDefault(arg);
-
-    // Sub-group size
-    constexpr size_t sub_group_size = 8;
-
-    const auto of_maps = arg.output.Feature().v;
-    const size_t of_maps_per_batch = RoundUp(of_maps, 32);
-    const size_t of_maps_total = of_maps_per_batch * arg.output.Batch().v;
-
-    // Need to have at least 4 HW threads per EU
-    const size_t tile_length = GetTileLength(arg.output.X().v, of_maps_total, arg.engineInfo.computeUnitsCount * 4);
-    runInfo.cldnnStyle.blockWidth = tile_length;
-
-    runInfo.efficiency = FORCE_PRIORITY_1;
-
-    runInfo.gws0 = arg.output.X().v * arg.output.Y().v / tile_length;
-    runInfo.gws1 = of_maps_total / 4;  // TILE_DEPTH==4
-    runInfo.gws2 = 1;
-
-    runInfo.lws0 = 1;
-    runInfo.lws1 = sub_group_size;
-    runInfo.lws2 = 1;
-
-    return runInfo;
-}
-
-JitConstants ConvolutionKernel_imad_byxf_af32_1x1::GetJitConstants(const convolution_params& params,
-                                                              const DispatchData& runInfo) const {
-    auto jit = Parent::GetJitConstants(params, runInfo);
-
-    jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", runInfo.lws1));
-    jit.AddConstant(MakeJitConstant("TILE_LENGTH", runInfo.cldnnStyle.blockWidth));
-    jit.AddConstant(MakeJitConstant("TILE_DEPTH", 4));
-
-    jit.Merge(MakeTypeJitConstants(GetPackedInputType(params), "PACKED"));
-
-    if (!params.fused_ops.empty()) {
-        auto input_dt = GetActivationType(params);
-        FusedOpsConfiguration conf_scalar = {"", {"b", "f2", "y", "(x+i)"}, "res", input_dt, 1 };
-        jit.Merge(MakeFusedOpsJitConstants(params, {conf_scalar}));
-    }
-
-    return jit;
-}
-
-KernelsData ConvolutionKernel_imad_byxf_af32_1x1::GetKernelsData(const Params& params,
-                                                            const optional_params& options) const {
-    return GetTunedKernelsDataByIndex(params, options);
-}
-}  // namespace kernel_selector
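For the record, the tile heuristic in the file deleted above scanned tile lengths from 14 down to 1 and kept the first that both divided the output width and produced enough HW threads. Worked example with hypothetical sizes: out_xy = 28, out_f = 256 and min_threads = 96 (24 EUs x 4) give, at tile_len = 14, threads = (28 / 14) * 28 * 256 / 32 = 448 >= 96, and 28 % 14 == 0, so the largest valid tile of 14 is selected.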
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_imad_byxf_af32_1x1.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_imad_byxf_af32_1x1.h
deleted file mode 100644 (file)
index 48911ad..0000000
+++ /dev/null
@@ -1,45 +0,0 @@
-// Copyright (c) 2019 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "convolution_kernel_base.h"
-#include <vector>
-
-namespace kernel_selector {
-
-class ConvolutionKernel_imad_byxf_af32_1x1 : public ConvolutionKernelBase {
-public:
-    using Parent = ConvolutionKernelBase;
-    ConvolutionKernel_imad_byxf_af32_1x1() : ConvolutionKernelBase("fused_conv_eltwise_gpu_af32_imad_1x1") {}
-    virtual ~ConvolutionKernel_imad_byxf_af32_1x1() {}
-
-    KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
-    ParamsKey GetSupportedKey() const override;
-
-protected:
-    JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
-    DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override;
-    bool Validate(const Params& p, const optional_params& o) const override;
-    WeightsLayout GetPreferredWeightsLayout(const convolution_params &) const override {
-        return WeightsLayout::os_is_osv32_isv32_swizzled_by_4;
-    }
-    std::vector<FusedOpType> GetSupportedFusedOps() const override {
-        return { FusedOpType::ELTWISE,
-                 FusedOpType::QUANTIZE,
-                 FusedOpType::SCALE,
-                 FusedOpType::ACTIVATION };
-    }
-};
-}  // namespace kernel_selector
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_imad_byxf_af32_depthwise.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_imad_byxf_af32_depthwise.cpp
deleted file mode 100644 (file)
index 002c54d..0000000
+++ /dev/null
@@ -1,143 +0,0 @@
-/*
-// Copyright (c) 2016-2020 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-*/
-
-#include "convolution_kernel_imad_byxf_af32_depthwise.h"
-
-#define SIMD_SIZE 16
-
-namespace kernel_selector {
-
-ParamsKey ConvolutionKernel_imad_byxf_af32_depthiwise::GetSupportedKey() const {
-    ParamsKey k;
-    k.EnableInputDataType(Datatype::INT8);
-    k.EnableInputDataType(Datatype::UINT8);
-    k.EnableOutputDataType(Datatype::INT8);
-    k.EnableOutputDataType(Datatype::UINT8);
-    k.EnableOutputDataType(Datatype::F32);
-    k.EnableOutputDataType(Datatype::F16);
-    k.EnableInputWeightsType(WeightsType::INT8);
-    k.EnableInputLayout(DataLayout::byxf_af32);
-    k.EnableOutputLayout(DataLayout::byxf_af32);
-    k.EnableOutputLayout(DataLayout::b_fs_yx_fsv4);
-    k.EnableTensorOffset();
-    k.EnableTensorPitches();
-    k.EnableDilation();
-    k.EnableBiasPerFeature();
-    k.EnableBiasPerOutput();
-    k.EnableNonBiasTerm();
-    k.EnableBatching();
-    k.EnableSplitSupport();
-    k.EnableQuantization(QuantizationType::SYMMETRIC);
-    k.EnableDepthwiseSeparableOpt();
-    k.EnableDifferentTypes();
-    k.EnableDifferentInputWeightsTypes();
-    k.DisableTuning();
-    k.EnableGroupedConvolution();
-    return k;
-}
-
-static size_t GetTileLength(size_t out_x) {
-    for (int i = 20; i >= 1; i--) {
-        if (out_x % i == 0)
-            return i;
-    }
-    return 1;
-}
-
-static int GetSplit(size_t out_x, int stride) {
-    if (out_x >= 75) {
-        if (stride > 1)
-            return 1;
-        else
-            return 3;
-    }
-
-    if (out_x == 38 && stride == 2)
-        return 2;
-
-    if (out_x < 75) {
-        if (stride > 1)
-            return 1;
-        else if (out_x % 2 == 0)
-            return 2;
-    }
-    return 1;
-}
-
-bool ConvolutionKernel_imad_byxf_af32_depthiwise::Validate(const Params& p, const optional_params& o) const {
-    if (!Parent::Validate(p, o)) {
-        return false;
-    }
-
-    const convolution_params& cp = static_cast<const convolution_params&>(p);
-    if (cp.inputs[0].Feature().v != cp.groups || cp.output.Feature().v != cp.groups || cp.groups == 1) {
-        return false;
-    }
-
-    return true;
-}
-
-ConvolutionKernelBase::DispatchData ConvolutionKernel_imad_byxf_af32_depthiwise::SetDefault(const convolution_params& arg,
-    int) const {
-    DispatchData runInfo = Parent::SetDefault(arg);
-
-    runInfo.efficiency = FORCE_PRIORITY_1;
-
-    runInfo.gws0 = Align(arg.output.Feature().v, SIMD_SIZE) * arg.output.Batch().v;
-    runInfo.gws1 = arg.output.X().v / GetTileLength(arg.output.X().v);
-    runInfo.gws2 = CeilDiv(arg.output.Y().v, GetSplit(arg.output.Y().v, arg.stride.y));
-
-    std::vector<size_t> local = { SIMD_SIZE, 1, 1 };
-
-    runInfo.lws0 = local[0];
-    runInfo.lws1 = local[1];
-    runInfo.lws2 = local[2];
-
-    return runInfo;
-}
-
-JitConstants ConvolutionKernel_imad_byxf_af32_depthiwise::GetJitConstants(const convolution_params& params,
-                                                            const DispatchData& runInfo) const {
-    auto jit = Parent::GetJitConstants(params, runInfo);
-
-    jit.AddConstant(MakeJitConstant("ALIGNED_OFM", Align(params.output.Feature().v, SIMD_SIZE)));
-    jit.AddConstant(MakeJitConstant("OUT_BLOCK_WIDTH", GetTileLength(params.output.X().v)));
-    jit.AddConstant(MakeJitConstant("SPLIT_Y", GetSplit(params.output.Y().v, params.stride.y)));
-    jit.AddConstant(MakeJitConstant("SIMD_SIZE", SIMD_SIZE));
-
-    if (params.output.Y().v % GetSplit(params.output.Y().v, params.stride.y) != 0)
-        jit.AddConstant(MakeJitConstant("SPLIT_LEFTOVERS", params.output.Y().v % GetSplit(params.output.Y().v, params.stride.y)));
-
-    if (!params.fused_ops.empty()) {
-        auto input_dt = GetActivationType(params);
-        FusedOpsConfiguration conf_scalar = {"", {"b", "of", "(y+m)", "(x+l)"}, "res", input_dt, 1 };
-        conf_scalar.SetLoopAxes({Tensor::DataChannelName::Y, Tensor::DataChannelName::X});
-        jit.Merge(MakeFusedOpsJitConstants(params, {conf_scalar}));
-    }
-
-    return jit;
-}
-
-
-KernelsData ConvolutionKernel_imad_byxf_af32_depthiwise::GetKernelsData(const Params& params,
-                                                                   const optional_params& options) const {
-    KernelsData kd = GetTunedKernelsDataByIndex(params, options);
-    if (!kd.empty())
-        kd[0].estimatedTime = FORCE_PRIORITY_1;
-    return kd;
-}
-
-}  // namespace kernel_selector
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_imad_byxf_af32_depthwise.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_imad_byxf_af32_depthwise.h
deleted file mode 100644 (file)
index 0e70c95..0000000
+++ /dev/null
@@ -1,46 +0,0 @@
-// Copyright (c) 2016 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#pragma once
-
-#include "convolution_kernel_base.h"
-#include <vector>
-
-namespace kernel_selector {
-
-class ConvolutionKernel_imad_byxf_af32_depthiwise : public ConvolutionKernelBase {
-public:
-    using Parent = ConvolutionKernelBase;
-    ConvolutionKernel_imad_byxf_af32_depthiwise() : ConvolutionKernelBase("convolution_gpu_byxf_af32_depthwise") {}
-    virtual ~ConvolutionKernel_imad_byxf_af32_depthiwise() {}
-
-    KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
-    ParamsKey GetSupportedKey() const override;
-
-protected:
-    JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
-    DispatchData SetDefault(const convolution_params& params, int autoTuneIndex = -1) const override;
-    bool Validate(const Params& p, const optional_params& o) const override;
-    WeightsLayout GetPreferredWeightsLayout(const convolution_params &) const override {
-        return WeightsLayout::goiyx;
-    }
-    std::vector<FusedOpType> GetSupportedFusedOps() const override {
-        return { FusedOpType::ELTWISE,
-                 FusedOpType::QUANTIZE,
-                 FusedOpType::SCALE,
-                 FusedOpType::ACTIVATION };
-    }
-};
-}  // namespace kernel_selector
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad.cpp
deleted file mode 100644 (file)
index 5d96d9f..0000000
+++ /dev/null
@@ -1,93 +0,0 @@
-/*
-// Copyright (c) 2016-2020 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-*/
-
-#include "convolution_kernel_mmad.h"
-
-namespace kernel_selector {
-
-ParamsKey ConvolutionKernel_mmad::GetSupportedKey() const {
-    ParamsKey k;
-    k.EnableInputDataType(Datatype::INT8);
-    k.EnableInputDataType(Datatype::UINT8);
-    k.EnableOutputDataType(Datatype::INT8);
-    k.EnableOutputDataType(Datatype::UINT8);
-    k.EnableInputWeightsType(WeightsType::INT8);
-    k.EnableInputLayout(DataLayout::byxf_af32);
-    k.EnableOutputLayout(DataLayout::byxf_af32);
-    k.EnableTensorOffset();
-    k.EnableTensorPitches();
-    k.EnableDilation();
-    k.EnableBiasPerFeature();
-    k.EnableBiasPerOutput();
-    k.EnableNonBiasTerm();
-    k.EnableBatching();
-    k.EnableSplitSupport();
-    k.EnableQuantization(QuantizationType::SYMMETRIC);
-    k.EnableDifferentInputWeightsTypes();
-    k.DisableTuning();
-    k.EnableDifferentTypes();
-    return k;
-}
-
-ConvolutionKernelBase::DispatchData ConvolutionKernel_mmad::SetDefault(const convolution_params& arg, int) const {
-    DispatchData runInfo = ConvolutionKernelBase::SetDefault(arg);
-
-    constexpr size_t sub_group_size = 8;
-
-    const auto of_maps = arg.output.Feature().v;
-    const size_t of_threads_per_batch = RoundUp(of_maps, sub_group_size);
-
-    runInfo.efficiency = FORCE_PRIORITY_4;
-
-    runInfo.gws0 = arg.output.X().v;
-    runInfo.gws1 = arg.output.Y().v;
-    runInfo.gws2 = of_threads_per_batch * arg.output.Batch().v;
-
-    runInfo.lws0 = 1;
-    runInfo.lws1 = 1;
-    runInfo.lws2 = sub_group_size;
-
-    return runInfo;
-}
-
-JitConstants ConvolutionKernel_mmad::GetJitConstants(const convolution_params& params,
-                                                     const DispatchData& runInfo) const {
-    auto jit = Parent::GetJitConstants(params, runInfo);
-
-    jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", runInfo.lws2));
-
-    // pitch for special block format used in this kernel
-    const size_t ifm_32_aligned = Align(params.weights.IFM().v, 32);
-    const size_t filter_ofm_block_pitch =
-        (ifm_32_aligned / 32) * params.weights.X().v * params.weights.Y().v * 4 * 8 * 8;
-    jit.AddConstant(MakeJitConstant("FILTER_OFM_BLOCK_PITCH", filter_ofm_block_pitch));
-
-    jit.Merge(MakeTypeJitConstants(GetPackedInputType(params), "PACKED"));
-    if (!params.fused_ops.empty()) {
-        auto input_dt = GetActivationType(params);
-        FusedOpsConfiguration conf_scalar = {"", {"b", "f", "y", "x"}, "res", input_dt, 1 };
-        jit.Merge(MakeFusedOpsJitConstants(params, {conf_scalar}));
-    }
-    return jit;
-}
-
-KernelsData ConvolutionKernel_mmad::GetKernelsData(const Params& params, const optional_params& options) const {
-    KernelsData kd = GetTunedKernelsDataByIndex(params, options);
-    if (!kd.empty())
-        kd[0].estimatedTime = FORCE_PRIORITY_4;
-    return kd;
-}
-}  // namespace kernel_selector
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad.h
deleted file mode 100644 (file)
index b5479ba..0000000
+++ /dev/null
@@ -1,45 +0,0 @@
-// Copyright (c) 2016 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#pragma once
-
-#include "convolution_kernel_base.h"
-#include <vector>
-
-namespace kernel_selector {
-
-class ConvolutionKernel_mmad : public ConvolutionKernelBase {
-public:
-    using Parent = ConvolutionKernelBase;
-    ConvolutionKernel_mmad() : ConvolutionKernelBase("convolution_gpu_mmad") {}
-    virtual ~ConvolutionKernel_mmad() {}
-
-    KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
-    ParamsKey GetSupportedKey() const override;
-
-protected:
-    JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
-    DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override;
-    WeightsLayout GetPreferredWeightsLayout(const convolution_params &) const override {
-        return WeightsLayout::os_is_yx_isa8_osv8_isv4;
-    }
-    std::vector<FusedOpType> GetSupportedFusedOps() const override {
-        return { FusedOpType::ELTWISE,
-                 FusedOpType::QUANTIZE,
-                 FusedOpType::SCALE,
-                 FusedOpType::ACTIVATION };
-    }
-};
-}  // namespace kernel_selector
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_batched.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_batched.cpp
deleted file mode 100644 (file)
index aa41576..0000000
+++ /dev/null
@@ -1,95 +0,0 @@
-/*
-// Copyright (c) 2018-2020 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-*/
-
-#include "convolution_kernel_mmad_batched.h"
-
-namespace kernel_selector {
-
-ParamsKey ConvolutionKernel_mmad_batched::GetSupportedKey() const {
-    ParamsKey k;
-    k.EnableInputDataType(Datatype::INT8);
-    k.EnableOutputDataType(Datatype::INT8);
-    k.EnableInputWeightsType(WeightsType::INT8);
-    k.EnableInputLayout(DataLayout::fs_bs_yx_bsv4_fsv32);
-    k.EnableOutputLayout(DataLayout::fs_bs_yx_bsv4_fsv32);
-    k.EnableTensorOffset();
-    k.EnableTensorPitches();
-    k.EnableDilation();
-    k.EnableBiasPerFeature();
-    k.EnableNonBiasTerm();
-    k.EnableBatching();
-    k.EnableSplitSupport();
-    k.EnableQuantization(QuantizationType::SYMMETRIC);
-    k.DisableTuning();
-    return k;
-}
-
-ConvolutionKernelBase::DispatchData ConvolutionKernel_mmad_batched::SetDefault(const convolution_params& arg,
-                                                                               int) const {
-    DispatchData runInfo = ConvolutionKernelBase::SetDefault(arg);
-
-    constexpr size_t sub_group_size = 8;
-
-    const auto of_maps = arg.output.Feature().v;
-    const size_t of_threads_per_batch = RoundUp(of_maps, sub_group_size);
-
-    runInfo.efficiency = FORCE_PRIORITY_6;
-
-    runInfo.gws0 = arg.output.X().v;
-    runInfo.gws1 = arg.output.Y().v;
-    runInfo.gws2 = of_threads_per_batch * ((arg.output.Batch().v + 3) / 4);
-
-    runInfo.lws0 = 1;
-    runInfo.lws1 = 1;
-    runInfo.lws2 = sub_group_size;
-
-    return runInfo;
-}
-
-JitConstants ConvolutionKernel_mmad_batched::GetJitConstants(const convolution_params& params,
-                                                             const DispatchData& runInfo) const {
-    auto jit = Parent::GetJitConstants(params, runInfo);
-
-    jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", runInfo.lws2));
-
-    // pitch for special block format used in this kernel
-    const size_t ifm_32_aligned = Align(params.weights.IFM().v, 32);
-    const size_t filter_ofm_block_pitch =
-        (ifm_32_aligned / 32) * params.weights.X().v * params.weights.Y().v * 4 * 8 * 8;
-    jit.AddConstant(MakeJitConstant("FILTER_OFM_BLOCK_PITCH", filter_ofm_block_pitch));
-
-    const size_t in_x_pitch = 32 * 4;
-    const size_t in_y_pitch = 32 * 4 * params.inputs[0].X().LogicalDimPadded();
-    const size_t in_b_block_pitch = in_y_pitch * params.inputs[0].Y().LogicalDimPadded();
-    const size_t in_f_block_pitch = in_b_block_pitch * ((params.inputs[0].Batch().v + 3) / 4);
-    const size_t in_offset =
-        in_x_pitch * params.inputs[0].X().pad.before + in_y_pitch * params.inputs[0].Y().pad.before;
-
-    jit.AddConstant(MakeJitConstant("IN_X_PITCH", in_x_pitch));
-    jit.AddConstant(MakeJitConstant("IN_Y_PITCH", in_y_pitch));
-    jit.AddConstant(MakeJitConstant("IN_B_BLOCK_PITCH", in_b_block_pitch));
-    jit.AddConstant(MakeJitConstant("IN_F_BLOCK_PITCH", in_f_block_pitch));
-    jit.AddConstant(MakeJitConstant("IN_OFFSET", in_offset));
-    return jit;
-}
-
-KernelsData ConvolutionKernel_mmad_batched::GetKernelsData(const Params& params, const optional_params& options) const {
-    KernelsData kd = GetTunedKernelsDataByIndex(params, options);
-    if (!kd.empty())
-        kd[0].estimatedTime = FORCE_PRIORITY_6;
-    return kd;
-}
-}  // namespace kernel_selector
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_batched.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_batched.h
deleted file mode 100644 (file)
index d0dd79d..0000000
+++ /dev/null
@@ -1,39 +0,0 @@
-// Copyright (c) 2018 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#pragma once
-
-#include "convolution_kernel_base.h"
-#include <vector>
-
-namespace kernel_selector {
-
-class ConvolutionKernel_mmad_batched : public ConvolutionKernelBase {
-public:
-    using Parent = ConvolutionKernelBase;
-    ConvolutionKernel_mmad_batched() : ConvolutionKernelBase("convolution_gpu_mmad_batched") {}
-    virtual ~ConvolutionKernel_mmad_batched() {}
-
-    KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
-    ParamsKey GetSupportedKey() const override;
-
-protected:
-    JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
-    DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override;
-    WeightsLayout GetPreferredWeightsLayout(const convolution_params &) const override {
-        return WeightsLayout::os_is_yx_isa8_osv8_isv4;
-    }
-};
-}  // namespace kernel_selector
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_batched_block.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_batched_block.cpp
deleted file mode 100644 (file)
index 60d7c23..0000000
+++ /dev/null
@@ -1,165 +0,0 @@
-/*
-// Copyright (c) 2018-2020 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-*/
-
-#include "convolution_kernel_mmad_batched_block.h"
-#include "kernel_selector_utils.h"
-#include <vector>
-
-namespace kernel_selector {
-
-ParamsKey ConvolutionKernel_mmad_batched_block::GetSupportedKey() const {
-    ParamsKey k;
-    k.EnableInputDataType(Datatype::INT8);
-    k.EnableOutputDataType(Datatype::INT8);
-    k.EnableInputWeightsType(WeightsType::INT8);
-    k.EnableInputLayout(DataLayout::fs_bs_yx_bsv4_fsv32);
-    k.EnableOutputLayout(DataLayout::fs_bs_yx_bsv4_fsv32);
-    k.EnableTensorOffset();
-    k.EnableTensorPitches();
-    k.EnableBiasPerFeature();
-    k.EnableBatching();
-    k.EnableQuantization(QuantizationType::SYMMETRIC);
-    k.DisableTuning();
-    return k;
-}
-
-struct block_params {
-    int32_t out_width;
-    int32_t out_height;
-    int32_t out_depth;
-};
-
-static block_params get_out_block_size(const convolution_params& p) {
-    if (p.filterSize.x == 3 && p.filterSize.y == 3) {
-        if (p.output.X().v == 7)
-            return {7, 1, 4};
-        else if (p.output.X().v == 14)
-            return {7, 1, 4};
-        else if (p.output.X().v == 28)
-            return {7, 1, 4};
-        else if (p.output.X().v == 56)
-            return {8, 1, 4};
-    }
-
-    return {1, 1, 1};
-}
-
-WeightsLayout ConvolutionKernel_mmad_batched_block::GetPreferredWeightsLayout(
-        const convolution_params &cp) const {
-    auto block = get_out_block_size(cp);
-    if (block.out_depth == 4)
-        return WeightsLayout::os_is_yx_isa8_osv8_isv4_swizzled_by_4;
-    else
-        return WeightsLayout::os_is_yx_isa8_osv8_isv4;
-}
-
-bool ConvolutionKernel_mmad_batched_block::Validate(const Params& p, const optional_params& o) const {
-    if (!ConvolutionKernelBase::Validate(p, o) || !CovolutionCheckInput(p, o)) {
-        return false;
-    }
-    const convolution_params& cp = static_cast<const convolution_params&>(p);
-
-    // if block sizes are 1x1, then this algorithm is probably not the best
-    auto block = get_out_block_size(cp);
-    if (block.out_width == 1 && block.out_height == 1)
-        return false;
-
-    if (cp.output.X().v % block.out_width != 0)
-        return false;
-    if (cp.output.Y().v % block.out_height != 0)
-        return false;
-
-    if (cp.filterSize.x == 1)
-        return false;
-
-    return true;
-}
-
-size_t static get_wg_batch_count(const convolution_params& params) {
-    if (params.inputs[0].Batch().v % 64 == 0)
-        return 16;  // because we process 4 batches per SIMD
-    return 1;
-}
-
-ConvolutionKernelBase::DispatchData ConvolutionKernel_mmad_batched_block::SetDefault(const convolution_params& arg,
-                                                                                     int) const {
-    DispatchData runInfo = ConvolutionKernelBase::SetDefault(arg);
-
-    constexpr size_t sub_group_size = 8;
-
-    runInfo.efficiency = FORCE_PRIORITY_5;
-
-    auto block = get_out_block_size(arg);
-
-    runInfo.gws0 = arg.output.X().v / block.out_width;
-    runInfo.gws1 = arg.output.Y().v / block.out_height;
-    runInfo.gws2 = (arg.output.Feature().v) * ((arg.output.Batch().v + 3) / 4) /
-                   block.out_depth;  // process 4 output channels per Workitem
-
-    runInfo.lws0 = 1;
-    runInfo.lws1 = 1;
-    runInfo.lws2 = sub_group_size * get_wg_batch_count(arg);
-
-    return runInfo;
-}
-
-JitConstants ConvolutionKernel_mmad_batched_block::GetJitConstants(const convolution_params& params,
-                                                                   const DispatchData& runInfo) const {
-    auto jit = Parent::GetJitConstants(params, runInfo);
-
-    const int sub_group_size = 8;
-    jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", sub_group_size));
-
-    // pitch for special block format used in this kernel
-    const size_t ifm_32_aligned = Align(params.weights.IFM().v, 32);
-    const size_t filter_ofm_block_pitch =
-        (ifm_32_aligned / 32) * params.weights.X().v * params.weights.Y().v * 4 * 8 * 8;
-    jit.AddConstant(MakeJitConstant("FILTER_OFM_BLOCK_PITCH", filter_ofm_block_pitch));
-
-    const size_t in_x_pitch = 32 * 4;
-    const size_t in_y_pitch = 32 * 4 * params.inputs[0].X().LogicalDimPadded();
-    const size_t in_b_block_pitch = in_y_pitch * params.inputs[0].Y().LogicalDimPadded();
-    const size_t in_f_block_pitch = in_b_block_pitch * ((params.inputs[0].Batch().v + 3) / 4);
-    const size_t in_offset =
-        in_x_pitch * params.inputs[0].X().pad.before + in_y_pitch * params.inputs[0].Y().pad.before;
-
-    jit.AddConstant(MakeJitConstant("IN_X_PITCH", in_x_pitch));
-    jit.AddConstant(MakeJitConstant("IN_Y_PITCH", in_y_pitch));
-    jit.AddConstant(MakeJitConstant("IN_B_BLOCK_PITCH", in_b_block_pitch));
-    jit.AddConstant(MakeJitConstant("IN_F_BLOCK_PITCH", in_f_block_pitch));
-    jit.AddConstant(MakeJitConstant("IN_OFFSET", in_offset));
-
-    const size_t out_x_pitch = 32 * 4;
-    jit.AddConstant(MakeJitConstant("OUT_X_PITCH", out_x_pitch));
-
-    auto block = get_out_block_size(params);
-    jit.AddConstant(MakeJitConstant("OUT_BLOCK_WIDTH", block.out_width));
-    jit.AddConstant(MakeJitConstant("OUT_BLOCK_HEIGHT", block.out_height));
-    jit.AddConstant(MakeJitConstant("WEIGHTS_PER_WORKITEM", block.out_depth));
-
-    jit.AddConstant(MakeJitConstant("WG_BATCH_COUNT", get_wg_batch_count(params)));
-
-    return jit;
-}
-
-KernelsData ConvolutionKernel_mmad_batched_block::GetKernelsData(const Params& params,
-                                                                 const optional_params& options) const {
-    KernelsData kd = GetCommonKernelsData(params, options);
-    if (!kd.empty())
-        kd[0].estimatedTime = FORCE_PRIORITY_5;
-    return kd;
-}
-}  // namespace kernel_selector
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_batched_block.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_batched_block.h
deleted file mode 100644 (file)
index abc58ad..0000000
+++ /dev/null
@@ -1,38 +0,0 @@
-// Copyright (c) 2018 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#pragma once
-
-#include "convolution_kernel_base.h"
-#include <vector>
-
-namespace kernel_selector {
-
-class ConvolutionKernel_mmad_batched_block : public ConvolutionKernelBase {
-public:
-    using Parent = ConvolutionKernelBase;
-    ConvolutionKernel_mmad_batched_block() : ConvolutionKernelBase("convolution_gpu_mmad_batched_block") {}
-    virtual ~ConvolutionKernel_mmad_batched_block() {}
-
-    KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
-    ParamsKey GetSupportedKey() const override;
-
-protected:
-    bool Validate(const Params& p, const optional_params& o) const override;
-    JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
-    DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override;
-    WeightsLayout GetPreferredWeightsLayout(const convolution_params &) const override;
-};
-}  // namespace kernel_selector
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_batched_block_1x1.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_batched_block_1x1.cpp
deleted file mode 100644 (file)
index 5b271b0..0000000
+++ /dev/null
@@ -1,171 +0,0 @@
-/*
-// Copyright (c) 2018-2020 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-*/
-
-#include "convolution_kernel_mmad_batched_block_1x1.h"
-#include "kernel_selector_utils.h"
-#include <vector>
-
-namespace kernel_selector {
-
-ParamsKey ConvolutionKernel_mmad_batched_block_1x1::GetSupportedKey() const {
-    ParamsKey k;
-    k.EnableInputDataType(Datatype::INT8);
-    k.EnableOutputDataType(Datatype::INT8);
-    k.EnableInputWeightsType(WeightsType::INT8);
-    k.EnableInputLayout(DataLayout::fs_bs_yx_bsv4_fsv32);
-    k.EnableOutputLayout(DataLayout::fs_bs_yx_bsv4_fsv32);
-    k.EnableTensorOffset();
-    k.EnableTensorPitches();
-    k.EnableBiasPerFeature();
-    k.EnableBatching();
-    k.EnableQuantization(QuantizationType::SYMMETRIC);
-    k.DisableTuning();
-    return k;
-}
-
-struct block_params {
-    int32_t out_width;
-    int32_t out_height;
-    int32_t out_depth;
-};
-
-static block_params get_out_block_size(const convolution_params& p) {
-    if (p.output.X().v == 7)
-        return {7, 1, 4};
-    else if (p.output.X().v == 14)
-        return {7, 1, 4};
-    else if (p.output.X().v == 28)
-        return {4, 2, 4};
-    else if (p.output.X().v == 56)
-        return {8, 1, 4};
-
-    return {1, 1, 1};
-}
-
-WeightsLayout ConvolutionKernel_mmad_batched_block_1x1::GetPreferredWeightsLayout(
-        const convolution_params &cp) const {
-    auto block = get_out_block_size(cp);
-    if (block.out_depth == 4)
-        return WeightsLayout::os_is_yx_isa8_osv8_isv4_swizzled_by_4;
-    else
-        return WeightsLayout::os_is_yx_isa8_osv8_isv4;
-}
-
-bool ConvolutionKernel_mmad_batched_block_1x1::Validate(const Params& p, const optional_params& o) const {
-    if (!ConvolutionKernelBase::Validate(p, o) || !CovolutionCheckInput(p, o)) {
-        return false;
-    }
-    const convolution_params& cp = static_cast<const convolution_params&>(p);
-
-    // only for conv 1x1
-    if (cp.filterSize.x != 1 || cp.filterSize.y != 1)
-        return false;
-
-    // only for stride 1x1
-    if (cp.stride.x != 1 || cp.stride.y != 1)
-        return false;
-
-    // if block sizes are 1x1, then this algorithm is probably not the best
-    auto block = get_out_block_size(cp);
-    if (block.out_depth != 4)
-        return false;
-
-    if (cp.output.X().v % block.out_width != 0)
-        return false;
-    if (cp.output.Y().v % block.out_height != 0)
-        return false;
-
-    return true;
-}
-
-size_t static get_wg_batch_count(const convolution_params& params) {
-    if (params.inputs[0].Batch().v % 64 == 0)
-        return 16;  // because we process 4 batches per SIMD
-    return 1;
-}
-
-ConvolutionKernelBase::DispatchData ConvolutionKernel_mmad_batched_block_1x1::SetDefault(const convolution_params& arg,
-                                                                                         int) const {
-    DispatchData runInfo = ConvolutionKernelBase::SetDefault(arg);
-
-    constexpr size_t sub_group_size = 8;
-
-    runInfo.efficiency = FORCE_PRIORITY_3;
-
-    auto block = get_out_block_size(arg);
-
-    runInfo.gws0 = arg.output.X().v / block.out_width;
-    runInfo.gws1 = arg.output.Y().v / block.out_height;
-    runInfo.gws2 = (arg.output.Feature().v) * ((arg.output.Batch().v + 3) / 4) /
-                   block.out_depth;  // process 4 output channels per Workitem
-
-    runInfo.lws0 = 1;
-    runInfo.lws1 = 1;
-    runInfo.lws2 = sub_group_size * get_wg_batch_count(arg);
-
-    return runInfo;
-}
-
-JitConstants ConvolutionKernel_mmad_batched_block_1x1::GetJitConstants(const convolution_params& params,
-                                                                       const DispatchData& runInfo) const {
-    auto jit = Parent::GetJitConstants(params, runInfo);
-
-    const int sub_group_size = 8;
-    jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", sub_group_size));
-
-    // pitch for special block format used in this kernel
-    const size_t ifm_32_aligned = Align(params.weights.IFM().v, 32);
-    const size_t filter_ofm_block_pitch =
-        (ifm_32_aligned / 32) * params.weights.X().v * params.weights.Y().v * 4 * 8 * 8;
-    jit.AddConstant(MakeJitConstant("FILTER_OFM_BLOCK_PITCH", filter_ofm_block_pitch));
-
-    const size_t in_x_pitch = 32 * 4;
-    const size_t in_y_pitch = 32 * 4 * params.inputs[0].X().LogicalDimPadded();
-    const size_t in_b_block_pitch = in_y_pitch * params.inputs[0].Y().LogicalDimPadded();
-    const size_t in_f_block_pitch = in_b_block_pitch * ((params.inputs[0].Batch().v + 3) / 4);
-    const size_t in_offset =
-        in_x_pitch * params.inputs[0].X().pad.before + in_y_pitch * params.inputs[0].Y().pad.before;
-
-    jit.AddConstant(MakeJitConstant("IN_X_PITCH", in_x_pitch));
-    jit.AddConstant(MakeJitConstant("IN_Y_PITCH", in_y_pitch));
-    jit.AddConstant(MakeJitConstant("IN_B_BLOCK_PITCH", in_b_block_pitch));
-    jit.AddConstant(MakeJitConstant("IN_F_BLOCK_PITCH", in_f_block_pitch));
-    jit.AddConstant(MakeJitConstant("IN_OFFSET", in_offset));
-
-    const size_t out_x_pitch = 32 * 4;
-    const size_t out_y_pitch = 32 * 4 * params.output.X().LogicalDimPadded();
-
-    jit.AddConstant(MakeJitConstant("OUT_X_PITCH", out_x_pitch));
-    jit.AddConstant(MakeJitConstant("OUT_Y_PITCH", out_y_pitch));
-
-    auto block = get_out_block_size(params);
-    jit.AddConstant(MakeJitConstant("OUT_BLOCK_WIDTH", block.out_width));
-    jit.AddConstant(MakeJitConstant("OUT_BLOCK_HEIGHT", block.out_height));
-    jit.AddConstant(MakeJitConstant("WEIGHTS_PER_WORKITEM", block.out_depth));
-
-    jit.AddConstant(MakeJitConstant("WG_BATCH_COUNT", get_wg_batch_count(params)));
-
-    return jit;
-}
-
-KernelsData ConvolutionKernel_mmad_batched_block_1x1::GetKernelsData(const Params& params,
-                                                                     const optional_params& options) const {
-    KernelsData kd = GetCommonKernelsData(params, options, " -Dcl_intel_subgroups_char");
-    if (!kd.empty())
-        kd[0].estimatedTime = FORCE_PRIORITY_3;
-    return kd;
-}
-}  // namespace kernel_selector
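
For reference, the dispatch shape produced by SetDefault above works out as follows for an assumed 56x56, 256-feature, batch-64 tensor (values picked so that every division is exact; get_out_block_size returns {8, 1, 4} for X == 56):

#include <cassert>
#include <cstddef>

int main() {
    const std::size_t x = 56, y = 56, f = 256, b = 64;
    const std::size_t block_w = 8, block_h = 1, block_d = 4;
    const std::size_t gws0 = x / block_w;                  // 7
    const std::size_t gws1 = y / block_h;                  // 56
    const std::size_t gws2 = f * ((b + 3) / 4) / block_d;  // 256 * 16 / 4 = 1024
    const std::size_t wg_batch = (b % 64 == 0) ? 16 : 1;   // get_wg_batch_count()
    assert(gws0 == 7 && gws1 == 56 && gws2 == 1024);
    assert(8 * wg_batch == 128);                           // lws2 = sub_group_size * wg_batch
    return 0;
}
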
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_batched_block_1x1.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_batched_block_1x1.h
deleted file mode 100644 (file)
index 78db7d9..0000000
+++ /dev/null
@@ -1,38 +0,0 @@
-// Copyright (c) 2018 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#pragma once
-
-#include "convolution_kernel_base.h"
-#include <vector>
-
-namespace kernel_selector {
-
-class ConvolutionKernel_mmad_batched_block_1x1 : public ConvolutionKernelBase {
-public:
-    using Parent = ConvolutionKernelBase;
-    ConvolutionKernel_mmad_batched_block_1x1() : ConvolutionKernelBase("convolution_gpu_mmad_batched_block_1x1") {}
-    virtual ~ConvolutionKernel_mmad_batched_block_1x1() {}
-
-    KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
-    ParamsKey GetSupportedKey() const override;
-
-protected:
-    bool Validate(const Params& p, const optional_params& o) const override;
-    JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
-    DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override;
-    WeightsLayout GetPreferredWeightsLayout(const convolution_params &) const override;
-};
-}  // namespace kernel_selector
\ No newline at end of file
index 2aef384..f9110ac 100644 (file)
@@ -35,7 +35,6 @@ ParamsKey ConvolutionKernel_mmad_bfyx_to_b_fs_yx_fsv4::GetSupportedKey() const {
 
     k.EnableInputLayout(DataLayout::bfyx);
     k.EnableOutputLayout(DataLayout::b_fs_yx_fsv4);
-    k.EnableOutputLayout(DataLayout::byxf_af32);
     k.EnableTensorOffset();
     k.EnableTensorPitches();
     k.EnableDilation();
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_blocks.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_blocks.cpp
deleted file mode 100644 (file)
index 90ff761..0000000
+++ /dev/null
@@ -1,270 +0,0 @@
-// Copyright (c) 2016-2020 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "convolution_kernel_mmad_blocks.h"
-#include <vector>
-#include <utility>
-#include <string>
-#include <algorithm>
-
-namespace kernel_selector {
-ConvolutionKernel_mmad_blocks::ConvolutionKernel_mmad_blocks() : ConvolutionKernelBase("convolution_gpu_mmad_blocks") {
-    // Generate the dispatch options for the auto-tuner.
-    std::vector<size_t> blockWidthSizes = {1, 2, 4, 5, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32};
-    std::vector<size_t> blockHeightSizes = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
-    std::vector<size_t> prefetchSizes = {1, 2, 3, 4, 5, 6, 8, 10};
-    std::vector<std::string> executionModes = ConvolutionKernelBase::autoTuneOptions;
-    const size_t maxBlockSize = 240;
-    for (auto executionMode : executionModes) {
-        for (auto blockWidth : blockWidthSizes) {
-            for (auto blockHeight : blockHeightSizes) {
-                for (auto prefetch : prefetchSizes) {
-                    if (blockWidth * blockHeight <= maxBlockSize) {
-                        autoTuneOptions.emplace_back(AutoTuneOption{blockWidth, blockHeight, prefetch, executionMode});
-                    }
-                }
-            }
-        }
-    }
-}
-
-ParamsKey ConvolutionKernel_mmad_blocks::GetSupportedKey() const {
-    ParamsKey k;
-    k.EnableInputDataType(Datatype::INT8);
-    k.EnableInputDataType(Datatype::UINT8);
-
-    k.EnableOutputDataType(Datatype::INT8);
-    k.EnableOutputDataType(Datatype::UINT8);
-    k.EnableOutputDataType(Datatype::F32);
-    k.EnableOutputDataType(Datatype::F16);
-
-    k.EnableInputWeightsType(WeightsType::INT8);
-
-    k.EnableInputLayout(DataLayout::byxf_af32);
-    k.EnableOutputLayout(DataLayout::byxf_af32);
-    k.EnableTensorOffset();
-    k.EnableTensorPitches();
-//    k.EnableDilation(); TODO: Add dilation support
-    k.EnableBiasPerFeature();
-    k.EnableBiasPerOutput();
-    k.EnableNonBiasTerm();
-    k.EnableBatching();
-    k.EnableSplitSupport();
-    k.EnableQuantization(QuantizationType::SYMMETRIC);
-    k.EnableDifferentTypes();
-    k.EnableDifferentInputWeightsTypes();
-    k.DisableTuning();
-    return k;
-}
-
-bool ConvolutionKernel_mmad_blocks::Validate(const Params& p, const optional_params& o) const {
-    if (!Parent::Validate(p, o)) {
-        return false;
-    }
-
-    return true;
-}
-
-static void shrink_blocks_to_output_size(size_t output_x, size_t output_y, size_t& block_x, size_t& block_y) {
-    // how many elements we will compute in each dimension
-    size_t computed_x = Align(output_x, block_x);
-    size_t computed_y = Align(output_y, block_y);
-    // how many simds we need in each dimension
-    size_t simds_x = computed_x / block_x;
-    size_t simds_y = computed_y / block_y;
-    // how many unused values we have in each dimension
-    size_t unused_x = computed_x - output_x;
-    size_t unused_y = computed_y - output_y;
-
-    block_x -= unused_x / simds_x;
-    block_y -= unused_y / simds_y;
-}
-
-ConvolutionKernel_mmad_blocks::AutoTuneOption ConvolutionKernel_mmad_blocks::GetAutoTuneOptions(
-    const Params& p,
-    int autoTuneIndex) const {
-    if ((autoTuneIndex >= 0) && (autoTuneIndex < static_cast<int>(autoTuneOptions.size()))) {
-        return autoTuneOptions[autoTuneIndex];
-    }
-
-    // Sub-group size used by "convolution_gpu_mmad_blocks" kernel.
-    constexpr size_t sub_group_size = 16;
-
-    AutoTuneOption option = {0, 0, 0, DEFAULT};
-
-    const convolution_params& cp = static_cast<const convolution_params&>(p);
-
-    if (cp.stride.x == 1 && cp.stride.y == 1) {
-        if (cp.filterSize.x == 1 && cp.filterSize.y == 1) {
-            option.blockWidth = 16;
-            option.blockHeight = 1;
-            option.prefetch = 4;
-        // if fewer than 16 values are required to compute a single row of output,
-        // then each WI shall compute one whole row to maximize reuse within the SIMD subgroup
-        // (this gives very good performance results)
-        } else if (cp.output.X().v + (cp.filterSize.x - 1) * cp.dilation.x < sub_group_size) {
-            option.blockWidth = cp.output.X().v;
-            option.blockHeight = 1;
-            option.prefetch = 4;
-        } else if (cp.filterSize.x < 5 && cp.filterSize.y < 5) {
-            option.blockWidth = sub_group_size - cp.filterSize.x + 1;
-            option.blockHeight = 2;
-            option.prefetch = 4;
-        } else {
-            option.blockWidth = 4;
-            option.blockHeight = 3;
-            option.prefetch = 4;
-        }
-    } else if (cp.stride.x == 2 && cp.stride.y == 2) {
-        option.blockWidth = 5;
-        option.blockHeight = 4;
-        option.prefetch = 4;
-    } else {
-        option.blockWidth = 4;
-        option.blockHeight = 3;
-        option.prefetch = 5;
-        // run_info.efficiency = FORCE_PRIORITY_7; // GEMM is better
-    }
-
-    // if this is not the 1x1 batch-1 case then shrink the blocks; otherwise we're memory bound
-    // and it's best to use 16x1 block sizes
-    if (cp.filterSize.x != 1 || cp.filterSize.y != 1 || cp.output.Batch().v != 1) {
-        shrink_blocks_to_output_size(cp.output.X().v, cp.output.Y().v, option.blockWidth, option.blockHeight);
-    }
-
-    return option;
-}
-
-static std::pair<size_t, size_t> get_byxf_af32_req_input_block_dims(size_t output_block_width,
-                                                                    size_t output_block_height,
-                                                                    const uSize& filter_size,
-                                                                    const uSize& stride,
-                                                                    const uSize& dilation,
-                                                                    size_t sub_group_size = 8,
-                                                                    size_t read_chunk_size = 8,
-                                                                    size_t min_read_size = 8) {
-    assert(output_block_width > 0 && output_block_height > 0);
-    assert(stride.x > 0 && stride.y > 0);
-    assert(filter_size.x > 0 && filter_size.y > 0);
-
-    // Number of elements in X dimension needed from input to compute output block without re-reading input.
-    size_t input_block_req_width = (output_block_width - 1) * stride.x + (filter_size.x - 1) * dilation.x + 1;
-    // Number of elements in Y dimension needed from input to compute output block without re-reading input.
-    size_t input_block_req_height = (output_block_height - 1) * stride.y + (filter_size.y - 1) * dilation.y + 1;
-
-    // Required number of elements in X dimension rounded to nearest >= read chunk size.
-    size_t input_block_read_width = std::max(RoundUp(input_block_req_width, read_chunk_size), min_read_size);
-    // Number of sub-group-sized vectors of unit type needed to store input block.
-    size_t input_block_array_size = CeilDiv(input_block_req_height * input_block_read_width, sub_group_size);
-
-    // size of our array per work item (this overrides the sub-group-vector count computed above)
-    input_block_array_size = input_block_req_height * input_block_read_width;
-    return std::make_pair(input_block_array_size, input_block_read_width);
-}
-
-ConvolutionKernelBase::DispatchData ConvolutionKernel_mmad_blocks::SetDefault(const convolution_params& cp,
-                                                                              int autoTuneIndex) const {
-    // Sub-group size used by "convolution_gpu_mmad_blocks" kernel.
-    constexpr size_t sub_group_size = 8;
-
-    DispatchData runInfo = ConvolutionKernelBase::SetDefault(cp);
-
-    auto tuneOptions = GetAutoTuneOptions(cp, autoTuneIndex);
-    runInfo.cldnnStyle.blockWidth = tuneOptions.blockWidth;
-    runInfo.cldnnStyle.blockHeight = tuneOptions.blockHeight;
-    runInfo.cldnnStyle.prefetch = tuneOptions.prefetch;
-
-    auto input_block_dims =
-        get_byxf_af32_req_input_block_dims(runInfo.cldnnStyle.blockWidth,
-                                           runInfo.cldnnStyle.blockHeight,
-                                           cp.filterSize,
-                                           cp.stride,
-                                           cp.dilation,
-                                           sub_group_size,
-                                           runInfo.fp16UnitUsed ? sub_group_size : sub_group_size / 2,
-                                           sub_group_size);
-    runInfo.cldnnStyle.inputBlockArraySize = input_block_dims.first;
-    runInfo.cldnnStyle.inputBlockWidth = input_block_dims.second;
-
-    const auto of_maps = cp.output.Feature().v;
-    const size_t of_threads_per_batch = RoundUp(of_maps, sub_group_size);
-
-    runInfo.efficiency = FORCE_PRIORITY_3;
-
-    runInfo.gws0 = CeilDiv(cp.output.X().v, runInfo.cldnnStyle.blockWidth);
-    runInfo.gws1 = CeilDiv(cp.output.Y().v, runInfo.cldnnStyle.blockHeight);
-    runInfo.gws2 = of_threads_per_batch * cp.output.Batch().v;
-
-    runInfo.lws0 = 1;
-    runInfo.lws1 = 1;
-    runInfo.lws2 = sub_group_size;
-
-    return runInfo;
-}
-
-JitConstants ConvolutionKernel_mmad_blocks::GetJitConstants(const convolution_params& params,
-                                                            const DispatchData& runInfo) const {
-    auto jit = Parent::GetJitConstants(params, runInfo);
-
-    jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", runInfo.lws2));
-    jit.AddConstant(MakeJitConstant("OUTPUT_BLOCK_WIDTH", runInfo.cldnnStyle.blockWidth));
-    jit.AddConstant(MakeJitConstant("OUTPUT_BLOCK_HEIGHT", runInfo.cldnnStyle.blockHeight));
-    jit.AddConstant(MakeJitConstant("IN_BLOCK_ARRAY_SIZE", runInfo.cldnnStyle.inputBlockArraySize));
-    jit.AddConstant(MakeJitConstant("IN_BLOCK_WIDTH", runInfo.cldnnStyle.inputBlockWidth));
-    jit.AddConstant(MakeJitConstant("PREFETCH", runInfo.cldnnStyle.prefetch));
-
-    jit.Merge(MakeTypeJitConstants(GetPackedInputType(params), "PACKED"));
-
-    // pitch for special block format used in this kernel
-    const size_t ifm_32_aligned = Align(params.weights.IFM().v, 32);
-    const size_t filter_ofm_block_pitch =
-        (ifm_32_aligned / 32) * params.weights.X().v * params.weights.Y().v * 4 * 8 * 8;
-    jit.AddConstant(MakeJitConstant("FILTER_OFM_BLOCK_PITCH", filter_ofm_block_pitch));
-
-    if (!params.fused_ops.empty()) {
-        auto input_dt = GetActivationType(params);
-        FusedOpsConfiguration conf_scalar = {"", {"b", "f", "(y+br)", "(x+bc)"}, "res", input_dt, 1 };
-        jit.Merge(MakeFusedOpsJitConstants(params, {conf_scalar}));
-    }
-
-    return jit;
-}
-
-KernelsData ConvolutionKernel_mmad_blocks::GetKernelsData(const Params& params, const optional_params& options) const {
-    KernelsData kd = GetTunedKernelsDataByIndex(params, options);
-    if (!kd.empty())
-        kd[0].estimatedTime = FORCE_PRIORITY_2;
-
-    return kd;
-}
-
-KernelsData ConvolutionKernel_mmad_blocks::GetKernelsDataForAutoTune(const Params& params,
-                                                                     const optional_params& options) const {
-    if (!Validate(params, options)) {
-        return {};
-    }
-
-    KernelsData res = {};
-
-    for (size_t i = 0; i < autoTuneOptions.size(); i++) {
-        KernelsData kd = GetTunedKernelsDataByIndex(params, options, static_cast<int>(i));
-        if (!kd.empty()) {
-            res.emplace_back(kd[0]);
-        }
-    }
-
-    return res;
-}
-
-}  // namespace kernel_selector
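
Two pieces of the deleted mmad_blocks kernel deserve a standalone statement: the required input block per dimension is the usual convolution footprint, (out_block - 1) * stride + (filter - 1) * dilation + 1, and shrink_blocks_to_output_size trims the block so the overshoot past the output size is spread evenly across SIMD launches. A small sketch with a worked case (assumed 56-wide output and a 15x2 starting block):

#include <cassert>
#include <cstddef>

static std::size_t align_up(std::size_t v, std::size_t a) { return (v + a - 1) / a * a; }

// Input elements needed in one dimension to compute out_block outputs without re-reading input.
static std::size_t input_block_req(std::size_t out_block, std::size_t stride,
                                   std::size_t filter, std::size_t dilation) {
    return (out_block - 1) * stride + (filter - 1) * dilation + 1;
}

static void shrink_blocks(std::size_t out_x, std::size_t out_y,
                          std::size_t& block_x, std::size_t& block_y) {
    const std::size_t computed_x = align_up(out_x, block_x);   // elements actually computed
    const std::size_t computed_y = align_up(out_y, block_y);
    block_x -= (computed_x - out_x) / (computed_x / block_x);  // unused / SIMD launches
    block_y -= (computed_y - out_y) / (computed_y / block_y);
}

int main() {
    assert(input_block_req(14, 1, 3, 1) == 16);  // a 14-wide block of a 3x3 s1 conv reads 16 inputs
    std::size_t bx = 15, by = 2;
    shrink_blocks(56, 56, bx, by);
    assert(bx == 14 && by == 2);  // 4 * 15 = 60 overshoots 56 by 4; shave 1 from each of 4 launches
    return 0;
}
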
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_blocks.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_blocks.h
deleted file mode 100644 (file)
index 04287f9..0000000
+++ /dev/null
@@ -1,59 +0,0 @@
-// Copyright (c) 2016 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#pragma once
-
-#include "convolution_kernel_base.h"
-#include <string>
-#include <vector>
-
-namespace kernel_selector {
-
-class ConvolutionKernel_mmad_blocks : public ConvolutionKernelBase {
-public:
-    using Parent = ConvolutionKernelBase;
-    ConvolutionKernel_mmad_blocks();
-    virtual ~ConvolutionKernel_mmad_blocks() {}
-
-    KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
-    KernelsData GetKernelsDataForAutoTune(const Params& params, const optional_params& options) const override;
-    ParamsKey GetSupportedKey() const override;
-
-protected:
-    bool Validate(const Params& p, const optional_params& o) const override;
-    JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
-    DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override;
-    WeightsLayout GetPreferredWeightsLayout(const convolution_params &) const override {
-        return WeightsLayout::os_is_yx_isa8_osv8_isv4;
-    }
-    std::vector<FusedOpType> GetSupportedFusedOps() const override {
-        return { FusedOpType::ELTWISE,
-                 FusedOpType::QUANTIZE,
-                 FusedOpType::SCALE,
-                 FusedOpType::ACTIVATION };
-    }
-
-private:
-    struct AutoTuneOption {
-        size_t blockWidth;
-        size_t blockHeight;
-        size_t prefetch;
-        std::string exeMode;
-    };
-
-    AutoTuneOption GetAutoTuneOptions(const Params& arg, int autoTuneIndex) const;
-    std::vector<AutoTuneOption> autoTuneOptions = {};
-};
-}  // namespace kernel_selector
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_slm_2x14_rep4.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_slm_2x14_rep4.cpp
deleted file mode 100644 (file)
index 6eb2dad..0000000
+++ /dev/null
@@ -1,119 +0,0 @@
-/*
-// Copyright (c) 2016-2020 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-*/
-
-#include "convolution_kernel_mmad_slm_2x14_rep4.h"
-#include "kernel_selector_utils.h"
-
-namespace kernel_selector {
-
-ParamsKey ConvolutionKernel_mmad_slm_2x14_rep4::GetSupportedKey() const {
-    ParamsKey k;
-    k.EnableInputDataType(Datatype::INT8);
-    k.EnableOutputDataType(Datatype::INT8);
-    k.EnableInputWeightsType(WeightsType::INT8);
-    k.EnableInputLayout(DataLayout::fs_bs_yx_bsv4_fsv32);
-    k.EnableOutputLayout(DataLayout::fs_bs_yx_bsv4_fsv32);
-    k.EnableTensorOffset();
-    k.EnableTensorPitches();
-    k.EnableBiasPerFeature();
-    k.EnableBiasPerOutput();
-    k.EnableNonBiasTerm();
-    k.EnableBatching();
-    k.EnableQuantization(QuantizationType::SYMMETRIC);
-    k.DisableTuning();
-    return k;
-}
-
-bool ConvolutionKernel_mmad_slm_2x14_rep4::Validate(const Params& p, const optional_params& o) const {
-    if (!ConvolutionKernelBase::Validate(p, o) || !CovolutionCheckInput(p, o)) {
-        return false;
-    }
-
-    const convolution_params& cp = static_cast<const convolution_params&>(p);
-
-    if (cp.filterSize.x != 3 || cp.filterSize.y != 3)
-        return false;
-
-    if (cp.inputs[0].X().v != 56 || cp.inputs[0].Y().v != 56)
-        return false;
-
-    if (cp.stride.x != 1 || cp.stride.y != 1)
-        return false;
-
-    return true;
-}
-
-ConvolutionKernelBase::DispatchData ConvolutionKernel_mmad_slm_2x14_rep4::SetDefault(const convolution_params& arg,
-                                                                                     int) const {
-    DispatchData runInfo = ConvolutionKernelBase::SetDefault(arg);
-
-    runInfo.efficiency = FORCE_PRIORITY_1;
-
-    const size_t rep_count = 4;
-    const size_t batch_per_wi = 1;
-    const size_t out_block_width = 14;
-    const size_t out_block_height = 2;
-    runInfo.gws0 = arg.output.Feature().v *
-                   (arg.output.Batch().v / (rep_count * batch_per_wi));  // one work item per output feature per 4-batch block
-    runInfo.gws1 = ((arg.inputs[0].X().v / arg.stride.x) + (out_block_width - 1)) / out_block_width;
-    runInfo.gws2 = ((arg.inputs[0].Y().v / arg.stride.y) + (out_block_height - 1)) / out_block_height;
-
-    runInfo.lws0 = 32;  // depth
-    runInfo.lws1 = 1;   // width
-    runInfo.lws2 = 4;   // height
-
-    return runInfo;
-}
-
-JitConstants ConvolutionKernel_mmad_slm_2x14_rep4::GetJitConstants(const convolution_params& params,
-                                                                   const DispatchData& runInfo) const {
-    auto jit = ConvolutionKernelBase::GetJitConstants(params, runInfo);
-
-    jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", 8));
-
-    // pitch for special block format used in this kernel
-    const size_t ifm_32_aligned = Align(params.weights.IFM().v, 32);
-    const size_t filter_ofm_block_pitch =
-        (ifm_32_aligned / 32) * params.weights.X().v * params.weights.Y().v * 4 * 8 * 8;
-    jit.AddConstant(MakeJitConstant("FILTER_OFM_BLOCK_PITCH", filter_ofm_block_pitch));
-
-    const size_t in_x_pitch = 32 * 4;
-    const size_t in_y_pitch = 32 * 4 * params.inputs[0].X().LogicalDimPadded();
-    const size_t in_b_block_pitch = in_y_pitch * params.inputs[0].Y().LogicalDimPadded();
-    const size_t in_f_block_pitch = in_b_block_pitch * ((params.inputs[0].Batch().v + 3) / 4);
-    const size_t in_offset =
-        in_x_pitch * params.inputs[0].X().pad.before + in_y_pitch * params.inputs[0].Y().pad.before;
-
-    jit.AddConstant(MakeJitConstant("IN_X_PITCH", in_x_pitch));
-    jit.AddConstant(MakeJitConstant("IN_Y_PITCH", in_y_pitch));
-    jit.AddConstant(MakeJitConstant("IN_B_BLOCK_PITCH", in_b_block_pitch));
-    jit.AddConstant(MakeJitConstant("IN_F_BLOCK_PITCH", in_f_block_pitch));
-    jit.AddConstant(MakeJitConstant("IN_OFFSET", in_offset));
-
-    jit.AddConstant(MakeJitConstant("OUT_BLOCK_WIDTH", 14));
-    jit.AddConstant(MakeJitConstant("OUT_BLOCK_HEIGHT", 2));
-    jit.AddConstant(MakeJitConstant("LOCAL_SIZE_X", runInfo.lws0));
-    jit.AddConstant(MakeJitConstant("LOCAL_SIZE_Y", runInfo.lws1));
-    jit.AddConstant(MakeJitConstant("LOCAL_SIZE_Z", runInfo.lws2));
-
-    return jit;
-}
-
-KernelsData ConvolutionKernel_mmad_slm_2x14_rep4::GetKernelsData(const Params& params,
-                                                                 const optional_params& options) const {
-    return GetCommonKernelsData(params, options, " -Dcl_intel_subgroups_char");
-}
-}  // namespace kernel_selector
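
The FILTER_OFM_BLOCK_PITCH constant shared by these kernels follows from the os_is_yx_isa8_osv8_isv4 weight layout: assuming each innermost tile packs 4 x 8 x 8 = 256 weights (32 input features by 8 output features), the distance between consecutive 8-OFM blocks is that tile size times the spatial taps times the number of 32-IFM blocks. A compile-time check under those assumptions:

#include <cstddef>

constexpr std::size_t filter_ofm_block_pitch(std::size_t ifm, std::size_t kx, std::size_t ky) {
    return ((ifm + 31) / 32) * kx * ky * 4 * 8 * 8;  // ceil(ifm / 32) IFM blocks x taps x 256
}

static_assert(filter_ofm_block_pitch(64, 3, 3) == 4608,
              "2 IFM blocks * 9 taps * 256 weights per tile");
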
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_slm_2x14_rep4.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_slm_2x14_rep4.h
deleted file mode 100644 (file)
index a8fdfc3..0000000
+++ /dev/null
@@ -1,39 +0,0 @@
-// Copyright (c) 2016 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#pragma once
-
-#include "convolution_kernel_base.h"
-#include <vector>
-
-namespace kernel_selector {
-
-class ConvolutionKernel_mmad_slm_2x14_rep4 : public ConvolutionKernelBase {
-public:
-    ConvolutionKernel_mmad_slm_2x14_rep4() : ConvolutionKernelBase("convolution_gpu_mmad_slm_2x14_rep4") {}
-    virtual ~ConvolutionKernel_mmad_slm_2x14_rep4() {}
-
-    KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
-    ParamsKey GetSupportedKey() const override;
-
-protected:
-    JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
-    DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override;
-    bool Validate(const Params& p, const optional_params& o) const override;
-    WeightsLayout GetPreferredWeightsLayout(const convolution_params &) const override {
-        return WeightsLayout::os_is_yx_isa8_osv8_isv4;
-    }
-};
-}  // namespace kernel_selector
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_slm_7x7_rep4.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_slm_7x7_rep4.cpp
deleted file mode 100644 (file)
index ca4cf6e..0000000
+++ /dev/null
@@ -1,128 +0,0 @@
-/*
-// Copyright (c) 2016-2020 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-*/
-
-#include "convolution_kernel_mmad_slm_7x7_rep4.h"
-#include "kernel_selector_utils.h"
-
-namespace kernel_selector {
-
-ParamsKey ConvolutionKernel_mmad_slm_7x7_rep4::GetSupportedKey() const {
-    ParamsKey k;
-    k.EnableInputDataType(Datatype::INT8);
-    k.EnableOutputDataType(Datatype::INT8);
-    k.EnableInputWeightsType(WeightsType::INT8);
-    k.EnableInputLayout(DataLayout::fs_bs_yx_bsv4_fsv32);
-    k.EnableOutputLayout(DataLayout::fs_bs_yx_bsv4_fsv32);
-    k.EnableTensorOffset();
-    k.EnableTensorPitches();
-    k.EnableBiasPerFeature();
-    k.EnableBiasPerOutput();
-    k.EnableNonBiasTerm();
-    k.EnableBatching();
-    k.EnableQuantization(QuantizationType::SYMMETRIC);
-    k.DisableTuning();
-    return k;
-}
-
-bool ConvolutionKernel_mmad_slm_7x7_rep4::Validate(const Params& p, const optional_params& o) const {
-    if (!ConvolutionKernelBase::Validate(p, o) || !CovolutionCheckInput(p, o)) {
-        return false;
-    }
-
-    const convolution_params& cp = static_cast<const convolution_params&>(p);
-
-    if (cp.filterSize.x != 3 || cp.filterSize.y != 3)
-        return false;
-
-    if (cp.stride.x != 1 || cp.stride.y != 1)
-        return false;
-
-    if (cp.inputs[0].X().v == 7 && cp.inputs[0].Y().v == 7)
-        return true;
-
-    if (cp.inputs[0].X().v == 14 && cp.inputs[0].Y().v == 14)
-        return true;
-
-    return false;
-}
-
-ConvolutionKernelBase::DispatchData ConvolutionKernel_mmad_slm_7x7_rep4::SetDefault(const convolution_params& arg,
-                                                                                    int) const {
-    DispatchData runInfo = ConvolutionKernelBase::SetDefault(arg);
-
-    runInfo.efficiency = FORCE_PRIORITY_1;
-
-    const size_t rep_count = 4;
-    const size_t batch_per_wi = 4;
-    const size_t out_block_width = 7;
-    // const size_t out_block_height = 1;
-    runInfo.gws0 = (arg.output.Feature().v * arg.output.Batch().v) /
-                   (rep_count * batch_per_wi);  // one work item per output feature per 16-batch block
-    runInfo.gws1 = ((arg.inputs[0].X().v / arg.stride.x) + (out_block_width - 1)) / out_block_width;
-    // since this kernel only applies to 7x7 and 14x14 inputs, we need to manually align gws2 up to a multiple of 8
-    runInfo.gws2 = Align(arg.inputs[0].Y().v,
-                         8);  // 8;//((arg.inputs[0].Y().v / arg.stride.y) + (out_block_height - 1)) / out_block_height;
-
-    runInfo.lws0 = 16;  // depth
-    runInfo.lws1 = 1;   // width
-    runInfo.lws2 = 8;   // height
-
-    return runInfo;
-}
-
-JitConstants ConvolutionKernel_mmad_slm_7x7_rep4::GetJitConstants(const convolution_params& params,
-                                                                  const DispatchData& runInfo) const {
-    auto jit = ConvolutionKernelBase::GetJitConstants(params, runInfo);
-
-    jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", 8));
-
-    // pitch for special block format used in this kernel
-    const size_t ifm_32_aligned = Align(params.weights.IFM().v, 32);
-    const size_t filter_ofm_block_pitch =
-        (ifm_32_aligned / 32) * params.weights.X().v * params.weights.Y().v * 4 * 8 * 8;
-    jit.AddConstant(MakeJitConstant("FILTER_OFM_BLOCK_PITCH", filter_ofm_block_pitch));
-
-    const size_t in_x_pitch = 32 * 4;
-    const size_t in_y_pitch = 32 * 4 * params.inputs[0].X().LogicalDimPadded();
-    const size_t in_b_block_pitch = in_y_pitch * params.inputs[0].Y().LogicalDimPadded();
-    const size_t in_f_block_pitch = in_b_block_pitch * ((params.inputs[0].Batch().v + 3) / 4);
-    const size_t in_offset =
-        in_x_pitch * params.inputs[0].X().pad.before + in_y_pitch * params.inputs[0].Y().pad.before;
-
-    const size_t out_y_pitch = 32 * 4 * params.output.X().LogicalDimPadded();
-
-    jit.AddConstant(MakeJitConstant("IN_X_PITCH", in_x_pitch));
-    jit.AddConstant(MakeJitConstant("IN_Y_PITCH", in_y_pitch));
-    jit.AddConstant(MakeJitConstant("IN_B_BLOCK_PITCH", in_b_block_pitch));
-    jit.AddConstant(MakeJitConstant("IN_F_BLOCK_PITCH", in_f_block_pitch));
-    jit.AddConstant(MakeJitConstant("IN_OFFSET", in_offset));
-
-    jit.AddConstant(MakeJitConstant("OUT_X_PITCH", in_x_pitch));
-    jit.AddConstant(MakeJitConstant("OUT_Y_PITCH", out_y_pitch));
-    jit.AddConstant(MakeJitConstant("OUT_BLOCK_WIDTH", 7));
-    jit.AddConstant(MakeJitConstant("OUT_BLOCK_HEIGHT", 1));
-    jit.AddConstant(MakeJitConstant("LOCAL_SIZE_X", runInfo.lws0));
-    jit.AddConstant(MakeJitConstant("LOCAL_SIZE_Y", runInfo.lws1));
-    jit.AddConstant(MakeJitConstant("LOCAL_SIZE_Z", 7));  // must be 7 since we process 7 in Y per workgroup
-
-    return jit;
-}
-
-KernelsData ConvolutionKernel_mmad_slm_7x7_rep4::GetKernelsData(const Params& params,
-                                                                const optional_params& options) const {
-    return GetCommonKernelsData(params, options, " -Dcl_intel_subgroups_char");
-}
-}  // namespace kernel_selector
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_slm_7x7_rep4.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_slm_7x7_rep4.h
deleted file mode 100644 (file)
index 5f766c7..0000000
+++ /dev/null
@@ -1,39 +0,0 @@
-// Copyright (c) 2016 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#pragma once
-
-#include "convolution_kernel_base.h"
-#include <vector>
-
-namespace kernel_selector {
-
-class ConvolutionKernel_mmad_slm_7x7_rep4 : public ConvolutionKernelBase {
-public:
-    ConvolutionKernel_mmad_slm_7x7_rep4() : ConvolutionKernelBase("convolution_gpu_mmad_slm_7x7_rep4") {}
-    virtual ~ConvolutionKernel_mmad_slm_7x7_rep4() {}
-
-    KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
-    ParamsKey GetSupportedKey() const override;
-
-protected:
-    JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
-    DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override;
-    bool Validate(const Params& p, const optional_params& o) const override;
-    WeightsLayout GetPreferredWeightsLayout(const convolution_params &) const override {
-        return WeightsLayout::os_is_yx_isa8_osv8_isv4;
-    }
-};
-}  // namespace kernel_selector
\ No newline at end of file
index e6a954d..e002c9f 100644 (file)
@@ -33,9 +33,6 @@
 #include "convolution_kernel_bfyx_1x1_gemm_buf.h"
 #include "convolution_kernel_winograd_2x3_s1_fused.h"
 #include "convolution_kernel_winograd_6x3_s1_fused.h"
-#include "convolution_kernel_mmad.h"
-#include "convolution_kernel_mmad_blocks.h"
-#include "convolution_kernel_imad_byxf_af32_depthwise.h"
 #include "convolution_kernel_bfyx_depthwise_weights_lwg.h"
 #include "convolution_kernel_imad.h"
 #include "convolution_kernel_fs_byx_fsv32.h"
@@ -51,7 +48,6 @@
 #include "deformable_convolution_kernel_bfyx_interp.h"
 #include "convolution_kernel_b_fs_zyx_fsv16_fp32.h"
 #include "convolution_kernel_b_fs_zyx_fsv16_fp16.h"
-#include "convolution_kernel_imad_byxf_af32_1x1.h"
 #include "convolution_kernel_imad_b_fs_yx_fsv4_1x1.h"
 #include "convolution_kernel_imad_b_fs_yx_fsv4_dw.hpp"
 #include "convolution_kernel_mmad_bfyx_to_b_fs_yx_fsv4.h"
@@ -119,12 +115,6 @@ convolution_kernel_selector::convolution_kernel_selector() {
     Attach<ConvolutionKernel_Winograd_2x3_s1_fused>();
     Attach<ConvolutionKernel_Winograd_6x3_s1_fused>();
 
-    // byxf_af32 int8
-    Attach<ConvolutionKernel_mmad>();
-    Attach<ConvolutionKernel_mmad_blocks>();
-    Attach<ConvolutionKernel_imad_byxf_af32_1x1>();
-    Attach<ConvolutionKernel_imad_byxf_af32_depthiwise>();
-
     // b_fs_yx_fsv4 kernels
     Attach<ConvolutionKernel_imad>();
     Attach<ConvolutionKernel_imad_b_fs_yx_fsv4_1x1>();
index 8eae48e..e7e49ab 100644 (file)
@@ -55,9 +55,6 @@ ParamsKey DeconvolutionKernel_imad_along_f_tile_bfx::GetSupportedKey() const {
     k.EnableInputLayout(DataLayout::bs_fs_zyx_bsv16_fsv16);
     k.EnableOutputLayout(DataLayout::bs_fs_zyx_bsv16_fsv16);
 
-    k.EnableInputLayout(DataLayout::byxf_af32);
-    k.EnableOutputLayout(DataLayout::byxf_af32);
-
     k.EnableDifferentTypes();
     k.EnableDifferentInputWeightsTypes();
     k.EnableBatching();
@@ -179,9 +176,6 @@ JitConstants DeconvolutionKernel_imad_along_f_tile_bfx::GetJitConstants(const de
             input_tile_ifm_pitch = zyx_pitch_factor * 16 * 16;
         }
         input_in_tile_batch_pitch = 16;
-    } else if (in_layout == DataLayout::byxf_af32) {
-        input_tile_ifm_pitch = tile_ifm;
-        input_in_tile_batch_pitch = zyx_pitch_factor * Align(in.Feature().LogicalDimPadded(), 32);
     }
 
     jit.AddConstant(MakeJitConstant("INPUT_VALID_TILE_IFM_PITCH", input_tile_ifm_pitch != 0));
@@ -242,8 +236,7 @@ size_t DeconvolutionKernel_imad_along_f_tile_bfx::GetTileIFM(const deconvolution
         fsv = 16;
     }
     if (params.inputs[0].GetLayout() == DataLayout::b_fs_yx_fsv32
-        || params.inputs[0].GetLayout() == DataLayout::b_fs_zyx_fsv32
-        || params.inputs[0].GetLayout() == DataLayout::byxf_af32) {
+        || params.inputs[0].GetLayout() == DataLayout::b_fs_zyx_fsv32) {
         fsv = 32;
     }
 
index 840d83a..da9b46f 100644 (file)
@@ -41,7 +41,6 @@ ParamsKey DeconvolutionKernel_imad_ref::GetSupportedKey() const {
     k.EnableInputLayout(DataLayout::bs_fs_zyx_bsv16_fsv16);
     k.EnableInputLayout(DataLayout::b_fs_yx_fsv32);
     k.EnableInputLayout(DataLayout::b_fs_zyx_fsv32);
-    k.EnableInputLayout(DataLayout::byxf_af32);
     k.EnableAllOutputLayout();
 
     k.EnableDifferentTypes();
index 52a3790..63021ed 100644 (file)
@@ -43,14 +43,12 @@ bool EltwiseKernel_vload8::Validate(const Params& params, const optional_params&
     const auto& ewParams = static_cast<const eltwise_params&>(params);
 
     for (size_t i = 0; i < ewParams.inputs.size(); i++) {
-        if (ewParams.inputs[i].GetLayout() == DataLayout::fs_bs_yx_bsv4_fsv32 ||
-            (ewParams.inputs[i].GetLayout() == DataLayout::b_fs_yx_fsv16 && ewParams.inputs[i].Feature().v % 16 != 0) ||
+        if ((ewParams.inputs[i].GetLayout() == DataLayout::b_fs_yx_fsv16 && ewParams.inputs[i].Feature().v % 16 != 0) ||
             (ewParams.inputs[i].GetLayout() == DataLayout::b_fs_zyx_fsv16 && ewParams.inputs[i].Feature().v % 16 != 0) ||
             ewParams.inputs[i].GetLayout() == DataLayout::fs_b_yx_fsv32)
             return false;
     }
-    if (ewParams.output.GetLayout() == DataLayout::fs_bs_yx_bsv4_fsv32 ||
-       (ewParams.output.GetLayout() == DataLayout::b_fs_yx_fsv16 && ewParams.output.Feature().v % 16 != 0) ||
+    if ((ewParams.output.GetLayout() == DataLayout::b_fs_yx_fsv16 && ewParams.output.Feature().v % 16 != 0) ||
        (ewParams.output.GetLayout() == DataLayout::b_fs_zyx_fsv16 && ewParams.output.Feature().v % 16 != 0) ||
         ewParams.output.GetLayout() == DataLayout::fs_b_yx_fsv32)
         return false;
index 441a3a4..ceb6dc1 100644 (file)
@@ -37,7 +37,6 @@ ParamsKey FullyConnectedKernelMMAD::GetSupportedKey() const {
     k.EnableDifferentTypes();
 
     k.EnableInputLayout(DataLayout::bfyx);
-    k.EnableInputLayout(DataLayout::byxf_af32);
     k.EnableInputLayout(DataLayout::b_fs_yx_fsv32);
     k.EnableInputLayout(DataLayout::b_fs_zyx_fsv32);
     k.EnableOutputLayout(DataLayout::bf);
@@ -129,7 +128,7 @@ JitConstants FullyConnectedKernelMMAD::GetJitConstants(const fully_connected_par
     size_t input_y_pitch = input.Y().pitch;
     size_t input_z_pitch = input.Z().pitch;
 
-    if (input.GetLayout() == DataLayout::byxf_af32 || input.GetLayout() == DataLayout::bfyx) {
+    if (input.GetLayout() == DataLayout::bfyx) {
         jit.AddConstant(MakeJitConstant("MMAD_INPUT_FBLOCK_PITCH", 32));
     } else if (input.GetLayout() == DataLayout::b_fs_yx_fsv32 || input.GetLayout() == DataLayout::b_fs_zyx_fsv32) {
         input_x_pitch = 32;
index c5fa8ca..d7821a6 100644 (file)
@@ -136,7 +136,7 @@ PoolingKernelBase::DispatchData PoolingKernelBase::SetDefault(const pooling_para
     kd.fp16UnitUsed = params.inputs[0].GetDType() == Datatype::F16;
 
     if (output.GetLayout() == DataLayout::bfyx || output.GetLayout() == DataLayout::b_fs_yx_fsv4 ||
-        output.GetLayout() == DataLayout::byxf || output.GetLayout() == DataLayout::byxf_af32 ||
+        output.GetLayout() == DataLayout::byxf ||
         output.GetLayout() == DataLayout::bfzyx || output.GetLayout() == DataLayout::b_fs_zyx_fsv16 ||
         output.GetLayout() == DataLayout::bs_fs_zyx_bsv16_fsv16) {
         // Determine global work sizes.
index 6060236..6375f73 100644 (file)
@@ -27,7 +27,6 @@ ParamsKey PoolingKerneGPU_b_fs_yx_fsv4::GetSupportedKey() const {
     k.EnableInputLayout(DataLayout::b_fs_yx_fsv4);
     k.EnableOutputLayout(DataLayout::b_fs_yx_fsv4);
     k.EnableOutputLayout(DataLayout::bfyx);
-    k.EnableOutputLayout(DataLayout::byxf_af32);
     k.EnableTensorOffset();
     k.EnableTensorPitches();
     k.EnableBatching();
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_byxf_af32.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_byxf_af32.cpp
deleted file mode 100644 (file)
index 70d1d65..0000000
+++ /dev/null
@@ -1,89 +0,0 @@
-// Copyright (c) 2020 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "pooling_kernel_gpu_byxf_af32.h"
-
-namespace kernel_selector {
-ParamsKey PoolingKerneGPU_byxf_af32::GetSupportedKey() const {
-    ParamsKey k;
-    k.EnableInputDataType(Datatype::INT8);
-    k.EnableInputDataType(Datatype::UINT8);
-    k.EnableOutputDataType(Datatype::INT8);
-    k.EnableOutputDataType(Datatype::UINT8);
-    k.EnableOutputDataType(Datatype::F16);
-    k.EnableOutputDataType(Datatype::F32);
-    k.EnableInputLayout(DataLayout::byxf_af32);
-    k.EnableOutputLayout(DataLayout::byxf_af32);
-    k.EnableTensorOffset();
-    k.EnableTensorPitches();
-    k.EnableBatching();
-    k.EnablePoolType(PoolType::MAX);
-    k.EnablePoolType(PoolType::AVG);
-    k.EnablePoolRemainder(PoolRemainder::FLOOR);
-    k.EnablePoolRemainder(PoolRemainder::CEIL);
-    k.EnablePoolKernelDividerMode(KernelDividerMode::FIXED);
-    k.EnablePoolKernelDividerMode(KernelDividerMode::DYNAMIC);
-    k.EnablePoolKernelDividerMode(KernelDividerMode::DYNAMIC_WITH_PADDING);
-    k.EnableDifferentTypes();
-    return k;
-}
-
-PoolingKernelBase::DispatchData PoolingKerneGPU_byxf_af32::SetDefault(const pooling_params& params) const {
-    constexpr int simdSize = 8;
-
-    DispatchData runInfo = PoolingKernelBase::SetDefault(params);
-
-    runInfo.gws0 = params.output.X().v;
-    runInfo.gws1 = params.output.Y().v;
-    // the data is in byxf_af32 format: processing 4 features per work item means each SIMD-8 subgroup
-    // covers 32 features, so divide the rounded-up feature count by 4
-    runInfo.gws2 = (RoundUp(params.output.Feature().v, 32) * params.output.Batch().v) / 4;
-
-    runInfo.lws0 = 1;
-    runInfo.lws1 = 1;
-    runInfo.lws2 = simdSize;
-
-    return runInfo;
-}
-
-JitConstants PoolingKerneGPU_byxf_af32::GetJitConstants(const pooling_params& params, DispatchData kd) const {
-    JitConstants jit = PoolingKernelBase::GetJitConstants(params, kd);
-
-    jit.AddConstant(MakeJitConstant("AS_INPUT_TYPE(val)", "as_" + toCLType(params.inputs[0].GetDType()) + "4(val)"));
-    jit.Merge(MakeTypeJitConstants(GetActivationType(params), "ACTIVATION"));
-    jit.Merge(MakeTypeJitConstants(GetAccumulatorType(params), "ACCUMULATOR"));
-
-    if (!params.fused_ops.empty()) {
-        auto input_dt = GetActivationType(params);
-        FusedOpsConfiguration conf = {"",
-                                     {"b", "f", "y", "x"},
-                                     "fused_pool_result",
-                                     input_dt,
-                                     4,
-                                     LoadType::LT_UNALIGNED,
-                                     BoundaryCheck::ENABLED,
-                                     IndexType::TENSOR_COORD,
-                                     Tensor::DataChannelName::FEATURE};
-        jit.Merge(MakeFusedOpsJitConstants(params, { conf }));
-    }
-
-    return jit;
-}
-
-
-KernelsData PoolingKerneGPU_byxf_af32::GetKernelsData(const Params& params, const optional_params& options) const {
-    return GetCommonKernelsData(params, options, FORCE_PRIORITY_1);
-}
-}  // namespace kernel_selector
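
The gws2 expression above folds features and batches together; a worked check under an assumed 64-feature, batch-2 output:

#include <cassert>
#include <cstddef>

static std::size_t round_up(std::size_t v, std::size_t m) { return (v + m - 1) / m * m; }

int main() {
    const std::size_t features = 64, batch = 2, features_per_wi = 4, simd = 8;
    const std::size_t gws2 = round_up(features, 32) * batch / features_per_wi;  // 64 * 2 / 4 = 32
    assert(gws2 == 32);
    assert(32 / features_per_wi == simd);  // one SIMD-8 subgroup covers a 32-feature af32 block
    return 0;
}
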
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_byxf_af32.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_byxf_af32.h
deleted file mode 100644 (file)
index 1ffc94b..0000000
+++ /dev/null
@@ -1,37 +0,0 @@
-// Copyright (c) 2020 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "pooling_kernel_base.h"
-#include <vector>
-
-namespace kernel_selector {
-class PoolingKerneGPU_byxf_af32 : public PoolingKernelBase {
-public:
-    PoolingKerneGPU_byxf_af32() : PoolingKernelBase("pooling_gpu_byxf_af32") {}
-    virtual ~PoolingKerneGPU_byxf_af32() {}
-
-    KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
-    ParamsKey GetSupportedKey() const override;
-    JitConstants GetJitConstants(const pooling_params& params, DispatchData kd) const override;
-    DispatchData SetDefault(const pooling_params& params) const override;
-    std::vector<FusedOpType> GetSupportedFusedOps() const override {
-        return { FusedOpType::ELTWISE,
-                 FusedOpType::QUANTIZE,
-                 FusedOpType::SCALE,
-                 FusedOpType::ACTIVATION };
-    }
-};
-}  // namespace kernel_selector
index beedfe9..9df0eba 100644 (file)
@@ -29,7 +29,6 @@ ParamsKey PoolingKernelGPUInt8Ref::GetSupportedKey() const {
     k.EnableInputLayout(DataLayout::bfzyx);
     k.EnableInputLayout(DataLayout::yxfb);
     k.EnableInputLayout(DataLayout::byxf);
-    k.EnableInputLayout(DataLayout::byxf_af32);
     k.EnableInputLayout(DataLayout::b_fs_yx_fsv4);
     k.EnableInputLayout(DataLayout::b_fs_yx_fsv32);
     k.EnableInputLayout(DataLayout::b_fs_zyx_fsv32);
@@ -38,7 +37,6 @@ ParamsKey PoolingKernelGPUInt8Ref::GetSupportedKey() const {
     k.EnableOutputLayout(DataLayout::bfzyx);
     k.EnableOutputLayout(DataLayout::yxfb);
     k.EnableOutputLayout(DataLayout::byxf);
-    k.EnableOutputLayout(DataLayout::byxf_af32);
     k.EnableOutputLayout(DataLayout::b_fs_yx_fsv4);
     k.EnableOutputLayout(DataLayout::b_fs_yx_fsv32);
     k.EnableOutputLayout(DataLayout::b_fs_zyx_fsv32);
index 27305ac..263f63a 100644 (file)
@@ -18,7 +18,6 @@
 #include "pooling_kernel_gpu_byxf_opt.h"
 #include "pooling_kernel_gpu_bfyx_block_opt.h"
 #include "pooling_kernel_gpu_byxf_padding_opt.h"
-#include "pooling_kernel_gpu_byxf_af32.h"
 #include "pooling_kernel_gpu_int8_ref.h"
 #include "pooling_kernel_gpu_b_fs_yx_fsv4.h"
 #include "pooling_kernel_gpu_fs_b_yx_fsv32.h"
@@ -35,7 +34,6 @@ pooling_kernel_selector::pooling_kernel_selector() {
     Attach<PoolingKernelGPUBfyxBlockOpt>();
     Attach<PoolingKernelGPUByxfPaddingOpt>();
     Attach<PoolingKernelGPUInt8Ref>();
-    Attach<PoolingKerneGPU_byxf_af32>();
     Attach<PoolingKerneGPU_b_fs_yx_fsv4>();
     Attach<PoolingKerneGPU_fs_b_yx_fsv32>();
     Attach<PoolingKernel_b_fs_yx_fsv16>();
index 51a9600..6b52142 100644 (file)
@@ -40,14 +40,12 @@ ParamsKey QuantizeKernelScaleShift::GetSupportedKey() const {
     k.EnableInputLayout(DataLayout::b_fs_yx_fsv4);
     k.EnableInputLayout(DataLayout::b_fs_yx_fsv32);
     k.EnableInputLayout(DataLayout::b_fs_zyx_fsv32);
-    k.EnableInputLayout(DataLayout::byxf_af32);
     k.EnableInputLayout(DataLayout::bs_fs_yx_bsv16_fsv16);
     k.EnableOutputLayout(DataLayout::bfyx);
     k.EnableOutputLayout(DataLayout::yxfb);
     k.EnableOutputLayout(DataLayout::bfzyx);
     k.EnableOutputLayout(DataLayout::b_fs_yx_fsv16);
     k.EnableOutputLayout(DataLayout::b_fs_zyx_fsv16);
-    k.EnableOutputLayout(DataLayout::byxf_af32);
     k.EnableOutputLayout(DataLayout::b_fs_yx_fsv4);
     k.EnableOutputLayout(DataLayout::b_fs_yx_fsv32);
     k.EnableOutputLayout(DataLayout::b_fs_zyx_fsv32);
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_byxf_f32_to_byx8_f4_i8.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_byxf_f32_to_byx8_f4_i8.cpp
deleted file mode 100644 (file)
index b97e7e2..0000000
+++ /dev/null
@@ -1,87 +0,0 @@
-// Copyright (c) 2016 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "reorder_kernel_byxf_f32_to_byx8_f4_i8.h"
-#include "kernel_selector_utils.h"
-
-namespace kernel_selector {
-ParamsKey reorder_kernel_byxf_f32_to_byx8_f4_i8::GetSupportedKey() const {
-    ParamsKey k;
-    k.EnableInputDataType(Datatype::F32);
-    k.EnableOutputDataType(Datatype::INT8);
-    k.EnableDifferentTypes();
-    k.EnableInputLayout(DataLayout::byxf);
-    k.EnableOutputLayout(DataLayout::byx8_f4);
-    k.EnableTensorOffset();
-    k.EnableTensorPitches();
-    k.EnableBatching();
-    return k;
-}
-
-bool reorder_kernel_byxf_f32_to_byx8_f4_i8::Validate(const Params& p, const optional_params& o) const {
-    if (!ReorderKernelBase::Validate(p, o)) {
-        return false;
-    }
-
-    const reorder_params& params = static_cast<const reorder_params&>(p);
-
-    if (params.output.X().v % 16 != 0)
-        return false;
-
-    if (params.inputs[0].Feature().v != 3)
-        return false;
-
-    if (params.mode == MeanSubtractMode::IN_BUFFER && params.mean.LogicalSize() != params.inputs[0].Feature().v)
-        return false;
-
-    return true;
-}
-
-size_t static get_wg_batch_size(const reorder_params& params) {
-    if (params.inputs[0].Batch().v % 16 == 0)
-        return 16;
-    return 1;
-}
-
-reorder_kernel_byxf_f32_to_byx8_f4_i8::DispatchData reorder_kernel_byxf_f32_to_byx8_f4_i8::SetDefault(
-    const reorder_params& params) const {
-    DispatchData kd;
-
-    const auto& input = params.inputs[0];
-
-    kd.gws0 = input.X().v;
-    kd.gws1 = input.Y().v;
-    kd.gws2 = input.Batch().v;
-
-    kd.lws0 = 16;
-    kd.lws1 = 1;
-    kd.lws2 = get_wg_batch_size(params);
-
-    return kd;
-}
-
-JitConstants reorder_kernel_byxf_f32_to_byx8_f4_i8::GetJitConstants(const reorder_params& params) const {
-    auto jit = ReorderKernelBase::GetJitConstants(params);
-    jit.Merge(GetTensorFriendlyWorkGroupsJit(params.inputs[0]));
-    jit.AddConstant(MakeJitConstant("WG_BATCH_SIZE", get_wg_batch_size(params)));
-    return jit;
-}
-
-KernelsData reorder_kernel_byxf_f32_to_byx8_f4_i8::GetKernelsData(const Params& params,
-                                                                  const optional_params& options) const {
-    const reorder_params& orgParams = static_cast<const reorder_params&>(params);
-    return GetCommonKernelsData(orgParams, options, FORCE_PRIORITY_5);
-}
-}  // namespace kernel_selector
\ No newline at end of file
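The dispatch logic of the deleted reorder kernel is worth noting before it disappears: one work-item per input pixel, a fixed local size of 16 in x to satisfy the kernel's 16-wide sub-group requirement, and up to 16 batches folded into one work-group when the batch count allows. A host-side C sketch of that sizing rule (function name illustrative, mirroring SetDefault and get_wg_batch_size above):

#include <stddef.h>

/* One work-item per input pixel; lws0 = 16 matches the kernel's
 * intel_reqd_sub_group_size(16); batches are grouped by 16 when possible. */
static void reorder_dispatch(size_t gws[3], size_t lws[3],
                             size_t size_x, size_t size_y, size_t batch) {
    gws[0] = size_x;
    gws[1] = size_y;
    gws[2] = batch;
    lws[0] = 16;
    lws[1] = 1;
    lws[2] = (batch % 16 == 0) ? 16 : 1;  /* get_wg_batch_size logic */
}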
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_byxf_f32_to_byx8_f4_i8.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_byxf_f32_to_byx8_f4_i8.h
deleted file mode 100644 (file)
index 032ff8f..0000000
+++ /dev/null
@@ -1,32 +0,0 @@
-// Copyright (c) 2016 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#pragma once
-
-#include "reorder_kernel_base.h"
-
-namespace kernel_selector {
-class reorder_kernel_byxf_f32_to_byx8_f4_i8 : public ReorderKernelBase {
-public:
-    reorder_kernel_byxf_f32_to_byx8_f4_i8() : ReorderKernelBase("reorder_data_byxf_f32_to_byx8_f4_i8") {}
-    virtual ~reorder_kernel_byxf_f32_to_byx8_f4_i8() {}
-
-    bool Validate(const Params& p, const optional_params& o) const override;
-    DispatchData SetDefault(const reorder_params& params) const override;
-    KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
-    ParamsKey GetSupportedKey() const override;
-    JitConstants GetJitConstants(const reorder_params& params) const override;
-};
-}  // namespace kernel_selector
index 376f591..9bdc21c 100644 (file)
@@ -33,7 +33,6 @@ ParamsKey ReorderKernelFastBatch1::GetSupportedKey() const {
     k.EnableInputLayout(DataLayout::bfwzyx);
     k.EnableInputLayout(DataLayout::bs_f_bsv8__af8);
     k.EnableInputLayout(DataLayout::bs_f_bsv16__af8);
-    k.EnableInputLayout(DataLayout::bf8_xy16);
     k.EnableInputLayout(DataLayout::b_fs_yx_fsv16);
     k.EnableInputLayout(DataLayout::b_fs_zyx_fsv16);
 
@@ -45,7 +44,6 @@ ParamsKey ReorderKernelFastBatch1::GetSupportedKey() const {
     k.EnableOutputLayout(DataLayout::bfwzyx);
     k.EnableOutputLayout(DataLayout::bs_f_bsv8__af8);
     k.EnableOutputLayout(DataLayout::bs_f_bsv16__af8);
-    k.EnableOutputLayout(DataLayout::bf8_xy16);
     k.EnableOutputLayout(DataLayout::b_fs_yx_fsv16);
     k.EnableOutputLayout(DataLayout::b_fs_zyx_fsv16);
 
index fd09861..c3ffb20 100644 (file)
@@ -19,7 +19,6 @@
 #include "reorder_from_winograd_2x3_kernel.h"
 #include "reorder_to_winograd_2x3_kernel.h"
 #include "reorder_kernel_to_yxfb_batched.h"
-#include "reorder_kernel_byxf_f32_to_byx8_f4_i8.h"
 #include "reorder_kernel_binary.h"
 #include "reorder_biplanar_nv12.h"
 #include "reorder_kernel_fs_b_yx_fsv32_to_bfyx.h"
@@ -33,7 +32,6 @@ reorder_kernel_selector::reorder_kernel_selector() {
     Attach<ReorderFromWinograd2x3Kernel>();
     Attach<ReorderToWinograd2x3Kernel>();
     Attach<ReorderKernel_to_yxfb_batched>();
-    Attach<reorder_kernel_byxf_f32_to_byx8_f4_i8>();
     Attach<reorder_biplanar_nv12>();
     Attach<ReorderKernel_fs_b_yx_fsv32_to_bfyx>();
 }
index 2ee687f..d7d7484 100644 (file)
@@ -61,8 +61,6 @@ static size_t packing_factor(const resample_params& params) {
             return 16;
         case DataLayout::b_fs_yx_fsv4:
             return 4;
-        case DataLayout::byxf_af32:
-            return 16;
         default:
             break;
         }
index 53ac874..5e7f52a 100644 (file)
@@ -17,7 +17,7 @@
 
 #if FP16_UNIT_USED
     #define ALIGNED_BLOCK_READ8(ptr, byte_offset) as_half8(intel_sub_group_block_read_us8((const __global ushort*)(ptr) + (byte_offset)))
-    
+
     #define MULTIPLY_BLOCKS_16x8_8x16(_result, _blockA, _blockB) \
     { \
         const half16 acol0 = TRANSPOSE_BLOCK_16_FP16_HALF_TYPE( _blockA.s0 ); \
@@ -64,9 +64,9 @@
 
 __attribute__((intel_reqd_sub_group_size(16)))
 KERNEL(convolution_bfyx_1x1)(
-    __global INPUT0_TYPE* input, 
-    __global OUTPUT_TYPE* output, 
-    __global FILTER_TYPE* weights, 
+    __global INPUT0_TYPE* input,
+    __global OUTPUT_TYPE* output,
+    __global FILTER_TYPE* weights,
 #if BIAS_TERM
     __global BIAS_TYPE* biases,
 #endif
@@ -107,10 +107,10 @@ KERNEL(convolution_bfyx_1x1)(
     {
         MAKE_VECTOR_TYPE(UNIT_TYPE, 8) blockA00;
         MAKE_VECTOR_TYPE(UNIT_TYPE, 8) blockB00;
-    
+
         uint input_idx = input_offset + k * 8 * xy_block_num * 16;
         uint filter_idx = filter_offset + k * 8 * 16;
-    
+
         blockA00 = ALIGNED_BLOCK_READ8(input, input_idx);
         blockB00 = ALIGNED_BLOCK_READ8(weights, filter_idx);
 
@@ -124,11 +124,7 @@ KERNEL(convolution_bfyx_1x1)(
 
     for(uint i = 0; i < 16; i++)
     {
-    #if OUTPUT_LAYOUT_BF8_XY16
-        const uint dst_index = GET_DATA_BF8_XY16_INDEX(OUTPUT, b, group_f+i, y, x) + out_split_offset;
-    #else
         const uint dst_index = GET_DATA_INDEX(OUTPUT, b, group_f+i, y, x) + out_split_offset;
-    #endif
     #if LEFTOVERS
         if(group_f+i < OUTPUT_FEATURE_NUM)
     #endif
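With the BF8_XY16 branch gone, dst_index above always comes from the generic GET_DATA_INDEX macro. As a rough scalar model of what pitch-based indexing computes — the struct and field names here are illustrative, not the actual clDNN macro expansion:

#include <stddef.h>

/* Hypothetical scalar model of pitch-based bfyx indexing; the real
 * GET_DATA_INDEX macro expands to per-tensor pitch/offset constants. */
typedef struct {
    size_t offset;                              /* element offset of data start */
    size_t b_pitch, f_pitch, y_pitch, x_pitch;  /* strides in elements */
} tensor_desc;

static size_t get_data_index(const tensor_desc *t,
                             size_t b, size_t f, size_t y, size_t x) {
    return t->offset + b * t->b_pitch + f * t->f_pitch
                     + y * t->y_pitch + x * t->x_pitch;
}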
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_byxf_af32_depthwise.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_byxf_af32_depthwise.cl
deleted file mode 100644 (file)
index 3c94e5b..0000000
+++ /dev/null
@@ -1,214 +0,0 @@
-// Copyright (c) 2016-2020 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "include/common.cl"
-
-#include "include/data_types.cl"
-#include "include/fetch.cl"
-#include "include/mmad.cl"
-
-#if STRIDE_SIZE_Y == DILATION_SIZE_Y
-    #define BLOCK_Y_SIZE (FILTER_SIZE_Y + (SPLIT_Y - 1))
-    #define LOAD_Y_WITH_STRIDES
-#else
-    #define BLOCK_Y_SIZE ((SPLIT_Y - 1) * STRIDE_SIZE_Y + (FILTER_SIZE_Y - 1) * (DILATION_SIZE_Y - 1) + FILTER_SIZE_Y)
-#endif
-
-#if STRIDE_SIZE_X == DILATION_SIZE_X
-    #define FILTER_SIZE_X_PRELOAD FILTER_SIZE_X
-    #define LOAD_X_WITH_STRIDES
-#else
-    #define FILTER_SIZE_X_PRELOAD FILTER_SIZE_X
-    #define LOAD_X_WITH_STRIDES
-    #define DONT_USE_X_SHIFTS
-#endif
-
-__attribute__((intel_reqd_sub_group_size(SIMD_SIZE)))
-KERNEL(convolution_gpu_byxf_af32_depthwise)(
-    __global INPUT0_TYPE* input,
-    __global OUTPUT_TYPE* output,
-    __global FILTER_TYPE* weights,
-#if BIAS_TERM
-    __global BIAS_TYPE* biases,
-#endif
-#if HAS_FUSED_OPS_DECLS
-    FUSED_OPS_DECLS,
-#endif
-    uint split_idx)
-{
-    const uint x = get_global_id(1) * OUT_BLOCK_WIDTH;
-    const uint y = get_global_id(2) * SPLIT_Y;
-#if OUTPUT_BATCH_NUM == 1
-    const uint of = get_global_id(0);
-    const uint b = 0;
-#else
-    const uint of = (uint)get_global_id(0) % ALIGNED_OFM;
-    const uint b = (uint)get_global_id(0) / ALIGNED_OFM;
-#endif
-    const uint g = of;
-
-    if (of >= OUTPUT_FEATURE_NUM)
-        return;
-
-    int dotProd[SPLIT_Y] = {0};
-    OUTPUT_TYPE out[SPLIT_Y];
-    const int input_x = x * STRIDE_SIZE_X - PADDING_SIZE_X;
-    const int input_y = y * STRIDE_SIZE_Y - PADDING_SIZE_Y;
-
-    const uint filter_offset = g*FILTER_GROUPS_PITCH;
-    const uint input_offset = b*INPUT0_BATCH_PITCH + INPUT0_OFFSET + g*FILTER_IFM_NUM;
-
-    // read all weights
-    FILTER_TYPE w[FILTER_IFM_PITCH];
-    __attribute__((opencl_unroll_hint(FILTER_SIZE_Y)))
-    for (int j = 0; j < FILTER_SIZE_Y; j++) {
-        __attribute__((opencl_unroll_hint(FILTER_SIZE_X)))
-        for (int i = 0; i < FILTER_SIZE_X; i++) {
-            w[j * FILTER_SIZE_X + i] = weights[filter_offset + j * FILTER_Y_PITCH + i * FILTER_X_PITCH];
-        }
-    }
-
-    // initial input read
-    INPUT0_TYPE in[FILTER_SIZE_X_PRELOAD * BLOCK_Y_SIZE];
-    __attribute__((opencl_unroll_hint(BLOCK_Y_SIZE)))
-    for (int i = 0; i < BLOCK_Y_SIZE; i++) {
-        __attribute__((opencl_unroll_hint(FILTER_SIZE_X_PRELOAD)))
-        for (int j = 0; j < FILTER_SIZE_X_PRELOAD; j++) {
-#ifdef LOAD_Y_WITH_STRIDES
-            int input_offset_y = input_y + i * DILATION_SIZE_Y;
-#else
-            int input_offset_y = input_y + i;
-#endif
-#ifdef LOAD_X_WITH_STRIDES
-            int input_offset_x = input_x + j * DILATION_SIZE_X;
-#else
-            int input_offset_x = input_x + j;
-#endif
-            uint input_idx = input_offset + (uint)input_offset_x * INPUT0_X_PITCH + (uint)input_offset_y * INPUT0_Y_PITCH;
-            in[i * FILTER_SIZE_X_PRELOAD + j] = input[input_idx];
-        }
-    }
-
-#if HAS_FUSED_OPS && FUSED_OPS_CAN_USE_PRELOAD
-    FUSED_OPS_PRELOAD;
-#endif
-
-    for (int l = 0; l < OUT_BLOCK_WIDTH; l++) {
-        //calculate dotproduct
-        __attribute__((opencl_unroll_hint(SPLIT_Y)))
-        for (int i = 0; i < SPLIT_Y; i++) {
-            __attribute__((opencl_unroll_hint(FILTER_IFM_PITCH)))
-            for (int j = 0; j < FILTER_IFM_PITCH; j++) {
-#if defined(LOAD_X_WITH_STRIDES) && defined(LOAD_Y_WITH_STRIDES)
-                const uint start_pos_y = i * FILTER_SIZE_X_PRELOAD;
-                dotProd[i] += (int)in[start_pos_y + j] * (int)w[j];
-#elif defined(LOAD_X_WITH_STRIDES) && !defined(LOAD_Y_WITH_STRIDES)
-                const uint start_pos_y = i * STRIDE_SIZE_Y * FILTER_SIZE_X_PRELOAD;
-                const uint pos_y = start_pos_y + (j / FILTER_SIZE_X) * DILATION_SIZE_Y * FILTER_SIZE_X_PRELOAD;
-                const uint pos_x = (j % FILTER_SIZE_X);
-                dotProd[i] += (int)in[pos_y + pos_x] * (int)w[j];
-#elif defined(LOAD_Y_WITH_STRIDES) && !defined(LOAD_X_WITH_STRIDES)
-                const uint start_pos_y = i * FILTER_SIZE_X_PRELOAD;
-                const uint pos_y = start_pos_y + (j / FILTER_SIZE_X) * FILTER_SIZE_X_PRELOAD;
-                const uint pos_x = (j % FILTER_SIZE_X) * DILATION_SIZE_X;
-                dotProd[i] += (int)in[pos_y + pos_x] * (int)w[j];
-#else
-                const uint start_pos_y = i * STRIDE_SIZE_Y * FILTER_SIZE_X_PRELOAD;
-                const uint pos_y = start_pos_y + (j / FILTER_SIZE_X) * DILATION_SIZE_Y * FILTER_SIZE_X_PRELOAD;
-                const uint pos_x = (j % FILTER_SIZE_X) * DILATION_SIZE_X;
-                dotProd[i] += (int)in[pos_y + pos_x] * (int)w[j];
-#endif  // defined(LOAD_X_WITH_STRIDES) && defined(LOAD_Y_WITH_STRIDES)
-            }
-        }
-
-        __attribute__((opencl_unroll_hint(BLOCK_Y_SIZE)))
-        for (int i = 0; i < BLOCK_Y_SIZE; i++) {
-            // inputs shift
-#ifndef DONT_USE_X_SHIFTS
-#if (FILTER_SIZE_X_PRELOAD - STRIDE_SIZE_X) > 0
-            __attribute__((opencl_unroll_hint(FILTER_SIZE_X_PRELOAD - STRIDE_SIZE_X)))
-#endif
-            for (int j = 0; j < FILTER_SIZE_X_PRELOAD - STRIDE_SIZE_X; j++) {
-                in[i * FILTER_SIZE_X_PRELOAD + j] = in[i * FILTER_SIZE_X_PRELOAD + j + STRIDE_SIZE_X];
-            }
-#endif
-
-            // read additional inputs
-#ifdef LOAD_Y_WITH_STRIDES
-            int input_offset_y = input_y + i * DILATION_SIZE_Y;
-#else
-            int input_offset_y = input_y + i;
-#endif  // LOAD_Y_WITH_STRIDES
-
-#if defined(DONT_USE_X_SHIFTS)
-            __attribute__((opencl_unroll_hint(FILTER_SIZE_X_PRELOAD)))
-            for (int j = 0; j < FILTER_SIZE_X_PRELOAD; j++) {
-                int input_offset_x = input_x + ((l + 1) * STRIDE_SIZE_X) + j * DILATION_SIZE_X;
-                uint input_idx = input_offset + (uint)input_offset_x * INPUT0_X_PITCH + (uint)input_offset_y * INPUT0_Y_PITCH;
-                in[i * FILTER_SIZE_X_PRELOAD + j] = input[input_idx];
-            }
-
-#else
-            {
-                int input_offset_x = input_x + ((l + 1) * STRIDE_SIZE_X) + (FILTER_SIZE_X - 1) * DILATION_SIZE_X;
-                uint input_idx = input_offset + (uint)input_offset_x * INPUT0_X_PITCH + (uint)input_offset_y * INPUT0_Y_PITCH;
-                in[i * FILTER_SIZE_X_PRELOAD + FILTER_SIZE_X_PRELOAD - 1] = input[input_idx];
-            }
-#endif  // defined(DONT_USE_X_SHIFTS)
-        }
-
-        __attribute__((opencl_unroll_hint(SPLIT_Y)))
-        for (int m = 0; m < SPLIT_Y; m++) {
-#if BIAS_TERM
-        #if BIAS_PER_OUTPUT
-        #if OUTPUT_LAYOUT_BYXF_AF32 == 1
-            const uint bias_index = GET_DATA_INDEX(BIAS, b, of, y + m, x + l);
-        #elif OUTPUT_LAYOUT_B_FS_YX_FSV4 == 1
-            const uint bias_index = GET_DATA_B_FS_YX_FSV4_INDEX(BIAS, b, of, y + m, x + l);
-        #else
-            #error "Incorrect output layout"
-        #endif
-        #elif BIAS_PER_OFM
-            const uint bias_index = of;
-        #endif
-            // TODO: Maybe half should be supported as well.
-            float res = (float)dotProd[m] + biases[bias_index];
-#else
-            float res = (float)dotProd[m];
-#endif
-            dotProd[m] = 0;
-
-#if HAS_FUSED_OPS
-#if FUSED_OPS_CAN_USE_PRELOAD
-            FUSED_OPS_CALC;
-#else
-            FUSED_OPS;
-#endif
-            out[m] = FUSED_OPS_RESULT;
-#else
-            out[m] = TO_OUTPUT_TYPE(res);
-#endif
-        }
-
-        __attribute__((opencl_unroll_hint(SPLIT_Y)))
-        for (int m = 0; m < SPLIT_Y; m++) {
-#ifdef SPLIT_LEFTOVERS
-            if (y + m >= OUTPUT_SIZE_Y)
-                continue;
-#endif
-            const uint dst_index = OUTPUT_GET_INDEX(b, of, y + m, x + l);
-            output[dst_index] = ACTIVATION(out[m], ACTIVATION_PARAMS);
-        }
-    }  // OUT_BLOCK_WIDTH
-}
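The inputs-shift loop in the kernel deleted above implements register-level reuse of the preloaded window: when the output column advances, each row shifts left by STRIDE_SIZE_X and only the vacated tail positions are refilled from global memory. A scalar sketch of the pattern, with illustrative sizes (the real kernel parameterizes these with FILTER_SIZE_X_PRELOAD and the stride/dilation macros):

#define WIN 5      /* preloaded elements per row (illustrative) */
#define STRIDE 2   /* x stride (illustrative) */

/* Shift the window left by STRIDE, then refill the vacated tail;
 * load(pos) stands in for the strided global-memory read. */
static void advance_window(int win[WIN], int base, int (*load)(int)) {
    for (int j = 0; j < WIN - STRIDE; j++)
        win[j] = win[j + STRIDE];
    for (int j = WIN - STRIDE; j < WIN; j++)
        win[j] = load(base + j);
}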
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad.cl
deleted file mode 100644 (file)
index e343f22..0000000
+++ /dev/null
@@ -1,124 +0,0 @@
-// Copyright (c) 2019 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "include/common.cl"
-
-#include "include/data_types.cl"
-#include "include/fetch.cl"
-#include "include/mmad.cl"
-
-#define FILTER_IFM_MMAD_NUM ((FILTER_IFM_NUM + 31) / 32)
-#define FILTER_OFM_MMAD_NUM ((FILTER_OFM_NUM + 7) / 8)
-#define FILTER_IFM_ALIGNED (FILTER_IFM_MMAD_NUM * 32)
-#define FILTER_OFM_ALIGNED (FILTER_OFM_MMAD_NUM * 8)
-
-__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
-KERNEL(convolution_MMAD)(
-    __global INPUT0_TYPE* input,
-    __global OUTPUT_TYPE* output,
-    __global FILTER_TYPE* weights,
-#if BIAS_TERM
-    __global BIAS_TYPE* biases,
-#endif
-#if HAS_FUSED_OPS_DECLS
-    FUSED_OPS_DECLS,
-#endif
-    uint split_idx)
-{
-    const uint x = get_global_id(0);
-    const uint y = get_global_id(1);
-#if OUTPUT_BATCH_NUM == 1
-    const uint f = get_global_id(2);
-    const uint b = 0;
-#else
-    const uint f = (uint)get_global_id(2) % FILTER_OFM_ALIGNED;
-    const uint b = (uint)get_global_id(2) / FILTER_OFM_ALIGNED;
-#endif
-
-    int dotProd = 0;
-
-    const int input_x = x * STRIDE_SIZE_X - PADDING_SIZE_X;
-    const int input_y = y * STRIDE_SIZE_Y - PADDING_SIZE_Y;
-
-    const uint in_split_offset = split_idx * INPUT0_FEATURE_PITCH * FILTER_IFM_NUM;
-
-    const uint filter_offset = ((uint)get_group_id(2) % FILTER_OFM_MMAD_NUM) * FILTER_OFM_BLOCK_PITCH;
-    const uint input_offset = b*INPUT0_BATCH_PITCH + INPUT0_OFFSET + in_split_offset;
-
-    for (uint k = 0; k < FILTER_IFM_MMAD_NUM; ++k)
-    {
-        for (uint j = 0; j < FILTER_SIZE_Y ; ++j)
-        {
-            const int input_offset_y = input_y + j * DILATION_SIZE_Y;
-            const bool zero_y = input_offset_y >= INPUT0_SIZE_Y || input_offset_y < 0;
-
-            if(!zero_y)
-            {
-                for (uint i = 0; i < FILTER_SIZE_X ; ++i)
-                {
-                    const int input_offset_x = input_x + i * DILATION_SIZE_X;
-                    const bool zero_x = input_offset_x >= INPUT0_SIZE_X || input_offset_x < 0;
-
-                    if(!zero_x)
-                    {
-                        uint input_idx = input_offset + (uint)input_offset_x*INPUT0_X_PITCH + (uint)input_offset_y*INPUT0_Y_PITCH + k*32;
-                        uint filter_idx = filter_offset + k*FILTER_Y_PITCH * FILTER_SIZE_Y + j*FILTER_Y_PITCH + i*FILTER_X_PITCH;
-
-                        PACKED_TYPE input_data = AS_PACKED_TYPE(intel_sub_group_block_read((const __global uint*)(input + input_idx)));
-                        MAKE_VECTOR_TYPE(PACKED_TYPE, 8) activations;  //activations of all lanes
-                        activations.s0 = sub_group_broadcast(input_data, 0);
-                        activations.s1 = sub_group_broadcast(input_data, 1);
-                        activations.s2 = sub_group_broadcast(input_data, 2);
-                        activations.s3 = sub_group_broadcast(input_data, 3);
-                        activations.s4 = sub_group_broadcast(input_data, 4);
-                        activations.s5 = sub_group_broadcast(input_data, 5);
-                        activations.s6 = sub_group_broadcast(input_data, 6);
-                        activations.s7 = sub_group_broadcast(input_data, 7);
-
-                        int8 weights_data = as_int8(intel_sub_group_block_read8((const __global uint*)(weights + filter_idx)));
-
-                        dotProd = MMAD_8(activations, weights_data, dotProd);
-                    }
-                }
-            }
-        }
-    }
-
-#if BIAS_TERM
-#if   BIAS_PER_OUTPUT
-    const uint bias_index = GET_DATA_INDEX(BIAS, b, f, y, x);
-#elif BIAS_PER_OFM
-    const uint bias_index = f;
-#endif
-    float res = (float)dotProd + biases[bias_index];
-#else
-    float res = (float)dotProd;
-#endif // BIAS_TERM
-
-#if HAS_FUSED_OPS
-    FUSED_OPS;
-    OUTPUT_TYPE result = FUSED_OPS_RESULT;
-#else
-    OUTPUT_TYPE result = TO_OUTPUT_TYPE(res);
-#endif
-
-    const uint out_split_offset = split_idx * OUTPUT_FEATURE_PITCH * OUTPUT_FEATURE_NUM;
-    const uint dst_index = OUTPUT_GET_INDEX(b, f, y, x) + out_split_offset;
-    output[dst_index] = result;
-}
-
-#undef FILTER_IFM_MMAD_NUM
-#undef FILTER_OFM_MMAD_NUM
-#undef FILTER_IFM_ALIGNED
-#undef FILTER_OFM_ALIGNED
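MMAD_8 above broadcasts one packed 32-bit word from each of the eight sub-group lanes and folds int8 dot products into a 32-bit accumulator. A scalar reference for the elementary step — the 4-way byte dot product underlying both MMAD and IMAD; whether the activation bytes are signed depends on PACKED_TYPE, and signed is assumed here:

#include <stdint.h>

/* acc += dot(a, b) over four signed 8-bit lanes packed in 32-bit words. */
static int32_t dp4a_ref(int32_t acc, uint32_t a_packed, uint32_t b_packed) {
    for (int k = 0; k < 4; k++) {
        int8_t a = (int8_t)(a_packed >> (8 * k));
        int8_t b = (int8_t)(b_packed >> (8 * k));
        acc += (int32_t)a * (int32_t)b;
    }
    return acc;
}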
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_blocks.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_blocks.cl
deleted file mode 100644 (file)
index d5acf18..0000000
+++ /dev/null
@@ -1,158 +0,0 @@
-// Copyright (c) 2016-2017 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "include/common.cl"
-
-#include "include/data_types.cl"
-#include "include/fetch.cl"
-#include "include/mmad.cl"
-
-#define FILTER_IFM_MMAD_NUM ((FILTER_IFM_NUM + 31) / 32)
-#define FILTER_OFM_MMAD_NUM ((FILTER_OFM_NUM + 7) / 8)
-#define FILTER_IFM_ALIGNED (FILTER_IFM_MMAD_NUM * 32)
-#define FILTER_OFM_ALIGNED (FILTER_OFM_MMAD_NUM * 8)
-
-__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
-KERNEL(convolution_MMAD_blocks)(
-    __global INPUT0_TYPE* input,
-    __global OUTPUT_TYPE* output,
-    __global FILTER_TYPE* weights,
-#if BIAS_TERM
-    __global BIAS_TYPE* biases,
-#endif
-#if HAS_FUSED_OPS_DECLS
-    FUSED_OPS_DECLS,
-#endif
-    uint split_idx)
-{
-    const uint x = (uint)get_global_id(0) * OUTPUT_BLOCK_WIDTH;
-    const uint y = (uint)get_global_id(1) * OUTPUT_BLOCK_HEIGHT;
-#if OUTPUT_BATCH_NUM == 1
-    const uint f = (uint)get_global_id(2);
-    const uint b = 0;
-#else
-    const uint f = (uint)get_global_id(2) % FILTER_OFM_ALIGNED;
-    const uint b = (uint)get_global_id(2) / FILTER_OFM_ALIGNED;
-#endif
-
-    int acc[OUTPUT_BLOCK_WIDTH * OUTPUT_BLOCK_HEIGHT] = { 0 };
-    PACKED_TYPE in[IN_BLOCK_ARRAY_SIZE];
-
-    const int input_x = x * STRIDE_SIZE_X - PADDING_SIZE_X;
-    const int input_y = y * STRIDE_SIZE_Y - PADDING_SIZE_Y;
-
-    const uint in_split_offset = split_idx * INPUT0_FEATURE_PITCH * FILTER_IFM_NUM;
-
-    const uint filter_offset = ((uint)get_group_id(2) % FILTER_OFM_MMAD_NUM) * FILTER_OFM_BLOCK_PITCH;
-    const uint input_offset = b*INPUT0_BATCH_PITCH + INPUT0_OFFSET + in_split_offset;
-
-    uint in_addr = input_offset + input_x * INPUT0_X_PITCH + input_y * INPUT0_Y_PITCH;
-    uint filter_idx = filter_offset;
-
-    __attribute__((opencl_unroll_hint(1)))
-    for (uint k = 0; k < FILTER_IFM_MMAD_NUM; ++k)
-    {
-        // preload input data
-        for(uint in_block_pos = 0; in_block_pos < IN_BLOCK_ARRAY_SIZE; in_block_pos++)
-        {
-            uint block_x = in_block_pos % IN_BLOCK_WIDTH;
-            uint block_y = in_block_pos / IN_BLOCK_WIDTH;
-            uint input_idx = in_addr + block_x * INPUT0_X_PITCH + block_y * INPUT0_Y_PITCH;
-            in[in_block_pos] = AS_PACKED_TYPE(intel_sub_group_block_read((const __global uint*)(input + input_idx)));
-        }
-        // end of preloading input data
-
-        __attribute__((opencl_unroll_hint(FILTER_SIZE_Y)))
-        for (uint j = 0; j < FILTER_SIZE_Y ; ++j)
-        {
-            __attribute__((opencl_unroll_hint(FILTER_SIZE_X)))
-            for (uint i = 0; i < FILTER_SIZE_X ; ++i)
-            {
-                int8 weights_data = as_int8(intel_sub_group_block_read8((const __global uint*)(weights + filter_idx)));
-
-                __attribute__((opencl_unroll_hint(OUTPUT_BLOCK_HEIGHT)))
-                for(uint br = 0; br < OUTPUT_BLOCK_HEIGHT; br++)
-                {
-                    __attribute__((opencl_unroll_hint(OUTPUT_BLOCK_WIDTH)))
-                    for(uint bc = 0; bc < OUTPUT_BLOCK_WIDTH; bc++)
-                    {
-                        PACKED_TYPE input_data = in[(br * STRIDE_SIZE_Y + j) * IN_BLOCK_WIDTH + bc * STRIDE_SIZE_X + i];
-                        MAKE_VECTOR_TYPE(PACKED_TYPE, 8) activations;  //activations of all lanes
-                        activations.s0 = sub_group_broadcast(input_data, 0);
-                        activations.s1 = sub_group_broadcast(input_data, 1);
-                        activations.s2 = sub_group_broadcast(input_data, 2);
-                        activations.s3 = sub_group_broadcast(input_data, 3);
-                        activations.s4 = sub_group_broadcast(input_data, 4);
-                        activations.s5 = sub_group_broadcast(input_data, 5);
-                        activations.s6 = sub_group_broadcast(input_data, 6);
-                        activations.s7 = sub_group_broadcast(input_data, 7);
-
-                        acc[br * OUTPUT_BLOCK_WIDTH + bc] = MMAD_8(activations, weights_data, acc[br * OUTPUT_BLOCK_WIDTH + bc]);
-                    }
-                }
-                filter_idx += 32*8; // 32 features per channel * 8 output features per SIMD channel
-            }
-        }
-        in_addr += 32; // 4 features per channel * 8 SIMD channels
-    }
-
-#if BIAS_TERM
-#if   BIAS_PER_OUTPUT
-    const uint bias_index = GET_DATA_INDEX(BIAS, b, f, y, x);
-#elif BIAS_PER_OFM
-    const uint bias_index = f;
-#endif
-#endif // BIAS_TERM
-
-    OUTPUT_TYPE out[OUTPUT_BLOCK_WIDTH * OUTPUT_BLOCK_HEIGHT] = { 0 };
-    for(uint br = 0; br < OUTPUT_BLOCK_HEIGHT; br++)
-    {
-        for(uint bc = 0; bc < OUTPUT_BLOCK_WIDTH; bc++)
-        {
-#if BIAS_TERM
-             // TODO: Maybe half should be supported as well.
-             float res = (float)acc[br * OUTPUT_BLOCK_WIDTH + bc] + biases[bias_index];
-#else
-             float res = (float)acc[br * OUTPUT_BLOCK_WIDTH + bc];
-#endif
-#if HAS_FUSED_OPS
-            FUSED_OPS;
-            out[br * OUTPUT_BLOCK_WIDTH + bc] = FUSED_OPS_RESULT;
-#else
-            out[br * OUTPUT_BLOCK_WIDTH + bc] = TO_OUTPUT_TYPE(res);
-#endif
-        }
-    }
-
-    const uint out_split_offset = split_idx * OUTPUT_FEATURE_PITCH * OUTPUT_FEATURE_NUM;
-    for(uint br = 0; br < OUTPUT_BLOCK_HEIGHT; br++)
-    {
-        if(y + br < OUTPUT_SIZE_Y)
-        {
-            for(uint bc = 0; bc < OUTPUT_BLOCK_WIDTH; bc++)
-            {
-                if(x + bc < OUTPUT_SIZE_X)
-                {
-                    const uint dst_index = OUTPUT_GET_INDEX(b, f, y+br, x+bc) + out_split_offset;
-                    output[dst_index] = out[br * OUTPUT_BLOCK_WIDTH + bc];
-                }
-            }
-        }
-    }
-}
-
-#undef FILTER_IFM_MMAD_NUM
-#undef FILTER_OFM_MMAD_NUM
-#undef FILTER_IFM_ALIGNED
-#undef FILTER_OFM_ALIGNED
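The blocked variant deleted above keeps an OUTPUT_BLOCK_WIDTH x OUTPUT_BLOCK_HEIGHT accumulator tile in registers and preloads an input patch large enough that every filter tap for every output in the tile is already resident: output (br, bc) with tap (j, i) reads patch row br*STRIDE_SIZE_Y + j, column bc*STRIDE_SIZE_X + i. Under those assumptions (unit dilation), the minimum patch dimensions work out as below — a sketch of how IN_BLOCK_WIDTH/IN_BLOCK_HEIGHT would be derived, not the kernel selector's actual code:

/* Minimum input patch so an out_h x out_w output tile needs no reloads
 * (no dilation assumed). */
static int in_block_width(int out_w, int stride_x, int filter_x) {
    return (out_w - 1) * stride_x + filter_x;
}

static int in_block_height(int out_h, int stride_y, int filter_y) {
    return (out_h - 1) * stride_y + filter_y;
}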
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_af32_imad_1x1.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_af32_imad_1x1.cl
deleted file mode 100644 (file)
index 78da56d..0000000
+++ /dev/null
@@ -1,163 +0,0 @@
-// Copyright (c) 2019 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "include/common.cl"
-
-#include "include/data_types.cl"
-#include "include/fetch.cl"
-#include "include/imad.cl"
-
-#if defined(ACCUMULATOR_TYPE)
-#undef ACCUMULATOR_TYPE
-#endif
-
-#if QUANTIZATION_TERM
-#    define ACCUMULATOR_TYPE int
-#    define ACTIVATION_TYPE float
-#    define TO_ACTIVATION_TYPE(x) convert_float(x)
-#else
-#    define ACCUMULATOR_TYPE INPUT0_TYPE
-#    define ACTIVATION_TYPE INPUT0_TYPE
-#    define TO_ACTIVATION_TYPE(x) TO_INPUT0_TYPE(x)
-#endif
-
-
-#define FILTER_IFM_SLICES_NUM ((FILTER_IFM_NUM + 31) / 32)
-#define FILTER_OFM_NUM_ALIGNED ((FILTER_OFM_NUM + SUB_GROUP_SIZE - 1) / SUB_GROUP_SIZE * SUB_GROUP_SIZE)
-
-// we pack four 8-bit activations into each 32-bit word
-#define PACK 4
-
-#define AS_TYPE_N_(type, n, x) as_##type##n(x)
-#define AS_TYPE_N(type, n, x) AS_TYPE_N_(type, n, x)
-#define AS_INPUT0_TYPE_4(x) AS_TYPE_N(INPUT0_TYPE, 4, x)
-
-__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
-KERNEL(fused_conv_eltwise_gpu_af32_imad_1x1)(
-    const __global PACKED_TYPE* input,
-    __global OUTPUT_TYPE* restrict output,
-    const __global uint* weights,
-#if BIAS_TERM
-    __global BIAS_TYPE* biases,
-#endif
-#if HAS_FUSED_OPS_DECLS
-    FUSED_OPS_DECLS,
-#endif
-    uint split_idx)
-{
-    const uint x = (uint)get_global_id(0) * TILE_LENGTH % OUTPUT_SIZE_X;
-    const uint y = (uint)get_global_id(0) * TILE_LENGTH / OUTPUT_SIZE_X;
-    const uint f = (((uint)get_global_id(1) * TILE_DEPTH) % FILTER_OFM_NUM_ALIGNED) / (TILE_DEPTH * SUB_GROUP_SIZE) * (TILE_DEPTH * SUB_GROUP_SIZE);
-    const uint b = ((uint)get_global_id(1) * TILE_DEPTH) / FILTER_OFM_NUM_ALIGNED;
-    const uint lid = get_sub_group_local_id();
-
-    const int input_x = x * STRIDE_SIZE_X - PADDING_SIZE_X;
-    const int input_y = y * STRIDE_SIZE_Y - PADDING_SIZE_Y;
-
-    PACKED_TYPE input_slice[TILE_LENGTH];
-    int8 weights_slice;
-    ACCUMULATOR_TYPE accu[TILE_LENGTH][TILE_DEPTH] = {0};
-
-    uint filter_idx = f * FILTER_IFM_SLICES_NUM * 32 / PACK;
-    uint in_addr = (INPUT0_GET_INDEX(b, 0, input_y, input_x)) / PACK;
-
-    __attribute__((opencl_unroll_hint(1)))
-    for (uint k = 0; k < FILTER_IFM_SLICES_NUM; ++k)
-    {
-        // Read 32 input features for each pixel in the tile. 4 features in each int, 8 ints across SIMD
-        __attribute__((opencl_unroll_hint(TILE_LENGTH)))
-        for (uint i = 0; i < TILE_LENGTH; ++i)
-        {
-            uint tmp_addr = in_addr + i * INPUT0_X_PITCH * STRIDE_SIZE_X / PACK;
-            input_slice[i] = AS_PACKED_TYPE(intel_sub_group_block_read((const __global uint*)input + tmp_addr));
-        }
-
-        // Loop through TILE_DEPTH output features
-        __attribute__((opencl_unroll_hint(TILE_DEPTH)))
-        for (uint of = 0; of < TILE_DEPTH; ++of)
-        {
-            // Read 32 weights: 8 ints, 4 weights in each int; each SIMD lane has its own weights
-            weights_slice = as_int8(intel_sub_group_block_read8(weights + filter_idx));
-
-            __attribute__((opencl_unroll_hint(TILE_LENGTH)))
-            for (uint i = 0; i < TILE_LENGTH; ++i)
-            {
-                PACKED_TYPE A_scalar;
-                A_scalar = sub_group_broadcast(input_slice[i], 0); accu[i][of] = IMAD(accu[i][of], AS_INPUT0_TYPE_4(A_scalar), as_char4(weights_slice.s0));
-                A_scalar = sub_group_broadcast(input_slice[i], 1); accu[i][of] = IMAD(accu[i][of], AS_INPUT0_TYPE_4(A_scalar), as_char4(weights_slice.s1));
-                A_scalar = sub_group_broadcast(input_slice[i], 2); accu[i][of] = IMAD(accu[i][of], AS_INPUT0_TYPE_4(A_scalar), as_char4(weights_slice.s2));
-                A_scalar = sub_group_broadcast(input_slice[i], 3); accu[i][of] = IMAD(accu[i][of], AS_INPUT0_TYPE_4(A_scalar), as_char4(weights_slice.s3));
-                A_scalar = sub_group_broadcast(input_slice[i], 4); accu[i][of] = IMAD(accu[i][of], AS_INPUT0_TYPE_4(A_scalar), as_char4(weights_slice.s4));
-                A_scalar = sub_group_broadcast(input_slice[i], 5); accu[i][of] = IMAD(accu[i][of], AS_INPUT0_TYPE_4(A_scalar), as_char4(weights_slice.s5));
-                A_scalar = sub_group_broadcast(input_slice[i], 6); accu[i][of] = IMAD(accu[i][of], AS_INPUT0_TYPE_4(A_scalar), as_char4(weights_slice.s6));
-                A_scalar = sub_group_broadcast(input_slice[i], 7); accu[i][of] = IMAD(accu[i][of], AS_INPUT0_TYPE_4(A_scalar), as_char4(weights_slice.s7));
-            }
-
-            filter_idx += 32 * 8 / 4; // 32 features per channel * 8 SIMD channels / sizeof(int)
-        }
-        in_addr += 4 * 8 / 4; // 4 features per channel * 8 SIMD channels / sizeof(int) -> next 32 input features
-    }
-
-#if TILE_DEPTH == 8
-    MAKE_VECTOR_TYPE(OUTPUT_TYPE, 8) result[TILE_LENGTH];
-#elif TILE_DEPTH == 4
-    MAKE_VECTOR_TYPE(OUTPUT_TYPE, 4) result[TILE_LENGTH];
-#endif
-
-    uint dst_index = (OUTPUT_GET_INDEX(b, f, y, x)) / PACK;
-
-    __attribute__((opencl_unroll_hint(TILE_LENGTH)))
-    for (uint i = 0; i < TILE_LENGTH; ++i)
-    {
-
-        __attribute__((opencl_unroll_hint(TILE_DEPTH)))
-        for (uint j = 0; j < TILE_DEPTH; ++j)
-        {
-            const uint f2 = f + lid * 4 + (j % 4) + (j / 4 * 32);
-            ACCUMULATOR_TYPE dotProd = accu[i][j];
-#if BIAS_TERM
-    #if BIAS_PER_OUTPUT
-            const uint bias_index = GET_DATA_INDEX(BIAS, b, f, y, x);
-    #elif BIAS_PER_OFM
-            const uint bias_index = f2;
-    #endif
-            ACTIVATION_TYPE res = TO_ACTIVATION_TYPE(dotProd)  + TO_ACTIVATION_TYPE(biases[bias_index]);
-#else
-            ACTIVATION_TYPE res = TO_ACTIVATION_TYPE(dotProd);
-#endif //BIAS_TERM
-
-        #if HAS_FUSED_OPS
-            FUSED_OPS;
-            result[i][j] = FUSED_OPS_RESULT;
-        #else
-            result[i][j] = TO_OUTPUT_TYPE(res);
-        #endif
-        }
-    }
-
-    __attribute__((opencl_unroll_hint(TILE_LENGTH)))
-    for (uint i = 0; i < TILE_LENGTH; ++i)
-    {
-#if TILE_DEPTH == 8
-        intel_sub_group_block_write2((__global uint*)output + dst_index + i * OUTPUT_X_PITCH / PACK, as_uint2(result[i]));
-#elif TILE_DEPTH == 4
-        intel_sub_group_block_write((__global uint*)output + dst_index + i * OUTPUT_X_PITCH / PACK, as_uint(result[i]));
-#endif
-    }
-}
-#undef FILTER_IFM_SLICES_NUM
-#undef FILTER_OFM_NUM_ALIGNED
-#undef ACCUMULATOR_TYPE
-#undef ACTIVATION_TYPE
-#undef TO_ACTIVATION_TYPE
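The PACK convention in the file deleted above is why every address is divided by 4 once a buffer is reinterpreted as uints: four 8-bit activations share one 32-bit word. A scalar sketch of the packing, assuming little-endian byte order to match as_char4 on the device:

#include <stdint.h>

#define PACK 4  /* four 8-bit activations per 32-bit word */

static uint32_t pack4(const int8_t v[PACK]) {
    uint32_t w = 0;
    for (int k = 0; k < PACK; k++)
        w |= (uint32_t)(uint8_t)v[k] << (8 * k);  /* byte k -> bits 8k..8k+7 */
    return w;
}

static int8_t unpack1(uint32_t w, int k) {  /* k in [0, PACK) */
    return (int8_t)(w >> (8 * k));
}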
index f22ddde..f470f17 100644 (file)
@@ -129,7 +129,6 @@ KERNEL (fused_convolution_eltwise_gpu_imad)(
     #else
             in[reg] = AS_PACKED_TYPE(conv_input[in_addr]);// read SIMD_SIZE elements wide
     #endif
-            // TODO This will cause errors for byxf_af32 format on input
             in_addr += (INPUT0_SIZE_X + IWPAD);  // move to next row down
 #endif
         }
@@ -191,9 +190,7 @@ KERNEL (fused_convolution_eltwise_gpu_imad)(
             if(!zero_c)
             #endif
             {
-            #if OUTPUT_LAYOUT_BYXF_AF32 == 1
-                uint out_idx = OUTPUT_GET_INDEX(batch, f, or + r, oc + c);
-            #elif OUTPUT_LAYOUT_B_FS_YX_FSV4 == 1
+            #if OUTPUT_LAYOUT_B_FS_YX_FSV4 == 1
                 uint out_idx = output_idx_offset + r * output_row_size_bytes + (c*PACK);
             #elif OUTPUT_LAYOUT_B_FS_YX_FSV16 == 1 || OUTPUT_LAYOUT_BS_FS_YX_BSV16_FSV16 == 1
                 uint out_idx = OUTPUT_GET_INDEX(batch, f, or + r, oc + c);
index bd56709..a6af7a2 100644 (file)
         ((b) / (sub_group_size))*CAT(prefix, _BATCH_PITCH)              \
     )
 
-inline uint FUNC(get_bf8_xy16_index)(uint b, uint f, uint y, uint x, uint x_size, uint y_size, uint f_size, uint offset)
-{
-    const uint xy_idx = x + y * x_size;
-    const uint xy_offset = (xy_idx % 16) + (xy_idx / 16) * 16 * 8;
-    const uint xy_block_num = (x_size * y_size + 16 - 1) / 16;
-    const uint f_offset = (f % 8) * 16 + (f / 8) * xy_block_num * 16 * 8;
-    const uint f_block_num = (f_size + 8 - 1) / 8;
-    const uint b_offset = b * f_block_num * xy_block_num * 128;
-
-    const size_t idx = offset + xy_offset + f_offset + b_offset;
-
-    return idx;
-}
-
 inline uint FUNC(get_b_fs_yx_fsv_index)(uint b, uint f, uint y, uint x,
                                         uint x_size, uint y_size, uint f_size, uint b_size,
                                         uint b_pad_before, uint b_pad_after,
@@ -495,93 +481,6 @@ inline uint FUNC(get_os_zyxi_osv16_index)(uint o, uint i, uint z, uint y, uint x
         CAT(prefix, _SIZE_Y),                           \
         CAT(prefix, _SIZE_Z))
 
-inline uint FUNC(get_byxf_af32_index)(uint b, uint f, uint y, uint x, uint y_pitch, uint b_pitch, uint f_size, uint f_pad_before, uint f_pad_after, uint offset)
-{
-    const uint f_aligned_to_32 = ((f_size + 31) / 32) * 32;
-    const uint x_pitch = f_pad_before + f_aligned_to_32 + f_pad_after;
-    const uint b_offset = b * b_pitch;
-    const uint xy_offset = x_pitch * x + y_pitch * y;
-    const uint f_offset = f;
-    const size_t idx = offset + xy_offset + b_offset + f_offset;
-    return idx;
-}
-
-#define GET_DATA_BYXF_AF32_INDEX(prefix, b, f, y, x) \
-    FUNC_CALL(get_byxf_af32_index)(                  \
-        b, f, y, x, CAT(prefix, _Y_PITCH),           \
-        CAT(prefix, _BATCH_PITCH),                   \
-        CAT(prefix, _FEATURE_NUM),                   \
-        CAT(prefix, _PAD_BEFORE_FEATURE_NUM),        \
-        CAT(prefix, _PAD_AFTER_FEATURE_NUM),         \
-        CAT(prefix, _OFFSET))
-
-inline uint FUNC(get_byx8_f4_index)(uint b, uint f, uint y, uint x,
-    uint x_pitch, uint y_pitch, uint b_pitch, uint f_size, uint x_size, uint offset)
-{
-    const uint f_aligned_to_4 = ((f_size + 3) / 4) * 4;
-    const uint x_aligned_to_8 = ((x_size + 7) / 8) * 8;
-    const uint b_offset = b * b_pitch;
-    const uint xy_offset = x * x_pitch + y * y_pitch;
-    const uint f_offset = f;
-    const size_t idx = offset + xy_offset + b_offset + f_offset;
-    return idx;
-}
-
-#define GET_DATA_BYX8_F4_INDEX(prefix, b, f, y, x) \
-    FUNC_CALL(get_byx8_f4_index)(              \
-        b, f, y, x, CAT(prefix, _X_PITCH),     \
-        CAT(prefix, _Y_PITCH),                 \
-        CAT(prefix, _BATCH_PITCH),             \
-        CAT(prefix, _FEATURE_NUM),             \
-        CAT(prefix, _SIZE_X),                  \
-        CAT(prefix, _OFFSET))
-
-#define GET_DATA_BF8_XY16_INDEX(prefix, b, f, y, x) \
-    FUNC_CALL(get_bf8_xy16_index)(                  \
-        b, f, y, x, CAT(prefix, _SIZE_X ),          \
-        CAT(prefix, _SIZE_Y),                       \
-        CAT(prefix, _FEATURE_NUM),                  \
-        CAT(prefix, _OFFSET))
-
-inline uint FUNC(get_fs_bs_yx_bsv4_fsv32_index)(uint b, uint f, uint y, uint x,
-    uint x_pad_before, uint x_size, uint x_pad_after,
-    uint y_pad_before, uint y_size, uint y_pad_after,
-    uint size_f, uint size_b)
-{
-    const uint f_32_aligned = ((size_f + 31)/32) * 32;
-    const uint b_4_aligned = ((size_b + 3)/4) * 4;
-    const uint fsv_idx = f % 32;
-    const uint bsv_idx = b % 4;
-    const uint fs_idx = f / 32;
-    const uint bs_idx = b / 4;
-
-    const uint x_pitch = 32 * 4;
-    const uint y_pitch = 32 * 4 * (x_pad_before + x_size + x_pad_after);
-    const uint bs_pitch = y_pitch * (y_pad_before + y_size + y_pad_after);
-    const uint fs_pitch = bs_pitch * (b_4_aligned / 4);
-    uint offset = x_pitch * x_pad_before + y_pitch * y_pad_before;
-
-    size_t idx = offset + fsv_idx + bsv_idx * 32;
-    idx += 32*4 * x;
-    idx += y * y_pitch;
-    idx += bs_idx * bs_pitch;
-    idx += fs_idx * fs_pitch;
-
-    return idx;
-}
-
-#define GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(prefix, b, f, y, x) \
-    FUNC_CALL(get_fs_bs_yx_bsv4_fsv32_index)(                  \
-        b, f, y, x,                                            \
-        CAT(prefix, _PAD_BEFORE_SIZE_X),                       \
-        CAT(prefix, _SIZE_X),                                  \
-        CAT(prefix, _PAD_AFTER_SIZE_X),                        \
-        CAT(prefix, _PAD_BEFORE_SIZE_Y),                       \
-        CAT(prefix, _SIZE_Y),                                  \
-        CAT(prefix, _PAD_AFTER_SIZE_Y),                        \
-        CAT(prefix, _FEATURE_NUM),                             \
-        CAT(prefix, _BATCH_NUM))
-
 #define GET_FILTER_GOIYX(prefix, g, o, i, y, x) \
     CAT(prefix, _OFFSET) +                      \
     (x)*CAT(prefix, _X_PITCH) +                 \
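The index helpers removed from fetch.cl all follow one recipe: pad one axis up to its vector multiple and keep the sub-vector remainder as the innermost address term. For byxf_af32, features were padded to a multiple of 32 and stored innermost; a simplified model of the deleted get_byxf_af32_index (feature padding and the offset parameter dropped, y pitch derived rather than passed in):

#include <stddef.h>

static size_t byxf_af32_index(size_t b, size_t f, size_t y, size_t x,
                              size_t x_size, size_t y_size, size_t f_size) {
    size_t f_aligned = (f_size + 31) / 32 * 32;  /* features padded to 32 = x pitch */
    size_t y_pitch   = f_aligned * x_size;
    size_t b_pitch   = y_pitch * y_size;
    return b * b_pitch + y * y_pitch + x * f_aligned + f;
}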
index 08c4bf3..003556b 100644 (file)
@@ -172,7 +172,7 @@ KERNEL(pooling_gpu_b_fs_yx_fsv4)(
     }
 #endif
 
-#if OUTPUT_LAYOUT_B_FS_YX_FSV4 || OUTPUT_LAYOUT_BYXF_AF32
+#if OUTPUT_LAYOUT_B_FS_YX_FSV4
     const uint output_pos = OUTPUT_GET_INDEX(b, f, y, x);
 #if OUTPUT_FEATURE_NUM % 4 == 0
     *((__global OUTPUT_VEC4*)(output + output_pos)) = final_result;
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_byxf_af32.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_byxf_af32.cl
deleted file mode 100644 (file)
index b3829ec..0000000
+++ /dev/null
@@ -1,189 +0,0 @@
-// Copyright (c) 2016-2020 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "include/include_all.cl"
-
-#define ACTIVATION_VEC4 MAKE_VECTOR_TYPE(ACTIVATION_TYPE, 4)
-#define TO_ACTIVATION_VEC4 CAT(convert_, ACTIVATION_VEC4)
-
-#define ACCUMULATOR_VEC4 MAKE_VECTOR_TYPE(ACCUMULATOR_TYPE, 4)
-
-#define OUTPUT_VEC4 MAKE_VECTOR_TYPE(OUTPUT_TYPE, 4)
-#define TO_OUTPUT_VEC4 CAT(convert_, OUTPUT_VEC4)
-
-#if MAX_POOLING
-    #define INIT_VAL ACCUMULATOR_VAL_MIN
-#elif AVG_POOLING
-    #define INIT_VAL ACCUMULATOR_VAL_ZERO
-#else
-    #error
-#endif
-
-inline ACCUMULATOR_TYPE FUNC(apply_pooling)(ACCUMULATOR_TYPE tmp, ACCUMULATOR_TYPE in)
-{
-#if MAX_POOLING
-    return ACCUMULATOR_MAX_FUNC(tmp, in);
-#elif AVG_POOLING
-    return tmp + in;
-#endif
-}
-
-KERNEL(pooling_gpu_byxf_af32)(
-    const __global INPUT0_TYPE* input,
-    __global OUTPUT_TYPE* output
-#if HAS_FUSED_OPS_DECLS
-    , FUSED_OPS_DECLS
-#endif
-)
-{
-    const uint x    = (uint)get_global_id(0);
-    const uint y    = (uint)get_global_id(1);
-    const uint bf   = (uint)get_global_id(2);
-    // we process 4 features per work-item, which is why bf is divided by 4 below
-    const uint aligned32_features = ((INPUT0_FEATURE_NUM + 31) / 32) * 32;
-    const uint f    = 4 * (bf % (aligned32_features / 4));
-    const uint b    = bf / (aligned32_features / 4);
-
-    typedef MAKE_VECTOR_TYPE(INPUT0_TYPE, 4) input_t;
-    if (x >= OUTPUT_SIZE_X)
-    {
-        return;
-    }
-
-    const int offset_x = (int)x*STRIDE_SIZE_X - PADDING_SIZE_X;
-    const int offset_y = (int)y*STRIDE_SIZE_Y - PADDING_SIZE_Y;
-
-    ACCUMULATOR_VEC4 result = INIT_VAL;
-
-#ifdef CHECK_BOUNDRY
-    if (offset_x + POOL_SIZE_X < 0 || offset_x >= INPUT0_SIZE_X ||
-        offset_y + POOL_SIZE_Y < 0 || offset_y >= INPUT0_SIZE_Y)
-    {
-        return;
-    }
-
-#ifdef DYNAMIC_KERNEL_DIVIDER
-    uint num_elementes = 0;
-#endif
-
-    const uint batch_and_feature_offset = GET_DATA_INDEX(INPUT0, b, f, 0, 0);
-    for(uint j = 0; j < POOL_SIZE_Y; j++)
-    {
-        int input_offset_y = offset_y + j;
-        bool zero_y = input_offset_y >= INPUT0_SIZE_Y || input_offset_y < 0;
-        if(!zero_y)
-        {
-            for(uint i = 0; i < POOL_SIZE_X; i++)
-            {
-                int input_offset_x = offset_x + i;
-                bool zero = input_offset_x >= INPUT0_SIZE_X || input_offset_x < 0;
-                if(!zero)
-                {
-                    const uint input_idx = batch_and_feature_offset + input_offset_y*INPUT0_Y_PITCH + input_offset_x*INPUT0_X_PITCH;
-
-                    input_t input_data = AS_INPUT_TYPE(intel_sub_group_block_read((const __global uint*)(input + input_idx)));
-                    result[0] = FUNC_CALL(apply_pooling)(result[0], TO_ACCUMULATOR_TYPE(input_data[0]));
-                    result[1] = FUNC_CALL(apply_pooling)(result[1], TO_ACCUMULATOR_TYPE(input_data[1]));
-                    result[2] = FUNC_CALL(apply_pooling)(result[2], TO_ACCUMULATOR_TYPE(input_data[2]));
-                    result[3] = FUNC_CALL(apply_pooling)(result[3], TO_ACCUMULATOR_TYPE(input_data[3]));
-
-#ifdef DYNAMIC_KERNEL_DIVIDER
-                    num_elementes++;
-#endif
-                }
-            }
-        }
-    }
-#ifdef DYNAMIC_WITH_PADDING_KERNEL_DIVIDER
-    const int hend = min(offset_y + POOL_SIZE_Y, INPUT0_SIZE_Y + PADDING_SIZE_Y);
-    const int wend = min(offset_x + POOL_SIZE_X, INPUT0_SIZE_X + PADDING_SIZE_X);
-    const uint num_elementes = (hend - offset_y) * (wend - offset_x);
-#endif
-#else
-    uint input_idx = GET_DATA_INDEX(INPUT0, b, f, offset_y, offset_x);
-
-    for(uint j = 0; j < POOL_SIZE_Y; j++)
-    {
-        for(uint i = 0; i < POOL_SIZE_X; i++)
-        {
-            input_t input_data = AS_INPUT_TYPE(intel_sub_group_block_read((const __global uint*)(input + input_idx)));
-            result[0] = FUNC_CALL(apply_pooling)(result[0], TO_ACCUMULATOR_TYPE(input_data[0]));
-            result[1] = FUNC_CALL(apply_pooling)(result[1], TO_ACCUMULATOR_TYPE(input_data[1]));
-            result[2] = FUNC_CALL(apply_pooling)(result[2], TO_ACCUMULATOR_TYPE(input_data[2]));
-            result[3] = FUNC_CALL(apply_pooling)(result[3], TO_ACCUMULATOR_TYPE(input_data[3]));
-
-            input_idx += INPUT0_X_PITCH;
-        }
-        input_idx += (INPUT0_Y_PITCH - POOL_SIZE_X*INPUT0_X_PITCH);
-    }
-
-#if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER)
-    const uint num_elementes = POOL_SIZE_X*POOL_SIZE_Y;
-#endif
-#endif
-
-#if defined AVG_POOLING
-#if ENABLE_ROUND
-    int4 not_fused_result;
-    for (uint i = 0; i < 4; ++i) {
-    #if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER)
-        not_fused_result[i] = convert_int(round((float)result[i] / max(num_elementes, (uint)1)));
-    #else
-        not_fused_result[i] = convert_int(round((float)result[i] / (int)(POOL_SIZE_Y * POOL_SIZE_X)));
-    #endif
-    }
-#else  // ENABLE_ROUND
-    float4 not_fused_result;
-    for (uint i = 0; i < 4; ++i) {
-    #if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER)
-        not_fused_result[i] = (float)result[i] / max(num_elementes, (uint)1);
-    #else
-        not_fused_result[i] = (float)result[i] / (int)(POOL_SIZE_Y * POOL_SIZE_X);
-    #endif
-    }
-#endif  // ENABLE_ROUND
-#else  // AVG_POOLING
-    float4 not_fused_result = convert_float4(result);
-#endif  // AVG_POOLING
-
-    OUTPUT_VEC4 final_result;
-#if HAS_FUSED_OPS
-    ACTIVATION_VEC4 fused_pool_result = TO_ACTIVATION_VEC4(not_fused_result);
-    FUSED_OPS;
-    final_result = FUSED_OPS_RESULT;
-    for(uint op = 0; op < 4; op++)
-    {
-        const uint output_pos = GET_DATA_INDEX(OUTPUT, b, f+op, y, x);
-        output[output_pos] = final_result[op];
-    }
-#else
-    final_result = TO_OUTPUT_VEC4(not_fused_result);
-    for(uint op = 0; op < 4; op++)
-    {
-        const uint output_pos = GET_DATA_INDEX(OUTPUT, b, f+op, y, x);
-        final_result[op] = TO_OUTPUT_TYPE(ACTIVATION(not_fused_result[op], ACTIVATION_PARAMS));
-        output[output_pos] = final_result[op];
-    }
-#endif
-}
-
-#undef INIT_VAL
-#undef ACCUMULATOR_VEC4
-
-#undef ACTIVATION_VEC4
-#undef TO_ACTIVATION_VEC4
-
-#undef OUTPUT_VEC4
-#undef TO_OUTPUT_VEC4
index 244d32f..572b29a 100644 (file)
@@ -40,7 +40,7 @@ KERNEL(pooling_gpu_int8_ref)(
 #endif
 )
 {
-#if OUTPUT_LAYOUT_BFYX  || OUTPUT_LAYOUT_BYXF || OUTPUT_LAYOUT_BYXF_AF32 || OUTPUT_LAYOUT_B_FS_YX_FSV4 || OUTPUT_LAYOUT_BFZYX
+#if OUTPUT_LAYOUT_BFYX  || OUTPUT_LAYOUT_BYXF || OUTPUT_LAYOUT_B_FS_YX_FSV4 || OUTPUT_LAYOUT_BFZYX
     const uint x    = (uint)get_global_id(0);
     const uint yz   = (uint)get_global_id(1);
 #if OUTPUT_DIMS == 5
index 3b68dd2..098d3b5 100644 (file)
@@ -44,7 +44,7 @@ KERNEL(pooling_gpu)(
 )
 {
 #if OUTPUT_LAYOUT_BFYX  || OUTPUT_LAYOUT_BYXF || OUTPUT_LAYOUT_BFZYX ||\
-    OUTPUT_LAYOUT_B_FS_ZYX_FSV16 || OUTPUT_LAYOUT_BS_FS_ZYX_BSV16_FSV16 || OUTPUT_LAYOUT_B_FS_YX_FSV4 || OUTPUT_LAYOUT_BYXF_AF32
+    OUTPUT_LAYOUT_B_FS_ZYX_FSV16 || OUTPUT_LAYOUT_BS_FS_ZYX_BSV16_FSV16 || OUTPUT_LAYOUT_B_FS_YX_FSV4
     const uint x    = (uint)get_global_id(0);
 #if OUTPUT_DIMS == 5
     const uint y   = (uint)get_global_id(1) % OUTPUT_SIZE_Y;
index cb4014e..81c33bb 100644 (file)
@@ -27,16 +27,8 @@ inline uint FUNC(get_output_index)(uint b, uint f, uint w, uint z, uint y, uint
 #elif defined OUTPUT_LAYOUT_BS_F_BSV8__AF8  || \
       defined OUTPUT_LAYOUT_BS_F_BSV16__AF8
     return GET_DATA_BS_FYX_BSV8_INDEX(OUTPUT, b, f, y, x, SUB_GROUP_SIZE);
-#elif defined OUTPUT_LAYOUT_BF8_XY16
-    return GET_DATA_BF8_XY16_INDEX(OUTPUT, b, f, y, x);
 #elif defined OUTPUT_LAYOUT_B_FS_YX_FSV16
     return GET_DATA_B_FS_YX_FSV16_INDEX(OUTPUT, b, f, y, x);
-#elif defined OUTPUT_LAYOUT_BYXF_AF32
-    return GET_DATA_BYXF_AF32_INDEX(OUTPUT, b, f, y, x);
-#elif defined OUTPUT_LAYOUT_BYX8_F4
-    return GET_DATA_BYX8_F4_INDEX(OUTPUT, b, f, y, x);
-#elif defined OUTPUT_LAYOUT_FS_BS_YX_BSV4_FSV32
-    return GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(OUTPUT, b, f, y, x);
 #elif defined OUTPUT_LAYOUT_B_FS_YX_FSV4
     return GET_DATA_B_FS_YX_FSV4_INDEX(OUTPUT, b, f, y, x);
 #elif defined OUTPUT_LAYOUT_FS_B_YX_FSV32
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/reorder_data_byxf_f32_to_byx8_f4_i8.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/reorder_data_byxf_f32_to_byx8_f4_i8.cl
deleted file mode 100644 (file)
index f385f07..0000000
+++ /dev/null
@@ -1,130 +0,0 @@
-// Copyright (c) 2016-2017 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "include/reshape_dims.cl"
-#include "include/fetch.cl"
-
-#include "include/data_types.cl"
-
-///////////////////////// Input Index /////////////////////////
-inline uint FUNC(get_input_index)(uint b, uint f, uint y, uint x)
-{
-#if   INPUT0_SIMPLE
-    return GET_DATA_INDEX(INPUT0, b, f, y, x);
-#elif defined INPUT0_LAYOUT_BS_F_BSV8__AF8  || \
-      defined INPUT0_LAYOUT_BS_F_BSV16__AF8
-    return GET_DATA_BS_FYX_BSV8_INDEX(INPUT0, b, f, y, x, SUB_GROUP_SIZE);
-#elif defined INPUT0_LAYOUT_BF8_XY16
-    return GET_DATA_BF8_XY16_INDEX(INPUT0, b, f, y, x);
-#elif defined INPUT0_LAYOUT_BYXF_AF32
-    return GET_DATA_BYXF_AF32_INDEX(INPUT0, b, f, y, x);
-#elif defined INPUT0_LAYOUT_BYX8_F4
-    return GET_DATA_BYX8_F4_INDEX(INPUT0, b, f, y, x);
-#elif defined INPUT0_LAYOUT_FS_BS_YX_BSV4_FSV32
-    return GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(INPUT0, b, f, y, x);
-#elif defined INPUT0_LAYOUT_B_FS_YX_FSV4
-    return GET_DATA_B_FS_YX_FSV4_INDEX(INPUT0, b, f, y, x);
-#else
-#error reorder_data.cl: input format - not supported
-#endif
-}
-
-///////////////////////// Output Index /////////////////////////
-
-inline uint FUNC(get_output_index)(uint b, uint f, uint y, uint x)
-{
-#if   OUTPUT_SIMPLE
-    return GET_DATA_INDEX(OUTPUT, b, f, y, x);
-#elif defined OUTPUT_LAYOUT_BS_F_BSV8__AF8  || \
-      defined OUTPUT_LAYOUT_BS_F_BSV16__AF8
-    return GET_DATA_BS_FYX_BSV8_INDEX(OUTPUT, b, f, y, x, SUB_GROUP_SIZE);
-#elif defined OUTPUT_LAYOUT_BF8_XY16
-    return GET_DATA_BF8_XY16_INDEX(OUTPUT, b, f, y, x);
-#elif defined OUTPUT_LAYOUT_BYXF_AF32
-    return GET_DATA_BYXF_AF32_INDEX(OUTPUT, b, f, y, x);
-#elif defined OUTPUT_LAYOUT_BYX8_F4
-    return GET_DATA_BYX8_F4_INDEX(OUTPUT, b, f, y, x);
-#elif defined OUTPUT_LAYOUT_FS_BS_YX_BSV4_FSV32
-    return GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(OUTPUT, b, f, y, x);
-#elif defined OUTPUT_LAYOUT_B_FS_YX_FSV4
-    return GET_DATA_B_FS_YX_FSV4_INDEX(OUTPUT, b, f, y, x);
-#else
-#error reorder_data.cl: output format - not supported
-#endif
-}
-
-__attribute__((intel_reqd_sub_group_size(16)))
-KERNEL (reorder_data_byxf_f32_to_byx8_f4_i8)(
-    const __global INPUT_REORDER_TYPE* input, 
-    __global OUTPUT_REORDER_TYPE* output
-#ifdef MEAN_SUBTRACT_IN_BUFFER
-    , __global MEAN_SUBTRACT_TYPE* mean_subtract
-#endif
-    )
-{
-    const uint x = get_global_id(0);
-    const uint y = get_group_id(1);
-    const uint b = (uint)get_group_id(2) * WG_BATCH_SIZE + (uint)get_sub_group_id();
-
-    const uint input_idx  = FUNC_CALL(get_input_index)(b, 0, y, x);
-    const uint output_idx = FUNC_CALL(get_output_index)(b, 0, y, x);
-
-#if defined MEAN_SUBTRACT_INSIDE_PARAMS
-    float4 res;
-    res.s0 = TO_MEAN_TYPE(input[input_idx]);
-    res.s0 = MEAN_OP(res.s0, VALUE_TO_SUBTRACT[0 % VALUE_TO_SUBTRACT_SIZE]);
-    res.s1 = TO_MEAN_TYPE(input[input_idx+1]);
-    res.s1 = MEAN_OP(res.s1, VALUE_TO_SUBTRACT[1 % VALUE_TO_SUBTRACT_SIZE]);
-    res.s2 = TO_MEAN_TYPE(input[input_idx+2]);
-    res.s2 = MEAN_OP(res.s2, VALUE_TO_SUBTRACT[2 % VALUE_TO_SUBTRACT_SIZE]);
-    res.s3 = 0;
-#elif defined MEAN_SUBTRACT_IN_BUFFER
-#if defined MEAN_PER_FEATURE
-    MAKE_VECTOR_TYPE(MEAN_SUBTRACT_TYPE, 4) res;
-    res.s0 = TO_MEAN_TYPE(input[input_idx]);
-    res.s0 = MEAN_OP(res.s0, mean_subtract[0]);
-    res.s1 = TO_MEAN_TYPE(input[input_idx+1]);
-    res.s1 = MEAN_OP(res.s1, mean_subtract[1]);
-    res.s2 = TO_MEAN_TYPE(input[input_idx+2]);
-    res.s2 = MEAN_OP(res.s2, mean_subtract[2]);
-    res.s3 = 0;
-#else
-    MAKE_VECTOR_TYPE(MEAN_SUBTRACT_TYPE, 4) res;
-    res.s0 = TO_MEAN_TYPE(input[input_idx]);
-    res.s1 = TO_MEAN_TYPE(input[input_idx+1]);
-    res.s2 = TO_MEAN_TYPE(input[input_idx+2]);
-    res.s3 = 0; 
-
-    res.s0 = MEAN_OP(res.s0, mean_subtract[0]);
-    res.s1 = MEAN_OP(res.s1, mean_subtract[1]);
-    res.s2 = MEAN_OP(res.s2, mean_subtract[2]);
-#endif
-#else
-    MAKE_VECTOR_TYPE(CALC_TYPE, 4) res;
-    res.s0 = TO_CALC_TYPE(input[input_idx]);
-    res.s1 = TO_CALC_TYPE(input[input_idx+1]);
-    res.s2 = TO_CALC_TYPE(input[input_idx+2]);
-    res.s3 = 0;
-#endif
-
-    char4 out_vals;
-    out_vals.s0 = ACTIVATION_TYPED(OUTPUT_REORDER, TO_OUTPUT_REORDER_TYPE_SAT(res.s0), ACTIVATION_PARAMS_TYPED);
-    out_vals.s1 = ACTIVATION_TYPED(OUTPUT_REORDER, TO_OUTPUT_REORDER_TYPE_SAT(res.s1), ACTIVATION_PARAMS_TYPED);
-    out_vals.s2 = ACTIVATION_TYPED(OUTPUT_REORDER, TO_OUTPUT_REORDER_TYPE_SAT(res.s2), ACTIVATION_PARAMS_TYPED);
-    out_vals.s3 = 0;
-
-    __global uint* dst = (__global uint*)output;
-    dst[output_idx/4] = as_uint(out_vals);
-}
index 8c1360a..45c1ff6 100644 (file)
@@ -30,8 +30,6 @@ inline uint FUNC(get_input_index)(uint b, uint f, uint w, uint z, uint y, uint x
 #elif defined INPUT0_LAYOUT_BS_F_BSV8__AF8  || \
       defined INPUT0_LAYOUT_BS_F_BSV16__AF8
     return GET_DATA_BS_FYX_BSV8_INDEX(INPUT0, b, f, y, x, SUB_GROUP_SIZE);
-#elif defined INPUT0_LAYOUT_BF8_XY16
-    return GET_DATA_BF8_XY16_INDEX(INPUT0, b, f, y, x);
 #elif defined INPUT0_LAYOUT_B_FS_YX_FSV16
     return GET_DATA_B_FS_YX_FSV16_INDEX(INPUT0, b, f, y, x);
 #elif defined INPUT0_LAYOUT_B_FS_ZYX_FSV16
@@ -54,8 +52,6 @@ inline uint FUNC(get_output_index)(uint b, uint f, uint w, uint z, uint y, uint
 #elif defined OUTPUT_LAYOUT_BS_F_BSV8__AF8  || \
       defined OUTPUT_LAYOUT_BS_F_BSV16__AF8
     return GET_DATA_BS_FYX_BSV8_INDEX(OUTPUT, b, f, y, x, SUB_GROUP_SIZE);
-#elif defined OUTPUT_LAYOUT_BF8_XY16
-    return GET_DATA_BF8_XY16_INDEX(OUTPUT, b, f, y, x);
 #elif defined OUTPUT_LAYOUT_B_FS_YX_FSV16
     return GET_DATA_B_FS_YX_FSV16_INDEX(OUTPUT, b, f, y, x);
 #elif defined OUTPUT_LAYOUT_B_FS_ZYX_FSV16
index 7ed82f3..f6014c6 100644 (file)
@@ -26,10 +26,6 @@ inline uint FUNC(get_input_index)(uint b, uint f, uint y, uint x)
 #elif defined INPUT0_LAYOUT_BS_F_BSV8__AF8  || \
       defined INPUT0_LAYOUT_BS_F_BSV16__AF8
     return GET_DATA_BS_FYX_BSV8_INDEX(INPUT0, b, f, y, x, SUB_GROUP_SIZE);
-#elif defined INPUT0_LAYOUT_BF8_XY16
-    return GET_DATA_BF8_XY16_INDEX(INPUT0, b, f, y, x);
-#elif defined INPUT0_LAYOUT_BYXF_AF32
-    return GET_DATA_BYXF_AF32_INDEX(INPUT0, b, f, y, x);
 #elif defined INPUT0_LAYOUT_B_FS_YX_FSV16
     return GET_DATA_B_FS_YX_FSV16_INDEX(INPUT0, b, f, y, x);
 #elif defined INPUT0_LAYOUT_FS_B_YX_FSV32
index e0d7048..4b5176c 100644 (file)
@@ -330,8 +330,6 @@ JitDefinitions DataTensorJitConstant::GetDefinitions() const {
                 raw_index_func_val = "GET_DATA_INDEX_RAW(" + _name + ", b, f, y, x)";
             } else if (layout == DataLayout::b_fs_yx_fsv16 ||
                        layout == DataLayout::b_fs_yx_fsv32 ||
-                       layout == DataLayout::byxf_af32 ||
-                       layout == DataLayout::fs_bs_yx_bsv4_fsv32 ||
                        layout == DataLayout::b_fs_yx_fsv4 ||
                        layout == DataLayout::fs_b_yx_fsv32 ||
                        layout == DataLayout::bs_fs_yx_bsv16_fsv16) {
index b175572..d8aa3c5 100644
@@ -103,11 +103,7 @@ std::string toString(DataLayout l) {
         case kernel_selector::DataLayout::b_fs_zyx_fsv32:        return "B_FS_ZYX_FSV32";
         case kernel_selector::DataLayout::bs_f_bsv8__af8:        return "BS_F_BSV8__AF8";
         case kernel_selector::DataLayout::bs_f_bsv16__af8:       return "BS_F_BSV16__AF8";
-        case kernel_selector::DataLayout::bf8_xy16:              return "BF8_XY16";
         case kernel_selector::DataLayout::winograd_2x3_s1_data:  return "WINOGRAD_2x3_S1_DATA";
-        case kernel_selector::DataLayout::byxf_af32:             return "BYXF_AF32";
-        case kernel_selector::DataLayout::byx8_f4:               return "BYX8_F4";
-        case kernel_selector::DataLayout::fs_bs_yx_bsv4_fsv32:   return "FS_BS_YX_BSV4_FSV32";
         case kernel_selector::DataLayout::b_fs_yx_fsv4:          return "B_FS_YX_FSV4";
         case kernel_selector::DataLayout::b_fs_yx_32fp:          return "B_FS_YX_32FP";
         case kernel_selector::DataLayout::bfzyx:                 return "BFZYX";
index c382842..03e9227 100644
@@ -272,40 +272,6 @@ layout convolution_inst::calc_output_layout(convolution_node const& node) {
         return {output_type, format::b_fs_yx_32fp, output_size};
     }
 
-    // for performance reasons, when using fs_bs_yx_bsv4_fsv32 the first convolution has 3 features, so the first
-    // conv layer will take byxf and return fs_bs_yx_bsv4_fsv32
-    if (input_layout.data_type == data_types::i8 && input_layout.format == format::byx8_f4 &&
-        input_layout.size.batch[0] % 4 == 0 && input_layout.size.feature[0] == 3) {
-        return layout{output_type, cldnn::format::fs_bs_yx_bsv4_fsv32, output_size};
-    }
-
-    auto users = node.get_users();
-    if (users.size() == 1 && users.front()->is_type<convolution>()) {
-        auto conv_split = users.front()->as<convolution>().get_split();
-        auto conv_groups = (int32_t)users.front()->as<convolution>().get_groups();
-
-        bool next_is_dw = ((conv_split > 1 && conv_split == output_size.feature[0]) ||
-                           (conv_groups > 1 && conv_groups == output_size.feature[0]));
-
-        if (input_layout.data_type == data_types::i8 && input_layout.format == format::b_fs_yx_fsv4 && next_is_dw) {
-            return layout{output_type, cldnn::format::byxf_af32, output_size};
-        }
-
-        auto prev_node = node.get_dependencies().front();
-        if (prev_node->is_type<reorder>())
-            prev_node = prev_node->get_dependencies().front();
-
-        auto prev_is_convo = prev_node->is_type<convolution>();
-        if (prev_is_convo) {
-            auto prev2_node = prev_node->get_dependencies().front();
-            auto prev_input_format = prev2_node->get_output_layout().format;
-
-            if (input_layout.data_type == data_types::i8 && input_layout.format == format::byxf_af32 && !next_is_dw &&
-                prev_input_format == format::b_fs_yx_fsv4) {
-                return layout{output_type, cldnn::format::b_fs_yx_fsv4, output_size};
-            }
-        }
-    }
     return {output_type, input_layout.format, output_size};
 }
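
The removed block relied on a depthwise test for the consumer convolution: either the legacy split count or the group count must equal the produced feature count. A hedged restatement with simplified types (the real values come from the clDNN node API):

    #include <cstdint>

    bool next_is_depthwise(int32_t conv_split, int32_t conv_groups,
                           int32_t out_features) {
        // A conv is treated as depthwise when it has one filter per feature,
        // expressed either via split or via groups.
        return (conv_split > 1 && conv_split == out_features) ||
               (conv_groups > 1 && conv_groups == out_features);
    }
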
 
index a000e9a..5d8de98 100644
@@ -230,13 +230,6 @@ layout fused_conv_eltwise_inst::calc_output_layout(fused_conv_eltwise_node const
                        output_range.spatial[1],
                        output_range.spatial[2]);
 
-    // for performance reasons, when using fs_bs_yx_bsv4_fsv32 the first convolution has 3 features, so the first
-    // conv layer will take byxf and return fs_bs_yx_bsv4_fsv32
-    if (input_layout.data_type == data_types::i8 && input_layout.format == format::byx8_f4 &&
-        input_layout.size.batch[0] % 4 == 0 && input_layout.size.feature[0] == 3) {
-        return layout{output_type, cldnn::format::fs_bs_yx_bsv4_fsv32, output_size};
-    }
-
     return {output_type, input_layout.format, output_size};
 }
 
index dedf1b3..3d4cfc2 100644
@@ -150,8 +150,6 @@ attach_concatenation_gpu::attach_concatenation_gpu() {
         {std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv16), concatenation_gpu::create},
         {std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv16), concatenation_gpu::create},
         // MMAD
-        {std::make_tuple(engine_types::ocl, data_types::i8, format::byxf_af32), concatenation_gpu::create},
-        {std::make_tuple(engine_types::ocl, data_types::u8, format::byxf_af32), concatenation_gpu::create},
         {std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv4), concatenation_gpu::create},
         {std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv4), concatenation_gpu::create},
         {std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv32), concatenation_gpu::create},
index 6f7ce89..dcba536 100644
@@ -189,8 +189,6 @@ attach_convolution_gpu::attach_convolution_gpu() {
     implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::bfzyx), val_fw);
     implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::winograd_2x3_s1_data), val_fw);
     implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::winograd_2x3_s1_data), val_fw);
-    implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bf8_xy16), val_fw);
-    implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bf8_xy16), val_fw);
     implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::byxf), val_fw);
     implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::byxf), val_fw);
     // block f16 format
@@ -202,11 +200,6 @@ attach_convolution_gpu::attach_convolution_gpu() {
     implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_zyx_fsv16), val_fw);
     implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_zyx_fsv16), val_fw);
     // MMAD
-    implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::byxf_af32), val_fw);
-    implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::byxf_af32), val_fw);
-    implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::byxf_af32), val_fw);
-    implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::byxf_af32), val_fw);
-    implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::byx8_f4), val_fw);
     implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv4), val_fw);
     implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv4), val_fw);
 
@@ -216,7 +209,6 @@ attach_convolution_gpu::attach_convolution_gpu() {
     implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_zyx_fsv32), val_fw);
     implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_zyx_fsv32), val_fw);
 
-    implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::fs_bs_yx_bsv4_fsv32), val_fw);
     implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::byxf), val_fw);
     implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv4), val_fw);
     implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv4), val_fw);
diff --git a/inference-engine/thirdparty/clDNN/src/gpu/definitions.cl b/inference-engine/thirdparty/clDNN/src/gpu/definitions.cl
deleted file mode 100644
index 9f719e6..0000000
--- a/inference-engine/thirdparty/clDNN/src/gpu/definitions.cl
+++ /dev/null
@@ -1,192 +0,0 @@
-/*
-// Copyright (c) 2016 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-*/
-
-#ifdef CODE_PREFIX
-#define CODE_BEGIN CODE_PREFIX
-#define CODE_END CODE_POSTFIX
-#else
-#define CODE_BEGIN
-#define CODE_END
-#endif
-
-CODE_BEGIN
-enum neural_memory_format {
-    x_f32,
-    xb_f32,     // 1D+batch, float32
-    bx_f32,     // 1D+batch, float32
-    yxfb_f32,   // 3D+batch, float32
-    byxf_f32,   // for convolution_cpu_jit_batch1
-    bfyx_f32,   // used in Caffe
-    fyxb_f32,   // used in Caffe
-    oiyx_f32,   // format used only for weights: o - output feature maps, i - input feature maps
-    byxf_b24_f32,        // for convolution_cpu_generic
-    yxoi_o4_f32,       // for convolution_cpu_generic
-    os_yxi_sv16_f32,   // format used only for weights: os - output slice, i - input feature maps, sv16 - 16 values of single slice
-    bs_yxf_bv24_f32,
-    any=-1
-};
-
-#pragma pack(push, 4)
-typedef struct _neural_memory_tag {
-    uint format;
-    uint feature_offset;
-    uint spatial_offset;
-    uint vector_size;
-    uint data_offset;
-    uint data[1];
-} neural_memory;
-
-typedef struct _neural_vector_tag {
-    uint feature_offset;
-    uint spatial_offset;
-    uint raw_size;
-    uint data[1];
-} neural_vector;
-#pragma pack(pop)
-
-// neural_memory accessors
-__attribute__((overloadable)) __global uint* get_raw(__global neural_memory* mem) { return &(mem->data[0]); }
-__attribute__((overloadable)) const __global uint* get_raw(const __global neural_memory* mem) { return &(mem->data[0]); }
-__attribute__((overloadable)) uint get_raw_size(const __global neural_memory* mem) { return mem->vector_size; } 
-
-__attribute__((overloadable)) __global uint* get_batch(__global neural_memory* mem) { return get_raw(mem); }
-__attribute__((overloadable)) const __global uint* get_batch(const __global neural_memory* mem) { return get_raw(mem); }
-__attribute__((overloadable)) uint get_batch_size(const __global neural_memory* mem) { return mem->feature_offset; }
-
-__attribute__((overloadable)) __global uint* get_feature(__global neural_memory* mem) { return &(mem->data[mem->feature_offset]); }
-__attribute__((overloadable)) const __global uint* get_feature(const __global neural_memory* mem) { return &(mem->data[mem->feature_offset]); }
-__attribute__((overloadable)) uint get_feature_size(const __global neural_memory* mem) { return mem->spatial_offset - mem->feature_offset; }
-
-__attribute__((overloadable)) __global uint* get_spatial(__global neural_memory* mem) { return &(mem->data[mem->spatial_offset]); }
-__attribute__((overloadable)) const __global uint* get_spatial(const __global neural_memory* mem) { return &(mem->data[mem->spatial_offset]); }
-__attribute__((overloadable)) uint get_spatial_size(const __global neural_memory* mem) { return get_raw_size(mem) - mem->spatial_offset; } 
-
-__attribute__((overloadable)) __global void* get_data(__global neural_memory* mem) { return &(mem->data[mem->data_offset]); }
-__attribute__((overloadable)) const __global void* get_data(const __global neural_memory* mem) { return &(mem->data[mem->data_offset]); }
-__attribute__((overloadable)) size_t get_element_size(const __global neural_memory* mem) { return sizeof(float); }
-
-__attribute__((overloadable)) size_t get_data_size(const __global neural_memory* mem) {
-    size_t result = get_element_size(mem);
-
-    const __global uint* raw = get_raw(mem);
-    uint raw_size = get_raw_size(mem);
-
-    for(uint i = 0; i < raw_size; i++) {
-        result *= raw[i];
-    }
-    return result;
-}
-
-// neural_vector accessors
-// TODO NOTE: non-const accessors are disabled for now, because only read-only neural_vector arguments are currently supported
-
-//__attribute__((overloadable)) __global uint* get_raw(__global neural_vector* v) { return &(v->data[0]); }
-__attribute__((overloadable)) const __global uint* get_raw(const __global neural_vector* v) { return &(v->data[0]); }
-__attribute__((overloadable)) uint get_raw_size(const __global neural_vector* v) { return v->raw_size; } 
-
-//__attribute__((overloadable)) __global uint* get_batch(__global neural_vector* v) { return get_raw(v); }
-__attribute__((overloadable)) const __global uint* get_batch(const __global neural_vector* v) { return get_raw(v); }
-__attribute__((overloadable)) uint get_batch_size(const __global neural_vector* v) { return v->feature_offset; }
-
-//__attribute__((overloadable)) __global uint* get_feature(__global neural_vector* v) { return &(v->data[v->feature_offset]); }
-__attribute__((overloadable)) const __global uint* get_feature(const __global neural_vector* v) { return &(v->data[v->feature_offset]); }
-__attribute__((overloadable)) uint get_feature_size(const __global neural_vector* v) { return v->spatial_offset - v->feature_offset; }
-
-//__attribute__((overloadable)) __global uint* get_spatial(__global neural_vector* v) { return &(v->data[v->spatial_offset]); }
-__attribute__((overloadable)) const __global uint* get_spatial(const __global neural_vector* v) { return &(v->data[v->spatial_offset]); }
-__attribute__((overloadable)) uint get_spatial_size(const __global neural_vector* v) { return get_raw_size(v) - v->spatial_offset; } 
-
-CODE_END
-
-/*
-KERNEL(Fully_Connected_GPU)
-DECLARE_CONSTANT()
-BEGIN_ARGUMENTS_DECLARATION
-DECLARE_INPUT_MEMORY_ARGUMENT(input_mem)
-DECLARE_INPUT_MEMORY_ARGUMENT(weights_mem)
-DECLARE_INPUT_MEMORY_ARGUMENT(bias_mem)
-DECLARE_OUTPUT_MEMORY_ARGUMENT(dst_mem)
-END_ARGUMENTS_DECLARATION
-CODE_BEGIN
-#define WEIGHTS { 1.0, 3.2, 4.5, 6.7 }
-#define WEIGHTS_SIZE { 2, 2 }
-#define WEIGHTS_DIM 2
-*/
-__kernel void Fully_Connected_GPU(__global neural_memory* input_mem, __global neural_memory* weights_mem, __global neural_memory* bias_mem, __global neural_memory* dst_mem)
-{
-    __global uint* input_size = get_raw(input_mem);
-    __global uint* weights_size = get_raw(weights_mem);
-    __global float* input = (__global float*)get_data(input_mem);
-    __global float* weights = (__global float*)get_data(weights_mem);
-    __global float* bias = (__global float*)get_data(bias_mem);
-    __global float* pDst = (__global float*)get_data(dst_mem);
-
-    const int x = get_global_id(0);
-
-    pDst[x] = 0;
-    uint outXIdx = x / input_size[0];
-    uint inputBatchIdx = x % input_size[0];
-    uint weightYIdx = outXIdx * weights_size[0];
-    for (uint i = 0; i < input_size[2]; i++)
-    {
-        pDst[x] += input[i * input_size[0] + inputBatchIdx] * weights[weightYIdx + i];
-    }
-    pDst[x] += bias[outXIdx];
-}
-CODE_END
-
-CODE_BEGIN
-__kernel void Convolution_GPU(
-    const __global neural_memory* input_mem,
-    const __global neural_memory* filter_mem,
-    float bias,
-    __global neural_memory* dst_mem,
-    const __global neural_vector* spatial_stride)
-{
-
-//
-    const __global uint* input_size = get_raw(input_mem);
-    const __global uint* filter_size = get_raw(filter_mem);
-    const __global uint* dst_size = get_raw(dst_mem);
-    const __global float* input = (const __global float*)get_data(input_mem);
-    const __global float* filter = (const __global float*)get_data(filter_mem);
-    __global float* pDst = (__global float*)get_data(dst_mem);
-//
-
-    int global_id = get_global_id(0);
-    const int batch_num = dst_size[0];
-    const int batch_offset = global_id % dst_size[0];
-
-    const int idx = global_id / batch_num;
-
-    const int x = (idx % input_size[2]) * get_spatial(spatial_stride)[0];
-    const int y = (idx * get_spatial(spatial_stride)[1]) / input_size[2];
-
-    const int out_offset = idx * batch_num + batch_offset;
-
-    pDst[out_offset] = 0;
-    for (uint i = 0; i < filter_size[4]; i++)
-    {
-        for (uint j = 0; j < filter_size[3]; j++)
-        {
-            int input_idx = (x + j + ((y + i) * input_size[2])) * batch_num + batch_offset;
-            int filter_idx = i * filter_size[3] + j;
-            pDst[out_offset] += input[input_idx] * filter[filter_idx];
-        }
-    }
-    pDst[out_offset] += bias;
-}
-CODE_END
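
The deleted file's get_data_size() shows how the legacy neural_memory header encoded its shape: a raw uint array of all dimensions, so the buffer size is the element size times their product. A host-side sketch (a C++ stand-in for the OpenCL accessor, not a current clDNN API):

    #include <cstddef>
    #include <cstdint>

    std::size_t data_size_bytes(const uint32_t* raw, uint32_t raw_size,
                                std::size_t element_size = sizeof(float)) {
        std::size_t result = element_size;  // matches get_element_size()
        for (uint32_t i = 0; i < raw_size; ++i)
            result *= raw[i];               // product over all dimensions
        return result;
    }
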
index 1b44664..42bd969 100644
@@ -182,9 +182,6 @@ attach_eltwise_gpu::attach_eltwise_gpu() {
          { std::make_tuple(engine_types::ocl, data_types::f32, format::bs_fs_yx_bsv16_fsv16), eltwise_gpu::create },
          { std::make_tuple(engine_types::ocl, data_types::f16, format::bs_fs_yx_bsv16_fsv16), eltwise_gpu::create },
          // MMAD
-         { std::make_tuple(engine_types::ocl, data_types::i8, format::byxf_af32), eltwise_gpu::create },
-         { std::make_tuple(engine_types::ocl, data_types::u8, format::byxf_af32), eltwise_gpu::create },
-         { std::make_tuple(engine_types::ocl, data_types::i8, format::fs_bs_yx_bsv4_fsv32), eltwise_gpu::create },
          { std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv4), eltwise_gpu::create },
          { std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv4), eltwise_gpu::create },
          { std::make_tuple(engine_types::ocl, data_types::f32, format::b_fs_yx_fsv4), eltwise_gpu::create },
index 0afe4b3..069501d 100644
@@ -100,8 +100,6 @@ attach_fully_connected_gpu::attach_fully_connected_gpu() {
         {std::make_tuple(engine_types::ocl, data_types::i8, format::bfyx), val_fw},
         {std::make_tuple(engine_types::ocl, data_types::u8, format::bfyx), val_fw},
         // MMAD
-        {std::make_tuple(engine_types::ocl, data_types::i8, format::byxf_af32), val_fw},
-        {std::make_tuple(engine_types::ocl, data_types::i8, format::fs_bs_yx_bsv4_fsv32), val_fw},
         {std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv32), val_fw},
         {std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv32), val_fw},
         // IMAD
index c652514..1e552b9 100644
@@ -183,23 +183,13 @@ attach_fused_conv_eltwise_gpu::attach_fused_conv_eltwise_gpu() {
                                                 fused_conv_eltwise_gpu::create);
     implementation_map<fused_conv_eltwise>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bs_fs_zyx_bsv16_fsv16),
                                                 fused_conv_eltwise_gpu::create);
-    // MMAD
-    implementation_map<fused_conv_eltwise>::add(
-        std::make_tuple(engine_types::ocl, data_types::i8, format::fs_bs_yx_bsv4_fsv32),
-        fused_conv_eltwise_gpu::create);
     // IMAD
-    implementation_map<fused_conv_eltwise>::add(
-        std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv4),
-        fused_conv_eltwise_gpu::create);
-    implementation_map<fused_conv_eltwise>::add(
-        std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv4),
-        fused_conv_eltwise_gpu::create);
-    implementation_map<fused_conv_eltwise>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::byxf_af32),
+    implementation_map<fused_conv_eltwise>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv4),
                                                 fused_conv_eltwise_gpu::create);
-    implementation_map<fused_conv_eltwise>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::byxf_af32),
+    implementation_map<fused_conv_eltwise>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv4),
                                                 fused_conv_eltwise_gpu::create);
     implementation_map<fused_conv_eltwise>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::image_2d_rgba),
-        fused_conv_eltwise_gpu::create);
+                                                fused_conv_eltwise_gpu::create);
 }
 
 }  // namespace detail
index 86a8322..0f38da3 100644
@@ -206,9 +206,6 @@ attach_pooling_gpu::attach_pooling_gpu() {
     implementation_map<pooling>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::bs_fs_zyx_bsv16_fsv16), pooling_gpu::create);
     implementation_map<pooling>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::bs_fs_zyx_bsv16_fsv16), pooling_gpu::create);
 
-    implementation_map<pooling>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::byxf_af32), pooling_gpu::create);
-    implementation_map<pooling>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::byxf_af32), pooling_gpu::create);
-
     implementation_map<pooling>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::b_fs_yx_fsv4), pooling_gpu::create);
     implementation_map<pooling>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::b_fs_yx_fsv4), pooling_gpu::create);
     implementation_map<pooling>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv4), pooling_gpu::create);
index 9445834..c9eddf7 100644
@@ -113,11 +113,6 @@ attach_quantize_gpu::attach_quantize_gpu() {
     implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv16), val_fw);
     implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv16), val_fw);
 
-    implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::byxf_af32), val_fw);
-    implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::byxf_af32), val_fw);
-    implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::byxf_af32), val_fw);
-    implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::byxf_af32), val_fw);
-
     implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::byxf), val_fw);
     implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::byxf), val_fw);
     implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::byxf), val_fw);
index 6b27364..7409bd4 100644
@@ -99,11 +99,7 @@ attach_resample_gpu::attach_resample_gpu() {
          {std::make_tuple(engine_types::ocl, data_types::f32, format::b_fs_yx_fsv4), resample_gpu::create},
          {std::make_tuple(engine_types::ocl, data_types::f16, format::b_fs_yx_fsv4), resample_gpu::create},
          {std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv4), resample_gpu::create},
-         {std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv4), resample_gpu::create},
-         {std::make_tuple(engine_types::ocl, data_types::f32, format::byxf_af32), resample_gpu::create},
-         {std::make_tuple(engine_types::ocl, data_types::f16, format::byxf_af32), resample_gpu::create},
-         {std::make_tuple(engine_types::ocl, data_types::i8, format::byxf_af32), resample_gpu::create},
-         {std::make_tuple(engine_types::ocl, data_types::u8, format::byxf_af32), resample_gpu::create}});
+         {std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv4), resample_gpu::create}});
 }
 
 }  // namespace detail
index 81db4d5..8b46567 100644
@@ -127,10 +127,6 @@ bool concat_in_place_optimization::match(concatenation_node& node) {
             (l.size.feature[0] % 32 != 0 || node.get_primitive()->axis != concatenation::along_f))
             return false;
 
-        // TODO: If we replace byxf_af32 with byxf we can probably do this optimization, but support in kernels is required
-        if (l.format == format::byxf_af32 && (l.size.feature[0] % 32 != 0 || node.get_primitive()->axis != concatenation::along_f))
-            return false;
-
         if (l.format == format::bs_fs_yx_bsv16_fsv16)
             return false;
 
index 092b931..8c1f62d 100644
@@ -121,12 +121,9 @@ void prepare_padding::run(program_impl& p) {
 
        // right now, output padding optimization is only available for the bfyx format with data type = float32
         if (conv_layout.format != cldnn::format::bfyx &&
-            conv_layout.format != cldnn::format::bf8_xy16 &&
             conv_layout.format != cldnn::format::b_fs_yx_fsv16 &&
             conv_layout.format != cldnn::format::b_fs_zyx_fsv16 &&
             conv_layout.format != cldnn::format::bs_fs_yx_bsv16_fsv16 &&
-            conv_layout.format != cldnn::format::byxf_af32 &&
-            conv_layout.format != cldnn::format::fs_bs_yx_bsv4_fsv32 &&
             conv_layout.format != cldnn::format::b_fs_yx_fsv4 &&
             conv_layout.format != cldnn::format::fs_b_yx_fsv32 &&
             conv_layout.format != cldnn::format::b_fs_yx_32fp) {
index 4e235e9..6b2e6bd 100644
@@ -760,11 +760,8 @@ void prepare_conv_eltw_fusing::fuse_conv_eltwise(program_impl& p, program_node*
     for (auto& dep : eltw_node->get_dependencies()) {
         format fmt = dep->get_output_layout().format;
         data_types dep_dt = dep->get_output_layout().data_type;
-        if ((fmt != format::fs_bs_yx_bsv4_fsv32 || dep_dt != data_types::i8) &&
-            (fmt != format::b_fs_yx_fsv4 || dep_dt != data_types::i8) &&
+        if ((fmt != format::b_fs_yx_fsv4 || dep_dt != data_types::i8) &&
             (fmt != format::b_fs_yx_fsv4 || dep_dt != data_types::u8) &&
-            (fmt != format::byxf_af32 || dep_dt != data_types::i8) &&
-            (fmt != format::byxf_af32 || dep_dt != data_types::u8) &&
             (fmt != format::bfyx || dep_dt != data_types::f32) && (fmt != format::bfyx || dep_dt != data_types::u8) &&
             (fmt != format::bfyx || dep_dt != data_types::i8) && (fmt != format::yxfb || dep_dt != data_types::f16) &&
             (fmt != format::bfyx || dep_dt != data_types::f16 || !if_already_depth_to_space_fused))
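
The surviving condition is a De Morgan form: the conjunction of "(fmt != F || dep_dt != T)" clauses fails only when (fmt, dep_dt) matches one of the accepted pairs. An equivalent whitelist formulation, with stand-in enums (the real format and data_types enums live in clDNN):

    #include <algorithm>
    #include <utility>
    #include <vector>

    enum class fmt { b_fs_yx_fsv4, bfyx, yxfb };
    enum class dt { f32, f16, u8, i8 };

    // True when the dependency's (format, dtype) pair is one of the
    // combinations the fusing pass accepts.
    bool pair_allowed(fmt f, dt d, const std::vector<std::pair<fmt, dt>>& allow) {
        return std::find(allow.begin(), allow.end(), std::make_pair(f, d)) != allow.end();
    }

With byxf_af32 and fs_bs_yx_bsv4_fsv32 gone, the whitelist shrinks to exactly the pairs kept in the condition above.
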
index ad04629..c3fbc04 100644
@@ -372,70 +372,6 @@ void reorder_inputs::run(program_impl& p, layout_optimizer& lo, reorder_factory&
     auto fmt_map = get_preferred_formats(p, lo);
     propagate_formats(p, fmt_map, lo);
     minimize_local_reorders(p, fmt_map, lo);
-
-    // WA START ============================================================================================================
-    if (lo.get_optimization_attributes().b_fs_yx_fsv16_network) {
-        // This is a temporary work-around for a known bad case until byxf_af32 handling is corrected in layout_optimizer.
-        //
-        // Find pattern:
-        //    mvn(int8, b_fs_yx_fsv16, [x,16,1280,720]) -> conv(int8, byxf_af32, [x,3,1280,720]) -> mvn(*, bfyx) ->
-        // Replace with:
-        //    mvn(b_fs_yx_fsv16) -> conv(b_fs_yx_fsv16) -> mvn(b_fs_yx_fsv16) ->
-        //
-        // Generally for such convolution b_fs_yx_fsv16 will always perform better than byxf_af32,
-        // but to avoid unvalidated int8 b_fs_yx_fsv16 networks and potential regressions this WA is needed.
-        // Additionally, the reorder from af32 -> bfyx takes ~9 times longer than the actual convolution.
-        for (auto& node_ptr : p.get_processing_order()) {
-            if (!node_ptr->is_in_data_flow() || !node_ptr->is_type<convolution>() || fmt_map.at(node_ptr) != format::byxf_af32)
-                continue;
-
-            auto& conv_node = node_ptr->as<convolution>();
-
-            bool input_path =
-                conv_node.input().get_output_layout().data_type == data_types::i8 &&
-                conv_node.input().is_type<mvn>() &&
-                fmt_map.at(&conv_node.input()) == format::b_fs_yx_fsv16;
-            bool output_path =
-                conv_node.get_users().size() == 1 &&
-                conv_node.get_users().front()->is_type<mvn>() &&
-                fmt_map.at(conv_node.get_users().front()) == format::bfyx &&
-                conv_node.get_users().front()->get_users().size() == 1 &&
-                !conv_node.get_users().front()->as<mvn>().get_primitive()->across_channels;
-
-            if (!input_path || !output_path)
-                continue;
-
-            auto in_lay = conv_node.input().get_output_layout();
-            auto out_lay = conv_node.get_output_layout();
-            auto wei_lay = conv_node.weights().get_output_layout();
-            bool correct_layouts =
-                // weights
-                wei_lay.data_type == data_types::i8 &&
-                wei_lay.size.spatial[0] == 3 && wei_lay.size.spatial[1] == 3 &&
-                // input/output
-                in_lay.data_type == data_types::i8 && out_lay.data_type == data_types::i8 &&
-                in_lay.size.feature[0] == 16 && out_lay.size.feature[0] == 3 &&
-                in_lay.size.spatial[0] == 1280 && out_lay.size.spatial[0] == 1280 &&
-                in_lay.size.spatial[1] == 720 && out_lay.size.spatial[1] == 720;
-
-            if (!correct_layouts)
-                continue;
-
-            bool correct_conv =
-                conv_node.get_groups() == 1 && conv_node.get_split() == 1 && conv_node.get_deformable_groups() == 1 &&
-                !conv_node.get_depthwise_sep_opt() && !conv_node.get_transposed() &&
-                !conv_node.activations_zero_points_term() && !conv_node.weights_zero_points_term() && !conv_node.compensation_term() &&
-                conv_node.get_primitive()->dilation == tensor(1);
-
-            if (!correct_conv)
-                continue;
-
-            fmt_map.at(node_ptr) = format::b_fs_yx_fsv16;
-            fmt_map.at(conv_node.get_users().front()) = format::b_fs_yx_fsv16;
-        }
-    }
-    // WA END ==============================================================================================================
-
     insert_reorders(p, fmt_map, rf);
 
     for (auto n : p.get_processing_order()) {
index b52fbd3..561fae4 100644
@@ -85,16 +85,8 @@ inline std::string fmt_to_str(format fmt) {
             return "bs_xs_xsv8_bsv16";
         case format::bs_x_bsv16:
             return "bs_x_bsv16";
-        case format::bf8_xy16:
-            return "bf8_xy16";
         case format::winograd_2x3_s1_data:
             return "winograd_2x3_s1_data";
-        case format::byxf_af32:
-            return "byxf_af32";
-        case format::byx8_f4:
-            return "byx8_f4";
-        case format::fs_bs_yx_bsv4_fsv32:
-            return "fs_bs_yx_bsv4_fsv32";
         case format::b_fs_yx_fsv4:
             return "b_fs_yx_fsv4";
         case format::b_fs_yx_32fp:
index 6dddbca..c4f9daf 100644
@@ -126,16 +126,8 @@ kernel_selector::data_layout to_data_layout(format f) {
             return kernel_selector::data_layout::bs_f_bsv8__af8;
         case format::bs_xs_xsv8_bsv16:
             return kernel_selector::data_layout::bs_f_bsv16__af8;
-        case format::bf8_xy16:
-            return kernel_selector::data_layout::bf8_xy16;
         case format::winograd_2x3_s1_data:
             return kernel_selector::data_layout::winograd_2x3_s1_data;
-        case format::byxf_af32:
-            return kernel_selector::data_layout::byxf_af32;
-        case format::byx8_f4:
-            return kernel_selector::data_layout::byx8_f4;
-        case format::fs_bs_yx_bsv4_fsv32:
-            return kernel_selector::data_layout::fs_bs_yx_bsv4_fsv32;
         case format::b_fs_yx_fsv4:
             return kernel_selector::data_layout::b_fs_yx_fsv4;
         case format::b_fs_yx_32fp:
@@ -185,16 +177,8 @@ cldnn::format from_data_layout(kernel_selector::data_layout l) {
             return cldnn::format::bs_xs_xsv8_bsv8;
         case kernel_selector::data_layout::bs_f_bsv16__af8:
             return cldnn::format::bs_x_bsv16;
-        case kernel_selector::data_layout::bf8_xy16:
-            return cldnn::format::bf8_xy16;
         case kernel_selector::data_layout::winograd_2x3_s1_data:
             return cldnn::format::winograd_2x3_s1_data;
-        case kernel_selector::data_layout::byxf_af32:
-            return cldnn::format::byxf_af32;
-        case kernel_selector::data_layout::byx8_f4:
-            return cldnn::format::byx8_f4;
-        case kernel_selector::data_layout::fs_bs_yx_bsv4_fsv32:
-            return cldnn::format::fs_bs_yx_bsv4_fsv32;
         case kernel_selector::data_layout::b_fs_yx_32fp:
             return cldnn::format::b_fs_yx_32fp;
         case kernel_selector::data_layout::bfzyx:
@@ -510,20 +494,9 @@ kernel_selector::data_tensor convert_data_tensor(const layout& l, uint32_t split
     size_t pitch = 1;
     auto new_vals = vals;
 
-    if (ks_layout == kernel_selector::Tensor::byxf_af32) {
-        new_vals[3] = align_to(vals[3], 32);
-    }
     if (ks_layout == kernel_selector::Tensor::b_fs_yx_fsv32) {
         new_vals[1] = align_to(vals[1], 32);
     }
-    if (ks_layout == kernel_selector::Tensor::fs_bs_yx_bsv4_fsv32) {
-        new_vals[3] = align_to(vals[3], 32);
-        new_vals[2] = align_to(vals[2], 4);
-    }
-    if (ks_layout == kernel_selector::Tensor::byx8_f4) {
-        new_vals[3] = align_to(vals[3], 4);
-        new_vals[2] = align_to(vals[2], 8);
-    }
     if (ks_layout == kernel_selector::Tensor::bs_fs_yx_bsv16_fsv16) {
         new_vals[0] = align_to(vals[0], 16);
         new_vals[1] = align_to(vals[1], 16);
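
align_to in these branches is the usual round-up-to-multiple helper. A minimal equivalent, assuming the same semantics as the clDNN helper and a positive alignment:

    #include <cstddef>

    constexpr std::size_t align_to(std::size_t value, std::size_t align) {
        return (value + align - 1) / align * align;
    }

    // e.g. the surviving b_fs_yx_fsv32 branch pads features to slices of 32:
    static_assert(align_to(17, 32) == 32, "17 features -> one fsv32 slice");
    static_assert(align_to(33, 32) == 64, "33 features -> two fsv32 slices");
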
index 04bb824..a44e076 100644
@@ -183,22 +183,13 @@ bool layout_optimizer::can_fuse_reorder(program_node& prev, program_node& next,
     if (next.is_type<fully_connected>() &&
         (fmt_prev == format::bfyx || fmt_prev == format::yxfb ||
          fmt_prev == format::b_fs_yx_fsv16 || fmt_prev == format::fs_b_yx_fsv32 ||
-         fmt_prev == format::byxf_af32 || fmt_prev == format::b_fs_yx_fsv32 ||
+         fmt_prev == format::b_fs_yx_fsv32 ||
          (fmt_prev == format::b_fs_yx_fsv4 &&
           prev_output_layout.size.feature[0] % 32 == 0 &&
           prev_output_layout.size.spatial[0] == 1 &&
           prev_output_layout.size.spatial[1] == 1)))
         return true;
 
-    if (next.is_type<convolution>() && fmt_prev == format::byxf_af32 && fmt_next == format::b_fs_yx_fsv4 && next.as<convolution>().get_groups() != 1)
-        return true;
-
-    if (next.is_type<convolution>() && fmt_prev == format::byxf_af32 && fmt_next == format::bfyx)
-        return true;
-
-    if (next.is_type<convolution>() && fmt_prev == format::b_fs_yx_fsv4 && fmt_next == format::byxf_af32 && next.as<convolution>().get_groups() == 1)
-        return true;
-
     if (next.is_type<convolution>() && fmt_prev == format::b_fs_yx_fsv16 && fmt_next == format::b_fs_yx_fsv4 && is_input_idx(0))
         return true;
 
@@ -232,7 +223,7 @@ bool layout_optimizer::can_fuse_reorder(program_node& prev, program_node& next,
 
     if (next.is_type<convolution>() &&
         (fmt_prev == format::b_fs_yx_fsv4 || fmt_prev == format::bfyx)  && prev_output_layout.size.feature[0] == 3 &&
-        (fmt_next == format::b_fs_yx_fsv4 || fmt_next == format::byxf_af32 ||
+        (fmt_next == format::b_fs_yx_fsv4 ||
          fmt_next == format::bs_fs_yx_bsv16_fsv16))
         return true;
 
@@ -727,23 +718,6 @@ layout layout_optimizer::get_expected_layout(layout const& current_layout,
         // nothing to do, just go out from here.
     } else if (layout_optimizer::convolution_bfyx_opt(current_layout, output_or_weights_layout, prim) ||
                 (_output_size_handling_enabled && prim->with_output_size) || node.get_transposed()) {
-        // commented out due to performance reasons, maybe enable in future
-        /*if (current_layout.data_type == data_types::f32 &&
-        current_layout.size.batch[0] % 16 == 0 &&
-        current_layout.format == format::bfyx &&
-        output_or_weights_layout.size.spatial[0] == 1 && output_or_weights_layout.size.spatial[1] == 1 &&
-        prim->stride.spatial[0] == 1 && prim->stride.spatial[1] == 1 &&
-        prim->input_offset.spatial[0] == 0 && prim->input_offset.spatial[1] == 0 &&
-        !node.get_transposed())
-    {
-        if (!((current_layout.size.feature[0] % 8) == 0 && (current_layout.size.spatial[0] *
-    current_layout.size.spatial[1]) == 16 && current_layout.data_padding == padding{ { 0,0,0,0 }, 0 }))
-        {
-            expected_tensor = current_layout.size.transform(cldnn::format::bf8_xy16, 1);
-            expected_format = cldnn::format::bf8_xy16;
-        }
-    }
-    else*/
         {
             expected_tensor = current_layout.size;
             if (current_layout.format == format::b_fs_zyx_fsv16 || current_layout.format == format::bs_fs_zyx_bsv16_fsv16)
index fc6cec3..290c5e2 100644
@@ -254,8 +254,7 @@ memory_impl::ptr memory_pool::get_from_padded_pool(const layout& layout,
                 ((layout.format != format::b_fs_yx_fsv32 && layout.format != format::b_fs_zyx_fsv32) ||
                  (layout.size.feature[0] % 32 == 0)) &&
                 // TODO: check if this condition always correct
-                ((layout.format == format::byxf_af32 && layout.size.feature[0] == rec_list._memory->get_layout().size.feature[0]) ||
-                 (layout.format != format::byxf_af32 && layout.size.feature[0] <= rec_list._memory->get_layout().size.feature[0])) &&
+                layout.size.feature[0] <= rec_list._memory->get_layout().size.feature[0] &&
                 layout.size.batch[0] <= rec_list._memory->get_layout().size.batch[0] &&
                 rec_list._memory->get_layout().format != format::fs_b_yx_fsv32 &&
                 layout.format != format::fs_b_yx_fsv32 &&
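
With the byxf_af32 exception removed, the padded-pool reuse check reduces to plain capacity comparisons. A hedged restatement of the surviving predicate, with a simplified signature in place of the layout objects:

    // A pooled block can be reused when it is at least as large as the
    // request in both the feature and batch dimensions.
    bool can_reuse(int req_features, int req_batch,
                   int cand_features, int cand_batch) {
        return req_features <= cand_features && req_batch <= cand_batch;
    }
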
index 2ff72f9..a99f1c8 100644
@@ -1,5 +1,5 @@
 /*
-// Copyright (c) 2018-2019 Intel Corporation
+// Copyright (c) 2018-2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -142,9 +142,7 @@ std::pair<bool, bool> program_helpers::are_layouts_identical(layout const& l1, l
         return {false, false};
     if (l1.get_linear_size() != l2.get_linear_size())
         return {false, false};
-    if ((l1.format == format::bf8_xy16 && l2.format != format::bf8_xy16) ||
-        (l2.format == format::bf8_xy16 && l1.format != format::bf8_xy16) ||
-        (l1.format == format::b_fs_yx_fsv4 && l2.format != format::b_fs_yx_fsv4) ||
+    if ((l1.format == format::b_fs_yx_fsv4 && l2.format != format::b_fs_yx_fsv4) ||
         (l2.format == format::b_fs_yx_fsv4 && l1.format != format::b_fs_yx_fsv4) ||
         (l1.format == format::fs_b_yx_fsv32 && l2.format != format::fs_b_yx_fsv32) ||
         (l2.format == format::fs_b_yx_fsv32 && l1.format != format::fs_b_yx_fsv32) ||
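
Each clause of the surviving check follows the same symmetric pattern: if exactly one of the two layouts uses a given blocked format, the layouts cannot be identical. A generalized sketch over an illustrative format list (the enum here is a stand-in):

    #include <initializer_list>

    enum class fmt { bfyx, b_fs_yx_fsv4, fs_b_yx_fsv32 };

    bool blocked_format_mismatch(fmt a, fmt b, std::initializer_list<fmt> blocked) {
        for (fmt f : blocked)
            if ((a == f) != (b == f))  // exactly one side uses f
                return true;
        return false;
    }
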
index 440f2c1..f245b05 100644
@@ -6873,7 +6873,7 @@ TEST(convolution_depthwise_gpu_fsv16, depthwise_conv_b_fs_yx_fsv16_in_feature_pa
     build_options options;
     options.set_option(build_option::optimize_data(true));
     implementation_desc conv_impl = { format::b_fs_yx_fsv16, "" };
-    options.set_option(build_option::force_implementations({ {"conv", conv_impl} }));  
+    options.set_option(build_option::force_implementations({ {"conv", conv_impl} }));
 
     network network(engine, topology, options);
     network.set_input_data("input", input);
@@ -6893,7 +6893,7 @@ TEST(convolution_depthwise_gpu_fsv16, depthwise_conv_b_fs_yx_fsv16_in_feature_pa
 
     EXPECT_EQ(output_layout.format, format::bfyx);
 
-    EXPECT_EQ(y_size, output_size.spatial[1]);    
+    EXPECT_EQ(y_size, output_size.spatial[1]);
     EXPECT_EQ(x_size, output_size.spatial[0]);
     EXPECT_EQ(f_size, output_size.feature[0]);
     EXPECT_EQ(b_size, output_size.batch[0]);
@@ -7945,8 +7945,6 @@ INSTANTIATE_TEST_CASE_P(
         .all_test_params(format::bfyx, false, true)
         .all_test_params(format::bfyx, true, false)
         .all_test_params(format::b_fs_yx_fsv4)
-        // byxf_af32 - depthwise broken for batch > 1
-        // .smoke_test_params(format::byxf_af32)
         .all_test_params(format::b_fs_yx_fsv32)
         .all_test_params(format::b_fs_yx_fsv32, true, true)
         .all_test_params(format::b_fs_yx_fsv32, false, true)
index 40b9fd6..5e0310b 100644
@@ -1336,7 +1336,7 @@ INSTANTIATE_TEST_CASE_P(
         testing::Values(1, 3),
         testing::Values(1, 3),
         testing::Values(3, 32),
-        testing::Values(format::bfyx, format::b_fs_yx_fsv4, format::b_fs_yx_fsv32, format::byxf_af32)
+        testing::Values(format::bfyx, format::b_fs_yx_fsv4, format::b_fs_yx_fsv32)
     ),
     fully_connected_i8_i8_test::PrintToStringParamName
 );
@@ -1350,7 +1350,7 @@ INSTANTIATE_TEST_CASE_P(
         testing::Values(1, 3),
         testing::Values(1, 3),
         testing::Values(3, 32),
-        testing::Values(format::bfyx, format::b_fs_yx_fsv4, format::b_fs_yx_fsv32, format::byxf_af32)
+        testing::Values(format::bfyx, format::b_fs_yx_fsv4, format::b_fs_yx_fsv32)
     ),
     fully_connected_i8_u8_test::PrintToStringParamName
 );
@@ -1364,7 +1364,7 @@ INSTANTIATE_TEST_CASE_P(
         testing::Values(1, 3),
         testing::Values(1, 3),
         testing::Values(3, 32),
-        testing::Values(format::bfyx, format::b_fs_yx_fsv4, format::b_fs_yx_fsv32, format::byxf_af32)
+        testing::Values(format::bfyx, format::b_fs_yx_fsv4, format::b_fs_yx_fsv32)
     ),
     fully_connected_i8_f32_test::PrintToStringParamName
 );
@@ -1378,7 +1378,7 @@ INSTANTIATE_TEST_CASE_P(
         testing::Values(1, 3),
         testing::Values(1, 3),
         testing::Values(3, 32),
-        testing::Values(format::bfyx, format::b_fs_yx_fsv4, format::b_fs_yx_fsv32, format::byxf_af32)
+        testing::Values(format::bfyx, format::b_fs_yx_fsv4, format::b_fs_yx_fsv32)
     ),
     fully_connected_u8_i8_test::PrintToStringParamName
 );
@@ -1392,7 +1392,7 @@ INSTANTIATE_TEST_CASE_P(
         testing::Values(1, 3),
         testing::Values(1, 3),
         testing::Values(3, 32),
-        testing::Values(format::bfyx, format::b_fs_yx_fsv4, format::b_fs_yx_fsv32, format::byxf_af32)
+        testing::Values(format::bfyx, format::b_fs_yx_fsv4, format::b_fs_yx_fsv32)
     ),
     fully_connected_u8_u8_test::PrintToStringParamName
 );
@@ -1406,7 +1406,7 @@ INSTANTIATE_TEST_CASE_P(
         testing::Values(1, 3),
         testing::Values(1, 3),
         testing::Values(3, 32),
-        testing::Values(format::bfyx, format::b_fs_yx_fsv4, format::b_fs_yx_fsv32, format::byxf_af32)
+        testing::Values(format::bfyx, format::b_fs_yx_fsv4, format::b_fs_yx_fsv32)
     ),
     fully_connected_u8_f32_test::PrintToStringParamName
 );
index 91454f1..1ffbf11 100644
@@ -117,7 +117,6 @@ TEST(fused_conv_eltwise, basic_image2d)
     EXPECT_EQ(outputs_act.begin()->first, "eltwise");
 
     auto output_act = outputs_act.begin()->second.get_memory();
-    auto&& out_act_layout = output_act.get_layout();
     auto out_act_ptr = output_act.pointer<uint8_t>();
 
     topology topology_ref(
@@ -140,7 +139,6 @@ TEST(fused_conv_eltwise, basic_image2d)
     EXPECT_EQ(outputs_ref.begin()->first, "out");
 
     auto output_ref = outputs_ref.begin()->second.get_memory();
-    auto&& out_ref_layout = output_ref.get_layout();
     auto out_ref_ptr = output_ref.pointer<uint8_t>();
 
     for (int i = 0;i < 3 * 256 * 4;i++) {
index a436d31..1a62fab 100644
@@ -1373,58 +1373,6 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, conv_int8_scale_shift_swish,
                                 bc_test_params{CASE_CONV3D_S8S8_4, 2, 6},
                         }), );
 
-
-class conv_int8_byxf_af32 : public ConvFusingTest {};
-TEST_P(conv_int8_byxf_af32, per_channel_coeffs) {
-    auto p = GetParam();
-    create_topologies(input_layout("input", get_input_layout(p)),
-                 data("weights", get_mem(get_weights_layout(p))),
-                 data("bias", get_mem(get_bias_layout(p))),
-                 data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel.count()/255)),
-                 convolution("conv_prim", "input", {"weights"}, {"bias"}, p.groups, p.stride, p.pad, p.dilation),
-                 scale("scale", "conv_prim", "scale_data"),
-                 reorder("reorder_bfyx", "scale", p.default_format, data_types::f32)
-    );
-
-    implementation_desc conv_impl = { format::byxf_af32, "" };
-    bo_fused.set_option(build_option::force_implementations({ {"conv_prim", conv_impl} }));
-
-    tolerance = 1e-5f;
-    execute(p);
-}
-
-TEST_P(conv_int8_byxf_af32, per_element_coeffs) {
-    auto p = GetParam();
-    create_topologies(input_layout("input", get_input_layout(p)),
-                 data("weights", get_mem(get_weights_layout(p))),
-                 data("bias", get_mem(get_bias_layout(p))),
-                 data("eltwise_data", get_mem(get_output_layout(p))),
-                 convolution("conv_prim", "input", {"weights"}, {"bias"}, p.groups, p.stride, p.pad, p.dilation),
-                 eltwise("eltwise", "conv_prim", "eltwise_data", eltwise_mode::sum),
-                 reorder("reorder_bfyx", "eltwise", p.default_format, data_types::f32)
-    );
-
-    implementation_desc conv_impl = { format::byxf_af32, "" };
-    bo_fused.set_option(build_option::force_implementations({ {"conv_prim", conv_impl} }));
-
-    tolerance = 1e-5f;
-    execute(p);
-}
-
-INSTANTIATE_TEST_CASE_P(fusings_gpu, conv_int8_byxf_af32,
-                        ::testing::ValuesIn(std::vector<bc_test_params>{
-                                bc_test_params{CASE_CONV_U8S8_1, 2, 3},
-                                bc_test_params{CASE_CONV_U8S8_2, 2, 3},
-                                bc_test_params{CASE_CONV_U8S8_3, 2, 3},
-                                bc_test_params{CASE_CONV_U8S8_4, 2, 3},
-                                bc_test_params{CASE_CONV_U8S8_6, 2, 3},
-                                bc_test_params{CASE_CONV_S8S8_1, 2, 3},
-                                bc_test_params{CASE_CONV_S8S8_2, 2, 3},
-                                bc_test_params{CASE_CONV_S8S8_3, 2, 3},
-                                bc_test_params{CASE_CONV_S8S8_4, 2, 3},
-                                bc_test_params{CASE_CONV_S8S8_6, 2, 3},
-                        }), );
-
 class conv_int8_prelu_eltwise : public ConvFusingTest {};
 TEST_P(conv_int8_prelu_eltwise, basic) {
     auto p = GetParam();
@@ -3452,7 +3400,7 @@ struct activation_test_params {
 #define CASE_ACTIVATION_F32_2 {7, 3, 7, 7}, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
 #define CASE_ACTIVATION_F32_3 {1, 14, 8, 8}, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
 #define CASE_ACTIVATION_F32_4 {1, 17, 31, 29}, data_types::f32, format::yxfb, data_types::f32, format::bfyx
-#define CASE_ACTIVATION_F32_5 {1, 17, 31, 29}, data_types::f32, format::byxf_af32, data_types::f32, format::bfyx
+#define CASE_ACTIVATION_F32_5 {1, 17, 31, 29}, data_types::f32, format::b_fs_yx_fsv4, data_types::f32, format::bfyx
 #define CASE_ACTIVATION_F32_6 {1, 17, 31, 29}, data_types::f32, format::b_fs_yx_fsv32, data_types::f32, format::bfyx
 #define CASE_ACTIVATION_F32_7 {1, 17, 31, 29}, data_types::f32, format::fyxb, data_types::f32, format::bfyx
 #define CASE_ACTIVATION_3D_F32_0 {3, 16, 13, 13, 13}, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx
@@ -3461,14 +3409,13 @@ struct activation_test_params {
 #define CASE_ACTIVATION_3D_F32_3 {1, 17, 7, 7, 7}, data_types::f32, format::b_fs_zyx_fsv32, data_types::f32, format::bfzyx
 #define CASE_ACTIVATION_3D_F32_4 {1, 17, 7, 7, 7}, data_types::f32, format::bs_fs_yx_bsv16_fsv16, data_types::f32, format::bfzyx
 #define CASE_ACTIVATION_3D_F32_5 {1, 17, 7, 7, 7}, data_types::f32, format::fs_b_yx_fsv32, data_types::f32, format::bfzyx
-#define CASE_ACTIVATION_3D_F32_6 {1, 17, 7, 7, 7}, data_types::f32, format::fs_bs_yx_bsv4_fsv32, data_types::f32, format::bfzyx
 
 #define CASE_ACTIVATION_F16_0 {7, 32, 5, 5}, data_types::f16, format::bfyx, data_types::f32, format::bfyx
 #define CASE_ACTIVATION_F16_1 {1, 16, 8, 8}, data_types::f16, format::bfyx, data_types::f32, format::bfyx
 #define CASE_ACTIVATION_F16_2 {7, 16, 7, 7}, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
 #define CASE_ACTIVATION_F16_3 {1, 14, 8, 8}, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
 #define CASE_ACTIVATION_F16_4 {1, 17, 31, 29}, data_types::f16, format::yxfb, data_types::f32, format::bfyx
-#define CASE_ACTIVATION_F16_5 {1, 17, 31, 29}, data_types::f16, format::byxf_af32, data_types::f32, format::bfyx
+#define CASE_ACTIVATION_F16_5 {1, 17, 31, 29}, data_types::f16, format::b_fs_yx_fsv4, data_types::f32, format::bfyx
 #define CASE_ACTIVATION_F16_6 {1, 17, 31, 29}, data_types::f16, format::b_fs_yx_fsv32, data_types::f32, format::bfyx
 #define CASE_ACTIVATION_F16_7 {1, 17, 31, 29}, data_types::f16, format::fyxb, data_types::f32, format::bfyx
 #define CASE_ACTIVATION_3D_F16_0 {3, 16, 13, 13, 13}, data_types::f16, format::bfzyx, data_types::f32, format::bfzyx
@@ -3477,7 +3424,6 @@ struct activation_test_params {
 #define CASE_ACTIVATION_3D_F16_3 {1, 17, 7, 7, 7}, data_types::f16, format::b_fs_zyx_fsv32, data_types::f32, format::bfzyx
 #define CASE_ACTIVATION_3D_F16_4 {1, 17, 7, 7, 7}, data_types::f16, format::bs_fs_yx_bsv16_fsv16, data_types::f32, format::bfzyx
 #define CASE_ACTIVATION_3D_F16_5 {1, 17, 7, 7, 7}, data_types::f16, format::fs_b_yx_fsv32, data_types::f32, format::bfzyx
-#define CASE_ACTIVATION_3D_F16_6 {1, 17, 7, 7, 7}, data_types::f16, format::fs_bs_yx_bsv4_fsv32, data_types::f32, format::bfzyx
 
 #define CASE_ACTIVATION_U8_1 {1, 16, 8, 8}, data_types::u8, format::bfyx, data_types::f32, format::bfyx
 #define CASE_ACTIVATION_U8_2 {1, 12, 8, 8}, data_types::u8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
@@ -3572,7 +3518,6 @@ INSTANTIATE_TEST_CASE_P(
         activation_test_params{CASE_ACTIVATION_F32_7, 2, 3, "activation_ref"},     // FIXME - accuracy bug
         activation_test_params{CASE_ACTIVATION_3D_F32_3, 2, 3, "activation_ref"},  // FIXME - accuracy bug
         activation_test_params{CASE_ACTIVATION_3D_F32_5, 2, 3, "activation_ref"},  // FIXME - accuracy bug
-        activation_test_params{CASE_ACTIVATION_3D_F32_6, 2, 3, "activation_ref"},  // FIXME - accuracy bug
     }), );
 
 class activation_scale_activation_quantize_u8 : public ActivationFusingTest {};
@@ -3640,7 +3585,6 @@ INSTANTIATE_TEST_CASE_P(
     activation_scale_activation_quantize_u8,
     ::testing::ValuesIn(std::vector<activation_test_params>{
         activation_test_params{CASE_ACTIVATION_3D_F32_5, 2, 5, "activation_ref"},  // FIXME - accuracy bug
-        activation_test_params{CASE_ACTIVATION_3D_F32_6, 2, 5, "activation_ref"},  // FIXME - accuracy bug
     }), );
 
 class activation_scale_activation : public ActivationFusingTest {};
@@ -3697,8 +3641,7 @@ INSTANTIATE_TEST_CASE_P(
         activation_test_params{CASE_ACTIVATION_3D_F16_1, 2, 4, "activation_ref"},
         activation_test_params{CASE_ACTIVATION_3D_F16_2, 2, 4, "activation_ref"},
         activation_test_params{CASE_ACTIVATION_3D_F16_3, 2, 4, "activation_ref"},
-          activation_test_params{CASE_ACTIVATION_3D_F16_4, 2, 4, "activation_ref"},
-          activation_test_params{CASE_ACTIVATION_3D_F16_5, 2, 4, "activation_ref"},
+        activation_test_params{CASE_ACTIVATION_3D_F16_4, 2, 4, "activation_ref"},
 
         // InputDataType = UINT8
         activation_test_params{CASE_ACTIVATION_U8_1, 2, 4, "activation_ref"},
@@ -3719,8 +3662,6 @@ INSTANTIATE_TEST_CASE_P(
     ::testing::ValuesIn(std::vector<activation_test_params>{
         activation_test_params{CASE_ACTIVATION_3D_F32_4, 2, 4, "activation_ref"},  // FIXME - accuracy bug
         activation_test_params{CASE_ACTIVATION_3D_F32_5, 2, 4, "activation_ref"},  // FIXME - accuracy bug
-        activation_test_params{CASE_ACTIVATION_3D_F32_6, 2, 4, "activation_ref"},  // FIXME - accuracy bug
-        activation_test_params{CASE_ACTIVATION_3D_F16_6, 2, 4, "activation_ref"},  // FIXME - accuracy bug
     }), );
 
 /* ----------------------------------------------------------------------------------------------------- */
@@ -4450,30 +4391,21 @@ struct pooling_test_params {
 #define CASE_POOLING_U8_1 {1, 16, 8, 8}, data_types::u8, format::bfyx, data_types::f32, format::bfyx
 #define CASE_POOLING_U8_2 {2, 16, 8, 8}, data_types::u8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
 #define CASE_POOLING_U8_3 {1, 32, 10, 10}, data_types::u8, format::b_fs_yx_fsv4, data_types::f32, format::b_fs_yx_fsv4
-#define CASE_POOLING_U8_4 {1, 32, 10, 10}, data_types::u8, format::byxf_af32, data_types::f32, format::bfyx
 #define CASE_POOLING_U8_5 {16, 32, 10, 10, 10}, data_types::u8, format::b_fs_zyx_fsv32, data_types::f32, format::bfyx
 #define CASE_POOLING_U8_6 {16, 32, 10, 10, 10}, data_types::u8, format::b_fs_zyx_fsv32, data_types::f32, format::bfyx
 
 #define CASE_POOLING_U8_FP16_3 {1, 32, 10, 10}, data_types::u8, format::b_fs_yx_fsv4, data_types::f16, format::b_fs_yx_fsv4
-#define CASE_POOLING_U8_FP16_4 {1, 32, 10, 10}, data_types::u8, format::byxf_af32, data_types::f16, format::bfyx
 #define CASE_POOLING_U8_FP16_5 {16, 32, 10, 10, 10}, data_types::u8, format::b_fs_zyx_fsv32, data_types::f16, format::bfyx
 #define CASE_POOLING_U8_FP16_6 {16, 32, 10, 10, 10}, data_types::u8, format::b_fs_zyx_fsv32, data_types::f16, format::bfyx
 
 #define CASE_POOLING_I8_1 {1, 16, 8, 8}, data_types::i8, format::bfyx, data_types::f32, format::bfyx
 #define CASE_POOLING_I8_2 {2, 16, 8, 8}, data_types::i8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
-#define CASE_POOLING_I8_4 {1, 32, 10, 10}, data_types::i8, format::byxf_af32, data_types::f32, format::bfyx
 #define CASE_POOLING_I8_5 {1, 32, 10, 10}, data_types::i8, format::b_fs_yx_fsv4, data_types::f32, format::b_fs_yx_fsv4
 #define CASE_POOLING_I8_6 {16, 32, 10, 10, 10}, data_types::i8, format::b_fs_zyx_fsv32, data_types::f32, format::bfyx
 
-#define CASE_POOLING_I8_FP16_4 {1, 32, 10, 10}, data_types::i8, format::byxf_af32, data_types::f16, format::bfyx
 #define CASE_POOLING_I8_FP16_5 {1, 32, 10, 10}, data_types::i8, format::b_fs_yx_fsv4, data_types::f16, format::b_fs_yx_fsv4
 #define CASE_POOLING_I8_FP16_6 {16, 32, 10, 10, 10}, data_types::i8, format::b_fs_zyx_fsv32, data_types::f16, format::bfyx
 
-// Disabled
-#define CASE_POOLING_I8_3 {4, 32, 10, 10}, data_types::i8, format::fs_bs_yx_bsv4_fsv32, data_types::f32, format::bfyx
-#define CASE_POOLING_I8_FP16_3 {4, 32, 10, 10}, data_types::i8, format::fs_bs_yx_bsv4_fsv32, data_types::f16, format::bfyx
-
 class PoolingFusingTest : public ::BaseFusingTest<pooling_test_params> {
 public:
     void execute(pooling_test_params& p) {
@@ -4674,8 +4606,6 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu,
                             pooling_test_params{CASE_POOLING_F32_10, 2, 5, pooling_mode::max, "pooling_gpu_bsv16_fsv16"},
 
                             // Input type: INT8
-                            pooling_test_params{CASE_POOLING_I8_4, 2, 5, pooling_mode::average, "pooling_gpu_byxf_af32"},
-                            pooling_test_params{CASE_POOLING_I8_4, 2, 5, pooling_mode::max, "pooling_gpu_byxf_af32"},
                             pooling_test_params{CASE_POOLING_I8_5, 2, 5, pooling_mode::average, "pooling_gpu_b_fs_yx_fsv4"},
                             pooling_test_params{CASE_POOLING_I8_5, 2, 5, pooling_mode::max, "pooling_gpu_b_fs_yx_fsv4"},
                             pooling_test_params{CASE_POOLING_I8_6, 2, 5, pooling_mode::average, "pooling_gpu_int8_ref"},
@@ -4688,8 +4618,6 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu,
                             pooling_test_params{CASE_POOLING_U8_3, 2, 5, pooling_mode::max, "pooling_gpu_b_fs_yx_fsv4"},
                             pooling_test_params{CASE_POOLING_U8_5, 2, 5, pooling_mode::average, "pooling_gpu_int8_ref"},
                             pooling_test_params{CASE_POOLING_U8_5, 2, 5, pooling_mode::max, "pooling_gpu_int8_ref"},
-                            pooling_test_params{CASE_POOLING_U8_4, 2, 5, pooling_mode::average, "pooling_gpu_byxf_af32"},
-                            pooling_test_params{CASE_POOLING_U8_4, 2, 5, pooling_mode::max, "pooling_gpu_byxf_af32"},
                             pooling_test_params{CASE_POOLING_U8_6, 2, 5, pooling_mode::average, "pooling_gpu_int8_ref"},
                             pooling_test_params{CASE_POOLING_U8_6, 2, 5, pooling_mode::max, "pooling_gpu_int8_ref"},
                         }), );
@@ -4697,9 +4625,6 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu,
 INSTANTIATE_TEST_CASE_P(DISABLED_fusings_gpu,
                          pooling_scale_activation_quantize,
                          ::testing::ValuesIn(std::vector<pooling_test_params>{
-                            pooling_test_params{CASE_POOLING_I8_3, 2, 5, pooling_mode::max, "pooling_gpu_fs_bs_yx_bsv4_fsv32_simd32"},
-                            pooling_test_params{CASE_POOLING_I8_3, 2, 5, pooling_mode::max, "pooling_gpu_fs_bs_yx_bsv4_fsv32"},
-                            pooling_test_params{CASE_POOLING_I8_3, 2, 5, pooling_mode::average, "pooling_gpu_fs_bs_yx_bsv4_fsv32"},
                            pooling_test_params{CASE_POOLING_F32_3, 2, 5, pooling_mode::average, "pooling_gpu_average_opt"},  // currently not enabled, fusing not supported
                         }), );
 
@@ -4742,8 +4667,6 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu,
                             pooling_test_params{CASE_POOLING_F32_10, 2, 4, pooling_mode::max, "pooling_gpu_bsv16_fsv16"},
 
                             // Input type: INT8
-                            pooling_test_params{CASE_POOLING_I8_4, 2, 4, pooling_mode::average, "pooling_gpu_byxf_af32"},
-                            pooling_test_params{CASE_POOLING_I8_4, 2, 4, pooling_mode::max, "pooling_gpu_byxf_af32"},
                             pooling_test_params{CASE_POOLING_I8_5, 2, 4, pooling_mode::average, "pooling_gpu_b_fs_yx_fsv4"},
                             pooling_test_params{CASE_POOLING_I8_5, 2, 4, pooling_mode::max, "pooling_gpu_b_fs_yx_fsv4"},
                             pooling_test_params{CASE_POOLING_I8_6, 2, 4, pooling_mode::average, "pooling_gpu_int8_ref"},
@@ -4754,8 +4677,6 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu,
                             pooling_test_params{CASE_POOLING_U8_3, 2, 4, pooling_mode::max, "pooling_gpu_int8_ref"},
                             pooling_test_params{CASE_POOLING_U8_3, 2, 4, pooling_mode::average, "pooling_gpu_b_fs_yx_fsv4"},
                             pooling_test_params{CASE_POOLING_U8_3, 2, 4, pooling_mode::max, "pooling_gpu_b_fs_yx_fsv4"},
-                            pooling_test_params{CASE_POOLING_U8_4, 2, 4, pooling_mode::average, "pooling_gpu_byxf_af32"},
-                            pooling_test_params{CASE_POOLING_U8_4, 2, 4, pooling_mode::max, "pooling_gpu_byxf_af32"},
                             pooling_test_params{CASE_POOLING_U8_5, 2, 4, pooling_mode::average, "pooling_gpu_int8_ref"},
                             pooling_test_params{CASE_POOLING_U8_5, 2, 4, pooling_mode::max, "pooling_gpu_int8_ref"},
                             pooling_test_params{CASE_POOLING_U8_6, 2, 4, pooling_mode::average, "pooling_gpu_int8_ref"},
@@ -4822,8 +4743,6 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu,
                             pooling_test_params{CASE_POOLING_F32_F16_10, 2, 4, pooling_mode::max, "pooling_gpu_bsv16_fsv16"},
 
                             // Input type: INT8
-                            pooling_test_params{CASE_POOLING_I8_FP16_4, 2, 4, pooling_mode::average, "pooling_gpu_byxf_af32"},
-                            pooling_test_params{CASE_POOLING_I8_FP16_4, 2, 4, pooling_mode::max, "pooling_gpu_byxf_af32"},
                             pooling_test_params{CASE_POOLING_I8_FP16_5, 2, 4, pooling_mode::average, "pooling_gpu_b_fs_yx_fsv4"},
                             pooling_test_params{CASE_POOLING_I8_FP16_5, 2, 4, pooling_mode::max, "pooling_gpu_b_fs_yx_fsv4"},
                             pooling_test_params{CASE_POOLING_I8_FP16_6, 2, 4, pooling_mode::average, "pooling_gpu_int8_ref"},
@@ -4834,24 +4753,12 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu,
                             pooling_test_params{CASE_POOLING_U8_FP16_3, 2, 4, pooling_mode::average, "pooling_gpu_int8_ref"},
                             pooling_test_params{CASE_POOLING_U8_FP16_3, 2, 4, pooling_mode::average, "pooling_gpu_b_fs_yx_fsv4"},
                             pooling_test_params{CASE_POOLING_U8_FP16_3, 2, 4, pooling_mode::max, "pooling_gpu_b_fs_yx_fsv4"},
-                            pooling_test_params{CASE_POOLING_U8_FP16_4, 2, 4, pooling_mode::average, "pooling_gpu_byxf_af32"},
-                            pooling_test_params{CASE_POOLING_U8_FP16_4, 2, 4, pooling_mode::max, "pooling_gpu_byxf_af32"},
                             pooling_test_params{CASE_POOLING_U8_FP16_5, 2, 4, pooling_mode::average, "pooling_gpu_int8_ref"},
                             pooling_test_params{CASE_POOLING_U8_FP16_5, 2, 4, pooling_mode::max, "pooling_gpu_int8_ref"},
                             pooling_test_params{CASE_POOLING_U8_FP16_6, 2, 4, pooling_mode::average, "pooling_gpu_int8_ref"},
                             pooling_test_params{CASE_POOLING_U8_FP16_6, 2, 4, pooling_mode::max, "pooling_gpu_int8_ref"},
                      }), );
 
-INSTANTIATE_TEST_CASE_P(DISABLED_fusings_gpu,
-                        pooling_scale_activation,
-                        ::testing::ValuesIn(std::vector<pooling_test_params>{
-                            pooling_test_params{CASE_POOLING_I8_FP16_3, 2, 4, pooling_mode::max, "pooling_gpu_fs_bs_yx_bsv4_fsv32_simd32"},
-                            pooling_test_params{CASE_POOLING_I8_FP16_3, 2, 4, pooling_mode::max, "pooling_gpu_fs_bs_yx_bsv4_fsv32"},
-                            pooling_test_params{CASE_POOLING_I8_3, 2, 4, pooling_mode::max, "pooling_gpu_fs_bs_yx_bsv4_fsv32_simd32"},
-                            pooling_test_params{CASE_POOLING_I8_3, 2, 4, pooling_mode::max, "pooling_gpu_fs_bs_yx_bsv4_fsv32"},
-                            pooling_test_params{CASE_POOLING_I8_3, 2, 4, pooling_mode::average, "pooling_gpu_fs_bs_yx_bsv4_fsv32"},
-                     }), );
-
 /* ----------------------------------------------------------------------------------------------------- */
 /* -------------------------------- DepthToSpace cases ------------------------------------------------- */
 /* ----------------------------------------------------------------------------------------------------- */
index 2ac9813..4712bc3 100644
@@ -2619,7 +2619,6 @@ INSTANTIATE_TEST_CASE_P(
                      testing::Values(std::tuple<int, int, int>(0, 0, 0)),
                      testing::Values(format::yxfb,
                                      format::bfyx,
-                                     format::byxf_af32,
                                      format::b_fs_yx_fsv4,
                                      format::b_fs_yx_fsv16,
                                      format::b_fs_yx_fsv32)),
@@ -2737,8 +2736,7 @@ INSTANTIATE_TEST_CASE_P(
                                      format::b_fs_yx_fsv16,
                                      format::fs_b_yx_fsv32,
                                      format::b_fs_yx_fsv32,
-                                     format::b_fs_yx_fsv4,
-                                     format::fs_bs_yx_bsv4_fsv32)),
+                                     format::b_fs_yx_fsv4)),
     testing::internal::DefaultParamName<pooling_random_test_params>);
 
 TEST(pooling_forward_gpu, bsv16_fsv16_max_16x16x8x8_input_2x2_pool_2x2_stride)
index d06f938..f6c6dd4 100644
@@ -766,8 +766,6 @@ INSTANTIATE_TEST_CASE_P(smoke,
                         resample_random_test,
                         testing::ValuesIn(
                             resample_random_test_param_generator()
-                            .smoke_params(data_types::i8, format::byxf_af32, format::byxf_af32)
-                            .smoke_params(data_types::u8, format::byxf_af32, format::byxf_af32)
                             .smoke_params(data_types::i8, format::b_fs_yx_fsv4, format::b_fs_yx_fsv4)
                             .smoke_params(data_types::u8, format::b_fs_yx_fsv4, format::b_fs_yx_fsv4)
                             .smoke_params(data_types::i8, format::b_fs_yx_fsv16, format::b_fs_yx_fsv16)
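The chained .smoke_params(...) calls above rely on a fluent generator that appends parameter sets and returns itself. A minimal self-contained sketch of that pattern, with illustrative names rather than the real clDNN declarations:

#include <vector>

enum class data_types { i8, u8 };
namespace format { enum type { b_fs_yx_fsv4, b_fs_yx_fsv16 }; }

struct resample_params_sketch {
    data_types dt;
    format::type in_fmt;
    format::type out_fmt;
};

// Inheriting from std::vector lets testing::ValuesIn consume the result directly;
// returning *this from smoke_params is what enables the chaining seen in the diff.
struct param_generator_sketch : std::vector<resample_params_sketch> {
    param_generator_sketch& smoke_params(data_types dt, format::type in_fmt, format::type out_fmt) {
        push_back({dt, in_fmt, out_fmt});  // the real suite likely adds several sizes per call
        return *this;
    }
};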
index 307baa9..8f6200b 100644
@@ -105,16 +105,4 @@ TEST(tensor_api, linear_offsets) {
     test_tensor_offset({ 2, 19, 4, 3 }, { 1, 18, 3, 2 }, cldnn::format::b_fs_yx_fsv16, 754);
     test_tensor_offset({ 2, 5, 4, 3 }, { 1, 3, 1, 2 }, cldnn::format::fs_b_yx_fsv32, 675);
     test_tensor_offset({ 2, 37, 4, 3 }, { 1, 35, 3, 2 }, cldnn::format::fs_b_yx_fsv32, 1507);
-
-    // Formats with alignment:
-    test_tensor_offset({ 2, 5, 4, 3 }, { 1, 3, 1, 2 }, cldnn::format::byxf_af32, 675);
-    test_tensor_offset({ 2, 37, 4, 3 }, { 1, 35, 3, 2 }, cldnn::format::byxf_af32, 1507);
-    test_tensor_offset({ 2, 5, 4, 3 }, { 1, 3, 1, 2 }, cldnn::format::byx8_f4, 331);
-    test_tensor_offset({ 2, 37, 4, 3 }, { 1, 35, 3, 2 }, cldnn::format::byx8_f4, 1755);
-
-    // Non-standard blocked formats:
-    // bf8_xy16 - b_fs_es_fsv8_esv16, where e is flattened yx := x + y * size_x
-    test_tensor_offset({ 2, 5, 4, 3 }, { 1, 3, 1, 2 }, cldnn::format::bf8_xy16, 185);
-    test_tensor_offset({ 2, 19, 7, 3 }, { 1, 18, 3, 2 }, cldnn::format::bf8_xy16, 1441);
-
 }
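The linear-offset expectations kept in the test above can be reproduced by hand. In b_fs_yx_fsv16 the feature axis is split into blocks of 16 (padded up to a whole block); fs_b_yx_fsv32 puts blocks of 32 features outermost. A self-contained sketch under those assumptions, checked against the expectations retained in the diff (arguments follow the tests' {b, f, x, y} tensor order):

#include <cassert>
#include <cstddef>

// b_fs_yx_fsv16: dimension order b, f/16, y, x, f%16; f is padded to a multiple of 16.
static std::size_t offset_b_fs_yx_fsv16(std::size_t f_blocks, std::size_t size_x, std::size_t size_y,
                                        std::size_t b, std::size_t f, std::size_t x, std::size_t y) {
    return (((b * f_blocks + f / 16) * size_y + y) * size_x + x) * 16 + f % 16;
}

// fs_b_yx_fsv32: dimension order f/32, b, y, x, f%32.
static std::size_t offset_fs_b_yx_fsv32(std::size_t size_b, std::size_t size_x, std::size_t size_y,
                                        std::size_t b, std::size_t f, std::size_t x, std::size_t y) {
    return ((((f / 32) * size_b + b) * size_y + y) * size_x + x) * 32 + f % 32;
}

int main() {
    // size {2, 19, 4, 3}, coord {1, 18, 3, 2}: 19 features pad to 2 blocks of 16 -> 754
    assert(offset_b_fs_yx_fsv16(2, 4, 3, 1, 18, 3, 2) == 754);
    // size {2, 5, 4, 3}, coord {1, 3, 1, 2}: 5 features fit in 1 block of 32 -> 675
    assert(offset_fs_b_yx_fsv32(2, 4, 3, 1, 3, 1, 2) == 675);
    // size {2, 37, 4, 3}, coord {1, 35, 3, 2}: feature 35 lands in the second block -> 1507
    assert(offset_fs_b_yx_fsv32(2, 4, 3, 1, 35, 3, 2) == 1507);
    return 0;
}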