[IE CLDNN] Add 3D spatial support to conv & pool IMAD kernels and unblock any in...
author    Jedrzej Hajduczenia <jedrzej.hajduczenia@intel.com>
          Thu, 20 Aug 2020 11:54:54 +0000 (13:54 +0200)
committer GitHub <noreply@github.com>
          Thu, 20 Aug 2020 11:54:54 +0000 (14:54 +0300)
28 files changed:
inference-engine/thirdparty/clDNN/api/tensor.hpp
inference-engine/thirdparty/clDNN/kernel_selector/common/tensor_type.cpp
inference-engine/thirdparty/clDNN/kernel_selector/common/tensor_type.h
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/concatenation/concatenation_kernel_simple_ref.cpp
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_b_fs_zyx_fsv16_imad.cpp [moved from inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_b_fs_yx_fsv16_imad.cpp with 72% similarity]
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_b_fs_zyx_fsv16_imad.h [moved from inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_b_fs_yx_fsv16_imad.h with 82% similarity]
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_selector.cpp
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_b_fs_zyx_fsv16_imad.cpp [moved from inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_b_fs_yx_fsv16_imad.cpp with 72% similarity]
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_b_fs_zyx_fsv16_imad.h [moved from inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_b_fs_yx_fsv16_imad.h with 85% similarity]
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_selector.cpp
inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_b_fs_yx_fsv16_imad.cl [deleted file]
inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_b_fs_zyx_fsv16_imad.cl [new file with mode: 0644]
inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/include/fetch.cl
inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_b_fs_yx_fsv16_imad.cl [deleted file]
inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_b_fs_zyx_fsv16_imad.cl [new file with mode: 0644]
inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/reorder_weights.cl
inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_selector_common.cpp
inference-engine/thirdparty/clDNN/src/convolution.cpp
inference-engine/thirdparty/clDNN/src/gpu/convolution_gpu.cpp
inference-engine/thirdparty/clDNN/src/gpu/scale_gpu.cpp
inference-engine/thirdparty/clDNN/src/graph_optimizer/pre_replace_deconv.cpp
inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_padding.cpp
inference-engine/thirdparty/clDNN/src/include/to_string_utils.h
inference-engine/thirdparty/clDNN/src/kernel_selector_helper.cpp
inference-engine/thirdparty/clDNN/src/layout_optimizer.cpp
inference-engine/thirdparty/clDNN/src/pooling.cpp
inference-engine/thirdparty/clDNN/tests/test_cases/convolution_gpu_test.cpp
inference-engine/thirdparty/clDNN/tests/test_cases/pooling_gpu_test.cpp

index cc2ddec..f36c260 100644
@@ -1,5 +1,5 @@
 /*
-// Copyright (c) 2016-2019 Intel Corporation
+// Copyright (c) 2016-2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -170,6 +170,7 @@ struct format {
         os_is_y_x8_osv8_isv4,                         ///< format for weights for 1x1 MMAD convolutions
         os_is_y_x8_osv8_isv4_swizzled_by_4,           ///< format for weights for 1x1 MMAD convolutions
         os_is_yx_osv16_isv4,                          ///< format for weights for IMAD convolutions
+        os_is_zyx_osv16_isv16,                        ///< format for weights for IMAD convolutions
         os_is_yx_osv32_isv4_swizzled_by_2,            ///< format for weights for IMAD convolutions
         os_is_yx_osv32_isv4,                          ///< format for weights for IMAD convolutions
         os_is_yx_osv32_isv32p,                        ///< format for weights for binary convolutions
@@ -188,6 +189,7 @@ struct format {
         gs_oiyx_gsv32,                                ///< format used for weights for 2D convolution
         g_is_os_zyx_osv16_isv16,                      ///< format used for grouped weights for blocked 3D deconvolution
         g_os_is_yx_osv16_isv4,
+        g_os_is_zyx_osv16_isv16,
         g_is_os_yx_osv16_isv16,
         g_os_is_zyx_isv8_osv16_isv2,
         g_os_is_yx_isv8_osv16_isv2,
@@ -237,7 +239,7 @@ struct format {
                 { b_fs_yx_32fp,          { 1, 1, 2, 0, 0, "bfyx",   "bfxy?",  {}}},
                 { b_fs_zyx_fsv16,        { 1, 1, 3, 0, 0, "bfzyx",  "bfxyz",  {{1, 16}}}},
                 { bs_fs_zyx_bsv16_fsv16, { 1, 1, 3, 0, 0, "bfzyx",  "bfxyz",  {{0, 16 }, {1, 16}}}},
-                { bs_fs_yx_bsv16_fsv16,  { 1, 1, 3, 0, 0, "bfyx",   "bfxy?",  {{0, 16 }, {1, 16}}}},
+                { bs_fs_yx_bsv16_fsv16,  { 1, 1, 2, 0, 0, "bfyx",   "bfxy?",  {{0, 16 }, {1, 16}}}},
                 { nv12,                  { 1, 1, 2, 0, 0, "bfyx",   "bfxy?",  {}}},
                 { image_2d_rgba,         { 1, 1, 2, 0, 0, "bfyx",   "bfxy?",  {}}},
 
@@ -265,7 +267,8 @@ struct format {
                 { is_o32_yx_isv32_swizzled_by_4,               { 1, 1, 2, 0, 0, "byxf",   "bfxy?",      {}}},
                 { os_is_y_x8_osv8_isv4,                        { 1, 1, 2, 0, 0, "byxf",   "bfxy?",      {}}},
                 { os_is_y_x8_osv8_isv4_swizzled_by_4,          { 1, 1, 2, 0, 0, "byxf",   "bfxy?",      {}}},
-                { os_is_yx_osv16_isv4,                         { 1, 1, 2, 0, 0, "bfxy",   "bfxy?",      {{0, 16}, {1, 4}}}},
+                { os_is_yx_osv16_isv4,                         { 1, 1, 2, 0, 0, "bfyx",   "bfxy?",      {{0, 16}, {1, 4}}}},
+                { os_is_zyx_osv16_isv16,                       { 1, 1, 3, 0, 0, "bfzyx",  "bfxyz",      {{0, 16}, {1, 16}}}},
                 { os_is_yx_osv32_isv4_swizzled_by_2,           { 1, 1, 2, 0, 0, "bfxy",   "bfxy?",      {{0, 32}, {1, 4}}}},
                 { os_is_yx_osv32_isv4,                         { 1, 1, 2, 0, 0, "bfxy",   "bfxy?",      {{0, 32}, {1, 4}}}},
                 { os_is_yx_osv32_isv32p,                       { 1, 1, 1, 0, 0, "bfxy",   "bfxy?",      {}}},
@@ -291,7 +294,8 @@ struct format {
                 { g_os_is_zyx_isv8_osv16_isv2,                 { 1, 1, 3, 0, 1, "gbfzyx", "bfxyz???g",  {{1, 8}, {0, 16}, {1, 2}}}},
                 { g_os_is_yx_isv8_osv16_isv2,                  { 1, 1, 2, 0, 1, "gbfyx",  "bfxy????g",  {{1, 8}, {0, 16}, {1, 2}}}},
                 { g_os_is_zyx_isv16_osv16,                     { 1, 1, 3, 0, 1, "gbfzyx", "bfxyz???g",  {{0, 16}, {1, 16}}}},
-                { g_os_is_yx_osv16_isv4,                       { 1, 1, 2, 0, 1, "gbfxy",  "bfxy????g",  {{0, 16}, {1, 4}}}},
+                { g_os_is_yx_osv16_isv4,                       { 1, 1, 2, 0, 1, "gbfyx",  "bfxy????g",  {{0, 16}, {1, 4}}}},
+                { g_os_is_zyx_osv16_isv16,                     { 1, 1, 3, 0, 1, "gbfzyx", "bfxyz???g",  {{0, 16}, {1, 16}}}},
                 { g_os_zyx_is_osv16_isv4,                      { 1, 1, 3, 0, 1, "gbzyxi", "bfxyz???g",  {{0, 16}, {1, 4}}}},
                 { g_os_zyx_is_osv16_isv16,                     { 1, 1, 3, 0, 1, "gbzyxi", "bfxyz???g",  {{0, 16}, {1, 16}}}},
                 { g_os_zyx_is_osv16_isv32,                     { 1, 1, 3, 0, 1, "gbzyxi", "bfxyz???g",  {{0, 16}, {1, 32}}}},
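For reference, each entry in the table above follows clDNN's format_traits shape: counts of batch, feature, spatial, local and group dimensions, the logical dimension order, the internal (physical) order, and per-channel block sizes. A minimal sketch of the corrected bs_fs_yx_bsv16_fsv16 entry, with field names assumed from the surrounding tensor.hpp code rather than quoted verbatim:

    // Sketch only - field names are assumptions, not copied from tensor.hpp.
    format_traits traits = {
        1,                  // batch_num:   one batch dim (b)
        1,                  // feature_num: one feature dim (f)
        2,                  // spatial_num: y and x only - the value fixed above (was 3)
        0,                  // local_num
        0,                  // group_num
        "bfyx",             // logical dimension order
        "bfxy?",            // internal (physical) order
        {{0, 16}, {1, 16}}, // blocks: channel 0 (batch) and channel 1 (feature) by 16
    };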
index b9a29d0..8553bf7 100644
@@ -1,5 +1,5 @@
 /*
-// Copyright (c) 2016-2019 Intel Corporation
+// Copyright (c) 2016-2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -73,6 +73,8 @@ WeightsTensor::WeightsChannelArray WeightsTensor::weightsChannelArray {{
     { WeightsLayout::os_i_osv16__ai8,                             { -1, -1, -1,   0,   1, -1, -1, -1 } },
     { WeightsLayout::os_i_osv16,                                  { -1, -1, -1,   0,   1, -1, -1, -1 } },
     { WeightsLayout::os_is_yx_osv16_isv16,                        {  0,  1, -1,   2,   3, -1, -1, -1 } },
+    { WeightsLayout::os_is_zyx_osv16_isv16,                       {  0,  1,  2,   3,   4, -1, -1, -1 } },
+    { WeightsLayout::g_os_is_zyx_osv16_isv16,                     {  0,  1,  2,   3,   4, -1, -1,  5 } },
     { WeightsLayout::os_is_zyx_osv32_isv16,                       {  0,  1,  2,   3,   4, -1, -1, -1 } },
     { WeightsLayout::os_is_zyx_osv64_isv16,                       {  0,  1,  2,   3,   4, -1, -1, -1 } },
     { WeightsLayout::i_yxs_os_yxsv2_osv16,                        {  1,  2, -1,   3,   0, -1, -1, -1 } },
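Each row of weightsChannelArray maps the channels (x, y, z, ifm, ofm, local-x, local-y, g, in that order, judging from the neighbouring entries) to a dimension index, where 0 is the innermost dimension and -1 marks an absent channel. Read that way, the two added layouts decode as:

    // os_is_zyx_osv16_isv16:   {  0,  1,  2,   3,   4, -1, -1, -1 }
    //   x -> dim 0 (innermost), y -> dim 1, z -> dim 2, ifm -> dim 3, ofm -> dim 4
    // g_os_is_zyx_osv16_isv16: {  0,  1,  2,   3,   4, -1, -1,  5 }
    //   identical placement, plus the group channel as the outermost dim 5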
@@ -425,7 +427,7 @@ DataTensor DataTensor::FlattenEverything() const {
 NDims WeightsTensor::GetSimpleDims(const std::vector<size_t>& d, WeightsLayout l) {
     std::vector<size_t> newDims = d;
 
-    // TOOD: it's not the right pitches. it's here in order to calculate physical size
+    // TODO: These are not the right pitches; they are here only to calculate the physical size.
     switch (l) {
         case os_iyx_osv16:
         case os_iyx_osv16_rotate_180:
@@ -635,6 +637,16 @@ NDims WeightsTensor::GetSimpleDims(const std::vector<size_t>& d, WeightsLayout l
             newDims[2] = RoundUp(newDims[2], 16);
             newDims[3] = RoundUp(newDims[3], 16);
             break;
+        case os_is_zyx_osv16_isv16:
+            assert(newDims.size() == 5);
+            newDims[3] = RoundUp(newDims[3], 16);
+            newDims[4] = RoundUp(newDims[4], 16);
+            break;
+        case g_os_is_zyx_osv16_isv16:
+            assert(newDims.size() == 6);
+            newDims[3] = RoundUp(newDims[3], 16);
+            newDims[4] = RoundUp(newDims[4], 16);
+            break;
         case os_is_zyx_osv32_isv16:
             newDims[3] = RoundUp(newDims[3], 16);
             newDims[4] = RoundUp(newDims[4], 32);
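With the channel mapping above, the dims vector for a zyx weights layout is ordered {x, y, z, ifm, ofm}, so indices 3 and 4 are the IFM/OFM sizes being rounded up to the isv16/osv16 blocks. A worked example with hypothetical shapes:

    // os_is_zyx_osv16_isv16, OFM = 24, IFM = 3, kernel 1 x 3 x 3 (z y x):
    //   newDims = { 3, 3, 1, 3, 24 }
    //          -> { 3, 3, 1, RoundUp(3, 16), RoundUp(24, 16) } = { 3, 3, 1, 16, 32 }
    //   physical element count = 3 * 3 * 1 * 16 * 32 = 4608
    // i.e. both blocked channels are padded to full 16-element slices.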
index 2226f1a..03f0c22 100644
@@ -1,4 +1,4 @@
-// Copyright (c) 2016-2019 Intel Corporation
+// Copyright (c) 2016-2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -92,6 +92,7 @@ enum WeightsLayout {
     os_i_osv16__ai8,
     os_i_osv16,
     os_is_yx_osv16_isv16,           // weights for int8 blocked conv
+    os_is_zyx_osv16_isv16,
     os_is_zyx_osv32_isv16,
     os_is_zyx_osv64_isv16,
     i_yxs_os_yxsv2_osv16,
@@ -142,6 +143,7 @@ enum WeightsLayout {
     g_os_is_zyx_isv8_osv16_isv2,
     g_os_is_yx_isv8_osv16_isv2,
     g_os_is_zyx_isv16_osv16,
+    g_os_is_zyx_osv16_isv16,
     giy_xs_os_xsv2_osv16__ao32,
     giy_xs_os_xsv2_osv8__ao32,
     g_os_is_yx_isv16_osv16,
index f10d56c..36abefe 100644
@@ -24,11 +24,13 @@ ParamsKey ConcatenationKernel_simple_Ref::GetSupportedKey() const {
     k.EnableInputDataType(Datatype::F16);
     k.EnableInputDataType(Datatype::F32);
     k.EnableInputDataType(Datatype::INT8);
+    k.EnableInputDataType(Datatype::UINT8);
     k.EnableInputDataType(Datatype::INT32);
     k.EnableInputDataType(Datatype::INT64);
     k.EnableOutputDataType(Datatype::F16);
     k.EnableOutputDataType(Datatype::F32);
     k.EnableOutputDataType(Datatype::INT8);
+    k.EnableOutputDataType(Datatype::UINT8);
     k.EnableOutputDataType(Datatype::INT32);
     k.EnableOutputDataType(Datatype::INT64);
     k.EnableInputLayout(DataLayout::bfyx);
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "convolution_kernel_b_fs_yx_fsv16_imad.h"
+#include "convolution_kernel_b_fs_zyx_fsv16_imad.h"
 #include "kernel_selector_utils.h"
 #include "common_tools.h"
 #include <vector>
@@ -63,8 +63,8 @@ static size_t getOutBlock_X(const size_t output_size_x, const size_t stride_x, c
 
 namespace kernel_selector {
 
-Convolution_kernel_b_fs_yx_fsv16_imad::BlockParams
-Convolution_kernel_b_fs_yx_fsv16_imad::GetBlockParams(const convolution_params& params) const {
+Convolution_kernel_b_fs_zyx_fsv16_imad::BlockParams
+Convolution_kernel_b_fs_zyx_fsv16_imad::GetBlockParams(const convolution_params& params) const {
     constexpr float max_reg_pressure = 0.75f;
 
     // TODO Investigate whether below algorithm for selecting optimal block params could be reduced to:
@@ -85,9 +85,9 @@ Convolution_kernel_b_fs_yx_fsv16_imad::GetBlockParams(const convolution_params&
     size_t block_features = simd;
     {
         size_t tmp_block_features = simd * 2;
-        auto block2_params = BlockParams{ block_width, 1, tmp_block_features, in_block_width, 1, 1 };
+        auto block2_params = BlockParams{ block_width, 1, 1, tmp_block_features, in_block_width, 1, 1, 1 };
 
-        bool c_mul_f = params.output.Feature().v % tmp_block_features == 0;
+        bool c_mul_f = params.weights.OFM().v % tmp_block_features == 0;
         bool c_reg_pressure = EstimateRegPressure(params, block2_params) <= max_reg_pressure;
 
         if (c_mul_f && c_reg_pressure) {
@@ -97,7 +97,9 @@ Convolution_kernel_b_fs_yx_fsv16_imad::GetBlockParams(const convolution_params&
 
     // If not enough occupancy try to perform feature split or/and block reduction
     size_t feature_slm_split = 1;
-    auto no_split_params = BlockParams{ block_width, 1, block_features, in_block_width, 1, 1 };
+
+    auto no_split_params = BlockParams{ block_width, 1, 1, block_features, in_block_width, 1, 1, 1 };
+
     if (EstimateOccupancy(params, no_split_params) < 1.f) {
         // Temporary variables for possible reductions in block sizes
         bool update_block_params = false;
@@ -107,7 +109,8 @@ Convolution_kernel_b_fs_yx_fsv16_imad::GetBlockParams(const convolution_params&
 
         // Feature split requires extra registers, so check if it can be done with current block sizes
         bool can_split =
-            EstimateRegPressure(params, BlockParams{ block_width, 1, block_features, in_block_width, 1, 2 }) <= max_reg_pressure;
+            EstimateRegPressure(params, BlockParams{ block_width, 1, 1, block_features, in_block_width, 1, 1, 2 }) <= max_reg_pressure;
+
         // Has the occupancy reached sufficient level
         bool enough_occupancy = false;
         // Reductions to reduce register pressure
@@ -116,7 +119,7 @@ Convolution_kernel_b_fs_yx_fsv16_imad::GetBlockParams(const convolution_params&
             // At most twice reduction in output block width is acceptable
             for (size_t w = block_width; w >= CeilDiv(block_width, 2); w -= 1) {
                 size_t tmp_in_width = (w - 1) * params.stride.x + (params.filterSize.x - 1) * params.dilation.x + 1;
-                auto dummy_split_params = BlockParams{ w, 1, block_features, tmp_in_width, 1, 2 };
+                auto dummy_split_params = BlockParams{ w, 1, 1, block_features, tmp_in_width, 1, 1, 2 };
 
                 bool c_reg_pressure = EstimateRegPressure(params, dummy_split_params) <= max_reg_pressure;
                 bool c_mul_x = params.output.X().v % w == 0;
@@ -139,7 +142,7 @@ Convolution_kernel_b_fs_yx_fsv16_imad::GetBlockParams(const convolution_params&
         }
         // Check if previous reductions haven't improved occupancy enough
         {
-            auto reduced_params = BlockParams{ split_block_width, 1, split_block_features, split_in_block_width, 1, 1 };
+            auto reduced_params = BlockParams{ split_block_width, 1, 1, split_block_features, split_in_block_width, 1, 1, 1 };
             enough_occupancy = EstimateOccupancy(params, reduced_params) >= 1.f;
             update_block_params = enough_occupancy;
         }
@@ -147,7 +150,7 @@ Convolution_kernel_b_fs_yx_fsv16_imad::GetBlockParams(const convolution_params&
         if (can_split && !enough_occupancy) {
             // TODO Try other split sizes
             for (size_t split = 4; split < 5; ++split) {
-                auto tmp_params = BlockParams{ block_width, 1, block_features, in_block_width, 1, split };
+                auto tmp_params = BlockParams{ block_width, 1, 1, block_features, in_block_width, 1, 1, split };
 
                 bool c_ifm_mul = CeilDiv(params.weights.IFM().v, fsv) % split == 0;
                 bool c_slm = EstimateSLMUsage(params, tmp_params) <= 1.f;
@@ -172,7 +175,7 @@ Convolution_kernel_b_fs_yx_fsv16_imad::GetBlockParams(const convolution_params&
             // At most twice reduction in output block width is acceptable
             for (size_t w = block_width; w >= CeilDiv(block_width, 2); w -= 1) {
                 size_t tmp_in_width = (w - 1) * params.stride.x + (params.filterSize.x - 1) * params.dilation.x + 1;
-                auto tmp_params = BlockParams{ w, 1, split_block_features, tmp_in_width, 1, feature_slm_split };
+                auto tmp_params = BlockParams{ w, 1, 1, split_block_features, tmp_in_width, 1, 1, feature_slm_split };
 
                 bool c_occupancy = EstimateOccupancy(params, tmp_params) >= 1.f;
                 bool c_mul_x = params.output.X().v % w == 0;
@@ -194,44 +197,60 @@ Convolution_kernel_b_fs_yx_fsv16_imad::GetBlockParams(const convolution_params&
         }
     }
 
-    // Select biggest block height that fits into registers
+    // Select biggest block height and depth that fits into registers
     size_t block_height = 1;
+    size_t block_depth = 1;
     size_t in_block_height = 1;
-    for (size_t h = 2; h < 16; ++h) {
-        if (params.output.Y().v % h != 0)
-            continue;
+    size_t in_block_depth = 1;
 
-        size_t tmp_in_block_height = (h - 1) * params.stride.y + (params.filterSize.y - 1) * params.dilation.y + 1;
-        auto tmp_params = BlockParams{ block_width, h, block_features, in_block_width, tmp_in_block_height, feature_slm_split };
-
-        bool c_reg_pressure = EstimateRegPressure(params, tmp_params) <= max_reg_pressure;
-        bool c_occupancy = EstimateOccupancy(params, tmp_params) >= 1.f;
-        bool c_slm = EstimateSLMUsage(params, tmp_params) <= 1.f;
+    bool break_external_loop = false;
+
+    for (size_t d = 1; d < 16; ++d) {
+        if (params.output.Z().v % d != 0)
+            continue;
+        for (size_t h = 2; h < 16; ++h) {
+            if (params.output.Y().v % h != 0)
+                continue;
+            size_t tmp_in_block_depth = (d - 1) * params.stride.z + (params.filterSize.z - 1) * params.dilation.z + 1;
+            size_t tmp_in_block_height = (h - 1) * params.stride.y + (params.filterSize.y - 1) * params.dilation.y + 1;
+            auto tmp_params = BlockParams{ block_width, h, d, block_features, in_block_width, tmp_in_block_height, tmp_in_block_depth, feature_slm_split };
+
+            bool c_reg_pressure = EstimateRegPressure(params, tmp_params) <= max_reg_pressure;
+            bool c_occupancy = EstimateOccupancy(params, tmp_params) >= 1.f;
+            bool c_slm = EstimateSLMUsage(params, tmp_params) <= 1.f;
+
+            if (c_reg_pressure && c_occupancy && c_slm) {
+                block_height = h;
+                block_depth = d;
+                in_block_height = tmp_in_block_height;
+                in_block_depth = tmp_in_block_depth;
+            } else {
+                break_external_loop = true;
+                break;
+            }
+        }
 
-        if (c_reg_pressure && c_occupancy && c_slm) {
-            block_height = h;
-            in_block_height = tmp_in_block_height;
-        } else {
+        if (break_external_loop) {
             break;
         }
     }
 
-    return BlockParams{ block_width, block_height, block_features, in_block_width, in_block_height, feature_slm_split };
+    return BlockParams{ block_width, block_height, block_depth, block_features, in_block_width, in_block_height, in_block_depth, feature_slm_split };
 }
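The input block extents used throughout GetBlockParams (including the new depth axis) all come from the standard dilated-convolution footprint. A sketch, with an illustrative helper name that does not exist in the source:

    // Input elements needed along one axis to produce out_block outputs:
    //   in_block = (out_block - 1) * stride + (filter_size - 1) * dilation + 1
    static size_t InputBlockExtent(size_t out_block, size_t stride,
                                   size_t filter_size, size_t dilation) {
        return (out_block - 1) * stride + (filter_size - 1) * dilation + 1;
    }
    // e.g. output_block_depth = 2, stride.z = 1, filterSize.z = 3, dilation.z = 1
    //   -> input_block_depth = (2 - 1) * 1 + (3 - 1) * 1 + 1 = 4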
 
-float Convolution_kernel_b_fs_yx_fsv16_imad::EstimateRegPressure(const convolution_params& params, const BlockParams& block) const {
+float Convolution_kernel_b_fs_zyx_fsv16_imad::EstimateRegPressure(const convolution_params& params, const BlockParams& block) const {
     size_t bytes_used = 0;
     // accumulator
-    size_t accumulator_elements = block.output_block_width * block.output_block_height * block.output_block_features;
+    size_t accumulator_elements = block.output_block_width * block.output_block_height * block.output_block_depth * block.output_block_features;
     bytes_used += accumulator_elements * BytesPerElement(GetAccumulatorType(params));
     // input block
-    size_t input_block_elements = block.input_block_height * Align(block.input_block_width, simd) * fsv;
+    size_t input_block_elements = block.input_block_depth * block.input_block_height * Align(block.input_block_width, simd) * fsv;
     bytes_used += input_block_elements * BytesPerElement(params.inputs[0].GetDType());
     // weights block
     size_t weights_block_elements = block.output_block_features * fsv;
     bytes_used += weights_block_elements * BytesPerElement(params.weights.GetDType());
 
-    // Experimentally selected number of registers needed for extra variables (eg. out_x, out_y, filter_idx, etc.)
+    // Experimentally selected number of registers needed for extra variables (e.g. out_x, out_y, out_z, filter_idx)
     constexpr size_t experimental_extra_regs = 8 * 32;
     bytes_used += experimental_extra_regs;
 
@@ -248,13 +267,14 @@ float Convolution_kernel_b_fs_yx_fsv16_imad::EstimateRegPressure(const convoluti
     return static_cast<float>(bytes_used) / static_cast<float>(max_reg_bytes);
 }
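A rough worked example of the estimate, assuming ~4 KB of GRF per hardware thread (128 registers x 32 bytes) and int8 input/weights with an int32 accumulator:

    // BlockParams{ w=8, h=2, d=1, f=32, in_w=10, in_h=4, in_d=1, split=1 }:
    //   accumulator: 8 * 2 * 1 * 32 elems * 4 B              = 2048 B
    //   input block: 1 * 4 * Align(10, 16) * 16 elems * 1 B  = 1024 B
    //   weights:     32 * 16 elems * 1 B                     =  512 B
    //   extra regs:  8 * 32                                  =  256 B
    //   total: 3840 B / 4096 B = ~0.94 > max_reg_pressure (0.75) -> rejected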
 
-float Convolution_kernel_b_fs_yx_fsv16_imad::EstimateOccupancy(const convolution_params& params, const BlockParams& block) const {
+float Convolution_kernel_b_fs_zyx_fsv16_imad::EstimateOccupancy(const convolution_params& params, const BlockParams& block) const {
     size_t blocks_w = CeilDiv(params.output.X().v, block.output_block_width);
     size_t blocks_h = CeilDiv(params.output.Y().v, block.output_block_height);
-    size_t blocks_f = CeilDiv(params.output.Feature().v, block.output_block_features) * block.feature_slm_split;
+    size_t blocks_d = CeilDiv(params.output.Z().v, block.output_block_depth);
+    size_t blocks_f = CeilDiv(params.weights.OFM().v, block.output_block_features) * params.groups * block.feature_slm_split;
     size_t block_b = params.output.Batch().v;
 
-    auto threads = blocks_w * blocks_h * blocks_f * block_b;
+    auto threads = blocks_w * blocks_h * blocks_d * blocks_f * block_b;
     constexpr size_t max_threads_per_cu = 7;
     size_t compute_units = params.engineInfo.computeUnitsCount;
     size_t max_threads = compute_units * max_threads_per_cu;
@@ -262,17 +282,18 @@ float Convolution_kernel_b_fs_yx_fsv16_imad::EstimateOccupancy(const convolution
     return static_cast<float>(threads) / static_cast<float>(max_threads);
 }
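The occupancy heuristic simply compares the dispatched thread count against a flat budget of 7 threads per compute unit. For a hypothetical 3D case:

    // output 1 x 64 x 8 x 28 x 28 (b f z y x), OFM = 64, groups = 1,
    // block { w=7, h=2, d=1, f=32, split=1 }, 24 compute units:
    //   blocks_w = 28 / 7 = 4,  blocks_h = 28 / 2 = 14,  blocks_d = 8 / 1 = 8
    //   blocks_f = (64 / 32) * 1 * 1 = 2,  block_b = 1
    //   threads = 4 * 14 * 8 * 2 * 1 = 896;  max = 24 * 7 = 168
    //   occupancy = 896 / 168 = ~5.3  (>= 1, sufficient)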
 
-float Convolution_kernel_b_fs_yx_fsv16_imad::EstimateSLMUsage(const convolution_params& params, const BlockParams& block) const {
-    size_t slm_elements = block.output_block_width * block.output_block_height * block.output_block_features * (block.feature_slm_split - 1);
+float Convolution_kernel_b_fs_zyx_fsv16_imad::EstimateSLMUsage(const convolution_params& params, const BlockParams& block) const {
+    size_t slm_elements = block.output_block_width * block.output_block_height * block.output_block_depth *
+                          block.output_block_features * (block.feature_slm_split - 1);
     size_t slm_bytes = slm_elements * BytesPerElement(GetAccumulatorType(params));
 
-    // TODO Actual maximum slm should also depend on number of work-groups, but this is device specific
+    // TODO Actual maximum slm should also depend on number of work-groups, but this is device specific
     size_t max_slm_bytes = params.engineInfo.maxLocalMemSize;
 
     return static_cast<float>(slm_bytes) / static_cast<float>(max_slm_bytes);
 }
 
-ParamsKey Convolution_kernel_b_fs_yx_fsv16_imad::GetSupportedKey() const {
+ParamsKey Convolution_kernel_b_fs_zyx_fsv16_imad::GetSupportedKey() const {
     ParamsKey k;
     k.EnableInputDataType(Datatype::INT8);
     k.EnableInputDataType(Datatype::UINT8);
@@ -284,6 +305,9 @@ ParamsKey Convolution_kernel_b_fs_yx_fsv16_imad::GetSupportedKey() const {
 
     k.EnableInputWeightsType(WeightsType::INT8);
 
+    k.EnableInputLayout(DataLayout::b_fs_zyx_fsv16);
+    k.EnableOutputLayout(DataLayout::b_fs_zyx_fsv16);
+
     k.EnableInputLayout(DataLayout::b_fs_yx_fsv16);
     k.EnableOutputLayout(DataLayout::b_fs_yx_fsv16);
 
@@ -294,31 +318,36 @@ ParamsKey Convolution_kernel_b_fs_yx_fsv16_imad::GetSupportedKey() const {
     k.EnableBiasPerFeature();
     k.EnableNonBiasTerm();
     k.EnableBatching();
+    k.EnableGroupedConvolution();
     k.EnableQuantization(QuantizationType::SYMMETRIC);
     k.EnableDilation();
     k.DisableTuning();
     return k;
 }
 
-KernelsData Convolution_kernel_b_fs_yx_fsv16_imad::GetKernelsData(const Params& params,
+KernelsData Convolution_kernel_b_fs_zyx_fsv16_imad::GetKernelsData(const Params& params,
                                                                    const optional_params& options) const {
     return GetCommonKernelsData(params, options);
 }
 
-JitConstants Convolution_kernel_b_fs_yx_fsv16_imad::GetJitConstants(const convolution_params& params,
+JitConstants Convolution_kernel_b_fs_zyx_fsv16_imad::GetJitConstants(const convolution_params& params,
                                                                      const DispatchData& kd) const {
     auto mem_consts = Parent::GetJitConstants(params, kd);
 
     auto block_params = GetBlockParams(params);
 
     bool unroll_filter_y = block_params.output_block_height != 1;
+    bool unroll_filter_z = block_params.output_block_depth != 1;
 
     mem_consts.AddConstant(MakeJitConstant("OUT_BLOCK_WIDTH", block_params.output_block_width));
     mem_consts.AddConstant(MakeJitConstant("IN_BLOCK_WIDTH", block_params.input_block_width));
     mem_consts.AddConstant(MakeJitConstant("OUT_BLOCK_HEIGHT", block_params.output_block_height));
     mem_consts.AddConstant(MakeJitConstant("IN_BLOCK_HEIGHT", block_params.input_block_height));
+    mem_consts.AddConstant(MakeJitConstant("OUT_BLOCK_DEPTH", block_params.output_block_depth));
+    mem_consts.AddConstant(MakeJitConstant("IN_BLOCK_DEPTH", block_params.input_block_depth));
     mem_consts.AddConstant(MakeJitConstant("FILTER_SIZE_Y_UNROLL", unroll_filter_y ? params.filterSize.y : 1));
-    mem_consts.AddConstant(MakeJitConstant("OFM_BLOCKS_PER_SIMD", block_params.output_block_features / simd));
+    mem_consts.AddConstant(MakeJitConstant("FILTER_SIZE_Z_UNROLL", unroll_filter_z ? params.filterSize.z : 1));
+    mem_consts.AddConstant(MakeJitConstant("OFM_BLOCKS_PER_SIMD", static_cast<int>(std::ceil(block_params.output_block_features / simd))));
     mem_consts.AddConstant(MakeJitConstant("OFM_SIZE_PER_SIMD", block_params.output_block_features));
     mem_consts.AddConstant(MakeJitConstant("FEATURE_SLM_SPLIT", block_params.feature_slm_split));
     mem_consts.Merge(MakeTypeJitConstants(GetAccumulatorType(params), "ACCUMULATOR"));
@@ -327,7 +356,20 @@ JitConstants Convolution_kernel_b_fs_yx_fsv16_imad::GetJitConstants(const convol
     if (!params.fused_ops.empty()) {
         auto input_dt = GetActivationType(params);
         std::vector<std::string> idx_order = { "out_b", "(out_f + ofb * 16)", "(out_y + oh)", "(out_x + ow)" };
+        if (DataTensor::ChannelsCount(params.output.GetLayout()) == 5) {
+            idx_order = { "out_b", "(out_f + ofb * 16)", "(out_z + od)", "(out_y + oh)", "(out_x + ow)" };
+        }
+
         std::vector<Tensor::DataChannelName> loop_axes = { Tensor::DataChannelName::X };
+
+        if (DataTensor::ChannelsCount(params.output.GetLayout()) == 5) {
+            if (block_params.output_block_depth != 1) {
+                loop_axes.push_back(Tensor::DataChannelName::Z);
+            } else {
+                idx_order[idx_order.size() - 3] = "out_z";
+            }
+        }
+
         if (block_params.output_block_height != 1) {
             loop_axes.push_back(Tensor::DataChannelName::Y);
         } else {
@@ -349,15 +391,16 @@ JitConstants Convolution_kernel_b_fs_yx_fsv16_imad::GetJitConstants(const convol
     return mem_consts;
 }  // GetJitConstants
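For a 5-D output with an active depth block, the constants above would reach the kernel roughly as follows (block values illustrative):

    // BlockParams{ w=4, h=2, d=2, f=32, split=1 } on b_fs_zyx_fsv16:
    //   OUT_BLOCK_WIDTH=4  OUT_BLOCK_HEIGHT=2  OUT_BLOCK_DEPTH=2
    //   FILTER_SIZE_Y_UNROLL=FILTER_SIZE_Y   (height block > 1)
    //   FILTER_SIZE_Z_UNROLL=FILTER_SIZE_Z   (depth block  > 1)
    //   OFM_BLOCKS_PER_SIMD=2  OFM_SIZE_PER_SIMD=32  FEATURE_SLM_SPLIT=1
    // and the fused-ops index order becomes
    //   { "out_b", "(out_f + ofb * 16)", "(out_z + od)", "(out_y + oh)", "(out_x + ow)" }
    // with X, Z and Y all registered as loop axes.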
 
-ConvolutionKernelBase::DispatchData Convolution_kernel_b_fs_yx_fsv16_imad::SetDefault(const convolution_params& params,
+ConvolutionKernelBase::DispatchData Convolution_kernel_b_fs_zyx_fsv16_imad::SetDefault(const convolution_params& params,
                                                                            int) const {
     DispatchData kd;
     const auto& output = params.output;
+    const auto& weights = params.weights;
     auto block_params = GetBlockParams(params);
 
     kd.gws0 = CeilDiv(output.X().v, block_params.output_block_width);
-    kd.gws1 = CeilDiv(output.Y().v, block_params.output_block_height);
-    kd.gws2 = output.Batch().v * CeilDiv(output.Feature().v, block_params.output_block_features) * simd * block_params.feature_slm_split;
+    kd.gws1 = CeilDiv(output.Y().v, block_params.output_block_height) * CeilDiv(output.Z().v, block_params.output_block_depth);
+    kd.gws2 = output.Batch().v * CeilDiv(weights.OFM().v, block_params.output_block_features) * params.groups * simd * block_params.feature_slm_split;
 
     kd.lws0 = 1;
     kd.lws1 = 1;
@@ -367,16 +410,13 @@ ConvolutionKernelBase::DispatchData Convolution_kernel_b_fs_yx_fsv16_imad::SetDe
     kd.gemmStyle = {0, 0, 0, 0, 0, 0};
 
     kd.efficiency = FORCE_PRIORITY_2;
-    // TODO Optimize 1x1, because this kernel is better in most cases
-    //if (params.filterSize.x == 1 && params.filterSize.y == 1)
-    //    kd.efficiency = FORCE_PRIORITY_1;
     if (static_cast<float>(params.weights.IFM().v) / static_cast<float>(Align(params.weights.IFM().v, fsv)) < 0.5f)
         kd.efficiency = FORCE_PRIORITY_4;
 
     return kd;
 }  // SetDefault
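Since Y and Z are folded into gws1, the kernel presumably decodes them from get_global_id(1). A dispatch example for the same hypothetical shape as above (and assuming the unchanged lws2 = simd * feature_slm_split):

    // output 1 x 64 x 8 x 28 x 28, block { w=7, h=2, d=1, f=32, split=1 }, simd = 16:
    //   gws0 = CeilDiv(28, 7)                   = 4
    //   gws1 = CeilDiv(28, 2) * CeilDiv(8, 1)   = 14 * 8 = 112
    //   gws2 = 1 * CeilDiv(64, 32) * 1 * 16 * 1 = 32
    //   lws  = { 1, 1, 16 }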
 
-bool Convolution_kernel_b_fs_yx_fsv16_imad::Validate(const Params& params, const optional_params& options) const {
+bool Convolution_kernel_b_fs_zyx_fsv16_imad::Validate(const Params& params, const optional_params& options) const {
     if (!Parent::Validate(params, options)) {
         return false;
     }
@@ -384,7 +424,7 @@ bool Convolution_kernel_b_fs_yx_fsv16_imad::Validate(const Params& params, const
     KernelData kd = KernelData::Default<convolution_params>(params);
     convolution_params& newParams = *static_cast<convolution_params*>(kd.params.get());
 
-    if (newParams.groups != 1 || newParams.split != 1)
+    if (newParams.split != 1)
         return false;
 
     return true;
 
 namespace kernel_selector {
 
-class Convolution_kernel_b_fs_yx_fsv16_imad : public ConvolutionKernelBase {
+class Convolution_kernel_b_fs_zyx_fsv16_imad : public ConvolutionKernelBase {
 public:
     using Parent = ConvolutionKernelBase;
-    Convolution_kernel_b_fs_yx_fsv16_imad() : ConvolutionKernelBase("convolution_gpu_b_fs_yx_fsv16_imad") {}
-    virtual ~Convolution_kernel_b_fs_yx_fsv16_imad() {}
+    Convolution_kernel_b_fs_zyx_fsv16_imad() : ConvolutionKernelBase("convolution_gpu_b_fs_zyx_fsv16_imad") {}
+    virtual ~Convolution_kernel_b_fs_zyx_fsv16_imad() {}
 
     KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
     ParamsKey GetSupportedKey() const override;
@@ -35,8 +35,8 @@ protected:
     JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
     DispatchData SetDefault(const convolution_params& params, int autoTuneIndex = -1) const override;
     bool NeedPaddedInput() const override { return true; }
-    WeightsLayout GetPreferredWeightsLayout(const convolution_params&) const override {
-        return WeightsLayout::os_is_yx_osv16_isv16;
+    WeightsLayout GetPreferredWeightsLayout(const convolution_params& p) const override {
+        return p.groups > 1 ? WeightsLayout::g_os_is_zyx_osv16_isv16 : WeightsLayout::os_is_zyx_osv16_isv16;
     }
 
     std::vector<FusedOpType> GetSupportedFusedOps() const override {
@@ -49,10 +49,13 @@ protected:
     struct BlockParams {
         size_t output_block_width;
         size_t output_block_height;
+        size_t output_block_depth;
+
         size_t output_block_features;
 
         size_t input_block_width;
         size_t input_block_height;
+        size_t input_block_depth;
 
         size_t feature_slm_split;
     };
index e87157e..e6a954d 100644
@@ -60,7 +60,7 @@
 #include "convolution_kernel_mmad_bfyx_to_b_fs_yx_fsv32.h"
 #include "convolution_kernel_bfyx_to_bs_fs_yx_bsv16_fsv16.h"
 #include "convolution_kernel_b_fs_yx_fsv16_imad_1x1.h"
-#include "convolution_kernel_b_fs_yx_fsv16_imad.h"
+#include "convolution_kernel_b_fs_zyx_fsv16_imad.h"
 #include "convolution_kernel_b_fs_yx_fsv_16_32_imad_dw.hpp"
 #include "convolution_kernel_imad_bs_fs_yx_bsv16_fsv16_1x1.h"
 #include "convolution_kernel_imad_bs_fs_yx_bsv16_fsv16_3x3.h"
@@ -71,9 +71,9 @@ convolution_kernel_selector::convolution_kernel_selector() {
     Attach<ConvolutionKernel_Ref>();
     Attach<DeformableConvolutionKernel_bfyx_Ref>();
 
-    // b_fs_yx_fsv16 int8
+    // b_fs_yx_fsv16 and b_fs_zyx_fsv16 int8
     Attach<Convolution_kernel_b_fs_yx_fsv16_imad_1x1>();
-    Attach<Convolution_kernel_b_fs_yx_fsv16_imad>();
+    Attach<Convolution_kernel_b_fs_zyx_fsv16_imad>();
 
     // b_fs_yx_fsv16 and b_fs_zyx_fsv16
     Attach<ConvolutionKernel_b_fs_yx_fsv16_depthwise>();
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "pooling_kernel_gpu_b_fs_yx_fsv16_imad.h"
+#include "pooling_kernel_gpu_b_fs_zyx_fsv16_imad.h"
 #include "kernel_selector_utils.h"
 
 #define FEATURE_SLICE_SIZE 16
 
 namespace kernel_selector {
-ParamsKey PoolingKernelGPU_b_fs_yx_fsv16_imad::GetSupportedKey() const {
+ParamsKey PoolingKernelGPU_b_fs_zyx_fsv16_imad::GetSupportedKey() const {
     ParamsKey k;
     k.EnableInputDataType(Datatype::INT8);
     k.EnableInputDataType(Datatype::UINT8);
@@ -27,6 +27,8 @@ ParamsKey PoolingKernelGPU_b_fs_yx_fsv16_imad::GetSupportedKey() const {
     k.EnableOutputDataType(Datatype::F32);
     k.EnableInputLayout(DataLayout::b_fs_yx_fsv16);
     k.EnableOutputLayout(DataLayout::b_fs_yx_fsv16);
+    k.EnableInputLayout(DataLayout::b_fs_zyx_fsv16);
+    k.EnableOutputLayout(DataLayout::b_fs_zyx_fsv16);
     k.EnableTensorOffset();
     k.EnableTensorPitches();
     k.EnableBatching();
@@ -41,17 +43,18 @@ ParamsKey PoolingKernelGPU_b_fs_yx_fsv16_imad::GetSupportedKey() const {
     return k;
 }
 
-PoolingKernelBase::DispatchData PoolingKernelGPU_b_fs_yx_fsv16_imad::SetDefault(const pooling_params& params) const {
+PoolingKernelBase::DispatchData PoolingKernelGPU_b_fs_zyx_fsv16_imad::SetDefault(const pooling_params& params) const {
     DispatchData runInfo = PoolingKernelBase::SetDefault(params);
 
     const auto& out = params.output;
     auto x = out.X().v;
     auto y = out.Y().v;
+    auto z = out.Z().v;
     auto f = out.Feature().v;
     auto b = out.Batch().v;
 
     runInfo.gws0 = x;
-    runInfo.gws1 = y;
+    runInfo.gws1 = y * z;
     // we use the b_fs_[z]yx_fsv16 format and process 16 features per work item
     runInfo.gws2 = CeilDiv(f, FEATURE_SLICE_SIZE) * b;
 
@@ -64,19 +67,24 @@ PoolingKernelBase::DispatchData PoolingKernelGPU_b_fs_yx_fsv16_imad::SetDefault(
     return runInfo;
 }
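As in the convolution kernel, Y and Z share gws1. For a hypothetical 5-D output of 1 x 32 x 4 x 14 x 14 (b f z y x):

    //   gws0 = 14
    //   gws1 = 14 * 4 = 56
    //   gws2 = CeilDiv(32, FEATURE_SLICE_SIZE) * 1 = 2
    // each work item still covers one 16-feature slice at a single (x, y, z) point.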
 
-JitConstants PoolingKernelGPU_b_fs_yx_fsv16_imad::GetJitConstants(const pooling_params& params, DispatchData kd) const {
+JitConstants PoolingKernelGPU_b_fs_zyx_fsv16_imad::GetJitConstants(const pooling_params& params, DispatchData kd) const {
     auto jit = PoolingKernelBase::GetJitConstants(params, kd);
 
     const size_t in_x_pitch = FEATURE_SLICE_SIZE;
     const size_t in_y_pitch = FEATURE_SLICE_SIZE * params.inputs[0].X().LogicalDimPadded();
+    const size_t in_z_pitch = FEATURE_SLICE_SIZE * params.inputs[0].Y().LogicalDimPadded() * params.inputs[0].X().LogicalDimPadded();
     jit.AddConstant(MakeJitConstant("IN_X_PITCH", in_x_pitch));
     jit.AddConstant(MakeJitConstant("IN_Y_PITCH", in_y_pitch));
+    jit.AddConstant(MakeJitConstant("IN_Z_PITCH", in_z_pitch));
     jit.Merge(MakeTypeJitConstants(GetActivationType(params), "ACTIVATION"));
     jit.Merge(MakeTypeJitConstants(GetAccumulatorType(params), "ACCUMULATOR"));
 
     if (!params.fused_ops.empty()) {
         auto input_dt = EnableRound(params) ? Datatype::INT32 : GetActivationType(params);
         FusedOpsConfiguration conf = {"", {"b", "(f+i)", "y", "x"}, "pool_result[i]", input_dt, 1};
+        if (DataTensor::ChannelsCount(params.output.GetLayout()) == 5) {
+            conf = {"", {"b", "(f+i)", "z", "y", "x"}, "pool_result[i]", input_dt, 1 };
+        }
         conf.SetLoopAxes({ Tensor::DataChannelName::FEATURE }, true);
         jit.Merge(MakeFusedOpsJitConstants(params, { conf }));
     }
@@ -84,19 +92,11 @@ JitConstants PoolingKernelGPU_b_fs_yx_fsv16_imad::GetJitConstants(const pooling_
     return jit;
 }
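The pitches are element counts in the fsv16 physical layout, so every step spans a full 16-feature slice. Assuming an input whose x and y extents are padded to 30:

    //   IN_X_PITCH = 16                    // one x step  = one fsv16 slice
    //   IN_Y_PITCH = 16 * 30      = 480    // one y row   of padded x
    //   IN_Z_PITCH = 16 * 30 * 30 = 14400  // one z plane of padded x*y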
 
-KernelsData PoolingKernelGPU_b_fs_yx_fsv16_imad::GetKernelsData(const Params& params, const optional_params& options) const {
+KernelsData PoolingKernelGPU_b_fs_zyx_fsv16_imad::GetKernelsData(const Params& params, const optional_params& options) const {
     return GetCommonKernelsData(params, options, FORCE_PRIORITY_1);
 }
 
-bool PoolingKernelGPU_b_fs_yx_fsv16_imad::Validate(const Params& params, const optional_params& options) const {
-    if (!PoolingKernelBase::Validate(params, options)) {
-        return false;
-    }
-    auto p = dynamic_cast<const pooling_params&>(params);
-
-    if (p.inputs[0].Feature().v % FEATURE_SLICE_SIZE != 0)
-        return false;
-
-    return true;
+bool PoolingKernelGPU_b_fs_zyx_fsv16_imad::Validate(const Params& params, const optional_params& options) const {
+    return PoolingKernelBase::Validate(params, options);
 }
 }  // namespace kernel_selector
 #include <vector>
 
 namespace kernel_selector {
-class PoolingKernelGPU_b_fs_yx_fsv16_imad: public PoolingKernelBase{
+class PoolingKernelGPU_b_fs_zyx_fsv16_imad : public PoolingKernelBase {
 public:
-    PoolingKernelGPU_b_fs_yx_fsv16_imad() : PoolingKernelBase("pooling_gpu_b_fs_yx_fsv16_imad") {}
-    virtual ~PoolingKernelGPU_b_fs_yx_fsv16_imad() {}
+    PoolingKernelGPU_b_fs_zyx_fsv16_imad() : PoolingKernelBase("pooling_gpu_b_fs_zyx_fsv16_imad") {}
+    virtual ~PoolingKernelGPU_b_fs_zyx_fsv16_imad() {}
 
     KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
     ParamsKey GetSupportedKey() const override;
index 11ae5cf..27305ac 100644
@@ -24,7 +24,7 @@
 #include "pooling_kernel_gpu_fs_b_yx_fsv32.h"
 #include "pooling_kernel_gpu_b_fs_yx_fsv16.h"
 #include "pooling_kernel_gpu_bsv16_fsv16.h"
-#include "pooling_kernel_gpu_b_fs_yx_fsv16_imad.h"
+#include "pooling_kernel_gpu_b_fs_zyx_fsv16_imad.h"
 #include "pooling_kernel_gpu_bs_fs_yx_bsv16_fsv16.h"
 
 namespace kernel_selector {
@@ -40,7 +40,7 @@ pooling_kernel_selector::pooling_kernel_selector() {
     Attach<PoolingKerneGPU_fs_b_yx_fsv32>();
     Attach<PoolingKernel_b_fs_yx_fsv16>();
     Attach<PoolingKernel_bsv16_fsv16>();
-    Attach<PoolingKernelGPU_b_fs_yx_fsv16_imad>();
+    Attach<PoolingKernelGPU_b_fs_zyx_fsv16_imad>();
     Attach<Pooling_kernel_gpu_bs_fs_yx_bsv_16_fsv16>();
 }
 
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_b_fs_yx_fsv16_imad.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_b_fs_yx_fsv16_imad.cl
deleted file mode 100644
index 25961dc..0000000
+++ /dev/null
@@ -1,390 +0,0 @@
-// Copyright (c) 2020 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "include/common.cl"
-#include "include/fetch.cl"
-#include "include/imad.cl"
-#include "include/mmad.cl"
-#include "include/data_types.cl"
-
-#define AS_TYPE_N_(type, n, x) as_##type##n(x)
-#define AS_TYPE_N(type, n, x) AS_TYPE_N_(type, n, x)
-#define AS_INPUT0_TYPE_4(x) AS_TYPE_N(INPUT0_TYPE, 4, x)
-
-#define AS_FILTER_TYPE_4(x) AS_TYPE_N(FILTER_TYPE, 4, x)
-
-#define CEIL_DIV(a, b) (((a) + (b) - 1)/(b))
-#define ALIGN(a, b) (CEIL_DIV(a, b) * (b))
-
-#define SIMD 16
-#define FSV 16
-
-// int8 conv_input and weights data is packed to int32 "batches",
-// int/uint pointers here instead of INPUT0_TYPE/FILTER_TYPE for convenience
-__attribute__((intel_reqd_sub_group_size(SIMD)))
-__attribute__((reqd_work_group_size(1, 1, FEATURE_SLM_SPLIT * SIMD)))
-KERNEL(convolution_gpu_b_fs_yx_fsv16_imad)(
-    const __global INPUT0_TYPE *conv_input,
-    __global OUTPUT_TYPE *output,
-    const __global FILTER_TYPE *weights,
-#if BIAS_TERM
-    const __global BIAS_TYPE *biases,
-#endif
-#if HAS_FUSED_OPS_DECLS
-    FUSED_OPS_DECLS,
-#endif
-    uint split_idx) {
-
-    #define LUT_VALUE_CLAMP(x) (( (IN_BLOCK_WIDTH % SIMD == 0) || ((x) < IN_BLOCK_WIDTH % SIMD) ) ? (x) : 0)
-    const int tmp = LUT_VALUE_CLAMP(get_sub_group_local_id());
-    #undef LUT_VALUE_CLAMP
-
-    const uint out_x = (uint)get_global_id(0) * OUT_BLOCK_WIDTH;
-    const uint out_y = (uint)get_global_id(1) * OUT_BLOCK_HEIGHT;
-    const uint out_b = (uint)(get_group_id(2) * OFM_SIZE_PER_SIMD) / ALIGN(OUTPUT_FEATURE_NUM, OFM_SIZE_PER_SIMD);
-    uint out_fg = (uint)(get_group_id(2) * OFM_SIZE_PER_SIMD) % ALIGN(OUTPUT_FEATURE_NUM, OFM_SIZE_PER_SIMD);
-    uint out_f = out_fg + get_sub_group_local_id();
-
-    const int input_x = out_x * STRIDE_SIZE_X - PADDING_SIZE_X;
-    const int input_y = out_y * STRIDE_SIZE_Y - PADDING_SIZE_Y;
-
-#if FEATURE_SLM_SPLIT == 1
-    const uint k_start = 0;
-#else
-    const uint k_start = get_sub_group_id() * FSV;
-#endif
-
-    uint filter_idx  = GET_FILTER_OS_IS_YX_OSV16_ISV16_INDEX(FILTER, out_f, k_start, 0, 0);
-    const uint filter_idx_diff = (ALIGN(FILTER_IFM_NUM, 16) * FILTER_SIZE_X * FILTER_SIZE_Y * 16);
-
-    uint input_start_idx = INPUT0_GET_INDEX(out_b, k_start, input_y, input_x);
-
-    ACCUMULATOR_TYPE dotProd[OFM_BLOCKS_PER_SIMD][OUT_BLOCK_HEIGHT][OUT_BLOCK_WIDTH] = { };
-    uint4 input_val[IN_BLOCK_HEIGHT][CEIL_DIV(IN_BLOCK_WIDTH, SIMD)];
-
-    __attribute__((opencl_unroll_hint(1)))
-    for (uint k = 0; k < CEIL_DIV(INPUT0_FEATURE_NUM, 16) / FEATURE_SLM_SPLIT; k++) {
-        __attribute__((opencl_unroll_hint(1)))
-        for (uint fyn = 0; fyn < FILTER_SIZE_Y / FILTER_SIZE_Y_UNROLL; fyn++) {
-            // Load input block IN_BLOCK_HEIGHT x IN_BLOCK_WIDTH, scattering width along sub-group
-            __attribute__((opencl_unroll_hint))
-            for (uint iyb = 0; iyb < IN_BLOCK_HEIGHT; ++iyb) {
-                __attribute__((opencl_unroll_hint))
-                for (uint ixb = 0; ixb < CEIL_DIV(IN_BLOCK_WIDTH, SIMD); ++ixb) {
-                    uint input_idx = input_start_idx + iyb * INPUT0_Y_PITCH * FSV + ixb * SIMD * FSV;
-                    if (ixb != CEIL_DIV(IN_BLOCK_WIDTH, SIMD) - 1) {
-                        input_val[iyb][ixb] = vload4(0, (__global uint *)(conv_input + input_idx + get_sub_group_local_id() * 16));
-                    } else {
-                        input_val[iyb][ixb] = vload4(0, (__global uint*)(conv_input + input_idx + tmp * 16));
-                    }
-                }
-            }
-
-            __attribute__((opencl_unroll_hint))
-            for (uint fyu = 0; fyu < FILTER_SIZE_Y_UNROLL; ++fyu) {
-                __attribute__((opencl_unroll_hint(FILTER_SIZE_X)))
-                for (uint fx = 0; fx < FILTER_SIZE_X; fx++) {
-
-                    uint4 weights_val[OFM_BLOCKS_PER_SIMD];
-                    __attribute__((opencl_unroll_hint))
-                    for (uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
-                        weights_val[ofb] = vload4(0, (__global uint *)(weights + filter_idx + ofb * filter_idx_diff));
-                    }
-
-                    __attribute__((opencl_unroll_hint))
-                    for (uint ive = 0; ive < 4; ive++) {
-                        __attribute__((opencl_unroll_hint))
-                        for (uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
-                            __attribute__((opencl_unroll_hint(OUT_BLOCK_HEIGHT)))
-                            for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
-                                __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
-                                for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ow++) {
-                                    const uint ow_offset = ow + OUT_BLOCK_WIDTH;
-                                    const uint y_block_idx = oh * STRIDE_SIZE_Y + fyu * DILATION_SIZE_Y;
-                                    const uint x_block_idx = ow * STRIDE_SIZE_X + fx * DILATION_SIZE_X;
-                                    const uint shuffle_wi = x_block_idx % SIMD;
-                                    const uint shuffle_idx = x_block_idx / SIMD;
-
-                                    dotProd[ofb][oh][ow] = TO_ACCUMULATOR_TYPE(
-                                        IMAD(dotProd[ofb][oh][ow],
-                                        AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val[y_block_idx][shuffle_idx][ive], shuffle_wi)),
-                                        AS_FILTER_TYPE_4(weights_val[ofb][ive])));
-                                }
-                            }
-                        }
-                    }
-
-                    filter_idx += FSV * FSV;
-                }
-            }
-            input_start_idx += DILATION_SIZE_Y * INPUT0_Y_PITCH * FSV;
-        }
-        input_start_idx += INPUT0_FEATURE_PITCH * FSV * FEATURE_SLM_SPLIT - (FILTER_SIZE_Y / FILTER_SIZE_Y_UNROLL) * DILATION_SIZE_Y * INPUT0_Y_PITCH * FSV;
-
-        filter_idx += FSV * FSV * FILTER_SIZE_X * FILTER_SIZE_Y * (FEATURE_SLM_SPLIT - 1);
-    }
-
-#if FEATURE_SLM_SPLIT != 1
-    // Additional local memory reduction for feature split mode
-#   if FEATURE_SLM_SPLIT < OFM_BLOCKS_PER_SIMD
-#   error convolution_gpu_b_fs_yx_fsv16_imad.cl - OFM_BLOCKS_PER_SIMD must be less or equal to FEATURE_SLM_SPLIT
-#   endif
-
-    const uint partial_acc_size = (FEATURE_SLM_SPLIT - 1) * OFM_SIZE_PER_SIMD * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH;
-    __local ACCUMULATOR_TYPE partial_acc[partial_acc_size];
-
-    uint sgid_start_idx = get_sub_group_id();
-    sgid_start_idx = sgid_start_idx == 0 ? 0 : sgid_start_idx - 1;
-    __local ACCUMULATOR_TYPE* partial_acc_ptr = partial_acc + sgid_start_idx * OFM_SIZE_PER_SIMD * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH
-                                                              + get_sub_group_local_id();
-
-    if (get_sub_group_id() < OFM_BLOCKS_PER_SIMD) {
-        __attribute__((opencl_unroll_hint))
-        for (uint wg = 0; wg < OFM_BLOCKS_PER_SIMD; ++wg) {
-            if (get_sub_group_id() == wg) {
-                __attribute__((opencl_unroll_hint))
-                for (uint ofb = 0; ofb < wg; ++ofb) {
-                    __attribute__((opencl_unroll_hint))
-                    for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
-                        __attribute__((opencl_unroll_hint))
-                        for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
-                            const uint partial_acc_ptr_idx =
-                                ofb * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH * SIMD +
-                                oh * OUT_BLOCK_WIDTH * SIMD +
-                                ow * SIMD;
-                            partial_acc_ptr[partial_acc_ptr_idx] = dotProd[ofb][oh][ow];
-                        }
-                    }
-                }
-                __attribute__((opencl_unroll_hint))
-                for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
-                    __attribute__((opencl_unroll_hint))
-                    for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
-                        dotProd[0][oh][ow] = dotProd[wg][oh][ow];
-                    }
-                }
-                __attribute__((opencl_unroll_hint))
-                for (uint ofb = wg + 1; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
-                    __attribute__((opencl_unroll_hint))
-                    for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
-                        __attribute__((opencl_unroll_hint))
-                        for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
-                            const uint partial_acc_ptr_idx =
-                                ((wg != 0) ? OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * OFM_SIZE_PER_SIMD : 0) +
-                                ofb * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH * SIMD +
-                                oh * OUT_BLOCK_WIDTH * SIMD +
-                                ow * SIMD;
-                            partial_acc_ptr[partial_acc_ptr_idx] = dotProd[ofb][oh][ow];
-                        }
-                    }
-                }
-            }
-        }
-    } else {
-        __attribute__((opencl_unroll_hint))
-        for (uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
-            __attribute__((opencl_unroll_hint))
-            for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
-                __attribute__((opencl_unroll_hint))
-                for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
-                    const uint partial_acc_ptr_idx =
-                        ofb * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH * SIMD +
-                        oh * OUT_BLOCK_WIDTH * SIMD +
-                        ow * SIMD;
-                    partial_acc_ptr[partial_acc_ptr_idx] = dotProd[ofb][oh][ow];
-                }
-            }
-        }
-    }
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if (get_sub_group_id() >= OFM_BLOCKS_PER_SIMD)
-        return;
-
-    partial_acc_ptr = partial_acc + get_sub_group_id() * OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * SIMD + get_sub_group_local_id();
-    __attribute__((opencl_unroll_hint))
-    for (uint wg = 0; wg < FEATURE_SLM_SPLIT - 1; ++wg) {
-        __attribute__((opencl_unroll_hint))
-        for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
-            __attribute__((opencl_unroll_hint))
-            for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
-                const uint partial_acc_ptr_idx =
-                    wg * OFM_SIZE_PER_SIMD * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH +
-                    oh * OUT_BLOCK_WIDTH * SIMD +
-                    ow * SIMD;
-                dotProd[0][oh][ow] += partial_acc_ptr[partial_acc_ptr_idx];
-            }
-        }
-    }
-#endif
-
-#if FEATURE_SLM_SPLIT == 1
-#   define OFM_VALUES_PER_WI (OFM_BLOCKS_PER_SIMD)
-#else
-#   define OFM_VALUES_PER_WI 1
-    out_f += get_sub_group_id() * SIMD;
-    out_fg += get_sub_group_id() * SIMD;
-#endif
-
-#if BIAS_TERM
-    BIAS_TYPE bias[OFM_VALUES_PER_WI];
-    __attribute__((opencl_unroll_hint))
-    for (uint ofb = 0; ofb < OFM_VALUES_PER_WI; ++ofb) {
-        bias[ofb] = biases[out_f + ofb * SIMD];
-    }
-#endif
-
-    ACTIVATION_TYPE dequantized[OFM_VALUES_PER_WI][OUT_BLOCK_HEIGHT][OUT_BLOCK_WIDTH];
-    __attribute__((opencl_unroll_hint))
-    for (uint ofb = 0; ofb < OFM_VALUES_PER_WI; ++ofb) {
-        __attribute__((opencl_unroll_hint))
-        for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
-            __attribute__((opencl_unroll_hint))
-            for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
-                dequantized[ofb][oh][ow] = TO_ACTIVATION_TYPE(dotProd[ofb][oh][ow]);
-#if BIAS_TERM
-                dequantized[ofb][oh][ow] += bias[ofb];
-#endif
-            }
-        }
-    }
-
-    OUTPUT_TYPE result[OFM_VALUES_PER_WI][OUT_BLOCK_HEIGHT][OUT_BLOCK_WIDTH];
-    __attribute__((opencl_unroll_hint))
-    for (uint ofb = 0; ofb < OFM_VALUES_PER_WI; ++ofb) {
-#if HAS_FUSED_OPS && FUSED_OPS_CAN_USE_PRELOAD_SCALAR
-        FUSED_OPS_PRELOAD_SCALAR;
-#endif
-        __attribute__((opencl_unroll_hint))
-        for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
-            __attribute__((opencl_unroll_hint))
-            for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
-                ACTIVATION_TYPE dequantized_val = dequantized[ofb][oh][ow];
-#if HAS_FUSED_OPS
-#   if FUSED_OPS_CAN_USE_PRELOAD_SCALAR
-                FUSED_OPS_CALC_SCALAR;
-#   else
-                FUSED_OPS_SCALAR;
-#   endif
-                result[ofb][oh][ow] = FUSED_OPS_RESULT_SCALAR;
-#else
-                result[ofb][oh][ow] = TO_OUTPUT_TYPE(dequantized_val);
-#endif
-            }
-        }
-    }
-
-    uint dst_index = OUTPUT_GET_INDEX(out_b, out_fg, out_y, out_x);
-
-    if ((OUTPUT_SIZE_X % OUT_BLOCK_WIDTH == 0 || out_x + OUT_BLOCK_WIDTH <= OUTPUT_SIZE_X)
-        && (OUTPUT_FEATURE_NUM % OFM_BLOCKS_PER_SIMD == 0) ) {
-        __attribute__((opencl_unroll_hint(OFM_VALUES_PER_WI)))
-        for (uint ofb = 0; ofb < OFM_VALUES_PER_WI; ofb++) {
-            bool good_of_block = (CEIL_DIV(OUTPUT_FEATURE_NUM, SIMD) % OFM_BLOCKS_PER_SIMD == 0) || (out_fg + ofb * SIMD <= OUTPUT_FEATURE_NUM);
-            if (good_of_block) {
-                __attribute__((opencl_unroll_hint))
-                for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
-                    bool good_y = (OUTPUT_SIZE_Y % OUT_BLOCK_HEIGHT == 0) || (out_y + oh < OUTPUT_SIZE_Y);
-                    if (good_y) {
-                        uint ow = 0;
-                    #if OUTPUT_TYPE_SIZE == 1
-                        __attribute__((opencl_unroll_hint))
-                        for (; ow + 8 <= OUT_BLOCK_WIDTH; ow += 8) {
-                            MAKE_VECTOR_TYPE(OUTPUT_TYPE, 8) result_val;
-                            __attribute__((opencl_unroll_hint))
-                            for (uint i = 0; i < 8; ++i) {
-                                result_val[i] = result[ofb][oh][ow + i];
-                            }
-                            DT_OUTPUT_BLOCK_WRITE8(output, dst_index, result_val);
-                            dst_index += 8 * SIMD;
-                        }
-                    #endif
-                    #if OUTPUT_TYPE_SIZE <= 2
-                        __attribute__((opencl_unroll_hint))
-                        for (; ow + 4 <= OUT_BLOCK_WIDTH; ow += 4) {
-                            MAKE_VECTOR_TYPE(OUTPUT_TYPE, 4) result_val;
-                            __attribute__((opencl_unroll_hint))
-                            for (uint i = 0; i < 4; ++i) {
-                                result_val[i] = result[ofb][oh][ow + i];
-                            }
-                            DT_OUTPUT_BLOCK_WRITE4(output, dst_index, result_val);
-                            dst_index += 4 * SIMD;
-                        }
-                    #endif
-
-                        __attribute__((opencl_unroll_hint))
-                        for (; ow + 2 <= OUT_BLOCK_WIDTH; ow += 2) {
-                            MAKE_VECTOR_TYPE(OUTPUT_TYPE, 2) result_val;
-                            __attribute__((opencl_unroll_hint))
-                            for (uint i = 0; i < 2; ++i) {
-                                result_val[i] = result[ofb][oh][ow + i];
-                            }
-                            DT_OUTPUT_BLOCK_WRITE2(output, dst_index, result_val);
-                            dst_index += 2 * SIMD;
-                        }
-
-                        if (OUT_BLOCK_WIDTH % 2 == 1) {
-                            OUTPUT_TYPE result_val = result[ofb][oh][ow];
-                            DT_OUTPUT_BLOCK_WRITE(output, dst_index, result_val);
-                            dst_index += 1 * SIMD;
-                        }
-                    }  // if (good_y)
-                    dst_index += OUTPUT_Y_PITCH * FSV - OUT_BLOCK_WIDTH * FSV;
-                }  // for (OUT_BLOCK_HEIGHT)
-            }  // if (good_of_block)
-            dst_index += OUTPUT_FEATURE_PITCH * FSV - OUTPUT_Y_PITCH * FSV * OUT_BLOCK_HEIGHT;
-        }  // for (OFM_VALUES_PER_WI)
-    } else {
-        __attribute__((opencl_unroll_hint(OFM_VALUES_PER_WI)))
-        for (uint ofb = 0; ofb < OFM_VALUES_PER_WI; ofb++) {
-            bool good_of_block = (CEIL_DIV(OUTPUT_FEATURE_NUM, SIMD) % OFM_BLOCKS_PER_SIMD == 0) || (out_fg + ofb * SIMD <= OUTPUT_FEATURE_NUM);
-            if (good_of_block) {
-                const uint dst_index = OUTPUT_GET_INDEX(out_b, out_f + ofb * SIMD, out_y, out_x);
-                __attribute__((opencl_unroll_hint))
-                for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
-                    bool good_y = (OUTPUT_SIZE_Y % OUT_BLOCK_HEIGHT == 0) || (out_y + oh < OUTPUT_SIZE_Y);
-                    if (good_y) {
-                        __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
-                        for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ow++) {
-
-#if OUTPUT_SIZE_X % OUT_BLOCK_WIDTH != 0
-                            if (out_x + OUT_BLOCK_WIDTH > OUTPUT_SIZE_X && ow >= OUTPUT_SIZE_X % OUT_BLOCK_WIDTH)
-                                break;
-#endif
-
-#if OUTPUT_FEATURE_NUM % SIMD != 0
-                            if (out_fg + (ofb + 1) * SIMD >= OUTPUT_FEATURE_NUM && get_sub_group_local_id() >= OUTPUT_FEATURE_NUM % SIMD)
-                                result[ofb][oh][ow] = (OUTPUT_TYPE)0;
-#endif
-                            output[dst_index + ow * FSV + oh * OUTPUT_Y_PITCH * FSV] = result[ofb][oh][ow];
-                        }
-                    }
-                }
-            }
-        }
-    }
-}
-
-#undef AS_INPUT0_TYPE_4
-#undef AS_TYPE_N
-#undef AS_TYPE_N_
-#undef AS_FILTER_TYPE_4
-
-#undef CEIL_DIV
-#undef ALIGN
-
-#undef SIMD
-#undef FSV
-#undef OFM_VALUES_PER_WI
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_b_fs_zyx_fsv16_imad.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_b_fs_zyx_fsv16_imad.cl
new file mode 100644 (file)
index 0000000..3a98477
--- /dev/null
@@ -0,0 +1,516 @@
+// Copyright (c) 2020 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "include/common.cl"
+#include "include/fetch.cl"
+#include "include/imad.cl"
+#include "include/mmad.cl"
+#include "include/data_types.cl"
+
+#define AS_TYPE_N_(type, n, x) as_##type##n(x)
+#define AS_TYPE_N(type, n, x) AS_TYPE_N_(type, n, x)
+#define AS_INPUT0_TYPE_4(x) AS_TYPE_N(INPUT0_TYPE, 4, x)
+
+#define AS_FILTER_TYPE_4(x) AS_TYPE_N(FILTER_TYPE, 4, x)
+
+#define CEIL_DIV(a, b) (((a) + (b) - 1)/(b))
+#define ALIGN(a, b) (CEIL_DIV(a, b) * (b))
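+// Example: with FSV == 16, CEIL_DIV(20, 16) == 2 and ALIGN(20, 16) == 32,
+// i.e. 20 features occupy two feature slices padded out to 32 slots.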
+
+#define SIMD 16
+#define FSV 16
+
+// int8 conv_input and weights are packed into int32 "batches";
+// int/uint pointers are used here instead of INPUT0_TYPE/FILTER_TYPE for convenience
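+// Illustration (a sketch, not part of the kernel): one IMAD over a packed int
+// consumes four int8 pairs at once, i.e.
+//   acc += (int)a[0]*(int)b[0] + (int)a[1]*(int)b[1]
+//        + (int)a[2]*(int)b[2] + (int)a[3]*(int)b[3];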
+__attribute__((intel_reqd_sub_group_size(SIMD)))
+__attribute__((reqd_work_group_size(1, 1, FEATURE_SLM_SPLIT * SIMD)))
+KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
+    const __global INPUT0_TYPE *conv_input,
+    __global OUTPUT_TYPE *output,
+    const __global FILTER_TYPE *weights,
+#if BIAS_TERM
+    const __global BIAS_TYPE *biases,
+#endif
+#if HAS_FUSED_OPS_DECLS
+    FUSED_OPS_DECLS,
+#endif
+    uint split_idx) {
+
+    #define LUT_VALUE_CLAMP(x) (( (IN_BLOCK_WIDTH % SIMD == 0) || ((x) < IN_BLOCK_WIDTH % SIMD) ) ? (x) : 0)
+    const int tmp = LUT_VALUE_CLAMP(get_sub_group_local_id());
+    #undef LUT_VALUE_CLAMP
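+    // tmp clamps this lane's x offset for the last, possibly partial, input
+    // block: lanes past IN_BLOCK_WIDTH % SIMD re-read offset 0 instead of
+    // stepping outside the row; their values are never consumed by the shuffles.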
+
+    const uint out_x = (uint)get_global_id(0) * OUT_BLOCK_WIDTH;
+    const uint out_y = ((uint)get_global_id(1) / ALIGN(OUTPUT_SIZE_Z, OUT_BLOCK_DEPTH)) * OUT_BLOCK_HEIGHT;
+#if INPUT0_DIMS == 4
+    const uint out_z = 0;
+#else
+    const uint out_z = ((uint)get_global_id(1) % ALIGN(OUTPUT_SIZE_Z, OUT_BLOCK_DEPTH)) * OUT_BLOCK_DEPTH;
+#endif
+    const uint out_b = (uint)(get_group_id(2) / CEIL_DIV(FILTER_OFM_NUM, OFM_SIZE_PER_SIMD)) / FILTER_GROUPS_NUM;
+    const uint g = (uint)(get_group_id(2) / CEIL_DIV(FILTER_OFM_NUM, OFM_SIZE_PER_SIMD)) % FILTER_GROUPS_NUM;
+    uint out_f_sg = (uint)(get_group_id(2) * OFM_SIZE_PER_SIMD) % (ALIGN(FILTER_OFM_NUM, OFM_SIZE_PER_SIMD) * FILTER_GROUPS_NUM);
+    uint out_f = out_f_sg + get_sub_group_local_id();
+    uint out_f_g = (out_f % ALIGN(FILTER_OFM_NUM, OFM_SIZE_PER_SIMD));
+#if FILTER_OFM_NUM % SIMD != 0
+    out_f = out_f - (out_f / ALIGN(FILTER_OFM_NUM, SIMD)) * (SIMD - (FILTER_OFM_NUM % SIMD));
+#endif
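+    // Example: with FILTER_OFM_NUM == 24 and SIMD == 16 the aligned index 32
+    // (start of the next group's filters) is pulled back by 16 - 24 % 16 == 8
+    // to the real offset 24; indices inside the first aligned block are unchanged.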
+
+    const int input_x = out_x * STRIDE_SIZE_X - PADDING_SIZE_X;
+    const int input_y = out_y * STRIDE_SIZE_Y - PADDING_SIZE_Y;
+    const int input_z = out_z * STRIDE_SIZE_Z - PADDING_SIZE_Z;
+
+#if FEATURE_SLM_SPLIT == 1
+    const uint k_start = 0;
+#else
+    const uint k_start = get_sub_group_id() * FSV;
+#endif
+
+    uint filter_idx  = GET_FILTER_G_OS_IS_ZYX_OSV16_ISV16_INDEX(FILTER, g, out_f_g, k_start, 0, 0, 0);
+    const uint filter_idx_diff = (ALIGN(FILTER_IFM_NUM, FSV) * FILTER_SIZE_X * FILTER_SIZE_Y * FILTER_SIZE_Z * FSV);
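+    // filter_idx_diff is the pitch between consecutive osv16 output-feature
+    // slices, so the weights for block ofb are read from
+    // filter_idx + ofb * filter_idx_diff (see the weights_val loads below).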
+
+#if INPUT0_DIMS == 4
+    uint input_start_idx = INPUT0_GET_INDEX(out_b, g * FILTER_IFM_NUM + k_start, input_y, input_x);
+#else
+    uint input_start_idx = INPUT0_GET_INDEX(out_b, g * FILTER_IFM_NUM + k_start, input_z, input_y, input_x);
+#endif
+
+    ACCUMULATOR_TYPE dotProd[OFM_BLOCKS_PER_SIMD][OUT_BLOCK_DEPTH][OUT_BLOCK_HEIGHT][OUT_BLOCK_WIDTH] = { };
+#if ((FILTER_GROUPS_NUM > 1) && (FILTER_IFM_NUM % FSV != 0))
+    uint in_f_offset = (g * FILTER_IFM_NUM) % FSV;
+#endif
+
+    uint4 input_val[IN_BLOCK_DEPTH][IN_BLOCK_HEIGHT][CEIL_DIV(IN_BLOCK_WIDTH, SIMD)];
+
+    __attribute__((opencl_unroll_hint(1)))
+    for (uint k = 0; k < CEIL_DIV(FILTER_IFM_NUM, FSV) / FEATURE_SLM_SPLIT; k++) {
+        __attribute__((opencl_unroll_hint(1)))
+        for (uint fzn = 0; fzn < FILTER_SIZE_Z / FILTER_SIZE_Z_UNROLL; fzn++) {
+            __attribute__((opencl_unroll_hint(1)))
+            for (uint fyn = 0; fyn < FILTER_SIZE_Y / FILTER_SIZE_Y_UNROLL; fyn++) {
+                // Load input block IN_BLOCK_DEPTH x IN_BLOCK_HEIGHT x IN_BLOCK_WIDTH, scattering width along sub-group
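+                // Each lane holds one x position as a uint4 (16 int8 features);
+                // positions needed by neighbouring output pixels are later picked
+                // with intel_sub_group_shuffle instead of being re-loaded.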
+                __attribute__((opencl_unroll_hint))
+                for (uint izb = 0; izb < IN_BLOCK_DEPTH; ++izb) {
+                    __attribute__((opencl_unroll_hint))
+                    for (uint iyb = 0; iyb < IN_BLOCK_HEIGHT; ++iyb) {
+                        __attribute__((opencl_unroll_hint))
+                        for (uint ixb = 0; ixb < CEIL_DIV(IN_BLOCK_WIDTH, SIMD); ++ixb) {
+                            uint input_idx = input_start_idx + izb * INPUT0_Z_PITCH * FSV + iyb * INPUT0_Y_PITCH * FSV + ixb * SIMD * FSV;
+
+                            if (ixb != CEIL_DIV(IN_BLOCK_WIDTH, SIMD) - 1) {
+                                #if ((FILTER_GROUPS_NUM > 1) && (FILTER_IFM_NUM % FSV != 0))
+                                if (in_f_offset == 0) {
+                                    input_val[izb][iyb][ixb] = as_uint4(vload16(0, conv_input + input_idx + get_sub_group_local_id() * FSV));
+                                #else
+                                    input_val[izb][iyb][ixb] = vload4(0, (__global uint *)(conv_input + input_idx + get_sub_group_local_id() * FSV));
+                                #endif
+                                #if ((FILTER_GROUPS_NUM > 1) && (FILTER_IFM_NUM % FSV != 0))
+                                } else {
+                                    INPUT0_TYPE* input_int8_arr = (INPUT0_TYPE*) &input_val[izb][iyb][ixb];
+                                    __attribute__((opencl_unroll_hint(FSV)))
+                                    for (uint v = 0; v < FSV; v++) {
+                                        if (v + in_f_offset < FSV) {
+                                            input_int8_arr[v] = conv_input[input_idx + get_sub_group_local_id() * FSV + v];
+                                        } else {
+                                            input_int8_arr[v] = conv_input[input_idx + get_sub_group_local_id() * FSV + v + 
+                                                                           ((INPUT0_SIZE_X + 2*PADDING_SIZE_X) * 
+                                                                            (INPUT0_SIZE_Y + 2*PADDING_SIZE_Y) * 
+                                                                            (INPUT0_SIZE_Z + 2*PADDING_SIZE_Z) - 1) * 
+                                                                           FSV];
+                                        }
+                                    }
+                                }
+                                #endif
+                            } else {
+                                #if ((FILTER_GROUPS_NUM > 1) && (FILTER_IFM_NUM % FSV != 0))
+                                if (in_f_offset == 0) {
+                                    input_val[izb][iyb][ixb] = as_uint4(vload16(0, conv_input + input_idx + tmp * FSV));
+                                #else
+                                    input_val[izb][iyb][ixb] = vload4(0, (__global uint*)(conv_input + input_idx + tmp * FSV));
+                                #endif
+                                #if ((FILTER_GROUPS_NUM > 1) && (FILTER_IFM_NUM % FSV != 0))
+                                } else {
+                                    INPUT0_TYPE* input_int8_arr = (INPUT0_TYPE*) &input_val[izb][iyb][ixb];
+                                    __attribute__((opencl_unroll_hint(FSV)))
+                                    for (uint v = 0; v < FSV; v++) {
+                                        if (v + in_f_offset < FSV) {
+                                            input_int8_arr[v] = conv_input[input_idx + tmp * FSV + v];
+                                        } else {
+                                            input_int8_arr[v] = conv_input[input_idx + tmp * FSV + v + 
+                                                                           ((INPUT0_SIZE_X + 2*PADDING_SIZE_X) * 
+                                                                            (INPUT0_SIZE_Y + 2*PADDING_SIZE_Y) * 
+                                                                            (INPUT0_SIZE_Z + 2*PADDING_SIZE_Z) - 1) * 
+                                                                           FSV];
+                                        }
+                                    }
+                                }
+                                #endif
+                            }
+                        }
+                    }
+                }
+
+                __attribute__((opencl_unroll_hint))
+                for (uint fzu = 0; fzu < FILTER_SIZE_Z_UNROLL; ++fzu) {
+                    __attribute__((opencl_unroll_hint))
+                    for (uint fyu = 0; fyu < FILTER_SIZE_Y_UNROLL; ++fyu) {
+                        __attribute__((opencl_unroll_hint(FILTER_SIZE_X)))
+                        for (uint fx = 0; fx < FILTER_SIZE_X; fx++) {
+
+                            uint4 weights_val[OFM_BLOCKS_PER_SIMD];
+                            __attribute__((opencl_unroll_hint))
+                            for (uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
+                                weights_val[ofb] = vload4(0, (__global uint *)(weights + filter_idx + ofb * filter_idx_diff));
+                            }
+
+                            __attribute__((opencl_unroll_hint))
+                            for (uint ive = 0; ive < 4; ive++) {
+                                __attribute__((opencl_unroll_hint))
+                                for (uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
+                                    __attribute__((opencl_unroll_hint(OUT_BLOCK_DEPTH)))
+                                    for (uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
+                                    __attribute__((opencl_unroll_hint(OUT_BLOCK_HEIGHT)))
+                                        for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
+                                            __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
+                                            for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ow++) {
+                                                const uint z_block_idx = od * STRIDE_SIZE_Z + fzu * DILATION_SIZE_Z;
+                                                const uint y_block_idx = oh * STRIDE_SIZE_Y + fyu * DILATION_SIZE_Y;
+                                                const uint x_block_idx = ow * STRIDE_SIZE_X + fx * DILATION_SIZE_X;
+                                                const uint shuffle_wi = x_block_idx % SIMD;
+                                                const uint shuffle_idx = x_block_idx / SIMD;
+
+                                                dotProd[ofb][od][oh][ow] = TO_ACCUMULATOR_TYPE(
+                                                    IMAD(dotProd[ofb][od][oh][ow],
+                                                    AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val[z_block_idx][y_block_idx][shuffle_idx][ive], 
+                                                                                             shuffle_wi)),
+                                                    AS_FILTER_TYPE_4(weights_val[ofb][ive])));
+                                            }
+                                        }
+                                    }
+                                }
+                            }
+
+                            filter_idx += FSV * FSV;
+                        }
+                    }
+                }
+                input_start_idx += DILATION_SIZE_Y * INPUT0_Y_PITCH * FSV;
+            }
+            input_start_idx += DILATION_SIZE_Z * INPUT0_Z_PITCH * FSV - (FILTER_SIZE_Y / FILTER_SIZE_Y_UNROLL) * DILATION_SIZE_Y * INPUT0_Y_PITCH * FSV;
+        }
+        input_start_idx += INPUT0_FEATURE_PITCH * FSV * FEATURE_SLM_SPLIT - (FILTER_SIZE_Z / FILTER_SIZE_Z_UNROLL) * DILATION_SIZE_Z * INPUT0_Z_PITCH * FSV;
+
+        filter_idx += FSV * FSV * FILTER_SIZE_X * FILTER_SIZE_Y * FILTER_SIZE_Z * (FEATURE_SLM_SPLIT - 1);
+    }
+
+#if FEATURE_SLM_SPLIT != 1
+    // Additional local memory reduction for feature split mode
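+    // Sketch of the scheme: every sub-group holds partial sums over its own FSV
+    // input features; sub-group i < OFM_BLOCKS_PER_SIMD keeps OFM block i in
+    // registers and spills its other blocks to SLM, the remaining sub-groups
+    // spill everything, and after the barrier each keeper accumulates the
+    // spilled partials for its block.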
+#   if FEATURE_SLM_SPLIT < OFM_BLOCKS_PER_SIMD
+#   error convolution_gpu_b_fs_zyx_fsv16_imad.cl - OFM_BLOCKS_PER_SIMD must be less than or equal to FEATURE_SLM_SPLIT
+#   endif
+
+    const uint partial_acc_size = (FEATURE_SLM_SPLIT - 1) * OFM_SIZE_PER_SIMD * OUT_BLOCK_DEPTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH;
+    __local ACCUMULATOR_TYPE partial_acc[partial_acc_size];
+
+    uint sgid_start_idx = get_sub_group_id();
+    sgid_start_idx = sgid_start_idx == 0 ? 0 : sgid_start_idx - 1;
+    __local ACCUMULATOR_TYPE* partial_acc_ptr = partial_acc + sgid_start_idx * OFM_SIZE_PER_SIMD * OUT_BLOCK_DEPTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH + 
+                                                get_sub_group_local_id();
+
+    if (get_sub_group_id() < OFM_BLOCKS_PER_SIMD) {
+        __attribute__((opencl_unroll_hint))
+        for (uint wg = 0; wg < OFM_BLOCKS_PER_SIMD; ++wg) {
+            if (get_sub_group_id() == wg) {
+                __attribute__((opencl_unroll_hint))
+                for (uint ofb = 0; ofb < wg; ++ofb) {
+                    __attribute__((opencl_unroll_hint))
+                    for (uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
+                        __attribute__((opencl_unroll_hint))
+                        for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
+                            __attribute__((opencl_unroll_hint))
+                            for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
+                                const uint partial_acc_ptr_idx =
+                                    ofb * OUT_BLOCK_DEPTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH * SIMD +
+                                    od * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH * SIMD +
+                                    oh * OUT_BLOCK_WIDTH * SIMD +
+                                    ow * SIMD;
+                                partial_acc_ptr[partial_acc_ptr_idx] = dotProd[ofb][od][oh][ow];
+                            }
+                        }
+                    }
+                }
+                __attribute__((opencl_unroll_hint))
+                for (uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
+                    __attribute__((opencl_unroll_hint))
+                    for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
+                        __attribute__((opencl_unroll_hint))
+                        for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
+                            dotProd[0][od][oh][ow] = dotProd[wg][od][oh][ow];
+                        }
+                    }
+                }
+                __attribute__((opencl_unroll_hint))
+                for (uint ofb = wg + 1; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
+                    __attribute__((opencl_unroll_hint))
+                    for (uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
+                        __attribute__((opencl_unroll_hint))
+                        for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
+                            __attribute__((opencl_unroll_hint))
+                            for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
+                                const uint partial_acc_ptr_idx =
+                                    ((wg != 0) ? OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_DEPTH * OFM_SIZE_PER_SIMD : 0) +
+                                    ofb * OUT_BLOCK_DEPTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH * SIMD +
+                                    od * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH * SIMD +
+                                    oh * OUT_BLOCK_WIDTH * SIMD +
+                                    ow * SIMD;
+                                partial_acc_ptr[partial_acc_ptr_idx] = dotProd[ofb][od][oh][ow];
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    } else {
+        __attribute__((opencl_unroll_hint))
+        for (uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
+            __attribute__((opencl_unroll_hint))
+            for (uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
+                __attribute__((opencl_unroll_hint))
+                for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
+                    __attribute__((opencl_unroll_hint))
+                    for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
+                        const uint partial_acc_ptr_idx =
+                            ofb * OUT_BLOCK_DEPTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH * SIMD +
+                            od * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH * SIMD +
+                            oh * OUT_BLOCK_WIDTH * SIMD +
+                            ow * SIMD;
+                        partial_acc_ptr[partial_acc_ptr_idx] = dotProd[ofb][od][oh][ow];
+                    }
+                }
+            }
+        }
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (get_sub_group_id() >= OFM_BLOCKS_PER_SIMD)
+        return;
+
+    partial_acc_ptr = partial_acc + get_sub_group_id() * OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_DEPTH * SIMD + get_sub_group_local_id();
+    __attribute__((opencl_unroll_hint))
+    for (uint wg = 0; wg < FEATURE_SLM_SPLIT - 1; ++wg) {
+        __attribute__((opencl_unroll_hint))
+        for (uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
+            __attribute__((opencl_unroll_hint))
+            for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
+                __attribute__((opencl_unroll_hint))
+                for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
+                    const uint partial_acc_ptr_idx =
+                        wg * OFM_SIZE_PER_SIMD * OUT_BLOCK_DEPTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH +
+                        od * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH * SIMD +
+                        oh * OUT_BLOCK_WIDTH * SIMD +
+                        ow * SIMD;
+                    dotProd[0][od][oh][ow] += partial_acc_ptr[partial_acc_ptr_idx];
+                }
+            }
+        }
+    }
+#endif
+
+#if FEATURE_SLM_SPLIT == 1
+#   define OFM_VALUES_PER_WI (OFM_BLOCKS_PER_SIMD)
+#else
+#   define OFM_VALUES_PER_WI 1
+    out_f += get_sub_group_id() * SIMD;
+    out_f_sg += get_sub_group_id() * SIMD;
+#endif
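+    // With the SLM split each sub-group finishes exactly one OFM slice, so the
+    // per-work-item OFM count collapses to 1 and out_f advances by sub-group id.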
+
+#if BIAS_TERM
+    BIAS_TYPE bias[OFM_VALUES_PER_WI];
+    __attribute__((opencl_unroll_hint))
+    for (uint ofb = 0; ofb < OFM_VALUES_PER_WI; ++ofb) {
+        bias[ofb] = biases[out_f + ofb * SIMD];
+    }
+#endif
+
+    ACTIVATION_TYPE dequantized[OFM_VALUES_PER_WI][OUT_BLOCK_DEPTH][OUT_BLOCK_HEIGHT][OUT_BLOCK_WIDTH];
+    __attribute__((opencl_unroll_hint))
+    for (uint ofb = 0; ofb < OFM_VALUES_PER_WI; ++ofb) {
+        __attribute__((opencl_unroll_hint))
+        for (uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
+            __attribute__((opencl_unroll_hint))
+            for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
+                __attribute__((opencl_unroll_hint))
+                for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
+                    dequantized[ofb][od][oh][ow] = TO_ACTIVATION_TYPE(dotProd[ofb][od][oh][ow]);
+#if BIAS_TERM
+                    dequantized[ofb][od][oh][ow] += bias[ofb];
+#endif
+                }
+            }
+        }
+    }
+
+    OUTPUT_TYPE result[OFM_VALUES_PER_WI][OUT_BLOCK_DEPTH][OUT_BLOCK_HEIGHT][OUT_BLOCK_WIDTH];
+    __attribute__((opencl_unroll_hint))
+    for (uint ofb = 0; ofb < OFM_VALUES_PER_WI; ++ofb) {
+#if HAS_FUSED_OPS && FUSED_OPS_CAN_USE_PRELOAD_SCALAR
+        FUSED_OPS_PRELOAD_SCALAR;
+#endif
+        __attribute__((opencl_unroll_hint))
+        for (uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
+            __attribute__((opencl_unroll_hint))
+            for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
+                __attribute__((opencl_unroll_hint))
+                for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
+                    ACTIVATION_TYPE dequantized_val = dequantized[ofb][od][oh][ow];
+#if HAS_FUSED_OPS
+#   if FUSED_OPS_CAN_USE_PRELOAD_SCALAR
+                    FUSED_OPS_CALC_SCALAR;
+#   else
+                    FUSED_OPS_SCALAR;
+#   endif
+                    result[ofb][od][oh][ow] = FUSED_OPS_RESULT_SCALAR;
+#else
+                    result[ofb][od][oh][ow] = TO_OUTPUT_TYPE(dequantized_val);
+#endif
+                }
+            }
+        }
+    }
+
+#if OUTPUT_DIMS == 4
+    uint dst_index = OUTPUT_GET_INDEX(out_b, out_f_sg, out_y, out_x);
+#else
+    uint dst_index = OUTPUT_GET_INDEX(out_b, out_f_sg, out_z, out_y, out_x);
+#endif
+
+#if ((FILTER_OFM_NUM % OFM_BLOCKS_PER_SIMD == 0) && ((FILTER_GROUPS_NUM == 1) || (FILTER_OFM_NUM % SIMD == 0)))
+    if ((OUTPUT_SIZE_X % OUT_BLOCK_WIDTH == 0 || out_x + OUT_BLOCK_WIDTH <= OUTPUT_SIZE_X)) {
+        __attribute__((opencl_unroll_hint(OFM_VALUES_PER_WI)))
+        for (uint ofb = 0; ofb < OFM_VALUES_PER_WI; ofb++) {
+            bool good_of_block = (CEIL_DIV(FILTER_OFM_NUM, SIMD) % OFM_BLOCKS_PER_SIMD == 0) || (out_f_sg + ofb * SIMD <= FILTER_OFM_NUM);
+            if (good_of_block) {
+                __attribute__((opencl_unroll_hint))
+                for (uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
+                    bool good_z = (OUTPUT_SIZE_Z % OUT_BLOCK_DEPTH == 0) || (out_z + od < OUTPUT_SIZE_Z);
+                    if (good_z) {
+                        __attribute__((opencl_unroll_hint))
+                        for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
+                            bool good_y = (OUTPUT_SIZE_Y % OUT_BLOCK_HEIGHT == 0) || (out_y + oh < OUTPUT_SIZE_Y);
+                            if (good_y) {
+                                uint ow = 0;
+                            #if OUTPUT_TYPE_SIZE == 1
+                                __attribute__((opencl_unroll_hint))
+                                for (; ow + 8 <= OUT_BLOCK_WIDTH; ow += 8) {
+                                    MAKE_VECTOR_TYPE(OUTPUT_TYPE, 8) result_val;
+                                    __attribute__((opencl_unroll_hint))
+                                    for (uint i = 0; i < 8; ++i) {
+                                        result_val[i] = result[ofb][od][oh][ow + i];
+                                    }
+                                    DT_OUTPUT_BLOCK_WRITE8(output, dst_index, result_val);
+                                    dst_index += 8 * SIMD;
+                                }
+                            #endif
+                            #if OUTPUT_TYPE_SIZE <= 2
+                                __attribute__((opencl_unroll_hint))
+                                for (; ow + 4 <= OUT_BLOCK_WIDTH; ow += 4) {
+                                    MAKE_VECTOR_TYPE(OUTPUT_TYPE, 4) result_val;
+                                    __attribute__((opencl_unroll_hint))
+                                    for (uint i = 0; i < 4; ++i) {
+                                        result_val[i] = result[ofb][od][oh][ow + i];
+                                    }
+                                    DT_OUTPUT_BLOCK_WRITE4(output, dst_index, result_val);
+                                    dst_index += 4 * SIMD;
+                                }
+                            #endif
+
+                                __attribute__((opencl_unroll_hint))
+                                for (; ow + 2 <= OUT_BLOCK_WIDTH; ow += 2) {
+                                    MAKE_VECTOR_TYPE(OUTPUT_TYPE, 2) result_val;
+                                    __attribute__((opencl_unroll_hint))
+                                    for (uint i = 0; i < 2; ++i) {
+                                        result_val[i] = result[ofb][od][oh][ow + i];
+                                    }
+                                    DT_OUTPUT_BLOCK_WRITE2(output, dst_index, result_val);
+                                    dst_index += 2 * SIMD;
+                                }
+
+                                if (OUT_BLOCK_WIDTH % 2 == 1) {
+                                    OUTPUT_TYPE result_val = result[ofb][od][oh][ow];
+                                    DT_OUTPUT_BLOCK_WRITE(output, dst_index, result_val);
+                                    dst_index += 1 * SIMD;
+                                }
+                            }  // if (good_y)
+                            dst_index += OUTPUT_Y_PITCH * FSV - OUT_BLOCK_WIDTH * FSV;
+                        }  // for (OUT_BLOCK_HEIGHT)
+                    }  // if (good_z)
+                    dst_index += OUTPUT_Z_PITCH * FSV - OUTPUT_Y_PITCH * OUT_BLOCK_HEIGHT * FSV;
+                }  // for (OUT_BLOCK_DEPTH)
+            }  // if (good_of_block)
+            dst_index += OUTPUT_FEATURE_PITCH * FSV - OUTPUT_Z_PITCH * OUT_BLOCK_DEPTH * FSV;
+        }  // for (OFM_VALUES_PER_WI)
+    } else {
+#endif
+        __attribute__((opencl_unroll_hint(OFM_VALUES_PER_WI)))
+        for (uint ofb = 0; ofb < OFM_VALUES_PER_WI; ofb++) {
+            bool good_of_block = (CEIL_DIV(FILTER_OFM_NUM, SIMD) % OFM_BLOCKS_PER_SIMD == 0) || (out_f_sg + ofb * SIMD <= FILTER_OFM_NUM);
+            if (good_of_block) {
+        #if OUTPUT_DIMS == 4
+                const uint dst_index = OUTPUT_GET_INDEX(out_b, out_f + ofb * SIMD, out_y, out_x);
+        #else
+                const uint dst_index = OUTPUT_GET_INDEX(out_b, out_f + ofb * SIMD, out_z, out_y, out_x);
+        #endif
+                __attribute__((opencl_unroll_hint))
+                for (uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
+                    bool good_z = (OUTPUT_SIZE_Z % OUT_BLOCK_DEPTH == 0) || (out_z + od < OUTPUT_SIZE_Z);
+                    if (good_z) {
+                        __attribute__((opencl_unroll_hint))
+                        for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
+                            bool good_y = (OUTPUT_SIZE_Y % OUT_BLOCK_HEIGHT == 0) || (out_y + oh < OUTPUT_SIZE_Y);
+                            if (good_y) {
+                                __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
+                                for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ow++) {
+
+        #if OUTPUT_SIZE_X % OUT_BLOCK_WIDTH != 0
+                                    if (out_x + OUT_BLOCK_WIDTH > OUTPUT_SIZE_X && ow >= OUTPUT_SIZE_X % OUT_BLOCK_WIDTH)
+                                        break;
+        #endif
+
+                                    if (out_f_g < FILTER_OFM_NUM) {
+                                        output[dst_index + ow * FSV + oh * OUTPUT_Y_PITCH * FSV + od * OUTPUT_Z_PITCH * FSV] = result[ofb][od][oh][ow];
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+#if ((FILTER_OFM_NUM % OFM_BLOCKS_PER_SIMD == 0) && ((FILTER_GROUPS_NUM == 1) || (FILTER_OFM_NUM % SIMD == 0)))
+    }
+#endif
+}
+
+#undef AS_INPUT0_TYPE_4
+#undef AS_TYPE_N
+#undef AS_TYPE_N_
+#undef AS_FILTER_TYPE_4
+
+#undef CEIL_DIV
+#undef ALIGN
+
+#undef SIMD
+#undef FSV
+#undef OFM_VALUES_PER_WI
index 8d7c06b..4d9449f 100644 (file)
@@ -1,5 +1,5 @@
 /*
-// Copyright (c) 2016-2019 Intel Corporation
+// Copyright (c) 2016-2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -353,6 +353,45 @@ inline uint FUNC(get_os_is_zyx_osv_isv_index)(uint o, uint i, uint z, uint y, ui
     return output_offset;
 }
 
+inline uint FUNC(get_g_os_is_zyx_osv_isv_index)(uint g, uint o, uint i, uint z, uint y, uint x,
+    uint x_size, uint y_size, uint z_size, uint i_size, uint o_size, uint osv_size, uint isv_size)
+{
+    const uint isv = i % isv_size;
+    const uint osv = o % osv_size;
+    const uint is = i / isv_size;
+    const uint os = o / osv_size;
+
+    const uint x_pitch = osv_size * isv_size;
+    const uint y_pitch = x_pitch * x_size;
+    const uint z_pitch = y_pitch * y_size;
+    const uint is_pitch = z_pitch * z_size;
+    const uint os_pitch = is_pitch * ((i_size + isv_size - 1) / isv_size);
+    const uint g_pitch = os_pitch * ((o_size + osv_size - 1) / osv_size);
+
+    const uint output_offset =
+        isv +
+        osv * isv_size +
+        x * x_pitch +
+        y * y_pitch +
+        z * z_pitch +
+        is * is_pitch +
+        os * os_pitch +
+        g * g_pitch;
+
+    return output_offset;
+}
+
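+// Example: with osv_size == isv_size == 16 the innermost 16x16 tile holds 16
+// output features x 16 input features; per filter position the imad convolution
+// kernel's sub-group collectively reads exactly one such FSV*FSV tile.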
+#define GET_FILTER_G_OS_IS_ZYX_OSV16_ISV16_INDEX(prefix, g, o, i, z, y, x)   \
+    FUNC_CALL(get_g_os_is_zyx_osv_isv_index)(                                \
+        g, o, i, z, y, x,                                                    \
+        CAT(prefix, _SIZE_X),                                                \
+        CAT(prefix, _SIZE_Y),                                                \
+        CAT(prefix, _SIZE_Z),                                                \
+        CAT(prefix, _IFM_NUM),                                               \
+        CAT(prefix, _OFM_NUM),                                               \
+        16,                                                                  \
+        16)
+
 #define GET_FILTER_OS_IS_YX_OSV16_ISV16_INDEX(prefix, o, i, y, x) \
     FUNC_CALL(get_os_is_zyx_osv_isv_index)(                       \
         o, i, 0, y, x,                                            \
@@ -364,6 +403,17 @@ inline uint FUNC(get_os_is_zyx_osv_isv_index)(uint o, uint i, uint z, uint y, ui
         16,                                                       \
         16)
 
+#define GET_FILTER_OS_IS_ZYX_OSV16_ISV16_INDEX(prefix, o, i, z, y, x)   \
+    FUNC_CALL(get_os_is_zyx_osv_isv_index)(                             \
+        o, i, z, y, x,                                                  \
+        CAT(prefix, _SIZE_X),                                           \
+        CAT(prefix, _SIZE_Y),                                           \
+        CAT(prefix, _SIZE_Z),                                           \
+        CAT(prefix, _IFM_NUM),                                          \
+        CAT(prefix, _OFM_NUM),                                          \
+        16,                                                             \
+        16)
+
 #define GET_FILTER_OS_IS_ZYX_OSV32_ISV16_INDEX(prefix, o, i, z, y, x)   \
     FUNC_CALL(get_os_is_zyx_osv_isv_index)(                             \
         o, i, z, y, x,                                                  \
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_b_fs_yx_fsv16_imad.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_b_fs_yx_fsv16_imad.cl
deleted file mode 100644 (file)
index c20177f..0000000
+++ /dev/null
@@ -1,206 +0,0 @@
-// Copyright (c) 2020 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "include/include_all.cl"
-#include "include/data_types.cl"
-
-#define ALIGN_TO(val, multiple) (((val) + (multiple) - 1) / (multiple) * (multiple))
-
-#define AS_TYPE(type, val) CAT(as_, type)(val)
-#define IN_VEC16 MAKE_VECTOR_TYPE(INPUT0_TYPE, 16)
-#define OUT_VEC16 MAKE_VECTOR_TYPE(OUTPUT_TYPE, 16)
-
-#define ACTIVATION_VEC16 MAKE_VECTOR_TYPE(ACTIVATION_TYPE, 16)
-#define TO_ACTIVATION_VEC16 CAT(convert_, ACTIVATION_VEC16)
-
-#define FEATURE_SLICE_SIZE 16
-
-#if MAX_POOLING
-    #define INIT_VAL ACCUMULATOR_VAL_MIN
-#elif AVG_POOLING
-    #define INIT_VAL ACCUMULATOR_VAL_ZERO
-#else
-#error
-#endif
-
-
-inline ACCUMULATOR_TYPE FUNC(apply_pooling)(ACCUMULATOR_TYPE tmp, ACCUMULATOR_TYPE in)
-{
-#if MAX_POOLING
-    return ACCUMULATOR_MAX_FUNC(tmp, in);
-#elif AVG_POOLING
-    return tmp + in;
-#endif
-}
-
-__attribute__((intel_reqd_sub_group_size(FEATURE_SLICE_SIZE)))
-KERNEL(pooling_gpu_b_fs_yx_fsv16)(
-    const __global INPUT0_TYPE* input,
-    __global OUTPUT_TYPE* output
-#if HAS_FUSED_OPS_DECLS
-    , FUSED_OPS_DECLS
-#endif
-)
-{
-    const uint x    = (uint)get_global_id(0);
-    const uint y    = (uint)get_global_id(1);
-    const uint bf   = (uint)get_global_id(2);
-    const uint f    = (bf * FEATURE_SLICE_SIZE) % ALIGN_TO(INPUT0_FEATURE_NUM, FEATURE_SLICE_SIZE);
-    const uint b    = (bf * FEATURE_SLICE_SIZE) / ALIGN_TO(INPUT0_FEATURE_NUM, FEATURE_SLICE_SIZE);
-
-    const int offset_x = (int)x*STRIDE_SIZE_X - PADDING_SIZE_X;
-    const int offset_y = (int)y*STRIDE_SIZE_Y - PADDING_SIZE_Y;
-
-    ACCUMULATOR_TYPE result[FEATURE_SLICE_SIZE] = { INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL,
-                                                    INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL };
-
-#ifdef CHECK_BOUNDRY
-    if (offset_x + POOL_SIZE_X < 0 || offset_x >= INPUT0_SIZE_X ||
-        offset_y + POOL_SIZE_Y < 0 || offset_y >= INPUT0_SIZE_Y)
-    {
-        return;
-    }
-
-#ifdef DYNAMIC_KERNEL_DIVIDER
-    uint num_elements = 0;
-#endif
-
-    const uint batch_and_feature_offset = INPUT0_GET_INDEX(b, f, 0, 0);
-    __attribute__((opencl_unroll_hint(POOL_SIZE_Y)))
-    for(uint j = 0; j < POOL_SIZE_Y; j++)
-    {
-        int input_offset_y = offset_y + j;
-        bool zero_y = input_offset_y >= INPUT0_SIZE_Y || input_offset_y < 0;
-        if(!zero_y)
-        {
-            __attribute__((opencl_unroll_hint(POOL_SIZE_X)))
-            for(uint i = 0; i < POOL_SIZE_X; i++)
-            {
-                int input_offset_x = offset_x + i;
-                bool zero = input_offset_x >= INPUT0_SIZE_X || input_offset_x < 0;
-                if(!zero)
-                {
-                    const uint input_idx = batch_and_feature_offset + input_offset_y*IN_Y_PITCH + input_offset_x*IN_X_PITCH;
-
-                    int4 int_data = vload4(0, (__global int*)(input + input_idx));
-                    IN_VEC16 ch16_data = AS_TYPE(IN_VEC16, int_data);
-                    __attribute__((opencl_unroll_hint(FEATURE_SLICE_SIZE)))
-                    for(uint k = 0; k < FEATURE_SLICE_SIZE; k++)
-                    {
-                        result[k] = FUNC_CALL(apply_pooling)(result[k], ch16_data[k]);
-                    }
-
-#ifdef DYNAMIC_KERNEL_DIVIDER
-                    num_elements++;
-#endif
-                }
-            }
-        }
-    }
-#ifdef DYNAMIC_WITH_PADDING_KERNEL_DIVIDER
-    const int hend = min(offset_y + POOL_SIZE_Y, INPUT0_SIZE_Y + PADDING_SIZE_Y);
-    const int wend = min(offset_x + POOL_SIZE_X, INPUT0_SIZE_X + PADDING_SIZE_X);
-    const uint num_elements = (hend - offset_y) * (wend - offset_x);
-#endif
-#else // !CHECK_BOUNDRY
-    uint input_idx = INPUT0_GET_INDEX(b, f, offset_y, offset_x);
-    __attribute__((opencl_unroll_hint(POOL_SIZE_Y)))
-    for(uint j = 0; j < POOL_SIZE_Y; j++)
-    {
-        __attribute__((opencl_unroll_hint(POOL_SIZE_X)))
-        for(uint i = 0; i < POOL_SIZE_X; i++)
-        {
-            int4 int_data = vload4(0, (__global int*)(input + input_idx));
-            IN_VEC16 ch16_data = AS_TYPE(IN_VEC16, int_data);
-            __attribute__((opencl_unroll_hint(FEATURE_SLICE_SIZE)))
-            for(uint k = 0; k < FEATURE_SLICE_SIZE; k++)
-            {
-                result[k] = FUNC_CALL(apply_pooling)(result[k], ch16_data[k]);
-            }
-
-            input_idx += IN_X_PITCH;
-        }
-        input_idx += (IN_Y_PITCH - POOL_SIZE_X*IN_X_PITCH);
-    }
-
-#if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER)
-    const uint num_elements = POOL_SIZE_X*POOL_SIZE_Y;
-#endif
-#endif
-
-    ACTIVATION_VEC16 pool_result;
-#if defined AVG_POOLING
-#if ENABLE_ROUND
-    __attribute__((opencl_unroll_hint(FEATURE_SLICE_SIZE)))
-    for(uint i = 0; i < FEATURE_SLICE_SIZE; i++) {
-    #if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER)
-        pool_result[i] = convert_int(round(((float)result[i] / max(num_elements, (uint)1))));
-    #else
-        pool_result[i] = convert_int(round((float)result[i] / (int)(POOL_SIZE_Y * POOL_SIZE_X)));
-    #endif
-    }
-#else
-    __attribute__((opencl_unroll_hint(FEATURE_SLICE_SIZE)))
-    for(uint i = 0; i < FEATURE_SLICE_SIZE; i++) {
-    #if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER)
-        pool_result[i] = (float)result[i] / max(num_elements, (uint)1);
-    #else
-        pool_result[i] = (float)result[i] / (int)(POOL_SIZE_Y * POOL_SIZE_X);
-    #endif
-    }
-#endif  // ENABLE_ROUND
-#else  // AVG_POOLING
-    __attribute__((opencl_unroll_hint(FEATURE_SLICE_SIZE)))
-    for (uint i = 0; i < FEATURE_SLICE_SIZE; ++i) {
-        pool_result[i] = result[i];
-    }
-#endif  // AVG_POOLING
-
-    OUT_VEC16 final_result = (OUTPUT_TYPE)(0);
-#if HAS_FUSED_OPS && FUSED_OPS_CAN_USE_PRELOAD
-    FUSED_OPS_PRELOAD;
-#endif
-
-    __attribute__((opencl_unroll_hint(FEATURE_SLICE_SIZE)))
-    for (uint i = 0; i < FEATURE_SLICE_SIZE; ++i) {
-#if HAS_FUSED_OPS
-#if FUSED_OPS_CAN_USE_PRELOAD
-        FUSED_OPS_CALC;
-#else
-        FUSED_OPS;
-#endif
-        final_result[i] = FUSED_OPS_RESULT;
-#else
-        final_result[i] = TO_OUTPUT_TYPE(ACTIVATION(pool_result[i], ACTIVATION_PARAMS));
-#endif
-    }
-
-    const uint output_pos = OUTPUT_GET_INDEX(b, f, y, x);
-
-#if OUTPUT_TYPE_SIZE == 1
-    vstore4(as_uint4(final_result), 0, ((__global uint*)(output + output_pos)));
-#else
-    *((__global OUT_VEC16*)(output + output_pos)) = final_result;
-#endif
-}
-
-#undef ALIGN_TO
-#undef AS_TYPE
-#undef IN_VEC16
-#undef OUT_VEC16
-#undef ACTIVATION_VEC16
-#undef TO_ACTIVATION_VEC16
-#undef INIT_VAL
-#undef FEATURE_SLICE_SIZE
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_b_fs_zyx_fsv16_imad.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_b_fs_zyx_fsv16_imad.cl
new file mode 100644 (file)
index 0000000..4df265b
--- /dev/null
@@ -0,0 +1,311 @@
+// Copyright (c) 2020 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+#include "include/include_all.cl"
+#include "include/data_types.cl"
+
+#define ALIGN_TO(val, multiple) (((val) + (multiple) - 1) / (multiple) * (multiple))
+
+#define AS_TYPE(type, val) CAT(as_, type)(val)
+#define IN_VEC16 MAKE_VECTOR_TYPE(INPUT0_TYPE, 16)
+#define OUT_VEC16 MAKE_VECTOR_TYPE(OUTPUT_TYPE, 16)
+
+#define ACTIVATION_VEC16 MAKE_VECTOR_TYPE(ACTIVATION_TYPE, 16)
+#define TO_ACTIVATION_VEC16 CAT(convert_, ACTIVATION_VEC16)
+
+#define FEATURE_SLICE_SIZE 16
+
+#if MAX_POOLING
+    #define INIT_VAL ACCUMULATOR_VAL_MIN
+#elif AVG_POOLING
+    #define INIT_VAL ACCUMULATOR_VAL_ZERO
+#else
+#error pooling_gpu_b_fs_zyx_fsv16_imad.cl - requires MAX_POOLING or AVG_POOLING
+#endif
+
+inline ACCUMULATOR_TYPE FUNC(apply_pooling)(ACCUMULATOR_TYPE tmp, ACCUMULATOR_TYPE in)
+{
+#if MAX_POOLING
+    return ACCUMULATOR_MAX_FUNC(tmp, in);
+#elif AVG_POOLING
+    return tmp + in;
+#endif
+}
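+// For MAX pooling this keeps a running maximum; for AVG pooling it only
+// accumulates, and the division by the window element count happens after
+// the accumulation loops.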
+
+__attribute__((intel_reqd_sub_group_size(FEATURE_SLICE_SIZE)))
+KERNEL(pooling_gpu_b_fs_zyx_fsv16)(
+    const __global INPUT0_TYPE* input,
+    __global OUTPUT_TYPE* output
+#if HAS_FUSED_OPS_DECLS
+    , FUSED_OPS_DECLS
+#endif
+)
+{
+    const uint x    = (uint)get_global_id(0);
+#if OUTPUT_DIMS == 4
+    const uint y    = (uint)get_global_id(1);
+    const uint z    = 0;
+#else
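+    // gws dim 1 packs z and y together: zy == z * OUTPUT_SIZE_Y + y.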
+    const uint zy   = (uint)get_global_id(1);
+    const uint y    = zy % OUTPUT_SIZE_Y;
+    const uint z    = zy / OUTPUT_SIZE_Y;
+#endif
+    const uint bf   = (uint)get_global_id(2);
+    const uint f    = (bf * FEATURE_SLICE_SIZE) % ALIGN_TO(INPUT0_FEATURE_NUM, FEATURE_SLICE_SIZE);
+    const uint b    = (bf * FEATURE_SLICE_SIZE) / ALIGN_TO(INPUT0_FEATURE_NUM, FEATURE_SLICE_SIZE);
+
+    const bool last_in_f_group = (f == FEATURE_SLICE_SIZE * ((INPUT0_FEATURE_NUM - 1) / FEATURE_SLICE_SIZE));
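+    // True for the final 16-feature slice; when INPUT0_FEATURE_NUM % 16 != 0
+    // that slice takes the scalar tail paths below instead of full 16-wide
+    // vector loads/stores.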
+
+    const int offset_x = (int)x*STRIDE_SIZE_X - PADDING_SIZE_X;
+    const int offset_y = (int)y*STRIDE_SIZE_Y - PADDING_SIZE_Y;
+    const int offset_z = (int)z*STRIDE_SIZE_Z - PADDING_SIZE_Z;
+
+    ACCUMULATOR_TYPE result[FEATURE_SLICE_SIZE] = { INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL,
+                                                    INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL };
+
+#ifdef CHECK_BOUNDRY
+    if (offset_x + POOL_SIZE_X < 0 || offset_x >= INPUT0_SIZE_X ||
+        offset_y + POOL_SIZE_Y < 0 || offset_y >= INPUT0_SIZE_Y ||
+        offset_z + POOL_SIZE_Z < 0 || offset_z >= INPUT0_SIZE_Z)
+    {
+        return;
+    }
+
+#ifdef DYNAMIC_KERNEL_DIVIDER
+    uint num_elements = 0;
+#endif
+
+#if INPUT0_DIMS == 4
+    const uint batch_and_feature_offset = INPUT0_GET_INDEX(b, f, 0, 0);
+#else
+    const uint batch_and_feature_offset = INPUT0_GET_INDEX(b, f, 0, 0, 0);
+#endif
+    __attribute__((opencl_unroll_hint(POOL_SIZE_Z)))
+    for(uint pz = 0; pz < POOL_SIZE_Z; pz++)
+    {
+        int input_offset_z = offset_z + pz;
+        bool zero_z = input_offset_z >= INPUT0_SIZE_Z || input_offset_z < 0;
+        if(!zero_z)
+        {
+            __attribute__((opencl_unroll_hint(POOL_SIZE_Y)))
+            for(uint py = 0; py < POOL_SIZE_Y; py++)
+            {
+                int input_offset_y = offset_y + py;
+                bool zero_y = input_offset_y >= INPUT0_SIZE_Y || input_offset_y < 0;
+                if(!zero_y)
+                {
+                    __attribute__((opencl_unroll_hint(POOL_SIZE_X)))
+                    for(uint px = 0; px < POOL_SIZE_X; px++)
+                    {
+                        int input_offset_x = offset_x + px;
+                        bool zero = input_offset_x >= INPUT0_SIZE_X || input_offset_x < 0;
+                        if(!zero)
+                        {
+                            const uint input_idx = batch_and_feature_offset + input_offset_z*IN_Z_PITCH + input_offset_y*IN_Y_PITCH + input_offset_x*IN_X_PITCH;
+                            IN_VEC16 ch16_data;
+#if INPUT0_FEATURE_NUM % FEATURE_SLICE_SIZE != 0
+                            if (!last_in_f_group) {
+#endif
+                                ch16_data = AS_TYPE(IN_VEC16, vload4(0, (__global int*)(input + input_idx)));
+#if INPUT0_FEATURE_NUM % FEATURE_SLICE_SIZE != 0
+                            } else {
+                                __attribute__((opencl_unroll_hint(INPUT0_FEATURE_NUM % FEATURE_SLICE_SIZE)))
+                                for(uint k = 0; k < INPUT0_FEATURE_NUM % FEATURE_SLICE_SIZE; k++) {
+                                    ch16_data[k] = input[input_idx + k];
+                                }
+                            }
+#endif
+
+#if INPUT0_FEATURE_NUM % FEATURE_SLICE_SIZE != 0
+                            if (!last_in_f_group) {
+#endif
+                                __attribute__((opencl_unroll_hint(FEATURE_SLICE_SIZE)))
+                                for(uint k = 0; k < FEATURE_SLICE_SIZE; k++)
+                                {
+                                    result[k] = FUNC_CALL(apply_pooling)(result[k], ch16_data[k]);
+                                }
+#if INPUT0_FEATURE_NUM % FEATURE_SLICE_SIZE != 0
+                            } else {
+                                __attribute__((opencl_unroll_hint(INPUT0_FEATURE_NUM % FEATURE_SLICE_SIZE)))
+                                for(uint k = 0; k < INPUT0_FEATURE_NUM % FEATURE_SLICE_SIZE; k++)
+                                {
+                                    result[k] = FUNC_CALL(apply_pooling)(result[k], ch16_data[k]);
+                                }
+                            }
+#endif
+
+        #ifdef DYNAMIC_KERNEL_DIVIDER
+                            num_elements++;
+        #endif
+                        }
+                    }
+                }
+            }
+        }
+    }
+#ifdef DYNAMIC_WITH_PADDING_KERNEL_DIVIDER
+    const int dend = min(offset_z + POOL_SIZE_Z, INPUT0_SIZE_Z + PADDING_SIZE_Z);
+    const int hend = min(offset_y + POOL_SIZE_Y, INPUT0_SIZE_Y + PADDING_SIZE_Y);
+    const int wend = min(offset_x + POOL_SIZE_X, INPUT0_SIZE_X + PADDING_SIZE_X);
+    const uint num_elements = (dend - offset_z) * (hend - offset_y) * (wend - offset_x);
+#endif
+#else // !CHECK_BOUNDRY
+#if INPUT0_DIMS == 4
+    uint input_idx = INPUT0_GET_INDEX(b, f, offset_y, offset_x);
+#else
+    uint input_idx = INPUT0_GET_INDEX(b, f, offset_z, offset_y, offset_x);
+#endif
+    __attribute__((opencl_unroll_hint(POOL_SIZE_Z)))
+    for(uint pz = 0; pz < POOL_SIZE_Z; pz++)
+    {
+        __attribute__((opencl_unroll_hint(POOL_SIZE_Y)))
+        for(uint py = 0; py < POOL_SIZE_Y; py++)
+        {
+            __attribute__((opencl_unroll_hint(POOL_SIZE_X)))
+            for(uint px = 0; px < POOL_SIZE_X; px++)
+            {
+                IN_VEC16 ch16_data;
+#if INPUT0_FEATURE_NUM % FEATURE_SLICE_SIZE != 0
+                if (!last_in_f_group) {
+#endif
+                    ch16_data = AS_TYPE(IN_VEC16, vload4(0, (__global int*)(input + input_idx)));
+#if INPUT0_FEATURE_NUM % FEATURE_SLICE_SIZE != 0
+                } else {
+                    __attribute__((opencl_unroll_hint(INPUT0_FEATURE_NUM % FEATURE_SLICE_SIZE)))
+                    for(uint k = 0; k < INPUT0_FEATURE_NUM % FEATURE_SLICE_SIZE; k++) {
+                        ch16_data[k] = input[input_idx + k];
+                    }
+                }
+#endif
+
+#if INPUT0_FEATURE_NUM % FEATURE_SLICE_SIZE != 0
+                if (!last_in_f_group) {
+#endif
+                    __attribute__((opencl_unroll_hint(FEATURE_SLICE_SIZE)))
+                    for(uint k = 0; k < FEATURE_SLICE_SIZE; k++)
+                    {
+                        result[k] = FUNC_CALL(apply_pooling)(result[k], ch16_data[k]);
+                    }
+#if INPUT0_FEATURE_NUM % FEATURE_SLICE_SIZE != 0
+                } else {
+                    __attribute__((opencl_unroll_hint(INPUT0_FEATURE_NUM % FEATURE_SLICE_SIZE)))
+                    for(uint k = 0; k < INPUT0_FEATURE_NUM % FEATURE_SLICE_SIZE; k++)
+                    {
+                        result[k] = FUNC_CALL(apply_pooling)(result[k], ch16_data[k]);
+                    }
+                }
+#endif
+                input_idx += IN_X_PITCH;
+            }
+            input_idx += (IN_Y_PITCH - POOL_SIZE_X*IN_X_PITCH);
+        }
+        input_idx += (IN_Z_PITCH - POOL_SIZE_Y*IN_Y_PITCH);
+    }
+
+#if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER)
+    const uint num_elements = POOL_SIZE_X*POOL_SIZE_Y*POOL_SIZE_Z;
+#endif
+#endif
+
+    ACTIVATION_VEC16 pool_result;
+#if defined AVG_POOLING
+#if ENABLE_ROUND
+    __attribute__((opencl_unroll_hint(FEATURE_SLICE_SIZE)))
+    for(uint i = 0; i < FEATURE_SLICE_SIZE; i++) {
+    #if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER)
+        pool_result[i] = convert_int(round(((float)result[i] / max(num_elements, (uint)1))));
+    #else
+        pool_result[i] = convert_int(round((float)result[i] / (int)(POOL_SIZE_Z * POOL_SIZE_Y * POOL_SIZE_X)));
+    #endif
+    }
+#else
+    __attribute__((opencl_unroll_hint(FEATURE_SLICE_SIZE)))
+    for(uint i = 0; i < FEATURE_SLICE_SIZE; i++) {
+    #if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER)
+        pool_result[i] = (float)result[i] / max(num_elements, (uint)1);
+    #else
+        pool_result[i] = (float)result[i] / (int)(POOL_SIZE_Z * POOL_SIZE_Y * POOL_SIZE_X);
+    #endif
+    }
+#endif  // ENABLE_ROUND
+#else  // AVG_POOLING
+    __attribute__((opencl_unroll_hint(FEATURE_SLICE_SIZE)))
+    for (uint i = 0; i < FEATURE_SLICE_SIZE; ++i) {
+        pool_result[i] = result[i];
+    }
+#endif  // AVG_POOLING
+
+    OUT_VEC16 final_result = (OUTPUT_TYPE)(0);
+#if HAS_FUSED_OPS && FUSED_OPS_CAN_USE_PRELOAD
+    FUSED_OPS_PRELOAD;
+#endif
+
+    __attribute__((opencl_unroll_hint(FEATURE_SLICE_SIZE)))
+    for (uint i = 0; i < FEATURE_SLICE_SIZE; ++i) {
+#if HAS_FUSED_OPS
+#if FUSED_OPS_CAN_USE_PRELOAD
+        FUSED_OPS_CALC;
+#else
+        FUSED_OPS;
+#endif
+        final_result[i] = FUSED_OPS_RESULT;
+#else
+        final_result[i] = TO_OUTPUT_TYPE(ACTIVATION(pool_result[i], ACTIVATION_PARAMS));
+#endif
+    }
+
+#if OUTPUT_DIMS == 4
+    const uint output_pos = OUTPUT_GET_INDEX(b, f, y, x);
+#else
+    const uint output_pos = OUTPUT_GET_INDEX(b, f, z, y, x);
+#endif
+
+#if OUTPUT_TYPE_SIZE == 1
+#if OUTPUT_FEATURE_NUM % FEATURE_SLICE_SIZE != 0
+    if (!last_in_f_group) {
+#endif
+        vstore4(as_uint4(final_result), 0, ((__global uint*)(output + output_pos)));
+#if OUTPUT_FEATURE_NUM % FEATURE_SLICE_SIZE != 0
+    } else {
+        __attribute__((opencl_unroll_hint(OUTPUT_FEATURE_NUM % FEATURE_SLICE_SIZE)))
+        for(uint k = 0; k < OUTPUT_FEATURE_NUM % FEATURE_SLICE_SIZE; k++) {
+            output[output_pos + k] = final_result[k];
+        }
+    }
+#endif
+#else
+#if OUTPUT_FEATURE_NUM % FEATURE_SLICE_SIZE != 0
+    if (!last_in_f_group) {
+#endif
+        *((__global OUT_VEC16*)(output + output_pos)) = final_result;
+#if OUTPUT_FEATURE_NUM % FEATURE_SLICE_SIZE != 0
+    } else {
+        __attribute__((opencl_unroll_hint(OUTPUT_FEATURE_NUM % FEATURE_SLICE_SIZE)))
+        for(uint k = 0; k < OUTPUT_FEATURE_NUM % FEATURE_SLICE_SIZE; k++) {
+            output[output_pos + k] = final_result[k];
+        }
+    }
+#endif
+#endif
+}
+
+#undef ALIGN_TO
+#undef AS_TYPE
+#undef IN_VEC16
+#undef OUT_VEC16
+#undef ACTIVATION_VEC16
+#undef TO_ACTIVATION_VEC16
+#undef INIT_VAL
+#undef FEATURE_SLICE_SIZE
index 2fdfaf9..fa02365 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (c) 2016-2019 Intel Corporation
+// Copyright (c) 2016-2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -93,6 +93,10 @@ inline uint FUNC(get_input_index)(uint g, uint o, uint i, uint z, uint y, uint x
     return GET_FILTER_GOIYX(INPUT0, g, o, i, y, x);
 #elif defined INPUT0_LAYOUT_OS_IS_YX_OSV16_ISV16
     return GET_FILTER_OS_IS_YX_OSV16_ISV16_INDEX(INPUT0, o, i, y, x);
+#elif defined INPUT0_LAYOUT_OS_IS_ZYX_OSV16_ISV16
+    return GET_FILTER_OS_IS_ZYX_OSV16_ISV16_INDEX(INPUT0, o, i, z, y, x);
+#elif defined INPUT0_LAYOUT_G_OS_IS_ZYX_OSV16_ISV16
+    return GET_FILTER_G_OS_IS_ZYX_OSV16_ISV16_INDEX(INPUT0, g, o, i, z, y, x);
 #elif defined INPUT0_LAYOUT_OS_IS_ZYX_OSV32_ISV16
     return GET_FILTER_OS_IS_ZYX_OSV32_ISV16_INDEX(INPUT0, o, i, z, y, x);
 #elif defined INPUT0_LAYOUT_OS_IS_ZYX_OSV64_ISV16
@@ -224,6 +228,10 @@ inline uint FUNC(get_output_index)(uint g, uint o, uint i, uint z, uint y, uint
     return GET_FILTER_G_OS_IS_YX_ISV16_OSV16_INDEX(OUTPUT, g, o, i, y, x, SUB_GROUP_SIZE);
 #elif defined OUTPUT_LAYOUT_OS_IS_YX_OSV16_ISV16
     return GET_FILTER_OS_IS_YX_OSV16_ISV16_INDEX(OUTPUT, o, i, y, x);
+#elif defined OUTPUT_LAYOUT_OS_IS_ZYX_OSV16_ISV16
+    return GET_FILTER_OS_IS_ZYX_OSV16_ISV16_INDEX(OUTPUT, o, i, z, y, x);
+#elif defined OUTPUT_LAYOUT_G_OS_IS_ZYX_OSV16_ISV16
+    return GET_FILTER_G_OS_IS_ZYX_OSV16_ISV16_INDEX(OUTPUT, g, o, i, z, y, x);
 #elif defined OUTPUT_LAYOUT_OS_IS_ZYX_OSV32_ISV16
     return GET_FILTER_OS_IS_ZYX_OSV32_ISV16_INDEX(OUTPUT, o, i, z, y, x);
 #elif defined OUTPUT_LAYOUT_OS_IS_ZYX_OSV64_ISV16
index 03c51b5..a7f18c4 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (c) 2016-2019 Intel Corporation
+// Copyright (c) 2016-2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -306,6 +306,7 @@ std::string toString(WeightsLayout layout) {
         case WeightsLayout::yxio:                                        return "YXIO";
         case WeightsLayout::os_is_yx_isv16_osv16:                        return "OS_IS_YX_ISV16_OSV16";
         case WeightsLayout::os_is_yx_osv16_isv16:                        return "OS_IS_YX_OSV16_ISV16";
+        case WeightsLayout::os_is_zyx_osv16_isv16:                       return "OS_IS_ZYX_OSV16_ISV16";
         case WeightsLayout::os_is_zyx_osv32_isv16:                       return "OS_IS_ZYX_OSV32_ISV16";
         case WeightsLayout::os_is_zyx_osv64_isv16:                       return "OS_IS_ZYX_OSV64_ISV16";
         case WeightsLayout::os_iyx_osv16:                                return "OS_IYX_OSV16";
@@ -371,6 +372,7 @@ std::string toString(WeightsLayout layout) {
         case WeightsLayout::gs_oi_yxs_gsv32_yxsv4:                       return "GS_OI_YXS_GSV32_YXSV4";
         case WeightsLayout::g_os_is_yx_isv16_osv16:                      return "G_OS_IS_YX_ISV16_OSV16";
         case WeightsLayout::g_os_is_yx_osv16_isv4:                       return "G_OS_IS_YX_OSV16_ISV4";
+        case WeightsLayout::g_os_is_zyx_osv16_isv16:                     return "G_OS_IS_ZYX_OSV16_ISV16";
         case WeightsLayout::g_os_zyx_is_osv16_isv4:                      return "G_OS_ZYX_IS_OSV16_ISV4";
         case WeightsLayout::g_os_zyx_is_osv16_isv16:                     return "G_OS_ZYX_IS_OSV16_ISV16";
         case WeightsLayout::g_os_zyx_is_osv16_isv32:                     return "G_OS_ZYX_IS_OSV16_ISV32";
index bf0f992..c382842 100644 (file)
@@ -1,5 +1,5 @@
 /*
-// Copyright (c) 2016-2018 Intel Corporation
+// Copyright (c) 2016-2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -118,7 +118,7 @@ layout convolution_inst::calc_output_layout(convolution_node const& node) {
     //     window size spatial Y", filter_size.spatial[1], "First convolution is outside of image. please reduce input
     //     offset Y");
 
-    if (input_layout.format == format::bfzyx) {
+    if (input_layout.format.spatial_num() == 3) {
         // convolution 3D
         CLDNN_ERROR_LESS_OR_EQUAL_THAN(node.id(),
                                        "Stride spatial Z",
index 1a3e926..6f7ce89 100644 (file)
@@ -199,6 +199,8 @@ attach_convolution_gpu::attach_convolution_gpu() {
     // block i8 format
     implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv16), val_fw);
     implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv16), val_fw);
+    implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_zyx_fsv16), val_fw);
+    implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_zyx_fsv16), val_fw);
     // MMAD
     implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::byxf_af32), val_fw);
     implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::byxf_af32), val_fw);
index 32aa4f2..f0165f6 100644 (file)
@@ -110,6 +110,8 @@ attach_scale_gpu::attach_scale_gpu() {
 
     implementation_map<scale>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::b_fs_zyx_fsv16), val_fw);
     implementation_map<scale>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::b_fs_zyx_fsv16), val_fw);
+    implementation_map<scale>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_zyx_fsv16), val_fw);
+    implementation_map<scale>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_zyx_fsv16), val_fw);
     implementation_map<scale>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bs_fs_zyx_bsv16_fsv16), val_fw);
     implementation_map<scale>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bs_fs_zyx_bsv16_fsv16), val_fw);
     implementation_map<scale>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::fs_b_yx_fsv32), val_fw);
index 9530664..2705e3e 100644 (file)
@@ -72,12 +72,7 @@ void pre_replace_deconv::run(program_impl& p) {
                                !((_lo.get_optimization_attributes().b_fs_yx_fsv16_network || input_node.get_output_layout().format == format::b_fs_yx_fsv16) &&
                                 _lo.is_format_optimized(node->as<deconvolution>(), format::b_fs_yx_fsv16));
                 // int8/uint8 input
-                perform_opt |= (input_node.get_output_layout().data_type == data_types::i8 || input_node.get_output_layout().data_type == data_types::u8) &&
-                               // imad convolution kernel limitation for groups
-                               (groups == 1 || weights_node.get_output_layout().size.feature[0] % 4 == 0 ||
-                                groups == static_cast<uint32_t>(input_node.get_output_layout().size.feature[0])) &&
-                               // no uint8/int8 3D convolution support
-                               input_node.get_output_layout().format.dimension() == 4;
+                perform_opt |= (input_node.get_output_layout().data_type == data_types::i8 || input_node.get_output_layout().data_type == data_types::u8);
 
                 if (!perform_opt)
                     continue;
index 5fdb893..092b931 100644 (file)
@@ -123,6 +123,7 @@ void prepare_padding::run(program_impl& p) {
         if (conv_layout.format != cldnn::format::bfyx &&
             conv_layout.format != cldnn::format::bf8_xy16 &&
             conv_layout.format != cldnn::format::b_fs_yx_fsv16 &&
+            conv_layout.format != cldnn::format::b_fs_zyx_fsv16 &&
             conv_layout.format != cldnn::format::bs_fs_yx_bsv16_fsv16 &&
             conv_layout.format != cldnn::format::byxf_af32 &&
             conv_layout.format != cldnn::format::fs_bs_yx_bsv4_fsv32 &&
@@ -154,11 +155,15 @@ void prepare_padding::run(program_impl& p) {
                              (filter_layout.size.spatial[0] - 1) * dilation.spatial[0] + 1;
         auto input_limit_y = input_offset.spatial[1] + (conv_layout.size.spatial[1] - 1) * stride.spatial[1] +
                              (filter_layout.size.spatial[1] - 1) * dilation.spatial[1] + 1;
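+        // One past the last input Z element read by the final output position (dilated kernel extent included).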
+        auto input_limit_z = input_offset.spatial[2] + (conv_layout.size.spatial[2] - 1) * stride.spatial[2] +
+                             (filter_layout.size.spatial[2] - 1) * dilation.spatial[2] + 1;
 
-        auto left_padding = std::max(-input_offset.spatial[0], 0);
-        auto top_padding = std::max(-input_offset.spatial[1], 0);
-        auto right_padding = std::max(input_limit_x - prev_prim_output_layout.size.spatial[0], 0);
-        auto bottom_padding = std::max(input_limit_y - prev_prim_output_layout.size.spatial[1], 0);
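+        // Begin padding absorbs a negative input offset; end padding covers window overhang past the input.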
+        auto padding_begin_x = std::max(-input_offset.spatial[0], 0);
+        auto padding_begin_y = std::max(-input_offset.spatial[1], 0);
+        auto padding_begin_z = std::max(-input_offset.spatial[2], 0);
+        auto padding_end_x = std::max(input_limit_x - prev_prim_output_layout.size.spatial[0], 0);
+        auto padding_end_y = std::max(input_limit_y - prev_prim_output_layout.size.spatial[1], 0);
+        auto padding_end_z = std::max(input_limit_z - prev_prim_output_layout.size.spatial[2], 0);
 
         // Adjust right padding, so entire buffer size in X dimension is properly aligned.
         // TODO: NOTE: Will be reenabled with next check-in once heuristic for line-aligned algorithm will be added.
@@ -166,7 +171,7 @@ void prepare_padding::run(program_impl& p) {
         //    round_up_to(left_padding + prev_prim_output_layout.size.spatial[0] + right_padding, 16));
         // right_padding = needed_buffer_size_x - left_padding - prev_prim_output_layout.size.spatial[0];
 
-        cldnn::padding needed_padding({0, 0, left_padding, top_padding}, {0, 0, right_padding, bottom_padding}, 0);
+        cldnn::padding needed_padding({0, 0, padding_begin_x, padding_begin_y, padding_begin_z}, {0, 0, padding_end_x, padding_end_y, padding_end_z}, 0);
         needed_padding = padding::max(prev_prim_output_layout.data_padding, needed_padding);
         p.apply_needed_padding(node, conv_input_node, needed_padding);
     }
@@ -209,13 +214,17 @@ void prepare_padding::run(program_impl& p) {
                              (filter_layout.size.spatial[0] - 1) * dilation.spatial[0] + 1;
         auto input_limit_y = input_offset.spatial[1] + (conv_layout.size.spatial[1] - 1) * stride.spatial[1] +
                              (filter_layout.size.spatial[1] - 1) * dilation.spatial[1] + 1;
+        auto input_limit_z = input_offset.spatial[2] + (conv_layout.size.spatial[2] - 1) * stride.spatial[2] +
+                             (filter_layout.size.spatial[2] - 1) * dilation.spatial[2] + 1;
 
-        auto left_padding = std::max(-input_offset.spatial[0], 0);
-        auto top_padding = std::max(-input_offset.spatial[1], 0);
-        auto right_padding = std::max(input_limit_x - prev_prim_output_layout.size.spatial[0], 0);
-        auto bottom_padding = std::max(input_limit_y - prev_prim_output_layout.size.spatial[1], 0);
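+        // Same symmetric begin/end padding derivation as above, extended to cover Z.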
+        auto padding_begin_x = std::max(-input_offset.spatial[0], 0);
+        auto padding_begin_y = std::max(-input_offset.spatial[1], 0);
+        auto padding_begin_z = std::max(-input_offset.spatial[2], 0);
+        auto padding_end_x = std::max(input_limit_x - prev_prim_output_layout.size.spatial[0], 0);
+        auto padding_end_y = std::max(input_limit_y - prev_prim_output_layout.size.spatial[1], 0);
+        auto padding_end_z = std::max(input_limit_z - prev_prim_output_layout.size.spatial[2], 0);
 
-        cldnn::padding needed_padding({0, 0, left_padding, top_padding}, {0, 0, right_padding, bottom_padding}, 0);
+        cldnn::padding needed_padding({0, 0, padding_begin_x, padding_begin_y, padding_begin_z}, {0, 0, padding_end_x, padding_end_y, padding_end_z}, 0);
         needed_padding = padding::max(prev_prim_output_layout.data_padding, needed_padding);
 
         p.apply_needed_padding(node, conv_input_node, needed_padding);
index f2d8353..b4e7ae1 100644 (file)
@@ -1,5 +1,5 @@
 /*
-// Copyright (c) 2017-2019 Intel Corporation
+// Copyright (c) 2017-2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -205,6 +205,8 @@ inline std::string fmt_to_str(format fmt) {
             return "g_os_is_zyx_isv16_osv16";
         case format::g_os_is_yx_osv16_isv4:
             return "g_os_is_yx_osv16_isv4";
+        case format::g_os_is_zyx_osv16_isv16:
+            return "g_os_is_zyx_osv16_isv16";
         case format::g_os_zyx_is_osv16_isv4:
             return "g_os_zyx_is_osv16_isv4";
         case format::g_os_zyx_is_osv16_isv16:
index 79d9beb..c46a2fb 100644 (file)
@@ -327,6 +327,10 @@ kernel_selector::weights_layout to_weights_layout(format f) {
             return kernel_selector::weights_layout::g_os_is_zyx_isv16_osv16;
         case format::g_os_is_yx_osv16_isv4:
             return kernel_selector::weights_layout::g_os_is_yx_osv16_isv4;
+        case format::os_is_zyx_osv16_isv16:
+            return kernel_selector::weights_layout::os_is_zyx_osv16_isv16;
+        case format::g_os_is_zyx_osv16_isv16:
+            return kernel_selector::weights_layout::g_os_is_zyx_osv16_isv16;
         case format::g_os_zyx_is_osv16_isv4:
             return kernel_selector::weights_layout::g_os_zyx_is_osv16_isv4;
         case format::g_os_zyx_is_osv16_isv16:
@@ -450,6 +454,10 @@ cldnn::format::type from_weights_layout(kernel_selector::weights_layout l) {
             return cldnn::format::g_os_is_zyx_isv16_osv16;
         case kernel_selector::weights_layout::os_is_yx_osv16_isv4:
             return cldnn::format::g_os_is_yx_osv16_isv4;
+        case kernel_selector::weights_layout::os_is_zyx_osv16_isv16:
+            return cldnn::format::os_is_zyx_osv16_isv16;
+        case kernel_selector::weights_layout::g_os_is_zyx_osv16_isv16:
+            return cldnn::format::g_os_is_zyx_osv16_isv16;
         case kernel_selector::weights_layout::g_os_zyx_is_osv16_isv4:
             return cldnn::format::g_os_zyx_is_osv16_isv4;
         case kernel_selector::weights_layout::g_os_zyx_is_osv16_isv16:
index f142d3e..feee035 100644 (file)
@@ -375,9 +375,14 @@ bool layout_optimizer::convolution_b_fs_yx_fsv16_opt(layout const &input_layout,
         auto ks_x = weights_layout.size.spatial[0];
         auto ks_y = weights_layout.size.spatial[1];
 
+        size_t in_features_per_group = input_layout.size.feature[0] / conv->groups;
+        size_t out_features_per_group = weights_layout.size.batch[0] / conv->groups;
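+        // Grouped weight layouts already keep per-group output features in the batch dim, so skip the division.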
+        if (weights_layout.format.group_num() > 0) {
+            out_features_per_group = weights_layout.size.batch[0];
+        }
+
         // Check for non-grouped or depthwise convolution
         if (input_layout.size.spatial[2] == 1 &&
-            input_layout.size.batch[0] < 16 &&
             ((ks_x == 7 && ks_y == 7) || (ks_x == 3 && ks_y == 3) || (ks_x == 1 && ks_y == 1) || (ks_x == 5 && ks_y == 5)) &&
             weights_layout.size.batch[0] >= 16 &&
             ((conv->groups == 1 && conv->split() == 1) ||
@@ -388,12 +393,17 @@ bool layout_optimizer::convolution_b_fs_yx_fsv16_opt(layout const &input_layout,
             return true;
         // Check for grouped convolution
         else if (input_layout.size.spatial[2] == 1 && input_layout.size.batch[0] < 16 &&
-                 weights_layout.size.batch[0] >= 16 &&
-                ((input_layout.size.feature[0] / conv->groups) % 4 == 0) &&
-                ((conv->dilation.spatial[0] + 1) * (ks_x - 1)) < 16 &&
-                (conv->activations_zero_points.empty() && conv->weights_zero_points.empty()))
-            return true;
-
+                 out_features_per_group >= 16 &&
+                 // Need to extend imad fsv4 kernel to handle e.g. 3 input features per group
+                 (in_features_per_group % 4 == 0) &&
+                 ((conv->dilation.spatial[0] + 1) * (ks_x - 1)) <= 16 &&
+                 (conv->activations_zero_points.empty() && conv->weights_zero_points.empty()))
+            return true;
+        // Check for fsv16 imad kernel
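+        // Remaining i8/u8 4d cases can use the imad kernel when quantization is symmetric (no zero
+        // points) and the convolution is not fully depthwise (1 input and 1 output feature per group).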
+        else if ((input_layout.format.dimension() == 4) &&
+                 (conv->activations_zero_points.empty() && conv->weights_zero_points.empty()) &&
+                 (!((conv->groups > 1) && (in_features_per_group == 1) && (out_features_per_group == 1))))
+            return true;
         return false;
     }
     // A set of rules that define when b_fs_yx_fsv16 mem format can be used for fp16/fp32 case
@@ -459,6 +469,20 @@ bool layout_optimizer::convolution_b_fs_zyx_fsv16_opt(layout const &input_layout
         (weights_layout.size.batch[0] % 16 == 0 || (weights_layout.size.batch[0] == 8 && conv->groups > 1)) &&
         conv->dilation == tensor(1))
         return true;
+
+    size_t in_features_per_group = input_layout.size.feature[0] / conv->groups;
+    size_t out_features_per_group = weights_layout.size.batch[0] / conv->groups;
+    if (weights_layout.format.group_num() > 0) {
+        out_features_per_group = weights_layout.size.batch[0];
+    }
+
+    // Check for fsv16 imad kernel
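+    // 5d i8/u8 analogue of the 4d check: symmetric quantization only, fully depthwise cases excluded.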
+    if ((input_layout.format.dimension() == 5) &&
+        (conv->activations_zero_points.empty() && conv->weights_zero_points.empty()) &&
+        (input_layout.data_type == data_types::i8 || input_layout.data_type == data_types::u8) &&
+        (weights_layout.data_type == data_types::i8 || weights_layout.data_type == data_types::u8) &&
+        (!((conv->groups > 1) && (in_features_per_group == 1) && (out_features_per_group == 1))))
+        return true;
     return false;
 }
 
@@ -650,6 +674,9 @@ layout layout_optimizer::get_expected_layout(layout const& current_layout,
         } else if ((_optimization_attributes.b_fs_yx_fsv16_network &&
             convolution_b_fs_yx_fsv16_opt(input_layout, output_or_weights_layout, prim))) {
             expected_format = cldnn::format::b_fs_yx_fsv16;
+        } else if ((_optimization_attributes.b_fs_zyx_fsv16_network &&
+            convolution_b_fs_zyx_fsv16_opt(input_layout, output_or_weights_layout, prim))) {
+            expected_format = cldnn::format::b_fs_zyx_fsv16;
         } else {
             expected_format = imad_case(node);
         }
@@ -807,7 +834,8 @@ format layout_optimizer::get_preferred_format(program_node& node) {
             layout{ data_types::f32, format::bfyx, tensor{} }).format;
     } else if (node.is_type<quantize>()) {
         auto layout = node.get_output_layout();
-        if ((layout.data_type == data_types::i8 || layout.data_type == data_types::u8) &&
+        if (layout.format.spatial_num() == 2 &&
+            (layout.data_type == data_types::i8 || layout.data_type == data_types::u8) &&
             layout.size.batch[0] % 16 == 0)
                 expected = format::b_fs_yx_fsv4;
     } else if (node.is_type<reorder>() || node.is_type<input_layout>()) {
index 640105e..66974f2 100644 (file)
@@ -1,5 +1,5 @@
 /*
-// Copyright (c) 2016-2019 Intel Corporation
+// Copyright (c) 2016-2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -149,26 +149,32 @@ layout pooling_inst::calc_output_layout(parent::typed_node const& node) {
                           0,
                           "Input offset in batch is not supported");
 
-    if (input_layout.format == format::bfzyx) {
+    if (input_layout.format.spatial_num() == 3) {
         // 3D
         CLDNN_ERROR_LESS_OR_EQUAL_THAN(node.id(),
-                                       "stride spatial Z",
-                                       stride.spatial[1],
-                                       "",
-                                       0,
-                                       "Stride spatial Z must be positive (>= 1)");
+                               "stride spatial Z",
+                               stride.spatial[2],
+                               "",
+                               0,
+                               "Stride spatial Z must be positive (>= 1)");
         CLDNN_ERROR_LESS_OR_EQUAL_THAN(node.id(),
-                                       "window size spatial Z",
-                                       window_size.spatial[2],
-                                       "",
-                                       0,
-                                       "Size Z (of pooling window) must be positive (>= 1)");
+                               "window size spatial Z",
+                               window_size.spatial[2],
+                               "",
+                               0,
+                               "Size Z (of pooling window) must be positive (>= 1)");
+        CLDNN_ERROR_GREATER_THAN(node.id(),
+                               "Input offset spatial Z",
+                               2 * input_offset.spatial[2],
+                               "input layout size spatial Z",
+                               input_layout.size.spatial[2],
+                               "Input offset is greater than input data range. There is no input data to process");
         CLDNN_ERROR_GREATER_THAN(node.id(),
-                                 "Input offset spatial Z",
-                                 2 * input_offset.spatial[2],
-                                 "input layout size spatial Z",
-                                 input_layout.size.spatial[2],
-                                 "Input offset is greater than input data range. There is no input data to process");
+                               "Negate input offset spatial Z",
+                               -input_offset.spatial[2],
+                               "input window size spatial Z",
+                               window_size.spatial[2],
+                               "First pool is outside of image. please reduce input offset Z");
     }
 
     if (desc->with_output_size) {
index 33b5e55..440f2c1 100644 (file)
@@ -78,83 +78,169 @@ struct convolution_accumulator<uint8_t> {
 };
 
 template<typename InputT, typename OutputT = InputT, typename WeightsT = InputT,  typename AccT = typename convolution_accumulator<InputT>::type>
-VVF<OutputT> reference_convolve(VVVF<InputT> &input, VVVF<WeightsT> &filter, int stride_y, int stride_x, float bias, int dilation_y = 1, int dilation_x = 1,
-        int input_padding_y = 0, int input_padding_x = 0, int output_padding_y = 0,
-        int output_padding_x = 0, size_t f_begin = 0, size_t f_end = 0, bool depthwise = false, bool grouped = false,
+VVVF<OutputT> reference_convolve(VVVVF<InputT> &input, VVVVF<WeightsT> &filter,
+        int stride_z, int stride_y, int stride_x,
+        float bias,
+        int dilation_z = 1, int dilation_y = 1, int dilation_x = 1,
+        int input_padding_z = 0, int input_padding_y = 0, int input_padding_x = 0,
+        int output_padding_z = 0, int output_padding_y = 0, int output_padding_x = 0,
+        size_t f_begin = 0, size_t f_end = 0, bool depthwise = false, bool grouped = false,
         const VF<InputT>& data_zp = {}, const WeightsT& weights_zp = 0)
 {
-    size_t kernel_extent_y = dilation_y * (filter[0].size() - 1) + 1;
-    size_t kernel_extent_x = dilation_x * (filter[0][0].size() - 1) + 1;
-    size_t output_y = 1 + (input[0].size() - kernel_extent_y + 2 * input_padding_y) / stride_y + 2 * output_padding_y;
-    size_t output_x = 1 + (input[0][0].size() - kernel_extent_x + 2 * input_padding_x) / stride_x + 2 * output_padding_x;
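+    // A dilated kernel covers dilation * (k - 1) + 1 input elements along each axis.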
+    size_t kernel_extent_z = dilation_z * (filter[0].size() - 1) + 1;
+    size_t kernel_extent_y = dilation_y * (filter[0][0].size() - 1) + 1;
+    size_t kernel_extent_x = dilation_x * (filter[0][0][0].size() - 1) + 1;
+
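+    // Output extent per axis: 1 + (in - dilated_extent + 2 * in_pad) / stride, plus explicit output padding.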
+    size_t output_z = 1 + (input[0].size() - kernel_extent_z + 2 * input_padding_z) / stride_z + 2 * output_padding_z;
+    size_t output_y = 1 + (input[0][0].size() - kernel_extent_y + 2 * input_padding_y) / stride_y + 2 * output_padding_y;
+    size_t output_x = 1 + (input[0][0][0].size() - kernel_extent_x + 2 * input_padding_x) / stride_x + 2 * output_padding_x;
+
     bool asymm_data = !data_zp.empty();
     bool asymm_weights = weights_zp != static_cast<WeightsT>(0);
-    VVF<OutputT> output(output_y, VF<OutputT>(output_x, 0));
+    VVVF<OutputT> output(output_z, VVF<OutputT>(output_y, VF<OutputT>(output_x, 0)));
     size_t filter_begin = f_begin ? f_begin : 0;
     size_t filter_end = f_end ? f_end : filter.size();
     for (size_t f = filter_begin; f < filter_end; ++f) {
-        for (size_t y = 0; y < (output_y - 2 * output_padding_y); ++y) {
-            for (size_t x = 0; x < (output_x - 2 * output_padding_x); ++x) {
-                VF<AccT> values;
-                values.reserve(filter[0].size() * filter[0][0].size());
-                for (size_t yf = 0; yf < filter[0].size(); ++yf) {
-                    int yi = -input_padding_y + (int)yf * dilation_y + stride_y * (int)y;
-                    bool yi_inside = yi >= 0 && (int)input[0].size() > yi;
-                    if (!yi_inside) continue;
-                    for (size_t xf = 0; xf < filter[0][0].size(); ++xf) {
-                        int xi = -input_padding_x + (int)xf * dilation_x + stride_x * (int)x;
-                        bool xi_inside = xi >= 0 && (int)input[0][0].size() > xi;
-                        if (!xi_inside) continue;
-
-                        auto input_val = static_cast<AccT>(input[f][yi][xi]);
-
-                        if (asymm_data) {
-                            input_val = input_val - static_cast<AccT>(data_zp[f]);
-                        }
+        for (size_t z = 0; z < (output_z - 2 * output_padding_z); ++z) {
+            for (size_t y = 0; y < (output_y - 2 * output_padding_y); ++y) {
+                for (size_t x = 0; x < (output_x - 2 * output_padding_x); ++x) {
+                    VF<AccT> values;
+                    values.reserve(filter[0].size() * filter[0][0].size() * filter[0][0][0].size());
+                    for (size_t zf = 0; zf < filter[0].size(); ++zf) {
+                        int zi = -input_padding_z + (int)zf * dilation_z + stride_z * (int)z;
+                        bool zi_inside = zi >= 0 && (int)input[0].size() > zi;
+                        if (!zi_inside) continue;
+                        for (size_t yf = 0; yf < filter[0][0].size(); ++yf) {
+                            int yi = -input_padding_y + (int)yf * dilation_y + stride_y * (int)y;
+                            bool yi_inside = yi >= 0 && (int)input[0][0].size() > yi;
+                            if (!yi_inside) continue;
+                            for (size_t xf = 0; xf < filter[0][0][0].size(); ++xf) {
+                                int xi = -input_padding_x + (int)xf * dilation_x + stride_x * (int)x;
+                                bool xi_inside = xi >= 0 && (int)input[0][0][0].size() > xi;
+                                if (!xi_inside) continue;
+
+                                auto input_val = static_cast<AccT>(input[f][zi][yi][xi]);
+
+                                if (asymm_data) {
+                                    input_val = input_val - static_cast<AccT>(data_zp[f]);
+                                }
 
-                        AccT weights_val;
-                        if (!depthwise && !grouped) {
-                            weights_val = static_cast<AccT>(filter[f][yf][xf]);
-                        } else if (grouped) {
-                            weights_val = static_cast<AccT>(filter[f - filter_begin][yf][xf]);
-                        }
-                        else {
-                            weights_val = static_cast<AccT>(filter[0][yf][xf]);
-                        }
+                                AccT weights_val;
+                                if (!depthwise && !grouped) {
+                                    weights_val = static_cast<AccT>(filter[f][zf][yf][xf]);
+                                } else if (grouped) {
+                                    weights_val = static_cast<AccT>(filter[f - filter_begin][zf][yf][xf]);
+                                }
+                                else {
+                                    weights_val = static_cast<AccT>(filter[0][zf][yf][xf]);
+                                }
 
-                        if (asymm_weights) {
-                            weights_val = weights_val - static_cast<AccT>(weights_zp);
-                        }
+                                if (asymm_weights) {
+                                    weights_val = weights_val - static_cast<AccT>(weights_zp);
+                                }
 
-                        values.push_back(input_val * weights_val);
+                                values.push_back(input_val * weights_val);
+                            }
+                        }
                     }
+                    output[z + output_padding_z][y + output_padding_y][x + output_padding_x] += static_cast<OutputT>(kahan_summation<AccT>(values));
                 }
-                output[y + output_padding_y][x + output_padding_x] += static_cast<OutputT>(kahan_summation<AccT>(values));
             }
         }
     }
 
-    for (size_t y = 0; y < (output_y - 2 * output_padding_y); ++y) {
-        for (size_t x = 0; x < (output_x - 2 * output_padding_x); ++x) {
-            output[y + output_padding_y][x + output_padding_x] += static_cast<OutputT>(bias);
+    for (size_t z = 0; z < (output_z - 2 * output_padding_z); ++z) {
+        for (size_t y = 0; y < (output_y - 2 * output_padding_y); ++y) {
+            for (size_t x = 0; x < (output_x - 2 * output_padding_x); ++x) {
+                output[z + output_padding_z][y + output_padding_y][x + output_padding_x] += static_cast<OutputT>(bias);
+            }
         }
     }
     return output;
 }
 
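+// 2d wrapper: lifts input and filter to a single z-slice, runs the 3d reference convolution and drops z again.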
+template<typename InputT, typename OutputT = InputT, typename WeightsT = InputT,  typename AccT = typename convolution_accumulator<InputT>::type>
+VVF<OutputT> reference_convolve(VVVF<InputT> &input, VVVF<WeightsT> &filter, int stride_y, int stride_x, float bias, int dilation_y = 1, int dilation_x = 1,
+        int input_padding_y = 0, int input_padding_x = 0, int output_padding_y = 0,
+        int output_padding_x = 0, size_t f_begin = 0, size_t f_end = 0, bool depthwise = false, bool grouped = false,
+        const VF<InputT>& data_zp = {}, const WeightsT& weights_zp = 0)
+{
+    VVVVF<InputT> input_extended(input.size(), VVVF<InputT>(1, VVF<InputT>(input[0].size(), VF<InputT>(input[0][0].size(), 0))));
+    for (size_t fi = 0; fi < input.size(); fi++) {
+        for (size_t yi = 0; yi < input[0].size(); yi++) {
+            for (size_t xi = 0; xi < input[0][0].size(); xi++) {
+                input_extended[fi][0][yi][xi] = input[fi][yi][xi];
+            }
+        }
+    }
+
+    VVVVF<WeightsT> filter_extended(filter.size(), VVVF<WeightsT>(1, VVF<WeightsT>(filter[0].size(), VF<WeightsT>(filter[0][0].size(), 0))));
+    for (size_t fi = 0; fi < filter.size(); fi++) {
+        for (size_t yi = 0; yi < filter[0].size(); yi++) {
+            for (size_t xi = 0; xi < filter[0][0].size(); xi++) {
+                filter_extended[fi][0][yi][xi] = filter[fi][yi][xi];
+            }
+        }
+    }
+
+    VVVF<OutputT> output = reference_convolve<InputT, OutputT, WeightsT, AccT>(input_extended, filter_extended,
+        1, stride_y, stride_x,
+        bias,
+        1, dilation_y, dilation_x,
+        0, input_padding_y, input_padding_x,
+        0, output_padding_y, output_padding_x,
+        f_begin, f_end, depthwise, grouped,
+        data_zp, weights_zp);
+
+    VVF<OutputT> output_shrinked(output[0].size(), VF<OutputT>(output[0][0].size(), 0));
+
+    for (size_t yi = 0; yi < output[0].size(); yi++) {
+        for (size_t xi = 0; xi < output[0][0].size(); xi++) {
+            output_shrinked[yi][xi] = output[0][yi][xi];
+        }
+    }
+
+    return output_shrinked;
+}
+
 template <typename T>
-VVF<T> reference_scale_post_op(const VVF<T>& input, const T& scale, const T& shift) {
+VVVF<T> reference_scale_post_op(const VVVF<T>& input, const T& scale, const T& shift) {
     auto output = input;
-    auto size_y = input.size();
-    auto size_x = input[0].size();
-    for (size_t yi = 0; yi < size_y; ++yi) {
-        for (size_t xi = 0; xi < size_x; ++xi) {
-            output[yi][xi] = output[yi][xi] * scale + shift;
+    auto size_z = input.size();
+    auto size_y = input[0].size();
+    auto size_x = input[0][0].size();
+    for (size_t zi = 0; zi < size_z; ++zi) {
+        for (size_t yi = 0; yi < size_y; ++yi) {
+            for (size_t xi = 0; xi < size_x; ++xi) {
+                output[zi][yi][xi] = output[zi][yi][xi] * scale + shift;
+            }
         }
     }
     return output;
 }
 
+
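+// 2d wrapper over the 3d scale post-op reference above.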
+template <typename T>
+VVF<T> reference_scale_post_op(const VVF<T>& input, const T& scale, const T& shift) {
+    VVVF<T> input_extended(1, VVF<T>(input.size(), VF<T>(input[0].size(), 0)));
+    for (size_t yi = 0; yi < input.size(); yi++) {
+        for (size_t xi = 0; xi < input[0].size(); xi++) {
+            input_extended[0][yi][xi] = input[yi][xi];
+        }
+    }
+    VVVF<T> output = reference_scale_post_op<T>(input_extended, scale, shift);
+    VVF<T> output_shrinked(output[0].size(), VF<T>(output[0][0].size(), 0));
+    for (size_t yi = 0; yi < output[0].size(); yi++) {
+        for (size_t xi = 0; xi < output[0][0].size(); xi++) {
+            output_shrinked[yi][xi] = output[0][yi][xi];
+        }
+    }
+
+    return output_shrinked;
+}
+
 void dump_buffer(memory const& mem, std::string const& name)
 {
     std::ofstream out(name);
@@ -172,14 +258,17 @@ void dump_buffer(memory const& mem, std::string const& name)
         for (int f = 0; f < size.feature[0]; ++f)
         {
             out << "feature " << f << ":\n";
-            for (int y = 0; y < size.spatial[1]; ++y)
+            for (int z = 0; z < size.spatial[2]; ++z)
             {
-                for (int x = 0; x < size.spatial[0]; ++x)
+                for (int y = 0; y < size.spatial[1]; ++y)
                 {
-                    size_t idx = b * pitches.batch[0] + f * pitches.feature[0] + y * pitches.spatial[1] + x * pitches.spatial[0];
-                    out << ptr[idx] << " ";
+                    for (int x = 0; x < size.spatial[0]; ++x)
+                    {
+                        size_t idx = b * pitches.batch[0] + f * pitches.feature[0] + z * pitches.spatial[2] + y * pitches.spatial[1] + x * pitches.spatial[0];
+                        out << ptr[idx] << " ";
+                    }
+                    out << "\n";
                 }
-                out << "\n";
             }
 
             out << "\n";
@@ -4798,15 +4887,18 @@ using TestParamType_convolution_depthwise_gpu = ::testing::tuple<int,   // 0 - I
         bool>; // 6 - With bias
 
 using TestParamType_grouped_convolution_gpu = ::testing::tuple<  int,    // 0 - Input X size
-        int,        // 1 - Input Y size
-        int,        // 2 - Input features
-        int,        // 3 - Output features
-        int,        // 4 - Kernel sizeX
-        int,        // 5 - Kernel sizeY
-        int,        // 6 - Groups number
-        int,        // 7 - Stride
-        int,        // 8 - Batch
-        format>;    // 9 - Input data format
+        int,            // 1  - Input Y size
+        int,            // 2  - Input Z size
+        int,            // 3  - Input features
+        int,            // 4  - Output features
+        int,            // 5  - Kernel sizeX
+        int,            // 6  - Kernel sizeY
+        int,            // 7  - Kernel sizeZ
+        int,            // 8  - Groups number
+        int,            // 9  - Stride
+        int,            // 10 - Batch
+        format,         // 11 - Input data format
+        std::string>;   // 12 - Implementation name
 
 struct convolution_gpu : public ::testing::TestWithParam<TestParamType_convolution_gpu>
 {
@@ -4880,16 +4972,24 @@ struct convolution_grouped_gpu : public ::testing::TestWithParam<TestParamType_g
     static std::string PrintToStringParamName(
         testing::TestParamInfo<TestParamType_grouped_convolution_gpu> param_info) {
         // construct a readable name
-        return "in" + std::to_string(testing::get<0>(param_info.param)) + "x" +
-               std::to_string(testing::get<1>(param_info.param)) + "y" +
-               std::to_string(testing::get<2>(param_info.param)) + "f" +
-               "_output" + std::to_string(testing::get<3>(param_info.param)) + "f" +
-               "_filter" + std::to_string(testing::get<4>(param_info.param)) + "x" +
-                           std::to_string(testing::get<5>(param_info.param)) + "y" +
-               "_groups" + std::to_string(testing::get<6>(param_info.param)) +
-               "_stride" + std::to_string(testing::get<7>(param_info.param)) +
-               "_batch"  + std::to_string(testing::get<8>(param_info.param)) +
-               "_format" + std::to_string(testing::get<9>(param_info.param));
+        std::string res = "in" + std::to_string(testing::get<0>(param_info.param)) + "x" +
+            std::to_string(testing::get<1>(param_info.param)) + "y" +
+            std::to_string(testing::get<2>(param_info.param)) + "z" +
+            std::to_string(testing::get<3>(param_info.param)) + "f" +
+            "_output" + std::to_string(testing::get<4>(param_info.param)) + "f" +
+            "_filter" + std::to_string(testing::get<5>(param_info.param)) + "x" +
+            std::to_string(testing::get<6>(param_info.param)) + "y" +
+            std::to_string(testing::get<7>(param_info.param)) + "z" +
+            "_groups" + std::to_string(testing::get<8>(param_info.param)) +
+            "_stride" + std::to_string(testing::get<9>(param_info.param)) +
+            "_batch" + std::to_string(testing::get<10>(param_info.param)) +
+            "_format" + std::to_string(testing::get<11>(param_info.param));
+
+        if (testing::get<12>(param_info.param) != "") {
+            res += "_impl_" + testing::get<12>(param_info.param);
+        }
+
+        return res;
     }
 };
 
@@ -6957,26 +7057,47 @@ INSTANTIATE_TEST_CASE_P(convolution_depthwise_gpu_bfyx,
 INSTANTIATE_TEST_CASE_P(convolution_grouped_fsv4_fsv16,
                         convolution_grouped_gpu,
                         ::testing::Values(
-                            // Input X size, Input Y size, Input features, Output features, Kernel size X, Kernel size Y,
-                            // Groups number, Stride, Output padding, Batch, Input data format
+                            // Input X size, Input Y size, Input Z size, Input features, Output features,
+                            // Kernel size X, Kernel size Y, Kernel size Z, Groups number, Stride, Batch,
+                            // Input data format, Implementation name
+
                             // Format: b_fs_yx_fsv4
-                            TestParamType_grouped_convolution_gpu(4, 4, 16, 17, 3, 3, 1, 1, 1, format::b_fs_yx_fsv4),
-                            TestParamType_grouped_convolution_gpu(4, 4, 16, 16, 3, 3, 4, 1, 1, format::b_fs_yx_fsv4),
-                            TestParamType_grouped_convolution_gpu(4, 4, 8, 4, 2, 2, 2, 1, 4, format::b_fs_yx_fsv4),
-                            TestParamType_grouped_convolution_gpu(8, 8, 16, 16, 4, 4, 4, 1, 1, format::b_fs_yx_fsv4),
-                            TestParamType_grouped_convolution_gpu(17, 17, 32, 96, 3, 3, 2, 2, 2, format::b_fs_yx_fsv4),
-                            TestParamType_grouped_convolution_gpu(16, 16, 8, 48, 2, 2, 2, 2, 1, format::b_fs_yx_fsv4),
-                            TestParamType_grouped_convolution_gpu(3, 3, 48, 96, 2, 2, 2, 8, 1, format::b_fs_yx_fsv4),
-                            TestParamType_grouped_convolution_gpu(6, 6, 8, 26, 3, 3, 2, 4, 1, format::b_fs_yx_fsv4),
+                            TestParamType_grouped_convolution_gpu(4, 4, 1, 16, 17, 3, 3, 1, 1, 1, 1, format::b_fs_yx_fsv4, ""),
+                            TestParamType_grouped_convolution_gpu(4, 4, 1, 16, 16, 3, 3, 1, 4, 1, 1, format::b_fs_yx_fsv4, ""),
+                            TestParamType_grouped_convolution_gpu(4, 4, 1, 8, 4, 2, 2, 1, 2, 1, 4, format::b_fs_yx_fsv4, ""),
+                            TestParamType_grouped_convolution_gpu(8, 8, 1, 16, 16, 4, 4, 1, 4, 1, 1, format::b_fs_yx_fsv4, ""),
+                            TestParamType_grouped_convolution_gpu(17, 17, 1, 32, 96, 3, 3, 1, 2, 2, 2, format::b_fs_yx_fsv4, ""),
+                            TestParamType_grouped_convolution_gpu(16, 16, 1, 8, 48, 2, 2, 1, 2, 2, 1, format::b_fs_yx_fsv4, ""),
+                            TestParamType_grouped_convolution_gpu(3, 3, 1, 48, 96, 2, 2, 1, 2, 8, 1, format::b_fs_yx_fsv4, ""),
+                            TestParamType_grouped_convolution_gpu(6, 6, 1, 8, 26, 3, 3, 1, 2, 4, 1, format::b_fs_yx_fsv4, ""),
+
                             // Format: b_fs_yx_fsv16
-                            TestParamType_grouped_convolution_gpu(4, 4, 16, 17, 3, 3, 1, 1, 1, format::b_fs_yx_fsv16),
-                            TestParamType_grouped_convolution_gpu(4, 4, 16, 16, 3, 3, 4, 1, 1, format::b_fs_yx_fsv16),
-                            TestParamType_grouped_convolution_gpu(4, 4, 8, 4, 2, 2, 2, 1, 4, format::b_fs_yx_fsv16),
-                            TestParamType_grouped_convolution_gpu(8, 8, 16, 16, 4, 4, 4, 1, 1, format::b_fs_yx_fsv16),
-                            TestParamType_grouped_convolution_gpu(17, 17, 32, 96, 3, 3, 2, 2, 2, format::b_fs_yx_fsv16),
-                            TestParamType_grouped_convolution_gpu(16, 16, 8, 48, 2, 2, 2, 2, 1, format::b_fs_yx_fsv16),
-                            TestParamType_grouped_convolution_gpu(3, 3, 48, 96, 2, 2, 2, 8, 1, format::b_fs_yx_fsv16),
-                            TestParamType_grouped_convolution_gpu(6, 6, 8, 26, 3, 3, 2, 4, 1, format::b_fs_yx_fsv16)
+                            TestParamType_grouped_convolution_gpu(4, 4, 1, 16, 17, 3, 3, 1, 1, 1, 1, format::b_fs_yx_fsv16, ""),
+                            TestParamType_grouped_convolution_gpu(4, 4, 1, 16, 16, 3, 3, 1, 4, 1, 1, format::b_fs_yx_fsv16, ""),
+                            TestParamType_grouped_convolution_gpu(4, 4, 1, 8, 4, 2, 2, 1, 2, 1, 4, format::b_fs_yx_fsv16, ""),
+                            TestParamType_grouped_convolution_gpu(8, 8, 1, 16, 16, 4, 4, 1, 4, 1, 1, format::b_fs_yx_fsv16, ""),
+                            TestParamType_grouped_convolution_gpu(17, 17, 1, 32, 96, 3, 3, 1, 2, 2, 2, format::b_fs_yx_fsv16, ""),
+                            TestParamType_grouped_convolution_gpu(16, 16, 1, 8, 48, 2, 2, 1, 2, 2, 1, format::b_fs_yx_fsv16, ""),
+                            TestParamType_grouped_convolution_gpu(3, 3, 1, 48, 96, 2, 2, 1, 2, 8, 1, format::b_fs_yx_fsv16, ""),
+                            TestParamType_grouped_convolution_gpu(6, 6, 1, 8, 26, 3, 3, 1, 2, 4, 1, format::b_fs_yx_fsv16, ""),
+
+                            // Format: b_fs_zyx_fsv16
+                            TestParamType_grouped_convolution_gpu(4, 4, 4, 16, 17, 3, 3, 3, 1, 1, 1, format::b_fs_zyx_fsv16, ""),
+                            TestParamType_grouped_convolution_gpu(4, 4, 4, 16, 16, 3, 3, 3, 4, 1, 1, format::b_fs_zyx_fsv16, ""),
+                            TestParamType_grouped_convolution_gpu(4, 4, 4, 8, 4, 2, 2, 2, 2, 1, 4, format::b_fs_zyx_fsv16, ""),
+                            TestParamType_grouped_convolution_gpu(8, 8, 8, 16, 16, 4, 4, 4, 4, 1, 1, format::b_fs_zyx_fsv16, ""),
+                            TestParamType_grouped_convolution_gpu(17, 17, 17, 32, 96, 3, 3, 3, 2, 2, 2, format::b_fs_zyx_fsv16, ""),
+                            TestParamType_grouped_convolution_gpu(16, 16, 16, 8, 48, 2, 2, 2, 2, 2, 1, format::b_fs_zyx_fsv16, ""),
+                            TestParamType_grouped_convolution_gpu(3, 3, 3, 48, 96, 2, 2, 2, 2, 8, 1, format::b_fs_zyx_fsv16, ""),
+                            TestParamType_grouped_convolution_gpu(6, 6, 6, 8, 26, 3, 3, 3, 2, 4, 1, format::b_fs_zyx_fsv16, ""),
+                            TestParamType_grouped_convolution_gpu(7, 5, 3, 51, 99, 3, 3, 3, 3, 1, 1, format::b_fs_zyx_fsv16, ""),
+                            TestParamType_grouped_convolution_gpu(8, 6, 4, 32, 64, 2, 2, 2, 2, 1, 1, format::b_fs_zyx_fsv16, ""),
+                            TestParamType_grouped_convolution_gpu(33, 6, 4, 16, 32, 4, 3, 2, 2, 1, 1, format::b_fs_zyx_fsv16, ""),
+                            TestParamType_grouped_convolution_gpu(33, 1, 1, 30, 62, 1, 1, 1, 2, 1, 1, format::b_fs_zyx_fsv16, ""),
+                            TestParamType_grouped_convolution_gpu(3, 1, 5, 196, 252, 3, 1, 3, 4, 1, 1, format::b_fs_zyx_fsv16, ""),
+                            TestParamType_grouped_convolution_gpu(4, 1, 6, 256, 256, 2, 1, 2, 4, 1, 1, format::b_fs_zyx_fsv16, ""),
+                            TestParamType_grouped_convolution_gpu(4, 1, 6, 256, 512, 2, 1, 3, 16, 1, 1, format::b_fs_zyx_fsv16, ""),
+                            TestParamType_grouped_convolution_gpu(1, 3, 1, 18, 2, 1, 3, 1, 2, 1, 1, format::b_fs_zyx_fsv16, "")
                         ),
                         convolution_grouped_gpu::PrintToStringParamName);
 
@@ -6985,42 +7106,66 @@ TEST_P(convolution_grouped_gpu, base) {
 
     const int input_x = testing::get<0>(GetParam()),
               input_y = testing::get<1>(GetParam()),
-              input_f = testing::get<2>(GetParam()),
-              output_f = testing::get<3>(GetParam()),
-              filter_x = testing::get<4>(GetParam()),
-              filter_y = testing::get<5>(GetParam()),
-              groups = testing::get<6>(GetParam()),
-              stride = testing::get<7>(GetParam()),
-              batch_num = testing::get<8>(GetParam()),
+              input_z = testing::get<2>(GetParam()),
+              input_f = testing::get<3>(GetParam()),
+              output_f = testing::get<4>(GetParam()),
+              filter_x = testing::get<5>(GetParam()),
+              filter_y = testing::get<6>(GetParam()),
+              filter_z = testing::get<7>(GetParam()),
+              groups = testing::get<8>(GetParam()),
+              stride = testing::get<9>(GetParam()),
+              batch_num = testing::get<10>(GetParam()),
               output_padding = 0,
-              input_offset_y = (filter_x - 1) / 2,
-              input_offset_x = (filter_y - 1) / 2;
-    auto input_data_format = testing::get<9>(GetParam());
-
-    auto input_size = tensor(batch(batch_num), feature(input_f), spatial(input_x, input_y));
-    auto input_rnd = generate_random_4d<uint8_t>(batch_num, input_f, input_y, input_x, 0, 255);
-    auto input_rnd_vec = flatten_4d<uint8_t>(format::bfyx, input_rnd);
-    auto input = memory::allocate(engine, {data_types::u8, format::bfyx, input_size});
-    set_values(input, input_rnd_vec);
+              input_offset_z = (filter_z - 1) / 2,
+              input_offset_y = (filter_y - 1) / 2,
+              input_offset_x = (filter_x - 1) / 2;
+    auto input_data_format = testing::get<11>(GetParam());
+    auto impl_name = testing::get<12>(GetParam());
+
+    auto num_in_spatial_dims = input_data_format.spatial_num();
+
+    auto input_size = tensor(batch(batch_num), feature(input_f), spatial(input_x, input_y, input_z));
+    auto input_rnd = generate_random_5d<uint8_t>(batch_num, input_f, input_z, input_y, input_x, 0, 255);
+
+    auto input_lay = layout(data_types::u8, format::bfzyx, input_size);
+    if (num_in_spatial_dims == 2) {
+        input_lay = layout(data_types::u8, format::bfyx, input_size);
+    }
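+    // Write the random 5d input into the chosen layout's linear order.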
+    std::vector<uint8_t> input_flat(input_lay.get_linear_size());
+    for (int b = 0; b < batch_num; b++)
+        for (int f = 0; f < input_f; f++)
+            for (int z = 0; z < input_z; z++)
+                for (int y = 0; y < input_y; y++)
+                    for (int x = 0; x < input_x; x++) {
+                        tensor coords = tensor(batch(b), feature(f), spatial(x, y, z, 0));
+                        size_t offset = input_lay.get_linear_offset(coords);
+                        input_flat[offset] = input_rnd[b][f][z][y][x];
+                    }
+    auto input = memory::allocate(engine, input_lay);
+    set_values(input, input_flat);
 
-    auto weights_size = tensor(group(groups), batch(output_f / groups), feature(input_f / groups), spatial(filter_x, filter_y));
-    VVVVVF<int8_t> weights_rnd = generate_random_5d<int8_t>(groups, output_f / groups, input_f / groups, filter_y, filter_x, -127, 127);
-    auto weights_lay = layout(data_types::i8, format::goiyx, weights_size);
+    auto weights_size = tensor(group(groups), batch(output_f / groups), feature(input_f / groups), spatial(filter_x, filter_y, filter_z));
 
+    VVVVVVF<int8_t> weights_rnd = generate_random_6d<int8_t>(groups, output_f / groups, input_f / groups, filter_z, filter_y, filter_x, -127, 127);
+    auto weights_lay = layout(data_types::i8, format::goizyx, weights_size);
+    if (num_in_spatial_dims == 2) {
+        weights_lay = layout(data_types::i8, format::goiyx, weights_size);
+    }
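+    // Linearize the grouped 6d weights the same way, following the (g, o, i, z, y, x) layout order.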
     std::vector<int8_t> weights_flat(weights_lay.get_linear_size());
     for (int gi = 0; gi < groups; ++gi)
         for (int ofi = 0; ofi < output_f / groups; ++ofi)
             for (int ifi = 0; ifi < input_f / groups; ++ifi)
-                for (int kyi = 0; kyi < filter_y; ++kyi)
-                    for (int kxi = 0; kxi < filter_x; ++kxi) {
-                        tensor coords = tensor(group(gi), batch(ofi), feature(ifi), spatial(kxi, kyi, 0, 0));
-                        size_t offset = weights_lay.get_linear_offset(coords);
-                        weights_flat[offset] = weights_rnd[gi][ofi][ifi][kyi][kxi];
-                    }
-    auto weights = memory::allocate(engine, {data_types::i8, format::goiyx, weights_size});
+                for (int kzi = 0; kzi < filter_z; ++kzi)
+                    for (int kyi = 0; kyi < filter_y; ++kyi)
+                        for (int kxi = 0; kxi < filter_x; ++kxi) {
+                            tensor coords = tensor(group(gi), batch(ofi), feature(ifi), spatial(kxi, kyi, kzi, 0));
+                            size_t offset = weights_lay.get_linear_offset(coords);
+                            weights_flat[offset] = weights_rnd[gi][ofi][ifi][kzi][kyi][kxi];
+                        }
+    auto weights = memory::allocate(engine, weights_lay);
     set_values(weights, weights_flat);
 
-    VVVVF<float> expected_result(batch_num, VVVF<float>(output_f));
+    VVVVVF<float> expected_result(batch_num, VVVVF<float>(output_f));
 
     // Calculate reference values without bias
     for (int bi = 0; bi < batch_num; ++bi)
@@ -7031,15 +7176,15 @@ TEST_P(convolution_grouped_gpu, base) {
                 int f_end = gi * input_f / groups + input_f / groups;
 
                 expected_result[bi][ofi + gi * output_f / groups] = reference_convolve<uint8_t, float, int8_t>(
-                    input_rnd[bi], weights_rnd[gi][ofi],  // input, weights
-                    stride, stride,                       // strides
-                    0,                                    // bias
-                    1, 1,                                 // dilation
-                    input_offset_y, input_offset_x,       // input padding
-                    0, 0,                                 // output_padding
-                    f_begin, f_end,                       // f_begin, f_end
-                    false,                                // depthwise
-                    grouped);                             // grouped
+                    input_rnd[bi], weights_rnd[gi][ofi],            // input, weights
+                    stride, stride, stride,                         // strides
+                    0,                                              // bias
+                    1, 1, 1,                                        // dilation
+                    input_offset_z, input_offset_y, input_offset_x, // input padding
+                    0, 0, 0,                                        // output_padding
+                    f_begin, f_end,                                 // f_begin, f_end
+                    false,                                          // depthwise
+                    grouped);                                       // grouped
             }
 
     topology topology(input_layout("input", input.get_layout()),
@@ -7049,14 +7194,14 @@ TEST_P(convolution_grouped_gpu, base) {
                                   "input_fsv",
                                   {"weights"},
                                   groups,
-                                  {1, 1, stride, stride},
-                                  {0, 0, -input_offset_x, -input_offset_y},
-                                  {1, 1, 1, 1},
-                                  padding({0, 0, output_padding, output_padding}, 0.f)));
+                                  tensor(batch(1), feature(1), spatial(stride, stride, stride, 1)),
+                                  tensor(batch(0), feature(0), spatial(-input_offset_x, -input_offset_y, -input_offset_z, 0)),
+                                  tensor(batch(1), feature(1), spatial(1, 1, 1, 1)),
+                                  padding({0, 0, output_padding, output_padding, output_padding}, 0.f)));
 
     build_options options;
     options.set_option(build_option::optimize_data(true));
-    implementation_desc conv_impl = {input_data_format, "fused_conv_eltwise_gpu_imad"};
+    implementation_desc conv_impl = {input_data_format, impl_name};
     options.set_option(build_option::force_implementations({{"conv", conv_impl}}));
 
     network network(engine, topology, options);
@@ -7070,24 +7215,26 @@ TEST_P(convolution_grouped_gpu, base) {
     ASSERT_EQ(out_mem.get_layout().format, input_data_format);
     ASSERT_EQ(out_lay.size.batch[0], expected_result.size());
     ASSERT_EQ(out_lay.size.feature[0], expected_result[0].size());
-    ASSERT_EQ(out_lay.size.spatial[1], expected_result[0][0].size());
-    ASSERT_EQ(out_lay.size.spatial[0], expected_result[0][0][0].size());
+    ASSERT_EQ(out_lay.size.spatial[2], expected_result[0][0].size());
+    ASSERT_EQ(out_lay.size.spatial[1], expected_result[0][0][0].size());
+    ASSERT_EQ(out_lay.size.spatial[0], expected_result[0][0][0][0].size());
 
     for (int bi = 0; bi < batch_num; ++bi)
         for (int ofi = 0; ofi < output_f; ++ofi)
-            for (int yi = 0; yi < (int)expected_result[0][0].size(); ++yi)
-                for (int xi = 0; xi < (int)expected_result[0][0][0].size(); ++xi) {
-                    tensor coords = tensor(batch(bi), feature(ofi), spatial(xi, yi, 0, 0));
-                    auto offset = out_lay.get_linear_offset(coords);
-                    auto val = out_ptr[offset];
-                    auto val_ref = expected_result[bi][ofi][yi][xi];
-                    auto equal = are_equal(val_ref, val, 1e-2f);
-                    if (!equal) {
-                        std::cout << "Value at batch: " << bi << ", output_f: " << ofi << ", y: " << yi << ", x: " << xi << " = " << val << std::endl;
-                        std::cout << "Reference value at batch: " << bi << ", output_f: " << ofi << ", y: " << yi << ", x: " << xi << " = " << val_ref << std::endl;
+            for (int zi = 0; zi < (int)expected_result[0][0].size(); ++zi)
+                for (int yi = 0; yi < (int)expected_result[0][0][0].size(); ++yi)
+                    for (int xi = 0; xi < (int)expected_result[0][0][0][0].size(); ++xi) {
+                        tensor coords = tensor(batch(bi), feature(ofi), spatial(xi, yi, zi, 0));
+                        auto offset = out_lay.get_linear_offset(coords);
+                        auto val = out_ptr[offset];
+                        auto val_ref = expected_result[bi][ofi][zi][yi][xi];
+                        auto equal = are_equal(val_ref, val, 1e-2f);
+                        if (!equal) {
+                            std::cout << "Value at batch: " << bi << ", output_f: " << ofi << ", z: " << zi << ", y: " << yi << ", x: " << xi << " = " << val << std::endl;
+                            std::cout << "Reference value at batch: " << bi << ", output_f: " << ofi << ", z: " << zi << ", y: " << yi << ", x: " << xi << " = " << val_ref << std::endl;
+                        }
+                        EXPECT_TRUE(equal);
                     }
-                    EXPECT_TRUE(equal);
-                }
 }
 
 template <typename InputT, typename WeightsT, typename OutputT>
index 787197c..2ac9813 100644 (file)
@@ -86,7 +86,7 @@ struct pooling_accumulator<InputT, pooling_mode::max> {
         _acc = max(_acc, val);
     }
 
-    output_t get(size_t /*pool_x*/, size_t /*pool_y*/) {
+    output_t get(size_t /*pool_x*/, size_t /*pool_y*/, size_t /*pool_z*/) {
         return static_cast<output_t>(_acc);
     }
 
@@ -105,7 +105,7 @@ struct pooling_accumulator<InputT, pooling_mode::average_no_padding> {
         _acc += static_cast<output_t>(val);
     }
 
-    output_t get(size_t /*pool_x*/, size_t /*pool_y*/) {
+    output_t get(size_t /*pool_x*/, size_t /*pool_y*/, size_t /*pool_z*/) {
         return _acc / _cnt;
     }
 
@@ -128,8 +128,8 @@ struct pooling_accumulator<InputT, pooling_mode::average> {
         _acc += static_cast<output_t>(val);
     }
 
-    output_t get(size_t pool_x, size_t pool_y) {
-        return static_cast<output_t>(_acc / static_cast<InputT>(pool_x * pool_y));
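+    // Plain average divides by the full window volume (padding included), unlike average_no_padding.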
+    output_t get(size_t pool_x, size_t pool_y, size_t pool_z) {
+        return static_cast<output_t>(_acc / static_cast<InputT>(pool_x * pool_y * pool_z));
     }
 
     void reset() {
@@ -140,46 +140,59 @@ struct pooling_accumulator<InputT, pooling_mode::average> {
 };
 
 template <typename InputT, pooling_mode Mode>
-VVF<typename pooling_mode_output<InputT, Mode>::type> reference_pooling(const VVF<InputT>& input, size_t pool_x, size_t pool_y, int stride_x, int stride_y, int offset_x, int offset_y) {
+VVVF<typename pooling_mode_output<InputT, Mode>::type> reference_pooling(const VVVF<InputT>& input, size_t pool_x, size_t pool_y, size_t pool_z, int stride_x, int stride_y, int stride_z, int offset_x, int offset_y, int offset_z) {
     using output_t = typename pooling_mode_output<InputT, Mode>::type;
-    VVF<output_t> result;
-    auto size_x = input[0].size();
-    auto size_y = input.size();
+    VVVF<output_t> result;
+    auto size_x = input[0][0].size();
+    auto size_y = input[0].size();
+    auto size_z = input.size();
 
     auto accumulator = pooling_accumulator<InputT, Mode>();
 
-    for (int yi = offset_y; yi + static_cast<int>(pool_y) <= static_cast<int>(size_y) - offset_y; yi += stride_y) {
-        VF<output_t> result_row;
-        for (int xi = offset_x; xi + static_cast<int>(pool_x) <= static_cast<int>(size_x) - offset_x; xi += stride_x) {
-            accumulator.reset();
-            for (int fyi = 0; fyi < static_cast<int>(pool_y); ++fyi) {
-                int index_y = yi + fyi;
-                if (index_y < 0 || index_y >= static_cast<int>(size_y))
-                    continue;
-                for (int fxi = 0; fxi < static_cast<int>(pool_x); ++fxi) {
-                    int index_x = xi + fxi;
-                    if (index_x < 0 || index_x >= static_cast<int>(size_x))
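+    // A pool window starting at zi/yi/xi must end within the input shrunk by the offset on both sides of each axis.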
+    for (int zi = offset_z; zi + static_cast<int>(pool_z) <= static_cast<int>(size_z) - offset_z; zi += stride_z) {
+        VVF<output_t> result_matrix;
+        for (int yi = offset_y; yi + static_cast<int>(pool_y) <= static_cast<int>(size_y) - offset_y; yi += stride_y) {
+            VF<output_t> result_row;
+            for (int xi = offset_x; xi + static_cast<int>(pool_x) <= static_cast<int>(size_x) - offset_x; xi += stride_x) {
+                accumulator.reset();
+                for (int fzi = 0; fzi < static_cast<int>(pool_z); ++fzi) {
+                    int index_z = zi + fzi;
+                    if (index_z < 0 || index_z >= static_cast<int>(size_z))
                         continue;
-
-                    auto input_val = input[static_cast<size_t>(index_y)][static_cast<size_t>(index_x)];
-                    accumulator.accumulate(input_val);
+                    for (int fyi = 0; fyi < static_cast<int>(pool_y); ++fyi) {
+                        int index_y = yi + fyi;
+                        if (index_y < 0 || index_y >= static_cast<int>(size_y))
+                            continue;
+                        for (int fxi = 0; fxi < static_cast<int>(pool_x); ++fxi) {
+                            int index_x = xi + fxi;
+                            if (index_x < 0 || index_x >= static_cast<int>(size_x))
+                                continue;
+
+                            auto input_val = input[static_cast<size_t>(index_z)][static_cast<size_t>(index_y)][static_cast<size_t>(index_x)];
+                            accumulator.accumulate(input_val);
+                        }
+                    }
                 }
+                result_row.push_back(accumulator.get(pool_x, pool_y, pool_z));
             }
-            result_row.push_back(accumulator.get(pool_x, pool_y));
+            result_matrix.emplace_back(std::move(result_row));
         }
-        result.emplace_back(std::move(result_row));
+        result.emplace_back(std::move(result_matrix));
     }
     return result;
 }
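// A minimal usage sketch (assuming the VF/VVF/VVVF std::vector aliases used
// throughout these tests and the pooling_mode::max accumulator): a 2x2x2 max
// pool over a 4x4x4 input with stride 2 and zero offset yields a 2x2x2 output.
//
//     VVVF<float> in(4, VVF<float>(4, VF<float>(4, 1.f)));
//     auto out = reference_pooling<float, pooling_mode::max>(
//         in, /*pool*/ 2, 2, 2, /*stride*/ 2, 2, 2, /*offset*/ 0, 0, 0);
//     // out.size() == 2 && out[0].size() == 2 && out[0][0].size() == 2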
 
 template <typename T>
-VVF<T> reference_scale_post_op(const VVF<T>& input, const T& scale, const T& shift) {
+VVVF<T> reference_scale_post_op(const VVVF<T>& input, const T& scale, const T& shift) {
     auto output = input;
-    auto size_y = input.size();
-    auto size_x = input[0].size();
-    for (size_t yi = 0; yi < size_y; ++yi) {
-        for (size_t xi = 0; xi < size_x; ++xi) {
-            output[yi][xi] = output[yi][xi] * scale + shift;
+    auto size_z = input.size();
+    auto size_y = input[0].size();
+    auto size_x = input[0][0].size();
+    for (size_t zi = 0; zi < size_z; ++zi) {
+        for (size_t yi = 0; yi < size_y; ++yi) {
+            for (size_t xi = 0; xi < size_x; ++xi) {
+                output[zi][yi][xi] = output[zi][yi][xi] * scale + shift;
+            }
         }
     }
     return output;
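// For reference (hypothetical values): the post-op is applied elementwise, so
// with scale = 2 and shift = 1, an input value of 3 becomes 3 * 2 + 1 = 7 at
// every [z][y][x] position.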
@@ -2355,7 +2368,7 @@ public:
     using output_t = typename pooling_mode_output<InputT, Mode>::type;
 
     virtual topology build_topology(const engine& /*eng*/) {
-        auto input_size = tensor(batch(batch_num()), feature(input_features()), spatial(input_x(), input_y()));
+        auto input_size = tensor(batch(batch_num()), feature(input_features()), spatial(input_x(), input_y(), input_z()));
         auto input_lay = layout(input_type(),
                                 input_format(),
                                 input_size);
@@ -2365,9 +2378,9 @@ public:
             pooling("pool",
                     "input",
                     pool_mode(),
-                    tensor(batch(0), feature(0), spatial(pool_x(), pool_y())),
-                    tensor(batch(0), feature(0), spatial(stride_x(), stride_y())),
-                    tensor(batch(0), feature(0), spatial(offset_x(), offset_y())))
+                    tensor(batch(0), feature(0), spatial(pool_x(), pool_y(), pool_z())),
+                    tensor(batch(0), feature(0), spatial(stride_x(), stride_y(), stride_z())),
+                    tensor(batch(0), feature(0), spatial(offset_x(), offset_y(), offset_z())))
         );
         return topo;
     }
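// A hedged sketch (hypothetical values) of the 3d primitive this now builds:
// a 2x2x2 window with stride 2 and zero offset would be expressed as
//
//     pooling("pool", "input", pooling_mode::max,
//             tensor(batch(0), feature(0), spatial(2, 2, 2)),   // window x, y, z
//             tensor(batch(0), feature(0), spatial(2, 2, 2)),   // stride x, y, z
//             tensor(batch(0), feature(0), spatial(0, 0, 0)));  // offset x, y, z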
@@ -2376,7 +2389,7 @@ public:
         return "pool";
     }
 
-    virtual void run_expect(const VVVVF<output_t>& expected) {
+    virtual void run_expect(const VVVVVF<output_t>& expected) {
         auto eng = get_test_engine();
         auto topo = build_topology(eng);
         auto opts = build_options(
@@ -2384,7 +2398,7 @@ public:
         );
         auto net = network(eng, topo, opts);
 
-        auto input_size = tensor(batch(batch_num()), feature(input_features()), spatial(input_x(), input_y()));
+        auto input_size = tensor(batch(batch_num()), feature(input_features()), spatial(input_x(), input_y(), input_z()));
         auto input_lay = layout(input_type(),
                                 input_format(),
                                 input_size);
@@ -2392,12 +2406,13 @@ public:
         std::vector<InputT> input_flat(input_lay.get_linear_size(), static_cast<InputT>(0));
         for (size_t bi = 0; bi < batch_num(); ++bi)
             for (size_t fi = 0; fi < input_features(); ++fi)
-                for (size_t yi = 0; yi < input_y(); ++yi)
-                    for (size_t xi = 0; xi < input_x(); ++xi) {
-                        tensor coords = tensor(batch(bi), feature(fi), spatial(xi, yi, 0, 0));
-                        size_t offset = input_lay.get_linear_offset(coords);
-                        input_flat[offset] = _input[bi][fi][yi][xi];
-                    }
+                for (size_t zi = 0; zi < input_z(); ++zi)
+                    for (size_t yi = 0; yi < input_y(); ++yi)
+                        for (size_t xi = 0; xi < input_x(); ++xi) {
+                            tensor coords = tensor(batch(bi), feature(fi), spatial(xi, yi, zi, 0));
+                            size_t offset = input_lay.get_linear_offset(coords);
+                            input_flat[offset] = _input[bi][fi][zi][yi][xi];
+                        }
         set_values(input_mem, input_flat);
 
         net.set_input_data("input", input_mem);
@@ -2418,35 +2433,37 @@ public:
         ASSERT_EQ(out_lay.data_type, output_type());
         ASSERT_EQ(out_lay.size.batch[0], expected.size());
         ASSERT_EQ(out_lay.size.feature[0], expected[0].size());
-        ASSERT_EQ(out_lay.size.spatial[1], expected[0][0].size());
-        ASSERT_EQ(out_lay.size.spatial[0], expected[0][0][0].size());
+        ASSERT_EQ(out_lay.size.spatial[2], expected[0][0].size());
+        ASSERT_EQ(out_lay.size.spatial[1], expected[0][0][0].size());
+        ASSERT_EQ(out_lay.size.spatial[0], expected[0][0][0][0].size());
 
         bool compare_with_tolerance = input_type() == data_types::f16;
 
         for (size_t bi = 0; bi < batch_num(); ++bi)
             for (size_t fi = 0; fi < expected[0].size(); ++fi)
-                for (size_t yi = 0; yi < expected[0][0].size(); ++yi)
-                    for (size_t xi = 0; xi < expected[0][0][0].size(); ++xi) {
-                        tensor coords = tensor(batch(bi), feature(fi), spatial(xi, yi, 0, 0));
-                        size_t offset = out_lay.get_linear_offset(coords);
-                        auto ref_val = static_cast<float>(expected[bi][fi][yi][xi]);
-                        auto actual_val = static_cast<float>(out_ptr[offset]);
-                        if (compare_with_tolerance) {
-                            auto tolerance = 1;
-                            ASSERT_NEAR(ref_val, actual_val, tolerance)
-                                << "at b= " << bi << ", f= " << fi << ", y= " << yi << ", x= " << xi;
-                        } else {
-                            EXPECT_TRUE(are_equal(ref_val, actual_val))
-                                << "at b= " << bi << ", f= " << fi << ", y= " << yi << ", x= " << xi;
+                for (size_t zi = 0; zi < expected[0][0].size(); ++zi)
+                    for (size_t yi = 0; yi < expected[0][0][0].size(); ++yi)
+                        for (size_t xi = 0; xi < expected[0][0][0][0].size(); ++xi) {
+                            tensor coords = tensor(batch(bi), feature(fi), spatial(xi, yi, zi, 0));
+                            size_t offset = out_lay.get_linear_offset(coords);
+                            auto ref_val = static_cast<float>(expected[bi][fi][zi][yi][xi]);
+                            auto actual_val = static_cast<float>(out_ptr[offset]);
+                            if (compare_with_tolerance) {
+                                auto tolerance = 1;
+                                ASSERT_NEAR(ref_val, actual_val, tolerance)
+                                    << "at b= " << bi << ", f= " << fi << ", z= " << zi << ", y= " << yi << ", x= " << xi;
+                            } else {
+                                EXPECT_TRUE(are_equal(ref_val, actual_val))
+                                    << "at b= " << bi << ", f= " << fi << ", z= " << zi << ", y= " << yi << ", x= " << xi;
+                            }
                         }
-                    }
-
     }
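// Index convention assumed above: tensor spatial coordinates are ordered
// (x, y, z, w), so size.spatial[0] is x, size.spatial[1] is y and
// size.spatial[2] is z, while the nested test vectors are indexed
// [b][f][z][y][x]. A hedged example of one flatten step:
//
//     tensor coords = tensor(batch(0), feature(3), spatial(1, 2, 4, 0));  // x=1, y=2, z=4
//     size_t offset = input_lay.get_linear_offset(coords);  // layout-dependent linear index
//     input_flat[offset] = _input[0][3][4][2][1];           // note the reversed z, y, x order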
 
     size_t batch_num() { return _input.size(); }
     size_t input_features() { return _input[0].size(); }
-    size_t input_x() { return _input[0][0][0].size(); }
-    size_t input_y() { return _input[0][0].size(); }
+    size_t input_x() { return _input[0][0][0][0].size(); }
+    size_t input_y() { return _input[0][0][0].size(); }
+    size_t input_z() { return _input[0][0].size(); }
 
     format::type input_format() { return _input_fmt; }
     data_types input_type() {
@@ -2460,46 +2477,52 @@ public:
     pooling_mode pool_mode() { return Mode; }
     size_t pool_x() { return _pool_x; }
     size_t pool_y() { return _pool_y; }
+    size_t pool_z() { return _pool_z; }
     int stride_x() { return _stride_x; }
     int stride_y() { return _stride_y; }
+    int stride_z() { return _stride_z; }
     int offset_x() { return _offset_x; }
     int offset_y() { return _offset_y; }
+    int offset_z() { return _offset_z; }
 
-    void set_input(format::type input_fmt, VVVVF<InputT> input_data) {
+    void set_input(format::type input_fmt, VVVVVF<InputT> input_data) {
         _input_fmt = input_fmt;
         _input = std::move(input_data);
     }
 
-    void set_pool_size(size_t x, size_t y) {
+    void set_pool_size(size_t x, size_t y, size_t z) {
         _pool_x = x;
         _pool_y = y;
+        _pool_z = z;
     }
 
-    void set_strides(int x, int y) {
+    void set_strides(int x, int y, int z) {
         _stride_x = x;
         _stride_y = y;
+        _stride_z = z;
     }
 
-    void set_offsets(int x, int y) {
+    void set_offsets(int x, int y, int z) {
         _offset_x = x;
         _offset_y = y;
+        _offset_z = z;
     }
 
-    VVVVF<InputT> _input;
+    VVVVVF<InputT> _input;
     format::type _input_fmt;
-    size_t _pool_x, _pool_y;
-    int _stride_x, _stride_y;
-    int _offset_x, _offset_y;
+    size_t _pool_x, _pool_y, _pool_z;
+    int _stride_x, _stride_y, _stride_z;
+    int _offset_x, _offset_y, _offset_z;
 };
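// (Indexing convention, as used by run_expect above: VVVVVF<T> nests one
// std::vector per dimension, ordered [batch][feature][z][y][x].)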
 
 using pooling_random_test_params = std::tuple<
-    size_t,                      // batch
-    size_t,                      // features
-    std::tuple<size_t, size_t>,  // input x, y
-    std::tuple<size_t, size_t>,  // pool x, y
-    std::tuple<int, int>,        // stride x, y
-    std::tuple<int, int>,        // offset x, y
-    format::type                 // input format
+    size_t,                             // batch
+    size_t,                             // features
+    std::tuple<size_t, size_t, size_t>, // input x, y, z
+    std::tuple<size_t, size_t, size_t>, // pool x, y, z
+    std::tuple<int, int, int>,          // stride x, y, z
+    std::tuple<int, int, int>,          // offset x, y, z
+    format::type                        // input format
 >;
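// For example (hypothetical values), a single 3d test point spelled out in
// the same style the instantiations below use:
//
//     auto params = pooling_random_test_params(
//         2, 8,                                            // batch, features
//         std::tuple<size_t, size_t, size_t>(12, 12, 12),  // input x, y, z
//         std::tuple<size_t, size_t, size_t>(2, 2, 2),     // pool x, y, z
//         std::tuple<int, int, int>(2, 2, 2),              // stride x, y, z
//         std::tuple<int, int, int>(0, 0, 0),              // offset x, y, z
//         format::b_fs_zyx_fsv16);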
 
 template <typename InputT, pooling_mode Mode>
@@ -2508,44 +2531,47 @@ public:
     using parent = pooling_test_base<InputT, Mode>;
     using output_t = typename parent::output_t;
 
-    virtual VVVVF<output_t> calculate_reference() {
-        VVVVF<output_t> reference(this->batch_num(), VVVF<output_t>(this->input_features()));
+    virtual VVVVVF<output_t> calculate_reference() {
+        VVVVVF<output_t> reference(this->batch_num(), VVVVF<output_t>(this->input_features()));
         for (size_t bi = 0; bi < this->batch_num(); ++bi) {
             for (size_t fi = 0; fi < this->input_features(); ++fi) {
                 reference[bi][fi] = reference_pooling<InputT, Mode>(
                     this->_input[bi][fi],
                     this->pool_x(),
                     this->pool_y(),
+                    this->pool_z(),
                     this->stride_x(),
                     this->stride_y(),
+                    this->stride_z(),
                     this->offset_x(),
-                    this->offset_y());
+                    this->offset_y(),
+                    this->offset_z());
             }
         }
         return reference;
     }
 
     virtual void param_set_up(const pooling_random_test_params& params) {
-        size_t b, f, in_x, in_y, p_x, p_y;
-        int s_x, s_y, o_x, o_y;
+        size_t b, f, in_x, in_y, in_z, p_x, p_y, p_z;
+        int s_x, s_y, s_z, o_x, o_y, o_z;
         format::type in_fmt;
 
         std::forward_as_tuple(
             b,
             f,
-            std::forward_as_tuple(in_x, in_y),
-            std::forward_as_tuple(p_x, p_y),
-            std::forward_as_tuple(s_x, s_y),
-            std::forward_as_tuple(o_x, o_y),
+            std::forward_as_tuple(in_x, in_y, in_z),
+            std::forward_as_tuple(p_x, p_y, p_z),
+            std::forward_as_tuple(s_x, s_y, s_z),
+            std::forward_as_tuple(o_x, o_y, o_z),
             in_fmt
         ) = params;
 
-        auto input_data = generate_random_4d<InputT>(b, f, in_y, in_x, -256, 256);
+        auto input_data = generate_random_5d<InputT>(b, f, in_z, in_y, in_x, -256, 256);
 
         this->set_input(in_fmt, std::move(input_data));
-        this->set_pool_size(p_x, p_y);
-        this->set_strides(s_x, s_y);
-        this->set_offsets(o_x, o_y);
+        this->set_pool_size(p_x, p_y, p_z);
+        this->set_strides(s_x, s_y, s_z);
+        this->set_offsets(o_x, o_y, o_z);
     }
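// The nested std::forward_as_tuple above assigns through a temporary tuple of
// lvalue references (std::tie cannot reach into nested tuples); a minimal
// standalone sketch:
//
//     int a = 0, b = 0;
//     std::forward_as_tuple(a, std::forward_as_tuple(b)) =
//         std::make_tuple(1, std::make_tuple(2));
//     // now a == 1 and b == 2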
 
     void run_random(const pooling_random_test_params& params) {
@@ -2583,14 +2609,14 @@ TEST_P(pooling_random_test, avg_u8) {
 }
 
 INSTANTIATE_TEST_CASE_P(
-    smoke_low_precision,
+    smoke_low_precision_2d_spatial,
     pooling_random_test,
     testing::Combine(testing::Values(1, 2),
                      testing::Values(3, 8, 64),
-                     testing::Values(std::tuple<size_t, size_t>(12, 12), std::tuple<size_t, size_t>(24, 24)),
-                     testing::Values(std::tuple<size_t, size_t>(4, 4), std::tuple<size_t, size_t>(2, 2)),
-                     testing::Values(std::tuple<int, int>(2, 2)),
-                     testing::Values(std::tuple<int, int>(0, 0)),
+                     testing::Values(std::tuple<size_t, size_t, size_t>(12, 12, 1), std::tuple<size_t, size_t, size_t>(24, 24, 1)),
+                     testing::Values(std::tuple<size_t, size_t, size_t>(4, 4, 1), std::tuple<size_t, size_t, size_t>(2, 2, 1)),
+                     testing::Values(std::tuple<int, int, int>(2, 2, 1)),
+                     testing::Values(std::tuple<int, int, int>(0, 0, 0)),
                      testing::Values(format::yxfb,
                                      format::bfyx,
                                      format::byxf_af32,
@@ -2600,15 +2626,28 @@ INSTANTIATE_TEST_CASE_P(
                     testing::internal::DefaultParamName<pooling_random_test_params>);
 
 INSTANTIATE_TEST_CASE_P(
+    smoke_low_precision_3d_spatial,
+    pooling_random_test,
+    testing::Combine(testing::Values(1, 2),
+                     testing::Values(3, 8, 64),
+                     testing::Values(std::tuple<size_t, size_t, size_t>(12, 12, 12), std::tuple<size_t, size_t, size_t>(24, 24, 24)),
+                     testing::Values(std::tuple<size_t, size_t, size_t>(4, 4, 4), std::tuple<size_t, size_t, size_t>(2, 2, 2)),
+                     testing::Values(std::tuple<int, int, int>(2, 2, 2)),
+                     testing::Values(std::tuple<int, int, int>(0, 0, 0)),
+                     testing::Values(format::bfzyx,
+                                     format::b_fs_zyx_fsv16)),
+                    testing::internal::DefaultParamName<pooling_random_test_params>);
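// For context, a sketch of the b_fs_zyx_fsv16 blocking assumed here (feature
// dimension split into slices of 16, feature remainder innermost); the exact
// offset math lives in the layout code, this is only illustrative:
//
//     // fs = f / 16, fv = f % 16; FS, Z, Y, X are hypothetical dim sizes
//     size_t offset = ((((b * FS + fs) * Z + z) * Y + y) * X + x) * 16 + fv;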
+
+INSTANTIATE_TEST_CASE_P(
     batched_low_precision,
     pooling_random_test,
     testing::Combine(
         testing::Values(16),
         testing::Values(16, 32),
-        testing::Values(std::tuple<size_t, size_t>(3, 3), std::tuple<size_t, size_t>(8, 8)),
-        testing::Values(std::tuple<size_t, size_t>(1, 1), std::tuple<size_t, size_t>(3, 3)),
-        testing::Values(std::tuple<int, int>(1, 1)),
-        testing::Values(std::tuple<int, int>(0, 0)),
+        testing::Values(std::tuple<size_t, size_t, size_t>(3, 3, 1), std::tuple<size_t, size_t, size_t>(8, 8, 1)),
+        testing::Values(std::tuple<size_t, size_t, size_t>(1, 1, 1), std::tuple<size_t, size_t, size_t>(3, 3, 1)),
+        testing::Values(std::tuple<int, int, int>(1, 1, 1)),
+        testing::Values(std::tuple<int, int, int>(0, 0, 0)),
         testing::Values(format::bs_fs_yx_bsv16_fsv16)
     ),
     testing::internal::DefaultParamName<pooling_random_test_params>);
@@ -2622,7 +2661,7 @@ public:
     topology build_topology(const engine& eng) override {
         topology topo = parent::build_topology(eng);
 
-        auto scale_lay = layout(this->output_type(), format::bfyx, tensor(batch(1), feature(this->input_features()), spatial(1, 1)));
+        auto scale_lay = layout(this->output_type(), format::bfyx, tensor(batch(1), feature(this->input_features()), spatial(1, 1, 1, 1)));
         auto scale_mem = memory::allocate(eng, scale_lay);
         auto shift_mem = memory::allocate(eng, scale_lay);
         set_values(scale_mem, _scale);
@@ -2640,7 +2679,7 @@ public:
         return "scale_wa_out";
     }
 
-    VVVVF<output_t> calculate_reference() override {
+    VVVVVF<output_t> calculate_reference() override {
         auto expected = parent::calculate_reference();
 
         for (size_t bi = 0; bi < this->batch_num(); ++bi)
@@ -2688,10 +2727,10 @@ INSTANTIATE_TEST_CASE_P(
     pooling_random_test_fp16_fp32,
     testing::Combine(testing::Values(1, 2),
                      testing::Values(3, 8),
-                     testing::Values(std::tuple<size_t, size_t>(12, 12), std::tuple<size_t, size_t>(24, 24)),
-                     testing::Values(std::tuple<size_t, size_t>(4, 4), std::tuple<size_t, size_t>(2, 2)),
-                     testing::Values(std::tuple<int, int>(2, 2)),
-                     testing::Values(std::tuple<int, int>(0, 0)),
+                     testing::Values(std::tuple<size_t, size_t, size_t>(12, 12, 1), std::tuple<size_t, size_t, size_t>(24, 24, 1)),
+                     testing::Values(std::tuple<size_t, size_t, size_t>(4, 4, 1), std::tuple<size_t, size_t, size_t>(2, 2, 1)),
+                     testing::Values(std::tuple<int, int, int>(2, 2, 1)),
+                     testing::Values(std::tuple<int, int, int>(0, 0, 0)),
                      testing::Values(format::yxfb,
                                      format::bfyx,
                                      format::byxf,