[IE CLDNN] First conv 3d imad (#1935)
authorVladimir Paramuzov <vladimir.paramuzov@intel.com>
Tue, 1 Sep 2020 16:57:13 +0000 (19:57 +0300)
committerGitHub <noreply@github.com>
Tue, 1 Sep 2020 16:57:13 +0000 (19:57 +0300)
14 files changed:
inference-engine/thirdparty/clDNN/api/tensor.hpp
inference-engine/thirdparty/clDNN/kernel_selector/common/tensor_type.cpp
inference-engine/thirdparty/clDNN/kernel_selector/common/tensor_type.h
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_bfyx_to_b_fs_yx_fsv32.cpp
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_bfyx_to_b_fs_yx_fsv32.h
inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_bfyx_to_b_fs_yx_fsv32.cl
inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/include/fetch.cl
inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/reorder_weights.cl
inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_selector_common.cpp
inference-engine/thirdparty/clDNN/src/graph_optimizer/remove_redundant_reorders.cpp
inference-engine/thirdparty/clDNN/src/include/to_string_utils.h
inference-engine/thirdparty/clDNN/src/kernel_selector_helper.cpp
inference-engine/thirdparty/clDNN/tests/test_cases/convolution_gpu_test.cpp
inference-engine/thirdparty/clDNN/tests/test_cases/fusings_gpu_test.cpp

index 71c7c3e..ed49673 100644 (file)
@@ -168,6 +168,7 @@ struct format {
         os_is_zyx_osv16_isv16,                        ///< format for weights for IMAD convolutions
         os_is_yx_osv32_isv4_swizzled_by_2,            ///< format for weights for IMAD convolutions
         os_is_yx_osv32_isv4,                          ///< format for weights for IMAD convolutions
+        os_is_zyx_osv32_isv4,                         ///< format for weights for IMAD convolutions
         os_is_yx_osv32_isv32p,                        ///< format for weights for binary convolutions
         lstm_weights_dio,                             ///< dynamic_lstm, direction,
                                                       ///< than IO (I - input size, O - 4 * hidden_size)
@@ -262,6 +263,7 @@ struct format {
                 { os_is_zyx_osv16_isv16,                       { 1, 1, 3, 0, 0, "bfzyx",  "bfxyz",      {{0, 16}, {1, 16}}}},
                 { os_is_yx_osv32_isv4_swizzled_by_2,           { 1, 1, 2, 0, 0, "bfxy",   "bfxy?",      {{0, 32}, {1, 4}}}},
                 { os_is_yx_osv32_isv4,                         { 1, 1, 2, 0, 0, "bfxy",   "bfxy?",      {{0, 32}, {1, 4}}}},
+                { os_is_zyx_osv32_isv4,                        { 1, 1, 3, 0, 0, "bfzyx",  "bfxyz",      {{0, 32}, {1, 4}}}},
                 { os_is_yx_osv32_isv32p,                       { 1, 1, 1, 0, 0, "bfxy",   "bfxy?",      {}}},
                 { os_is_zyx_isv16_osv16,                       { 1, 1, 3, 0, 0, "bfzyx",  "bfxyz",      {{0, 16}, {1, 16}}}},
                 { is_os_zyx_isv16_osv16,                       { 1, 1, 3, 0, 0, "fbzyx",  "bfxyz",      {{1, 16}, {0, 16}}}},
index e8f3b67..3028850 100644 (file)
@@ -96,6 +96,7 @@ WeightsTensor::WeightsChannelArray WeightsTensor::weightsChannelArray {{
     { WeightsLayout::os_is_yx_osv16_isv4,                         {  0,  1, -1,   2,   3, -1, -1, -1 } },
     { WeightsLayout::os_is_yx_osv32_isv4_swizzled_by_2,           {  0,  1, -1,   2,   3, -1, -1, -1 } },
     { WeightsLayout::os_is_yx_osv32_isv4,                         {  0,  1, -1,   2,   3, -1, -1, -1 } },
+    { WeightsLayout::os_is_zyx_osv32_isv4,                        {  0,  1,  2,   3,   4, -1, -1, -1 } },
     { WeightsLayout::oizyx,                                       {  0,  1,  2,   3,   4, -1, -1, -1 } },
     { WeightsLayout::os_is_yx_osv32_isv32p,                       {  0,  1, -1,   2,   3, -1, -1, -1 } },
     { WeightsLayout::os_is_zyx_isv16_osv16,                       {  0,  1,  2,   3,   4, -1, -1, -1 } },
@@ -494,6 +495,11 @@ NDims WeightsTensor::GetSimpleDims(const std::vector<size_t>& d, WeightsLayout l
             newDims[2] = RoundUp(newDims[2], 4);
             newDims[3] = RoundUp(newDims[3], 32);
             break;
+        case os_is_zyx_osv32_isv4:
+            assert(newDims.size() == 5);
+            newDims[3] = RoundUp(newDims[3], 4);
+            newDims[4] = RoundUp(newDims[4], 32);
+            break;
         case os_is_yx_osv32_isv32p:
             assert(newDims.size() == 4);
             newDims[2] = RoundUp(newDims[2], 32);  // ic
index af5c8a5..abfc6a3 100644 (file)
@@ -120,6 +120,7 @@ enum WeightsLayout {
     os_is_yx_osv16_isv4,                 // swizzled weights for convolution using IMAD
     os_is_yx_osv32_isv4_swizzled_by_2,   //  weights for bfyx -> b_fs_yx_fsv32 convolution using IMAD with swizzeled ofm (0, 2, 4..), (1, 3, 5...)
     os_is_yx_osv32_isv4,                 //  weights for bfyx -> b_fs_yx_fsv{32,16} convolution using IMAD
+    os_is_zyx_osv32_isv4,                //  weights for bfzyx -> b_fs_zyx_fsv16 convolution using IMAD
     oizyx,
     os_is_yx_osv32_isv32p,  // 2 blocks: 32 packed binary in channels and 32 output channels
     os_is_osv32_isv32_swizzled_by_4,     // for weights for 1x1 IMAD convolution
index 335ba9d..cf352d5 100644 (file)
@@ -36,9 +36,11 @@ ParamsKey ConvolutionKernel_mmad_bfyx_to_b_fs_yx_fsv32::GetSupportedKey() const
     k.EnableInputWeightsType(WeightsType::INT8);
 
     k.EnableInputLayout(DataLayout::bfyx);
+    k.EnableInputLayout(DataLayout::bfzyx);
     k.EnableInputLayout(DataLayout::b_fs_yx_fsv4);
     k.EnableOutputLayout(DataLayout::b_fs_yx_fsv32);
     k.EnableOutputLayout(DataLayout::b_fs_yx_fsv16);
+    k.EnableOutputLayout(DataLayout::b_fs_zyx_fsv16);
     k.EnableTensorOffset();
     k.EnableTensorPitches();
     k.EnableDilation();
@@ -64,6 +66,9 @@ bool ConvolutionKernel_mmad_bfyx_to_b_fs_yx_fsv32::Validate(const Params &p, con
 
     auto params = dynamic_cast<const convolution_params&>(p);
 
+    if (params.inputs[0].Dimentions() != params.output.Dimentions())
+        return false;
+
     if (params.inputs[0].Feature().v != 3 && params.inputs[0].Feature().v != 4)
         return false;
 
@@ -128,7 +133,7 @@ ConvolutionKernel_mmad_bfyx_to_b_fs_yx_fsv32::AutoTuneOption ConvolutionKernel_m
 
 static size_t get_slm_byte_size(const convolution_params &cp, size_t lws, size_t block_size) {
     return (cp.stride.x * (lws * block_size - 1) + (cp.weights.X().v - 1) * cp.dilation.x + 1)*
-            cp.weights.Y().v * sizeof(int32_t);
+            cp.weights.Y().v * cp.weights.Z().v * sizeof(int32_t);
 }
 
 static size_t get_lws(const convolution_params &cp, size_t blocks_count, size_t block_size, size_t max_lws) {
@@ -157,7 +162,7 @@ ConvolutionKernelBase::DispatchData ConvolutionKernel_mmad_bfyx_to_b_fs_yx_fsv32
     const size_t max_lws = std::max((size_t)1, cp.engineInfo.maxWorkGroupSize / sub_group_size);
     runInfo.gws0 = Align(cp.output.Feature().v, 32) / 2;
     runInfo.gws1 = CeilDiv(cp.output.X().v, runInfo.cldnnStyle.blockWidth);
-    runInfo.gws2 = cp.output.Batch().v * cp.output.Y().v;
+    runInfo.gws2 = cp.output.Batch().v * cp.output.Y().v * cp.output.Z().v;
 
     runInfo.lws0 = sub_group_size;
     runInfo.lws1 = get_lws(cp, runInfo.gws1, tuneOptions.blockWidth, max_lws);
@@ -184,8 +189,7 @@ JitConstants ConvolutionKernel_mmad_bfyx_to_b_fs_yx_fsv32::GetJitConstants(const
     size_t slm_tail = slm_line_size % runInfo.lws1;
     size_t slm_line_aligned = slm_chunk_size*runInfo.lws1 + Align(slm_tail, sub_group_size);
 
-    size_t input_line_size = std::min(params.stride.x * (blockWidth - 1) + (params.weights.X().v - 1) * params.dilation.x + 1,
-                                      input.X().v + input.X().pad.Total());
+    size_t input_line_size = params.stride.x * (blockWidth - 1) + (params.weights.X().v - 1) * params.dilation.x + 1;
 
     jit.AddConstant(MakeJitConstant("INPUT_LINE_SIZE", input_line_size));
     jit.AddConstant(MakeJitConstant("OUTPUT_X_BLOCK_SIZE", blockWidth));
@@ -199,14 +203,20 @@ JitConstants ConvolutionKernel_mmad_bfyx_to_b_fs_yx_fsv32::GetJitConstants(const
 
     if (!params.fused_ops.empty()) {
         auto input_dt = GetActivationType(params);
-        if (GetPreferredWeightsLayout(params) == WeightsLayout::os_is_yx_osv32_isv4) {
-            FusedOpsConfiguration conf0 = {"_0", {"b", "(fg*32 + lid)", "y", "(x+i)"}, "res0", input_dt, 1};
-            FusedOpsConfiguration conf1 = {"_1", {"b", "(fg*32 + lid+16)", "y", "(x+i)"}, "res1", input_dt, 1};
+        if (WeightsTensor::ChannelsCount(GetPreferredWeightsLayout(params)) == 5) {
+            FusedOpsConfiguration conf0 = {"_0", {"b", "(fg*32 + lid)", "z", "y", "(x+i)"}, "res0", input_dt, 1};
+            FusedOpsConfiguration conf1 = {"_1", {"b", "(fg*32 + lid+16)", "z", "y", "(x+i)"}, "res1", input_dt, 1};
             jit.Merge(MakeFusedOpsJitConstants(params, {conf0, conf1}));
         } else {
-            FusedOpsConfiguration conf0 = {"_0", {"b", "(fg*32 + 2*lid + 0)", "y", "(x+i)"}, "res0", input_dt, 1};
-            FusedOpsConfiguration conf1 = {"_1", {"b", "(fg*32 + 2*lid + 1)", "y", "(x+i)"}, "res1", input_dt, 1};
-            jit.Merge(MakeFusedOpsJitConstants(params, {conf0, conf1}));
+            if (GetPreferredWeightsLayout(params) == WeightsLayout::os_is_yx_osv32_isv4) {
+                FusedOpsConfiguration conf0 = {"_0", {"b", "(fg*32 + lid)", "y", "(x+i)"}, "res0", input_dt, 1};
+                FusedOpsConfiguration conf1 = {"_1", {"b", "(fg*32 + lid+16)", "y", "(x+i)"}, "res1", input_dt, 1};
+                jit.Merge(MakeFusedOpsJitConstants(params, {conf0, conf1}));
+            } else {
+                FusedOpsConfiguration conf0 = {"_0", {"b", "(fg*32 + 2*lid + 0)", "y", "(x+i)"}, "res0", input_dt, 1};
+                FusedOpsConfiguration conf1 = {"_1", {"b", "(fg*32 + 2*lid + 1)", "y", "(x+i)"}, "res1", input_dt, 1};
+                jit.Merge(MakeFusedOpsJitConstants(params, {conf0, conf1}));
+            }
         }
     }
 
index 74dd382..7ed3da4 100644 (file)
@@ -37,8 +37,12 @@ protected:
     DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override;
     WeightsLayout GetPreferredWeightsLayout(const convolution_params &p) const override {
         if (p.output.GetDType() == Datatype::F16 || p.output.GetDType() == Datatype::F32 ||
-            p.output.GetLayout() == DataLayout::b_fs_yx_fsv16) {
-            return WeightsLayout::os_is_yx_osv32_isv4;
+            p.output.GetLayout() == DataLayout::b_fs_yx_fsv16 || p.output.GetLayout() == DataLayout::b_fs_zyx_fsv16) {
+            if (p.output.Dimentions() == 5) {
+                return WeightsLayout::os_is_zyx_osv32_isv4;
+            } else {
+                return WeightsLayout::os_is_yx_osv32_isv4;
+            }
         } else {
             return WeightsLayout::os_is_yx_osv32_isv4_swizzled_by_2;
         }
index a26ca07..45dd014 100644 (file)
@@ -95,8 +95,16 @@ KERNEL(convolution_mmad_bfyx_to_b_fs_yx_fsv32)(
 {
     const int fg = get_group_id(0);
     const int x = (int)get_global_id(1) * OUTPUT_X_BLOCK_SIZE;
+
+#if OUTPUT_DIMS == 4
     const int b = (int)get_global_id(2) / OUTPUT_SIZE_Y;
+    const int z = 0;
+    const int y = (int)get_global_id(2) % OUTPUT_SIZE_Y;
+#elif OUTPUT_DIMS == 5
+    const int b = (int)get_global_id(2) / OUTPUT_SIZE_Y / OUTPUT_SIZE_Z;
+    const int z = (int)get_global_id(2) / OUTPUT_SIZE_Y % OUTPUT_SIZE_Z;
     const int y = (int)get_global_id(2) % OUTPUT_SIZE_Y;
+#endif // OUTPUT_DIMS == 4
 
     const int lid = get_sub_group_local_id();
     const int group_id = get_group_id(1);
@@ -104,14 +112,15 @@ KERNEL(convolution_mmad_bfyx_to_b_fs_yx_fsv32)(
 
     const int x_wg_start = (group_id * GROUP_SIZE) * STRIDE_SIZE_X - PADDING_SIZE_X;
     const int input_y = y * STRIDE_SIZE_Y - PADDING_SIZE_Y;
+    const int input_z = z * STRIDE_SIZE_Z - PADDING_SIZE_Z;
 
     ACCUMULATOR_TYPE_VEC acc[2] = { 0 }; // 2*16 packed channels * OUTPUT_X_BLOCK_SIZE
 #if ASYMMETRIC_WEIGHTS_QUANTIZATION
     ACCUMULATOR_TYPE_VEC acc_assym_weights = 0;
 #endif
 
-#if INPUT0_LAYOUT_BFYX
-    const int input_offset = b*INPUT0_BATCH_PITCH + INPUT0_OFFSET + input_y * INPUT0_Y_PITCH;
+#if INPUT0_LAYOUT_BFYX || INPUT0_LAYOUT_BFZYX
+    const int input_offset = b*INPUT0_BATCH_PITCH + INPUT0_OFFSET + input_y * INPUT0_Y_PITCH + input_z * INPUT0_Z_PITCH;
 #elif INPUT0_LAYOUT_B_FS_YX_FSV4
     const int fsv = 4;
     const int input_x_pitch = fsv;
@@ -122,7 +131,7 @@ KERNEL(convolution_mmad_bfyx_to_b_fs_yx_fsv32)(
     const int input_offset = b * input_b_pitch + input_y * input_y_pitch;
 #endif
 
-    int filter_idx = fg * FILTER_SIZE_X * FILTER_SIZE_Y * ISV * OSV;
+    int filter_idx = fg * FILTER_SIZE_X * FILTER_SIZE_Y * FILTER_SIZE_Z * ISV * OSV;
 #if ASYMMETRIC_WEIGHTS_QUANTIZATION
     char4 multiplier;
     for (int i = 0; i < INPUT0_FEATURE_NUM; i++)
@@ -136,122 +145,135 @@ KERNEL(convolution_mmad_bfyx_to_b_fs_yx_fsv32)(
 #endif // INPUT0_FEATURE_NUM == 3
 #endif // ASYMMETRIC_DATA_QUANTIZATION
 
-    __local PACKED_IN_TYPE slm[SLM_LINE_SIZE*FILTER_SIZE_Y];
+    __local PACKED_IN_TYPE slm[SLM_LINE_SIZE*FILTER_SIZE_Y*FILTER_SIZE_Z];
 
-    for (int kh = 0; kh < FILTER_SIZE_Y ; ++kh) {
-        __local PACKED_IN_TYPE* slm_block = slm + kh*SLM_LINE_SIZE + sg*SLM_CHUNK_SIZE;
-        bool y_cross_fm = input_y + kh*DILATION_SIZE_Y < 0 || input_y + kh*DILATION_SIZE_Y >= INPUT0_SIZE_Y;
-        if (y_cross_fm) {
+    for (int kd = 0; kd < FILTER_SIZE_Z ; ++kd) {
+        bool z_cross_fm = input_z + kd*DILATION_SIZE_Z < 0 || input_z + kd*DILATION_SIZE_Z >= INPUT0_SIZE_Z;
+        for (int kh = 0; kh < FILTER_SIZE_Y ; ++kh) {
+            __local PACKED_IN_TYPE* slm_block = slm + kh*SLM_LINE_SIZE + kd*SLM_LINE_SIZE*FILTER_SIZE_Y + sg*SLM_CHUNK_SIZE;
+            bool y_cross_fm = input_y + kh*DILATION_SIZE_Y < 0 || input_y + kh*DILATION_SIZE_Y >= INPUT0_SIZE_Y;
+            if (y_cross_fm || z_cross_fm) {
 #if ASYMMETRIC_DATA_QUANTIZATION
-            for (int c = 0; c < SLM_CHUNK_SIZE; c += SUB_GROUP_SIZE) {
-                if (sg*SLM_CHUNK_SIZE + c + lid < SLM_LINE_SIZE)
-                    slm_block[c + lid] = AS_PACKED_IN_TYPE(zp);
-            }
+                for (int c = 0; c < SLM_CHUNK_SIZE; c += SUB_GROUP_SIZE) {
+                    if (sg*SLM_CHUNK_SIZE + c + lid < SLM_LINE_SIZE)
+                        slm_block[c + lid] = AS_PACKED_IN_TYPE(zp);
+                }
 #if SLM_TAIL > 0
-            if (sg == LWS1 - 1) {
-                __local PACKED_IN_TYPE* slm_block_tail = slm + kh*SLM_LINE_SIZE + LWS1*SLM_CHUNK_SIZE;
-                slm_block_tail[lid] = AS_PACKED_IN_TYPE(zp);
-            }
+                if (sg == LWS1 - 1) {
+                    __local PACKED_IN_TYPE* slm_block_tail = slm + kh*SLM_LINE_SIZE + kd*SLM_LINE_SIZE*FILTER_SIZE_Y + LWS1*SLM_CHUNK_SIZE;
+                    slm_block_tail[lid] = AS_PACKED_IN_TYPE(zp);
+                }
 #endif // SLM_TAIL > 0
 #endif // ASYMMETRIC_DATA_QUANTIZATION
-            continue;
-        }
+                continue;
+            }
 
-        {
-            for (int c = 0; c < SLM_CHUNK_SIZE; c += SUB_GROUP_SIZE) {
-                const int x_chunk = x_wg_start + sg*SLM_CHUNK_SIZE + c;
-                bool x_cross_fm = x_chunk + lid < 0 || x_chunk + lid >= INPUT0_SIZE_X;
-
-                if (!x_cross_fm) {
-                #if INPUT0_LAYOUT_BFYX
-                    MAKE_VECTOR_TYPE(INPUT0_TYPE, ISV) src = 0;
-                    __attribute__((opencl_unroll_hint(INPUT0_FEATURE_NUM)))
-                    for (int i = 0; i < INPUT0_FEATURE_NUM; i++) {
-                        src[i] = input[input_offset + i * INPUT0_FEATURE_PITCH
-                                                    + kh * DILATION_SIZE_Y * INPUT0_Y_PITCH
-                                                    + (x_chunk + lid)* INPUT0_X_PITCH];
-                    }
-                    slm_block[c + lid] = AS_PACKED_IN_TYPE(src);
-                #elif INPUT0_LAYOUT_B_FS_YX_FSV4
-                    const __global uint* ptr = input + input_offset + kh * DILATION_SIZE_Y * input_y_pitch + (x_chunk + lid) * input_x_pitch;
-                    PACKED_IN_TYPE src = AS_PACKED_IN_TYPE(ptr[0]);
-                    slm_block[c + lid] = src;
-                #endif
-                } else {
+            {
+                for (int c = 0; c < SLM_CHUNK_SIZE; c += SUB_GROUP_SIZE) {
+                    const int x_chunk = x_wg_start + sg*SLM_CHUNK_SIZE + c;
+                    bool x_cross_fm = x_chunk + lid < 0 || x_chunk + lid >= INPUT0_SIZE_X;
+
+                    if (!x_cross_fm) {
+                    #if INPUT0_LAYOUT_BFYX || INPUT0_LAYOUT_BFZYX
+                        MAKE_VECTOR_TYPE(INPUT0_TYPE, ISV) src = 0;
+                        __attribute__((opencl_unroll_hint(INPUT0_FEATURE_NUM)))
+                        for (int i = 0; i < INPUT0_FEATURE_NUM; i++) {
+                            src[i] = input[input_offset + i * INPUT0_FEATURE_PITCH
+                                                        + kd * DILATION_SIZE_Z * INPUT0_Z_PITCH
+                                                        + kh * DILATION_SIZE_Y * INPUT0_Y_PITCH
+                                                        + (x_chunk + lid)* INPUT0_X_PITCH];
+                        }
+                        slm_block[c + lid] = AS_PACKED_IN_TYPE(src);
+                    #elif INPUT0_LAYOUT_B_FS_YX_FSV4
+                        const __global uint* ptr = input + input_offset + kh * DILATION_SIZE_Y * input_y_pitch + (x_chunk + lid) * input_x_pitch;
+                        PACKED_IN_TYPE src = AS_PACKED_IN_TYPE(ptr[0]);
+                        slm_block[c + lid] = src;
+                    #endif
+                    } else {
 #if ASYMMETRIC_DATA_QUANTIZATION
-                    slm_block[c + lid] = AS_PACKED_IN_TYPE(zp);
+                        slm_block[c + lid] = AS_PACKED_IN_TYPE(zp);
 #else  // ASYMMETRIC_DATA_QUANTIZATION
-                    slm_block[c + lid] = 0;
+                        slm_block[c + lid] = 0;
 #endif  // ASYMMETRIC_DATA_QUANTIZATION
-                }
-            }
+                   }
+               }
 #if SLM_TAIL > 0
-            if (sg == LWS1 - 1) {
-                __local PACKED_IN_TYPE* slm_block_tail = slm + kh*SLM_LINE_SIZE + LWS1*SLM_CHUNK_SIZE;
-                const int x_chunk = x_wg_start + LWS1*SLM_CHUNK_SIZE;
-                bool x_cross_fm = x_chunk + lid >= INPUT0_SIZE_X;
-                if (!x_cross_fm) {
-                #if INPUT0_LAYOUT_BFYX
-                    MAKE_VECTOR_TYPE(INPUT0_TYPE, ISV) src = 0;
-                    __attribute__((opencl_unroll_hint(INPUT0_FEATURE_NUM)))
-                    for (int i = 0; i < INPUT0_FEATURE_NUM; i++) {
-                        src[i] = input[input_offset + i * INPUT0_FEATURE_PITCH
-                                                    + kh * DILATION_SIZE_Y * INPUT0_Y_PITCH
-                                                    + (x_chunk + lid)* INPUT0_X_PITCH];
-                    }
-                    slm_block_tail[lid] = AS_PACKED_IN_TYPE(src);
-                #elif INPUT0_LAYOUT_B_FS_YX_FSV4
-                    const __global uint* ptr = input + input_offset + kh * DILATION_SIZE_Y * input_y_pitch + (x_chunk + lid) * input_x_pitch;
-                    PACKED_IN_TYPE src = AS_PACKED_IN_TYPE(ptr[0]);
-                    slm_block_tail[lid] = src;
-                #endif
-                } else {
+                if (sg == LWS1 - 1) {
+                    __local PACKED_IN_TYPE* slm_block_tail = slm + kh*SLM_LINE_SIZE + kd*SLM_LINE_SIZE*FILTER_SIZE_Y + LWS1*SLM_CHUNK_SIZE;
+                    const int x_chunk = x_wg_start + LWS1*SLM_CHUNK_SIZE;
+                    bool x_cross_fm = x_chunk + lid >= INPUT0_SIZE_X;
+                    if (!x_cross_fm) {
+                    #if INPUT0_LAYOUT_BFYX || INPUT0_LAYOUT_BFZYX
+                        MAKE_VECTOR_TYPE(INPUT0_TYPE, ISV) src = 0;
+                        __attribute__((opencl_unroll_hint(INPUT0_FEATURE_NUM)))
+                        for (int i = 0; i < INPUT0_FEATURE_NUM; i++) {
+                            src[i] = input[input_offset + i * INPUT0_FEATURE_PITCH
+                                                        + kd * DILATION_SIZE_Z * INPUT0_Z_PITCH
+                                                        + kh * DILATION_SIZE_Y * INPUT0_Y_PITCH
+                                                        + (x_chunk + lid)* INPUT0_X_PITCH];
+                        }
+                        slm_block_tail[lid] = AS_PACKED_IN_TYPE(src);
+                    #elif INPUT0_LAYOUT_B_FS_YX_FSV4
+                        const __global uint* ptr = input + input_offset + kh * DILATION_SIZE_Y * input_y_pitch + (x_chunk + lid) * input_x_pitch;
+                        PACKED_IN_TYPE src = AS_PACKED_IN_TYPE(ptr[0]);
+                        slm_block_tail[lid] = src;
+                    #endif
+                    } else {
 #if ASYMMETRIC_DATA_QUANTIZATION
                     slm_block_tail[lid] = AS_PACKED_IN_TYPE(zp);
 #else  // ASYMMETRIC_DATA_QUANTIZATION
                     slm_block_tail[lid] = 0;
 #endif  // ASYMMETRIC_DATA_QUANTIZATION
+                    }
                 }
-            }
 #endif
+            }
         }
     }
 
     barrier(CLK_LOCAL_MEM_FENCE);
 
-    __attribute__((opencl_unroll_hint(FILTER_SIZE_Y)))
-    for (int kh = 0; kh < FILTER_SIZE_Y ; ++kh) {
-        bool y_cross_fm = input_y + kh*DILATION_SIZE_Y < 0 || input_y + kh*DILATION_SIZE_Y >= INPUT0_SIZE_Y;
+    for (int kd = 0; kd < FILTER_SIZE_Z; ++kd) {
+        bool z_cross_fm = input_z + kd*DILATION_SIZE_Z < 0 || input_z + kd*DILATION_SIZE_Z >= INPUT0_SIZE_Z;
 #if !ASYMMETRIC_DATA_QUANTIZATION
-        if (y_cross_fm)
+        if (z_cross_fm)
             continue;
 #endif
-        PACKED_IN_TYPE line_cache[INPUT_LINE_SIZE];
-        for (int xb = 0; xb < INPUT_LINE_SIZE; xb++) {
-            line_cache[xb] = slm[kh*SLM_LINE_SIZE + sg*OUTPUT_X_BLOCK_SIZE*STRIDE_SIZE_X + xb];
-        }
+        __attribute__((opencl_unroll_hint(FILTER_SIZE_Y)))
+        for (int kh = 0; kh < FILTER_SIZE_Y ; ++kh) {
+            bool y_cross_fm = input_y + kh*DILATION_SIZE_Y < 0 || input_y + kh*DILATION_SIZE_Y >= INPUT0_SIZE_Y;
+#if !ASYMMETRIC_DATA_QUANTIZATION
+            if (y_cross_fm)
+                continue;
+#endif
+            PACKED_IN_TYPE line_cache[INPUT_LINE_SIZE];
+            for (int xb = 0; xb < INPUT_LINE_SIZE; xb++) {
+                line_cache[xb] = slm[kd*SLM_LINE_SIZE*FILTER_SIZE_Y + kh*SLM_LINE_SIZE + sg*OUTPUT_X_BLOCK_SIZE*STRIDE_SIZE_X + xb];
+            }
 
-        __attribute__((opencl_unroll_hint(FILTER_SIZE_X)))
-        for (uint kw = 0; kw < FILTER_SIZE_X ; ++kw) {
-            const uint f_off = filter_idx
-                             + kh * OSV * ISV * FILTER_SIZE_X
-                             + kw * OSV * ISV;
+            __attribute__((opencl_unroll_hint(FILTER_SIZE_X)))
+            for (uint kw = 0; kw < FILTER_SIZE_X ; ++kw) {
+                const uint f_off = filter_idx
+                                + kd * OSV * ISV * FILTER_SIZE_X * FILTER_SIZE_Y
+                                + kh * OSV * ISV * FILTER_SIZE_X
+                                + kw * OSV * ISV;
 
-            int weights_data0 = as_int(intel_sub_group_block_read((const __global uint*)(weights + f_off)));
-            int weights_data1 = as_int(intel_sub_group_block_read((const __global uint*)(weights + f_off + SUB_GROUP_SIZE*ISV)));
+                int weights_data0 = as_int(intel_sub_group_block_read((const __global uint*)(weights + f_off)));
+                int weights_data1 = as_int(intel_sub_group_block_read((const __global uint*)(weights + f_off + SUB_GROUP_SIZE*ISV)));
 
-            PACKED_TYPE_VEC src;
+                PACKED_TYPE_VEC src;
 
-            __attribute__((opencl_unroll_hint(OUTPUT_X_BLOCK_SIZE)))
-            for (int i = 0; i < OUTPUT_X_BLOCK_SIZE; i++) {
-                // src[i] = slm[kh*SLM_LINE_SIZE + (sg*OUTPUT_X_BLOCK_SIZE + i)*STRIDE_SIZE_X + kw*DILATION_SIZE_X];
-                src[i] = line_cache[kw*DILATION_SIZE_X + STRIDE_SIZE_X*i];
-                acc[0][i] = IMAD(acc[0][i], AS_INPUT0_TYPE_4(src[i]), as_char4(weights_data0));
-                acc[1][i] = IMAD(acc[1][i], AS_INPUT0_TYPE_4(src[i]), as_char4(weights_data1));
+                __attribute__((opencl_unroll_hint(OUTPUT_X_BLOCK_SIZE)))
+                for (int i = 0; i < OUTPUT_X_BLOCK_SIZE; i++) {
+                    // src[i] = slm[kh*SLM_LINE_SIZE + (sg*OUTPUT_X_BLOCK_SIZE + i)*STRIDE_SIZE_X + kw*DILATION_SIZE_X];
+                    src[i] = line_cache[kw*DILATION_SIZE_X + STRIDE_SIZE_X*i];
+                    acc[0][i] = IMAD(acc[0][i], AS_INPUT0_TYPE_4(src[i]), as_char4(weights_data0));
+                    acc[1][i] = IMAD(acc[1][i], AS_INPUT0_TYPE_4(src[i]), as_char4(weights_data1));
 
 #if ASYMMETRIC_WEIGHTS_QUANTIZATION
-                acc_assym_weights[i] = IMAD(acc_assym_weights[i], AS_INPUT0_TYPE_4(src[i]), multiplier);
+                    acc_assym_weights[i] = IMAD(acc_assym_weights[i], AS_INPUT0_TYPE_4(src[i]), multiplier);
 #endif
+                }
             }
         }
     }
@@ -303,13 +325,21 @@ KERNEL(convolution_mmad_bfyx_to_b_fs_yx_fsv32)(
     for (int i = 0; i < OUTPUT_X_BLOCK_SIZE; i++) {
 #if OUTPUT_FEATURE_NUM > 16
         for (int ofm = 0; ofm < 2; ofm++) {
+#if OUTPUT_DIMS == 4
             const uint dst_index = OUTPUT_GET_INDEX(b, fg*OSV + SUB_GROUP_SIZE*ofm + lid, y, x+i);
+#elif OUTPUT_DIMS == 5
+            const uint dst_index = OUTPUT_GET_INDEX(b, fg*OSV + SUB_GROUP_SIZE*ofm + lid, z, y, x+i);
+#endif
             if (x + i < OUTPUT_SIZE_X && fg*OSV + SUB_GROUP_SIZE*ofm + lid < OUTPUT_FEATURE_NUM) {
                 output[dst_index] = dst[ofm][i];
             }
         }
 #else // OUTPUT_FEATURE_NUM > 16
+#if OUTPUT_DIMS == 4
         const uint dst_index = OUTPUT_GET_INDEX(b, fg*OSV + lid, y, x+i);
+#elif OUTPUT_DIMS == 5
+        const uint dst_index = OUTPUT_GET_INDEX(b, fg*OSV + lid, z, y, x+i);
+#endif
         if (x + i < OUTPUT_SIZE_X && fg*OSV + lid < OUTPUT_FEATURE_NUM) {
             output[dst_index] = dst[0][i];
         }
@@ -382,8 +412,13 @@ KERNEL(convolution_mmad_bfyx_to_b_fs_yx_fsv32)(
     }
 #else // OUTPUT_LAYOUT_B_FS_YX_FSV32
     if (full_x && full_f) {
+#if OUTPUT_DIMS == 4
         const uint dst_index0 = OUTPUT_GET_INDEX(b, fg*OSV, y, x);
         const uint dst_index1 = OUTPUT_GET_INDEX(b, fg*OSV+16, y, x);
+#elif OUTPUT_DIMS == 5
+        const uint dst_index0 = OUTPUT_GET_INDEX(b, fg*OSV, z, y, x);
+        const uint dst_index1 = OUTPUT_GET_INDEX(b, fg*OSV+16, z, y, x);
+#endif
         BLOCK_WRITE(output + dst_index0, dst[0]);
         BLOCK_WRITE(output + dst_index1, dst[1]);
     } else {
@@ -392,7 +427,11 @@ KERNEL(convolution_mmad_bfyx_to_b_fs_yx_fsv32)(
                 const bool full_it_x = OUTPUT_SIZE_X % OUTPUT_X_BLOCK_SIZE == 0 || x + i < OUTPUT_SIZE_X;
                 const bool full_sgl_f = OUTPUT_FEATURE_NUM % OSV == 0 || 16*ofm + lid < OUTPUT_FEATURE_NUM % OSV;
                 if (full_it_x && full_sgl_f) {
+#if OUTPUT_DIMS == 4
                     const uint dst_index = OUTPUT_GET_INDEX(b, fg*OSV + 16*ofm + lid, y, x+i);
+#elif OUTPUT_DIMS == 5
+                    const uint dst_index = OUTPUT_GET_INDEX(b, fg*OSV + 16*ofm + lid, z, y, x+i);
+#endif
                     output[dst_index] = dst[ofm][i];
                 }
             }
index a6af7a2..8062842 100644 (file)
@@ -902,24 +902,38 @@ inline uint FUNC(get_g_os_is_yx_osv16_isv4)(uint g, uint o, uint i, uint y, uint
 }
 
 #define GET_FILTER_OS_IS_YX_OSV16_ISV4_INDEX(prefix, o, i, y, x) \
-    FUNC_CALL(get_os_is_yx_osv_isv4)(                            \
-        o, i, y, x,                                              \
+    FUNC_CALL(get_os_is_zyx_osv_isv4)(                           \
+        o, i, 0, y, x,                                           \
         CAT(prefix, _IFM_PITCH),                                 \
         CAT(prefix, _OFM_PITCH),                                 \
-        CAT(prefix, _SIZE_X), 16)
+        CAT(prefix, _SIZE_X),                                    \
+        CAT(prefix, _SIZE_Y),                                    \
+        16)
 
 #define GET_FILTER_OS_IS_YX_OSV32_ISV4_INDEX(prefix, o, i, y, x) \
-    FUNC_CALL(get_os_is_yx_osv_isv4)(                            \
-        o, i, y, x,                                              \
+    FUNC_CALL(get_os_is_zyx_osv_isv4)(                           \
+        o, i, 0, y, x,                                           \
         CAT(prefix, _IFM_PITCH),                                 \
         CAT(prefix, _OFM_PITCH),                                 \
-        CAT(prefix, _SIZE_X), 32)
+        CAT(prefix, _SIZE_X),                                    \
+        CAT(prefix, _SIZE_Y),                                    \
+        32)
+
+#define GET_FILTER_OS_IS_ZYX_OSV32_ISV4_INDEX(prefix, o, i, z, y, x) \
+    FUNC_CALL(get_os_is_zyx_osv_isv4)(                               \
+        o, i, z, y, x,                                               \
+        CAT(prefix, _IFM_PITCH),                                     \
+        CAT(prefix, _OFM_PITCH),                                     \
+        CAT(prefix, _SIZE_X),                                        \
+        CAT(prefix, _SIZE_Y),                                        \
+        32)
 
-inline uint FUNC(get_os_is_yx_osv_isv4)(uint o, uint i, uint y, uint x,
-                                        uint i_size,
-                                        uint o_size,
-                                        uint x_size,
-                                        uint otd)
+inline uint FUNC(get_os_is_zyx_osv_isv4)(uint o, uint i, uint z, uint y, uint x,
+                                         uint i_size,
+                                         uint o_size,
+                                         uint x_size,
+                                         uint y_size,
+                                         uint otd)
 {
     uint out_depth_tile = o / otd;
     uint od             = o - out_depth_tile * otd;
@@ -930,6 +944,7 @@ inline uint FUNC(get_os_is_yx_osv_isv4)(uint o, uint i, uint y, uint x,
 
     uint idx = out_depth_tile * (o_size / tile) * otd * tile
                + id_tile               * i_size * otd * tile
+               + z            * y_size * x_size * otd * tile
                + y                     * x_size * otd * tile
                + x                              * otd * tile
                + od                                   * tile
index b5619c0..66240e1 100644 (file)
@@ -167,6 +167,8 @@ inline uint FUNC(get_output_index)(uint g, uint o, uint i, uint z, uint y, uint
     return GET_FILTER_OS_IS_YX_OSV32_ISV4_SWIZZLED_BY_2_INDEX(OUTPUT, o, i, y, x);
 #elif defined OUTPUT_LAYOUT_OS_IS_YX_OSV32_ISV4
     return GET_FILTER_OS_IS_YX_OSV32_ISV4_INDEX(OUTPUT, o, i, y, x);
+#elif defined OUTPUT_LAYOUT_OS_IS_ZYX_OSV32_ISV4
+    return GET_FILTER_OS_IS_ZYX_OSV32_ISV4_INDEX(OUTPUT, o, i, z, y, x);
 #elif defined OUTPUT_LAYOUT_OS_IS_YX_ISA8_OSV8_ISV4_SWIZZLED_BY_4
     return GET_FILTER_OS_IS_YX_ISA8_OSV8_ISV4_SWIZZLED_BY_4_INDEX(OUTPUT, o, i, y, x);
 #elif defined OUTPUT_LAYOUT_OS_IS_YX_OSA4_ISA8_OSV8_ISV4_SWIZZLED_BY_4
index 1376433..e9f97ae 100644 (file)
@@ -336,6 +336,7 @@ std::string toString(WeightsLayout layout) {
         case WeightsLayout::os_is_yx_osv16_isv4:                         return "OS_IS_YX_OSV16_ISV4";
         case WeightsLayout::os_is_yx_osv32_isv4_swizzled_by_2:           return "OS_IS_YX_OSV32_ISV4_SWIZZLED_BY_2";
         case WeightsLayout::os_is_yx_osv32_isv4:                         return "OS_IS_YX_OSV32_ISV4";
+        case WeightsLayout::os_is_zyx_osv32_isv4:                        return "OS_IS_ZYX_OSV32_ISV4";
         case WeightsLayout::os_is_y_x8_osv8_isv4_swizzled_by_4:          return "OS_IS_Y_X8_OSV8_ISV4_SWIZZLED_BY_4";
         case WeightsLayout::os_is_yx_osv32_isv32p:                       return "OS_IS_YX_OSV32_ISV32P";
         case WeightsLayout::oizyx:                                       return "OIZYX";
index 98a6e35..ac9cc42 100644 (file)
@@ -85,8 +85,9 @@ void remove_redundant_reorders::run(program_impl& p) {
                 continue;
 
             auto output_padded = static_cast<bool>(output_layout.data_padding);
-            auto can_omit_padding = (output_layout.format == format::b_fs_yx_fsv16 || output_layout.format == format::b_fs_yx_fsv32) &&
-                                    (input.get_output_layout().format == format::bfyx || input.get_output_layout().format == format::b_fs_yx_fsv4);
+            auto can_omit_padding = ((output_layout.format == format::b_fs_yx_fsv16 || output_layout.format == format::b_fs_yx_fsv32) &&
+                                    (input.get_output_layout().format == format::bfyx || input.get_output_layout().format == format::b_fs_yx_fsv4)) ||
+                                    (output_layout.format == format::b_fs_zyx_fsv16 && input.get_output_layout().format == format::bfzyx);
 
             if (output_padded && !can_omit_padding) {
                 if (input.get_users().size() != 1)
index 561fae4..3ae6996 100644 (file)
@@ -156,6 +156,8 @@ inline std::string fmt_to_str(format fmt) {
             return "os_is_yx_osv32_isv4_swizzled_by_2";
         case format::os_is_yx_osv32_isv4:
             return "os_is_yx_osv32_isv4";
+        case format::os_is_zyx_osv32_isv4:
+            return "os_is_zyx_osv32_isv4";
         case format::os_is_y_x8_osv8_isv4:
             return "os_is_y_x8_osv8_isv4";
         case format::os_is_yx_osv32_isv32p:
index c4f9daf..fe0703a 100644 (file)
@@ -256,6 +256,8 @@ kernel_selector::weights_layout to_weights_layout(format f) {
             return kernel_selector::weights_layout::os_is_yx_osv32_isv4_swizzled_by_2;
         case format::os_is_yx_osv32_isv4:
             return kernel_selector::weights_layout::os_is_yx_osv32_isv4;
+        case format::os_is_zyx_osv32_isv4:
+            return kernel_selector::weights_layout::os_is_zyx_osv32_isv4;
         case format::os_is_yx_osv32_isv32p:
             return kernel_selector::weights_layout::os_is_yx_osv32_isv32p;
         case format::os_is_yx_isv16_osv16:
@@ -394,6 +396,8 @@ cldnn::format::type from_weights_layout(kernel_selector::weights_layout l) {
             return format::os_is_yx_osv32_isv4_swizzled_by_2;
         case kernel_selector::weights_layout::os_is_yx_osv32_isv4:
             return format::os_is_yx_osv32_isv4;
+        case kernel_selector::weights_layout::os_is_zyx_osv32_isv4:
+            return format::os_is_zyx_osv32_isv4;
         case kernel_selector::weights_layout::os_is_y_x8_osv8_isv4_swizzled_by_4:
             return cldnn::format::os_is_y_x8_osv8_isv4_swizzled_by_4;
         case kernel_selector::weights_layout::os_is_yx_osv32_isv32p:
index ac86072..5cc8c4b 100644 (file)
@@ -7205,7 +7205,9 @@ INSTANTIATE_TEST_CASE_P(convolution_grouped_fsv4_fsv16,
                             TestParamType_grouped_convolution_gpu(3, 1, 5, 196, 252, 3, 1, 3, 4, 1, 1, format::b_fs_zyx_fsv16, ""),
                             TestParamType_grouped_convolution_gpu(4, 1, 6, 256, 256, 2, 1, 2, 4, 1, 1, format::b_fs_zyx_fsv16, ""),
                             TestParamType_grouped_convolution_gpu(4, 1, 6, 256, 512, 2, 1, 3, 16, 1, 1, format::b_fs_zyx_fsv16, ""),
-                            TestParamType_grouped_convolution_gpu(1, 3, 1, 18, 2, 1, 3, 1, 2, 1, 1, format::b_fs_zyx_fsv16, "")
+                            TestParamType_grouped_convolution_gpu(1, 3, 1, 18, 2, 1, 3, 1, 2, 1, 1, format::b_fs_zyx_fsv16, ""),
+                            TestParamType_grouped_convolution_gpu(2, 3, 4, 3, 18, 3, 3, 3, 1, 1, 1, format::b_fs_zyx_fsv16, "convolution_gpu_mmad_bfyx_to_b_fs_yx_fsv32"),
+                            TestParamType_grouped_convolution_gpu(79, 224, 224, 3, 64, 3, 3, 3, 1, 2, 1, format::b_fs_zyx_fsv16, "convolution_gpu_mmad_bfyx_to_b_fs_yx_fsv32")
                         ),
                         convolution_grouped_gpu::PrintToStringParamName);
 
index 1cb500e..053460e 100644 (file)
@@ -303,7 +303,7 @@ public:
     }
 
     layout get_bias_layout(T& p) {
-        return layout{ p.default_type, p.default_format, tensor{1, p.out_shape.feature[0], 1, 1} };
+        return layout{ p.default_type, format::bfyx, tensor{1, p.out_shape.feature[0], 1, 1} };
     }
 
     layout get_weights_zp_layout(T& p) {
@@ -491,12 +491,14 @@ public:
 #define CASE_CONV3D_U8S8_1 {1, 15, 5, 4, 5}, {1, 30, 3, 2, 3}, {1, 1, 3, 3, 3}, tensor{1}, tensor{0}, tensor{1}, 1, data_types::u8, format::bfzyx, data_types::i8, format::bfzyx, data_types::f32, format::bfzyx
 #define CASE_CONV3D_U8S8_2 {1, 15, 5, 5, 5}, {1, 30, 3, 3, 3}, {1, 1, 3, 3, 3}, tensor{1}, tensor{0}, tensor{1}, 1, data_types::u8, format::bfzyx, data_types::i8, format::bfzyx, data_types::f32, format::bfzyx
 #define CASE_CONV3D_U8S8_3 {1, 16, 5, 4, 5}, {1, 32, 5, 4, 5}, {1, 1, 1, 1, 1}, tensor{1}, tensor{0}, tensor{1}, 1, data_types::u8, format::bfzyx, data_types::i8, format::bfzyx, data_types::f32, format::bfzyx
-#define CASE_CONV3D_U8S8_4 {1, 17, 5, 4, 5}, {1, 17, 5, 4, 5}, {1, 1, 3, 3, 3}, tensor{1}, tensor{0, 0, -1, -1, -1}, tensor{1}, 17, data_types::u8, format::bfzyx, data_types::i8, format::goizyx, data_types::f32, format::bfzyx
+#define CASE_CONV3D_U8S8_4 {1, 17, 5, 4, 5}, {1, 17, 5, 4, 5}, {1, 1, 3, 3, 3}, tensor{1}, tensor{{0, 0, -1, -1, -1}, 0}, tensor{1}, 17, data_types::u8, format::bfzyx, data_types::i8, format::goizyx, data_types::f32, format::bfzyx
+#define CASE_CONV3D_U8S8_5 {1, 3, 5, 4, 5},  {1, 32, 5, 4, 5}, {1, 1, 3, 3, 3}, tensor{1}, tensor{{0, 0, -1, -1, -1}, 0}, tensor{1}, 1, data_types::u8, format::bfzyx, data_types::i8, format::bfzyx, data_types::f32, format::bfzyx
 
 #define CASE_CONV3D_S8S8_1 {1, 15, 5, 4, 5}, {1, 30, 3, 2, 3}, {1, 1, 3, 3, 3}, tensor{1}, tensor{0}, tensor{1}, 1, data_types::i8, format::bfzyx, data_types::i8, format::bfzyx, data_types::f32, format::bfzyx
 #define CASE_CONV3D_S8S8_2 {1, 15, 5, 5, 5}, {1, 30, 3, 3, 3}, {1, 1, 3, 3, 3}, tensor{1}, tensor{0}, tensor{1}, 1, data_types::i8, format::bfzyx, data_types::i8, format::bfzyx, data_types::f32, format::bfzyx
 #define CASE_CONV3D_S8S8_3 {1, 16, 5, 4, 5}, {1, 32, 5, 4, 5}, {1, 1, 1, 1, 1}, tensor{1}, tensor{0}, tensor{1}, 1, data_types::i8, format::bfzyx, data_types::i8, format::bfzyx, data_types::f32, format::bfzyx
 #define CASE_CONV3D_S8S8_4 {1, 17, 5, 4, 5}, {1, 17, 5, 4, 5}, {1, 1, 3, 3, 3}, tensor{1}, tensor{{0, 0, -1, -1, -1}, 0}, tensor{1}, 17, data_types::i8, format::bfzyx, data_types::i8, format::goizyx, data_types::f32, format::bfzyx
+#define CASE_CONV3D_S8S8_5 {1, 3, 5, 4, 5},  {1, 18, 5, 4, 5}, {1, 1, 3, 3, 3}, tensor{1}, tensor{{0, 0, -1, -1, -1}, 0}, tensor{1}, 1, data_types::i8, format::bfzyx, data_types::i8, format::bfzyx, data_types::f32, format::bfzyx
 
 // in_shape; out_shape; eltw_shape; kernel; stride; pad; dilation; groups; data_type; input_format; weights_type; weights_format; default_type; default_format;
 #define CASE_CONV_ELTW_FP32_1 {1, 16, 4, 5}, {1, 32, 2, 3}, {1, 32, 1, 1}, {1, 1, 3, 3}, tensor{1}, tensor{0}, tensor{1}, 1, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::oiyx, data_types::f32, format::bfyx
@@ -1348,10 +1350,12 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, conv_int8_scale,
                                 bc_test_params{CASE_CONV3D_U8S8_2, 2, 3},
                                 bc_test_params{CASE_CONV3D_U8S8_3, 2, 3},
                                 bc_test_params{CASE_CONV3D_U8S8_4, 2, 3},
+                                bc_test_params{CASE_CONV3D_U8S8_5, 2, 3},
                                 bc_test_params{CASE_CONV3D_S8S8_1, 2, 3},
                                 bc_test_params{CASE_CONV3D_S8S8_2, 2, 3},
                                 bc_test_params{CASE_CONV3D_S8S8_3, 2, 3},
                                 bc_test_params{CASE_CONV3D_S8S8_4, 2, 3},
+                                bc_test_params{CASE_CONV3D_S8S8_5, 2, 3},
                         }), );
 
 class conv_int8_scale_shift_swish : public ConvFusingTest {};
@@ -1389,10 +1393,12 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, conv_int8_scale_shift_swish,
                                 bc_test_params{CASE_CONV3D_U8S8_2, 2, 6},
                                 bc_test_params{CASE_CONV3D_U8S8_3, 2, 6},
                                 bc_test_params{CASE_CONV3D_U8S8_4, 2, 6},
+                                bc_test_params{CASE_CONV3D_U8S8_5, 2, 6},
                                 bc_test_params{CASE_CONV3D_S8S8_1, 2, 6},
                                 bc_test_params{CASE_CONV3D_S8S8_2, 2, 6},
                                 bc_test_params{CASE_CONV3D_S8S8_3, 2, 6},
                                 bc_test_params{CASE_CONV3D_S8S8_4, 2, 6},
+                                bc_test_params{CASE_CONV3D_S8S8_5, 2, 6},
                         }), );
 
 class conv_int8_prelu_eltwise : public ConvFusingTest {};
@@ -1457,10 +1463,12 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, conv_int8_prelu_eltwise,
                                 bc_test_params{CASE_CONV3D_U8S8_2, 2, 4},
                                 bc_test_params{CASE_CONV3D_U8S8_3, 2, 4},
                                 bc_test_params{CASE_CONV3D_U8S8_4, 2, 4},
+                                bc_test_params{CASE_CONV3D_U8S8_5, 2, 4},
                                 bc_test_params{CASE_CONV3D_S8S8_1, 2, 4},
                                 bc_test_params{CASE_CONV3D_S8S8_2, 2, 4},
                                 bc_test_params{CASE_CONV3D_S8S8_3, 2, 4},
                                 bc_test_params{CASE_CONV3D_S8S8_4, 2, 4},
+                                bc_test_params{CASE_CONV3D_S8S8_5, 2, 4},
                         }), );
 
 class conv_int8_activation_eltwise_quantize : public ConvFusingTest {};
@@ -1656,10 +1664,12 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, conv_int8_quantize_u8,
                                 bc_test_params{CASE_CONV3D_U8S8_2, 2, 3},
                                 bc_test_params{CASE_CONV3D_U8S8_3, 2, 3},
                                 bc_test_params{CASE_CONV3D_U8S8_4, 2, 3},
+                                bc_test_params{CASE_CONV3D_U8S8_5, 2, 3},
                                 bc_test_params{CASE_CONV3D_S8S8_1, 2, 3},
                                 bc_test_params{CASE_CONV3D_S8S8_2, 2, 3},
                                 bc_test_params{CASE_CONV3D_S8S8_3, 2, 3},
                                 bc_test_params{CASE_CONV3D_S8S8_4, 2, 3},
+                                bc_test_params{CASE_CONV3D_S8S8_5, 2, 3},
                         }), );
 
 class conv_int8_scale_quantize_i8 : public ConvFusingTest {};
@@ -1704,10 +1714,12 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, conv_int8_scale_quantize_i8,
                                 bc_test_params{CASE_CONV3D_U8S8_2, 2, 4},
                                 bc_test_params{CASE_CONV3D_U8S8_3, 2, 4},
                                 bc_test_params{CASE_CONV3D_U8S8_4, 2, 4},
+                                bc_test_params{CASE_CONV3D_U8S8_5, 2, 4},
                                 bc_test_params{CASE_CONV3D_S8S8_1, 2, 4},
                                 bc_test_params{CASE_CONV3D_S8S8_2, 2, 4},
                                 bc_test_params{CASE_CONV3D_S8S8_3, 2, 4},
                                 bc_test_params{CASE_CONV3D_S8S8_4, 2, 4},
+                                bc_test_params{CASE_CONV3D_S8S8_5, 2, 4},
                         }), );
 
 class conv_int8_scale_quantize_i8_conv_b_fs_yx_fsv4_int8 : public ConvFusingTest {};
@@ -1793,10 +1805,12 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, conv_int8_relu_quantize,
                                 bc_test_params{CASE_CONV3D_U8S8_2, 2, 4},
                                 bc_test_params{CASE_CONV3D_U8S8_3, 2, 4},
                                 bc_test_params{CASE_CONV3D_U8S8_4, 2, 4},
+                                bc_test_params{CASE_CONV3D_U8S8_5, 2, 4},
                                 bc_test_params{CASE_CONV3D_S8S8_1, 2, 4},
                                 bc_test_params{CASE_CONV3D_S8S8_2, 2, 4},
                                 bc_test_params{CASE_CONV3D_S8S8_3, 2, 4},
                                 bc_test_params{CASE_CONV3D_S8S8_4, 2, 4},
+                                bc_test_params{CASE_CONV3D_S8S8_5, 2, 4},
                         }), );
 
 class conv_int8_scale_activation_quantize_i8 : public ConvFusingTest {};
@@ -1836,10 +1850,12 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, conv_int8_scale_activation_quantize_i8,
                                 bc_test_params{CASE_CONV3D_U8S8_2, 2, 5},
                                 bc_test_params{CASE_CONV3D_U8S8_3, 2, 5},
                                 bc_test_params{CASE_CONV3D_U8S8_4, 2, 5},
+                                bc_test_params{CASE_CONV3D_U8S8_5, 2, 5},
                                 bc_test_params{CASE_CONV3D_S8S8_1, 2, 5},
                                 bc_test_params{CASE_CONV3D_S8S8_2, 2, 5},
                                 bc_test_params{CASE_CONV3D_S8S8_3, 2, 5},
                                 bc_test_params{CASE_CONV3D_S8S8_4, 2, 5},
+                                bc_test_params{CASE_CONV3D_S8S8_5, 2, 5},
                         }), );
 
 class conv_int8_scale_activation_quantize_i8_eltwise_fp32 : public ConvFusingTest {};
@@ -1880,10 +1896,12 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, conv_int8_scale_activation_quantize_i8_eltw
                                 bc_test_params{CASE_CONV3D_U8S8_2, 2, 6},
                                 bc_test_params{CASE_CONV3D_U8S8_3, 2, 6},
                                 bc_test_params{CASE_CONV3D_U8S8_4, 2, 6},
+                                bc_test_params{CASE_CONV3D_U8S8_5, 2, 6},
                                 bc_test_params{CASE_CONV3D_S8S8_1, 2, 6},
                                 bc_test_params{CASE_CONV3D_S8S8_2, 2, 6},
                                 bc_test_params{CASE_CONV3D_S8S8_3, 2, 6},
                                 bc_test_params{CASE_CONV3D_S8S8_4, 2, 6},
+                                bc_test_params{CASE_CONV3D_S8S8_5, 2, 6},
                         }), );
 
 class conv_int8_scale_activation_quantize_i8_activation : public ConvFusingTest {};
@@ -1924,10 +1942,12 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, conv_int8_scale_activation_quantize_i8_acti
                                 bc_test_params{CASE_CONV3D_U8S8_2, 2, 6},
                                 bc_test_params{CASE_CONV3D_U8S8_3, 2, 6},
                                 bc_test_params{CASE_CONV3D_U8S8_4, 2, 6},
+                                bc_test_params{CASE_CONV3D_U8S8_5, 2, 6},
                                 bc_test_params{CASE_CONV3D_S8S8_1, 2, 6},
                                 bc_test_params{CASE_CONV3D_S8S8_2, 2, 6},
                                 bc_test_params{CASE_CONV3D_S8S8_3, 2, 6},
                                 bc_test_params{CASE_CONV3D_S8S8_4, 2, 6},
+                                bc_test_params{CASE_CONV3D_S8S8_5, 2, 6},
                         }), );
 
 
@@ -1974,10 +1994,12 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, conv_int8_scale_activation_quantize_i8_eltw
                                 bc_test_params{CASE_CONV3D_U8S8_2, 2, 7},
                                 bc_test_params{CASE_CONV3D_U8S8_3, 2, 7},
                                 bc_test_params{CASE_CONV3D_U8S8_4, 2, 7},
+                                bc_test_params{CASE_CONV3D_U8S8_5, 2, 7},
                                 bc_test_params{CASE_CONV3D_S8S8_1, 2, 7},
                                 bc_test_params{CASE_CONV3D_S8S8_2, 2, 7},
                                 bc_test_params{CASE_CONV3D_S8S8_3, 2, 7},
                                 bc_test_params{CASE_CONV3D_S8S8_4, 2, 7},
+                                bc_test_params{CASE_CONV3D_S8S8_5, 2, 7},
                         }), );
 
 class conv_int8_scale_prelu_quantize_i8_eltwise_fp32_quantize_i8_vec : public ConvFusingTest {};
@@ -2181,10 +2203,12 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, conv_int8_asymmetric_data,
                                 bc_test_params{CASE_CONV3D_U8S8_2, 2, 3},
                                 bc_test_params{CASE_CONV3D_U8S8_3, 2, 3},
                                 bc_test_params{CASE_CONV3D_U8S8_4, 2, 3},
+                                bc_test_params{CASE_CONV3D_U8S8_5, 2, 3},
                                 bc_test_params{CASE_CONV3D_S8S8_1, 2, 3},
                                 bc_test_params{CASE_CONV3D_S8S8_2, 2, 3},
                                 bc_test_params{CASE_CONV3D_S8S8_3, 2, 3},
                                 bc_test_params{CASE_CONV3D_S8S8_4, 2, 3},
+                                bc_test_params{CASE_CONV3D_S8S8_5, 2, 3},
                         }), );
 
 class conv_int8_asymmetric_data_and_weights : public ConvFusingTest {};
@@ -2250,10 +2274,12 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, conv_int8_asymmetric_data_and_weights,
                                 bc_test_params{CASE_CONV3D_U8S8_2, 2, 3},
                                 bc_test_params{CASE_CONV3D_U8S8_3, 2, 3},
                                 bc_test_params{CASE_CONV3D_U8S8_4, 2, 3},
+                                bc_test_params{CASE_CONV3D_U8S8_5, 2, 3},
                                 bc_test_params{CASE_CONV3D_S8S8_1, 2, 3},
                                 bc_test_params{CASE_CONV3D_S8S8_2, 2, 3},
                                 bc_test_params{CASE_CONV3D_S8S8_3, 2, 3},
                                 bc_test_params{CASE_CONV3D_S8S8_4, 2, 3},
+                                bc_test_params{CASE_CONV3D_S8S8_5, 2, 3},
                         }), );