[IE CLDNN] Loosen restrictions on channels in 3d fsv16 convolution kernel (#1744)
authorLukasz Debski <Lukasz.Debski@intel.com>
Tue, 1 Sep 2020 12:17:03 +0000 (14:17 +0200)
committerGitHub <noreply@github.com>
Tue, 1 Sep 2020 12:17:03 +0000 (15:17 +0300)
inference-engine/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/convolution.cpp
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_b_fs_zyx_fsv16.cpp
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_b_fs_zyx_fsv16.h
inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/gen9_common_conv_fwd_data_f16.cl
inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/gen9_common_conv_fwd_data_f32.cl
inference-engine/thirdparty/clDNN/src/layout_optimizer.cpp
inference-engine/thirdparty/clDNN/tests/test_cases/convolution_gpu_test.cpp

index 05d1c8c..df7b9b6 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2019 Intel Corporation
+// Copyright (C) 2019-2020 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -26,7 +26,7 @@ const std::vector<std::vector<ptrdiff_t>> padEnds = {{0, 0},
                                                      {0, 3}};
 const std::vector<std::vector<size_t >> dilations = {{1, 1},
                                                             {3, 1}};
-const std::vector<size_t> numOutCannels = {1, 5};
+const std::vector<size_t> numOutChannels = {1, 5};
 const std::vector<ngraph::op::PadType> padTypes = {
         ngraph::op::PadType::EXPLICIT,
         ngraph::op::PadType::VALID
@@ -37,7 +37,7 @@ const auto conv2DParams_ExplicitPadding = ::testing::Combine(
         ::testing::ValuesIn(padBegins),
         ::testing::ValuesIn(padEnds),
         ::testing::ValuesIn(dilations),
-        ::testing::ValuesIn(numOutCannels),
+        ::testing::ValuesIn(numOutChannels),
         ::testing::Values(ngraph::op::PadType::EXPLICIT)
 );
 const auto conv2DParams_AutoPadValid = ::testing::Combine(
@@ -46,7 +46,7 @@ const auto conv2DParams_AutoPadValid = ::testing::Combine(
         ::testing::Values(std::vector<ptrdiff_t>({0, 0})),
         ::testing::Values(std::vector<ptrdiff_t>({0, 0})),
         ::testing::ValuesIn(dilations),
-        ::testing::ValuesIn(numOutCannels),
+        ::testing::ValuesIn(numOutChannels),
         ::testing::Values(ngraph::op::PadType::VALID)
 );
 
@@ -75,8 +75,9 @@ const std::vector<std::vector<ptrdiff_t>> paddings3d = {{0, 0, 0},
 const std::vector<std::vector<size_t >> strides3d = {{1, 1, 1},
                                                             {1, 2, 1}};
 
-const std::vector<std::vector<size_t >> dilations3d = {{1, 1, 1},
-                                                              {1, 2, 1}};
+const std::vector<std::vector<size_t >> dilations3d = { {1, 1, 1} };
+
+const std::vector<size_t > numOutChannels3d = {1, 5, 16};
 
 const auto conv3DParams = ::testing::Combine(
         ::testing::ValuesIn(kernels3d),
@@ -84,11 +85,11 @@ const auto conv3DParams = ::testing::Combine(
         ::testing::ValuesIn(paddings3d),
         ::testing::ValuesIn(paddings3d),
         ::testing::ValuesIn(dilations3d),
-        ::testing::Values(5),
+        ::testing::ValuesIn(numOutChannels3d),
         ::testing::Values(ngraph::op::PadType::EXPLICIT)
 );
 
-INSTANTIATE_TEST_CASE_P(Convolution3D, ConvolutionLayerTest,
+INSTANTIATE_TEST_CASE_P(Convolution3D_Basic1, ConvolutionLayerTest,
                         ::testing::Combine(
                                 conv3DParams,
                                 ::testing::ValuesIn(netPrecisions),
index 7562696..4011302 100644 (file)
@@ -47,19 +47,29 @@ FusedOpsConfiguration GenerateFusedOpsConfiguration_f16(size_t conf_id, std::str
 }
 
 FusedOpsConfiguration GenerateFusedOpsConfiguration_bsv16_fsv16(size_t conf_id, std::string input_name, Datatype dt,
-                                                                size_t dims) {
+                                                                size_t dims, bool is_vector) {
+    std::string suffix = (is_vector ? "_VEC" : "_SCALAR") + std::to_string(conf_id);
+    std::string input_var_name = input_name + std::to_string(conf_id) + (is_vector ? "" : "[i]");
+    size_t vec_size = is_vector ? 8 : 1;
     std::vector<std::string> idx_order;
-    if (dims == 5)
-        idx_order = {"(mb + " + std::to_string(conf_id * 8) + ")", "(oc*16)", "od", "oh", "ow"};
-    else
-        idx_order = {"(mb + " + std::to_string(conf_id * 8) + ")", "(oc*16)", "oh", "ow"};
+    if (is_vector) {
+        if (dims == 5)
+            idx_order = {"(mb + " + std::to_string(conf_id * 8) + ")", "(oc*16)", "od", "oh", "ow"};
+        else
+            idx_order = {"(mb + " + std::to_string(conf_id * 8) + ")", "(oc*16)", "oh", "ow"};
+    } else {
+        if (dims == 5)
+            idx_order = {"(mb + " + std::to_string(conf_id * 8) + ")", "(oc*16 + local_id)", "od", "oh", "(ow + i)"};
+        else
+            idx_order = {"(mb + " + std::to_string(conf_id * 8) + ")", "(oc*16 + local_id)", "oh", "(ow + i)"};
+    }
 
-    return { "_VEC" + std::to_string(conf_id),
+    return { suffix,
              idx_order,
-             input_name + std::to_string(conf_id),
+             input_var_name,
              dt,
-             8,
-             FusedOpsConfiguration::LoadType::LT_ALIGNED_READ,
+             vec_size,
+             is_vector ? FusedOpsConfiguration::LoadType::LT_ALIGNED_READ : FusedOpsConfiguration::LoadType::LT_UNALIGNED,
              FusedOpsConfiguration::BoundaryCheck::ENABLED,
              FusedOpsConfiguration::IndexType::TENSOR_COORD,
              Tensor::DataChannelName::BATCH };
@@ -106,7 +116,7 @@ ConvolutionKernelBase::DispatchData ConvolutionKernel_b_fs_zyx_fsv16::SetDefault
     auto b = out.Batch().v;
     auto g = params.groups;
 
-    const bool is_1stconv = input.Feature().v == 3;
+    const bool is_1stconv = input.Feature().v == 3 && input.GetLayout() == DataLayout::bfzyx;
     const bool ver_16mb16c = !is_1stconv &&
         ((out.GetDType() == Datatype::F16 && b % 32 == 0) ||
         (out.GetDType() == Datatype::F32 && b % 16 == 0));
@@ -140,7 +150,7 @@ ConvolutionKernelBase::DispatchData ConvolutionKernel_b_fs_zyx_fsv16::SetDefault
             kd.gws2 = b * f / ocb;
         }
     } else if (ver_16mb16c) {
-        f = f / g;
+        f = (g > 1) ? f/g : Align(f, 16);
         kd.lws0 = sub_group_size;
         kd.lws1 = 1;
         kd.lws2 = 1;
@@ -152,7 +162,7 @@ ConvolutionKernelBase::DispatchData ConvolutionKernel_b_fs_zyx_fsv16::SetDefault
         kd.cldnnStyle.blockWidth = 1;
     } else {
         auto oh_block = 1;
-        f = (g > 1) ? Align(f / g, 16) : f;
+        f = Align(f / g, 16);
 
         auto div = 16;
         while (div > 1) {
@@ -201,16 +211,14 @@ bool ConvolutionKernel_b_fs_zyx_fsv16::Validate(const Params& p, const optional_
     if (output.GetDType() != use_data_type)
         return false;
 
-    if (output.Feature().v % feature_block_size != 0)
-        return false;
-
     if (input.GetLayout() == DataLayout::bfzyx) {
-        if (input.Feature().v != 3)
+        if (input.Feature().v != 3 || output.Feature().v % feature_block_size != 0)
             return false;
         if (output.GetDType() == Datatype::F16 && (output.Feature().v % 32 != 0))
             return false;
     } else {
-        if ((input.Feature().v / params.groups) % feature_block_size != 0 && (input.Feature().v / params.groups) != 8)
+        if ((params.groups > 1) && (input.Feature().v / params.groups) % feature_block_size != 0 &&
+            (input.Feature().v / params.groups) != 8)
             return false;
     }
 
@@ -228,10 +236,9 @@ JitConstants ConvolutionKernel_b_fs_zyx_fsv16::GetJitConstants(const convolution
     auto output = params.output;
     auto jit = Parent::GetJitConstants(params, runInfo);
 
-    const bool is_1stconv = input.Feature().v == 3;
-    const bool ver_16mb16c = !is_1stconv &&
-        ((output.GetDType() == Datatype::F16 && output.Batch().v % 32 == 0) ||
-         (output.GetDType() == Datatype::F32 && output.Batch().v % 16 == 0));
+    const bool is_1stconv = input.Feature().v == 3 && input.GetLayout() == DataLayout::bfzyx;
+    const bool ver_16mb16c = !is_1stconv && ((output.GetDType() == Datatype::F16 && output.Batch().v % 32 == 0) ||
+                                             (output.GetDType() == Datatype::F32 && output.Batch().v % 16 == 0));
 
     if (ver_16mb16c) {
         jit.AddConstant(MakeJitConstant("VER_16MB16C", 1));
@@ -290,15 +297,22 @@ JitConstants ConvolutionKernel_b_fs_zyx_fsv16::GetJitConstants(const convolution
     if (ver_16mb16c && !params.fused_ops.empty()) {
         const auto dims_num = DataTensor::ChannelsCount(input.GetLayout());
         if (output.GetDType() != Datatype::F16) {
-            FusedOpsConfiguration conf_vec0 = GenerateFusedOpsConfiguration_bsv16_fsv16(0, "blockC0", input_dt, dims_num);
-            FusedOpsConfiguration conf_vec1 = GenerateFusedOpsConfiguration_bsv16_fsv16(1, "blockC0", input_dt, dims_num);
-            jit.Merge(MakeFusedOpsJitConstants(params, {conf_vec0, conf_vec1}));
+            FusedOpsConfiguration conf_vec0 = GenerateFusedOpsConfiguration_bsv16_fsv16(0, "blockC0", input_dt, dims_num, true);
+            FusedOpsConfiguration conf_vec1 = GenerateFusedOpsConfiguration_bsv16_fsv16(1, "blockC0", input_dt, dims_num, true);
+            FusedOpsConfiguration conf_scalar0 = GenerateFusedOpsConfiguration_bsv16_fsv16(0, "blockC0", input_dt, dims_num, false);
+            FusedOpsConfiguration conf_scalar1 = GenerateFusedOpsConfiguration_bsv16_fsv16(1, "blockC0", input_dt, dims_num, false);
+            jit.Merge(MakeFusedOpsJitConstants(params, {conf_vec0, conf_vec1, conf_scalar0, conf_scalar1}));
         } else {
-            FusedOpsConfiguration conf_vec0 = GenerateFusedOpsConfiguration_bsv16_fsv16(0, "C0", input_dt, dims_num);
-            FusedOpsConfiguration conf_vec1 = GenerateFusedOpsConfiguration_bsv16_fsv16(1, "C0", input_dt, dims_num);
-            FusedOpsConfiguration conf_vec2 = GenerateFusedOpsConfiguration_bsv16_fsv16(2, "C0", input_dt, dims_num);
-            FusedOpsConfiguration conf_vec3 = GenerateFusedOpsConfiguration_bsv16_fsv16(3, "C0", input_dt, dims_num);
-            jit.Merge(MakeFusedOpsJitConstants(params, {conf_vec0, conf_vec1, conf_vec2, conf_vec3}));
+            FusedOpsConfiguration conf_vec0 = GenerateFusedOpsConfiguration_bsv16_fsv16(0, "C0", input_dt, dims_num, true);
+            FusedOpsConfiguration conf_vec1 = GenerateFusedOpsConfiguration_bsv16_fsv16(1, "C0", input_dt, dims_num, true);
+            FusedOpsConfiguration conf_vec2 = GenerateFusedOpsConfiguration_bsv16_fsv16(2, "C0", input_dt, dims_num, true);
+            FusedOpsConfiguration conf_vec3 = GenerateFusedOpsConfiguration_bsv16_fsv16(3, "C0", input_dt, dims_num, true);
+            FusedOpsConfiguration conf_scalar0 = GenerateFusedOpsConfiguration_bsv16_fsv16(0, "C0", input_dt, dims_num, false);
+            FusedOpsConfiguration conf_scalar1 = GenerateFusedOpsConfiguration_bsv16_fsv16(1, "C0", input_dt, dims_num, false);
+            FusedOpsConfiguration conf_scalar2 = GenerateFusedOpsConfiguration_bsv16_fsv16(2, "C0", input_dt, dims_num, false);
+            FusedOpsConfiguration conf_scalar3 = GenerateFusedOpsConfiguration_bsv16_fsv16(3, "C0", input_dt, dims_num, false);
+            jit.Merge(MakeFusedOpsJitConstants(params, {conf_vec0, conf_vec1, conf_vec2, conf_vec3,
+                                                        conf_scalar0, conf_scalar1, conf_scalar2, conf_scalar3}));
         }
     } else if (!is_1stconv && !params.fused_ops.empty()) {
         FusedOpsConfiguration conf_vec0 = GenerateFusedOpsConfiguration_f16(0, "blockC0", input_dt, true);
@@ -322,12 +336,19 @@ JitConstants ConvolutionKernel_b_fs_zyx_fsv16::GetJitConstants(const convolution
     jit.AddConstant(MakeJitConstant("IS_DW", "DEPTHWISE_SEPARABLE_OPT"));
     jit.AddConstant(MakeJitConstant("WITH_BIAS", "BIAS_TERM"));
 
+    if (is_1stconv || params.groups > 1) {
+        jit.AddConstant(MakeJitConstant("OC", output.Feature().v / params.groups));
+        jit.AddConstant(MakeJitConstant("IC", input.Feature().v / params.groups));
+    } else {
+        jit.AddConstant(MakeJitConstant("OC", Align(output.Feature().v, 16)));
+        jit.AddConstant(MakeJitConstant("IC", Align(input.Feature().v, 16)));
+    }
+
     jit.AddConstant(MakeJitConstant("MB", "OUTPUT_BATCH_NUM"));
-    jit.AddConstant(MakeJitConstant("OC", output.Feature().v / params.groups));
     jit.AddConstant(MakeJitConstant("OD", "OUTPUT_SIZE_Z"));
     jit.AddConstant(MakeJitConstant("OH", "OUTPUT_SIZE_Y"));
     jit.AddConstant(MakeJitConstant("OW", "OUTPUT_SIZE_X"));
-    jit.AddConstant(MakeJitConstant("IC", input.Feature().v / params.groups));
+
     jit.AddConstant(MakeJitConstant("ID", "INPUT0_SIZE_Z"));
     jit.AddConstant(MakeJitConstant("IH", "INPUT0_SIZE_Y"));
     jit.AddConstant(MakeJitConstant("IW", "INPUT0_SIZE_X"));
@@ -344,16 +365,26 @@ JitConstants ConvolutionKernel_b_fs_zyx_fsv16::GetJitConstants(const convolution
     jit.AddConstant(MakeJitConstant("PH_R", "PADDING_SIZE_Y"));
     jit.AddConstant(MakeJitConstant("PW_R", "PADDING_SIZE_X"));
 
-    jit.AddConstant(MakeJitConstant("IC_FULL", params.inputs[0].Feature().LogicalDimPadded()));
+    if (is_1stconv || params.groups > 1) {
+        jit.AddConstant(MakeJitConstant("IC_FULL", params.inputs[0].Feature().LogicalDimPadded()));
+        jit.AddConstant(MakeJitConstant("OC_FULL", params.output.Feature().LogicalDimPadded()));
+    } else {
+        jit.AddConstant(MakeJitConstant("IC_FULL", Align(params.inputs[0].Feature().LogicalDimPadded(), 16)));
+        jit.AddConstant(MakeJitConstant("OC_FULL", Align(params.output.Feature().LogicalDimPadded(), 16)));
+    }
+
     jit.AddConstant(MakeJitConstant("ID_FULL", params.inputs[0].Z().LogicalDimPadded()));
     jit.AddConstant(MakeJitConstant("IH_FULL", params.inputs[0].Y().LogicalDimPadded()));
     jit.AddConstant(MakeJitConstant("IW_FULL", params.inputs[0].X().LogicalDimPadded()));
-
-    jit.AddConstant(MakeJitConstant("OC_FULL", params.output.Feature().LogicalDimPadded()));
     jit.AddConstant(MakeJitConstant("OD_FULL", params.output.Z().LogicalDimPadded()));
     jit.AddConstant(MakeJitConstant("OH_FULL", params.output.Y().LogicalDimPadded()));
     jit.AddConstant(MakeJitConstant("OW_FULL", params.output.X().LogicalDimPadded()));
 
+    if (params.output.Feature().v % feature_block_size != 0) {
+        jit.AddConstant(MakeJitConstant("OUTPUT_LEFTOVERS", 1));
+        jit.AddConstant(MakeJitConstant("OC_NOTALLIGNED", output.Feature().v));
+    }
+
     return jit;
 }
 
index fc3ddc2..19fa02c 100644 (file)
@@ -1,5 +1,5 @@
 //
-// Copyright (c) 2019 Intel Corporation
+// Copyright (c) 2019-2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -37,7 +37,7 @@ public:
 protected:
     WeightsLayout GetPreferredWeightsLayout(const convolution_params& params) const override {
         bool is_3d_case = params.inputs[0].GetLayout() != DataLayout::bs_fs_yx_bsv16_fsv16;
-        if (params.inputs[0].Feature().v == 3) {
+        if (params.inputs[0].Feature().v == 3 && params.inputs[0].GetLayout() == DataLayout::bfzyx) {
             return WeightsLayout::os_zyxi_osv16;
         } else if (use_data_type == Datatype::F32 && params.inputs[0].Batch().v % 16 == 0) {
             if (is_3d_case)
index 0fd1fb9..f3c3df0 100644 (file)
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2019 Intel Corporation
+* Copyright 2019-2020 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -576,6 +576,7 @@ KERNEL(gen9_common_conv_fwd_f16_kernel)(
     const int oc = get_group_id(0);
 #endif
 #endif
+    const int local_id = get_local_id(0);
     const int sp = get_group_id(1);
     int mb = get_group_id(2) * MB_BLOCK * 2;
 
@@ -594,7 +595,6 @@ KERNEL(gen9_common_conv_fwd_f16_kernel)(
     const int ow = ohw % OW;
 
 #if WITH_BIAS
-    const int local_id = get_local_id(0);
     half8 C00 = bias[oc * OC_BLOCK + local_id + g * OC];
     half8 C01 = C00, C02 = C00, C03 = C00;
 #if USE_32OC_UNROLL
@@ -767,21 +767,37 @@ KERNEL(gen9_common_conv_fwd_f16_kernel)(
 #endif
 #endif
 
+#if OUTPUT_LEFTOVERS
+    if ((oc+1)*OC_BLOCK >= OC_NOTALLIGNED) {
+        for (int i = 0; i < 8; i++) {
+
 #if HAS_FUSED_OPS
-    { FUSED_OPS_VEC0; C00 = FUSED_OPS_RESULT_VEC0; }
-    { FUSED_OPS_VEC1; C01 = FUSED_OPS_RESULT_VEC1; }
+            { FUSED_OPS_SCALAR0; C00[i] = FUSED_OPS_RESULT_SCALAR0; }
+            { FUSED_OPS_SCALAR1; C00[i] = FUSED_OPS_RESULT_SCALAR1; }
 #endif
-
-    intel_sub_group_block_write_us8(
+            if (oc * OC_BLOCK + local_id < OC_NOTALLIGNED) {
+                dst_write0[i * OC_BLOCK + local_id] = C00[i];
+                dst_write0[8 * OC_BLOCK + i * OC_BLOCK + local_id] = C01[i];
+            }
+        }
+    } else
+#endif // OUTPUT_LEFTOVERS
+    {
+#if HAS_FUSED_OPS
+        { FUSED_OPS_VEC0; C00 = FUSED_OPS_RESULT_VEC0; }
+        { FUSED_OPS_VEC1; C01 = FUSED_OPS_RESULT_VEC1; }
+#endif
+        intel_sub_group_block_write_us8(
             (__global ushort *)dst_write0, as_ushort8(C00));
-    intel_sub_group_block_write_us8(
+        intel_sub_group_block_write_us8(
             (__global ushort *)&dst_write0[8 * OC_BLOCK], as_ushort8(C01));
 #if USE_32OC_UNROLL
-    intel_sub_group_block_write_us8(
+        intel_sub_group_block_write_us8(
             (__global ushort *)&dst_write1[0], as_ushort8(C10));
-    intel_sub_group_block_write_us8(
+        intel_sub_group_block_write_us8(
             (__global ushort *)&dst_write1[8 * OC_BLOCK], as_ushort8(C11));
 #endif
+    }
 
 #if WITH_SUM == 1
     half8 blockS02 = as_half8(
@@ -823,27 +839,45 @@ KERNEL(gen9_common_conv_fwd_f16_kernel)(
 #endif
 #endif
 
-#if HAS_FUSED_OPS
-    { FUSED_OPS_VEC2; C02 = FUSED_OPS_RESULT_VEC2; }
-    { FUSED_OPS_VEC3; C03 = FUSED_OPS_RESULT_VEC3; }
-#endif
+#if OUTPUT_LEFTOVERS
+    if ((oc+1)*OC_BLOCK >= OC_NOTALLIGNED) {
+        for (int i = 0; i < 8; i++) {
 
-    intel_sub_group_block_write_us8(
-            (__global ushort *)&dst_write0[MB_BLOCK * OC_FULL * ODHW_SIZE],
-            as_ushort8(C02));
-    intel_sub_group_block_write_us8(
-            (__global ushort *)&dst_write0[MB_BLOCK * OC_FULL * ODHW_SIZE
-                    + 8 * OC_BLOCK],
-            as_ushort8(C03));
+#if HAS_FUSED_OPS
+            { FUSED_OPS_SCALAR0; C02[i] = FUSED_OPS_RESULT_SCALAR0; }
+            { FUSED_OPS_SCALAR1; C03[i] = FUSED_OPS_RESULT_SCALAR1; }
+#endif
+            if (oc * OC_BLOCK + local_id < OC_NOTALLIGNED) {
+                dst_write0[MB_BLOCK * OC_FULL * ODHW_SIZE
+                        + i * OC_BLOCK + local_id] = C02[i];
+                dst_write0[MB_BLOCK * OC_FULL * ODHW_SIZE + 8 * OC_BLOCK
+                        + i * OC_BLOCK + local_id] = C03[i];
+            }
+        }
+    } else
+#endif // OUTPUT_LEFTOVERS
+    {
+#if HAS_FUSED_OPS
+        { FUSED_OPS_VEC2; C02 = FUSED_OPS_RESULT_VEC2; }
+        { FUSED_OPS_VEC3; C03 = FUSED_OPS_RESULT_VEC3; }
+#endif
+        intel_sub_group_block_write_us8(
+                (__global ushort *)&dst_write0[MB_BLOCK * OC_FULL * ODHW_SIZE],
+                as_ushort8(C02));
+        intel_sub_group_block_write_us8(
+                (__global ushort *)&dst_write0[MB_BLOCK * OC_FULL * ODHW_SIZE
+                        + 8 * OC_BLOCK],
+                as_ushort8(C03));
 #if USE_32OC_UNROLL
-    intel_sub_group_block_write_us8(
-            (__global ushort *)&dst_write1[MB_BLOCK * OC_FULL * ODHW_SIZE],
-            as_ushort8(C12));
-    intel_sub_group_block_write_us8(
-            (__global ushort *)&dst_write1[MB_BLOCK * OC_FULL * ODHW_SIZE
-                    + 8 * OC_BLOCK],
-            as_ushort8(C13));
+        intel_sub_group_block_write_us8(
+                (__global ushort *)&dst_write1[MB_BLOCK * OC_FULL * ODHW_SIZE],
+                as_ushort8(C12));
+        intel_sub_group_block_write_us8(
+                (__global ushort *)&dst_write1[MB_BLOCK * OC_FULL * ODHW_SIZE
+                        + 8 * OC_BLOCK],
+                as_ushort8(C13));
 #endif
+    }
 #endif
 
 #if VER_8OW16C == 1 && (IC % 16 == 0 || (IC == 8 && G != 1))
@@ -1229,9 +1263,18 @@ KERNEL(gen9_common_conv_fwd_f16_kernel)(
             if (local_id < 8)
                 dst_write0[i * OC_BLOCK + local_id] = blockC00[i];
 #else
-            intel_sub_group_block_write_us(
-                    (__global ushort *)(&dst_write0[i * OC_BLOCK]),
-                    as_ushort(blockC00[i]));
+
+#if OUTPUT_LEFTOVERS
+            if ((oc+1)*OC_BLOCK >= OC_NOTALLIGNED) {
+                if (oc * OC_BLOCK + local_id < OC_NOTALLIGNED)
+                    dst_write0[i * OC_BLOCK + local_id] = blockC00[i];
+            } else
+#endif
+            {
+                intel_sub_group_block_write_us(
+                        (__global ushort *)(&dst_write0[i * OC_BLOCK]),
+                        as_ushort(blockC00[i]));
+            }
 #endif
         }
     } else {
@@ -1249,9 +1292,18 @@ KERNEL(gen9_common_conv_fwd_f16_kernel)(
             if (local_id < 8)
                 dst_write0[i * OC_BLOCK + local_id] = blockC00[i];
 #else
-            intel_sub_group_block_write_us(
-                    (__global ushort *)(&dst_write0[i * OC_BLOCK]),
-                    as_ushort(blockC00[i]));
+
+#if OUTPUT_LEFTOVERS
+            if ((oc+1)*OC_BLOCK >= OC_NOTALLIGNED) {
+                if (oc * OC_BLOCK + local_id < OC_NOTALLIGNED)
+                    dst_write0[i * OC_BLOCK + local_id] = blockC00[i];
+            } else
+#endif
+            {
+                intel_sub_group_block_write_us(
+                        (__global ushort *)(&dst_write0[i * OC_BLOCK]),
+                        as_ushort(blockC00[i]));
+            }
 #endif //  OC == 8 && G != 1
         }
 #else
@@ -1266,12 +1318,26 @@ KERNEL(gen9_common_conv_fwd_f16_kernel)(
             dst_write0[i * OC_BLOCK + local_id] = blockC00[i];
     }
 #else
+#if OUTPUT_LEFTOVERS
+    if ((oc+1)*OC_BLOCK >= OC_NOTALLIGNED) {
+        for (int i = 0; i < 8; i++) {
+
+#if HAS_FUSED_OPS
+            { FUSED_OPS_SCALAR0; blockC00[i] = FUSED_OPS_RESULT_SCALAR0; }
+#endif
+            if (oc * OC_BLOCK + local_id < OC_NOTALLIGNED)
+                dst_write0[i * OC_BLOCK + local_id] = blockC00[i];
+        }
+    } else
+#endif
+    {
 #if HAS_FUSED_OPS
     { FUSED_OPS_VEC0; blockC00 = FUSED_OPS_RESULT_VEC0; }
 #endif
 
-    intel_sub_group_block_write_us8(
-            (__global ushort *)(&dst_write0[0]), as_ushort8(blockC00));
+        intel_sub_group_block_write_us8(
+                (__global ushort *)(&dst_write0[0]), as_ushort8(blockC00));
+    }
 #endif //  OC == 8 && G != 1
 #if OW_BLOCK == 16
 
@@ -1285,6 +1351,20 @@ KERNEL(gen9_common_conv_fwd_f16_kernel)(
             dst_write0[(i + 8) * OC_BLOCK + local_id] = blockC01[i];
     }
 #else
+
+#if OUTPUT_LEFTOVERS
+    if ((oc+1)*OC_BLOCK >= OC_NOTALLIGNED) {
+        for (int i = 0; i < 8; i++) {
+
+#if HAS_FUSED_OPS
+            { FUSED_OPS_SCALAR1; blockC01[i] = FUSED_OPS_RESULT_SCALAR1; }
+#endif
+            if (oc * OC_BLOCK + local_id < OC_NOTALLIGNED)
+                dst_write0[(i + 8) * OC_BLOCK + local_id] = blockC01[i];
+        }
+    } else
+#endif
+    {
 #if HAS_FUSED_OPS
     { FUSED_OPS_VEC1; blockC01 = FUSED_OPS_RESULT_VEC1; }
 #endif
@@ -1292,6 +1372,7 @@ KERNEL(gen9_common_conv_fwd_f16_kernel)(
     intel_sub_group_block_write_us8(
             (__global ushort *)(&dst_write0[8 * OC_BLOCK]),
             as_ushort8(blockC01));
+    }
 #endif //  OC == 8 && G != 1
 #endif
 #endif
index b3f717d..f081d84 100644 (file)
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2019 Intel Corporation
+* Copyright 2019-2020 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -249,17 +249,35 @@ const float sum_scale = 1;
     DO_ELTWISE(blockC01, 8, eltwise_alpha, eltwise_beta);
 #endif
 
+#if OUTPUT_LEFTOVERS
+    if ((oc+1)*OC_BLOCK >= OC_NOTALLIGNED) {
+        for (int i = 0; i < 8; i++) {
+
 #if HAS_FUSED_OPS
-    { FUSED_OPS_VEC0; blockC00 = FUSED_OPS_RESULT_VEC0; }
-    { FUSED_OPS_VEC1; blockC01 = FUSED_OPS_RESULT_VEC1; }
+            { FUSED_OPS_SCALAR0; blockC00[i] = FUSED_OPS_RESULT_SCALAR0; }
+            { FUSED_OPS_SCALAR1; blockC01[i] = FUSED_OPS_RESULT_SCALAR1; }
 #endif
+            if (oc * OC_BLOCK + local_id < OC_NOTALLIGNED) {
+                dst_write0[i * OC_BLOCK + local_id] = blockC00[i];
+                dst_write0[8 * OC_BLOCK + i * OC_BLOCK + local_id] = blockC01[i];
+            }
+        }
+    } else
 
-    intel_sub_group_block_write8(
-            (__global unsigned int *)(&dst_write0[0]), as_uint8(blockC00));
-    intel_sub_group_block_write8(
-            (__global unsigned int *)(&dst_write0[8 * OC_BLOCK]),
-            as_uint8(blockC01));
+#endif // OUTPUT_LEFTOVERS
+    {
+
+#if HAS_FUSED_OPS
+    { FUSED_OPS_VEC0; blockC00 = FUSED_OPS_RESULT_VEC0; }
+    { FUSED_OPS_VEC1; blockC01 = FUSED_OPS_RESULT_VEC1; }
 #endif
+     intel_sub_group_block_write8(
+             (__global unsigned int *)(&dst_write0[0]), as_uint8(blockC00));
+     intel_sub_group_block_write8(
+             (__global unsigned int *)(&dst_write0[8 * OC_BLOCK]),
+             as_uint8(blockC01));
+    }
+#endif // ver_16mb16c
 
 #ifdef VER_8OW16C
 #if IC == 3
@@ -507,7 +525,7 @@ const float sum_scale = 1;
     }
 #endif
 
-#else
+#else // IC == 3
     const int sp = get_group_id(1);
     const int local_id = get_local_id(0);
 #if GROUPED
@@ -874,9 +892,18 @@ const float sum_scale = 1;
             if (local_id < 8)
                 dst_write0[i * OC_BLOCK + local_id] = blockC00[i];
 #else
-            intel_sub_group_block_write(
-                    (__global unsigned int *)(&dst_write0[i * OC_BLOCK]),
-                    as_uint(blockC00[i]));
+
+#if OUTPUT_LEFTOVERS
+            if ((oc+1)*OC_BLOCK >= OC_NOTALLIGNED) {
+                if (oc * OC_BLOCK + local_id < OC_NOTALLIGNED)
+                    dst_write0[i * OC_BLOCK + local_id] = blockC00[i];
+            } else
+#endif
+            {
+                intel_sub_group_block_write(
+                        (__global unsigned int *)(&dst_write0[i * OC_BLOCK]),
+                        as_uint(blockC00[i]));
+            }
 #endif
         }
     } else {
@@ -893,9 +920,18 @@ const float sum_scale = 1;
             if (local_id < 8)
                 dst_write0[i * OC_BLOCK + local_id] = blockC00[i];
 #else
-            intel_sub_group_block_write(
-                    (__global unsigned int *)(&dst_write0[i * OC_BLOCK]),
-                    as_uint(blockC00[i]));
+
+#if OUTPUT_LEFTOVERS
+            if ((oc+1)*OC_BLOCK >= OC_NOTALLIGNED) {
+                if (oc * OC_BLOCK + local_id < OC_NOTALLIGNED)
+                    dst_write0[i * OC_BLOCK + local_id] = blockC00[i];
+            } else
+#endif
+            {
+                intel_sub_group_block_write(
+                        (__global unsigned int *)(&dst_write0[i * OC_BLOCK]),
+                        as_uint(blockC00[i]));
+            }
 #endif //  OC == 8 && G != 1
         }
 #else
@@ -910,12 +946,28 @@ const float sum_scale = 1;
             dst_write0[i * OC_BLOCK + local_id] = blockC00[i];
     }
 #else
+
+#if OUTPUT_LEFTOVERS
+    if ((oc+1)*OC_BLOCK >= OC_NOTALLIGNED) {
+        for (int i = 0; i < 8; i++) {
+
 #if HAS_FUSED_OPS
-    { FUSED_OPS_VEC0; blockC00 = FUSED_OPS_RESULT_VEC0; }
+            { FUSED_OPS_SCALAR0; blockC00[i] = FUSED_OPS_RESULT_SCALAR0; }
+#endif
+            if (oc * OC_BLOCK + local_id < OC_NOTALLIGNED)
+                dst_write0[i * OC_BLOCK + local_id] = blockC00[i];
+        }
+    } else
 #endif
+    {
 
+#if HAS_FUSED_OPS
+    { FUSED_OPS_VEC0; blockC00 = FUSED_OPS_RESULT_VEC0; }
+#endif
     intel_sub_group_block_write8(
             (__global unsigned int *)(&dst_write0[0]), as_uint8(blockC00));
+
+    }
 #endif //  OC == 8 && G != 1
 #if OW_BLOCK == 16
 
@@ -929,13 +981,29 @@ const float sum_scale = 1;
             dst_write0[(i + 8) * OC_BLOCK + local_id] = blockC01[i];
     }
 #else
+#if OUTPUT_LEFTOVERS
+    if ((oc+1)*OC_BLOCK >= OC_NOTALLIGNED) {
+        for (int i = 0; i < 8; i++) {
+
 #if HAS_FUSED_OPS
-    { FUSED_OPS_VEC1; blockC01 = FUSED_OPS_RESULT_VEC1; }
+            { FUSED_OPS_SCALAR1; blockC01[i] = FUSED_OPS_RESULT_SCALAR1; }
 #endif
+            if (oc * OC_BLOCK + local_id < OC_NOTALLIGNED)
+                dst_write0[(i + 8) * OC_BLOCK + local_id] = blockC01[i];
+        }
+    } else
+#endif
+    {
 
-    intel_sub_group_block_write8(
-            (__global unsigned int *)(&dst_write0[8 * OC_BLOCK]),
-            as_uint8(blockC01));
+#if HAS_FUSED_OPS
+        { FUSED_OPS_VEC1; blockC01 = FUSED_OPS_RESULT_VEC1; }
+#endif
+
+        intel_sub_group_block_write8(
+                (__global unsigned int *)(&dst_write0[8 * OC_BLOCK]),
+                as_uint8(blockC01));
+
+    }
 #endif //  OC == 8 && G != 1
 #endif
 #endif
@@ -943,7 +1011,7 @@ const float sum_scale = 1;
     }
 #endif
 
-#endif
+#endif // IC == 3
 #endif
     return;
 }
index bac02a2..c6725d2 100644 (file)
@@ -451,16 +451,6 @@ bool layout_optimizer::convolution_b_fs_zyx_fsv16_opt(layout const &input_layout
                                                       const layout &weights_layout,
                                                       std::shared_ptr<const convolution> conv) {
     // A set of rules that define when b_fs_zyx_fsv16 mem format can be used
-    if ((input_layout.format == format::bfzyx ||
-         input_layout.format == format::b_fs_zyx_fsv16 ||
-         input_layout.format == format::bs_fs_zyx_bsv16_fsv16) &&
-        (input_layout.data_type == data_types::f32 || input_layout.data_type == data_types::f16) &&
-        ((input_layout.size.feature[0] / conv->split()) % 16 == 0 || input_layout.size.feature[0] == 3) &&
-        weights_layout.data_type == input_layout.data_type &&
-        (weights_layout.size.batch[0] % 16 == 0 || (weights_layout.size.batch[0] == 8 && conv->groups > 1)) &&
-        conv->dilation == tensor(1))
-        return true;
-
     size_t in_features_per_group = input_layout.size.feature[0] / conv->groups;
     size_t out_features_per_group = weights_layout.size.batch[0] / conv->groups;
     if (weights_layout.format.group_num() > 0) {
@@ -474,9 +464,17 @@ bool layout_optimizer::convolution_b_fs_zyx_fsv16_opt(layout const &input_layout
         (weights_layout.data_type == data_types::i8 || weights_layout.data_type == data_types::u8) &&
         (!((conv->groups > 1) && (in_features_per_group == 1) && (out_features_per_group == 1))))
         return true;
-    return false;
-}
 
+    bool format_ver = (input_layout.format == format::bfzyx || input_layout.format == format::b_fs_zyx_fsv16 ||
+                      input_layout.format == format::bs_fs_zyx_bsv16_fsv16);
+    bool data_type_ver = input_layout.data_type == data_types::f16 || input_layout.data_type == data_types::f32;
+    bool w_layout = weights_layout.data_type == input_layout.data_type;
+    bool single_dilation = conv->dilation == tensor(1);
+    bool groups_ver = conv->groups == 1 || weights_layout.size.batch[0] % 16 == 0
+        || (conv->groups > 1 && weights_layout.size.batch[0] == 8);
+
+    return format_ver && data_type_ver && w_layout && single_dilation && groups_ver;
+}
 bool layout_optimizer::convolution_bs_fs_yx_bsv16_fsv16_opt(const layout &input_layout,
                                                             const layout& weights_layout,
                                                             std::shared_ptr<const convolution> conv) {
index f245b05..ac86072 100644 (file)
@@ -4870,12 +4870,13 @@ using TestParamType_convolution_gpu = ::testing::tuple<int,   // 0 - Filter size
                                                        bool>; // 4 - With bias
 
 using TestParamType_convolution_gpu_block_layout = ::testing::tuple<int,   // 0 -Batch size
-        int,   // 1 - Input features
-        int,   // 2 - Output features
-        int,   // 3 - Filter size
-        int,   // 4 - Stride
-        int,   // 5 - Output padding
-        bool>; // 6 - With bias
+        int,  // 1 - Input features
+        int,  // 2 - Output features
+        int,  // 3 - Filter size
+        int,  // 4 - Stride
+        int,  // 5 - Output padding
+        bool, // 6 - With bias
+        int>; // 7 - Input X/Y size
 
 
 using TestParamType_convolution_depthwise_gpu = ::testing::tuple<int,   // 0 - Input XY size
@@ -5688,47 +5689,120 @@ TEST(convolution_gpu, bfyx_iyxo_5x5_fp16)
 
 }
 
-INSTANTIATE_TEST_CASE_P(convolution_gpu_block,
-                        convolution_gpu_block_layout,
+template<typename T>
+void blockedFormatZeroCheck(cldnn::memory out_mem) {
+    auto out_ptr = out_mem.pointer<T>();
+
+    bool batch_blocked = false;
+    if (out_mem.get_layout().format == format::bs_fs_zyx_bsv16_fsv16 ||
+        out_mem.get_layout().format == format::bs_fs_yx_bsv16_fsv16)
+        batch_blocked = true;
+    const int block_size = 16;
+
+    auto output_tensor = out_mem.get_layout().get_buffer_size();
+    const int b = output_tensor.batch[0];
+    const int f = output_tensor.feature[0];
+    const int spatials = std::accumulate(output_tensor.spatial.begin(), output_tensor.spatial.end(), 1, std::multiplies<int>());
+    const int f_mod = output_tensor.feature[0] % block_size;
+    const size_t batch_skip = batch_blocked ? b / block_size : b;
+    const size_t number_of_zeroes = f_mod == 0 ? 0 : (block_size - f_mod) * spatials * b;
+
+    size_t to_skip = (output_tensor.feature[0] / block_size) * block_size * spatials;
+    to_skip *= batch_blocked ? block_size : 1;
+    size_t zero_ind = to_skip + f_mod;
+
+    size_t i = 0;
+    while (i < number_of_zeroes) {
+        size_t f_tmp = f_mod;
+        while (f_tmp % 16 != 0) {
+            auto equal = are_equal(out_ptr[zero_ind], 0, 1e-2f);
+            EXPECT_TRUE(equal);
+            if (!equal) {
+                std::cout << "Should be zero idx: " << zero_ind << std::endl;
+                return;
+            }
+            f_tmp++;
+            zero_ind++;
+            i++;
+        }
+        // skip on new batch
+        if (i % (number_of_zeroes / batch_skip) == 0)
+            zero_ind += to_skip;
+        if (zero_ind >= (size_t)b*f*spatials)
+            return;
+
+        zero_ind += f_mod;
+    }
+}
+struct convolution_gpu_block_layout3D : public convolution_gpu_block_layout {};
+INSTANTIATE_TEST_CASE_P(convolution_gpu_block3D,
+                        convolution_gpu_block_layout3D,
                         ::testing::Values(
-                                TestParamType_convolution_gpu_block_layout(16, 64, 64, 1, 1, 0, false),
-                                TestParamType_convolution_gpu_block_layout(16, 16, 16, 3, 1, 0, false),
-                                TestParamType_convolution_gpu_block_layout(32, 16, 16, 3, 1, 0, false),
-                                TestParamType_convolution_gpu_block_layout(16, 32, 16, 3, 1, 0, false),
-                                TestParamType_convolution_gpu_block_layout(16, 16, 32, 3, 1, 0, false),
-                                TestParamType_convolution_gpu_block_layout(32, 16, 32, 3, 1, 0, false),
-                                TestParamType_convolution_gpu_block_layout(16, 32, 32, 3, 1, 0, false),
-                                TestParamType_convolution_gpu_block_layout(16, 64, 16, 3, 1, 0, false),
-                                TestParamType_convolution_gpu_block_layout(32, 32, 16, 3, 1, 0, false),
-                                TestParamType_convolution_gpu_block_layout(32, 32, 32, 3, 1, 0, false),
-                                TestParamType_convolution_gpu_block_layout(32, 32, 32, 1, 1, 0, false),
-                                TestParamType_convolution_gpu_block_layout(32, 64, 64, 1, 1, 0, false),
-                                TestParamType_convolution_gpu_block_layout(16, 64, 64, 1, 1, 0, true),
-                                TestParamType_convolution_gpu_block_layout(16, 16, 16, 3, 1, 0, true),
-                                TestParamType_convolution_gpu_block_layout(32, 16, 16, 3, 1, 0, true),
-                                TestParamType_convolution_gpu_block_layout(16, 32, 16, 3, 1, 0, true),
-                                TestParamType_convolution_gpu_block_layout(16, 16, 32, 3, 1, 0, true),
-                                TestParamType_convolution_gpu_block_layout(32, 16, 32, 3, 1, 0, true),
-                                TestParamType_convolution_gpu_block_layout(16, 32, 32, 3, 1, 0, true),
-                                TestParamType_convolution_gpu_block_layout(16, 64, 16, 3, 1, 0, true),
-                                TestParamType_convolution_gpu_block_layout(32, 32, 16, 3, 1, 0, true),
-                                TestParamType_convolution_gpu_block_layout(32, 32, 32, 3, 1, 0, true),
-                                TestParamType_convolution_gpu_block_layout(64, 64, 64, 3, 1, 0, true)),
+                                TestParamType_convolution_gpu_block_layout(1, 3, 10, 1, 1, 0, false, 16),
+                                TestParamType_convolution_gpu_block_layout(4, 4, 17, 3, 1, 0, false, 16),
+                                TestParamType_convolution_gpu_block_layout(15, 17, 15, 3, 1, 0, false, 4),
+                                TestParamType_convolution_gpu_block_layout(1, 17, 16, 1, 1, 0, false, 4),
+                                TestParamType_convolution_gpu_block_layout(1, 17, 20, 3, 1, 0, false, 9),
+                                TestParamType_convolution_gpu_block_layout(4, 15, 17, 1, 1, 0, false, 5),
+                                TestParamType_convolution_gpu_block_layout(1, 2, 15, 3, 1, 0, true, 16),
+                                TestParamType_convolution_gpu_block_layout(17, 2, 16, 3, 1, 0, true, 32),
+                                TestParamType_convolution_gpu_block_layout(30, 2, 5, 1, 1, 0, true, 8),
+                                TestParamType_convolution_gpu_block_layout(2, 1, 7, 3, 1, 0, true, 8),
+                                TestParamType_convolution_gpu_block_layout(5, 16, 1, 1, 1, 0, true, 5),
+
+                                TestParamType_convolution_gpu_block_layout(32, 4, 15, 1, 1, 0, false, 5),
+                                TestParamType_convolution_gpu_block_layout(32, 2, 16, 3, 1, 0, false, 8),
+                                TestParamType_convolution_gpu_block_layout(32, 4, 17, 3, 1, 0, false, 1),
+                                TestParamType_convolution_gpu_block_layout(32, 17, 15, 1, 1, 0, false, 16),
+                                TestParamType_convolution_gpu_block_layout(32, 15, 15, 1, 1, 0, true, 32),
+                                TestParamType_convolution_gpu_block_layout(32, 2, 1, 3, 3, 0, true, 10),
+                                TestParamType_convolution_gpu_block_layout(32, 17, 1, 1, 1, 0, true, 8),
+
+                                TestParamType_convolution_gpu_block_layout(16, 1, 17, 1, 1, 0, true, 10),
+                                TestParamType_convolution_gpu_block_layout(16, 3, 15, 1, 1, 0, false, 8),
+                                TestParamType_convolution_gpu_block_layout(16, 1, 17, 3, 1, 0, false, 1),
+                                TestParamType_convolution_gpu_block_layout(16, 15, 17, 1, 1, 0, false, 3),
+                                TestParamType_convolution_gpu_block_layout(16, 16, 17, 3, 1, 0, false, 16),
+                                TestParamType_convolution_gpu_block_layout(16, 17, 15, 3, 1, 0, false, 32),
+                                TestParamType_convolution_gpu_block_layout(16, 17, 16, 1, 1, 0, true, 17),
+                                TestParamType_convolution_gpu_block_layout(16, 3, 7, 3, 1, 0, true, 8),
+                                TestParamType_convolution_gpu_block_layout(16, 5, 10, 1, 1, 0, true, 8),
+                                TestParamType_convolution_gpu_block_layout(16, 17, 6, 3, 1, 0, true, 15),
+                                TestParamType_convolution_gpu_block_layout(16, 16, 1, 1, 1, 0, true, 2),
+
+                                TestParamType_convolution_gpu_block_layout(16, 32, 16, 3, 1, 0, false, 5),
+                                TestParamType_convolution_gpu_block_layout(16, 16, 32, 3, 1, 0, false, 3),
+                                TestParamType_convolution_gpu_block_layout(32, 16, 32, 3, 1, 0, false, 9),
+                                TestParamType_convolution_gpu_block_layout(16, 32, 32, 3, 1, 0, false, 8),
+                                TestParamType_convolution_gpu_block_layout(16, 64, 16, 3, 1, 0, false, 5),
+                                TestParamType_convolution_gpu_block_layout(32, 32, 16, 3, 1, 0, false, 5),
+                                TestParamType_convolution_gpu_block_layout(32, 32, 32, 3, 1, 0, false, 3),
+                                TestParamType_convolution_gpu_block_layout(32, 32, 32, 1, 1, 0, false, 2),
+                                TestParamType_convolution_gpu_block_layout(32, 16, 16, 3, 1, 0, true, 16),
+                                TestParamType_convolution_gpu_block_layout(16, 32, 16, 3, 1, 0, true, 5),
+                                TestParamType_convolution_gpu_block_layout(32, 16, 32, 3, 1, 0, true, 5),
+                                TestParamType_convolution_gpu_block_layout(16, 64, 16, 3, 1, 0, true, 3),
+                                TestParamType_convolution_gpu_block_layout(32, 32, 16, 3, 1, 0, true, 2),
+                                TestParamType_convolution_gpu_block_layout(32, 32, 32, 3, 1, 0, true, 2),
+                                TestParamType_convolution_gpu_block_layout(64, 64, 64, 3, 1, 0, true, 2)),
                         convolution_gpu_block_layout::PrintToStringParamName);
 
-TEST_P(convolution_gpu_block_layout, bfzyx_bsv16_fsv16_fp32)
+TEST_P(convolution_gpu_block_layout3D, bfzyx_bsv16_fsv16_fp32)
 {
     const auto& engine = get_test_engine();
 
     const int batch_num = testing::get<0>(GetParam());
-    const int input_xy = 5;
     const int input_f = testing::get<1>(GetParam());
     const int output_f = testing::get<2>(GetParam());
     const int filter_xy = testing::get<3>(GetParam());
     const int stride = testing::get<4>(GetParam());
     const int output_padding = testing::get<5>(GetParam());
     const bool with_bias = testing::get<6>(GetParam());
+    const int input_xy = testing::get<7>(GetParam());
     const int input_offset = -(filter_xy / 2);
+    format input_format = format::b_fs_zyx_fsv16;
+    if (batch_num % 16 == 0)
+        input_format = format::bs_fs_zyx_bsv16_fsv16;
 
     auto input_size = tensor(batch_num, input_f, input_xy, input_xy, 1);
     auto input_data = generate_random_4d<float>(batch_num, input_f, input_xy, input_xy, 1, 10);
@@ -5751,8 +5825,8 @@ TEST_P(convolution_gpu_block_layout, bfzyx_bsv16_fsv16_fp32)
             input_layout("input", input_mem.get_layout()),
             data("weights", weights_mem));
 
-    // Reorder input to bs_fs_yx_bsv16_fsv16
-    topology.add(reorder("input_bsv16_fsv16", "input", { data_types::f32, format::bs_fs_zyx_bsv16_fsv16, input_size }));
+    // Reorder input to correct format
+    topology.add(reorder("input_bsv16_fsv16", "input", { data_types::f32, input_format, input_size }));
 
     if (with_bias)
     {
@@ -5825,7 +5899,9 @@ TEST_P(convolution_gpu_block_layout, bfzyx_bsv16_fsv16_fp32)
     auto out_mem_bfyx = network.get_output("reorder_bfzyx").get_memory();
     auto out_ptr_bfyx = out_mem_bfyx.pointer<float>();
 
-    ASSERT_EQ(out_mem.get_layout().format, format::bs_fs_zyx_bsv16_fsv16);
+    blockedFormatZeroCheck<float>(out_mem);
+
+    ASSERT_EQ(out_mem.get_layout().format, input_format);
 
     auto flatten_ref = flatten_4d(format::bfyx, reference_result);
 
@@ -5838,9 +5914,10 @@ TEST_P(convolution_gpu_block_layout, bfzyx_bsv16_fsv16_fp32)
             return;
         }
     }
+
 }
 
-TEST_P(convolution_gpu_block_layout, bfzyx_bsv16_fsv16_fp16)
+TEST_P(convolution_gpu_block_layout3D, bfzyx_bsv16_fsv16_fp16)
 {
     const auto& engine = get_test_engine();
 
@@ -5852,21 +5929,17 @@ TEST_P(convolution_gpu_block_layout, bfzyx_bsv16_fsv16_fp16)
     }
 
     const int batch_num = testing::get<0>(GetParam());
-    const int input_xy = 5;
     const int input_f = testing::get<1>(GetParam());
     const int output_f = testing::get<2>(GetParam());
     const int filter_xy = testing::get<3>(GetParam());
     const int stride = testing::get<4>(GetParam());
     const int output_padding = testing::get<5>(GetParam());
     const bool with_bias = testing::get<6>(GetParam());
+    const int input_xy = testing::get<7>(GetParam());
     const int input_offset = -(filter_xy / 2);
-
-    if (batch_num % 32 != 0)
-    {
-        std::cout << "[ SKIPPED ] The test is skipped (for fp16 batch should be multiple of 32)." << std::endl;
-        EXPECT_EQ(1, 1);
-        return;
-    }
+    format input_format = format::b_fs_zyx_fsv16;
+    if (batch_num % 32 == 0)
+        input_format = format::bs_fs_zyx_bsv16_fsv16;
 
     auto input_size = tensor(batch_num, input_f, input_xy, input_xy, 1);
     auto input_data = generate_random_4d<FLOAT16>(batch_num, input_f, input_xy, input_xy, 0, 1);
@@ -5890,8 +5963,8 @@ TEST_P(convolution_gpu_block_layout, bfzyx_bsv16_fsv16_fp16)
             input_layout("input", input_mem.get_layout()),
             data("weights", weights_mem));
 
-    // Reorder input to bs_fs_zyx_bsv16_fsv16
-    topology.add(reorder("input_bsv16_fsv16", "input", { data_types::f16, format::bs_fs_zyx_bsv16_fsv16, input_size }));
+    // Reorder input to correct format
+    topology.add(reorder("input_bsv16_fsv16", "input", { data_types::f16, input_format, input_size }));
 
     if (with_bias)
     {
@@ -5964,7 +6037,9 @@ TEST_P(convolution_gpu_block_layout, bfzyx_bsv16_fsv16_fp16)
     auto out_mem_bfyx = network.get_output("reorder_bfzyx").get_memory();
     auto out_ptr_bfyx = out_mem_bfyx.pointer<FLOAT16>();
 
-    ASSERT_EQ(out_mem.get_layout().format, format::bs_fs_zyx_bsv16_fsv16);
+    blockedFormatZeroCheck<FLOAT16>(out_mem);
+
+    ASSERT_EQ(out_mem.get_layout().format, input_format);
 
     auto flatten_ref = flatten_4d(format::bfyx, reference_result);
 
@@ -5979,19 +6054,22 @@ TEST_P(convolution_gpu_block_layout, bfzyx_bsv16_fsv16_fp16)
     }
 }
 
-TEST_P(convolution_gpu_block_layout, bfzyx_bsv16_fsv16_fp32_fused_ops)
+TEST_P(convolution_gpu_block_layout3D, bfzyx_bsv16_fsv16_fp32_fused_ops)
 {
     const auto& engine = get_test_engine();
 
     const int batch_num = testing::get<0>(GetParam());
-    const int input_xy = 5;
     const int input_f = testing::get<1>(GetParam());
     const int output_f = testing::get<2>(GetParam());
     const int filter_xy = testing::get<3>(GetParam());
     const int stride = testing::get<4>(GetParam());
     const int output_padding = testing::get<5>(GetParam());
     const bool with_bias = testing::get<6>(GetParam());
+    const int input_xy = testing::get<7>(GetParam());
     const int input_offset = -(filter_xy / 2);
+    format input_format = format::b_fs_zyx_fsv16;
+    if (batch_num % 16 == 0)
+        input_format = format::bs_fs_zyx_bsv16_fsv16;
 
     auto input_size = tensor(batch_num, input_f, input_xy, input_xy, 1);
     auto input_data = generate_random_4d<float>(batch_num, input_f, input_xy, input_xy, 1, 10);
@@ -6014,8 +6092,8 @@ TEST_P(convolution_gpu_block_layout, bfzyx_bsv16_fsv16_fp32_fused_ops)
             input_layout("input", input_mem.get_layout()),
             data("weights", weights_mem));
 
-    // Reorder input to bs_fs_yx_bsv16_fsv16
-    topology.add(reorder("input_bsv16_fsv16", "input", { data_types::f32, format::bs_fs_zyx_bsv16_fsv16, input_size }));
+    // Reorder input to correct format
+    topology.add(reorder("input_bsv16_fsv16", "input", { data_types::f32, input_format, input_size }));
 
     if (with_bias)
     {
@@ -6095,7 +6173,9 @@ TEST_P(convolution_gpu_block_layout, bfzyx_bsv16_fsv16_fp32_fused_ops)
     auto out_mem_bfyx = network.get_output("reorder_bfzyx").get_memory();
     auto out_ptr_bfyx = out_mem_bfyx.pointer<float>();
 
-    ASSERT_EQ(out_mem.get_layout().format, format::bs_fs_zyx_bsv16_fsv16);
+    blockedFormatZeroCheck<float>(out_mem);
+
+    ASSERT_EQ(out_mem.get_layout().format, input_format);
 
     auto flatten_ref = flatten_4d(format::bfyx, reference_result);
 
@@ -6110,6 +6190,34 @@ TEST_P(convolution_gpu_block_layout, bfzyx_bsv16_fsv16_fp32_fused_ops)
     }
 }
 
+INSTANTIATE_TEST_CASE_P(convolution_gpu_block,
+                        convolution_gpu_block_layout,
+                        ::testing::Values(
+                                TestParamType_convolution_gpu_block_layout(16, 64, 64, 1, 1, 0, false, 0),
+                                TestParamType_convolution_gpu_block_layout(16, 16, 16, 3, 1, 0, false, 0),
+                                TestParamType_convolution_gpu_block_layout(32, 16, 16, 3, 1, 0, false, 0),
+                                TestParamType_convolution_gpu_block_layout(16, 32, 16, 3, 1, 0, false, 0),
+                                TestParamType_convolution_gpu_block_layout(16, 16, 32, 3, 1, 0, false, 0),
+                                TestParamType_convolution_gpu_block_layout(32, 16, 32, 3, 1, 0, false, 0),
+                                TestParamType_convolution_gpu_block_layout(16, 32, 32, 3, 1, 0, false, 0),
+                                TestParamType_convolution_gpu_block_layout(16, 64, 16, 3, 1, 0, false, 0),
+                                TestParamType_convolution_gpu_block_layout(32, 32, 16, 3, 1, 0, false, 0),
+                                TestParamType_convolution_gpu_block_layout(32, 32, 32, 3, 1, 0, false, 0),
+                                TestParamType_convolution_gpu_block_layout(32, 32, 32, 1, 1, 0, false, 0),
+                                TestParamType_convolution_gpu_block_layout(32, 64, 64, 1, 1, 0, false, 0),
+                                TestParamType_convolution_gpu_block_layout(16, 64, 64, 1, 1, 0, true, 0),
+                                TestParamType_convolution_gpu_block_layout(16, 16, 16, 3, 1, 0, true, 0),
+                                TestParamType_convolution_gpu_block_layout(32, 16, 16, 3, 1, 0, true, 0),
+                                TestParamType_convolution_gpu_block_layout(16, 32, 16, 3, 1, 0, true, 0),
+                                TestParamType_convolution_gpu_block_layout(16, 16, 32, 3, 1, 0, true, 0),
+                                TestParamType_convolution_gpu_block_layout(32, 16, 32, 3, 1, 0, true, 0),
+                                TestParamType_convolution_gpu_block_layout(16, 32, 32, 3, 1, 0, true, 0),
+                                TestParamType_convolution_gpu_block_layout(16, 64, 16, 3, 1, 0, true, 0),
+                                TestParamType_convolution_gpu_block_layout(32, 32, 16, 3, 1, 0, true, 0),
+                                TestParamType_convolution_gpu_block_layout(32, 32, 32, 3, 1, 0, true, 0),
+                                TestParamType_convolution_gpu_block_layout(64, 64, 64, 3, 1, 0, true, 0)),
+                        convolution_gpu_block_layout::PrintToStringParamName);
+
 TEST_P(convolution_gpu_block_layout, bfyx_bsv16_fsv16_fp32)
 {
     const auto& engine = get_test_engine();