}
FusedOpsConfiguration GenerateFusedOpsConfiguration_bsv16_fsv16(size_t conf_id, std::string input_name, Datatype dt,
- size_t dims) {
+ size_t dims, bool is_vector) {
+ std::string suffix = (is_vector ? "_VEC" : "_SCALAR") + std::to_string(conf_id);
+ std::string input_var_name = input_name + std::to_string(conf_id) + (is_vector ? "" : "[i]");
+ size_t vec_size = is_vector ? 8 : 1;
std::vector<std::string> idx_order;
- if (dims == 5)
- idx_order = {"(mb + " + std::to_string(conf_id * 8) + ")", "(oc*16)", "od", "oh", "ow"};
- else
- idx_order = {"(mb + " + std::to_string(conf_id * 8) + ")", "(oc*16)", "oh", "ow"};
+ if (is_vector) {
+ if (dims == 5)
+ idx_order = {"(mb + " + std::to_string(conf_id * 8) + ")", "(oc*16)", "od", "oh", "ow"};
+ else
+ idx_order = {"(mb + " + std::to_string(conf_id * 8) + ")", "(oc*16)", "oh", "ow"};
+ } else {
+ if (dims == 5)
+ idx_order = {"(mb + " + std::to_string(conf_id * 8) + ")", "(oc*16 + local_id)", "od", "oh", "(ow + i)"};
+ else
+ idx_order = {"(mb + " + std::to_string(conf_id * 8) + ")", "(oc*16 + local_id)", "oh", "(ow + i)"};
+ }
- return { "_VEC" + std::to_string(conf_id),
+ return { suffix,
idx_order,
- input_name + std::to_string(conf_id),
+ input_var_name,
dt,
- 8,
- FusedOpsConfiguration::LoadType::LT_ALIGNED_READ,
+ vec_size,
+ is_vector ? FusedOpsConfiguration::LoadType::LT_ALIGNED_READ : FusedOpsConfiguration::LoadType::LT_UNALIGNED,
FusedOpsConfiguration::BoundaryCheck::ENABLED,
FusedOpsConfiguration::IndexType::TENSOR_COORD,
Tensor::DataChannelName::BATCH };
auto b = out.Batch().v;
auto g = params.groups;
- const bool is_1stconv = input.Feature().v == 3;
+ const bool is_1stconv = input.Feature().v == 3 && input.GetLayout() == DataLayout::bfzyx;
const bool ver_16mb16c = !is_1stconv &&
((out.GetDType() == Datatype::F16 && b % 32 == 0) ||
(out.GetDType() == Datatype::F32 && b % 16 == 0));
kd.gws2 = b * f / ocb;
}
} else if (ver_16mb16c) {
- f = f / g;
+ f = (g > 1) ? f/g : Align(f, 16);
kd.lws0 = sub_group_size;
kd.lws1 = 1;
kd.lws2 = 1;
kd.cldnnStyle.blockWidth = 1;
} else {
auto oh_block = 1;
- f = (g > 1) ? Align(f / g, 16) : f;
+ f = Align(f / g, 16);
auto div = 16;
while (div > 1) {
if (output.GetDType() != use_data_type)
return false;
- if (output.Feature().v % feature_block_size != 0)
- return false;
-
if (input.GetLayout() == DataLayout::bfzyx) {
- if (input.Feature().v != 3)
+ if (input.Feature().v != 3 || output.Feature().v % feature_block_size != 0)
return false;
if (output.GetDType() == Datatype::F16 && (output.Feature().v % 32 != 0))
return false;
} else {
- if ((input.Feature().v / params.groups) % feature_block_size != 0 && (input.Feature().v / params.groups) != 8)
+ if ((params.groups > 1) && (input.Feature().v / params.groups) % feature_block_size != 0 &&
+ (input.Feature().v / params.groups) != 8)
return false;
}
auto output = params.output;
auto jit = Parent::GetJitConstants(params, runInfo);
- const bool is_1stconv = input.Feature().v == 3;
- const bool ver_16mb16c = !is_1stconv &&
- ((output.GetDType() == Datatype::F16 && output.Batch().v % 32 == 0) ||
- (output.GetDType() == Datatype::F32 && output.Batch().v % 16 == 0));
+ const bool is_1stconv = input.Feature().v == 3 && input.GetLayout() == DataLayout::bfzyx;
+ const bool ver_16mb16c = !is_1stconv && ((output.GetDType() == Datatype::F16 && output.Batch().v % 32 == 0) ||
+ (output.GetDType() == Datatype::F32 && output.Batch().v % 16 == 0));
if (ver_16mb16c) {
jit.AddConstant(MakeJitConstant("VER_16MB16C", 1));
if (ver_16mb16c && !params.fused_ops.empty()) {
const auto dims_num = DataTensor::ChannelsCount(input.GetLayout());
if (output.GetDType() != Datatype::F16) {
- FusedOpsConfiguration conf_vec0 = GenerateFusedOpsConfiguration_bsv16_fsv16(0, "blockC0", input_dt, dims_num);
- FusedOpsConfiguration conf_vec1 = GenerateFusedOpsConfiguration_bsv16_fsv16(1, "blockC0", input_dt, dims_num);
- jit.Merge(MakeFusedOpsJitConstants(params, {conf_vec0, conf_vec1}));
+ FusedOpsConfiguration conf_vec0 = GenerateFusedOpsConfiguration_bsv16_fsv16(0, "blockC0", input_dt, dims_num, true);
+ FusedOpsConfiguration conf_vec1 = GenerateFusedOpsConfiguration_bsv16_fsv16(1, "blockC0", input_dt, dims_num, true);
+ FusedOpsConfiguration conf_scalar0 = GenerateFusedOpsConfiguration_bsv16_fsv16(0, "blockC0", input_dt, dims_num, false);
+ FusedOpsConfiguration conf_scalar1 = GenerateFusedOpsConfiguration_bsv16_fsv16(1, "blockC0", input_dt, dims_num, false);
+ jit.Merge(MakeFusedOpsJitConstants(params, {conf_vec0, conf_vec1, conf_scalar0, conf_scalar1}));
} else {
- FusedOpsConfiguration conf_vec0 = GenerateFusedOpsConfiguration_bsv16_fsv16(0, "C0", input_dt, dims_num);
- FusedOpsConfiguration conf_vec1 = GenerateFusedOpsConfiguration_bsv16_fsv16(1, "C0", input_dt, dims_num);
- FusedOpsConfiguration conf_vec2 = GenerateFusedOpsConfiguration_bsv16_fsv16(2, "C0", input_dt, dims_num);
- FusedOpsConfiguration conf_vec3 = GenerateFusedOpsConfiguration_bsv16_fsv16(3, "C0", input_dt, dims_num);
- jit.Merge(MakeFusedOpsJitConstants(params, {conf_vec0, conf_vec1, conf_vec2, conf_vec3}));
+ FusedOpsConfiguration conf_vec0 = GenerateFusedOpsConfiguration_bsv16_fsv16(0, "C0", input_dt, dims_num, true);
+ FusedOpsConfiguration conf_vec1 = GenerateFusedOpsConfiguration_bsv16_fsv16(1, "C0", input_dt, dims_num, true);
+ FusedOpsConfiguration conf_vec2 = GenerateFusedOpsConfiguration_bsv16_fsv16(2, "C0", input_dt, dims_num, true);
+ FusedOpsConfiguration conf_vec3 = GenerateFusedOpsConfiguration_bsv16_fsv16(3, "C0", input_dt, dims_num, true);
+ FusedOpsConfiguration conf_scalar0 = GenerateFusedOpsConfiguration_bsv16_fsv16(0, "C0", input_dt, dims_num, false);
+ FusedOpsConfiguration conf_scalar1 = GenerateFusedOpsConfiguration_bsv16_fsv16(1, "C0", input_dt, dims_num, false);
+ FusedOpsConfiguration conf_scalar2 = GenerateFusedOpsConfiguration_bsv16_fsv16(2, "C0", input_dt, dims_num, false);
+ FusedOpsConfiguration conf_scalar3 = GenerateFusedOpsConfiguration_bsv16_fsv16(3, "C0", input_dt, dims_num, false);
+ jit.Merge(MakeFusedOpsJitConstants(params, {conf_vec0, conf_vec1, conf_vec2, conf_vec3,
+ conf_scalar0, conf_scalar1, conf_scalar2, conf_scalar3}));
}
} else if (!is_1stconv && !params.fused_ops.empty()) {
FusedOpsConfiguration conf_vec0 = GenerateFusedOpsConfiguration_f16(0, "blockC0", input_dt, true);
jit.AddConstant(MakeJitConstant("IS_DW", "DEPTHWISE_SEPARABLE_OPT"));
jit.AddConstant(MakeJitConstant("WITH_BIAS", "BIAS_TERM"));
+ if (is_1stconv || params.groups > 1) {
+ jit.AddConstant(MakeJitConstant("OC", output.Feature().v / params.groups));
+ jit.AddConstant(MakeJitConstant("IC", input.Feature().v / params.groups));
+ } else {
+ jit.AddConstant(MakeJitConstant("OC", Align(output.Feature().v, 16)));
+ jit.AddConstant(MakeJitConstant("IC", Align(input.Feature().v, 16)));
+ }
+
jit.AddConstant(MakeJitConstant("MB", "OUTPUT_BATCH_NUM"));
- jit.AddConstant(MakeJitConstant("OC", output.Feature().v / params.groups));
jit.AddConstant(MakeJitConstant("OD", "OUTPUT_SIZE_Z"));
jit.AddConstant(MakeJitConstant("OH", "OUTPUT_SIZE_Y"));
jit.AddConstant(MakeJitConstant("OW", "OUTPUT_SIZE_X"));
- jit.AddConstant(MakeJitConstant("IC", input.Feature().v / params.groups));
+
jit.AddConstant(MakeJitConstant("ID", "INPUT0_SIZE_Z"));
jit.AddConstant(MakeJitConstant("IH", "INPUT0_SIZE_Y"));
jit.AddConstant(MakeJitConstant("IW", "INPUT0_SIZE_X"));
jit.AddConstant(MakeJitConstant("PH_R", "PADDING_SIZE_Y"));
jit.AddConstant(MakeJitConstant("PW_R", "PADDING_SIZE_X"));
- jit.AddConstant(MakeJitConstant("IC_FULL", params.inputs[0].Feature().LogicalDimPadded()));
+ if (is_1stconv || params.groups > 1) {
+ jit.AddConstant(MakeJitConstant("IC_FULL", params.inputs[0].Feature().LogicalDimPadded()));
+ jit.AddConstant(MakeJitConstant("OC_FULL", params.output.Feature().LogicalDimPadded()));
+ } else {
+ jit.AddConstant(MakeJitConstant("IC_FULL", Align(params.inputs[0].Feature().LogicalDimPadded(), 16)));
+ jit.AddConstant(MakeJitConstant("OC_FULL", Align(params.output.Feature().LogicalDimPadded(), 16)));
+ }
+
jit.AddConstant(MakeJitConstant("ID_FULL", params.inputs[0].Z().LogicalDimPadded()));
jit.AddConstant(MakeJitConstant("IH_FULL", params.inputs[0].Y().LogicalDimPadded()));
jit.AddConstant(MakeJitConstant("IW_FULL", params.inputs[0].X().LogicalDimPadded()));
-
- jit.AddConstant(MakeJitConstant("OC_FULL", params.output.Feature().LogicalDimPadded()));
jit.AddConstant(MakeJitConstant("OD_FULL", params.output.Z().LogicalDimPadded()));
jit.AddConstant(MakeJitConstant("OH_FULL", params.output.Y().LogicalDimPadded()));
jit.AddConstant(MakeJitConstant("OW_FULL", params.output.X().LogicalDimPadded()));
+ if (params.output.Feature().v % feature_block_size != 0) {
+ jit.AddConstant(MakeJitConstant("OUTPUT_LEFTOVERS", 1));
+ jit.AddConstant(MakeJitConstant("OC_NOTALLIGNED", output.Feature().v));
+ }
+
return jit;
}
/*******************************************************************************
-* Copyright 2019 Intel Corporation
+* Copyright 2019-2020 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
const int oc = get_group_id(0);
#endif
#endif
+ const int local_id = get_local_id(0);
const int sp = get_group_id(1);
int mb = get_group_id(2) * MB_BLOCK * 2;
const int ow = ohw % OW;
#if WITH_BIAS
- const int local_id = get_local_id(0);
half8 C00 = bias[oc * OC_BLOCK + local_id + g * OC];
half8 C01 = C00, C02 = C00, C03 = C00;
#if USE_32OC_UNROLL
#endif
#endif
+#if OUTPUT_LEFTOVERS
+ if ((oc+1)*OC_BLOCK >= OC_NOTALLIGNED) {
+ for (int i = 0; i < 8; i++) {
+
#if HAS_FUSED_OPS
- { FUSED_OPS_VEC0; C00 = FUSED_OPS_RESULT_VEC0; }
- { FUSED_OPS_VEC1; C01 = FUSED_OPS_RESULT_VEC1; }
+ { FUSED_OPS_SCALAR0; C00[i] = FUSED_OPS_RESULT_SCALAR0; }
+ { FUSED_OPS_SCALAR1; C00[i] = FUSED_OPS_RESULT_SCALAR1; }
#endif
-
- intel_sub_group_block_write_us8(
+ if (oc * OC_BLOCK + local_id < OC_NOTALLIGNED) {
+ dst_write0[i * OC_BLOCK + local_id] = C00[i];
+ dst_write0[8 * OC_BLOCK + i * OC_BLOCK + local_id] = C01[i];
+ }
+ }
+ } else
+#endif // OUTPUT_LEFTOVERS
+ {
+#if HAS_FUSED_OPS
+ { FUSED_OPS_VEC0; C00 = FUSED_OPS_RESULT_VEC0; }
+ { FUSED_OPS_VEC1; C01 = FUSED_OPS_RESULT_VEC1; }
+#endif
+ intel_sub_group_block_write_us8(
(__global ushort *)dst_write0, as_ushort8(C00));
- intel_sub_group_block_write_us8(
+ intel_sub_group_block_write_us8(
(__global ushort *)&dst_write0[8 * OC_BLOCK], as_ushort8(C01));
#if USE_32OC_UNROLL
- intel_sub_group_block_write_us8(
+ intel_sub_group_block_write_us8(
(__global ushort *)&dst_write1[0], as_ushort8(C10));
- intel_sub_group_block_write_us8(
+ intel_sub_group_block_write_us8(
(__global ushort *)&dst_write1[8 * OC_BLOCK], as_ushort8(C11));
#endif
+ }
#if WITH_SUM == 1
half8 blockS02 = as_half8(
#endif
#endif
-#if HAS_FUSED_OPS
- { FUSED_OPS_VEC2; C02 = FUSED_OPS_RESULT_VEC2; }
- { FUSED_OPS_VEC3; C03 = FUSED_OPS_RESULT_VEC3; }
-#endif
+#if OUTPUT_LEFTOVERS
+ if ((oc+1)*OC_BLOCK >= OC_NOTALLIGNED) {
+ for (int i = 0; i < 8; i++) {
- intel_sub_group_block_write_us8(
- (__global ushort *)&dst_write0[MB_BLOCK * OC_FULL * ODHW_SIZE],
- as_ushort8(C02));
- intel_sub_group_block_write_us8(
- (__global ushort *)&dst_write0[MB_BLOCK * OC_FULL * ODHW_SIZE
- + 8 * OC_BLOCK],
- as_ushort8(C03));
+#if HAS_FUSED_OPS
+ { FUSED_OPS_SCALAR0; C02[i] = FUSED_OPS_RESULT_SCALAR0; }
+ { FUSED_OPS_SCALAR1; C03[i] = FUSED_OPS_RESULT_SCALAR1; }
+#endif
+ if (oc * OC_BLOCK + local_id < OC_NOTALLIGNED) {
+ dst_write0[MB_BLOCK * OC_FULL * ODHW_SIZE
+ + i * OC_BLOCK + local_id] = C02[i];
+ dst_write0[MB_BLOCK * OC_FULL * ODHW_SIZE + 8 * OC_BLOCK
+ + i * OC_BLOCK + local_id] = C03[i];
+ }
+ }
+ } else
+#endif // OUTPUT_LEFTOVERS
+ {
+#if HAS_FUSED_OPS
+ { FUSED_OPS_VEC2; C02 = FUSED_OPS_RESULT_VEC2; }
+ { FUSED_OPS_VEC3; C03 = FUSED_OPS_RESULT_VEC3; }
+#endif
+ intel_sub_group_block_write_us8(
+ (__global ushort *)&dst_write0[MB_BLOCK * OC_FULL * ODHW_SIZE],
+ as_ushort8(C02));
+ intel_sub_group_block_write_us8(
+ (__global ushort *)&dst_write0[MB_BLOCK * OC_FULL * ODHW_SIZE
+ + 8 * OC_BLOCK],
+ as_ushort8(C03));
#if USE_32OC_UNROLL
- intel_sub_group_block_write_us8(
- (__global ushort *)&dst_write1[MB_BLOCK * OC_FULL * ODHW_SIZE],
- as_ushort8(C12));
- intel_sub_group_block_write_us8(
- (__global ushort *)&dst_write1[MB_BLOCK * OC_FULL * ODHW_SIZE
- + 8 * OC_BLOCK],
- as_ushort8(C13));
+ intel_sub_group_block_write_us8(
+ (__global ushort *)&dst_write1[MB_BLOCK * OC_FULL * ODHW_SIZE],
+ as_ushort8(C12));
+ intel_sub_group_block_write_us8(
+ (__global ushort *)&dst_write1[MB_BLOCK * OC_FULL * ODHW_SIZE
+ + 8 * OC_BLOCK],
+ as_ushort8(C13));
#endif
+ }
#endif
#if VER_8OW16C == 1 && (IC % 16 == 0 || (IC == 8 && G != 1))
if (local_id < 8)
dst_write0[i * OC_BLOCK + local_id] = blockC00[i];
#else
- intel_sub_group_block_write_us(
- (__global ushort *)(&dst_write0[i * OC_BLOCK]),
- as_ushort(blockC00[i]));
+
+#if OUTPUT_LEFTOVERS
+ if ((oc+1)*OC_BLOCK >= OC_NOTALLIGNED) {
+ if (oc * OC_BLOCK + local_id < OC_NOTALLIGNED)
+ dst_write0[i * OC_BLOCK + local_id] = blockC00[i];
+ } else
+#endif
+ {
+ intel_sub_group_block_write_us(
+ (__global ushort *)(&dst_write0[i * OC_BLOCK]),
+ as_ushort(blockC00[i]));
+ }
#endif
}
} else {
if (local_id < 8)
dst_write0[i * OC_BLOCK + local_id] = blockC00[i];
#else
- intel_sub_group_block_write_us(
- (__global ushort *)(&dst_write0[i * OC_BLOCK]),
- as_ushort(blockC00[i]));
+
+#if OUTPUT_LEFTOVERS
+ if ((oc+1)*OC_BLOCK >= OC_NOTALLIGNED) {
+ if (oc * OC_BLOCK + local_id < OC_NOTALLIGNED)
+ dst_write0[i * OC_BLOCK + local_id] = blockC00[i];
+ } else
+#endif
+ {
+ intel_sub_group_block_write_us(
+ (__global ushort *)(&dst_write0[i * OC_BLOCK]),
+ as_ushort(blockC00[i]));
+ }
#endif // OC == 8 && G != 1
}
#else
dst_write0[i * OC_BLOCK + local_id] = blockC00[i];
}
#else
+#if OUTPUT_LEFTOVERS
+ if ((oc+1)*OC_BLOCK >= OC_NOTALLIGNED) {
+ for (int i = 0; i < 8; i++) {
+
+#if HAS_FUSED_OPS
+ { FUSED_OPS_SCALAR0; blockC00[i] = FUSED_OPS_RESULT_SCALAR0; }
+#endif
+ if (oc * OC_BLOCK + local_id < OC_NOTALLIGNED)
+ dst_write0[i * OC_BLOCK + local_id] = blockC00[i];
+ }
+ } else
+#endif
+ {
#if HAS_FUSED_OPS
{ FUSED_OPS_VEC0; blockC00 = FUSED_OPS_RESULT_VEC0; }
#endif
- intel_sub_group_block_write_us8(
- (__global ushort *)(&dst_write0[0]), as_ushort8(blockC00));
+ intel_sub_group_block_write_us8(
+ (__global ushort *)(&dst_write0[0]), as_ushort8(blockC00));
+ }
#endif // OC == 8 && G != 1
#if OW_BLOCK == 16
dst_write0[(i + 8) * OC_BLOCK + local_id] = blockC01[i];
}
#else
+
+#if OUTPUT_LEFTOVERS
+ if ((oc+1)*OC_BLOCK >= OC_NOTALLIGNED) {
+ for (int i = 0; i < 8; i++) {
+
+#if HAS_FUSED_OPS
+ { FUSED_OPS_SCALAR1; blockC01[i] = FUSED_OPS_RESULT_SCALAR1; }
+#endif
+ if (oc * OC_BLOCK + local_id < OC_NOTALLIGNED)
+ dst_write0[(i + 8) * OC_BLOCK + local_id] = blockC01[i];
+ }
+ } else
+#endif
+ {
#if HAS_FUSED_OPS
{ FUSED_OPS_VEC1; blockC01 = FUSED_OPS_RESULT_VEC1; }
#endif
intel_sub_group_block_write_us8(
(__global ushort *)(&dst_write0[8 * OC_BLOCK]),
as_ushort8(blockC01));
+ }
#endif // OC == 8 && G != 1
#endif
#endif
bool>; // 4 - With bias
using TestParamType_convolution_gpu_block_layout = ::testing::tuple<int, // 0 -Batch size
- int, // 1 - Input features
- int, // 2 - Output features
- int, // 3 - Filter size
- int, // 4 - Stride
- int, // 5 - Output padding
- bool>; // 6 - With bias
+ int, // 1 - Input features
+ int, // 2 - Output features
+ int, // 3 - Filter size
+ int, // 4 - Stride
+ int, // 5 - Output padding
+ bool, // 6 - With bias
+ int>; // 7 - Input X/Y size
using TestParamType_convolution_depthwise_gpu = ::testing::tuple<int, // 0 - Input XY size
}
-INSTANTIATE_TEST_CASE_P(convolution_gpu_block,
- convolution_gpu_block_layout,
+template<typename T>
+void blockedFormatZeroCheck(cldnn::memory out_mem) {
+ auto out_ptr = out_mem.pointer<T>();
+
+ bool batch_blocked = false;
+ if (out_mem.get_layout().format == format::bs_fs_zyx_bsv16_fsv16 ||
+ out_mem.get_layout().format == format::bs_fs_yx_bsv16_fsv16)
+ batch_blocked = true;
+ const int block_size = 16;
+
+ auto output_tensor = out_mem.get_layout().get_buffer_size();
+ const int b = output_tensor.batch[0];
+ const int f = output_tensor.feature[0];
+ const int spatials = std::accumulate(output_tensor.spatial.begin(), output_tensor.spatial.end(), 1, std::multiplies<int>());
+ const int f_mod = output_tensor.feature[0] % block_size;
+ const size_t batch_skip = batch_blocked ? b / block_size : b;
+ const size_t number_of_zeroes = f_mod == 0 ? 0 : (block_size - f_mod) * spatials * b;
+
+ size_t to_skip = (output_tensor.feature[0] / block_size) * block_size * spatials;
+ to_skip *= batch_blocked ? block_size : 1;
+ size_t zero_ind = to_skip + f_mod;
+
+ size_t i = 0;
+ while (i < number_of_zeroes) {
+ size_t f_tmp = f_mod;
+ while (f_tmp % 16 != 0) {
+ auto equal = are_equal(out_ptr[zero_ind], 0, 1e-2f);
+ EXPECT_TRUE(equal);
+ if (!equal) {
+ std::cout << "Should be zero idx: " << zero_ind << std::endl;
+ return;
+ }
+ f_tmp++;
+ zero_ind++;
+ i++;
+ }
+        // once this batch's leftover-feature zeros are all checked, jump over the next batch's fully aligned feature blocks
+ if (i % (number_of_zeroes / batch_skip) == 0)
+ zero_ind += to_skip;
+ if (zero_ind >= (size_t)b*f*spatials)
+ return;
+
+ zero_ind += f_mod;
+ }
+}
+struct convolution_gpu_block_layout3D : public convolution_gpu_block_layout {};
+INSTANTIATE_TEST_CASE_P(convolution_gpu_block3D,
+ convolution_gpu_block_layout3D,
::testing::Values(
- TestParamType_convolution_gpu_block_layout(16, 64, 64, 1, 1, 0, false),
- TestParamType_convolution_gpu_block_layout(16, 16, 16, 3, 1, 0, false),
- TestParamType_convolution_gpu_block_layout(32, 16, 16, 3, 1, 0, false),
- TestParamType_convolution_gpu_block_layout(16, 32, 16, 3, 1, 0, false),
- TestParamType_convolution_gpu_block_layout(16, 16, 32, 3, 1, 0, false),
- TestParamType_convolution_gpu_block_layout(32, 16, 32, 3, 1, 0, false),
- TestParamType_convolution_gpu_block_layout(16, 32, 32, 3, 1, 0, false),
- TestParamType_convolution_gpu_block_layout(16, 64, 16, 3, 1, 0, false),
- TestParamType_convolution_gpu_block_layout(32, 32, 16, 3, 1, 0, false),
- TestParamType_convolution_gpu_block_layout(32, 32, 32, 3, 1, 0, false),
- TestParamType_convolution_gpu_block_layout(32, 32, 32, 1, 1, 0, false),
- TestParamType_convolution_gpu_block_layout(32, 64, 64, 1, 1, 0, false),
- TestParamType_convolution_gpu_block_layout(16, 64, 64, 1, 1, 0, true),
- TestParamType_convolution_gpu_block_layout(16, 16, 16, 3, 1, 0, true),
- TestParamType_convolution_gpu_block_layout(32, 16, 16, 3, 1, 0, true),
- TestParamType_convolution_gpu_block_layout(16, 32, 16, 3, 1, 0, true),
- TestParamType_convolution_gpu_block_layout(16, 16, 32, 3, 1, 0, true),
- TestParamType_convolution_gpu_block_layout(32, 16, 32, 3, 1, 0, true),
- TestParamType_convolution_gpu_block_layout(16, 32, 32, 3, 1, 0, true),
- TestParamType_convolution_gpu_block_layout(16, 64, 16, 3, 1, 0, true),
- TestParamType_convolution_gpu_block_layout(32, 32, 16, 3, 1, 0, true),
- TestParamType_convolution_gpu_block_layout(32, 32, 32, 3, 1, 0, true),
- TestParamType_convolution_gpu_block_layout(64, 64, 64, 3, 1, 0, true)),
+ TestParamType_convolution_gpu_block_layout(1, 3, 10, 1, 1, 0, false, 16),
+ TestParamType_convolution_gpu_block_layout(4, 4, 17, 3, 1, 0, false, 16),
+ TestParamType_convolution_gpu_block_layout(15, 17, 15, 3, 1, 0, false, 4),
+ TestParamType_convolution_gpu_block_layout(1, 17, 16, 1, 1, 0, false, 4),
+ TestParamType_convolution_gpu_block_layout(1, 17, 20, 3, 1, 0, false, 9),
+ TestParamType_convolution_gpu_block_layout(4, 15, 17, 1, 1, 0, false, 5),
+ TestParamType_convolution_gpu_block_layout(1, 2, 15, 3, 1, 0, true, 16),
+ TestParamType_convolution_gpu_block_layout(17, 2, 16, 3, 1, 0, true, 32),
+ TestParamType_convolution_gpu_block_layout(30, 2, 5, 1, 1, 0, true, 8),
+ TestParamType_convolution_gpu_block_layout(2, 1, 7, 3, 1, 0, true, 8),
+ TestParamType_convolution_gpu_block_layout(5, 16, 1, 1, 1, 0, true, 5),
+
+ TestParamType_convolution_gpu_block_layout(32, 4, 15, 1, 1, 0, false, 5),
+ TestParamType_convolution_gpu_block_layout(32, 2, 16, 3, 1, 0, false, 8),
+ TestParamType_convolution_gpu_block_layout(32, 4, 17, 3, 1, 0, false, 1),
+ TestParamType_convolution_gpu_block_layout(32, 17, 15, 1, 1, 0, false, 16),
+ TestParamType_convolution_gpu_block_layout(32, 15, 15, 1, 1, 0, true, 32),
+ TestParamType_convolution_gpu_block_layout(32, 2, 1, 3, 3, 0, true, 10),
+ TestParamType_convolution_gpu_block_layout(32, 17, 1, 1, 1, 0, true, 8),
+
+ TestParamType_convolution_gpu_block_layout(16, 1, 17, 1, 1, 0, true, 10),
+ TestParamType_convolution_gpu_block_layout(16, 3, 15, 1, 1, 0, false, 8),
+ TestParamType_convolution_gpu_block_layout(16, 1, 17, 3, 1, 0, false, 1),
+ TestParamType_convolution_gpu_block_layout(16, 15, 17, 1, 1, 0, false, 3),
+ TestParamType_convolution_gpu_block_layout(16, 16, 17, 3, 1, 0, false, 16),
+ TestParamType_convolution_gpu_block_layout(16, 17, 15, 3, 1, 0, false, 32),
+ TestParamType_convolution_gpu_block_layout(16, 17, 16, 1, 1, 0, true, 17),
+ TestParamType_convolution_gpu_block_layout(16, 3, 7, 3, 1, 0, true, 8),
+ TestParamType_convolution_gpu_block_layout(16, 5, 10, 1, 1, 0, true, 8),
+ TestParamType_convolution_gpu_block_layout(16, 17, 6, 3, 1, 0, true, 15),
+ TestParamType_convolution_gpu_block_layout(16, 16, 1, 1, 1, 0, true, 2),
+
+ TestParamType_convolution_gpu_block_layout(16, 32, 16, 3, 1, 0, false, 5),
+ TestParamType_convolution_gpu_block_layout(16, 16, 32, 3, 1, 0, false, 3),
+ TestParamType_convolution_gpu_block_layout(32, 16, 32, 3, 1, 0, false, 9),
+ TestParamType_convolution_gpu_block_layout(16, 32, 32, 3, 1, 0, false, 8),
+ TestParamType_convolution_gpu_block_layout(16, 64, 16, 3, 1, 0, false, 5),
+ TestParamType_convolution_gpu_block_layout(32, 32, 16, 3, 1, 0, false, 5),
+ TestParamType_convolution_gpu_block_layout(32, 32, 32, 3, 1, 0, false, 3),
+ TestParamType_convolution_gpu_block_layout(32, 32, 32, 1, 1, 0, false, 2),
+ TestParamType_convolution_gpu_block_layout(32, 16, 16, 3, 1, 0, true, 16),
+ TestParamType_convolution_gpu_block_layout(16, 32, 16, 3, 1, 0, true, 5),
+ TestParamType_convolution_gpu_block_layout(32, 16, 32, 3, 1, 0, true, 5),
+ TestParamType_convolution_gpu_block_layout(16, 64, 16, 3, 1, 0, true, 3),
+ TestParamType_convolution_gpu_block_layout(32, 32, 16, 3, 1, 0, true, 2),
+ TestParamType_convolution_gpu_block_layout(32, 32, 32, 3, 1, 0, true, 2),
+ TestParamType_convolution_gpu_block_layout(64, 64, 64, 3, 1, 0, true, 2)),
convolution_gpu_block_layout::PrintToStringParamName);
-TEST_P(convolution_gpu_block_layout, bfzyx_bsv16_fsv16_fp32)
+TEST_P(convolution_gpu_block_layout3D, bfzyx_bsv16_fsv16_fp32)
{
const auto& engine = get_test_engine();
const int batch_num = testing::get<0>(GetParam());
- const int input_xy = 5;
const int input_f = testing::get<1>(GetParam());
const int output_f = testing::get<2>(GetParam());
const int filter_xy = testing::get<3>(GetParam());
const int stride = testing::get<4>(GetParam());
const int output_padding = testing::get<5>(GetParam());
const bool with_bias = testing::get<6>(GetParam());
+ const int input_xy = testing::get<7>(GetParam());
const int input_offset = -(filter_xy / 2);
+ format input_format = format::b_fs_zyx_fsv16;
+ if (batch_num % 16 == 0)
+ input_format = format::bs_fs_zyx_bsv16_fsv16;
auto input_size = tensor(batch_num, input_f, input_xy, input_xy, 1);
auto input_data = generate_random_4d<float>(batch_num, input_f, input_xy, input_xy, 1, 10);
input_layout("input", input_mem.get_layout()),
data("weights", weights_mem));
- // Reorder input to bs_fs_yx_bsv16_fsv16
- topology.add(reorder("input_bsv16_fsv16", "input", { data_types::f32, format::bs_fs_zyx_bsv16_fsv16, input_size }));
+        // Reorder input to the blocked format selected above (bs_fs_zyx_bsv16_fsv16 when batch is a multiple of 16, b_fs_zyx_fsv16 otherwise)
+ topology.add(reorder("input_bsv16_fsv16", "input", { data_types::f32, input_format, input_size }));
if (with_bias)
{
auto out_mem_bfyx = network.get_output("reorder_bfzyx").get_memory();
auto out_ptr_bfyx = out_mem_bfyx.pointer<float>();
- ASSERT_EQ(out_mem.get_layout().format, format::bs_fs_zyx_bsv16_fsv16);
+ blockedFormatZeroCheck<float>(out_mem);
+
+ ASSERT_EQ(out_mem.get_layout().format, input_format);
auto flatten_ref = flatten_4d(format::bfyx, reference_result);
return;
}
}
+
}
-TEST_P(convolution_gpu_block_layout, bfzyx_bsv16_fsv16_fp16)
+TEST_P(convolution_gpu_block_layout3D, bfzyx_bsv16_fsv16_fp16)
{
const auto& engine = get_test_engine();
}
const int batch_num = testing::get<0>(GetParam());
- const int input_xy = 5;
const int input_f = testing::get<1>(GetParam());
const int output_f = testing::get<2>(GetParam());
const int filter_xy = testing::get<3>(GetParam());
const int stride = testing::get<4>(GetParam());
const int output_padding = testing::get<5>(GetParam());
const bool with_bias = testing::get<6>(GetParam());
+ const int input_xy = testing::get<7>(GetParam());
const int input_offset = -(filter_xy / 2);
-
- if (batch_num % 32 != 0)
- {
- std::cout << "[ SKIPPED ] The test is skipped (for fp16 batch should be multiple of 32)." << std::endl;
- EXPECT_EQ(1, 1);
- return;
- }
+ format input_format = format::b_fs_zyx_fsv16;
+ if (batch_num % 32 == 0)
+ input_format = format::bs_fs_zyx_bsv16_fsv16;
auto input_size = tensor(batch_num, input_f, input_xy, input_xy, 1);
auto input_data = generate_random_4d<FLOAT16>(batch_num, input_f, input_xy, input_xy, 0, 1);
input_layout("input", input_mem.get_layout()),
data("weights", weights_mem));
- // Reorder input to bs_fs_zyx_bsv16_fsv16
- topology.add(reorder("input_bsv16_fsv16", "input", { data_types::f16, format::bs_fs_zyx_bsv16_fsv16, input_size }));
+        // Reorder input to the blocked format selected above (bs_fs_zyx_bsv16_fsv16 when batch is a multiple of 32, b_fs_zyx_fsv16 otherwise)
+ topology.add(reorder("input_bsv16_fsv16", "input", { data_types::f16, input_format, input_size }));
if (with_bias)
{
auto out_mem_bfyx = network.get_output("reorder_bfzyx").get_memory();
auto out_ptr_bfyx = out_mem_bfyx.pointer<FLOAT16>();
- ASSERT_EQ(out_mem.get_layout().format, format::bs_fs_zyx_bsv16_fsv16);
+ blockedFormatZeroCheck<FLOAT16>(out_mem);
+
+ ASSERT_EQ(out_mem.get_layout().format, input_format);
auto flatten_ref = flatten_4d(format::bfyx, reference_result);
}
}
-TEST_P(convolution_gpu_block_layout, bfzyx_bsv16_fsv16_fp32_fused_ops)
+TEST_P(convolution_gpu_block_layout3D, bfzyx_bsv16_fsv16_fp32_fused_ops)
{
const auto& engine = get_test_engine();
const int batch_num = testing::get<0>(GetParam());
- const int input_xy = 5;
const int input_f = testing::get<1>(GetParam());
const int output_f = testing::get<2>(GetParam());
const int filter_xy = testing::get<3>(GetParam());
const int stride = testing::get<4>(GetParam());
const int output_padding = testing::get<5>(GetParam());
const bool with_bias = testing::get<6>(GetParam());
+ const int input_xy = testing::get<7>(GetParam());
const int input_offset = -(filter_xy / 2);
+ format input_format = format::b_fs_zyx_fsv16;
+ if (batch_num % 16 == 0)
+ input_format = format::bs_fs_zyx_bsv16_fsv16;
auto input_size = tensor(batch_num, input_f, input_xy, input_xy, 1);
auto input_data = generate_random_4d<float>(batch_num, input_f, input_xy, input_xy, 1, 10);
input_layout("input", input_mem.get_layout()),
data("weights", weights_mem));
- // Reorder input to bs_fs_yx_bsv16_fsv16
- topology.add(reorder("input_bsv16_fsv16", "input", { data_types::f32, format::bs_fs_zyx_bsv16_fsv16, input_size }));
+        // Reorder input to the blocked format selected above (bs_fs_zyx_bsv16_fsv16 when batch is a multiple of 16, b_fs_zyx_fsv16 otherwise)
+ topology.add(reorder("input_bsv16_fsv16", "input", { data_types::f32, input_format, input_size }));
if (with_bias)
{
auto out_mem_bfyx = network.get_output("reorder_bfzyx").get_memory();
auto out_ptr_bfyx = out_mem_bfyx.pointer<float>();
- ASSERT_EQ(out_mem.get_layout().format, format::bs_fs_zyx_bsv16_fsv16);
+ blockedFormatZeroCheck<float>(out_mem);
+
+ ASSERT_EQ(out_mem.get_layout().format, input_format);
auto flatten_ref = flatten_4d(format::bfyx, reference_result);
}
}
+INSTANTIATE_TEST_CASE_P(convolution_gpu_block,
+ convolution_gpu_block_layout,
+ ::testing::Values(
+ TestParamType_convolution_gpu_block_layout(16, 64, 64, 1, 1, 0, false, 0),
+ TestParamType_convolution_gpu_block_layout(16, 16, 16, 3, 1, 0, false, 0),
+ TestParamType_convolution_gpu_block_layout(32, 16, 16, 3, 1, 0, false, 0),
+ TestParamType_convolution_gpu_block_layout(16, 32, 16, 3, 1, 0, false, 0),
+ TestParamType_convolution_gpu_block_layout(16, 16, 32, 3, 1, 0, false, 0),
+ TestParamType_convolution_gpu_block_layout(32, 16, 32, 3, 1, 0, false, 0),
+ TestParamType_convolution_gpu_block_layout(16, 32, 32, 3, 1, 0, false, 0),
+ TestParamType_convolution_gpu_block_layout(16, 64, 16, 3, 1, 0, false, 0),
+ TestParamType_convolution_gpu_block_layout(32, 32, 16, 3, 1, 0, false, 0),
+ TestParamType_convolution_gpu_block_layout(32, 32, 32, 3, 1, 0, false, 0),
+ TestParamType_convolution_gpu_block_layout(32, 32, 32, 1, 1, 0, false, 0),
+ TestParamType_convolution_gpu_block_layout(32, 64, 64, 1, 1, 0, false, 0),
+ TestParamType_convolution_gpu_block_layout(16, 64, 64, 1, 1, 0, true, 0),
+ TestParamType_convolution_gpu_block_layout(16, 16, 16, 3, 1, 0, true, 0),
+ TestParamType_convolution_gpu_block_layout(32, 16, 16, 3, 1, 0, true, 0),
+ TestParamType_convolution_gpu_block_layout(16, 32, 16, 3, 1, 0, true, 0),
+ TestParamType_convolution_gpu_block_layout(16, 16, 32, 3, 1, 0, true, 0),
+ TestParamType_convolution_gpu_block_layout(32, 16, 32, 3, 1, 0, true, 0),
+ TestParamType_convolution_gpu_block_layout(16, 32, 32, 3, 1, 0, true, 0),
+ TestParamType_convolution_gpu_block_layout(16, 64, 16, 3, 1, 0, true, 0),
+ TestParamType_convolution_gpu_block_layout(32, 32, 16, 3, 1, 0, true, 0),
+ TestParamType_convolution_gpu_block_layout(32, 32, 32, 3, 1, 0, true, 0),
+ TestParamType_convolution_gpu_block_layout(64, 64, 64, 3, 1, 0, true, 0)),
+ convolution_gpu_block_layout::PrintToStringParamName);
+
TEST_P(convolution_gpu_block_layout, bfyx_bsv16_fsv16_fp32)
{
const auto& engine = get_test_engine();