#include "include/mmad.cl"
#include "include/data_types.cl"
+#define TYPE_N_(type, n) type##n // token-pastes `type` and `n` into a single identifier
+#define TYPE_N(type, n) TYPE_N_(type, n) // indirection so that macro arguments are expanded before they are pasted
#define AS_TYPE_N_(type, n, x) as_##type##n(x)
#define AS_TYPE_N(type, n, x) AS_TYPE_N_(type, n, x)
+#define INPUT0_TYPE_4 TYPE_N(INPUT0_TYPE, 4) // INPUT0_TYPE with the suffix 4; used as the type of `inputs` in the MAC loop
#define AS_INPUT0_TYPE_4(x) AS_TYPE_N(INPUT0_TYPE, 4, x)
+#if INPUT0_PAD_BEFORE_SIZE_X != 0 || \
+ INPUT0_PAD_BEFORE_SIZE_Y != 0 || \
+ INPUT0_PAD_BEFORE_SIZE_Z != 0
+ #define NON_ZERO_INPUT0_PAD_BEFORE // set when any of the X/Y/Z pad-before sizes of INPUT0 is non-zero
+#endif
+
+#if !defined COMPENSATION_TERM || \
+ (defined COMPENSATION_TERM && defined NON_ZERO_INPUT0_PAD_BEFORE)
+ #define SHOULD_BALANCE_COMPENSATION // set when no COMPENSATION_TERM is given, or when one is given together with non-zero pad-before (the condition reduces to !COMPENSATION_TERM || NON_ZERO_INPUT0_PAD_BEFORE)
+#endif
+
+#if defined ASYMMETRIC_DATA_QUANTIZATION && defined SHOULD_BALANCE_COMPENSATION
+ #define SHOULD_USE_DATA_ZP // guards every use of data_zp_idx / data_zp_val / input_on_padding in this kernel
+#endif
+
+#if defined ASYMMETRIC_DATA_QUANTIZATION && \
+ defined ASYMMETRIC_WEIGHTS_QUANTIZATION && \
+ defined SHOULD_BALANCE_COMPENSATION
+ #define SHOULD_USE_DATA_AND_WEIGHTS_ZP // guards the dotProdAZPxWZP (activation zero point x weights zero point) computation
+#endif
+
+#ifdef SHOULD_USE_DATA_AND_WEIGHTS_ZP
+ #define ACCUMULATOR_TYPE_4 TYPE_N(ACCUMULATOR_TYPE, 4) // element type of the dotProdAZPxWZP array
+#endif
+
+#ifdef ASYMMETRIC_WEIGHTS_QUANTIZATION
+ #define FILTER_TYPE_16 TYPE_N(FILTER_TYPE, 16) // cast target used when loading weights_zp into weights_zp_val
+#endif
+
#define AS_FILTER_TYPE_4(x) AS_TYPE_N(FILTER_TYPE, 4, x)
#define CEIL_DIV(a, b) (((a) + (b) - 1)/(b))
#if BIAS_TERM
const __global BIAS_TYPE *biases,
#endif
+#ifdef ASYMMETRIC_WEIGHTS_QUANTIZATION
+ const __global WEIGHTS_ZERO_POINTS_TYPE *weights_zp,
+#endif
+#ifdef ASYMMETRIC_DATA_QUANTIZATION
+ const __global ACTIVATIONS_ZERO_POINTS_TYPE *activations_zp,
+#endif
+#ifdef COMPENSATION_TERM
+ const __global COMPENSATION_TYPE *compensation,
+#endif
#if HAS_FUSED_OPS_DECLS
FUSED_OPS_DECLS,
#endif
uint4 input_val[IN_BLOCK_DEPTH][IN_BLOCK_HEIGHT][CEIL_DIV(IN_BLOCK_WIDTH, SIMD)];
+#ifdef SHOULD_USE_DATA_ZP
+ uint data_zp_idx = g * FILTER_IFM_NUM + in_f_start; // offset into activations_zp: group base plus this work-item's starting input feature; incremented by FSV further down (`data_zp_idx += FSV`)
+ uint4 data_zp_val; // (re)loaded from activations_zp inside the k loop
+#endif
+
+#ifdef ASYMMETRIC_WEIGHTS_QUANTIZATION
+ uint4 weights_zp_val[OFM_BLOCKS_PER_SIMD]; // one packed zero-point vector per output-feature block
+ __attribute__((opencl_unroll_hint))
+ for (uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
+ weights_zp_val[ofb] = as_uint4((FILTER_TYPE_16)weights_zp[out_f + ofb * FSV]); // scalar-to-vector cast replicates the single zero point into all 16 lanes; as_uint4 assumes FILTER_TYPE is 1 byte wide - TODO confirm
+ }
+ #if FILTER_IFM_NUM % FSV != 0 // input feature count per group is not a multiple of FSV
+ uint4 weights_zp_vec_partial[OFM_BLOCKS_PER_SIMD]; // copy of weights_zp_val with the trailing lanes cleared; swapped in inside the k loop
+ __attribute__((opencl_unroll_hint))
+ for (uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
+ weights_zp_vec_partial[ofb] = weights_zp_val[ofb];
+ FILTER_TYPE* wzp_p = (FILTER_TYPE*)&weights_zp_vec_partial[ofb]; // element-wise view of the packed vector
+ __attribute__((opencl_unroll_hint))
+ for (uint f = FILTER_IFM_NUM % FSV; f < FSV; f++) {
+ wzp_p[f] = 0; // zero every lane at index >= FILTER_IFM_NUM % FSV
+ }
+ }
+ #endif
+#endif
+
__attribute__((opencl_unroll_hint(1)))
for (uint k = 0; k < CEIL_DIV(FILTER_IFM_NUM, FSV) / FEATURE_SLM_SPLIT; k++) {
+ #ifdef ASYMMETRIC_WEIGHTS_QUANTIZATION
+ #if FILTER_IFM_NUM % FSV != 0
+ if (in_f_start + (k + 1) * FSV >= ALIGN(FILTER_IFM_NUM, FSV)) { // this iteration reaches the FSV-aligned end of the feature range
+ __attribute__((opencl_unroll_hint))
+ for (uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
+ weights_zp_val[ofb] = weights_zp_vec_partial[ofb]; // switch to the copy whose lanes past FILTER_IFM_NUM % FSV are zero
+ }
+ }
+ #endif
+ #endif
+
+ #ifdef SHOULD_USE_DATA_ZP
+ #if ((FILTER_GROUPS_NUM > 1) && (FILTER_IFM_NUM % FSV != 0))
+ data_zp_val = as_uint4(vload16(0, activations_zp + data_zp_idx)); // element-typed 16-wide load, used for grouped convolutions whose per-group feature count is not a multiple of FSV
+ #else
+ data_zp_val = vload4(0, (__global uint *)(activations_zp + data_zp_idx)); // same 16 bytes read as four uints
+ #endif
+ #endif
+
+ #ifdef SHOULD_USE_DATA_AND_WEIGHTS_ZP
+ ACCUMULATOR_TYPE_4 dotProdAZPxWZP[OFM_BLOCKS_PER_SIMD]; // activation-zero-point x weights-zero-point products, indexed [ofb][ive]; consumed in the MAC loop when COMPENSATION_TERM is not defined
+ __attribute__((opencl_unroll_hint))
+ for (uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
+ dotProdAZPxWZP[ofb] = 0;
+ __attribute__((opencl_unroll_hint))
+ for (uint ive = 0; ive < 4; ive++) {
+ dotProdAZPxWZP[ofb][ive] = TO_ACCUMULATOR_TYPE( // IMAD comes from include/mmad.cl; presumably acc + dot of the two 4-element operands - TODO confirm
+ IMAD(dotProdAZPxWZP[ofb][ive],
+ AS_INPUT0_TYPE_4(data_zp_val[ive]),
+ AS_FILTER_TYPE_4(weights_zp_val[ofb][ive])));
+ }
+ }
+ #endif
+
__attribute__((opencl_unroll_hint(1)))
for (uint fzn = 0; fzn < FILTER_SIZE_Z / FILTER_SIZE_Z_UNROLL; fzn++) {
__attribute__((opencl_unroll_hint(1)))
__attribute__((opencl_unroll_hint))
for (uint ixb = 0; ixb < CEIL_DIV(IN_BLOCK_WIDTH, SIMD); ++ixb) {
uint input_idx = input_start_idx + izb * INPUT0_Z_PITCH * FSV + iyb * INPUT0_Y_PITCH * FSV + ixb * SIMD * FSV;
-
+ #ifdef SHOULD_USE_DATA_ZP
+ const int y_idx = input_y + fyn * DILATION_SIZE_Y + iyb;
+ const int z_idx = input_z + fzn * DILATION_SIZE_Z + izb;
+ #endif
if (ixb != CEIL_DIV(IN_BLOCK_WIDTH, SIMD) - 1) {
+ #ifdef SHOULD_USE_DATA_ZP
+ const int x_idx = input_x + ixb * SIMD + get_sub_group_local_id(); // x coordinate this sub-group lane reads for block ixb
+ const bool input_on_padding = (((x_idx < 0) || (x_idx >= INPUT0_SIZE_X)) ||
+ ((y_idx < 0) || (y_idx >= INPUT0_SIZE_Y)) ||
+ ((z_idx < 0) || (z_idx >= INPUT0_SIZE_Z))); // true when any coordinate lies outside [0, INPUT0_SIZE_*); the load below then substitutes data_zp_val for the memory read
+ #endif
+
#if ((FILTER_GROUPS_NUM > 1) && (FILTER_IFM_NUM % FSV != 0))
if (in_f_offset == 0) {
#endif
- input_val[izb][iyb][ixb] = vload4(0, (__global uint *)(conv_input + input_idx + get_sub_group_local_id() * FSV));
+ #ifdef SHOULD_USE_DATA_ZP
+ if (input_on_padding) {
+ input_val[izb][iyb][ixb] = data_zp_val;
+ } else {
+ #endif
+ input_val[izb][iyb][ixb] = vload4(0, (__global uint *)(conv_input + input_idx + get_sub_group_local_id() * FSV));
+ #ifdef SHOULD_USE_DATA_ZP
+ }
+ #endif
#if ((FILTER_GROUPS_NUM > 1) && (FILTER_IFM_NUM % FSV != 0))
} else {
INPUT0_TYPE* input_int8_arr = (INPUT0_TYPE*) &input_val[izb][iyb][ixb];
+ #ifdef SHOULD_USE_DATA_ZP
+ INPUT0_TYPE* input_zp_int8_arr = (INPUT0_TYPE*) &data_zp_val;
+ #endif
__attribute__((opencl_unroll_hint(FSV)))
for (uint v = 0; v < FSV; v++) {
- if (v + in_f_offset < FSV) {
- input_int8_arr[v] = conv_input[input_idx + get_sub_group_local_id() * FSV + v];
- } else {
- const uint addr = input_idx + get_sub_group_local_id() * FSV + v +
- ((INPUT0_SIZE_X + INPUT0_PAD_BEFORE_SIZE_X + INPUT0_PAD_AFTER_SIZE_X) *
- (INPUT0_SIZE_Y + INPUT0_PAD_BEFORE_SIZE_Y + INPUT0_PAD_AFTER_SIZE_Y) *
- (INPUT0_SIZE_Z + INPUT0_PAD_BEFORE_SIZE_Z + INPUT0_PAD_AFTER_SIZE_Z) - 1) * FSV;
- input_int8_arr[v] = conv_input[addr];
- }
+ #ifdef SHOULD_USE_DATA_ZP
+ if (input_on_padding) {
+ input_int8_arr[v] = input_zp_int8_arr[v];
+ } else {
+ #endif
+ if (v + in_f_offset < FSV) {
+ input_int8_arr[v] = conv_input[input_idx + get_sub_group_local_id() * FSV + v];
+ } else {
+ const uint addr = input_idx + get_sub_group_local_id() * FSV + v +
+ ((INPUT0_SIZE_X + INPUT0_PAD_BEFORE_SIZE_X + INPUT0_PAD_AFTER_SIZE_X) *
+ (INPUT0_SIZE_Y + INPUT0_PAD_BEFORE_SIZE_Y + INPUT0_PAD_AFTER_SIZE_Y) *
+ (INPUT0_SIZE_Z + INPUT0_PAD_BEFORE_SIZE_Z + INPUT0_PAD_AFTER_SIZE_Z) - 1) * FSV;
+ input_int8_arr[v] = conv_input[addr];
+ }
+ #ifdef SHOULD_USE_DATA_ZP
+ }
+ #endif
}
}
#endif
} else {
+ #ifdef SHOULD_USE_DATA_ZP
+ const int x_idx = input_x + ixb * SIMD + tmp;
+ const bool input_on_padding = (((x_idx < 0) || (x_idx >= INPUT0_SIZE_X)) ||
+ ((y_idx < 0) || (y_idx >= INPUT0_SIZE_Y)) ||
+ ((z_idx < 0) || (z_idx >= INPUT0_SIZE_Z)));
+ #endif
+
#if ((FILTER_GROUPS_NUM > 1) && (FILTER_IFM_NUM % FSV != 0))
if (in_f_offset == 0) {
#endif
- input_val[izb][iyb][ixb] = vload4(0, (__global uint*)(conv_input + input_idx + tmp * FSV));
+ #ifdef SHOULD_USE_DATA_ZP
+ if (input_on_padding) {
+ input_val[izb][iyb][ixb] = data_zp_val;
+ } else {
+ #endif
+ input_val[izb][iyb][ixb] = vload4(0, (__global uint *)(conv_input + input_idx + tmp * FSV));
+ #ifdef SHOULD_USE_DATA_ZP
+ }
+ #endif
#if ((FILTER_GROUPS_NUM > 1) && (FILTER_IFM_NUM % FSV != 0))
} else {
INPUT0_TYPE* input_int8_arr = (INPUT0_TYPE*) &input_val[izb][iyb][ixb];
+ #ifdef SHOULD_USE_DATA_ZP
+ INPUT0_TYPE* input_zp_int8_arr = (INPUT0_TYPE*) &data_zp_val;
+ #endif
__attribute__((opencl_unroll_hint(FSV)))
for (uint v = 0; v < FSV; v++) {
- if (v + in_f_offset < FSV) {
- input_int8_arr[v] = conv_input[input_idx + tmp * FSV + v];
- } else {
- const uint addr = input_idx + tmp * FSV + v +
- ((INPUT0_SIZE_X + INPUT0_PAD_BEFORE_SIZE_X + INPUT0_PAD_AFTER_SIZE_X) *
- (INPUT0_SIZE_Y + INPUT0_PAD_BEFORE_SIZE_Y + INPUT0_PAD_AFTER_SIZE_Y) *
- (INPUT0_SIZE_Z + INPUT0_PAD_BEFORE_SIZE_Z + INPUT0_PAD_AFTER_SIZE_Z) - 1) * FSV;
- input_int8_arr[v] = conv_input[addr];
- }
+ #ifdef SHOULD_USE_DATA_ZP
+ if (input_on_padding) {
+ input_int8_arr[v] = input_zp_int8_arr[v];
+ } else {
+ #endif
+ if (v + in_f_offset < FSV) {
+ input_int8_arr[v] = conv_input[input_idx + tmp * FSV + v];
+ } else {
+ const uint addr = input_idx + tmp * FSV + v +
+ ((INPUT0_SIZE_X + INPUT0_PAD_BEFORE_SIZE_X + INPUT0_PAD_AFTER_SIZE_X) *
+ (INPUT0_SIZE_Y + INPUT0_PAD_BEFORE_SIZE_Y + INPUT0_PAD_AFTER_SIZE_Y) *
+ (INPUT0_SIZE_Z + INPUT0_PAD_BEFORE_SIZE_Z + INPUT0_PAD_AFTER_SIZE_Z) - 1) * FSV;
+ input_int8_arr[v] = conv_input[addr];
+ }
+ #ifdef SHOULD_USE_DATA_ZP
+ }
+ #endif
}
}
#endif
for (uint ive = 0; ive < 4; ive++) {
__attribute__((opencl_unroll_hint))
for (uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
+ #ifdef SHOULD_USE_DATA_ZP
+ ACCUMULATOR_TYPE dotProdAZPxW = 0; // activation-zero-point x weights term for this (ive, ofb); independent of od, so computed once outside the od loop
+ dotProdAZPxW = TO_ACCUMULATOR_TYPE(
+ IMAD(dotProdAZPxW,
+ AS_INPUT0_TYPE_4(data_zp_val[ive]),
+ AS_FILTER_TYPE_4(weights_val[ofb][ive])));
+ #endif
+
__attribute__((opencl_unroll_hint(OUT_BLOCK_DEPTH)))
for (uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
__attribute__((opencl_unroll_hint(OUT_BLOCK_HEIGHT)))
const uint shuffle_wi = x_block_idx % SIMD;
const uint shuffle_idx = x_block_idx / SIMD;
+ INPUT0_TYPE_4 inputs = AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val[z_block_idx][y_block_idx][shuffle_idx][ive],
+ shuffle_wi)); // shuffled once into a local so both IMAD calls below reuse the same value
+
dotProd[ofb][od][oh][ow] = TO_ACCUMULATOR_TYPE(
IMAD(dotProd[ofb][od][oh][ow],
- AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val[z_block_idx][y_block_idx][shuffle_idx][ive],
- shuffle_wi)),
+ inputs,
AS_FILTER_TYPE_4(weights_val[ofb][ive])));
+
+ #ifdef ASYMMETRIC_WEIGHTS_QUANTIZATION
+ ACCUMULATOR_TYPE dotProdAxWZP = 0;
+ dotProdAxWZP = TO_ACCUMULATOR_TYPE(
+ IMAD(dotProdAxWZP,
+ inputs,
+ AS_FILTER_TYPE_4(weights_zp_val[ofb][ive])));
+ dotProd[ofb][od][oh][ow] -= dotProdAxWZP; // subtract the input x weights-zero-point term
+ #endif
+
+ #if !defined COMPENSATION_TERM && defined ASYMMETRIC_DATA_QUANTIZATION
+ dotProd[ofb][od][oh][ow] -= dotProdAZPxW; // no precomputed compensation: subtract the activation-zero-point x weights term here
+ #endif
+
+ #if (!defined COMPENSATION_TERM && \
+ defined ASYMMETRIC_DATA_QUANTIZATION && \
+ defined ASYMMETRIC_WEIGHTS_QUANTIZATION)
+ dotProd[ofb][od][oh][ow] += dotProdAZPxWZP[ofb][ive]; // assuming IMAD is a dot-product accumulate, the three adjustments complete (A - AZP) * (W - WZP) = A*W - A*WZP - AZP*W + AZP*WZP
+ #endif
}
}
}
input_start_idx += INPUT0_FEATURE_PITCH * FSV * FEATURE_SLM_SPLIT - (FILTER_SIZE_Z / FILTER_SIZE_Z_UNROLL) * DILATION_SIZE_Z * INPUT0_Z_PITCH * FSV;
filter_idx += FSV * FSV * FILTER_SIZE_X * FILTER_SIZE_Y * FILTER_SIZE_Z * (FEATURE_SLM_SPLIT - 1);
+
+ #ifdef SHOULD_USE_DATA_ZP
+ data_zp_idx += FSV;
+ #endif
}
#if FEATURE_SLM_SPLIT != 1
}
#endif
+#ifdef COMPENSATION_TERM
+ COMPENSATION_TYPE comp[OFM_VALUES_PER_WI]; // compensation values later added to dequantized[ofb][od][oh][ow]
+ __attribute__((opencl_unroll_hint))
+ for (uint ofb = 0; ofb < OFM_VALUES_PER_WI; ++ofb) {
+ comp[ofb] = compensation[out_f + ofb * SIMD]; // entries are read SIMD apart, starting at out_f
+ }
+#endif
+
ACTIVATION_TYPE dequantized[OFM_VALUES_PER_WI][OUT_BLOCK_DEPTH][OUT_BLOCK_HEIGHT][OUT_BLOCK_WIDTH];
__attribute__((opencl_unroll_hint))
for (uint ofb = 0; ofb < OFM_VALUES_PER_WI; ++ofb) {
#if BIAS_TERM
dequantized[ofb][od][oh][ow] += bias[ofb];
#endif
+#ifdef COMPENSATION_TERM
+ dequantized[ofb][od][oh][ow] += comp[ofb];
+#endif
}
}
}
#endif
}
-#undef AS_INPUT0_TYPE_4
+#undef TYPE_N_
+#undef TYPE_N
#undef AS_TYPE_N
#undef AS_TYPE_N_
+
+#undef INPUT0_TYPE_4
+#undef AS_INPUT0_TYPE_4
+
+#ifdef NON_ZERO_INPUT0_PAD_BEFORE // NOTE(review): #undef of a name that is not currently defined is ignored by the preprocessor, so the #ifdef guards in this cleanup section are optional
+ #undef NON_ZERO_INPUT0_PAD_BEFORE
+#endif
+
+#ifdef SHOULD_BALANCE_COMPENSATION
+ #undef SHOULD_BALANCE_COMPENSATION
+#endif
+
+#ifdef SHOULD_USE_DATA_ZP
+ #undef SHOULD_USE_DATA_ZP
+#endif
+
+#ifdef SHOULD_USE_DATA_AND_WEIGHTS_ZP
+ #undef SHOULD_USE_DATA_AND_WEIGHTS_ZP
+#endif
+
+#ifdef ACCUMULATOR_TYPE_4 // only defined when SHOULD_USE_DATA_AND_WEIGHTS_ZP was set
+#undef ACCUMULATOR_TYPE_4
+#endif
+
+#ifdef FILTER_TYPE_16 // only defined when ASYMMETRIC_WEIGHTS_QUANTIZATION was set
+#undef FILTER_TYPE_16
+#endif
+
#undef AS_FILTER_TYPE_4
#undef CEIL_DIV
int, // 7 - Kernel sizeZ
int, // 8 - Groups number
int, // 9 - Stride
- int, // 10 - Batch
- format, // 11 - Input data format
- std::string>; // 12 - Implementation name
+ int, // 10 - Batch
+ bool, // 11 - Zero points for activations
+ bool, // 12 - Zero points for weights
+ bool, // 13 - Compensation
+ format, // 14 - Input data format
+ std::string>; // 15 - Implementation name
using TestParamType_general_convolution_gpu = ::testing::tuple< int, // 0 - Input X size
int, // 1 - Input Y size
"_groups" + std::to_string(testing::get<8>(param_info.param)) +
"_stride" + std::to_string(testing::get<9>(param_info.param)) +
"_batch" + std::to_string(testing::get<10>(param_info.param)) +
- "_format" + std::to_string(testing::get<11>(param_info.param));
+ "_data_zp" + std::to_string(testing::get<11>(param_info.param)) +
+ "_weights_zp" + std::to_string(testing::get<12>(param_info.param)) +
+ "_comp" + std::to_string(testing::get<13>(param_info.param)) +
+ "_format" + std::to_string(testing::get<14>(param_info.param));
- if (testing::get<12>(param_info.param) != "") {
- res += "_impl_" + testing::get<12>(param_info.param);
+ if (testing::get<15>(param_info.param) != "") {
+ res += "_impl_" + testing::get<15>(param_info.param);
}
return res;
::testing::Values(
// Input X size, Input Y size, Input Z size, Input features, Output features,
// Kernel size X, Kernel size Y, Kernel size Z, Groups number, Stride, Batch,
+ // Activation zero points, Weights zero points, Compensation,
// Input data format, Implementation name
// Format: b_fs_yx_fsv4
- TestParamType_grouped_convolution_gpu(4, 4, 1, 16, 17, 3, 3, 1, 1, 1, 1, format::b_fs_yx_fsv4, ""),
- TestParamType_grouped_convolution_gpu(4, 4, 1, 16, 16, 3, 3, 1, 4, 1, 1, format::b_fs_yx_fsv4, ""),
- TestParamType_grouped_convolution_gpu(4, 4, 1, 8, 4, 2, 2, 1, 2, 1, 4, format::b_fs_yx_fsv4, ""),
- TestParamType_grouped_convolution_gpu(8, 8, 1, 16, 16, 4, 4, 1, 4, 1, 1, format::b_fs_yx_fsv4, ""),
- TestParamType_grouped_convolution_gpu(17, 17, 1, 32, 96, 3, 3, 1, 2, 2, 2, format::b_fs_yx_fsv4, ""),
- TestParamType_grouped_convolution_gpu(16, 16, 1, 8, 48, 2, 2, 1, 2, 2, 1, format::b_fs_yx_fsv4, ""),
- TestParamType_grouped_convolution_gpu(3, 3, 1, 48, 96, 2, 2, 1, 2, 8, 1, format::b_fs_yx_fsv4, ""),
- TestParamType_grouped_convolution_gpu(6, 6, 1, 8, 26, 3, 3, 1, 2, 4, 1, format::b_fs_yx_fsv4, ""),
+ TestParamType_grouped_convolution_gpu(4, 4, 1, 16, 17, 3, 3, 1, 1, 1, 1, false, false, false, format::b_fs_yx_fsv4, ""),
+ TestParamType_grouped_convolution_gpu(4, 4, 1, 16, 16, 3, 3, 1, 4, 1, 1, false, false, false, format::b_fs_yx_fsv4, ""),
+ TestParamType_grouped_convolution_gpu(4, 4, 1, 8, 4, 2, 2, 1, 2, 1, 4, false, false, false, format::b_fs_yx_fsv4, ""),
+ TestParamType_grouped_convolution_gpu(8, 8, 1, 16, 16, 4, 4, 1, 4, 1, 1, false, false, false, format::b_fs_yx_fsv4, ""),
+ TestParamType_grouped_convolution_gpu(17, 17, 1, 32, 96, 3, 3, 1, 2, 2, 2, false, false, false, format::b_fs_yx_fsv4, ""),
+ TestParamType_grouped_convolution_gpu(16, 16, 1, 8, 48, 2, 2, 1, 2, 2, 1, false, false, false, format::b_fs_yx_fsv4, ""),
+ TestParamType_grouped_convolution_gpu(3, 3, 1, 48, 96, 2, 2, 1, 2, 8, 1, false, false, false, format::b_fs_yx_fsv4, ""),
+ TestParamType_grouped_convolution_gpu(6, 6, 1, 8, 26, 3, 3, 1, 2, 4, 1, false, false, false, format::b_fs_yx_fsv4, ""),
// Format: b_fs_yx_fsv16
- TestParamType_grouped_convolution_gpu(12, 12, 1, 96, 96, 3, 3, 1, 32, 1, 1, format::b_fs_yx_fsv16, ""),
- TestParamType_grouped_convolution_gpu(4, 4, 1, 8, 16, 3, 3, 1, 2, 1, 1, format::b_fs_yx_fsv16, ""),
- TestParamType_grouped_convolution_gpu(7, 7, 1, 8, 4, 3, 3, 1, 4, 1, 1, format::b_fs_yx_fsv16, ""),
- TestParamType_grouped_convolution_gpu(5, 5, 1, 34, 12, 3, 3, 1, 2, 1, 1, format::b_fs_yx_fsv16, ""),
- TestParamType_grouped_convolution_gpu(8, 8, 1, 34, 24, 3, 3, 1, 2, 1, 1, format::b_fs_yx_fsv16, ""),
- TestParamType_grouped_convolution_gpu(2, 2, 1, 12, 12, 3, 3, 1, 4, 1, 1, format::b_fs_yx_fsv16, ""),
- TestParamType_grouped_convolution_gpu(3, 3, 1, 8, 8, 3, 3, 1, 2, 1, 1, format::b_fs_yx_fsv16, ""),
- TestParamType_grouped_convolution_gpu(4, 4, 1, 8, 4, 2, 2, 1, 2, 2, 4, format::b_fs_yx_fsv16, ""),
- TestParamType_grouped_convolution_gpu(4, 4, 1, 16, 17, 3, 3, 1, 1, 1, 1, format::b_fs_yx_fsv16, ""),
- TestParamType_grouped_convolution_gpu(4, 4, 1, 16, 16, 3, 3, 1, 4, 1, 1, format::b_fs_yx_fsv16, ""),
- TestParamType_grouped_convolution_gpu(4, 4, 1, 8, 4, 2, 2, 1, 2, 1, 4, format::b_fs_yx_fsv16, ""),
- TestParamType_grouped_convolution_gpu(8, 8, 1, 16, 16, 4, 4, 1, 4, 1, 1, format::b_fs_yx_fsv16, ""),
- TestParamType_grouped_convolution_gpu(17, 17, 1, 32, 96, 3, 3, 1, 2, 2, 2, format::b_fs_yx_fsv16, ""),
- TestParamType_grouped_convolution_gpu(16, 16, 1, 8, 48, 2, 2, 1, 2, 2, 1, format::b_fs_yx_fsv16, ""),
- TestParamType_grouped_convolution_gpu(3, 3, 1, 48, 96, 2, 2, 1, 2, 8, 1, format::b_fs_yx_fsv16, ""),
- TestParamType_grouped_convolution_gpu(6, 6, 1, 8, 26, 3, 3, 1, 2, 4, 1, format::b_fs_yx_fsv16, ""),
+ TestParamType_grouped_convolution_gpu(12, 12, 1, 96, 96, 3, 3, 1, 32, 1, 1, true, true, true, format::b_fs_yx_fsv16, ""),
+ TestParamType_grouped_convolution_gpu(4, 4, 1, 8, 16, 3, 3, 1, 2, 1, 1, true, true, true, format::b_fs_yx_fsv16, ""),
+ TestParamType_grouped_convolution_gpu(7, 7, 1, 8, 4, 3, 3, 1, 4, 1, 1, true, true, true, format::b_fs_yx_fsv16, ""),
+ TestParamType_grouped_convolution_gpu(5, 5, 1, 34, 12, 3, 3, 1, 2, 1, 1, true, true, true, format::b_fs_yx_fsv16, ""),
+ TestParamType_grouped_convolution_gpu(8, 8, 1, 34, 24, 3, 3, 1, 2, 1, 1, true, true, true, format::b_fs_yx_fsv16, ""),
+ TestParamType_grouped_convolution_gpu(2, 2, 1, 12, 12, 3, 3, 1, 4, 1, 1, true, true, true, format::b_fs_yx_fsv16, ""),
+ TestParamType_grouped_convolution_gpu(3, 3, 1, 8, 8, 3, 3, 1, 2, 1, 1, true, true, true, format::b_fs_yx_fsv16, ""),
+ TestParamType_grouped_convolution_gpu(4, 4, 1, 8, 4, 2, 2, 1, 2, 2, 4, true, true, true, format::b_fs_yx_fsv16, ""),
+ TestParamType_grouped_convolution_gpu(4, 4, 1, 16, 17, 3, 3, 1, 1, 1, 1, true, true, true, format::b_fs_yx_fsv16, ""),
+ TestParamType_grouped_convolution_gpu(4, 4, 1, 16, 16, 3, 3, 1, 4, 1, 1, true, true, true, format::b_fs_yx_fsv16, ""),
+ TestParamType_grouped_convolution_gpu(4, 4, 1, 8, 4, 2, 2, 1, 2, 1, 4, true, true, true, format::b_fs_yx_fsv16, ""),
+ TestParamType_grouped_convolution_gpu(8, 8, 1, 16, 16, 4, 4, 1, 4, 1, 1, true, true, true, format::b_fs_yx_fsv16, ""),
+ TestParamType_grouped_convolution_gpu(17, 17, 1, 32, 96, 3, 3, 1, 2, 2, 2, true, true, true, format::b_fs_yx_fsv16, ""),
+ TestParamType_grouped_convolution_gpu(16, 16, 1, 8, 48, 2, 2, 1, 2, 2, 1, true, true, true, format::b_fs_yx_fsv16, ""),
+ TestParamType_grouped_convolution_gpu(3, 3, 1, 48, 96, 2, 2, 1, 2, 8, 1, true, true, true, format::b_fs_yx_fsv16, ""),
+ TestParamType_grouped_convolution_gpu(6, 6, 1, 8, 26, 3, 3, 1, 2, 4, 1, true, true, true, format::b_fs_yx_fsv16, ""),
// Format: b_fs_zyx_fsv16
- TestParamType_grouped_convolution_gpu(4, 4, 4, 16, 17, 3, 3, 3, 1, 1, 1, format::b_fs_zyx_fsv16, ""),
- TestParamType_grouped_convolution_gpu(4, 4, 4, 16, 16, 3, 3, 3, 4, 1, 1, format::b_fs_zyx_fsv16, ""),
- TestParamType_grouped_convolution_gpu(4, 4, 4, 8, 4, 2, 2, 2, 2, 1, 4, format::b_fs_zyx_fsv16, ""),
- TestParamType_grouped_convolution_gpu(8, 8, 8, 16, 16, 4, 4, 4, 4, 1, 1, format::b_fs_zyx_fsv16, ""),
- TestParamType_grouped_convolution_gpu(17, 17, 17, 32, 96, 3, 3, 3, 2, 2, 2, format::b_fs_zyx_fsv16, ""),
- TestParamType_grouped_convolution_gpu(16, 16, 16, 8, 48, 2, 2, 2, 2, 2, 1, format::b_fs_zyx_fsv16, ""),
- TestParamType_grouped_convolution_gpu(3, 3, 3, 48, 96, 2, 2, 2, 2, 8, 1, format::b_fs_zyx_fsv16, ""),
- TestParamType_grouped_convolution_gpu(6, 6, 6, 8, 26, 3, 3, 3, 2, 4, 1, format::b_fs_zyx_fsv16, ""),
- TestParamType_grouped_convolution_gpu(7, 5, 3, 51, 99, 3, 3, 3, 3, 1, 1, format::b_fs_zyx_fsv16, ""),
- TestParamType_grouped_convolution_gpu(8, 6, 4, 32, 64, 2, 2, 2, 2, 1, 1, format::b_fs_zyx_fsv16, ""),
- TestParamType_grouped_convolution_gpu(33, 6, 4, 16, 32, 4, 3, 2, 2, 1, 1, format::b_fs_zyx_fsv16, ""),
- TestParamType_grouped_convolution_gpu(33, 1, 1, 30, 62, 1, 1, 1, 2, 1, 1, format::b_fs_zyx_fsv16, ""),
- TestParamType_grouped_convolution_gpu(80, 1, 1, 48, 96, 33, 1, 1, 2, 8, 1, format::b_fs_zyx_fsv16, ""),
- TestParamType_grouped_convolution_gpu(2, 1, 1, 18, 32, 3, 1, 1, 2, 2, 1, format::b_fs_zyx_fsv16, ""),
- TestParamType_grouped_convolution_gpu(3, 1, 5, 196, 252, 3, 1, 3, 4, 1, 1, format::b_fs_zyx_fsv16, ""),
- TestParamType_grouped_convolution_gpu(4, 1, 6, 256, 256, 2, 1, 2, 4, 1, 1, format::b_fs_zyx_fsv16, ""),
- TestParamType_grouped_convolution_gpu(4, 1, 6, 256, 512, 2, 1, 3, 16, 1, 1, format::b_fs_zyx_fsv16, ""),
- TestParamType_grouped_convolution_gpu(1, 3, 1, 18, 2, 1, 3, 1, 2, 1, 1, format::b_fs_zyx_fsv16, ""),
- TestParamType_grouped_convolution_gpu(2, 3, 4, 3, 18, 3, 3, 3, 1, 1, 1, format::b_fs_zyx_fsv16, "convolution_gpu_mmad_bfyx_to_b_fs_yx_fsv32"),
- TestParamType_grouped_convolution_gpu(79, 224, 224, 3, 64, 3, 3, 3, 1, 2, 1, format::b_fs_zyx_fsv16, "convolution_gpu_mmad_bfyx_to_b_fs_yx_fsv32")
+ TestParamType_grouped_convolution_gpu(7, 5, 3, 51, 99, 3, 3, 3, 3, 1, 1, true, true, true, format::b_fs_zyx_fsv16, ""),
+ TestParamType_grouped_convolution_gpu(8, 6, 4, 32, 64, 2, 2, 2, 2, 1, 1, true, true, true, format::b_fs_zyx_fsv16, ""),
+ TestParamType_grouped_convolution_gpu(33, 6, 4, 16, 32, 4, 3, 2, 2, 1, 1, true, true, true, format::b_fs_zyx_fsv16, ""),
+ TestParamType_grouped_convolution_gpu(33, 1, 1, 30, 62, 1, 1, 1, 2, 1, 1, true, true, true, format::b_fs_zyx_fsv16, ""),
+ TestParamType_grouped_convolution_gpu(2, 1, 1, 18, 32, 3, 1, 1, 2, 2, 1, true, true, true, format::b_fs_zyx_fsv16, ""),
+ TestParamType_grouped_convolution_gpu(80, 1, 1, 48, 96, 33, 1, 1, 2, 8, 1, false, false, false, format::b_fs_zyx_fsv16, ""),
+ TestParamType_grouped_convolution_gpu(80, 1, 1, 48, 96, 33, 1, 1, 2, 8, 1, false, true, false, format::b_fs_zyx_fsv16, ""),
+ TestParamType_grouped_convolution_gpu(80, 1, 1, 48, 96, 33, 1, 1, 2, 8, 1, true, false, false, format::b_fs_zyx_fsv16, ""),
+ TestParamType_grouped_convolution_gpu(80, 1, 1, 48, 96, 33, 1, 1, 2, 8, 1, true, true, false, format::b_fs_zyx_fsv16, ""),
+ TestParamType_grouped_convolution_gpu(80, 1, 1, 48, 96, 33, 1, 1, 2, 8, 1, true, false, true, format::b_fs_zyx_fsv16, ""),
+ TestParamType_grouped_convolution_gpu(80, 1, 1, 48, 96, 33, 1, 1, 2, 8, 1, true, true, true, format::b_fs_zyx_fsv16, ""),
+ TestParamType_grouped_convolution_gpu(3, 1, 5, 196, 252, 3, 1, 3, 4, 1, 1, false, false, false, format::b_fs_zyx_fsv16, ""),
+ TestParamType_grouped_convolution_gpu(3, 1, 5, 196, 252, 3, 1, 3, 4, 1, 1, false, true, false, format::b_fs_zyx_fsv16, ""),
+ TestParamType_grouped_convolution_gpu(3, 1, 5, 196, 252, 3, 1, 3, 4, 1, 1, true, false, false, format::b_fs_zyx_fsv16, ""),
+ TestParamType_grouped_convolution_gpu(3, 1, 5, 196, 252, 3, 1, 3, 4, 1, 1, true, true, false, format::b_fs_zyx_fsv16, ""),
+ TestParamType_grouped_convolution_gpu(3, 1, 5, 196, 252, 3, 1, 3, 4, 1, 1, true, false, true, format::b_fs_zyx_fsv16, ""),
+ TestParamType_grouped_convolution_gpu(3, 1, 5, 196, 252, 3, 1, 3, 4, 1, 1, true, true, true, format::b_fs_zyx_fsv16, ""),
+ TestParamType_grouped_convolution_gpu(4, 1, 6, 256, 256, 2, 1, 2, 4, 1, 1, true, true, true, format::b_fs_zyx_fsv16, ""),
+ TestParamType_grouped_convolution_gpu(4, 1, 6, 256, 512, 2, 1, 3, 16, 1, 1, true, true, true, format::b_fs_zyx_fsv16, ""),
+ TestParamType_grouped_convolution_gpu(1, 3, 1, 18, 2, 1, 3, 1, 2, 1, 1, true, true, true, format::b_fs_zyx_fsv16, ""),
+ TestParamType_grouped_convolution_gpu(2, 3, 4, 3, 18, 3, 3, 3, 1, 1, 1, false, false, false, format::b_fs_zyx_fsv16, "convolution_gpu_mmad_bfyx_to_b_fs_yx_fsv32"),
+ TestParamType_grouped_convolution_gpu(79, 224, 224, 3, 64, 3, 3, 3, 1, 2, 1, false, false, false, format::b_fs_zyx_fsv16, "convolution_gpu_mmad_bfyx_to_b_fs_yx_fsv32")
),
convolution_grouped_gpu::PrintToStringParamName);
groups = testing::get<8>(GetParam()),
stride = testing::get<9>(GetParam()),
batch_num = testing::get<10>(GetParam()),
- output_padding = 0,
input_offset_z = (filter_z - 1) / 2,
input_offset_y = (filter_y - 1) / 2,
input_offset_x = (filter_x - 1) / 2;
- auto input_data_format = testing::get<11>(GetParam());
- auto impl_name = testing::get<12>(GetParam());
+ const auto has_input_zp = testing::get<11>(GetParam());
+ const auto has_weights_zp = testing::get<12>(GetParam());
+ const auto has_comp = testing::get<13>(GetParam());
+ const auto input_data_format = testing::get<14>(GetParam());
+ const auto impl_name = testing::get<15>(GetParam());
+
+ // can use compensation term only if data zero points are available
+ ASSERT_TRUE(has_input_zp || !has_comp);
auto num_in_spatial_dims = input_data_format.spatial_num();
auto input_size = tensor(batch(batch_num), feature(input_f), spatial(input_x, input_y, input_z));
- auto input_rnd = generate_random_5d<uint8_t>(batch_num, input_f, input_z, input_y, input_x, 0, 255);
+ auto input_rnd = generate_random_5d<int8_t>(batch_num, input_f, input_z, input_y, input_x, -127, 127);
- auto input_lay = layout(data_types::u8, format::bfzyx, input_size);
+ auto input_lay = layout(data_types::i8, format::bfzyx, input_size);
if (num_in_spatial_dims == 2) {
- input_lay = layout(data_types::u8, format::bfyx, input_size);
+ input_lay = layout(data_types::i8, format::bfyx, input_size);
}
- std::vector<uint8_t> input_flat(input_lay.get_linear_size());
+ std::vector<int8_t> input_flat(input_lay.get_linear_size());
for (int b = 0; b < batch_num; b++)
for (int f = 0; f < input_f; f++)
for (int z = 0; z < input_z; z++)
auto input = memory::allocate(engine, input_lay);
set_values(input, input_flat);
+ auto input_zp_rnd = std::vector<int8_t>(input_f); // value-initialized to zeros; stays that way when has_input_zp is false, so the reference convolution sees zero activation zero points
+ auto input_zp_prim_name = std::vector<primitive_id>(0); // empty id list is what gets passed to convolution(...) when there are no activation zero points
+ if (has_input_zp) {
+ input_zp_rnd = generate_random_1d<int8_t>(input_f, -127, 127); // input_f values, one per input feature
+ input_zp_prim_name = { "input_zp" };
+ }
+ auto input_zp_lay = layout(data_types::i8, format::bfyx, tensor(feature(input_f)));
+ auto input_zp = memory::allocate(engine, input_zp_lay); // allocated unconditionally; only added to the topology when has_input_zp is true
+ set_values(input_zp, input_zp_rnd);
+
auto weights_size = tensor(group(groups), batch(output_f / groups), feature(input_f / groups), spatial(filter_x, filter_y, filter_z));
VVVVVVF<int8_t> weights_rnd = generate_random_6d<int8_t>(groups, output_f / groups, input_f / groups, filter_z, filter_y, filter_x, -127, 127);
auto weights = memory::allocate(engine, weights_lay);
set_values(weights, weights_flat);
+ auto weights_zp_rnd = std::vector<int8_t>(output_f); // value-initialized to zeros; stays that way when has_weights_zp is false
+ auto weights_zp_prim_name = std::vector<primitive_id>(0); // empty id list is what gets passed to convolution(...) when there are no weights zero points
+ if (has_weights_zp) {
+ weights_zp_rnd = generate_random_1d<int8_t>(output_f, -127, 127); // output_f values, one per output feature
+ weights_zp_prim_name = { "weights_zp" };
+ }
+ auto weights_zp_lay = layout(data_types::i8, format::bfyx, tensor(batch(output_f)));
+ auto weights_zp = memory::allocate(engine, weights_zp_lay); // allocated unconditionally; only added to the topology when has_weights_zp is true
+ set_values(weights_zp, weights_zp_rnd);
+
VVVVVF<float> expected_result(batch_num, VVVVF<float>(output_f));
// Calculate reference values without bias
int f_begin = gi * input_f / groups;
int f_end = gi * input_f / groups + input_f / groups;
- expected_result[bi][ofi + gi * output_f / groups] = reference_convolve<uint8_t, float, int8_t>(
- input_rnd[bi], weights_rnd[gi][ofi], // input, weights
- stride, stride, stride, // strides
- 0, // bias
- 1, 1, 1, // dilation
- input_offset_z, input_offset_y, input_offset_x, // input padding
- 0, 0, 0, // output_padding
- f_begin, f_end, // f_begin, f_end
- false, // depthwise
- grouped); // grouped
+ expected_result[bi][ofi + gi * output_f / groups] = reference_convolve<int8_t, float, int8_t>(
+ input_rnd[bi], weights_rnd[gi][ofi], // input, weights
+ stride, stride, stride, // strides
+ 0, // bias
+ 1, 1, 1, // dilation
+ input_offset_z, input_offset_y, input_offset_x, // input padding
+ 0, 0, 0, // output_padding
+ f_begin, f_end, // f_begin, f_end
+ false, // depthwise
+ grouped, // grouped
+ input_zp_rnd, // input zero points
+ weights_zp_rnd[gi * (int)weights_rnd[0].size() + ofi]); // weights zero points
+ }
+
+ auto ref_conv_out_size = tensor(batch(expected_result.size()),
+ feature(expected_result[0].size()),
+ spatial(expected_result[0][0][0][0].size(),
+ expected_result[0][0][0].size(),
+ expected_result[0][0].size()));
+
+ auto comp_val = std::vector<float>(output_f); // value-initialized to zeros; left untouched when has_comp is false
+ auto comp_prim_name = std::vector<primitive_id>(0);
+ if (has_comp) {
+ for (int g = 0; g < groups; g++) {
+ for (int oc = 0; oc < output_f / groups; oc++) {
+ float c = 0.f; // accumulates the zero-point contribution for output channel (g, oc)
+ for (int ic = 0; ic < input_f / groups; ic++) {
+ for (int zi = 0; zi < filter_z; zi++) {
+ for (int yi = 0; yi < filter_y; yi++) {
+ for (int xi = 0; xi < filter_x; xi++) {
+ int azp_idx = g*(input_f / groups) + ic; // absolute input-feature index of (g, ic)
+ int wzp_idx = g*(output_f / groups) + oc; // absolute output-feature index of (g, oc)
+ c += weights_rnd[g][oc][ic][zi][yi][xi] * input_zp_rnd[azp_idx];
+ if (has_weights_zp) {
+ c -= input_zp_rnd[azp_idx] * weights_zp_rnd[wzp_idx];
+ }
+ }
+ }
+ }
+ }
+
+ comp_val[g*(output_f / groups) + oc] = -c; // stored negated: -(sum(w * azp) - sum(azp * wzp)) over the group's input channels and every filter tap
}
+ }
+ comp_prim_name = { "compensation" };
+ }
+ auto comp_lay = layout(data_types::f32, format::bfyx, tensor(batch(output_f)));
+ auto comp = memory::allocate(engine, comp_lay); // allocated unconditionally; only added to the topology when has_comp is true
+ set_values(comp, comp_val);
+
+ auto stride_tensor = tensor(batch(1), feature(1), spatial(stride, stride, stride, 1));
+ if (num_in_spatial_dims == 2) {
+ stride_tensor = tensor(batch(1), feature(1), spatial(stride, stride, 1, 1));
+ }
topology topology(input_layout("input", input.get_layout()),
data("weights", weights),
- reorder("input_fsv", "input", {data_types::u8, input_data_format, input_size}),
+ reorder("input_fsv", "input", {data_types::i8, input_data_format, input_size}),
convolution("conv",
"input_fsv",
{"weights"},
+ std::vector<primitive_id>(0),
+ weights_zp_prim_name,
+ input_zp_prim_name,
+ comp_prim_name,
groups,
- tensor(batch(1), feature(1), spatial(stride, stride, stride, 1)),
+ data_types::f32,
+ stride_tensor,
tensor(batch(0), feature(0), spatial(-input_offset_x, -input_offset_y, -input_offset_z, 0)),
tensor(batch(1), feature(1), spatial(1, 1, 1, 1)),
- padding({0, 0, output_padding, output_padding, output_padding}, 0.f)));
+ ref_conv_out_size),
+ reorder("out", "conv", {data_types::f32, format::bfzyx, ref_conv_out_size}));
+
+ if (has_input_zp)
+ topology.add(data(input_zp_prim_name[0], input_zp));
+
+ if (has_weights_zp)
+ topology.add(data(weights_zp_prim_name[0], weights_zp));
+
+ if (has_comp)
+ topology.add(data(comp_prim_name[0], comp));
build_options options;
options.set_option(build_option::optimize_data(true));
implementation_desc conv_impl = {input_data_format, impl_name};
options.set_option(build_option::force_implementations({{"conv", conv_impl}}));
- network network(engine, topology, options);
+ cldnn::network network(engine, topology, options);
network.set_input_data("input", input);
network.execute();
.smoke_test_params(format::b_fs_yx_fsv32, false, true)
.smoke_test_params(format::b_fs_yx_fsv32, true, false)
.smoke_test_params(format::b_fs_yx_fsv32, false, false, true)
- .smoke_test_params(format::b_fs_yx_fsv16, false, false, true)
.smoke_test_params(format::b_fs_yx_fsv16)
+ .smoke_test_params(format::b_fs_yx_fsv16, true, true)
+ .smoke_test_params(format::b_fs_yx_fsv16, false, true)
+ .smoke_test_params(format::b_fs_yx_fsv16, true, false)
+ .smoke_test_params(format::b_fs_yx_fsv16, false, false, true)
.bs_test_params(format::bs_fs_yx_bsv16_fsv16)
),
to_string_convolution_all_params