#define FEATURE_SLICE_SIZE 16
+// OUTPUT_X_BLOCK_SIZE must be one of 2, 4 or 8: CAT pastes it onto UNIT_BLOCK_WRITE
+// to form UNIT_BLOCK_WRITE{2,4,8}. NOTE(review): the removed #if ladder also handled
+// OUTPUT_X_BLOCK_SIZE == 1 via plain UNIT_BLOCK_WRITE; with this macro a value of 1
+// expands to UNIT_BLOCK_WRITE1 — confirm that name is defined (or that 1 can no longer occur).
+#define UNIT_BLOCK_WRITEN(ptr, offset, val) CAT(UNIT_BLOCK_WRITE, OUTPUT_X_BLOCK_SIZE)(ptr, offset, val)
+
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
__attribute__((reqd_work_group_size(1, SUB_GROUP_SIZE, 1)))
KERNEL(convolution_bfyx_to_bfyx_f16)(
vec_t dst = UNIT_VAL_ZERO;
#endif
- UNIT_TYPE line_cache[3 * INPUT_BLOCK_SIZE];
- for (int ic = 0; ic < 3; ic++)
+ UNIT_TYPE line_cache[INPUT0_FEATURE_NUM * INPUT_BLOCK_SIZE];
+ for (int ic = 0; ic < INPUT0_FEATURE_NUM; ic++)
{
__attribute__((opencl_unroll_hint(INPUT_BLOCK_SIZE)))
for (int i = 0; i < INPUT_BLOCK_SIZE; i++)
__attribute__((opencl_unroll_hint(FILTER_SIZE_X)))
for (int kw = 0; kw < FILTER_SIZE_X; kw++)
{
- MAKE_VECTOR_TYPE(UNIT_TYPE, 2) wei = UNIT_BLOCK_READ2(weights, filter_offset +
- kh * filter_y_pitch +
- kw * filter_x_pitch);
- UNIT_TYPE wei2 = UNIT_BLOCK_READ(weights, filter_offset +
- kh * filter_y_pitch +
- kw * filter_x_pitch +
- 2 * filter_isv_pitch);
+ uint offset = filter_offset + kh * filter_y_pitch + kw * filter_x_pitch;
+
+ UNIT_TYPE wei[INPUT0_FEATURE_NUM];
+ __attribute__((opencl_unroll_hint(INPUT0_FEATURE_NUM)))
+ for (int ic = 0; ic < INPUT0_FEATURE_NUM; ic++)
+ wei[ic] = UNIT_BLOCK_READ(weights, offset + ic * filter_isv_pitch);
__attribute__((opencl_unroll_hint(OUTPUT_X_BLOCK_SIZE)))
for (int i = 0; i < OUTPUT_X_BLOCK_SIZE; i++)
const uint buf_offset = (kw*DILATION_SIZE_X + STRIDE_SIZE_X * i + (kh) * INPUT_LINE_SIZE) / SUB_GROUP_SIZE;
const uint buf_group = (kw*DILATION_SIZE_X + STRIDE_SIZE_X * i + (kh) * INPUT_LINE_SIZE) % SUB_GROUP_SIZE;
- UNIT_TYPE src0 = intel_sub_group_shuffle(line_cache[0 * INPUT_BLOCK_SIZE + buf_offset], buf_group);
- UNIT_TYPE src1 = intel_sub_group_shuffle(line_cache[1 * INPUT_BLOCK_SIZE + buf_offset], buf_group);
- UNIT_TYPE src2 = intel_sub_group_shuffle(line_cache[2 * INPUT_BLOCK_SIZE + buf_offset], buf_group);
-
- dst[i] = mad(wei[0], src0, dst[i]);
- dst[i] = mad(wei[1], src1, dst[i]);
- dst[i] = mad(wei2, src2, dst[i]);
+ UNIT_TYPE src[INPUT0_FEATURE_NUM];
+ __attribute__((opencl_unroll_hint(INPUT0_FEATURE_NUM)))
+ for (int ic = 0; ic < INPUT0_FEATURE_NUM; ic++) {
+ src[ic] = intel_sub_group_shuffle(line_cache[ic * INPUT_BLOCK_SIZE + buf_offset], buf_group);
+ dst[i] = mad(wei[ic], src[ic], dst[i]);
+ }
}
}
}
FUSED_OPS_VEC;
dst = FUSED_OPS_RESULT_VEC;
#endif
- // TODO Generalize for other block sizes
-#if OUTPUT_X_BLOCK_SIZE == 8
- UNIT_BLOCK_WRITE8(output, output_offset, dst);
-#elif OUTPUT_X_BLOCK_SIZE == 4
- UNIT_BLOCK_WRITE4(output, output_offset, dst);
-#elif OUTPUT_X_BLOCK_SIZE == 2
- UNIT_BLOCK_WRITE2(output, output_offset, dst);
-#elif OUTPUT_X_BLOCK_SIZE == 1
- UNIT_BLOCK_WRITE(output, output_offset, dst);
-#else
-# error convolution_gpu_bfyx_to_bfyx_f16.cl: Unsupported output x block size.
-#endif
+ UNIT_BLOCK_WRITEN(output, output_offset, dst);
} else {
const int x_tail = OUTPUT_SIZE_X - x;
for (int i = 0; i < x_tail; i++) {
fmt_prev == format::bfyx &&
((fmt_next == format::fs_b_yx_fsv32 && next.as<convolution>().get_primitive()->groups == 1) ||
(fmt_next == format::b_fs_yx_fsv32 && (prev_output_layout.size.feature[0] == 3 || prev_output_layout.size.feature[0] == 4)) ||
- (fmt_next == format::b_fs_yx_fsv16 && next_output_layout.size.feature[0] >= 16 &&
- (prev_output_layout.size.feature[0] == 3 || (prev_output_layout.size.feature[0] == 4 && (prev_dt == data_types::u8 || prev_dt == data_types::i8)))) ||
(fmt_next == format::bs_fs_yx_bsv16_fsv16 && next_output_layout.size.feature[0] % 16 == 0 && prev_output_layout.size.feature[0] == 3) ||
(fmt_next == format::bs_fs_yx_bsv16_fsv16 && next_output_layout.size.feature[0] >= 16 && prev_output_layout.size.feature[0] == 3 &&
(next_output_layout.data_type != data_types::i8 && next_output_layout.data_type != data_types::u8))))
return true;
if (next.is_type<convolution>() &&
+ fmt_prev == format::bfyx &&
+ fmt_next == format::b_fs_yx_fsv16 && next_output_layout.size.feature[0] >= 16 && prev_output_layout.size.feature[0] <= 4)
+ return true;
+
+ if (next.is_type<convolution>() &&
fmt_prev == format::b_fs_yx_fsv4 &&
((fmt_next == format::b_fs_yx_fsv32 && (prev_output_layout.size.feature[0] == 3 || prev_output_layout.size.feature[0] == 4)) ||
(fmt_next == format::b_fs_yx_fsv16 && next_output_layout.size.feature[0] >= 16 &&
auto required_feature_num = weak_restrictions ? feature_block_size / 2 : feature_block_size;
auto correct_in_feature = (input_layout.size.feature[0] >= required_feature_num &&
weights_layout.size.batch[0] * weights_layout.size.group[0] >= required_feature_num);
- if (!correct_in_feature && input_layout.size.feature[0] == 3 && weights_layout.size.batch[0] >= feature_block_size)
+ if (!correct_in_feature && input_layout.size.feature[0] <= 4 && weights_layout.size.batch[0] >= feature_block_size)
correct_in_feature = true;
auto depthwise = conv->groups == static_cast<uint32_t>(input_layout.size.feature[0]); // depthwise conv
auto out_features_per_group = weights_layout.size.batch[0];