return 1;
}
+size_t PoolingKernel_b_fs_yx_fsv16::GetSimdSize(const pooling_params& params) const {
+ auto& out = params.output;
+ // Use smaller simd size in case of global pooling and small channels count to have more threads
+ if (out.X().v == 1 && out.Y().v == 1 && out.Feature().v < 64)
+ return 8;
+ else
+ return 16;
+}
+
PoolingKernelBase::DispatchData PoolingKernel_b_fs_yx_fsv16::SetDefault(const pooling_params& params) const {
DispatchData kd = PoolingKernelBase::SetDefault(params);
const auto& out = params.output;
- const size_t alignment = 16;
+ const size_t alignment = GetSimdSize(params);
size_t x_block_size = GetBlockSize(params);
auto x = out.X().v;
auto y = out.Y().v;
}
JitConstants PoolingKernel_b_fs_yx_fsv16::GetJitConstants(const pooling_params& params, DispatchData runInfo) const {
- const size_t alignment = 16;
+ const size_t alignment = GetSimdSize(params);
size_t x_block_size = GetBlockSize(params);
auto input = params.inputs[0];
auto output = params.output;
size_t input_line_size = params.poolStride.x * (x_block_size - 1) + params.poolSize.x;
+ auto acc_type = GetAccumulatorType(params);
+ jit.Merge(MakeTypeJitConstants(acc_type, "ACCUMULATOR"));
+
+ auto can_preload_full_line = [&]() -> bool {
+ const float max_reg_bytes = 128 * 32 * 0.95f;
+ const size_t line_bytes = input_line_size * BytesPerElement(input.GetDType());
+ const size_t acc_bytes = x_block_size * BytesPerElement(acc_type);
+
+ const float req_bytes = static_cast<float>((line_bytes + acc_bytes) * alignment);
+
+ return req_bytes < max_reg_bytes;
+ };
+
+ jit.AddConstant(MakeJitConstant("CAN_PRELOAD_FULL_LINE", can_preload_full_line()));
jit.AddConstant(MakeJitConstant("PADDED_INPUT", params.inputs[0].X().pad.Total() != 0));
jit.AddConstant(MakeJitConstant("X_BLOCK_SIZE", x_block_size));
jit.AddConstant(MakeJitConstant("INPUT_LINE_SIZE", input_line_size));
jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", alignment));
jit.AddConstant(MakeJitConstant("X_BLOCKS", CeilDiv(output.X().v, x_block_size)));
jit.Merge(MakeTypeJitConstants(GetActivationType(params), "ACTIVATION"));
- jit.Merge(MakeTypeJitConstants(GetAccumulatorType(params), "ACCUMULATOR"));
if (params.output.Feature().v % 16 != 0) {
jit.AddConstant(MakeJitConstant("OUTPUT_LEFTOVERS", 1));
if (!params.fused_ops.empty()) {
auto input_dt = GetActivationType(params);
FusedOpsConfiguration conf_vec = {"_VEC",
- {"b", "(f_block*16)", "y", "x"},
+ {"b", "(f_block*FEATURE_SLICE_SIZE + f_val*SUB_GROUP_SIZE)", "y", "x"},
"pool_result",
input_dt,
x_block_size,
IndexType::TENSOR_COORD,
Tensor::DataChannelName::X};
FusedOpsConfiguration conf_scalar = {"_SCALAR",
- {"b", "(f_block*16)", "y", "(x+i)"},
+ {"b", "(f_block*FEATURE_SLICE_SIZE + f_val*SUB_GROUP_SIZE)", "y", "(x+i)"},
"pool_result[i]",
input_dt,
1,
#define INIT_VAL ACCUMULATOR_VAL_ZERO
#endif
-__attribute__((intel_reqd_sub_group_size(16)))
+__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
KERNEL(pooling_gpu_blocked)(
const __global INPUT0_TYPE* input,
__global OUTPUT_TYPE* output
)
{
const int lid = get_sub_group_local_id();
+#if SUB_GROUP_SIZE == 16
const int f_block = get_group_id(1);
+ const int f_val = 0;
+#else
+ const int f_block = (uint)get_group_id(1) / (FEATURE_SLICE_SIZE / SUB_GROUP_SIZE);
+ const int f_val = (uint)get_group_id(1) % (FEATURE_SLICE_SIZE / SUB_GROUP_SIZE);
+#endif
const int b = get_global_id(2);
const int xy = get_global_id(0);
const uint input_offset = b * input_b_pitch +
(f_block + input_fs_pad_before) * input_fs_pitch +
(INPUT0_PAD_BEFORE_SIZE_Y + input_y) * input_y_pitch +
- (INPUT0_PAD_BEFORE_SIZE_X + input_x) * input_x_pitch;
+ (INPUT0_PAD_BEFORE_SIZE_X + input_x) * input_x_pitch +
+ f_val * SUB_GROUP_SIZE;
// Output offset calculations:
const uint output_x_pitch = FEATURE_SLICE_SIZE;
const uint output_offset = b * output_b_pitch +
(f_block + output_fs_pad_before) * output_fs_pitch +
(y + OUTPUT_PAD_BEFORE_SIZE_Y) * output_y_pitch +
- (x + OUTPUT_PAD_BEFORE_SIZE_X) * output_x_pitch;
+ (x + OUTPUT_PAD_BEFORE_SIZE_X) * output_x_pitch +
+ f_val * SUB_GROUP_SIZE;
ACCUMULATOR_VAR_TYPE dst = (ACCUMULATOR_VAR_TYPE)INIT_VAL;
if (input_y + kh < 0 || input_y + kh >= INPUT0_SIZE_Y)
continue;
+#if CAN_PRELOAD_FULL_LINE
INPUT0_TYPE line_cache[INPUT_LINE_SIZE];
for (int i = 0; i < INPUT_LINE_SIZE; i++) {
if ((input_x + i) >= 0 && (input_x + i) < INPUT0_SIZE_X)
line_cache[i] = DT_INPUT_BLOCK_READ(input, input_offset + kh*input_y_pitch + i*input_x_pitch);
else
- #if defined MAX_POOLING
+ #if defined MAX_POOLING
line_cache[i] = INPUT0_VAL_MIN;
#elif defined AVG_POOLING
line_cache[i] = INPUT0_VAL_ZERO;
dst += src;
#endif
}
- }
+#else // CAN_PRELOAD_FULL_LINE
+ // TODO: try partial preload
+ for (int kw = 0; kw < POOL_SIZE_X; kw++)
+ {
+ INPUT_VAR_TYPE src;
+#if X_BLOCK_SIZE > 1
+ for (int i = 0; i < X_BLOCK_SIZE; i++) {
+ if ((input_x + kw + STRIDE_SIZE_X*i) >= 0 && (input_x + kw + STRIDE_SIZE_X*i) < INPUT0_SIZE_X)
+ src[i] = DT_INPUT_BLOCK_READ(input, input_offset + kh*input_y_pitch + (kw + STRIDE_SIZE_X*i)*input_x_pitch);
+ else
+ #if defined MAX_POOLING
+ src[i] = INPUT0_VAL_MIN;
+ #elif defined AVG_POOLING
+ src[i] = INPUT0_VAL_ZERO;
+ #endif
+ }
+#else
+ src = DT_INPUT_BLOCK_READ(input, input_offset + kh*input_y_pitch + kw*input_x_pitch);
+#endif
+#if defined MAX_POOLING
+ dst = ACCUMULATOR_MAX_FUNC(dst, src);
+#elif defined AVG_POOLING
+ dst += TO_ACCUMULATOR_VAR_TYPE(src);
+#endif
+ }
+#endif // CAN_PRELOAD_FULL_LINE
+ }
ACTIVATION_VAR_TYPE pool_result;
#if defined MAX_POOLING
- pool_result = TO_ACTIVATION_VAR_TYPE(dst);
- #if !HAS_FUSED_OP
- pool_result = ACTIVATION(pool_result, ACTIVATION_PARAMS);
- #endif
+ pool_result = TO_ACTIVATION_VAR_TYPE(dst);
#elif defined AVG_POOLING && (defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER))
- pool_result = TO_ACTIVATION_VAR_TYPE(dst*scale);
- #if !HAS_FUSED_OP
- pool_result = ACTIVATION(pool_result, ACTIVATION_PARAMS);
- #endif
+ pool_result = TO_ACTIVATION_VAR_TYPE(dst*scale);
#elif defined AVG_POOLING
- pool_result = TO_ACTIVATION_VAR_TYPE(dst/(POOL_SIZE_X*POOL_SIZE_Y));
- #if !HAS_FUSED_OP
- pool_result = ACTIVATION(pool_result, ACTIVATION_PARAMS);
- #endif
+ pool_result = TO_ACTIVATION_VAR_TYPE(dst/(POOL_SIZE_X*POOL_SIZE_Y));
+#endif
+
+#if !HAS_FUSED_OPS
+ pool_result = ACTIVATION(pool_result, ACTIVATION_PARAMS);
#endif
OUTPUT_VAR_TYPE final_result;
#if OUTPUT_LEFTOVERS
- if ((f_block+1)*FEATURE_SLICE_SIZE >= OUTPUT_FEATURE_NUM) {
+ if (f_block*FEATURE_SLICE_SIZE + (f_val + 1)*SUB_GROUP_SIZE >= OUTPUT_FEATURE_NUM) {
for (int i = 0; i < X_BLOCK_SIZE; i++) {
- if ((f_block*FEATURE_SLICE_SIZE + lid < OUTPUT_FEATURE_NUM) && (x + i) < OUTPUT_SIZE_X) {
+ if ((f_block*FEATURE_SLICE_SIZE + f_val*SUB_GROUP_SIZE + lid < OUTPUT_FEATURE_NUM) && (x + i) < OUTPUT_SIZE_X) {
#if X_BLOCK_SIZE > 1
- #if HAS_FUSED_OP
+ #if HAS_FUSED_OPS
FUSED_OPS_SCALAR;
final_result[i] = FUSED_OPS_RESULT_SCALAR;
#else
final_result = TO_OUTPUT_VAR_TYPE(pool_result);
#endif
output[output_offset + i * output_x_pitch + lid] = final_result;
-
#endif
}
}
final_result = TO_OUTPUT_VAR_TYPE(pool_result);
#endif
+#if SUB_GROUP_SIZE == FEATURE_SLICE_SIZE
#if X_BLOCK_SIZE == 8
DT_OUTPUT_BLOCK_WRITE8(output, output_offset, final_result);
#elif X_BLOCK_SIZE == 4
#elif X_BLOCK_SIZE == 1
DT_OUTPUT_BLOCK_WRITE(output, output_offset, final_result);
#endif
+#else
+ #if X_BLOCK_SIZE > 1
+ __attribute__((opencl_unroll_hint(X_BLOCK_SIZE)))
+ for (int i = 0; i < X_BLOCK_SIZE; i++) {
+ DT_OUTPUT_BLOCK_WRITE(output, output_offset + i * output_x_pitch, final_result[i]);
+ }
+ #else
+ DT_OUTPUT_BLOCK_WRITE(output, output_offset, final_result);
+ #endif
+#endif
}
else
{
- const int x_tail = OUTPUT_SIZE_X - x;
+ const int x_tail = OUTPUT_SIZE_X % X_BLOCK_SIZE;
for (int i = 0; i < x_tail; i++){
#if X_BLOCK_SIZE > 1
#if HAS_FUSED_OPS