From: Ilya Znamenskiy
Date: Thu, 10 Sep 2020 05:56:04 +0000 (+0300)
Subject: [IE CLDNN] Fully connected MMAD kernel optimizations (#2115)
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=3797a28e65dcaab60888d57d8e62fcc0463d83c3;p=platform%2Fupstream%2Fdldt.git

[IE CLDNN] Fully connected MMAD kernel optimizations (#2115)
---

diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_mmad.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_mmad.cpp
index ceb6dc1..b560f6e 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_mmad.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_mmad.cpp
@@ -17,10 +17,6 @@
 
 namespace kernel_selector {
-namespace {
-    static const size_t sub_group_size = 8;
-} // namespace
-
 ParamsKey FullyConnectedKernelMMAD::GetSupportedKey() const {
     ParamsKey k;
     k.EnableInputDataType(Datatype::INT8);
@@ -65,14 +61,32 @@ bool FullyConnectedKernelMMAD::Validate(const Params& params, const optional_par
     return true;
 }
 
+FullyConnectedKernelMMAD::FullyConnectedTuningData FullyConnectedKernelMMAD::SetTuningParams(const fully_connected_params& params) const {
+    FullyConnectedTuningData tuning_data;
+
+    const auto& input = params.inputs[0];
+
+    size_t feature_blocks_count = input.GetLayout() == DataLayout::bfyx && input.Feature().v % 32 != 0 ?
+                                  input.Feature().v / 32 : CeilDiv(input.Feature().v, 32);
+
+    if (feature_blocks_count)
+        while (feature_blocks_count % (tuning_data.slm_div_factor * 2) == 0 &&
+               (tuning_data.slm_div_factor * 2 <= params.engineInfo.maxWorkGroupSize / tuning_data.sub_group_size))
+            tuning_data.slm_div_factor *= 2;
+
+    tuning_data.work_group_size = tuning_data.slm_div_factor * tuning_data.sub_group_size;
+
+    return tuning_data;
+}
+
 FullyConnectedKernelMMAD::DispatchData FullyConnectedKernelMMAD::SetDefault(const fully_connected_params& params, int) const {
+    FullyConnectedTuningData tuning_data = SetTuningParams(params);
     auto runInfo = Parent::SetDefault(params);
+    const auto& output = params.output;
 
-    const auto& out = params.output;
-
-    std::vector<size_t> global = { Align(out.Feature().v, sub_group_size), out.Batch().v, 1 };
-    auto local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo);
+    std::vector<size_t> global = { Align(output.Feature().v, tuning_data.sub_group_size) * tuning_data.slm_div_factor, output.Batch().v, 1 };
+    std::vector<size_t> local = { tuning_data.work_group_size, 1, 1 };
 
     runInfo.gws0 = global[0];
     runInfo.gws1 = global[1];
@@ -87,12 +101,14 @@ FullyConnectedKernelMMAD::DispatchData FullyConnectedKernelMMAD::SetDefault(cons
 
 JitConstants FullyConnectedKernelMMAD::GetJitConstants(const fully_connected_params& params,
                                                        const DispatchData& runInfo) const {
+    FullyConnectedTuningData tuning_data = SetTuningParams(params);
+
     auto jit = Parent::GetJitConstants(params, runInfo);
 
     auto& input = params.inputs[0];
     auto& weights = params.weights;
 
-    jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", sub_group_size));
+    jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", tuning_data.sub_group_size));
     if (input.GetDims().size() == 5) {
         jit.AddConstant(MakeJitConstant("FILTER_GET_OFFSET(f)", "GET_FILTER_OS_IS_YX_ISA8_OSV8_ISV4_INDEX(FILTER, f, 0, 0, 0)"));
     } else {
@@ -137,13 +153,33 @@ JitConstants FullyConnectedKernelMMAD::GetJitConstants(const fully_connected_par
jit.AddConstant(MakeJitConstant("MMAD_INPUT_FBLOCK_PITCH", input.Feature().pitch * 32)); } + jit.AddConstant(MakeJitConstant("SLM_DIV_FACTOR", tuning_data.slm_div_factor)); + + size_t feature_blocks_count; + size_t temp_unroll_factor = 9, unroll_factor, full_unroll_factor; + if (input.GetLayout() == DataLayout::bfyx && input.Feature().v % 32 != 0) { + feature_blocks_count = input.Feature().v / 32; jit.AddConstant(MakeJitConstant("HAS_FEATURE_LEFTOVERS", true)); - jit.AddConstant(MakeJitConstant("FEATURE_BLOCKS_COUNT", input.Feature().v / 32)); } else { - jit.AddConstant(MakeJitConstant("FEATURE_BLOCKS_COUNT", CeilDiv(input.Feature().v, 32))); + feature_blocks_count = CeilDiv(input.Feature().v, 32); + } + + full_unroll_factor = feature_blocks_count / tuning_data.slm_div_factor; + + if (full_unroll_factor > 9) { + while (full_unroll_factor % temp_unroll_factor) + temp_unroll_factor--; + unroll_factor = temp_unroll_factor; + } else { + unroll_factor = full_unroll_factor; } + jit.AddConstant(MakeJitConstant("FEATURE_BLOCKS_COUNT", feature_blocks_count)); + jit.AddConstant(MakeJitConstant("UNROLL_FACTOR", unroll_factor)); + jit.AddConstant(MakeJitConstant("FULL_UNROLL_FACTOR", full_unroll_factor)); + jit.AddConstant(MakeJitConstant("WORK_GROUP_SIZE", tuning_data.work_group_size)); + jit.AddConstant(MakeJitConstant("MMAD_INPUT_SPATIAL_PITCH", input_x_pitch)); jit.AddConstant(MakeJitConstant("MMAD_INPUT_X_PITCH", input_x_pitch)); jit.AddConstant(MakeJitConstant("MMAD_INPUT_Y_PITCH", input_y_pitch)); @@ -158,7 +194,7 @@ JitConstants FullyConnectedKernelMMAD::GetJitConstants(const fully_connected_par if (!params.fused_ops.empty()) { auto input_dt = GetActivationType(params); - FusedOpsConfiguration conf = { "", {"b", "f", "0", "0"}, "dequantized", input_dt, 1 }; + FusedOpsConfiguration conf = { "", {"batch", "feature", "0", "0"}, "dequantized", input_dt, 1 }; jit.Merge(MakeFusedOpsJitConstants(params, { conf })); } @@ -180,7 +216,7 @@ KernelsData FullyConnectedKernelMMAD::GetKernelsData(const Params& params, const options, input.GetLayout(), w_layout, - FORCE_PRIORITY_9, + FORCE_PRIORITY_7, static_cast(i)); if (!kd.empty()) { res.emplace_back(kd[0]); diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_mmad.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_mmad.h index 8f906a0..704b291 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_mmad.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_mmad.h @@ -29,6 +29,12 @@ public: KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; ParamsKey GetSupportedKey() const override; + struct FullyConnectedTuningData { + const size_t sub_group_size = 8; + size_t slm_div_factor = 1; + size_t work_group_size = 1; + }; + protected: JitConstants GetJitConstants(const fully_connected_params& params, const DispatchData& kd) const override; DispatchData SetDefault(const fully_connected_params& params, int autoTuneIndex = -1) const override; @@ -38,5 +44,6 @@ protected: FusedOpType::ACTIVATION }; } bool Validate(const Params& params, const optional_params& options) const override; + FullyConnectedTuningData SetTuningParams(const fully_connected_params& params) const; }; } // namespace kernel_selector diff --git 
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fully_connected_gpu_MMAD.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fully_connected_gpu_MMAD.cl
index 43789ce..95fc65d 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fully_connected_gpu_MMAD.cl
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fully_connected_gpu_MMAD.cl
@@ -37,25 +37,35 @@ KERNEL(fully_connected_gpu_MMAD)(
 #endif
     )
 {
-#if OUTPUT_BATCH_NUM == 1
-    const uint f = (uint)get_global_id(0);
-    const uint b = 0;
-#else
-    const uint f = (uint)get_global_id(0);
-    const uint b = (uint)get_global_id(1);
-#endif
+    const uint lid0 = (uint)get_local_id(0);
+    const uint feature_per_wg = (uint)get_local_size(0) / SLM_DIV_FACTOR;
+    const uint feature = (uint)get_group_id(0) * feature_per_wg + (uint)get_global_id(0) % feature_per_wg;
+    const uint feature_block = lid0 / feature_per_wg;
+    const uint batch = (uint)get_global_id(1);
 
     int dotProd = 0;
 
-    const uint filter_offset = FILTER_GET_OFFSET(f);
+    const uint filter_offset = FILTER_GET_OFFSET(feature);
 #if INPUT0_DIMS == 5
-    const uint input_offset = INPUT0_GET_INDEX(b, 0, 0, 0, 0);
+    const uint input_offset = INPUT0_GET_INDEX(batch, 0, 0, 0, 0);
 #else
-    const uint input_offset = INPUT0_GET_INDEX(b, 0, 0, 0);
+    const uint input_offset = INPUT0_GET_INDEX(batch, 0, 0, 0);
+#endif
+
+#if SLM_DIV_FACTOR > 1
+    __local int partial_summ[WORK_GROUP_SIZE];
 #endif
 
 #if SPATIAL_MAJOR
-    for (uint k = 0; k < FEATURE_BLOCKS_COUNT; ++k) {
+
+#if FULL_UNROLL_FACTOR < 2
+    for (uint k = feature_block * FULL_UNROLL_FACTOR; k < (feature_block + 1) * FULL_UNROLL_FACTOR; ++k)
+#elif UNROLL_FACTOR == FULL_UNROLL_FACTOR
+    uint k = feature_block * FULL_UNROLL_FACTOR;
+#else
+    for (uint k = feature_block * FULL_UNROLL_FACTOR; k + UNROLL_FACTOR <= (feature_block + 1) * FULL_UNROLL_FACTOR; k += UNROLL_FACTOR)
+#endif
+    {
 # if !SPLIT_SPATIAL
     for (uint spatial = 0; spatial < FILTER_SPATIAL_SIZE; ++spatial) {
 # else
@@ -73,7 +83,15 @@ KERNEL(fully_connected_gpu_MMAD)(
     for (uint xi = 0; xi < FILTER_SIZE_X; ++xi) {
         const uint spatial = xi + yi * FILTER_SIZE_X + zi * FILTER_SIZE_X * FILTER_SIZE_Y;
 # endif
-    for (uint k = 0; k < FEATURE_BLOCKS_COUNT; ++k) {
+
+#if FULL_UNROLL_FACTOR < 2
+    for (uint k = feature_block * FULL_UNROLL_FACTOR; k < (feature_block + 1) * FULL_UNROLL_FACTOR; ++k)
+#elif UNROLL_FACTOR == FULL_UNROLL_FACTOR
+    uint k = feature_block * FULL_UNROLL_FACTOR;
+#else
+    for (uint k = feature_block * FULL_UNROLL_FACTOR; k + UNROLL_FACTOR <= (feature_block + 1) * FULL_UNROLL_FACTOR; k += UNROLL_FACTOR)
+#endif
+    {
 #endif
 #if !SPLIT_SPATIAL
         uint input_idx = input_offset + spatial * MMAD_INPUT_SPATIAL_PITCH + k * MMAD_INPUT_FBLOCK_PITCH;
@@ -82,10 +100,12 @@ KERNEL(fully_connected_gpu_MMAD)(
 #endif
 
         uint filter_idx = filter_offset + spatial * MMAD_FILTER_SPATIAL_PITCH + k * MMAD_FILTER_FBLOCK_PITCH;
 
+#if UNROLL_FACTOR < 2
         uint input_data_u = intel_sub_group_block_read((const __global uint*)(input + input_idx));
         INPUT_PACKED_TYPE input_data = AS_TYPE(INPUT_PACKED_TYPE, input_data_u);
-        INPUT_PACKED_TYPE_8 activations; //activations of all lanes
+        INPUT_PACKED_TYPE_8 activations;
+
        activations.s0 = sub_group_broadcast(input_data, 0);
        activations.s1 = sub_group_broadcast(input_data, 1);
        activations.s2 = sub_group_broadcast(input_data, 2);
@@ -99,11 +119,50 @@ KERNEL(fully_connected_gpu_MMAD)(
        FILTER_PACKED_TYPE_8 weights_data = AS_TYPE(FILTER_PACKED_TYPE_8, weights_data_u);
 
        dotProd = MMAD_8(activations, weights_data, dotProd);
+#else
+        INPUT_PACKED_TYPE input_data[UNROLL_FACTOR];
+        FILTER_PACKED_TYPE_8 weights_data[UNROLL_FACTOR];
+
+        __attribute__((opencl_unroll_hint))
+        for (uint kb = 0; kb < UNROLL_FACTOR; kb++) {
+            input_data[kb] = AS_TYPE(INPUT_PACKED_TYPE, intel_sub_group_block_read((const __global uint*)(input +
+                             input_idx + kb * MMAD_INPUT_FBLOCK_PITCH)));
+
+            uint8 weights_data_u0 = intel_sub_group_block_read8((const __global uint*)(weights + filter_idx + kb * MMAD_FILTER_FBLOCK_PITCH));
+            weights_data[kb] = AS_TYPE(FILTER_PACKED_TYPE_8, weights_data_u0);
+        }
+
+        __attribute__((opencl_unroll_hint))
+        for (uint kb = 0; kb < UNROLL_FACTOR; kb++) {
+            INPUT_PACKED_TYPE_8 in;
+
+            in.s0 = sub_group_broadcast(input_data[kb], 0);
+            in.s1 = sub_group_broadcast(input_data[kb], 1);
+            in.s2 = sub_group_broadcast(input_data[kb], 2);
+            in.s3 = sub_group_broadcast(input_data[kb], 3);
+            in.s4 = sub_group_broadcast(input_data[kb], 4);
+            in.s5 = sub_group_broadcast(input_data[kb], 5);
+            in.s6 = sub_group_broadcast(input_data[kb], 6);
+            in.s7 = sub_group_broadcast(input_data[kb], 7);
+
+            dotProd = MMAD_8(in, weights_data[kb], dotProd);
+        }
+#endif // UNROLL_FACTOR < 2
     }
 }
 
+#if SLM_DIV_FACTOR > 1
+    partial_summ[lid0] = dotProd;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (feature_block == 0) {
+        __attribute__((opencl_unroll_hint))
+        for (uint i = 1; i < SLM_DIV_FACTOR; i++)
+            dotProd += partial_summ[lid0 % feature_per_wg + i * feature_per_wg];
+#endif // SLM_DIV_FACTOR > 1
+
 #if HAS_FEATURE_LEFTOVERS
-    const uint lid = get_sub_group_local_id();
+    const uint sglid = get_sub_group_local_id();
 #if SPATIAL_MAJOR
 #if !SPLIT_SPATIAL
     for (uint spatial = 0; spatial < FILTER_SPATIAL_SIZE; ++spatial) {
@@ -128,14 +187,14 @@ KERNEL(fully_connected_gpu_MMAD)(
 #if !SPLIT_SPATIAL
         uint input_idx = input_offset + spatial * MMAD_INPUT_SPATIAL_PITCH + FEATURE_BLOCKS_COUNT * INPUT0_FEATURE_PITCH;
 #else // !SPLIT_SPATIAL
-        uint input_idx = input_offset + FEATURE_BLOCK_COUNT * INPUT0_FEATURE_PITCH + zi * MMAD_INPUT_Z_PITCH + yi * MMAD_INPUT_Y_PITCH + xi * MMAD_INPUT_X_PITCH;
+        uint input_idx = input_offset + FEATURE_BLOCKS_COUNT * INPUT0_FEATURE_PITCH + zi * MMAD_INPUT_Z_PITCH + yi * MMAD_INPUT_Y_PITCH + xi * MMAD_INPUT_X_PITCH;
 #endif // !SPLIT_SPATIAL
 
        uint filter_idx = filter_offset + spatial * MMAD_FILTER_SPATIAL_PITCH + FEATURE_BLOCKS_COUNT * MMAD_FILTER_FBLOCK_PITCH;
        MAKE_VECTOR_TYPE(INPUT0_TYPE, 4) input_data_u = (0, 0, 0, 0);
        for (uint i = 0; i < 4; i++) {
-           if (FEATURE_BLOCKS_COUNT*32 + lid*4 + i < INPUT0_FEATURE_NUM) {
-               input_data_u[i] = input[input_idx + (lid*4 + i)*INPUT0_FEATURE_PITCH];
+           if (FEATURE_BLOCKS_COUNT * 32 + sglid * 4 + i < INPUT0_FEATURE_NUM) {
+               input_data_u[i] = input[input_idx + (sglid * 4 + i) * INPUT0_FEATURE_PITCH];
            }
        }
        INPUT_PACKED_TYPE input_data = AS_TYPE(INPUT_PACKED_TYPE, input_data_u);
@@ -157,14 +216,14 @@ KERNEL(fully_connected_gpu_MMAD)(
     }
 #endif // HAS_FEATURE_LEFTOVERS
 
-    if (OUTPUT_FEATURE_NUM % SUB_GROUP_SIZE != 0 && f >= OUTPUT_FEATURE_NUM)
+    if (OUTPUT_FEATURE_NUM % SUB_GROUP_SIZE != 0 && feature >= OUTPUT_FEATURE_NUM)
         return;
 
 #if BIAS_TERM
 #if BIAS_PER_OUTPUT
-    const uint bias_index = GET_DATA_INDEX(BIAS, b, f, 0, 0);
+    const uint bias_index = GET_DATA_INDEX(BIAS, batch, feature, 0, 0);
 #elif BIAS_PER_OFM
-    const uint bias_index = f;
+    const uint bias_index = feature;
 #endif
 
     float dequantized = (float)dotProd + biases[bias_index];
@@ -172,7 +231,7 @@ KERNEL(fully_connected_gpu_MMAD)(
     float dequantized = (float)dotProd;
 #endif
 
-    const uint out_idx = OUTPUT_GET_INDEX(b, f, 0, 0);
+    const uint out_idx = OUTPUT_GET_INDEX(batch, feature, 0, 0);
 
 #if HAS_FUSED_OPS
     FUSED_OPS;
@@ -182,6 +241,10 @@ KERNEL(fully_connected_gpu_MMAD)(
 #else
     output[out_idx] = TO_OUTPUT_TYPE(dequantized);
 #endif
+
+#if SLM_DIV_FACTOR > 1
+    }
+#endif
 }
 
 #undef INPUT_PACKED_TYPE_8
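Illustrative note (not part of the patch): the short C++ sketch below emulates, on the host, the work partitioning the rewritten kernel performs. Each of the SLM_DIV_FACTOR sub-slices of a work-group (feature_block in the kernel) accumulates its own FULL_UNROLL_FACTOR feature blocks into a partial sum, and after the barrier the first sub-slice adds the partials together, matching the partial_summ[] / barrier(CLK_LOCAL_MEM_FENCE) sequence above. The factor and block counts are hypothetical, and the per-block MMAD_8 accumulation is replaced by a plain integer add.

// Host-side emulation of the kernel's split of the feature-block reduction
// across SLM_DIV_FACTOR sub-slices, followed by the partial-sum combination.
#include <cstddef>
#include <iostream>
#include <numeric>
#include <vector>

int main() {
    const size_t slm_div_factor = 4;         // hypothetical SLM_DIV_FACTOR
    const size_t feature_blocks_count = 16;  // hypothetical FEATURE_BLOCKS_COUNT
    const size_t full_unroll_factor = feature_blocks_count / slm_div_factor;

    // One "contribution" per 32-feature block; the real kernel computes MMAD_8 here.
    std::vector<int> block_values(feature_blocks_count, 1);

    // Each sub-slice reduces its own k-range into a partial sum (partial_summ[] in SLM).
    std::vector<int> partial_summ(slm_div_factor, 0);
    for (size_t feature_block = 0; feature_block < slm_div_factor; ++feature_block)
        for (size_t k = feature_block * full_unroll_factor;
             k < (feature_block + 1) * full_unroll_factor; ++k)
            partial_summ[feature_block] += block_values[k];

    // After the barrier, the sub-slice with feature_block == 0 combines the partials.
    const int dotProd = std::accumulate(partial_summ.begin(), partial_summ.end(), 0);

    std::cout << "dotProd = " << dotProd << "\n";  // 16: every block counted exactly once
}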