From 8db6eeec67b0caea2a2515f3b6060dddb2626e1a Mon Sep 17 00:00:00 2001 From: Vladimir Paramuzov Date: Mon, 17 Aug 2020 14:34:49 +0300 Subject: [PATCH] [IE CLDNN] All input layouts support in ref pooling (#1782) --- .../actual_kernels/pooling/pooling_kernel_base.cpp | 5 +++- .../pooling_kernel_gpu_b_fs_yx_fsv16_imad.cpp | 2 +- .../pooling/pooling_kernel_gpu_ref.cpp | 18 ++---------- .../cl_kernels/pooling_gpu_b_fs_yx_fsv16_imad.cl | 11 ++++---- .../core/cl_kernels/pooling_gpu_ref.cl | 15 ++++++---- .../thirdparty/clDNN/src/gpu/pooling_gpu.cpp | 32 ++++++++++++++-------- .../clDNN/tests/test_cases/pooling_gpu_test.cpp | 1 + 7 files changed, 42 insertions(+), 42 deletions(-) diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_base.cpp index f9486d7..c5fa8ca 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_base.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_base.cpp @@ -29,6 +29,9 @@ bool PoolingKernelBase::Validate(const Params& p, const optional_params& o) cons return false; } + if (params.inputs[0].Dimentions() > 5) + return false; + return true; } @@ -157,7 +160,7 @@ PoolingKernelBase::DispatchData PoolingKernelBase::SetDefault(const pooling_para // Determine global work sizes. kd.gws0 = output.Batch().v * output.Feature().v; // B, F kd.gws1 = output.X().v; // X - kd.gws2 = output.Y().v; // Y + kd.gws2 = output.Y().v * output.Z().v; // Y * Z kd.lws0 = std::min(std::max(kd.gws0, static_cast(1)), static_cast(32)); while (kd.gws0 % kd.lws0 != 0) { diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_b_fs_yx_fsv16_imad.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_b_fs_yx_fsv16_imad.cpp index 9574e41..4e60fad 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_b_fs_yx_fsv16_imad.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_b_fs_yx_fsv16_imad.cpp @@ -76,7 +76,7 @@ JitConstants PoolingKernelGPU_b_fs_yx_fsv16_imad::GetJitConstants(const pooling_ if (!params.fused_ops.empty()) { auto input_dt = EnableRound(params) ? Datatype::INT32 : GetActivationType(params); - FusedOpsConfiguration conf = {"", {"b", "f", "y", "x"}, "pool_result[i]", input_dt, 1}; + FusedOpsConfiguration conf = {"", {"b", "(f+i)", "y", "x"}, "pool_result[i]", input_dt, 1}; conf.SetLoopAxes({ Tensor::DataChannelName::FEATURE }, true); jit.Merge(MakeFusedOpsJitConstants(params, { conf })); } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_ref.cpp index 1f4bb27..67dfa1d 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_ref.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_ref.cpp @@ -24,22 +24,8 @@ ParamsKey PoolingKernelGPURef::GetSupportedKey() const { k.EnableOutputDataType(Datatype::F32); k.EnableOutputDataType(Datatype::UINT8); k.EnableOutputDataType(Datatype::INT8); - k.EnableInputLayout(DataLayout::bfyx); - k.EnableInputLayout(DataLayout::yxfb); - k.EnableInputLayout(DataLayout::byxf); - k.EnableInputLayout(DataLayout::bfzyx); - k.EnableInputLayout(DataLayout::b_fs_zyx_fsv16); - k.EnableInputLayout(DataLayout::bs_fs_zyx_bsv16_fsv16); - k.EnableInputLayout(DataLayout::b_fs_yx_fsv32); - k.EnableInputLayout(DataLayout::b_fs_zyx_fsv32); - k.EnableOutputLayout(DataLayout::bfyx); - k.EnableOutputLayout(DataLayout::yxfb); - k.EnableOutputLayout(DataLayout::byxf); - k.EnableOutputLayout(DataLayout::bfzyx); - k.EnableOutputLayout(DataLayout::b_fs_zyx_fsv16); - k.EnableOutputLayout(DataLayout::bs_fs_zyx_bsv16_fsv16); - k.EnableOutputLayout(DataLayout::b_fs_yx_fsv32); - k.EnableOutputLayout(DataLayout::b_fs_zyx_fsv32); + k.EnableAllInputLayout(); + k.EnableAllOutputLayout(); k.EnableTensorOffset(); k.EnableTensorPitches(); k.EnableBatching(); diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_b_fs_yx_fsv16_imad.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_b_fs_yx_fsv16_imad.cl index 91e272e..c20177f 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_b_fs_yx_fsv16_imad.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_b_fs_yx_fsv16_imad.cl @@ -140,7 +140,6 @@ KERNEL(pooling_gpu_b_fs_yx_fsv16)( #endif #endif - ACTIVATION_VEC16 pool_result; #if defined AVG_POOLING #if ENABLE_ROUND @@ -169,25 +168,25 @@ KERNEL(pooling_gpu_b_fs_yx_fsv16)( } #endif // AVG_POOLING -OUT_VEC16 final_result = (OUTPUT_TYPE)(0); + OUT_VEC16 final_result = (OUTPUT_TYPE)(0); #if HAS_FUSED_OPS && FUSED_OPS_CAN_USE_PRELOAD - FUSED_OPS_PRELOAD + FUSED_OPS_PRELOAD; #endif __attribute__((opencl_unroll_hint(FEATURE_SLICE_SIZE))) for (uint i = 0; i < FEATURE_SLICE_SIZE; ++i) { #if HAS_FUSED_OPS #if FUSED_OPS_CAN_USE_PRELOAD - FUSED_OPS_CALC + FUSED_OPS_CALC; #else - FUSED_OPS + FUSED_OPS; #endif final_result[i] = FUSED_OPS_RESULT; #else final_result[i] = TO_OUTPUT_TYPE(ACTIVATION(pool_result[i], ACTIVATION_PARAMS)); #endif } - + const uint output_pos = OUTPUT_GET_INDEX(b, f, y, x); #if OUTPUT_TYPE_SIZE == 1 diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_ref.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_ref.cl index 999ea6e..3b68dd2 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_ref.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_ref.cl @@ -44,7 +44,7 @@ KERNEL(pooling_gpu)( ) { #if OUTPUT_LAYOUT_BFYX || OUTPUT_LAYOUT_BYXF || OUTPUT_LAYOUT_BFZYX ||\ - OUTPUT_LAYOUT_B_FS_ZYX_FSV16 || OUTPUT_LAYOUT_BS_FS_ZYX_BSV16_FSV16 + OUTPUT_LAYOUT_B_FS_ZYX_FSV16 || OUTPUT_LAYOUT_BS_FS_ZYX_BSV16_FSV16 || OUTPUT_LAYOUT_B_FS_YX_FSV4 || OUTPUT_LAYOUT_BYXF_AF32 const uint x = (uint)get_global_id(0); #if OUTPUT_DIMS == 5 const uint y = (uint)get_global_id(1) % OUTPUT_SIZE_Y; @@ -81,15 +81,18 @@ KERNEL(pooling_gpu)( if (f >= OUTPUT_FEATURE_NUM) { return; } -#elif OUTPUT_LAYOUT_YXFB +#else const uint x = (uint)get_global_id(1); +#if OUTPUT_DIMS == 5 + const uint y = (uint)get_global_id(2) % OUTPUT_SIZE_Y; + const uint z = (uint)get_global_id(2) / OUTPUT_SIZE_Y; +#else const uint y = (uint)get_global_id(2); + const uint z = 0; +#endif const uint bf = (uint)get_global_id(0); const uint f = bf / INPUT0_BATCH_NUM; const uint b = bf % INPUT0_BATCH_NUM; - const uint z = 0; -#else - #error "pooling_gpu_ref: unsupported layout" #endif const int offset_x = (int)x*STRIDE_SIZE_X - PADDING_SIZE_X; @@ -272,7 +275,7 @@ KERNEL(pooling_gpu)( OUTPUT_TYPE final_result; ACTIVATION_TYPE pool_result = TO_ACTIVATION_TYPE(result); - + #if HAS_FUSED_OPS FUSED_OPS; final_result = FUSED_OPS_RESULT; diff --git a/inference-engine/thirdparty/clDNN/src/gpu/pooling_gpu.cpp b/inference-engine/thirdparty/clDNN/src/gpu/pooling_gpu.cpp index 3c90500..86a8322 100644 --- a/inference-engine/thirdparty/clDNN/src/gpu/pooling_gpu.cpp +++ b/inference-engine/thirdparty/clDNN/src/gpu/pooling_gpu.cpp @@ -166,25 +166,31 @@ public: namespace detail { attach_pooling_gpu::attach_pooling_gpu() { - implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f32, format::yxfb), pooling_gpu::create); - implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f16, format::yxfb), pooling_gpu::create); implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bfyx), pooling_gpu::create); implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bfyx), pooling_gpu::create); implementation_map::add(std::make_tuple(engine_types::ocl, data_types::i8, format::bfyx), pooling_gpu::create); implementation_map::add(std::make_tuple(engine_types::ocl, data_types::u8, format::bfyx), pooling_gpu::create); + + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f32, format::yxfb), pooling_gpu::create); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f16, format::yxfb), pooling_gpu::create); implementation_map::add(std::make_tuple(engine_types::ocl, data_types::i8, format::yxfb), pooling_gpu::create); implementation_map::add(std::make_tuple(engine_types::ocl, data_types::u8, format::yxfb), pooling_gpu::create); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f32, format::byxf), pooling_gpu::create); implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f16, format::byxf), pooling_gpu::create); implementation_map::add(std::make_tuple(engine_types::ocl, data_types::i8, format::byxf), pooling_gpu::create); implementation_map::add(std::make_tuple(engine_types::ocl, data_types::u8, format::byxf), pooling_gpu::create); - // block fp16 format + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f16, format::b_fs_yx_fsv16), pooling_gpu::create); implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f32, format::b_fs_yx_fsv16), pooling_gpu::create); - // block i8 format implementation_map::add(std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv16), pooling_gpu::create); implementation_map::add(std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv16), pooling_gpu::create); - // 3D + + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bs_fs_yx_bsv16_fsv16), pooling_gpu::create); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bs_fs_yx_bsv16_fsv16), pooling_gpu::create); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::i8, format::bs_fs_yx_bsv16_fsv16), pooling_gpu::create); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::u8, format::bs_fs_yx_bsv16_fsv16), pooling_gpu::create); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bfzyx), pooling_gpu::create); implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bfzyx), pooling_gpu::create); implementation_map::add(std::make_tuple(engine_types::ocl, data_types::i8, format::bfzyx), pooling_gpu::create); @@ -193,29 +199,31 @@ attach_pooling_gpu::attach_pooling_gpu() { implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f32, format::b_fs_zyx_fsv16), pooling_gpu::create); implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f16, format::b_fs_zyx_fsv16), pooling_gpu::create); implementation_map::add(std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_zyx_fsv16), pooling_gpu::create); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_zyx_fsv16), pooling_gpu::create); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bs_fs_zyx_bsv16_fsv16), pooling_gpu::create); implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bs_fs_zyx_bsv16_fsv16), pooling_gpu::create); implementation_map::add(std::make_tuple(engine_types::ocl, data_types::i8, format::bs_fs_zyx_bsv16_fsv16), pooling_gpu::create); implementation_map::add(std::make_tuple(engine_types::ocl, data_types::u8, format::bs_fs_zyx_bsv16_fsv16), pooling_gpu::create); - implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bs_fs_yx_bsv16_fsv16), pooling_gpu::create); - implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bs_fs_yx_bsv16_fsv16), pooling_gpu::create); - implementation_map::add(std::make_tuple(engine_types::ocl, data_types::i8, format::bs_fs_yx_bsv16_fsv16), pooling_gpu::create); - implementation_map::add(std::make_tuple(engine_types::ocl, data_types::u8, format::bs_fs_yx_bsv16_fsv16), pooling_gpu::create); - // MMAD + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::i8, format::byxf_af32), pooling_gpu::create); implementation_map::add(std::make_tuple(engine_types::ocl, data_types::u8, format::byxf_af32), pooling_gpu::create); - implementation_map::add(std::make_tuple(engine_types::ocl, data_types::i8, format::fs_bs_yx_bsv4_fsv32), pooling_gpu::create); + + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f32, format::b_fs_yx_fsv4), pooling_gpu::create); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f16, format::b_fs_yx_fsv4), pooling_gpu::create); implementation_map::add(std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv4), pooling_gpu::create); implementation_map::add(std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv4), pooling_gpu::create); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv32), pooling_gpu::create); implementation_map::add(std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv32), pooling_gpu::create); implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f32, format::b_fs_yx_fsv32), pooling_gpu::create); implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f16, format::b_fs_yx_fsv32), pooling_gpu::create); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_zyx_fsv32), pooling_gpu::create); implementation_map::add(std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_zyx_fsv32), pooling_gpu::create); implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f32, format::b_fs_zyx_fsv32), pooling_gpu::create); implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f16, format::b_fs_zyx_fsv32), pooling_gpu::create); - // + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f16, format::fs_b_yx_fsv32), pooling_gpu::create); implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f32, format::fs_b_yx_fsv32), pooling_gpu::create); implementation_map::add(std::make_tuple(engine_types::ocl, data_types::u8, format::fs_b_yx_fsv32), pooling_gpu::create); diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/pooling_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/pooling_gpu_test.cpp index 7f64ebc..787197c 100644 --- a/inference-engine/thirdparty/clDNN/tests/test_cases/pooling_gpu_test.cpp +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/pooling_gpu_test.cpp @@ -2698,6 +2698,7 @@ INSTANTIATE_TEST_CASE_P( format::b_fs_yx_fsv16, format::fs_b_yx_fsv32, format::b_fs_yx_fsv32, + format::b_fs_yx_fsv4, format::fs_bs_yx_bsv4_fsv32)), testing::internal::DefaultParamName); -- 2.7.4