From e1c22196b4fd5ad8cc1ac9060e55c4f6e756359d Mon Sep 17 00:00:00 2001 From: Konrad Dobros Date: Fri, 12 Jun 2020 14:44:14 +0200 Subject: [PATCH] [IE CLDNN] Fix fsv16 -> bfyx reorder removal (#872) --- .../core/cl_kernels/include/fetch.cl | 22 +++++-- .../graph_optimizer/remove_redundant_reorders.cpp | 23 +++++-- inference-engine/thirdparty/clDNN/src/reorder.cpp | 13 ++-- .../clDNN/tests/test_cases/reorder_gpu_test.cpp | 74 ++++++++++++++++++++-- 4 files changed, 110 insertions(+), 22 deletions(-) diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/include/fetch.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/include/fetch.cl index e48227f..6d2868d 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/include/fetch.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/include/fetch.cl @@ -114,7 +114,8 @@ inline uint FUNC(get_bf8_xy16_index)(uint b, uint f, uint y, uint x, uint x_size } inline uint FUNC(get_b_fs_yx_fsv_index)(uint b, uint f, uint y, uint x, - uint x_size, uint y_size, uint f_size, + uint x_size, uint y_size, uint f_size, uint b_size, + uint b_pad_before, uint b_pad_after, uint f_pad_before, uint f_pad_after, uint y_pad_before, uint y_pad_after, uint x_pad_before, uint x_pad_after, uint alignment) { @@ -127,7 +128,7 @@ inline uint FUNC(get_b_fs_yx_fsv_index)(uint b, uint f, uint y, uint x, const uint fs_pitch = y_pitch * (y_pad_before + y_size + y_pad_after); const uint b_pitch = fs_pitch * ((total_f_size + alignment - 1) / alignment); - const uint output_offset = b * b_pitch + + const uint output_offset = (b_pad_before + b) * b_pitch + fs * fs_pitch + (y_pad_before + y) * y_pitch + (x_pad_before + x) * x_pitch @@ -137,7 +138,8 @@ inline uint FUNC(get_b_fs_yx_fsv_index)(uint b, uint f, uint y, uint x, } inline uint FUNC(get_b_fs_yx_fsv_index_safe)(uint b, uint f, uint y, uint x, - uint x_size, uint y_size, uint f_size, + uint x_size, uint y_size, uint 
f_size, uint b_size, + uint b_pad_before, uint b_pad_after, uint f_pad_before, uint f_pad_after, uint y_pad_before, uint y_pad_after, uint x_pad_before, uint x_pad_after, uint alignment) { @@ -150,7 +152,7 @@ inline uint FUNC(get_b_fs_yx_fsv_index_safe)(uint b, uint f, uint y, uint x, const uint fs_pitch = y_pitch * (y_pad_before + y_size + y_pad_after); const uint b_pitch = fs_pitch * ((total_f_size + alignment - 1) / alignment); - const uint output_offset = b * b_pitch + + const uint output_offset = (b_pad_before + (b % b_size)) * b_pitch + fs * fs_pitch + (y_pad_before + (y % y_size)) * y_pitch + (x_pad_before + (x % x_size)) * x_pitch @@ -165,6 +167,9 @@ inline uint FUNC(get_b_fs_yx_fsv_index_safe)(uint b, uint f, uint y, uint x, CAT(prefix, _SIZE_X ), \ CAT(prefix, _SIZE_Y), \ CAT(prefix, _FEATURE_NUM), \ + CAT(prefix, _BATCH_NUM), \ + CAT(prefix, _PAD_BEFORE_BATCH_NUM), \ + CAT(prefix, _PAD_AFTER_BATCH_NUM), \ CAT(prefix, _PAD_BEFORE_FEATURE_NUM), \ CAT(prefix, _PAD_AFTER_FEATURE_NUM), \ CAT(prefix, _PAD_BEFORE_SIZE_Y), \ @@ -178,6 +183,9 @@ inline uint FUNC(get_b_fs_yx_fsv_index_safe)(uint b, uint f, uint y, uint x, CAT(prefix, _SIZE_X ), \ CAT(prefix, _SIZE_Y), \ CAT(prefix, _FEATURE_NUM), \ + CAT(prefix, _BATCH_NUM), \ + CAT(prefix, _PAD_BEFORE_BATCH_NUM), \ + CAT(prefix, _PAD_AFTER_BATCH_NUM), \ CAT(prefix, _PAD_BEFORE_FEATURE_NUM), \ CAT(prefix, _PAD_AFTER_FEATURE_NUM), \ CAT(prefix, _PAD_BEFORE_SIZE_Y), \ @@ -191,6 +199,9 @@ inline uint FUNC(get_b_fs_yx_fsv_index_safe)(uint b, uint f, uint y, uint x, CAT(prefix, _SIZE_X ), \ CAT(prefix, _SIZE_Y), \ CAT(prefix, _FEATURE_NUM), \ + CAT(prefix, _BATCH_NUM), \ + CAT(prefix, _PAD_BEFORE_BATCH_NUM), \ + CAT(prefix, _PAD_AFTER_BATCH_NUM), \ CAT(prefix, _PAD_BEFORE_FEATURE_NUM), \ CAT(prefix, _PAD_AFTER_FEATURE_NUM), \ CAT(prefix, _PAD_BEFORE_SIZE_Y), \ @@ -204,6 +215,9 @@ inline uint FUNC(get_b_fs_yx_fsv_index_safe)(uint b, uint f, uint y, uint x, CAT(prefix, _SIZE_X ), \ CAT(prefix, _SIZE_Y), \ CAT(prefix, 
_FEATURE_NUM), \ + CAT(prefix, _BATCH_NUM), \ + CAT(prefix, _PAD_BEFORE_BATCH_NUM), \ + CAT(prefix, _PAD_AFTER_BATCH_NUM), \ CAT(prefix, _PAD_BEFORE_FEATURE_NUM), \ CAT(prefix, _PAD_AFTER_FEATURE_NUM), \ CAT(prefix, _PAD_BEFORE_SIZE_Y), \ diff --git a/inference-engine/thirdparty/clDNN/src/graph_optimizer/remove_redundant_reorders.cpp b/inference-engine/thirdparty/clDNN/src/graph_optimizer/remove_redundant_reorders.cpp index f7a87ec..ef1b478 100644 --- a/inference-engine/thirdparty/clDNN/src/graph_optimizer/remove_redundant_reorders.cpp +++ b/inference-engine/thirdparty/clDNN/src/graph_optimizer/remove_redundant_reorders.cpp @@ -179,14 +179,27 @@ void remove_redundant_reorders::run(program_impl& p) { // but pads need to be handled correctly. if (i_layout.format == format::b_fs_yx_fsv16 && o_layout.format == format::bfyx && !r_node.is_output() && i_layout.size.spatial[0] == 1 && i_layout.size.spatial[1] == 1 && - o_layout.data_padding.upper_size() == (tensor)0 && o_layout.data_padding.lower_size() == (tensor)0) { + i_layout.data_padding.upper_size().spatial[0] == 0 && i_layout.data_padding.lower_size().spatial[0] == 0 && + i_layout.data_padding.upper_size().spatial[1] == 0 && i_layout.data_padding.lower_size().spatial[1] == 0 && + o_layout.data_padding.upper_size() == (tensor)0 && o_layout.data_padding.lower_size() == (tensor)0 && + i_layout.data_type == o_layout.data_type) { r_node.can_be_optimized(true); + r_node.requires_reinterpret(true); + + auto pad_lo = o_layout.data_padding.lower_size(); + auto pad_hi = o_layout.data_padding.upper_size(); + + pad_lo.batch[0] = i_layout.data_padding.lower_size().batch[0]; + pad_hi.batch[0] = i_layout.data_padding.upper_size().batch[0]; + + pad_lo.feature[0] = i_layout.data_padding.lower_size().feature[0]; + pad_hi.feature[0] = i_layout.data_padding.upper_size().feature[0]; + if (i_layout.size.feature[0] % 16 != 0) { - auto pad_lo = o_layout.data_padding.lower_size(); - auto pad_hi = o_layout.data_padding.upper_size(); - 
pad_hi.feature[0] = i_layout.size.feature[0] % 16; - r_node.merge_output_padding(padding{pad_lo.sizes(), pad_hi.sizes()}); + pad_hi.feature[0] += 16 - i_layout.size.feature[0] % 16; } + + r_node.merge_output_padding(padding{pad_lo.sizes(), pad_hi.sizes()}); continue; } diff --git a/inference-engine/thirdparty/clDNN/src/reorder.cpp b/inference-engine/thirdparty/clDNN/src/reorder.cpp index 0f74654..857d76d 100644 --- a/inference-engine/thirdparty/clDNN/src/reorder.cpp +++ b/inference-engine/thirdparty/clDNN/src/reorder.cpp @@ -200,10 +200,8 @@ std::string reorder_inst::to_string(reorder_node const& node) { reorder_inst::typed_primitive_inst(network_impl& network, reorder_node const& node) : parent(network, node, !node.can_be_optimized()) { - if (node.can_be_optimized()) { - build_deps(); + if (node.can_be_optimized()) reuse_input(); - } auto input_layout = node.input().get_output_layout(); auto output_layout = node.get_output_layout(); @@ -241,13 +239,14 @@ void reorder_inst::on_execute() { } void reorder_inst::reuse_input() { - if (!node.can_be_optimized()) + if (static_cast<bool>(_output) && _network.get_engine().is_the_same_buffer(output_memory(), input_memory())) return; + build_deps(); + if (node.requires_reinterpret()) { - if (!_output || !_network.get_engine().is_the_same_buffer(output_memory(), input_memory())) - _output = _network.get_engine().reinterpret_buffer(input_memory(), node.get_output_layout()); - } else if (!_output) { + _output = _network.get_engine().reinterpret_buffer(input_memory(), node.get_output_layout()); + } else { _output = (memory_impl::ptr) &input_memory(); } } diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/reorder_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/reorder_gpu_test.cpp index 6926224..8db61f2 100644 --- a/inference-engine/thirdparty/clDNN/tests/test_cases/reorder_gpu_test.cpp +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/reorder_gpu_test.cpp @@ -1782,9 +1782,10 @@ 
TEST(reorder_gpu_f32, b_fs_yx_fsv16_to_bfyx_opt_allowed) { const auto& engine = get_test_engine(); - auto input = memory::allocate(engine, { data_types::f32, format::b_fs_yx_fsv16, { 1, 8, 1, 1 } }); + auto input = memory::allocate(engine, { data_types::f32, format::b_fs_yx_fsv16, { 2, 12, 1, 1 } }); - set_values(input, { 0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f }); + set_values(input, { 0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f, 12.f, 13.f, 14.f, 15.f, + 16.f, 17.f, 18.f, 19.f, 20.f, 21.f, 22.f, 23.f, 24.f, 25.f, 26.f, 27.f, 28.f, 29.f, 30.f, 31.f }); const std::string reorder_name = "reorder_prim"; topology topology( @@ -1808,14 +1809,16 @@ TEST(reorder_gpu_f32, b_fs_yx_fsv16_to_bfyx_opt_allowed) auto output = outputs.begin()->second.get_memory(); - float answers[16] = { - 0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, + float answers[24] = { + 0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f, + 16.f, 17.f, 18.f, 19.f, 20.f, 21.f, 22.f, 23.f, 24.f, 25.f, 26.f, 27.f, }; auto output_ptr = output.pointer<float>(); + ASSERT_EQ(output_ptr.size(), 24); for (size_t i = 0; i < output_ptr.size(); i++) { - EXPECT_FLOAT_EQ(answers[i], output_ptr[i]); + EXPECT_FLOAT_EQ(answers[i], output_ptr[i]) << "i=" << i; } } @@ -1858,7 +1861,66 @@ TEST(reorder_gpu_f32, b_fs_yx_fsv16_to_bfyx_opt_not_allowed) auto output_ptr = output.pointer<float>(); for (int i = 0; i < 1; i++) { - EXPECT_FLOAT_EQ(answers[i], output_ptr[i]); + EXPECT_FLOAT_EQ(answers[i], output_ptr[i]) << "i=" << i; + } +} + +TEST(reorder_gpu_f32, b_fs_yx_fsv16_to_bfyx_opt_padded) +{ + const auto& engine = get_test_engine(); + + auto input = memory::allocate(engine, { data_types::f32, + format::b_fs_yx_fsv16, + { 2, 4, 1, 1 }, + padding({1, 16, 0, 0}, {1, 0, 0, 0}) }); + + std::vector<float> in_data = { + // b -1 (lower pad) + -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, + -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, + 
// b 0 + -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, + 0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f, 12.f, 13.f, 14.f, 15.f, + // b 1 + -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, + 16.f, 17.f, 18.f, 19.f, 20.f, 21.f, 22.f, 23.f, 24.f, 25.f, 26.f, 27.f, 28.f, 29.f, 30.f, 31.f, + // b +1 (upper pad) + -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, + -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, + }; + + set_values(input, in_data); + + const std::string reorder_name = "reorder_prim"; + topology topology( + input_layout("input", input.get_layout()), + reorder(reorder_name, "input", format::bfyx, data_types::f32), + activation("activation", reorder_name, activation_func::abs)); + + build_options bo; + bo.set_option(build_option::optimize_data(true)); + network network(engine, topology, bo); + network.set_input_data("input", input); + + auto outputs = network.execute(); + + auto executed_prims = network.get_executed_primitives(); + + EXPECT_TRUE(executed_prims.find(reorder_name) == executed_prims.end()); + EXPECT_EQ(outputs.size(), size_t(1)); + EXPECT_EQ(outputs.begin()->first, "activation"); + + auto output = outputs.begin()->second.get_memory(); + + float answers[8] = { + 0.f, 1.f, 2.f, 3.f, + 16.f, 17.f, 18.f, 19.f, + }; + + auto output_ptr = output.pointer<float>(); + ASSERT_EQ(output_ptr.size(), 8); + for (size_t i = 0; i < output_ptr.size(); i++) { + EXPECT_FLOAT_EQ(answers[i], output_ptr[i]) << "i=" << i; } } -- 2.7.4