From e1c22196b4fd5ad8cc1ac9060e55c4f6e756359d Mon Sep 17 00:00:00 2001
From: Konrad Dobros <konrad.dobros@intel.com>
Date: Fri, 12 Jun 2020 14:44:14 +0200
Subject: [PATCH] [IE CLDNN] Fix fsv16 -> bfyx reorder removal (#872)

---
 .../core/cl_kernels/include/fetch.cl               | 22 +++++--
 .../graph_optimizer/remove_redundant_reorders.cpp  | 23 +++++--
 inference-engine/thirdparty/clDNN/src/reorder.cpp  | 13 ++--
 .../clDNN/tests/test_cases/reorder_gpu_test.cpp    | 74 ++++++++++++++++++++--
 4 files changed, 110 insertions(+), 22 deletions(-)

diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/include/fetch.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/include/fetch.cl
index e48227f..6d2868d 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/include/fetch.cl
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/include/fetch.cl
@@ -114,7 +114,8 @@ inline uint FUNC(get_bf8_xy16_index)(uint b, uint f, uint y, uint x, uint x_size
 }
 
 inline uint FUNC(get_b_fs_yx_fsv_index)(uint b, uint f, uint y, uint x,
-                                        uint x_size, uint y_size, uint f_size,
+                                        uint x_size, uint y_size, uint f_size, uint b_size,
+                                        uint b_pad_before, uint b_pad_after,
                                         uint f_pad_before, uint f_pad_after,
                                         uint y_pad_before, uint y_pad_after,
                                         uint x_pad_before, uint x_pad_after, uint alignment) {
@@ -127,7 +128,7 @@ inline uint FUNC(get_b_fs_yx_fsv_index)(uint b, uint f, uint y, uint x,
     const uint fs_pitch = y_pitch * (y_pad_before +  y_size + y_pad_after);
     const uint b_pitch = fs_pitch * ((total_f_size + alignment - 1) / alignment);
 
-    const uint output_offset =  b * b_pitch +
+    const uint output_offset =  (b_pad_before + b) * b_pitch +
                                 fs * fs_pitch +
                                 (y_pad_before + y) * y_pitch +
                                 (x_pad_before + x) * x_pitch
@@ -137,7 +138,8 @@ inline uint FUNC(get_b_fs_yx_fsv_index)(uint b, uint f, uint y, uint x,
 }
 
 inline uint FUNC(get_b_fs_yx_fsv_index_safe)(uint b, uint f, uint y, uint x,
-                                             uint x_size, uint y_size, uint f_size,
+                                             uint x_size, uint y_size, uint f_size, uint b_size,
+                                             uint b_pad_before, uint b_pad_after,
                                              uint f_pad_before, uint f_pad_after,
                                              uint y_pad_before, uint y_pad_after,
                                              uint x_pad_before, uint x_pad_after, uint alignment) {
@@ -150,7 +152,7 @@ inline uint FUNC(get_b_fs_yx_fsv_index_safe)(uint b, uint f, uint y, uint x,
     const uint fs_pitch = y_pitch * (y_pad_before +  y_size + y_pad_after);
     const uint b_pitch = fs_pitch * ((total_f_size + alignment - 1) / alignment);
 
-    const uint output_offset = b * b_pitch +
+    const uint output_offset = (b_pad_before + (b % b_size)) * b_pitch +
                                fs * fs_pitch +
                                (y_pad_before + (y % y_size)) * y_pitch +
                                (x_pad_before + (x % x_size)) * x_pitch
@@ -165,6 +167,9 @@ inline uint FUNC(get_b_fs_yx_fsv_index_safe)(uint b, uint f, uint y, uint x,
         CAT(prefix, _SIZE_X ),                           \
         CAT(prefix, _SIZE_Y),                            \
         CAT(prefix, _FEATURE_NUM),                       \
+        CAT(prefix, _BATCH_NUM),                         \
+        CAT(prefix, _PAD_BEFORE_BATCH_NUM),              \
+        CAT(prefix, _PAD_AFTER_BATCH_NUM),               \
         CAT(prefix, _PAD_BEFORE_FEATURE_NUM),            \
         CAT(prefix, _PAD_AFTER_FEATURE_NUM),             \
         CAT(prefix, _PAD_BEFORE_SIZE_Y),                 \
@@ -178,6 +183,9 @@ inline uint FUNC(get_b_fs_yx_fsv_index_safe)(uint b, uint f, uint y, uint x,
         CAT(prefix, _SIZE_X ),                                \
         CAT(prefix, _SIZE_Y),                                 \
         CAT(prefix, _FEATURE_NUM),                            \
+        CAT(prefix, _BATCH_NUM),                              \
+        CAT(prefix, _PAD_BEFORE_BATCH_NUM),                   \
+        CAT(prefix, _PAD_AFTER_BATCH_NUM),                    \
         CAT(prefix, _PAD_BEFORE_FEATURE_NUM),                 \
         CAT(prefix, _PAD_AFTER_FEATURE_NUM),                  \
         CAT(prefix, _PAD_BEFORE_SIZE_Y),                      \
@@ -191,6 +199,9 @@ inline uint FUNC(get_b_fs_yx_fsv_index_safe)(uint b, uint f, uint y, uint x,
         CAT(prefix, _SIZE_X ),                           \
         CAT(prefix, _SIZE_Y),                            \
         CAT(prefix, _FEATURE_NUM),                       \
+        CAT(prefix, _BATCH_NUM),                         \
+        CAT(prefix, _PAD_BEFORE_BATCH_NUM),              \
+        CAT(prefix, _PAD_AFTER_BATCH_NUM),               \
         CAT(prefix, _PAD_BEFORE_FEATURE_NUM),            \
         CAT(prefix, _PAD_AFTER_FEATURE_NUM),             \
         CAT(prefix, _PAD_BEFORE_SIZE_Y),                 \
@@ -204,6 +215,9 @@ inline uint FUNC(get_b_fs_yx_fsv_index_safe)(uint b, uint f, uint y, uint x,
         CAT(prefix, _SIZE_X ),                                \
         CAT(prefix, _SIZE_Y),                                 \
         CAT(prefix, _FEATURE_NUM),                            \
+        CAT(prefix, _BATCH_NUM),                              \
+        CAT(prefix, _PAD_BEFORE_BATCH_NUM),                   \
+        CAT(prefix, _PAD_AFTER_BATCH_NUM),                    \
         CAT(prefix, _PAD_BEFORE_FEATURE_NUM),                 \
         CAT(prefix, _PAD_AFTER_FEATURE_NUM),                  \
         CAT(prefix, _PAD_BEFORE_SIZE_Y),                      \
diff --git a/inference-engine/thirdparty/clDNN/src/graph_optimizer/remove_redundant_reorders.cpp b/inference-engine/thirdparty/clDNN/src/graph_optimizer/remove_redundant_reorders.cpp
index f7a87ec..ef1b478 100644
--- a/inference-engine/thirdparty/clDNN/src/graph_optimizer/remove_redundant_reorders.cpp
+++ b/inference-engine/thirdparty/clDNN/src/graph_optimizer/remove_redundant_reorders.cpp
@@ -179,14 +179,27 @@ void remove_redundant_reorders::run(program_impl& p) {
         // but pads need to be handled correctly.
         if (i_layout.format == format::b_fs_yx_fsv16 && o_layout.format == format::bfyx && !r_node.is_output() &&
             i_layout.size.spatial[0] == 1 && i_layout.size.spatial[1] == 1 &&
-            o_layout.data_padding.upper_size() == (tensor)0 && o_layout.data_padding.lower_size() == (tensor)0) {
+            i_layout.data_padding.upper_size().spatial[0] == 0 && i_layout.data_padding.lower_size().spatial[0] == 0 &&
+            i_layout.data_padding.upper_size().spatial[1] == 0 && i_layout.data_padding.lower_size().spatial[1] == 0 &&
+            o_layout.data_padding.upper_size() == (tensor)0 && o_layout.data_padding.lower_size() == (tensor)0 &&
+            i_layout.data_type == o_layout.data_type) {
             r_node.can_be_optimized(true);
+            r_node.requires_reinterpret(true);
+
+            auto pad_lo = o_layout.data_padding.lower_size();
+            auto pad_hi = o_layout.data_padding.upper_size();
+
+            pad_lo.batch[0] = i_layout.data_padding.lower_size().batch[0];
+            pad_hi.batch[0] = i_layout.data_padding.upper_size().batch[0];
+
+            pad_lo.feature[0] = i_layout.data_padding.lower_size().feature[0];
+            pad_hi.feature[0] = i_layout.data_padding.upper_size().feature[0];
+
             if (i_layout.size.feature[0] % 16 != 0) {
-                auto pad_lo = o_layout.data_padding.lower_size();
-                auto pad_hi = o_layout.data_padding.upper_size();
-                pad_hi.feature[0] = i_layout.size.feature[0] % 16;
-                r_node.merge_output_padding(padding{pad_lo.sizes(), pad_hi.sizes()});
+                pad_hi.feature[0] += 16 - i_layout.size.feature[0] % 16;
             }
+
+            r_node.merge_output_padding(padding{pad_lo.sizes(), pad_hi.sizes()});
             continue;
         }
 
diff --git a/inference-engine/thirdparty/clDNN/src/reorder.cpp b/inference-engine/thirdparty/clDNN/src/reorder.cpp
index 0f74654..857d76d 100644
--- a/inference-engine/thirdparty/clDNN/src/reorder.cpp
+++ b/inference-engine/thirdparty/clDNN/src/reorder.cpp
@@ -200,10 +200,8 @@ std::string reorder_inst::to_string(reorder_node const& node) {
 
 reorder_inst::typed_primitive_inst(network_impl& network, reorder_node const& node)
     : parent(network, node, !node.can_be_optimized()) {
-    if (node.can_be_optimized()) {
-        build_deps();
+    if (node.can_be_optimized())
         reuse_input();
-    }
 
     auto input_layout = node.input().get_output_layout();
     auto output_layout = node.get_output_layout();
@@ -241,13 +239,14 @@ void reorder_inst::on_execute() {
 }
 
 void reorder_inst::reuse_input() {
-    if (!node.can_be_optimized())
+    if (static_cast<bool>(_output) && _network.get_engine().is_the_same_buffer(output_memory(), input_memory()))
         return;
 
+    build_deps();
+
     if (node.requires_reinterpret()) {
-        if (!_output || !_network.get_engine().is_the_same_buffer(output_memory(), input_memory()))
-            _output = _network.get_engine().reinterpret_buffer(input_memory(), node.get_output_layout());
-    } else if (!_output) {
+        _output = _network.get_engine().reinterpret_buffer(input_memory(), node.get_output_layout());
+    } else {
         _output = (memory_impl::ptr) &input_memory();
     }
 }
diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/reorder_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/reorder_gpu_test.cpp
index 6926224..8db61f2 100644
--- a/inference-engine/thirdparty/clDNN/tests/test_cases/reorder_gpu_test.cpp
+++ b/inference-engine/thirdparty/clDNN/tests/test_cases/reorder_gpu_test.cpp
@@ -1782,9 +1782,10 @@ TEST(reorder_gpu_f32, b_fs_yx_fsv16_to_bfyx_opt_allowed)
 {
     const auto& engine = get_test_engine();
 
-    auto input = memory::allocate(engine, { data_types::f32, format::b_fs_yx_fsv16, { 1, 8, 1, 1 } });
+    auto input = memory::allocate(engine, { data_types::f32, format::b_fs_yx_fsv16, { 2, 12, 1, 1 } });
 
-    set_values(input, { 0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f });
+    set_values(input, { 0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f, 12.f, 13.f, 14.f, 15.f,
+                        16.f, 17.f, 18.f, 19.f, 20.f, 21.f, 22.f, 23.f, 24.f, 25.f, 26.f, 27.f, 28.f, 29.f, 30.f, 31.f });
 
     const std::string reorder_name = "reorder_prim";
     topology topology(
@@ -1808,14 +1809,16 @@ TEST(reorder_gpu_f32, b_fs_yx_fsv16_to_bfyx_opt_allowed)
 
     auto output = outputs.begin()->second.get_memory();
 
-    float answers[16] = {
-            0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f,
+    float answers[24] = {
+            0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f,
+            16.f, 17.f, 18.f, 19.f, 20.f, 21.f, 22.f, 23.f, 24.f, 25.f, 26.f, 27.f,
     };
 
     auto output_ptr = output.pointer<float>();
+    ASSERT_EQ(output_ptr.size(), 24);
     for (size_t i = 0; i < output_ptr.size(); i++)
     {
-        EXPECT_FLOAT_EQ(answers[i], output_ptr[i]);
+        EXPECT_FLOAT_EQ(answers[i], output_ptr[i]) << "i=" << i;
     }
 }
 
@@ -1858,7 +1861,75 @@ TEST(reorder_gpu_f32, b_fs_yx_fsv16_to_bfyx_opt_not_allowed)
     auto output_ptr = output.pointer<float>();
     for (int i = 0; i < 1; i++)
     {
-        EXPECT_FLOAT_EQ(answers[i], output_ptr[i]);
+        EXPECT_FLOAT_EQ(answers[i], output_ptr[i]) << "i=" << i;
+    }
+}
+
+// Checks that a b_fs_yx_fsv16 -> bfyx reorder fed by an input with batch and
+// feature padding is left out of the executed primitives, and that the
+// activation consuming it outputs exactly the expected unpadded values.
+TEST(reorder_gpu_f32, b_fs_yx_fsv16_to_bfyx_opt_padded)
+{
+    const auto& engine = get_test_engine();
+
+    auto input = memory::allocate(engine, { data_types::f32,
+                                            format::b_fs_yx_fsv16,
+                                            { 2, 4, 1, 1 },
+                                            padding({1, 16, 0, 0}, {1, 0, 0, 0}) });
+
+    // Raw buffer contents, padding included: padding rows are filled with -1,
+    // the rows holding 0..15 and 16..31 carry the values for batch 0 and 1.
+    std::vector<float> in_data = {
+        // b -1 (lower pad)
+        -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f,
+        -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f,
+        // b 0
+        -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f,
+        0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f, 12.f, 13.f, 14.f, 15.f,
+        // b 1
+        -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f,
+        16.f, 17.f, 18.f, 19.f, 20.f, 21.f, 22.f, 23.f, 24.f, 25.f, 26.f, 27.f, 28.f, 29.f, 30.f, 31.f,
+        // b +1 (upper pad)
+        -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f,
+        -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f,
+    };
+
+    set_values(input, in_data);
+
+    const std::string reorder_name = "reorder_prim";
+    // The abs activation consumes the reorder and is expected to be the sole network output.
+    topology topology(
+        input_layout("input", input.get_layout()),
+        reorder(reorder_name, "input", format::bfyx, data_types::f32),
+        activation("activation", reorder_name, activation_func::abs));
+
+    build_options bo;
+    bo.set_option(build_option::optimize_data(true));
+    network network(engine, topology, bo);
+    network.set_input_data("input", input);
+
+    auto outputs = network.execute();
+
+    auto executed_prims = network.get_executed_primitives();
+
+    // The optimized-out reorder must not be reported as executed.
+    EXPECT_TRUE(executed_prims.find(reorder_name) == executed_prims.end());
+    // ASSERT rather than EXPECT: outputs.begin() is dereferenced below.
+    ASSERT_EQ(outputs.size(), size_t(1));
+    EXPECT_EQ(outputs.begin()->first, "activation");
+
+    auto output = outputs.begin()->second.get_memory();
+
+    // Only the first four values of each batch's data row are expected.
+    float answers[8] = {
+            0.f, 1.f, 2.f, 3.f,
+            16.f, 17.f, 18.f, 19.f,
+    };
+
+    auto output_ptr = output.pointer<float>();
+    ASSERT_EQ(output_ptr.size(), size_t(8));
+    for (size_t i = 0; i < output_ptr.size(); i++) {
+        EXPECT_FLOAT_EQ(answers[i], output_ptr[i]) << "i=" << i;
     }
 }
 
-- 
2.7.4