From b3e69f1f7d17b3ceb3adcace09cb710f92dd8593 Mon Sep 17 00:00:00 2001
From: Konrad Dobros <konrad.dobros@intel.com>
Date: Fri, 4 Sep 2020 21:22:46 +0200
Subject: [PATCH] [IE CLDNN] Enable bfyx fully_connected for large batches
 (#1996)

This change enables fully_connected to use bfyx format for large
batches, when optimized bf_tiled kernel best performance.
---
 .../thirdparty/clDNN/src/fully_connected.cpp       | 65 +++++++++++++++-------
 .../clDNN/tests/test_cases/permute_gpu_test.cpp    |  6 +-
 2 files changed, 49 insertions(+), 22 deletions(-)

diff --git a/inference-engine/thirdparty/clDNN/src/fully_connected.cpp b/inference-engine/thirdparty/clDNN/src/fully_connected.cpp
index ec5a9d4..040d631 100644
--- a/inference-engine/thirdparty/clDNN/src/fully_connected.cpp
+++ b/inference-engine/thirdparty/clDNN/src/fully_connected.cpp
@@ -1,5 +1,5 @@
 /*
-// Copyright (c) 2016-2019 Intel Corporation
+// Copyright (c) 2016-2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -50,6 +50,47 @@ bool is_batch_after_spatial(const std::string order) {
     }
     return false;
 }
+
+format::type get_preferred_format(const fully_connected_node& node) {
+    auto input_layout = node.input().get_output_layout();
+
+    if (data_type_traits::is_floating_point(input_layout.data_type) &&
+        (is_batch_after_spatial(input_layout.format.order()) ||
+         input_layout.format == format::bs_x_bsv16 ||
+         input_layout.format == format::bs_xs_xsv8_bsv8))
+        return format::yxfb;
+
+    bool no_spatial_padding = true;
+    for (auto pad : input_layout.data_padding.lower_size().spatial)
+        no_spatial_padding &= pad == 0;
+    for (auto pad : input_layout.data_padding.upper_size().spatial)
+        no_spatial_padding &= pad == 0;
+
+    if (input_layout.data_type == data_types::f32 &&
+        input_layout.format == format::bfyx &&
+        no_spatial_padding &&
+        input_layout.size.batch[0] != 8)
+        return format::bfyx;
+
+    auto input_pitches = input_layout.get_pitches();
+    if (input_layout.data_type == data_types::f16 &&
+        input_layout.format == format::bfyx &&
+        no_spatial_padding &&
+        input_pitches.batch[0] % 2 == 0 &&
+        input_layout.size.batch[0] != 16)
+        return format::bfyx;
+
+    // this condition tests whether our input is batch>1 in bfyx format, if yes there will be
+    // extra reorder between input and this fc from bfyx to yxfb format (so
+    // "is_batch_after_spatial" should return true)
+    if (data_type_traits::is_floating_point(input_layout.data_type) &&
+        input_layout.format == format::bfyx &&
+        input_layout.size.batch[0] > 1)
+        return format::yxfb;
+
+    return format::bfyx;
+}
+
 }  // namespace
 
 layout fully_connected_inst::calc_output_layout(fully_connected_node const& node) {
@@ -65,24 +106,10 @@ layout fully_connected_inst::calc_output_layout(fully_connected_node const& node
         output_type = node.get_fused_output_layout().data_type;
     }
 
-    if (data_type_traits::is_floating_point(input_layout.data_type) &&
-        (is_batch_after_spatial(input_layout.format.order()) ||
-        (input_layout.format ==
-             format::bfyx &&  // this condition tests whether our input is batch>1 in bfyx format, if yes there will be
-         input_layout.size.batch[0] > 1) ||  // extra reorder between input and this fc from bfyx to yxfb format (so
-                                             // "is_batch_after_spatial" should return true)
-        input_layout.format == format::bs_x_bsv16 ||
-        input_layout.format == format::bs_xs_xsv8_bsv8)) {
-        auto result = layout(output_type,
-                             format::yxfb,
-                             tensor(input_layout.size.batch[0], weights_layout.size.batch[0], 1, 1));
-        return result;
-    } else {
-        auto result = layout(output_type,
-                             format::bfyx,
-                             tensor(input_layout.size.batch[0], weights_layout.size.batch[0], 1, 1));
-        return result;
-    }
+    auto output_size = tensor(input_layout.size.batch[0], weights_layout.size.batch[0], 1, 1);
+    format output_format = get_preferred_format(node);
+
+    return layout(output_type, output_format, output_size);
 }
 
 std::string fully_connected_inst::to_string(fully_connected_node const& node) {
diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/permute_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/permute_gpu_test.cpp
index 31fbc03..d60d6ea 100644
--- a/inference-engine/thirdparty/clDNN/tests/test_cases/permute_gpu_test.cpp
+++ b/inference-engine/thirdparty/clDNN/tests/test_cases/permute_gpu_test.cpp
@@ -1,5 +1,5 @@
 /*
-// Copyright (c) 2016-2019 Intel Corporation
+// Copyright (c) 2016-2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -550,8 +550,8 @@ TEST(fc_permute_crop_gpu, basic_0)
 
     const auto& engine = get_test_engine();
 
-    auto input_mem = memory::allocate(engine, { data_types::f32, format::bfyx,{ 5, 11264, 1, 1 } });
-    auto weights_mem = memory::allocate(engine, { data_types::f32, format::bfyx,{ 512, 11264, 1, 1 } });
+    auto input_mem = memory::allocate(engine, { data_types::f32, format::yxfb,{ 5, 11264, 1, 1 } });
+    auto weights_mem = memory::allocate(engine, { data_types::f32, format::yxio,{ 512, 11264, 1, 1 } });
     auto bias_mem = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 512, 1 } });
 
     topology topology(
-- 
2.7.4