From 0ec07b2c3bb8b1f326ce698b24b3d3707a0d8db9 Mon Sep 17 00:00:00 2001
From: Vladimir Paramuzov <vladimir.paramuzov@intel.com>
Date: Mon, 22 Jun 2020 17:09:39 +0300
Subject: [PATCH] [IE CLDNN] fsv4 to fsv16 conv (#1030)

---
 ...nvolution_kernel_mmad_bfyx_to_b_fs_yx_fsv32.cpp |   1 +
 .../convolution_gpu_mmad_bfyx_to_b_fs_yx_fsv32.cl  |  24 ++++
 .../graph_optimizer/remove_redundant_reorders.cpp  |   2 +-
 .../thirdparty/clDNN/src/layout_optimizer.cpp      |   7 +
 .../tests/test_cases/convolution_gpu_test.cpp      | 144 ++++++++++++++++++++-
 5 files changed, 176 insertions(+), 2 deletions(-)

diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_bfyx_to_b_fs_yx_fsv32.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_bfyx_to_b_fs_yx_fsv32.cpp
index 792dc48..81c6869 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_bfyx_to_b_fs_yx_fsv32.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_bfyx_to_b_fs_yx_fsv32.cpp
@@ -32,6 +32,7 @@ ParamsKey ConvolutionKernel_mmad_bfyx_to_b_fs_yx_fsv32::GetSupportedKey() const
     k.EnableOutputDataType(Datatype::F32);
     k.EnableInputWeightsType(WeightsType::INT8);
     k.EnableInputLayout(DataLayout::bfyx);
+    k.EnableInputLayout(DataLayout::b_fs_yx_fsv4);
     k.EnableOutputLayout(DataLayout::b_fs_yx_fsv32);
     k.EnableOutputLayout(DataLayout::b_fs_yx_fsv16);
     k.EnableTensorOffset();
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_bfyx_to_b_fs_yx_fsv32.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_bfyx_to_b_fs_yx_fsv32.cl
index b30b27a..9f21243 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_bfyx_to_b_fs_yx_fsv32.cl
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_bfyx_to_b_fs_yx_fsv32.cl
@@ -114,7 +114,19 @@ KERNEL(convolution_mmad_bfyx_to_b_fs_yx_fsv32)(
 #if ASYMMETRIC_WEIGHTS_QUANTIZATION
     ACCUMULATOR_TYPE_VEC acc_assym_weights = 0;
 #endif
+
+#if INPUT0_LAYOUT_BFYX
     const int input_offset = b*INPUT0_BATCH_PITCH + INPUT0_OFFSET + input_y * INPUT0_Y_PITCH;
+#elif INPUT0_LAYOUT_B_FS_YX_FSV4
+    const int fsv = 4;
+    const int input_x_pitch = fsv;
+    const int input_y_pitch = input_x_pitch * (INPUT0_PAD_BEFORE_SIZE_X + INPUT0_SIZE_X + INPUT0_PAD_AFTER_SIZE_X);
+    const int input_fs_pitch = input_y_pitch * (INPUT0_PAD_BEFORE_SIZE_Y + INPUT0_SIZE_Y + INPUT0_PAD_AFTER_SIZE_Y);
+    const int input_total_f_size = INPUT0_PAD_BEFORE_FEATURE_NUM + INPUT0_FEATURE_NUM + INPUT0_PAD_AFTER_FEATURE_NUM;
+    const int input_b_pitch = input_fs_pitch * ((input_total_f_size + fsv - 1) / fsv);
+    const int input_offset = b * input_b_pitch + input_y * input_y_pitch;
+#endif
+
     int filter_idx = fg * FILTER_SIZE_X * FILTER_SIZE_Y * ISV * OSV;
 #if ASYMMETRIC_WEIGHTS_QUANTIZATION
     char4 multiplier;
@@ -156,6 +168,7 @@ KERNEL(convolution_mmad_bfyx_to_b_fs_yx_fsv32)(
                 bool x_cross_fm = x_chunk + lid < 0 || x_chunk + lid >= INPUT0_SIZE_X;
 
                 if (!x_cross_fm) {
+                #if INPUT0_LAYOUT_BFYX
                     MAKE_VECTOR_TYPE(INPUT0_TYPE, ISV) src = 0;
                     __attribute__((opencl_unroll_hint(INPUT0_FEATURE_NUM)))
                     for (int i = 0; i < INPUT0_FEATURE_NUM; i++) {
@@ -164,6 +177,11 @@ KERNEL(convolution_mmad_bfyx_to_b_fs_yx_fsv32)(
                                                     + (x_chunk + lid)* INPUT0_X_PITCH];
                     }
                     slm_block[c + lid] = AS_PACKED_IN_TYPE(src);
+                #elif INPUT0_LAYOUT_B_FS_YX_FSV4
+                    const __global uint* ptr = input + input_offset + kh * DILATION_SIZE_Y * input_y_pitch + (x_chunk + lid) * input_x_pitch;
+                    PACKED_IN_TYPE src = AS_PACKED_IN_TYPE(ptr[0]);
+                    slm_block[c + lid] = src;
+                #endif
                 } else {
 #if ASYMMETRIC_DATA_QUANTIZATION
                     slm_block[c + lid] = AS_PACKED_IN_TYPE(zp);
@@ -178,6 +196,7 @@ KERNEL(convolution_mmad_bfyx_to_b_fs_yx_fsv32)(
                 const int x_chunk = x_wg_start + LWS1*SLM_CHUNK_SIZE;
                 bool x_cross_fm = x_chunk + lid >= INPUT0_SIZE_X;
                 if (!x_cross_fm) {
+                #if INPUT0_LAYOUT_BFYX
                     MAKE_VECTOR_TYPE(INPUT0_TYPE, ISV) src = 0;
                     __attribute__((opencl_unroll_hint(INPUT0_FEATURE_NUM)))
                     for (int i = 0; i < INPUT0_FEATURE_NUM; i++) {
@@ -186,6 +205,11 @@ KERNEL(convolution_mmad_bfyx_to_b_fs_yx_fsv32)(
                                                     + (x_chunk + lid)* INPUT0_X_PITCH];
                     }
                     slm_block_tail[lid] = AS_PACKED_IN_TYPE(src);
+                #elif INPUT0_LAYOUT_B_FS_YX_FSV4
+                    const __global uint* ptr = input + input_offset + kh * DILATION_SIZE_Y * input_y_pitch + (x_chunk + lid) * input_x_pitch;
+                    PACKED_IN_TYPE src = AS_PACKED_IN_TYPE(ptr[0]);
+                    slm_block_tail[lid] = src;
+                #endif
                 } else {
 #if ASYMMETRIC_DATA_QUANTIZATION
                     slm_block_tail[lid] = AS_PACKED_IN_TYPE(zp);
diff --git a/inference-engine/thirdparty/clDNN/src/graph_optimizer/remove_redundant_reorders.cpp b/inference-engine/thirdparty/clDNN/src/graph_optimizer/remove_redundant_reorders.cpp
index ef1b478..1b588ee 100644
--- a/inference-engine/thirdparty/clDNN/src/graph_optimizer/remove_redundant_reorders.cpp
+++ b/inference-engine/thirdparty/clDNN/src/graph_optimizer/remove_redundant_reorders.cpp
@@ -84,7 +84,7 @@ void remove_redundant_reorders::run(program_impl& p) {
 
             auto output_padded = static_cast<bool>(output_layout.data_padding);
             auto can_omit_padding = (output_layout.format == format::b_fs_yx_fsv16 || output_layout.format == format::b_fs_yx_fsv32) &&
-                                    input.get_output_layout().format == format::bfyx;
+                                    (input.get_output_layout().format == format::bfyx || input.get_output_layout().format == format::b_fs_yx_fsv4);
 
             if (output_padded && !can_omit_padding) {
                 if (input.get_users().size() != 1)
diff --git a/inference-engine/thirdparty/clDNN/src/layout_optimizer.cpp b/inference-engine/thirdparty/clDNN/src/layout_optimizer.cpp
index 10d1389..1802fdc 100644
--- a/inference-engine/thirdparty/clDNN/src/layout_optimizer.cpp
+++ b/inference-engine/thirdparty/clDNN/src/layout_optimizer.cpp
@@ -214,6 +214,13 @@ bool layout_optimizer::can_fuse_reorder(program_node& prev, program_node& next,
         (fmt_next == format::bs_fs_yx_bsv16_fsv16 && next_output_layout.size.feature[0] % 16 == 0 && prev_output_layout.size.feature[0] == 3)))
         return true;
 
+    if (next.is_type<convolution>() &&
+        fmt_prev == format::b_fs_yx_fsv4 &&
+        ((fmt_next == format::b_fs_yx_fsv32 && (prev_output_layout.size.feature[0] == 3 || prev_output_layout.size.feature[0] == 4)) ||
+        (fmt_next == format::b_fs_yx_fsv16 && next_output_layout.size.feature[0] >= 16 &&
+        (prev_output_layout.size.feature[0] == 3 || (prev_output_layout.size.feature[0] == 4 && (prev_dt == data_types::u8 || prev_dt == data_types::i8))))))
+        return true;
+
     if (next.is_type<quantize>() && fmt_prev == format::bfyx && fmt_next == format::b_fs_yx_fsv16)
         return true;
 
diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/convolution_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/convolution_gpu_test.cpp
index 734e65d..2e9d23f 100644
--- a/inference-engine/thirdparty/clDNN/tests/test_cases/convolution_gpu_test.cpp
+++ b/inference-engine/thirdparty/clDNN/tests/test_cases/convolution_gpu_test.cpp
@@ -7838,7 +7838,7 @@ public:
         return "conv";
     }
 
-    void run_expect(const VVVVF<OutputT>& expected) {
+    virtual void run_expect(const VVVVF<OutputT>& expected) {
         auto engine = get_test_engine();
 
         auto topo = build_topology(engine);
@@ -8118,6 +8118,135 @@ static std::string to_string_convolution_all_params(const testing::TestParamInfo
 }
 
 template <typename InputT, typename WeightsT, typename OutputT>
+class convolution_random_test_fsv4_input : public convolution_random_test_base<InputT, WeightsT, OutputT> {
+    // Random convolution test whose network input is supplied in b_fs_yx_fsv4 and passed through a
+    // reorder to the tested input format (with the test's padding) before it reaches the convolution.
+public:
+    using parent = convolution_random_test_base<InputT, WeightsT, OutputT>;
+    // Builds input(fsv4) -> reorder -> [input zero-point sub] -> conv; weights zero-point sub and bias are optional.
+    topology build_topology(const cldnn::engine& engine) override {
+        auto input_lay = layout(this->input_type(), format::b_fs_yx_fsv4, this->input_size());
+        auto wei_lay = layout(this->weights_type(), format::bfyx, this->weights_size());
+        auto wei_mem = memory::allocate(engine, wei_lay);
+        auto wei_flat = flatten_4d(format::bfyx, this->_weights);
+        set_values(wei_mem, wei_flat);
+        layout reordered_layout = layout{this->input_type(), this->input_format(), this->input_size(), this->padding_size()};
+        auto topo = topology();
+        topo.add(input_layout("input", input_lay));
+        topo.add(reorder("input_reorder", "input", reordered_layout));
+        std::string input_id = "input_reorder";
+        if (this->has_input_zp()) {
+            auto input_zp_lay = layout(this->input_type(), format::bfyx, tensor(feature(this->input_features())));
+            auto input_zp_mem = memory::allocate(engine, input_zp_lay);
+            set_values(input_zp_mem, this->_input_zp);
+            topo.add(data("input_zp", input_zp_mem));
+            topo.add(eltwise("input_asymm", { "input_reorder", "input_zp" }, eltwise_mode::sub));
+            input_id = "input_asymm";
+        }
+        topo.add(data("weights", wei_mem));
+        std::string weights_id = "weights";
+        if (this->has_weights_zp()) {
+            auto weights_zp_lay = layout(this->weights_type(), format::bfyx, tensor(batch(this->output_features())));
+            auto weights_zp_mem = memory::allocate(engine, weights_zp_lay);
+            set_values(weights_zp_mem, this->_weights_zp);
+            topo.add(data("weights_zp", weights_zp_mem));
+            topo.add(eltwise("weights_asymm", { "weights", "weights_zp" }, eltwise_mode::sub));
+            weights_id = "weights_asymm";
+        }
+        if (!this->has_bias()) {
+            auto conv_prim = convolution(
+                "conv",
+                input_id,
+                { weights_id },
+                static_cast<uint32_t>(this->groups()),
+                tensor(batch(0), feature(0), spatial(this->_stride_x, this->_stride_y)),
+                tensor(batch(0), feature(0), spatial(this->_offset_x, this->_offset_y)),
+                tensor(batch(0), feature(0), spatial(this->_dilation_x, this->_dilation_y)));
+            conv_prim.output_data_type = this->output_type();
+            topo.add(conv_prim);
+        } else {
+            auto bias_lay = layout(this->output_type(), format::bfyx, tensor(feature(this->output_features())));
+            auto bias_mem = memory::allocate(engine, bias_lay);
+            set_values(bias_mem, this->_bias);
+            topo.add(data("bias", bias_mem));
+            auto conv_prim = convolution(
+                "conv",
+                input_id,
+                { weights_id },
+                { "bias" },
+                static_cast<uint32_t>(this->groups()),
+                tensor(batch(0), feature(0), spatial(this->_stride_x, this->_stride_y)),
+                tensor(batch(0), feature(0), spatial(this->_offset_x, this->_offset_y)),
+                tensor(batch(0), feature(0), spatial(this->_dilation_x, this->_dilation_y)));
+            conv_prim.output_data_type = this->output_type();
+            topo.add(conv_prim);
+        }
+
+        return topo;
+    }
+    // Builds the program with optimize_data enabled, runs it on fsv4-ordered input and checks every output element against expected[b][f][y][x].
+    void run_expect(const VVVVF<OutputT>& expected) override {
+        auto engine = get_test_engine();
+        auto topo = this->build_topology(engine);
+        auto build_opts = build_options(
+            build_option::optimize_data(true),
+            build_option::force_implementations({ {"conv", { this->input_format(), ""}} })
+        );
+        auto prog = program(engine, topo, build_opts);
+        auto net = network(prog, 0);
+
+        auto input_lay = layout(this->input_type(), format::b_fs_yx_fsv4, this->input_size());
+        auto input_mem = memory::allocate(engine, input_lay);
+        // Place each reference value at the offset the fsv4 layout reports; slots never written keep the zero fill.
+        std::vector<InputT> input_flat(input_lay.get_linear_size(), static_cast<InputT>(0));
+        for (size_t bi = 0; bi < this->batch_num(); ++bi)
+            for (size_t fi = 0; fi < this->input_features(); ++fi)
+                for (size_t yi = 0; yi < this->input_y(); ++yi)
+                    for (size_t xi = 0; xi < this->input_x(); ++xi) {
+                        tensor coords = tensor(batch(bi), feature(fi), spatial(xi, yi, 0, 0));
+                        size_t offset = input_lay.get_linear_offset(coords);
+                        input_flat[offset] = this->_input[bi][fi][yi][xi];
+                    }
+        set_values(input_mem, input_flat);
+
+        net.set_input_data("input", input_mem);
+        auto result = net.execute();
+        auto out_mem = result.at(this->output_primitive_id()).get_memory();
+        auto out_lay = out_mem.get_layout();
+        auto out_ptr = out_mem.cldnn::memory::template pointer<OutputT>();
+
+        std::stringstream description;
+        for (const auto& i : net.get_primitives_info()) {
+            // Keep the selected kernel name for the failure message instead of printing it on every run.
+            if (i.original_id == "conv") {
+                description << "  kernel: " << i.kernel_id << std::endl;
+            }
+        }
+        description << "  executed: ";
+        for (const auto& e : net.get_executed_primitive_ids()) {
+            description << e << ", ";
+        }
+
+        ASSERT_EQ(out_lay.data_type, this->output_type());
+        ASSERT_EQ(out_lay.size.batch[0], expected.size());
+        ASSERT_EQ(out_lay.size.feature[0], expected[0].size());
+        ASSERT_EQ(out_lay.size.spatial[1], expected[0][0].size());
+        ASSERT_EQ(out_lay.size.spatial[0], expected[0][0][0].size());
+
+        for (size_t bi = 0; bi < this->batch_num(); ++bi)
+            for (size_t fi = 0; fi < this->output_features(); ++fi)
+                for (size_t yi = 0; yi < expected[0][0].size(); ++yi)
+                    for (size_t xi = 0; xi < expected[0][0][0].size(); ++xi) {
+                        tensor coords = tensor(batch(bi), feature(fi), spatial(xi, yi, 0, 0));
+                        size_t offset = out_lay.get_linear_offset(coords);
+                        ASSERT_EQ(out_ptr[offset], expected[bi][fi][yi][xi])
+                            << "at b= " << bi << ", f= " << fi << ", y= " << yi << ", x= " << xi << std::endl
+                            << description.str();
+                    }
+    }
+};
+
+template <typename InputT, typename WeightsT, typename OutputT>
 class convolution_scale_random_test : public convolution_random_test_base<InputT, WeightsT, OutputT> {
 public:
     using parent = convolution_random_test_base<InputT, WeightsT, OutputT>;
@@ -8172,6 +8301,9 @@ class convolution_random_smoke_test : public testing::TestWithParam<convolution_
 using convolution_random_test_s8s8f32 = convolution_random_test_base<int8_t, int8_t, float>;
 using convolution_random_test_u8s8f32 = convolution_random_test_base<uint8_t, int8_t, float>;
 
+using convolution_random_test_fsv4_input_s8s8f32 = convolution_random_test_fsv4_input<int8_t, int8_t, float>;  // InputT = int8_t, WeightsT = int8_t, OutputT = float
+using convolution_random_test_fsv4_input_u8s8f32 = convolution_random_test_fsv4_input<uint8_t, int8_t, float>;  // InputT = uint8_t, WeightsT = int8_t, OutputT = float
+
 using convolution_scale_random_test_s8s8f32 = convolution_scale_random_test<int8_t, int8_t, float>;
 using convolution_scale_random_test_u8s8f32 = convolution_scale_random_test<uint8_t, int8_t, float>;
 
@@ -8265,6 +8397,16 @@ TEST_P(convolution_random_smoke_test, u8s8f32_scale) {
     ASSERT_NO_FATAL_FAILURE(test.run_random(GetParam()));
 }
 
+TEST_P(convolution_random_smoke_test, s8s8f32_fsv4_input) {  // random smoke case through the fsv4-input fixture: int8_t input, int8_t weights, float output
+    convolution_random_test_fsv4_input_s8s8f32 test;
+    ASSERT_NO_FATAL_FAILURE(test.run_random(GetParam()));  // stop this test if run_random reported a fatal failure
+}
+
+TEST_P(convolution_random_smoke_test, u8s8f32_fsv4_input) {  // random smoke case through the fsv4-input fixture: uint8_t input, int8_t weights, float output
+    convolution_random_test_fsv4_input_u8s8f32 test;
+    ASSERT_NO_FATAL_FAILURE(test.run_random(GetParam()));  // stop this test if run_random reported a fatal failure
+}
+
 INSTANTIATE_TEST_CASE_P(
     basic,
     convolution_random_smoke_test,
-- 
2.7.4