[IE CLDNN] Permute fused ops support (#642)
authorLukasz Debski <Lukasz.Debski@intel.com>
Thu, 4 Jun 2020 14:01:21 +0000 (16:01 +0200)
committerGitHub <noreply@github.com>
Thu, 4 Jun 2020 14:01:21 +0000 (17:01 +0300)
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/permute/permute_kernel_ref.cpp
inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/permute_ref.cl
inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_primitive_fusing.cpp
inference-engine/thirdparty/clDNN/src/permute.cpp
inference-engine/thirdparty/clDNN/tests/test_cases/fusings_gpu_test.cpp

index 24bb006..8c70ac8 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (c) 2016-2019 Intel Corporation
+// Copyright (c) 2016-2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -32,6 +32,7 @@ ParamsKey PermuteKernelRef::GetSupportedKey() const {
     k.EnableOutputDataType(Datatype::UINT8);
     k.EnableOutputDataType(Datatype::INT32);
     k.EnableOutputDataType(Datatype::INT64);
+    k.EnableDifferentTypes();
     k.EnableAllInputLayout();
     k.EnableAllOutputLayout();
     k.EnableTensorOffset();
@@ -66,6 +67,20 @@ JitConstants PermuteKernelRef::GetJitConstants(const permute_params& params) con
     jit.AddConstant(MakeJitConstant("IN_IDX", "INPUT0_GET_INDEX(" + input_order + ")"));
     jit.AddConstant(MakeJitConstant("OUT_IDX", "OUTPUT_GET_INDEX(" + output_order + ")"));
 
+    if (!params.fused_ops.empty()) {
+        if (out_idx.size() == 4)
+            std::swap(out_idx[2], out_idx[3]);
+        else if (out_idx.size() == 5)
+            std::swap(out_idx[2], out_idx[4]);
+        else if (out_idx.size() == 6) {
+            std::swap(out_idx[2], out_idx[5]);
+            std::swap(out_idx[3], out_idx[4]);
+        }
+
+        FusedOpsConfiguration conf = {"", out_idx, "input_var", GetUnitType(params), 1};
+        jit.Merge(MakeFusedOpsJitConstants(params, {conf}));
+    }
+
     return jit;
 }
 
@@ -85,7 +100,7 @@ KernelsData PermuteKernelRef::GetKernelsData(const Params& params, const optiona
     kernel.workGroups.global = {in.X().v, in.Y().v * in.Z().v * in.W().v, in.Feature().v * in.Batch().v};
     kernel.workGroups.local = GetOptimalLocalWorkGroupSizes(kernel.workGroups.global, params.engineInfo);
     kernel.kernelString = GetKernelString(kernelName, jit, entry_point, params.engineInfo, DEFAULT);
-    kernel.arguments = GetArgsDesc(1, false, false);
+    kernel.arguments = GetArgsDesc(1, false, false, false, false, GetFusedPrimitiveInputsCount(params));
 
     kd.estimatedTime = DONT_USE_IF_HAVE_SOMETHING_ELSE;
 
index dbbb321..58eb1da 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2019 Intel Corporation
+// Copyright (c) 2017-2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 
 #include "include/include_all.cl"
 
-KERNEL (permute_ref)(const __global UNIT_TYPE* input, __global UNIT_TYPE* output)
+KERNEL (permute_ref)(
+    const __global INPUT0_TYPE* input,
+    __global OUTPUT_TYPE* output
+#if HAS_FUSED_OPS_DECLS
+    , FUSED_OPS_DECLS
+#endif
+    )
 {
     //gws(x, y * z * w, b*f)
     const uint gid_0 = get_global_id(1);
@@ -22,16 +28,23 @@ KERNEL (permute_ref)(const __global UNIT_TYPE* input, __global UNIT_TYPE* output
     const uint y = gid_0;
 #elif INPUT0_DIMS == 5 && OUTPUT0_DIMS == 5
     const uint z = gid_0 / INPUT0_SIZE_Y;
-    const uint y = gid_0 % INPUT0_SIZE_Y;
+    const uint y = gid_0 % INPUT0_SIZE_Y;   
 #else
     const uint w = gid_0 / (INPUT0_SIZE_Y * INPUT0_SIZE_Z) % INPUT0_SIZE_W;
     const uint z = gid_0 / INPUT0_SIZE_Y % INPUT0_SIZE_Z;
     const uint y = gid_0 % INPUT0_SIZE_Y;
 #endif
-
+    
     const uint x = get_global_id(0);
     const uint f = (uint)get_global_id(2) % INPUT0_FEATURE_NUM;
     const uint b = (uint)get_global_id(2) / INPUT0_FEATURE_NUM;
+    
+    INPUT0_TYPE input_var = input[IN_IDX];
 
+#if HAS_FUSED_OPS
+    FUSED_OPS;
+    output[OUT_IDX] = FUSED_OPS_RESULT;
+#else
     output[OUT_IDX] = ACTIVATION(input[IN_IDX], ACTIVATION_PARAMS);
+#endif
 }
index ec5db1e..0fc778d 100644 (file)
@@ -1,5 +1,5 @@
 /*
-// Copyright (c) 2018-2019 Intel Corporation
+// Copyright (c) 2018-2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -360,6 +360,8 @@ void prepare_primitive_fusing::fuse_simple_primitives(program_impl &p) {
             should_fuse |= input_data.is_type<mvn>();
 
             should_fuse |= input_data.is_type<deconvolution>();
+            
+            should_fuse |= input_data.is_type<permute>();
 
             should_fuse |= input_data.is_type<activation>();
 
@@ -395,6 +397,8 @@ void prepare_primitive_fusing::fuse_simple_primitives(program_impl &p) {
             should_fuse |= input_data.is_type<mvn>() && mvn_supports_fusings(input_data.as<mvn>());
 
             should_fuse |= input_data.is_type<deconvolution>();
+            
+            should_fuse |= input_data.is_type<permute>();
 
             should_fuse |= input_data.is_type<activation>();
 
@@ -467,6 +471,9 @@ void prepare_primitive_fusing::fuse_simple_primitives(program_impl &p) {
                             input_data.get_dependency(0).get_output_layout().data_type == data_types::i8 ||
                             input_data.get_output_layout().data_type == out_layout.data_type);
 
+            should_fuse |= input_data.is_type<permute>() &&
+                           quantize_node.get_scale_shift_opt();
+
             if (!should_fuse)
                 return;
 
@@ -487,11 +494,11 @@ void prepare_primitive_fusing::fuse_simple_primitives(program_impl &p) {
 
             bool can_fuse_parent1 = (parent1->is_type<convolution>() && conv_supports_fusings(parent1->as<convolution>())) ||
                                     (parent1->is_type<mvn>() && mvn_supports_fusings(parent1->as<mvn>())) ||
-                                    (parent1->is_type<deconvolution>());
+                                    (parent1->is_type<deconvolution>()) || (parent1->is_type<permute>());
 
             bool can_fuse_parent2 = (parent2->is_type<convolution>() && conv_supports_fusings(parent2->as<convolution>())) ||
                                     (parent2->is_type<mvn>() && mvn_supports_fusings(parent2->as<mvn>())) ||
-                                    (parent2->is_type<deconvolution>());
+                                    (parent2->is_type<deconvolution>()) || (parent2->is_type<permute>());
 
             std::vector<bool> can_fuse_parents = { can_fuse_parent1, can_fuse_parent2 };
 
index c40a713..8f7e708 100644 (file)
@@ -1,5 +1,5 @@
 /*
-// Copyright (c) 2016-2019 Intel Corporation
+// Copyright (c) 2016-2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -45,6 +45,10 @@ layout permute_inst::calc_output_layout(permute_node const& node) {
     auto input_size = tensor(output_sizes);
     auto op = node.get_primitive()->output_padding;
 
+    if (node.has_fused_primitives()) {
+        input_layout.data_type = node.get_fused_output_layout().data_type;
+    }
+
     return layout(input_layout.data_type, input_layout.format, input_size, op);
 }
 
index 1d81204..86dd962 100644 (file)
@@ -1,5 +1,5 @@
 /*
-// Copyright (c) 2020 Intel Corporation
+// Copyright (c) 2019-2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -32,6 +32,7 @@
 #include "api/resample.hpp"
 #include "api/mvn.hpp"
 #include "api/deconvolution.hpp"
+#include "api/permute.hpp"
 
 #include "test_utils/test_utils.h"
 
@@ -4044,3 +4045,386 @@ INSTANTIATE_TEST_CASE_P(DISABLED_fusings_gpu,
                             pooling_test_params{CASE_POOLING_I8_3, 2, 4, pooling_mode::max, "pooling_gpu_fs_bs_yx_bsv4_fsv32"},
                             pooling_test_params{CASE_POOLING_I8_3, 2, 4, pooling_mode::average, "pooling_gpu_fs_bs_yx_bsv4_fsv32"},
                      }), );
+
+/* ------------------------------------------------------------------------------------------------------------ */
+/* ---------------------------------------- PERMUTE FUSE cases -------------------------------------------------- */
+/* ------------------------------------------------------------------------------------------------------------ */
+struct permute_params {
+    tensor in_shape;
+    tensor out_shape;
+    std::vector<uint16_t> permute_order;
+    tensor eltw_in_shape;
+    data_types data_type;
+    format input_format;
+    data_types default_type;
+    format default_format;
+    size_t expected_fused_primitives;
+    size_t expected_not_fused_primitives;
+};
+
+#define CASE_PERMUTE_F32_0 {1, 16, 2, 2}, {1, 16, 2, 2}, {0, 1, 2, 3}, tensor{0}, data_types::f32, format::bfyx, data_types::f32, format::bfyx
+#define CASE_PERMUTE_F32_1 {1, 15, 16, 16}, {1, 15, 16, 16}, {0, 1, 2, 3}, tensor{0}, data_types::f32, format::bfyx, data_types::f32, format::bfyx
+#define CASE_PERMUTE_F32_2 {1, 8, 16, 16}, {16, 16, 8, 1}, {3, 2, 1, 0}, tensor{0}, data_types::f32, format::bfyx, data_types::f32, format::bfyx
+#define CASE_PERMUTE_F32_3 {1, 1, 3, 4}, {1, 3, 4, 1}, {1, 2, 3, 0}, tensor{0}, data_types::f32, format::bfyx, data_types::f32, format::bfyx
+#define CASE_PERMUTE_F32_4 {2, 16, 16, 16}, {2, 16, 16, 16}, {0, 1, 2, 3}, tensor{0}, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
+#define CASE_PERMUTE_F32_5 {1, 32, 4, 5}, {32, 4, 5, 1}, {1, 2, 3, 0}, tensor{0}, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
+#define CASE_PERMUTE_F32_6 {1, 16, 4, 5}, {5, 16, 4, 1}, {3, 1, 2, 0}, tensor{0}, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
+#define CASE_PERMUTE_F32_7 {1, 16, 1, 1}, {1, 1, 1, 16}, {2, 3, 0, 1}, tensor{0}, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
+
+#define CASE_PERMUTE_F16_0 {1, 16, 4, 5}, {1, 16, 4, 5}, {0, 1, 2, 3}, tensor{0}, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
+#define CASE_PERMUTE_F16_1 {2, 16, 4, 5}, {16, 4, 5, 2}, {1, 2, 3, 0}, tensor{0}, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
+#define CASE_PERMUTE_F16_2 {1, 32, 2, 3}, {2, 3, 32, 1}, {2, 3, 1, 0}, tensor{0}, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
+#define CASE_PERMUTE_F16_3 {3, 16, 1, 1}, {1, 1, 16, 3}, {3, 2, 1, 0}, tensor{0}, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
+#define CASE_PERMUTE_F16_4 {2, 15, 4, 5}, {4, 2, 5, 15}, {2, 0, 3, 1}, tensor{0}, data_types::f16, format::bfyx, data_types::f32, format::bfyx
+#define CASE_PERMUTE_F16_5 {1, 15, 1, 2}, {15, 2, 1, 1}, {1, 3, 2, 0}, tensor{0}, data_types::f16, format::bfyx, data_types::f32, format::bfyx
+#define CASE_PERMUTE_F16_6 {1, 15, 4, 4}, {4, 4, 1, 15}, {2, 3, 0, 1}, tensor{0}, data_types::f16, format::bfyx, data_types::f32, format::bfyx
+
+#define CASE_PERMUTE_S8_0 {1, 15, 4, 5}, {1, 15, 4, 5}, {0, 1, 2, 3}, tensor{0}, data_types::i8, format::bfyx, data_types::f32, format::bfyx
+#define CASE_PERMUTE_S8_1 {1, 15, 4, 5}, {5, 4, 15, 1}, {3, 2, 1, 0}, tensor{0}, data_types::i8, format::bfyx, data_types::f32, format::bfyx
+#define CASE_PERMUTE_S8_2 {1, 16, 1, 2}, {1, 1, 16, 2}, {2, 0, 1, 3}, tensor{0}, data_types::i8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
+#define CASE_PERMUTE_S8_3 {1, 16, 2, 2}, {2, 2, 16, 1}, {2, 3, 1, 0}, tensor{0}, data_types::i8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
+#define CASE_PERMUTE_U8_0 {1, 15, 4, 5}, {15, 5, 1, 4}, {1, 3, 0, 2}, tensor{0}, data_types::u8, format::bfyx, data_types::f32, format::bfyx
+#define CASE_PERMUTE_U8_1 {1, 15, 16, 16}, {15, 16, 1, 16}, {1, 2, 0, 3}, tensor{0}, data_types::u8, format::bfyx, data_types::f32, format::bfyx
+#define CASE_PERMUTE_U8_2 {1, 32, 5, 4}, {1, 32, 5, 4}, {0, 1, 2, 3}, tensor{0}, data_types::u8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
+#define CASE_PERMUTE_U8_3 {1, 16, 4, 5}, {5, 4, 16, 1}, {3, 2, 1, 0}, tensor{0}, data_types::u8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
+
+// 3d
+#define CASE_PERMUTE_F32_3D_0 {1, 15, 4, 4, 5}, {1, 15, 4, 4, 5}, {0, 1, 2, 3, 4}, tensor{0}, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx
+#define CASE_PERMUTE_F32_3D_1 {2, 15, 2, 3, 4}, {15, 2, 3, 4, 2}, {1, 2, 3, 4, 0}, tensor{0}, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx
+#define CASE_PERMUTE_F32_3D_2 {2, 16, 4, 4, 5}, {4, 2, 4, 5, 16}, {3, 0, 2, 4, 1}, tensor{0}, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx
+#define CASE_PERMUTE_F32_3D_3 {1, 32, 4, 2, 2}, {2, 2, 32, 1, 4}, {4, 3, 1, 0, 2}, tensor{0}, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx
+#define CASE_PERMUTE_F32_3D_4 {1, 16, 1, 1, 1}, {1, 1, 1, 16, 1}, {2, 4, 0, 1, 3}, tensor{0}, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx
+
+#define CASE_PERMUTE_F16_3D_0 {1, 15, 4, 4, 5}, {1, 15, 4, 4, 5}, {0, 1, 2, 3, 4}, tensor{0}, data_types::f16, format::bfzyx, data_types::f32, format::bfzyx
+#define CASE_PERMUTE_F16_3D_1 {2, 15, 4, 3, 4}, {4, 4, 2, 15, 3}, {2, 4, 0, 1, 3}, tensor{0}, data_types::f16, format::bfzyx, data_types::f32, format::bfzyx
+#define CASE_PERMUTE_F16_3D_2 {2, 16, 4, 4, 3}, {2, 4, 3, 16, 4}, {0, 3, 4, 1, 2}, tensor{0}, data_types::f16, format::bfzyx, data_types::f32, format::bfzyx
+#define CASE_PERMUTE_F16_3D_3 {1, 32, 4, 2, 1}, {2, 32, 4, 1, 1}, {3, 1, 2, 4, 0}, tensor{0}, data_types::f16, format::bfzyx, data_types::f32, format::bfzyx
+#define CASE_PERMUTE_F16_3D_4 {16, 16, 1, 1, 1},{1, 16, 1, 1, 16},{4, 0, 3, 2, 1}, tensor{0}, data_types::f16, format::bfzyx, data_types::f32, format::bfzyx
+
+#define CASE_PERMUTE_S8_3D_0 {1, 15, 4, 4, 5}, {1, 15, 4, 4, 5}, {0, 1, 2, 3, 4}, tensor{0}, data_types::i8, format::bfzyx, data_types::f32, format::bfzyx
+#define CASE_PERMUTE_S8_3D_1 {2, 15, 4, 3, 4}, {4, 4, 15, 2, 3}, {4, 2, 1, 0, 3}, tensor{0}, data_types::i8, format::bfzyx, data_types::f32, format::bfzyx
+#define CASE_PERMUTE_S8_3D_2 {2, 16, 4, 4, 3}, {2, 4, 3, 16, 4}, {0, 3, 4, 1, 2}, tensor{0}, data_types::i8, format::bfzyx, data_types::f32, format::bfzyx
+#define CASE_PERMUTE_S8_3D_3 {1, 32, 4, 2, 1}, {2, 32, 4, 1, 1}, {3, 1, 2, 4, 0}, tensor{0}, data_types::i8, format::bfzyx, data_types::f32, format::bfzyx
+#define CASE_PERMUTE_U8_3D_0 {16, 16, 1, 1, 1}, {1, 1, 16, 16, 1}, {2, 4, 0, 1, 3}, tensor{0}, data_types::u8, format::bfzyx, data_types::f32, format::bfzyx
+#define CASE_PERMUTE_U8_3D_1 {16, 16, 1, 1, 1}, {1, 1, 1, 16, 16}, {4, 3, 2, 1, 0}, tensor{0}, data_types::u8, format::bfzyx, data_types::f32, format::bfzyx
+#define CASE_PERMUTE_U8_3D_2 {2, 16, 4, 4, 3}, {4, 2, 4, 3, 16}, {3, 0, 2, 4, 1}, tensor{0}, data_types::u8, format::bfzyx, data_types::f32, format::bfzyx
+#define CASE_PERMUTE_U8_3D_3 {1, 32, 4, 2, 1}, {1, 2, 32, 1, 4}, {4, 3, 1, 0, 2}, tensor{0}, data_types::u8, format::bfzyx, data_types::f32, format::bfzyx
+
+class PermuteFusingTest : public ::BaseFusingTest<permute_params> {
+public:
+
+    void execute(permute_params& p) {
+        auto input_prim = get_mem(get_input_layout(p));
+        network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused);
+        network network_fused(this->engine, this->topology_fused, bo_fused);
+        network_fused.set_input_data("input", input_prim);
+        network_not_fused.set_input_data("input", input_prim);
+
+        compare(network_not_fused, network_fused, p);
+    }
+
+    layout get_input_layout(permute_params& p) {
+        return layout{ p.data_type, p.input_format, p.in_shape, padding{} };
+    }
+
+    layout get_per_channel_layout(permute_params& p) {
+        return layout{ p.default_type, p.default_format, tensor{1, p.out_shape.feature[0], 1, 1} };
+    }
+};
+
+class permute_activation_scale_eltwise: public PermuteFusingTest {};
+TEST_P(permute_activation_scale_eltwise, basic) {
+        auto p = GetParam();
+
+        create_topologies(
+            input_layout("input", get_input_layout(p)),
+            data("eltwise_data", get_mem(layout{ p.data_type, p.input_format, p.out_shape})),
+            data("scale_data", get_mem(get_per_channel_layout(p), 5e-1f)),
+            permute("permute", "input", p.permute_order),
+            scale("scale", "permute", "scale_data"),
+            activation("actv", "scale", activation_func::relu),
+            eltwise("eltwise", {"actv", "eltwise_data"}, eltwise_mode::sum, p.data_type),
+            reorder("reorder_bfyx", "eltwise", p.default_format, p.default_type)
+        );
+
+        tolerance = 1e-5f;
+        execute(p);
+}
+
+INSTANTIATE_TEST_CASE_P(fusings_gpu, permute_activation_scale_eltwise,
+                        ::testing::ValuesIn(std::vector<permute_params> {
+                            permute_params{CASE_PERMUTE_F32_0, 2, 5},
+                            permute_params{CASE_PERMUTE_F32_1, 2, 5},
+                            permute_params{CASE_PERMUTE_F32_2, 2, 5},
+                            permute_params{CASE_PERMUTE_F32_3, 2, 5},
+                            permute_params{CASE_PERMUTE_F32_4, 2, 5},
+                            permute_params{CASE_PERMUTE_F32_5, 2, 5},
+                            permute_params{CASE_PERMUTE_F32_6, 2, 5},
+                            permute_params{CASE_PERMUTE_F32_7, 2, 5},
+                            
+                            permute_params{CASE_PERMUTE_F16_0, 2, 5},
+                            permute_params{CASE_PERMUTE_F16_1, 2, 5},
+                            permute_params{CASE_PERMUTE_F16_2, 2, 5},
+                            permute_params{CASE_PERMUTE_F16_3, 2, 5},
+                            permute_params{CASE_PERMUTE_F16_4, 2, 5},
+                            permute_params{CASE_PERMUTE_F16_5, 2, 5},
+                            permute_params{CASE_PERMUTE_F16_6, 2, 5},
+                            
+                            permute_params{CASE_PERMUTE_S8_0, 2, 5},
+                            permute_params{CASE_PERMUTE_S8_1, 2, 5},
+                            permute_params{CASE_PERMUTE_S8_2, 2, 5},
+                            permute_params{CASE_PERMUTE_S8_3, 2, 5},
+                                                                 
+                            permute_params{CASE_PERMUTE_U8_0, 2, 5},
+                            permute_params{CASE_PERMUTE_U8_1, 2, 5},
+                            permute_params{CASE_PERMUTE_U8_2, 2, 5},
+                            permute_params{CASE_PERMUTE_U8_3, 2, 5},
+                            
+                            permute_params{CASE_PERMUTE_F32_3D_0, 2, 5},
+                            permute_params{CASE_PERMUTE_F32_3D_1, 2, 5},
+                            permute_params{CASE_PERMUTE_F32_3D_2, 2, 5},
+                            permute_params{CASE_PERMUTE_F32_3D_3, 2, 5},
+                            permute_params{CASE_PERMUTE_F32_3D_4, 2, 5},
+                                
+                            permute_params{CASE_PERMUTE_F16_3D_0, 2, 5},
+                            permute_params{CASE_PERMUTE_F16_3D_1, 2, 5},
+                            permute_params{CASE_PERMUTE_F16_3D_2, 2, 5},
+                            permute_params{CASE_PERMUTE_F16_3D_3, 2, 5},
+                            permute_params{CASE_PERMUTE_F16_3D_4, 2, 5},
+                            
+                            permute_params{CASE_PERMUTE_S8_3D_0, 2, 5},
+                            permute_params{CASE_PERMUTE_S8_3D_1, 2, 5},
+                            permute_params{CASE_PERMUTE_S8_3D_2, 2, 5},
+                            permute_params{CASE_PERMUTE_S8_3D_3, 2, 5},
+                                                                 
+                            permute_params{CASE_PERMUTE_U8_3D_0, 2, 5},
+                            permute_params{CASE_PERMUTE_U8_3D_1, 2, 5},
+                            permute_params{CASE_PERMUTE_U8_3D_2, 2, 5},
+                            permute_params{CASE_PERMUTE_U8_3D_3, 2, 5},
+                        }), );
+
+class permute_scale_eltwise_quant_u8: public PermuteFusingTest {};
+TEST_P(permute_scale_eltwise_quant_u8, vector_ops) {
+        auto p = GetParam();
+        create_topologies(
+        input_layout("input", get_input_layout(p)),
+        data("scale_data", get_mem(get_per_channel_layout(p))),
+        data("eltwise_data", get_mem(layout{ p.data_type, p.input_format, p.out_shape})),
+        data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)),
+        data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)),
+        data("out_lo", get_mem(get_single_element_layout(p), 0)),
+        data("out_hi", get_mem(get_single_element_layout(p), 255)),
+        permute("permute", "input", p.permute_order),
+        scale("scale1", "permute", "scale_data"),
+        eltwise("eltwise", "scale1", "eltwise_data", eltwise_mode::sum),
+        quantize("quant", "eltwise", "in_lo", "in_hi", "out_lo", "out_hi", 256, data_types::u8),
+        reorder("reorder_bfyx", "quant", p.default_format, p.default_type)
+        );
+
+    tolerance = 1.f;
+    execute(p);
+}
+
+INSTANTIATE_TEST_CASE_P(fusings_gpu, permute_scale_eltwise_quant_u8,
+                        ::testing::ValuesIn(std::vector<permute_params> {
+                            permute_params{CASE_PERMUTE_F32_0, 2, 5},
+                            permute_params{CASE_PERMUTE_F32_1, 2, 5},
+                            permute_params{CASE_PERMUTE_F32_2, 2, 5},
+                            permute_params{CASE_PERMUTE_F32_3, 2, 5},
+                            permute_params{CASE_PERMUTE_F32_4, 2, 5},
+                            permute_params{CASE_PERMUTE_F32_5, 2, 5},
+                            permute_params{CASE_PERMUTE_F32_6, 2, 5},
+                            permute_params{CASE_PERMUTE_F32_7, 2, 5},
+                                                                  
+                            permute_params{CASE_PERMUTE_F16_0, 2, 5},
+                            permute_params{CASE_PERMUTE_F16_1, 2, 5},
+                            permute_params{CASE_PERMUTE_F16_2, 2, 5},
+                            permute_params{CASE_PERMUTE_F16_3, 2, 5},
+                            permute_params{CASE_PERMUTE_F16_4, 2, 5},
+                            permute_params{CASE_PERMUTE_F16_5, 2, 5},
+                            permute_params{CASE_PERMUTE_F16_6, 2, 5},
+
+                            permute_params{CASE_PERMUTE_S8_0, 2, 5},
+                            permute_params{CASE_PERMUTE_S8_1, 2, 5},
+                            permute_params{CASE_PERMUTE_S8_2, 2, 5},
+                            permute_params{CASE_PERMUTE_S8_3, 2, 5},
+                                                                 
+                            permute_params{CASE_PERMUTE_U8_0, 2, 5},
+                            permute_params{CASE_PERMUTE_U8_1, 2, 5},
+                            permute_params{CASE_PERMUTE_U8_2, 2, 5},
+                            permute_params{CASE_PERMUTE_U8_3, 2, 5},
+                            
+                            permute_params{CASE_PERMUTE_F32_3D_0, 2, 5},
+                            permute_params{CASE_PERMUTE_F32_3D_1, 2, 5},
+                            permute_params{CASE_PERMUTE_F32_3D_2, 2, 5},
+                            permute_params{CASE_PERMUTE_F32_3D_3, 2, 5},
+                            permute_params{CASE_PERMUTE_F32_3D_4, 2, 5},
+                                                                     
+                            permute_params{CASE_PERMUTE_F16_3D_0, 2, 5},
+                            permute_params{CASE_PERMUTE_F16_3D_1, 2, 5},
+                            permute_params{CASE_PERMUTE_F16_3D_2, 2, 5},
+                            permute_params{CASE_PERMUTE_F16_3D_3, 2, 5},
+                            permute_params{CASE_PERMUTE_F16_3D_4, 2, 5},
+
+                            permute_params{CASE_PERMUTE_S8_3D_0, 2, 5},
+                            permute_params{CASE_PERMUTE_S8_3D_1, 2, 5},
+                            permute_params{CASE_PERMUTE_S8_3D_2, 2, 5},
+                            permute_params{CASE_PERMUTE_S8_3D_3, 2, 5},
+                                                                    
+                            permute_params{CASE_PERMUTE_U8_3D_0, 2, 5},
+                            permute_params{CASE_PERMUTE_U8_3D_1, 2, 5},
+                            permute_params{CASE_PERMUTE_U8_3D_2, 2, 5},
+                            permute_params{CASE_PERMUTE_U8_3D_3, 2, 5},
+                        }), );
+
+class permute_scale_actv_eltw_scale_actv_quant_i8: public PermuteFusingTest {};
+TEST_P(permute_scale_actv_eltw_scale_actv_quant_i8, basic) {
+    auto p = GetParam();
+    create_topologies(
+        input_layout("input", get_input_layout(p)),
+        data("scale1_data", get_mem(get_per_channel_layout(p), 1e-1f)),
+        data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)),
+        data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)),
+        data("out_lo", get_mem(get_single_element_layout(p), -127)),
+        data("out_hi", get_mem(get_single_element_layout(p), 127)),
+        data("eltw_data", get_mem(layout(p.data_type, p.input_format, p.out_shape))),
+        data("scale2_data", get_mem(get_per_channel_layout(p), 1e-1f)),
+        permute("permute", "input", p.permute_order),
+        scale("scale1", "permute", "scale1_data"),
+        activation("actv1", "scale1", activation_func::relu),
+        eltwise("eltw", {"actv1", "eltw_data"}, eltwise_mode::sum, p.data_type),
+        scale("scale2", "eltw", "scale2_data"),
+        activation("actv2", "scale2", activation_func::relu),
+        quantize("quant", "actv2", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8),
+        reorder("out", "quant", p.default_format, p.default_type)
+    );
+
+    tolerance = 1.f;
+    execute(p);
+}
+
+INSTANTIATE_TEST_CASE_P(fusings_gpu, permute_scale_actv_eltw_scale_actv_quant_i8,
+                        ::testing::ValuesIn(std::vector<permute_params> {
+                            permute_params{CASE_PERMUTE_F32_0, 2, 8},
+                            permute_params{CASE_PERMUTE_F32_1, 2, 8},
+                            permute_params{CASE_PERMUTE_F32_2, 2, 8},
+                            permute_params{CASE_PERMUTE_F32_3, 2, 8},
+                            permute_params{CASE_PERMUTE_F32_4, 2, 8},
+                            permute_params{CASE_PERMUTE_F32_5, 2, 8},
+                            permute_params{CASE_PERMUTE_F32_6, 2, 8},
+                            permute_params{CASE_PERMUTE_F32_7, 2, 8},
+
+                            permute_params{CASE_PERMUTE_F16_0, 2, 8},
+                            permute_params{CASE_PERMUTE_F16_1, 2, 8},
+                            permute_params{CASE_PERMUTE_F16_2, 2, 8},
+                            permute_params{CASE_PERMUTE_F16_3, 2, 8},
+                            permute_params{CASE_PERMUTE_F16_4, 2, 8},
+                            permute_params{CASE_PERMUTE_F16_5, 2, 8},
+                            permute_params{CASE_PERMUTE_F16_6, 2, 8},
+                                
+                            permute_params{CASE_PERMUTE_S8_0, 2, 8},
+                            permute_params{CASE_PERMUTE_S8_1, 2, 8},
+                            permute_params{CASE_PERMUTE_S8_2, 2, 8},
+                            permute_params{CASE_PERMUTE_S8_3, 2, 8},
+                                                                 
+                            permute_params{CASE_PERMUTE_U8_0, 2, 8},
+                            permute_params{CASE_PERMUTE_U8_1, 2, 8},
+                            permute_params{CASE_PERMUTE_U8_2, 2, 8},
+                            permute_params{CASE_PERMUTE_U8_3, 2, 8},
+                             
+                            permute_params{CASE_PERMUTE_F32_3D_0, 2, 8},
+                            permute_params{CASE_PERMUTE_F32_3D_1, 2, 8}, 
+                            permute_params{CASE_PERMUTE_F32_3D_2, 2, 8},
+                            permute_params{CASE_PERMUTE_F32_3D_3, 2, 8},
+                            permute_params{CASE_PERMUTE_F32_3D_4, 2, 8},
+
+                            permute_params{CASE_PERMUTE_F16_3D_0, 2, 8},
+                            permute_params{CASE_PERMUTE_F16_3D_1, 2, 8},
+                            permute_params{CASE_PERMUTE_F16_3D_2, 2, 8},
+                            permute_params{CASE_PERMUTE_F16_3D_3, 2, 8},
+                            permute_params{CASE_PERMUTE_F16_3D_4, 2, 8},
+
+                            permute_params{CASE_PERMUTE_S8_3D_0, 2, 8},
+                            permute_params{CASE_PERMUTE_S8_3D_1, 2, 8},
+                            permute_params{CASE_PERMUTE_S8_3D_2, 2, 8},
+                            permute_params{CASE_PERMUTE_S8_3D_3, 2, 8},
+                                                                    
+                            permute_params{CASE_PERMUTE_U8_3D_0, 2, 8},
+                            permute_params{CASE_PERMUTE_U8_3D_1, 2, 8},
+                            permute_params{CASE_PERMUTE_U8_3D_2, 2, 8},
+                            permute_params{CASE_PERMUTE_U8_3D_3, 2, 8},
+                        }), );
+
+class permute_scale_eltwise_actv_scale_actv: public PermuteFusingTest {};
+TEST_P(permute_scale_eltwise_actv_scale_actv, basic) {
+    auto p = GetParam();
+
+        create_topologies(
+            input_layout("input", get_input_layout(p)),
+            data("eltwise_data", get_mem(layout{ p.data_type, p.input_format, p.out_shape})),
+            data("scale_data1", get_mem(get_per_channel_layout(p), 1e-1f)),
+            data("scale_data2", get_mem(get_per_channel_layout(p), 1e-1f)),
+            permute("permute", "input", p.permute_order),
+            scale("scale1", "permute", "scale_data1"),
+            activation("actv1", "scale1", activation_func::relu),
+            eltwise("eltwise", {"actv1", "eltwise_data"}, eltwise_mode::sum, p.default_type),
+            scale("scale2", "eltwise", "scale_data2"),
+            activation("actv2", "scale2", activation_func::relu),
+            reorder("reorder_bfyx", "actv2", p.default_format, p.default_type)
+        );
+
+        tolerance = 1e-5f;
+        execute(p);
+}
+
+INSTANTIATE_TEST_CASE_P(fusings_gpu, permute_scale_eltwise_actv_scale_actv,
+                        ::testing::ValuesIn(std::vector<permute_params> {
+                            permute_params{CASE_PERMUTE_F32_0, 2, 7},
+                            permute_params{CASE_PERMUTE_F32_1, 2, 7},
+                            permute_params{CASE_PERMUTE_F32_2, 2, 7},
+                            permute_params{CASE_PERMUTE_F32_3, 2, 7},
+                            permute_params{CASE_PERMUTE_F32_4, 2, 7},
+                            permute_params{CASE_PERMUTE_F32_5, 2, 7},
+                            permute_params{CASE_PERMUTE_F32_6, 2, 7},
+                            permute_params{CASE_PERMUTE_F32_7, 2, 7},
+                                                                  
+                            permute_params{CASE_PERMUTE_F16_0, 2, 7},
+                            permute_params{CASE_PERMUTE_F16_1, 2, 7},
+                            permute_params{CASE_PERMUTE_F16_2, 2, 7},
+                            permute_params{CASE_PERMUTE_F16_3, 2, 7},
+                            permute_params{CASE_PERMUTE_F16_4, 2, 7},
+                            permute_params{CASE_PERMUTE_F16_5, 2, 7},
+                            permute_params{CASE_PERMUTE_F16_6, 2, 7},
+                            
+                            permute_params{CASE_PERMUTE_S8_0, 2, 7},
+                            permute_params{CASE_PERMUTE_S8_1, 2, 7},
+                            permute_params{CASE_PERMUTE_S8_2, 2, 7},
+                            permute_params{CASE_PERMUTE_S8_3, 2, 7},
+                                                                 
+                            permute_params{CASE_PERMUTE_U8_0, 2, 7},
+                            permute_params{CASE_PERMUTE_U8_1, 2, 7},
+                            permute_params{CASE_PERMUTE_U8_2, 2, 7},
+                            permute_params{CASE_PERMUTE_U8_3, 2, 7},
+                            
+                            permute_params{CASE_PERMUTE_F32_3D_0, 2, 7},
+                            permute_params{CASE_PERMUTE_F32_3D_1, 2, 7},
+                            permute_params{CASE_PERMUTE_F32_3D_2, 2, 7},
+                            permute_params{CASE_PERMUTE_F32_3D_3, 2, 7},
+                            permute_params{CASE_PERMUTE_F32_3D_4, 2, 7},
+                                                                     
+                            permute_params{CASE_PERMUTE_F16_3D_0, 2, 7},
+                            permute_params{CASE_PERMUTE_F16_3D_1, 2, 7},
+                            permute_params{CASE_PERMUTE_F16_3D_2, 2, 7},
+                            permute_params{CASE_PERMUTE_F16_3D_3, 2, 7},
+                            permute_params{CASE_PERMUTE_F16_3D_4, 2, 7},
+                            
+                            permute_params{CASE_PERMUTE_S8_3D_0, 2, 7},
+                            permute_params{CASE_PERMUTE_S8_3D_1, 2, 7},
+                            permute_params{CASE_PERMUTE_S8_3D_2, 2, 7},
+                            permute_params{CASE_PERMUTE_S8_3D_3, 2, 7},
+                                                                    
+                            permute_params{CASE_PERMUTE_U8_3D_0, 2, 7},
+                            permute_params{CASE_PERMUTE_U8_3D_1, 2, 7},
+                            permute_params{CASE_PERMUTE_U8_3D_2, 2, 7},
+                            permute_params{CASE_PERMUTE_U8_3D_3, 2, 7},
+                        }), );