[IE CLDNN] added fusing support for ref and b_fs_yx_fsv16 implementations (#1692)
authorSergey Nesterov <sergei.n.nesterov@gmail.com>
Wed, 19 Aug 2020 06:18:00 +0000 (08:18 +0200)
committerGitHub <noreply@github.com>
Wed, 19 Aug 2020 06:18:00 +0000 (09:18 +0300)
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_b_fs_yx_fsv16.cpp
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_b_fs_yx_fsv16.h
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_base.cpp
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_ref.cpp
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_ref.h
inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/eltwise_b_fs_yx_fsv16.cl
inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/generic_eltwise_ref.cl
inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_primitive_fusing.cpp
inference-engine/thirdparty/clDNN/tests/test_cases/fusings_gpu_test.cpp

index 38c69c3..c3fb2d8 100644 (file)
@@ -26,8 +26,11 @@ ParamsKey EltwiseKernel_b_fs_yx_fsv16::GetSupportedKey() const {
     k.EnableInputDataType(Datatype::F32);
     k.EnableOutputDataType(Datatype::F16);
     k.EnableOutputDataType(Datatype::F32);
+    k.EnableOutputDataType(Datatype::INT8);
+    k.EnableOutputDataType(Datatype::UINT8);
     k.EnableInputLayout(DataLayout::b_fs_yx_fsv16);
     k.EnableOutputLayout(DataLayout::b_fs_yx_fsv16);
+    k.EnableDifferentTypes();
     k.EnableBatching();
     k.EnableTensorPitches();
     k.EnableTensorOffset();
@@ -68,7 +71,7 @@ JitConstants EltwiseKernel_b_fs_yx_fsv16::MakeLoadJitConstants(const eltwise_par
                     if (params.inputs[input.index].LogicalSize() == params.output.Feature().v &&
                         params.inputs[input.index].LogicalSize() == params.inputs[input.index].Feature().v) {
                         jit.AddConstant(MakeJitConstant(name,
-                                                        "UNIT_BLOCK_READ(input" + std::to_string(input.index) +
+                                                        "BLOCK_READN(INPUT" + std::to_string(input.index) + "_TYPE, 1, input" + std::to_string(input.index) +
                                                         ", INPUT"+std::to_string(input.index)+"_GET_INDEX(b, f_block*16, y, x))"));
                     } else if (params.inputs[input.index].LogicalSize() == 1) {
                         jit.AddConstant(MakeJitConstant(name,
@@ -137,6 +140,16 @@ JitConstants EltwiseKernel_b_fs_yx_fsv16::GetJitConstants(const eltwise_params&
     if (params.output.Feature().v % 16 != 0)
         jit.AddConstant(MakeJitConstant("LEFTOVERS", params.output.Feature().v % 16));
 
+    if (!params.fused_ops.empty()) {
+        kernel_selector::Datatype input_dt = GetAccumulatorType(params);
+
+        FusedOpsConfiguration conf = {"", {"b", "f_block", "y", "x"}, "res", input_dt, blockSize};
+        conf.load_type = FusedOpsConfiguration::LoadType::LT_ALIGNED_READ;
+        conf.vec_axis = Tensor::DataChannelName::X;
+
+        jit.Merge(MakeFusedOpsJitConstants(params, {conf}));
+    }
+
     return jit;
 }
 
@@ -163,6 +176,12 @@ bool EltwiseKernel_b_fs_yx_fsv16::Validate(const Params& params, const optional_
 
     auto input0 = ewParams.inputs[0];
 
+    for (size_t i = 1; i < ewParams.inputs.size(); i++) {
+        if (input0.GetDType() != ewParams.inputs[i].GetDType()) {
+            return false;
+        }
+    }
+
     // Check that padding before features doesn't miss-align the blocks
     auto feature_block_size = 16;
     if (input0.Feature().pad.before % feature_block_size != 0 || output.Feature().pad.before % feature_block_size != 0) {
@@ -222,7 +241,8 @@ KernelsData EltwiseKernel_b_fs_yx_fsv16::GetKernelsData(const Params& params, co
     kernel.kernelString = GetKernelString(kernelName, jit, entry_point, params.engineInfo, DEFAULT);
     kernel.arguments = GetArgsDesc((uint32_t)newParams.inputs.size(),
                                    false,
-                                   false);
+                                   false,
+                                   GetFusedPrimitiveInputsCount(params));
 
     kd.estimatedTime = runInfo.efficiency;
 
index e169d5f..1058f00 100644 (file)
@@ -24,6 +24,9 @@ public:
 
     KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
     ParamsKey GetSupportedKey() const override;
+    std::vector<FusedOpType> GetSupportedFusedOps() const override {
+        return { FusedOpType::QUANTIZE };
+    }
 
 protected:
     bool Validate(const Params& p, const optional_params& o) const override;
index dc70249..f7fc37f 100644 (file)
@@ -109,6 +109,12 @@ bool EltwiseKernelBase::Validate(const Params& p, const optional_params& o) cons
         }
     }
 
+    const eltwise_params& orgParams = static_cast<const eltwise_params&>(p);
+    for (auto& fused_op : orgParams.fused_ops) {
+        if (!IsFusedPrimitiveSupported(fused_op))
+            return false;
+    }
+
     return true;
 }
 
@@ -606,7 +612,8 @@ KernelsData EltwiseKernelBase::GetCommonKernelsData(const Params& params, const
     kernel.kernelString = GetKernelString(kernelName, jit, entry_point, params.engineInfo, DEFAULT);
     kernel.arguments = GetArgsDesc((uint32_t)newParams.inputs.size(),
                                    false,
-                                   false);
+                                   false,
+                                   GetFusedPrimitiveInputsCount(params));
 
     kd.estimatedTime = DONT_USE_IF_HAVE_SOMETHING_ELSE;
 
index f8021ff..da116fa 100644 (file)
@@ -54,4 +54,26 @@ bool EltwiseKernelRef::Validate(const Params& p, const optional_params& o) const
 KernelsData EltwiseKernelRef::GetKernelsData(const Params& params, const optional_params& options) const {
     return GetCommonKernelsData(params, options);
 }
+
+JitConstants EltwiseKernelRef::GetJitConstants(const eltwise_params& params) const {
+    auto jit = EltwiseKernelBase::GetJitConstants(params);
+
+    if (!params.fused_ops.empty()) {
+        kernel_selector::Datatype input_dt = GetAccumulatorType(params);
+
+        std::vector<std::string> idx_order;
+        if (DataTensor::ChannelsCount(params.output.GetLayout()) == 4) {
+            idx_order = {"d4", "d3", "d2", "d1"};
+        } else if (DataTensor::ChannelsCount(params.output.GetLayout()) == 5) {
+            idx_order = {"d5", "d4", "d3", "d2", "d1"};
+        } else if (DataTensor::ChannelsCount(params.output.GetLayout()) == 6) {
+            idx_order = {"d6", "d5", "d4", "d3", "d2", "d1"};
+        }
+
+        FusedOpsConfiguration conf = {"", idx_order, "res", input_dt, 1};
+        jit.Merge(MakeFusedOpsJitConstants(params, {conf}));
+    }
+
+    return jit;
+}
 }  // namespace kernel_selector
index fc2b672..82aac85 100644 (file)
@@ -25,6 +25,11 @@ public:
 
     KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
     ParamsKey GetSupportedKey() const override;
+    std::vector<FusedOpType> GetSupportedFusedOps() const override {
+        return { FusedOpType::QUANTIZE };
+    }
+
+    JitConstants GetJitConstants(const eltwise_params& params) const;
 
 protected:
     bool Validate(const Params& p, const optional_params& o) const override;
index 8a63521..9be38f0 100644 (file)
 */
 
 #include "include/include_all.cl"
-#include "include/unit_type.cl"
+#include "include/common.cl"
+#include "include/data_types.cl"
 
 #define FEATURE_SLICE_SIZE 16
 
+#define OUTPUT_TYPE_BLOCK               MAKE_VECTOR_TYPE(OUTPUT_TYPE, BLOCK_SIZE)
+#define TO_OUTPUT_TYPE_BLOCK(val)       CAT(convert_, OUTPUT_TYPE_BLOCK)(val)
+
 #if BLOCK_SIZE != 1
-    #define READ_FUNC(ptr, offset) CAT(UNIT_BLOCK_READ, BLOCK_SIZE)(ptr, offset)
-    #define WRITE_FUNC(ptr, offset, val) CAT(UNIT_BLOCK_WRITE, BLOCK_SIZE)(ptr, offset, val)
+    #define READ_FUNC(ptr, offset) CAT(DT_INPUT_BLOCK_READ, BLOCK_SIZE)(ptr, offset)
+    #define WRITE_FUNC(ptr, offset, val) CAT(DT_OUTPUT_BLOCK_WRITE, BLOCK_SIZE)(ptr, offset, val)
 #else
-    #define READ_FUNC(ptr, offset) UNIT_BLOCK_READ(ptr, offset)
-    #define WRITE_FUNC(ptr, offset, val) UNIT_BLOCK_WRITE(ptr, offset, val)
+    #define READ_FUNC(ptr, offset) DT_INPUT_BLOCK_READ(ptr, offset)
+    #define WRITE_FUNC(ptr, offset, val) DT_OUTPUT_BLOCK_WRITE(ptr, offset, val)
 #endif
 
 __attribute__((intel_reqd_sub_group_size(FEATURE_SLICE_SIZE)))
 KERNEL(eltwise_b_fs_yx_fsv16)(INPUTS_DECLS
-                              __global UNIT_TYPE* output)
+                              __global OUTPUT_TYPE* output
+#if HAS_FUSED_OPS_DECLS
+, FUSED_OPS_DECLS
+#endif
+)
 {
     const uint f_block = get_group_id(0);
     const uint y = (uint)get_global_id(1) / BLOCKS_COUNT;
@@ -58,22 +66,33 @@ KERNEL(eltwise_b_fs_yx_fsv16)(INPUTS_DECLS
 
     DO_ELTWISE
 
+#if HAS_FUSED_OPS
+    FUSED_OPS;
+    OUTPUT_TYPE_BLOCK out = TO_OUTPUT_TYPE_BLOCK(FUSED_OPS_RESULT);
+#else
+#if BLOCK_SIZE != 1
+    OUTPUT_TYPE_BLOCK out = ACTIVATION_TYPED(TO_OUTPUT_TYPE_BLOCK(res), ACTIVATION_PARAMS_TYPED);
+#else
+    OUTPUT_TYPE out = ACTIVATION_TYPED(TO_OUTPUT_TYPE(res), ACTIVATION_PARAMS_TYPED);
+#endif
+#endif
+
 #ifdef LEFTOVERS
     if ((f_block + 1) * FEATURE_SLICE_SIZE > OUTPUT_FEATURE_NUM) {
         const uint sglid = get_sub_group_local_id();
         if (sglid < OUTPUT_FEATURE_NUM % FEATURE_SLICE_SIZE) {
             for (uint block_x = 0; block_x < BLOCK_SIZE; block_x++) {
 #if BLOCK_SIZE != 1
-                output[output_offset + block_x * output_x_pitch + sglid] = ACTIVATION_TYPED(res[block_x], ACTIVATION_PARAMS_TYPED);
+                output[output_offset + block_x * output_x_pitch + sglid] = out[block_x];
 #else
-                output[output_offset + block_x * output_x_pitch + sglid] = ACTIVATION_TYPED(res, ACTIVATION_PARAMS_TYPED);
+                output[output_offset + block_x * output_x_pitch + sglid] = out;
 #endif
             }
         }
     } else
 #endif
     {
-        WRITE_FUNC(output, output_offset, ACTIVATION_TYPED(res, ACTIVATION_PARAMS_TYPED));
+        WRITE_FUNC(output, output_offset, out);
     }
 
 }
index 1fe9453..b998783 100644 (file)
 
 KERNEL(eltwise)(
     INPUTS_DECLS
-    __global OUTPUT_TYPE* output)
+    __global OUTPUT_TYPE* output
+#if HAS_FUSED_OPS_DECLS
+    , FUSED_OPS_DECLS
+#endif
+)
 {
 
 #if OUTPUT_DIMS == 6 // 4D spatial
@@ -112,9 +116,18 @@ KERNEL(eltwise)(
 
     DO_ELTWISE;
 
+#if HAS_FUSED_OPS
+    FUSED_OPS;
+    OUTPUT_TYPE out = FUSED_OPS_RESULT;
+#elif QUANTIZATION_TERM && !OUTPUT_IS_FP
+    OUTPUT_TYPE out = ACTIVATION(TO_OUTPUT_TYPE(res), ACTIVATION_PARAMS);
+#else
+    OUTPUT_TYPE out = ACTIVATION_TYPED(TO_OUTPUT_TYPE(res), ACTIVATION_PARAMS_TYPED);
+#endif
+
 #if QUANTIZATION_TERM && !OUTPUT_IS_FP
-    output[output_offset] = TO_OUTPUT_TYPE_SAT(ACTIVATION(res, ACTIVATION_PARAMS));
+    output[output_offset] = TO_OUTPUT_TYPE_SAT(out);
 #else
-    output[output_offset] = ACTIVATION_TYPED(res, ACTIVATION_PARAMS_TYPED);
+    output[output_offset] = out;
 #endif
 }
index ed33cbd..86bc207 100644 (file)
@@ -541,6 +541,8 @@ void prepare_primitive_fusing::fuse_simple_primitives(program_impl &p) {
 
             should_fuse |= input_data.is_type<space_to_batch>() && quantize_node.get_scale_shift_opt();
 
+            should_fuse |= input_data.is_type<eltwise>() && quantize_node.get_scale_shift_opt();
+
             if (!should_fuse)
                 return;
 
index 4278d78..f6ca475 100644 (file)
@@ -1090,8 +1090,8 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, conv_scale_activation_eltwise_fp32_quantize
                                 conv_eltw_test_params{CASE_CONV_ELTW_FP32_4, 2, 6},
                                 conv_eltw_test_params{CASE_CONV_ELTW_FP32_5, 3, 6},
                                 conv_eltw_test_params{CASE_CONV_ELTW_FP32_6, 3, 6},
-                                conv_eltw_test_params{CASE_CONV_ELTW_FP32_7, 4, 6},
-                                conv_eltw_test_params{CASE_CONV_ELTW_FP32_8, 4, 6},
+                                conv_eltw_test_params{CASE_CONV_ELTW_FP32_7, 3, 6},
+                                conv_eltw_test_params{CASE_CONV_ELTW_FP32_8, 3, 6},
                         }), );
 
 
@@ -3041,24 +3041,24 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, mvn_scale_activation_eltwise_fp32_quantize_
         mvn_test_params{ CASE_MVN_I8_4, 2, 6 },
         mvn_test_params{ CASE_MVN_I8_5, 2, 6 },
         mvn_test_params{ CASE_MVN_I8_6, 2, 6 },
-        mvn_test_params{ CASE_MVN_I8_7, 4, 6 },
+        mvn_test_params{ CASE_MVN_I8_7, 3, 6 },
         mvn_test_params{ CASE_MVN_3D_I8_1, 2, 6 },
         mvn_test_params{ CASE_MVN_3D_I8_2, 2, 6 },
         mvn_test_params{ CASE_MVN_3D_I8_3, 2, 6 },
         mvn_test_params{ CASE_MVN_3D_I8_4, 2, 6 },
-        mvn_test_params{ CASE_MVN_3D_I8_5, 4, 6 },
+        mvn_test_params{ CASE_MVN_3D_I8_5, 3, 6 },
         mvn_test_params{ CASE_MVN_U8_1, 2, 6 },
         mvn_test_params{ CASE_MVN_U8_2, 2, 6 },
         mvn_test_params{ CASE_MVN_U8_3, 2, 6 },
         mvn_test_params{ CASE_MVN_U8_4, 2, 6 },
         mvn_test_params{ CASE_MVN_U8_5, 2, 6 },
         mvn_test_params{ CASE_MVN_U8_6, 2, 6 },
-        mvn_test_params{ CASE_MVN_U8_7, 4, 6 },
+        mvn_test_params{ CASE_MVN_U8_7, 3, 6 },
         mvn_test_params{ CASE_MVN_3D_U8_1, 2, 6 },
         mvn_test_params{ CASE_MVN_3D_U8_2, 2, 6 },
         mvn_test_params{ CASE_MVN_3D_U8_3, 2, 6 },
         mvn_test_params{ CASE_MVN_3D_U8_4, 2, 6 },
-        mvn_test_params{ CASE_MVN_3D_U8_5, 4, 6 },
+        mvn_test_params{ CASE_MVN_3D_U8_5, 3, 6 },
 }), );
 
 class mvn_eltwise : public MVNFusingTest {};
@@ -4185,14 +4185,14 @@ TEST_P(deconv_scale_activation_quantize_i8_eltwise_quantize_u8, basic) {
 
 INSTANTIATE_TEST_CASE_P(fusings_gpu, deconv_scale_activation_quantize_i8_eltwise_quantize_u8,
                         ::testing::ValuesIn(std::vector<conv_eltw_test_params>{
-                                conv_eltw_test_params{CASE_DECONV_ELTW_FP32_1, 5, 7},
-                                conv_eltw_test_params{CASE_DECONV_ELTW_FP32_2, 5, 7},
-                                conv_eltw_test_params{CASE_DECONV_ELTW_FP32_3, 5, 7},
-                                conv_eltw_test_params{CASE_DECONV_ELTW_FP32_4, 5, 7},
-                                conv_eltw_test_params{CASE_DECONV_ELTW_FP32_5, 5, 7},
-                                conv_eltw_test_params{CASE_DECONV_ELTW_FP32_6, 5, 7},
-                                conv_eltw_test_params{CASE_DECONV_ELTW_FP32_7, 5, 7},
-                                conv_eltw_test_params{CASE_DECONV_ELTW_FP32_8, 5, 7},
+                                conv_eltw_test_params{CASE_DECONV_ELTW_FP32_1, 4, 7},
+                                conv_eltw_test_params{CASE_DECONV_ELTW_FP32_2, 4, 7},
+                                conv_eltw_test_params{CASE_DECONV_ELTW_FP32_3, 4, 7},
+                                conv_eltw_test_params{CASE_DECONV_ELTW_FP32_4, 4, 7},
+                                conv_eltw_test_params{CASE_DECONV_ELTW_FP32_5, 4, 7},
+                                conv_eltw_test_params{CASE_DECONV_ELTW_FP32_6, 4, 7},
+                                conv_eltw_test_params{CASE_DECONV_ELTW_FP32_7, 4, 7},
+                                conv_eltw_test_params{CASE_DECONV_ELTW_FP32_8, 4, 7},
 
                                 conv_eltw_test_params{CASE_DECONV_ELTW_i8_1, 2, 7},
                                 conv_eltw_test_params{CASE_DECONV_ELTW_i8_2, 2, 7},
@@ -5818,3 +5818,137 @@ INSTANTIATE_TEST_CASE_P(
         space_to_batch_test_params{CASE_SPACE_TO_BATCH_I8_1, 2, 5},
         space_to_batch_test_params{CASE_SPACE_TO_BATCH_I8_2, 2, 5},
     }), );
+
+/* ----------------------------------------------------------------------------------------------------- */
+/* ---------------------------------------- Eltwise cases -------------------------------------------------- */
+/* ----------------------------------------------------------------------------------------------------- */
+struct eltwise_test_params {
+    tensor input_size;
+    data_types input_type;
+    data_types input_type2;
+    format input_format;
+    data_types default_type;
+    format default_format;
+    eltwise_mode mode;
+    size_t expected_fused_primitives;
+    size_t expected_not_fused_primitives;
+};
+
+#define CASE_ELTWISE_FP32_1         {2, 16, 4, 4}, data_types::f32, data_types::f32, format::bfyx,           data_types::f32,  format::bfyx,             eltwise_mode::sum, 3, 4
+#define CASE_ELTWISE_FP32_2         {2, 16, 4, 4}, data_types::f32, data_types::f32, format::bfzyx,          data_types::f32,  format::bfzyx,            eltwise_mode::sum, 3, 4
+#define CASE_ELTWISE_FP32_3         {2, 16, 4, 4}, data_types::f32, data_types::f32, format::b_fs_yx_fsv16,  data_types::f32,  format::b_fs_yx_fsv16,    eltwise_mode::sum, 3, 4
+#define CASE_ELTWISE_FP16_1         {2, 16, 4, 4}, data_types::f16, data_types::f16, format::bfyx,           data_types::f16,  format::bfyx,             eltwise_mode::sum, 3, 4
+#define CASE_ELTWISE_FP16_2         {2, 16, 4, 4}, data_types::f16, data_types::f16, format::bfzyx,          data_types::f16,  format::bfzyx,            eltwise_mode::sum, 3, 4
+#define CASE_ELTWISE_FP16_3         {2, 16, 4, 4}, data_types::f16, data_types::f16, format::b_fs_yx_fsv16,  data_types::f16,  format::b_fs_yx_fsv16,    eltwise_mode::sum, 3, 4
+#define CASE_ELTWISE_I8_1           {2, 16, 4, 4}, data_types::i8,  data_types::i8,  format::bfyx,           data_types::f32,  format::bfyx,             eltwise_mode::sum, 3, 4
+#define CASE_ELTWISE_I8_2           {2, 16, 4, 4}, data_types::i8,  data_types::i8,  format::bfzyx,          data_types::f32,  format::bfzyx,            eltwise_mode::sum, 3, 4
+#define CASE_ELTWISE_I8_3           {2, 16, 4, 4}, data_types::i8,  data_types::i8,  format::b_fs_yx_fsv16,  data_types::f32,  format::b_fs_yx_fsv16,    eltwise_mode::sum, 3, 4
+#define CASE_ELTWISE_U8_1           {2, 16, 4, 4}, data_types::u8,  data_types::u8,  format::bfyx,           data_types::f32,  format::bfyx,             eltwise_mode::sum, 3, 4
+#define CASE_ELTWISE_U8_2           {2, 16, 4, 4}, data_types::u8,  data_types::u8,  format::bfzyx,          data_types::f32,  format::bfzyx,            eltwise_mode::sum, 3, 4
+#define CASE_ELTWISE_U8_3           {2, 16, 4, 4}, data_types::u8,  data_types::u8,  format::b_fs_yx_fsv16,  data_types::f32,  format::b_fs_yx_fsv16,    eltwise_mode::sum, 3, 4
+#define CASE_ELTWISE_FP32_FP16_1    {2, 16, 4, 4}, data_types::f32, data_types::f16, format::bfyx,           data_types::f32,  format::bfyx,             eltwise_mode::sum, 3, 4
+#define CASE_ELTWISE_FP32_FP16_2    {2, 16, 4, 4}, data_types::f32, data_types::f16, format::bfzyx,          data_types::f32,  format::bfzyx,            eltwise_mode::sum, 3, 4
+#define CASE_ELTWISE_FP32_FP16_3    {2, 16, 4, 4}, data_types::f32, data_types::f16, format::b_fs_yx_fsv16,  data_types::f32,  format::b_fs_yx_fsv16,    eltwise_mode::sum, 3, 4
+#define CASE_ELTWISE_FP16_FP32_1    {2, 16, 4, 4}, data_types::f16, data_types::f32, format::bfyx,           data_types::f16,  format::bfyx,             eltwise_mode::sum, 3, 4
+#define CASE_ELTWISE_FP16_FP32_2    {2, 16, 4, 4}, data_types::f16, data_types::f32, format::bfzyx,          data_types::f16,  format::bfzyx,            eltwise_mode::sum, 3, 4
+#define CASE_ELTWISE_FP16_FP32_3    {2, 16, 4, 4}, data_types::f16, data_types::f32, format::b_fs_yx_fsv16,  data_types::f16,  format::b_fs_yx_fsv16,    eltwise_mode::sum, 3, 4
+#define CASE_ELTWISE_I8_FP16_1      {2, 16, 4, 4}, data_types::i8,  data_types::f16, format::bfyx,           data_types::f32,  format::bfyx,             eltwise_mode::sum, 3, 4
+#define CASE_ELTWISE_I8_FP16_2      {2, 16, 4, 4}, data_types::i8,  data_types::f16, format::bfzyx,          data_types::f32,  format::bfzyx,            eltwise_mode::sum, 3, 4
+#define CASE_ELTWISE_I8_FP16_3      {2, 16, 4, 4}, data_types::i8,  data_types::f16, format::b_fs_yx_fsv16,  data_types::f32,  format::b_fs_yx_fsv16,    eltwise_mode::sum, 3, 4
+#define CASE_ELTWISE_I8_FP32_1      {2, 16, 4, 4}, data_types::i8,  data_types::f32, format::bfyx,           data_types::f16,  format::bfyx,             eltwise_mode::sum, 3, 4
+#define CASE_ELTWISE_I8_FP32_2      {2, 16, 4, 4}, data_types::i8,  data_types::f32, format::bfzyx,          data_types::f16,  format::bfzyx,            eltwise_mode::sum, 3, 4
+#define CASE_ELTWISE_I8_FP32_3      {2, 16, 4, 4}, data_types::i8,  data_types::f32, format::b_fs_yx_fsv16,  data_types::f16,  format::b_fs_yx_fsv16,    eltwise_mode::sum, 3, 4
+#define CASE_ELTWISE_U8_FP16_1      {2, 16, 4, 4}, data_types::u8,  data_types::f16, format::bfyx,           data_types::f32,  format::bfyx,             eltwise_mode::sum, 3, 4
+#define CASE_ELTWISE_U8_FP16_2      {2, 16, 4, 4}, data_types::u8,  data_types::f16, format::bfzyx,          data_types::f32,  format::bfzyx,            eltwise_mode::sum, 3, 4
+#define CASE_ELTWISE_U8_FP16_3      {2, 16, 4, 4}, data_types::u8,  data_types::f16, format::b_fs_yx_fsv16,  data_types::f32,  format::b_fs_yx_fsv16,    eltwise_mode::sum, 3, 4
+#define CASE_ELTWISE_U8_FP32_1      {2, 16, 4, 4}, data_types::u8,  data_types::f32, format::bfyx,           data_types::f16,  format::bfyx,             eltwise_mode::sum, 3, 4
+#define CASE_ELTWISE_U8_FP32_2      {2, 16, 4, 4}, data_types::u8,  data_types::f32, format::bfzyx,          data_types::f16,  format::bfzyx,            eltwise_mode::sum, 3, 4
+#define CASE_ELTWISE_U8_FP32_3      {2, 16, 4, 4}, data_types::u8,  data_types::f32, format::b_fs_yx_fsv16,  data_types::f16,  format::b_fs_yx_fsv16,    eltwise_mode::sum, 3, 4
+
+
+class EltwiseFusingTest : public ::BaseFusingTest<eltwise_test_params> {
+public:
+    void execute(eltwise_test_params& p) {
+        auto input_prim = get_mem(get_input_layout(p));
+        auto input_prim2 = get_mem(get_input_layout2(p));
+
+        network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused);
+        network network_fused(this->engine, this->topology_fused, bo_fused);
+
+        network_fused.set_input_data("input", input_prim);
+        network_fused.set_input_data("input2", input_prim2);
+        network_not_fused.set_input_data("input", input_prim);
+        network_not_fused.set_input_data("input2", input_prim2);
+
+        compare(network_not_fused, network_fused, p);
+    }
+
+    layout get_input_layout(eltwise_test_params& p) { return layout{p.input_type, p.input_format, p.input_size}; }
+    layout get_input_layout2(eltwise_test_params& p) { return layout{p.input_type2, p.input_format, p.input_size}; }
+
+    layout get_per_channel_layout(eltwise_test_params& p) {
+        return layout{p.default_type, p.default_format, tensor{1, p.input_size.feature[0], 1, 1}};
+    }
+};
+
+class eltwise_quantize : public EltwiseFusingTest {};
+TEST_P(eltwise_quantize, u8) {
+    auto p = GetParam();
+    create_topologies(input_layout("input", get_input_layout(p)),
+                      input_layout("input2", get_input_layout2(p)),
+                      eltwise("eltwise", {"input", "input2"}, p.mode, p.default_type),
+                      data("in_lo", get_mem(get_single_element_layout(p), min_random, 0)),
+                      data("in_hi", get_mem(get_single_element_layout(p), 1, max_random)),
+                      data("out_lo", get_mem(get_single_element_layout(p), 0)),
+                      data("out_hi", get_mem(get_single_element_layout(p), 255)),
+                      quantize("quantize", "eltwise", "in_lo", "in_hi", "out_lo", "out_hi", 256, data_types::u8),
+                      reorder("out", "quantize", p.default_format, data_types::f32));
+
+    tolerance = 1.f;
+    execute(p);
+}
+
+TEST_P(eltwise_quantize, i8_per_channel) {
+    auto p = GetParam();
+    create_topologies(input_layout("input", get_input_layout(p)),
+                      input_layout("input2", get_input_layout2(p)),
+                      eltwise("eltwise", {"input", "input2"}, p.mode, p.default_type),
+                      data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)),
+                      data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)),
+                      data("out_lo", get_mem(get_single_element_layout(p), -128)),
+                      data("out_hi", get_mem(get_single_element_layout(p), 127)),
+                      quantize("quantize", "eltwise", "in_lo", "in_hi", "out_lo", "out_hi", 256, data_types::i8),
+                      reorder("out", "quantize", p.default_format, data_types::f32));
+
+    tolerance = 1.f;
+    execute(p);
+}
+
+INSTANTIATE_TEST_CASE_P(fusings_gpu,
+                        eltwise_quantize,
+                        ::testing::ValuesIn(std::vector<eltwise_test_params>{
+                            eltwise_test_params{CASE_ELTWISE_FP16_1},
+                            eltwise_test_params{CASE_ELTWISE_FP16_2},
+                            eltwise_test_params{CASE_ELTWISE_FP16_3},
+                            eltwise_test_params{CASE_ELTWISE_FP32_1},
+                            eltwise_test_params{CASE_ELTWISE_FP32_2},
+                            eltwise_test_params{CASE_ELTWISE_FP32_3},
+                            eltwise_test_params{CASE_ELTWISE_FP32_FP16_1},
+                            eltwise_test_params{CASE_ELTWISE_FP32_FP16_2},
+                            eltwise_test_params{CASE_ELTWISE_FP32_FP16_3},
+                            eltwise_test_params{CASE_ELTWISE_FP16_FP32_1},
+                            eltwise_test_params{CASE_ELTWISE_FP16_FP32_2},
+                            eltwise_test_params{CASE_ELTWISE_FP16_FP32_3},
+                            eltwise_test_params{CASE_ELTWISE_I8_FP32_1},
+                            eltwise_test_params{CASE_ELTWISE_I8_FP32_2},
+                            eltwise_test_params{CASE_ELTWISE_I8_FP32_3},
+                            eltwise_test_params{CASE_ELTWISE_U8_FP32_1},
+                            eltwise_test_params{CASE_ELTWISE_U8_FP32_2},
+                            eltwise_test_params{CASE_ELTWISE_U8_FP32_3},
+                            eltwise_test_params{CASE_ELTWISE_I8_FP16_1},
+                            eltwise_test_params{CASE_ELTWISE_I8_FP16_2},
+                            eltwise_test_params{CASE_ELTWISE_I8_FP16_3},
+                            eltwise_test_params{CASE_ELTWISE_U8_FP16_1},
+                            eltwise_test_params{CASE_ELTWISE_U8_FP16_2},
+                            eltwise_test_params{CASE_ELTWISE_U8_FP16_3},
+                        }), );