return false;
};
+ auto eltwise_supports_fusings = [&](eltwise_node& node) -> bool {  // Predicate used by the fusing lambdas: may other primitives be fused into this eltwise node?
+ auto out_layout = node.get_output_layout();
+ if (out_layout.data_type == data_types::f16 && out_layout.size.batch[0] > 1 &&  // f16 output with batch[0] > 1 ...
+ (_lo.get_optimization_attributes().fs_b_yx_fsv32_network || out_layout.format == format::fs_b_yx_fsv32)) {  // ... and either the fs_b_yx_fsv32_network attribute is set or the output format is fs_b_yx_fsv32
+ return false;  // NOTE(review): presumably the eltwise kernel chosen for this combination lacks fused-ops support - confirm
+ }
+
+ return true;  // every other eltwise configuration is treated as fusable
+ };
+
auto fuse_activation_f = [&](activation_node& activation_node) {
auto& input_data = activation_node.get_dependency(0);
if (input_data.get_users().size() != 1 || activation_node.get_dependencies().size() >= 3)
should_fuse |= input_data.is_type<scale>();
- should_fuse |= input_data.is_type<eltwise>();
+ should_fuse |= input_data.is_type<eltwise>() && eltwise_supports_fusings(input_data.as<eltwise>());
if (!should_fuse)
return;
should_fuse |= input_data.is_type<scale>();
- should_fuse |= input_data.is_type<eltwise>();
+ should_fuse |= input_data.is_type<eltwise>() && eltwise_supports_fusings(input_data.as<eltwise>());
if (!should_fuse)
return;
reduce_supports_fusings(input_data.as<reduce>())
&& quantize_node.get_scale_shift_opt();
- should_fuse |= input_data.is_type<eltwise>() && quantize_node.get_scale_shift_opt();
+ should_fuse |= input_data.is_type<eltwise>() && eltwise_supports_fusings(input_data.as<eltwise>()) && quantize_node.get_scale_shift_opt();
should_fuse |= input_data.is_type<scale>() && quantize_node.get_scale_shift_opt();
(parents[i]->is_type<gemm>() && gemm_supports_fusings(parents[i]->as<gemm>())) ||
(parents[i]->is_type<batch_to_space>()) ||
(parents[i]->is_type<space_to_batch>()) ||
- (parents[i]->is_type<eltwise>()) ||
+ (parents[i]->is_type<eltwise>() && eltwise_supports_fusings(parents[i]->as<eltwise>())) ||
(parents[i]->is_type<scale>()) ||
(parents[i]->is_type<depth_to_space>() && dts_supports_fusings(parents[i]->as<depth_to_space>())) ||
(parents[i]->is_type<reduce>() && reduce_supports_fusings(parents[i]->as<reduce>()));
#define CASE_ELTWISE_FP16_1 {2, 16, 4, 4}, data_types::f16, data_types::f16, format::bfyx, data_types::f16, format::bfyx, eltwise_mode::sum
#define CASE_ELTWISE_FP16_2 {2, 16, 4, 4}, data_types::f16, data_types::f16, format::bfzyx, data_types::f16, format::bfzyx, eltwise_mode::sum
#define CASE_ELTWISE_FP16_3 {2, 32, 4, 8}, data_types::f16, data_types::f16, format::b_fs_yx_fsv16, data_types::f16, format::b_fs_yx_fsv16, eltwise_mode::sum
+#define CASE_ELTWISE_FP16_4 {3, 32, 4, 4}, data_types::f16, data_types::f16, format::fs_b_yx_fsv32, data_types::f16, format::fs_b_yx_fsv32, eltwise_mode::sum  // f16 + fs_b_yx_fsv32 case; NOTE(review): leading 3 is presumably the batch (>1) - confirm
#define CASE_ELTWISE_I8_1 {2, 16, 4, 4}, data_types::i8, data_types::i8, format::bfyx, data_types::f32, format::bfyx, eltwise_mode::sum
#define CASE_ELTWISE_I8_2 {2, 16, 4, 4}, data_types::i8, data_types::i8, format::bfzyx, data_types::f32, format::bfzyx, eltwise_mode::sum
#define CASE_ELTWISE_I8_3 {2, 16, 4, 4}, data_types::i8, data_types::i8, format::b_fs_yx_fsv16, data_types::f32, format::b_fs_yx_fsv16, eltwise_mode::sum
eltwise_test_params{CASE_ELTWISE_FP32_3, 3, 5},
}), );
+class eltwise_fp32_fsv32 : public EltwiseFusingTest {};  // NOTE(review): named fp32, but the only visible instantiation uses CASE_ELTWISE_FP16_4 (f16) - confirm the name is intended
+TEST_P(eltwise_fp32_fsv32, add) {  // Fusing test: eltwise -> sum with per-channel constant -> negative activation
+ auto p = GetParam();
+ create_topologies(input_layout("input", get_input_layout(p)),
+ input_layout("input2", get_input_layout2(p)),
+ data("add_data", get_mem(get_per_channel_layout(p), -10, 10)),  // constant operand with per-channel layout; presumably filled with values in [-10, 10] - confirm
+ eltwise("eltwise", {"input", "input2"}, p.mode, p.default_type),  // primitive whose implementation is forced below
+ eltwise("add", {"eltwise", "add_data"}, eltwise_mode::sum),
+ activation("activation", "add", activation_func::negative),
+ reorder("out", "activation", p.default_format, data_types::f32));  // output reordered to f32 in the default format
+
+ implementation_desc eltw_impl = { format::fs_b_yx_fsv32, "eltwise_fs_b_yx_fsv32" };  // force the fsv32-specific eltwise kernel
+ bo_fused.set_option(build_option::force_implementations({ {"eltwise", eltw_impl} }));  // applied to the bo_fused build options only
+
+ tolerance = 1e-5f;  // NOTE(review): tight tolerance for a case instantiated with f16 data - confirm it is intended
+ execute(p);
+}
+
+TEST_P(eltwise_fp32_fsv32, add_per_element) {  // Same chain as the `add` test, but the constant operand uses the full input layout
+ auto p = GetParam();
+ create_topologies(input_layout("input", get_input_layout(p)),
+ input_layout("input2", get_input_layout2(p)),
+ data("add_data", get_mem(get_input_layout(p), -10, 10)),  // constant operand shaped like the input; presumably filled with values in [-10, 10] - confirm
+ eltwise("eltwise", {"input", "input2"}, p.mode, p.default_type),  // primitive whose implementation is forced below
+ eltwise("add", {"eltwise", "add_data"}, eltwise_mode::sum),
+ activation("activation", "add", activation_func::negative),
+ reorder("out", "activation", p.default_format, data_types::f32));  // output reordered to f32 in the default format
+
+ implementation_desc eltw_impl = { format::fs_b_yx_fsv32, "eltwise_fs_b_yx_fsv32" };  // force the fsv32-specific eltwise kernel
+ bo_fused.set_option(build_option::force_implementations({ {"eltwise", eltw_impl} }));  // applied to the bo_fused build options only
+
+ tolerance = 1e-5f;  // NOTE(review): tight tolerance for a case instantiated with f16 data - confirm it is intended
+ execute(p);
+}
+
+INSTANTIATE_TEST_CASE_P(fusings_gpu,
+ eltwise_fp32_fsv32,
+ ::testing::ValuesIn(std::vector<eltwise_test_params>{
+ // No optimized eltwise kernel for the fsv32 layout supports fused_ops yet,
+ // so only the activation is fused, via the legacy mechanism.
+ eltwise_test_params{CASE_ELTWISE_FP16_4, 4, 5},  // NOTE(review): 4 and 5 are presumably the expected fused / non-fused primitive counts - confirm against eltwise_test_params
+ }), );
+
+
class eltwise_fp32_fused_prims : public EltwiseFusingTest {};
TEST_P(eltwise_fp32_fused_prims, scale_activation) {
auto p = GetParam();