From b9c78605039f839f3c79ad8fca4f60ea9a5654ed Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Tue, 14 Feb 2023 09:18:07 +0000 Subject: [PATCH] vect: Make partial trapping ops use predication [PR96373] PR96373 points out that a predicated SVE loop currently converts trapping unconditional ops into unpredicated vector ops. Doing the operation on inactive lanes can then raise an exception. As discussed in the PR trail, we aren't 100% consistent about whether we preserve traps or not. But the direction of travel is clearly to improve that rather than live with it. This patch tries to do that for the SVE case. Doing this regresses gcc.target/aarch64/sve/fabd_1.c. I've added -fno-trapping-math for now and filed PR108571 to track it. A similar problem applies to fsubr_1.c. I think this is likely to regress Power 10, since conditional operations are only available for masked loops. I think we'll need to add -fno-trapping-math to any affected testcases, but I don't have a Power 10 system to test on. gcc/ PR tree-optimization/96373 * tree-vect-stmts.cc (vectorizable_operation): Predicate trapping operations on the loop mask. Reject partial vectors if this isn't possible. gcc/testsuite/ PR tree-optimization/96373 PR tree-optimization/108571 * gcc.target/aarch64/sve/fabd_1.c: Add -fno-trapping-math. * gcc.target/aarch64/sve/fsubr_1.c: Likewise. * gcc.target/aarch64/sve/fmul_1.c: Expect predicate ops. * gcc.target/aarch64/sve/fp_arith_1.c: Likewise. --- gcc/testsuite/gcc.target/aarch64/sve/fabd_1.c | 2 +- gcc/testsuite/gcc.target/aarch64/sve/fmul_1.c | 12 ++++----- gcc/testsuite/gcc.target/aarch64/sve/fp_arith_1.c | 12 ++++----- gcc/testsuite/gcc.target/aarch64/sve/fsubr_1.c | 2 +- gcc/tree-vect-stmts.cc | 32 +++++++++++++++++------ 5 files changed, 38 insertions(+), 22 deletions(-) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fabd_1.c b/gcc/testsuite/gcc.target/aarch64/sve/fabd_1.c index 13ad83b..30bde6f 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/fabd_1.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/fabd_1.c @@ -1,5 +1,5 @@ /* { dg-do assemble { target aarch64_asm_sve_ok } } */ -/* { dg-options "-O3 --save-temps" } */ +/* { dg-options "-O3 --save-temps -fno-trapping-math" } */ #define N 16 diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fmul_1.c b/gcc/testsuite/gcc.target/aarch64/sve/fmul_1.c index 4a3e7c0..0245a8c 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/fmul_1.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/fmul_1.c @@ -27,20 +27,20 @@ DO_ARITH_OPS (_Float16, *, mul) DO_ARITH_OPS (float, *, mul) DO_ARITH_OPS (double, *, mul) -/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.h, z[0-9]+\.h, z[0-9]+\.h\n} 4 } } */ +/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 4 } } */ /* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #0.5\n} 1 } } */ -/* { dg-final { scan-assembler-not {\tfmul\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #2} } } */ +/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #2.0\n} 1 } } */ /* { dg-final { scan-assembler-not {\tfmul\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #5} } } */ /* { dg-final { scan-assembler-not {\tfmul\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #-} } } */ -/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.s, z[0-9]+\.s, z[0-9]+\.s\n} 4 } } */ +/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 4 } } */ /* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #0.5\n} 1 } } */ -/* { dg-final { scan-assembler-not {\tfmul\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #2} } } */ +/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #2.0\n} 1 } } */ /* { dg-final { scan-assembler-not {\tfmul\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #5} } } */ /* { dg-final { scan-assembler-not {\tfmul\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #-} } } */ -/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 4 } } */ +/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 4 } } */ /* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #0.5\n} 1 } } */ -/* { dg-final { scan-assembler-not {\tfmul\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #2} } } */ +/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #2.0\n} 1 } } */ /* { dg-final { scan-assembler-not {\tfmul\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #5} } } */ /* { dg-final { scan-assembler-not {\tfmul\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #-} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fp_arith_1.c b/gcc/testsuite/gcc.target/aarch64/sve/fp_arith_1.c index 5aed0dc..419d6e1 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/fp_arith_1.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/fp_arith_1.c @@ -34,37 +34,37 @@ DO_ARITH_OPS (double, -, minus) /* No specific count because it's valid to use fadd or fsub for the out-of-range constants. */ -/* { dg-final { scan-assembler {\tfadd\tz[0-9]+\.h, z[0-9]+\.h, z[0-9]+\.h\n} } } */ +/* { dg-final { scan-assembler {\tfadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} } } */ /* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #1.0\n} 2 } } */ /* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #0.5\n} 2 } } */ /* { dg-final { scan-assembler-not {\tfadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #2} } } */ /* { dg-final { scan-assembler-not {\tfadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #-} } } */ -/* { dg-final { scan-assembler {\tfsub\tz[0-9]+\.h, z[0-9]+\.h, z[0-9]+\.h\n} } } */ +/* { dg-final { scan-assembler {\tfsub\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} } } */ /* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #1.0\n} 2 } } */ /* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #0.5\n} 2 } } */ /* { dg-final { scan-assembler-not {\tfsub\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #2} } } */ /* { dg-final { scan-assembler-not {\tfsub\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #-} } } */ -/* { dg-final { scan-assembler {\tfadd\tz[0-9]+\.s, z[0-9]+\.s, z[0-9]+\.s\n} } } */ +/* { dg-final { scan-assembler {\tfadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} } } */ /* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #1.0\n} 2 } } */ /* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #0.5\n} 2 } } */ /* { dg-final { scan-assembler-not {\tfadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #2} } } */ /* { dg-final { scan-assembler-not {\tfadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #-} } } */ -/* { dg-final { scan-assembler {\tfsub\tz[0-9]+\.s, z[0-9]+\.s, z[0-9]+\.s\n} } } */ +/* { dg-final { scan-assembler {\tfsub\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} } } */ /* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #1.0\n} 2 } } */ /* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #0.5\n} 2 } } */ /* { dg-final { scan-assembler-not {\tfsub\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #2} } } */ /* { dg-final { scan-assembler-not {\tfsub\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #-} } } */ -/* { dg-final { scan-assembler {\tfadd\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} } } */ +/* { dg-final { scan-assembler {\tfadd\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} } } */ /* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #1.0\n} 2 } } */ /* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #0.5\n} 2 } } */ /* { dg-final { scan-assembler-not {\tfadd\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #2} } } */ /* { dg-final { scan-assembler-not {\tfadd\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #-} } } */ -/* { dg-final { scan-assembler {\tfsub\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} } } */ +/* { dg-final { scan-assembler {\tfsub\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} } } */ /* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #1.0\n} 2 } } */ /* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #0.5\n} 2 } } */ /* { dg-final { scan-assembler-not {\tfsub\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #2} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fsubr_1.c b/gcc/testsuite/gcc.target/aarch64/sve/fsubr_1.c index f47a360..012cf6e 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/fsubr_1.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/fsubr_1.c @@ -1,5 +1,5 @@ /* { dg-do assemble { target aarch64_asm_sve_ok } } */ -/* { dg-options "-O3 --save-temps" } */ +/* { dg-options "-O3 --save-temps -fno-trapping-math" } */ #define DO_IMMEDIATE_OPS(VALUE, TYPE, NAME) \ void vsubrarithimm_##NAME##_##TYPE (TYPE *dst, int count) \ diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index 09b5af6..df6239a 100644 --- a/gcc/tree-vect-stmts.cc +++ b/gcc/tree-vect-stmts.cc @@ -6332,6 +6332,7 @@ vectorizable_operation (vec_info *vinfo, int reduc_idx = STMT_VINFO_REDUC_IDX (stmt_info); vec_loop_masks *masks = (loop_vinfo ? &LOOP_VINFO_MASKS (loop_vinfo) : NULL); internal_fn cond_fn = get_conditional_internal_fn (code); + bool could_trap = gimple_could_trap_p (stmt); if (!vec_stmt) /* transformation not required. */ { @@ -6340,7 +6341,7 @@ vectorizable_operation (vec_info *vinfo, keeping the inactive lanes as-is. */ if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) - && reduc_idx >= 0) + && (could_trap || reduc_idx >= 0)) { if (cond_fn == IFN_LAST || !direct_internal_fn_supported_p (cond_fn, vectype, @@ -6483,16 +6484,31 @@ vectorizable_operation (vec_info *vinfo, vop1 = ((op_type == binary_op || op_type == ternary_op) ? vec_oprnds1[i] : NULL_TREE); vop2 = ((op_type == ternary_op) ? vec_oprnds2[i] : NULL_TREE); - if (masked_loop_p && reduc_idx >= 0) + if (masked_loop_p && (reduc_idx >= 0 || could_trap)) { - /* Perform the operation on active elements only and take - inactive elements from the reduction chain input. */ - gcc_assert (!vop2); - vop2 = reduc_idx == 1 ? vop1 : vop0; tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies, vectype, i); - gcall *call = gimple_build_call_internal (cond_fn, 4, mask, - vop0, vop1, vop2); + auto_vec vops (5); + vops.quick_push (mask); + vops.quick_push (vop0); + if (vop1) + vops.quick_push (vop1); + if (vop2) + vops.quick_push (vop2); + if (reduc_idx >= 0) + { + /* Perform the operation on active elements only and take + inactive elements from the reduction chain input. */ + gcc_assert (!vop2); + vops.quick_push (reduc_idx == 1 ? vop1 : vop0); + } + else + { + auto else_value = targetm.preferred_else_value + (cond_fn, vectype, vops.length () - 1, &vops[1]); + vops.quick_push (else_value); + } + gcall *call = gimple_build_call_internal_vec (cond_fn, vops); new_temp = make_ssa_name (vec_dest, call); gimple_call_set_lhs (call, new_temp); gimple_call_set_nothrow (call, true); -- 2.7.4