From 698114484e668abf28c70d989bc93fa399dda9ac Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Mon, 7 Oct 2019 08:36:06 +0000 Subject: [PATCH] [i386] Make the vzeroupper pattern describe its effects (PR91994) The problem in this PR was that vzeroupper has an effect on register contents, but those effects weren't modelled in the rtl pattern, which was just an unspec_volatile. This patch fixes that by running a subpass after vzeroupper insertion to add SETs and CLOBBERs as appropriate. See the comments in the patch for more details. 2019-10-07 Richard Sandiford gcc/ PR target/91994 * config/i386/sse.md (avx_vzeroupper): Turn into a define_expand and wrap the unspec_volatile in a parallel. (*avx_vzeroupper): New define_insn. Use a match_parallel around the unspec_volatile. * config/i386/predicates.md (vzeroupper_pattern): Expect the unspec_volatile to be wrapped in a parallel. * config/i386/i386-features.c (ix86_add_reg_usage_to_vzeroupper) (ix86_add_reg_usage_to_vzerouppers): New functions. (rest_of_handle_insert_vzeroupper): Use them to add register usage information to the vzeroupper instructions. gcc/testsuite/ PR target/91994 * gcc.target/i386/pr91994.c: New test. From-SVN: r276648 --- gcc/ChangeLog | 14 ++++++++ gcc/config/i386/i386-features.c | 63 +++++++++++++++++++++++++++++++++ gcc/config/i386/predicates.md | 5 +-- gcc/config/i386/sse.md | 13 +++++-- gcc/testsuite/ChangeLog | 5 +++ gcc/testsuite/gcc.target/i386/pr91994.c | 35 ++++++++++++++++++ 6 files changed, 130 insertions(+), 5 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr91994.c diff --git a/gcc/ChangeLog b/gcc/ChangeLog index ef1eb59..ef8731f 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,17 @@ +2019-10-07 Richard Sandiford + + PR target/91994 + * config/i386/sse.md (avx_vzeroupper): Turn into a define_expand + and wrap the unspec_volatile in a parallel. + (*avx_vzeroupper): New define_insn. Use a match_parallel around + the unspec_volatile. + * config/i386/predicates.md (vzeroupper_pattern): Expect the + unspec_volatile to be wrapped in a parallel. + * config/i386/i386-features.c (ix86_add_reg_usage_to_vzeroupper) + (ix86_add_reg_usage_to_vzerouppers): New functions. + (rest_of_handle_insert_vzeroupper): Use them to add register + usage information to the vzeroupper instructions. + 2019-10-07 Richard Biener PR tree-optimization/91975 diff --git a/gcc/config/i386/i386-features.c b/gcc/config/i386/i386-features.c index 9b297ba..4781a33 100644 --- a/gcc/config/i386/i386-features.c +++ b/gcc/config/i386/i386-features.c @@ -1757,6 +1757,68 @@ convert_scalars_to_vector (bool timode_p) return 0; } +/* Modify the vzeroupper pattern in INSN so that it describes the effect + that the instruction has on the SSE registers. LIVE_REGS are the set + of registers that are live across the instruction. + + For a live register R we use: + + (set (reg:V2DF R) (reg:V2DF R)) + + which preserves the low 128 bits but clobbers the upper bits. + For a dead register we just use: + + (clobber (reg:V2DF R)) + + which invalidates any previous contents of R and stops R from becoming + live across the vzeroupper in future. */ + +static void +ix86_add_reg_usage_to_vzeroupper (rtx_insn *insn, bitmap live_regs) +{ + rtx pattern = PATTERN (insn); + unsigned int nregs = TARGET_64BIT ? 16 : 8; + rtvec vec = rtvec_alloc (nregs + 1); + RTVEC_ELT (vec, 0) = XVECEXP (pattern, 0, 0); + for (unsigned int i = 0; i < nregs; ++i) + { + unsigned int regno = GET_SSE_REGNO (i); + rtx reg = gen_rtx_REG (V2DImode, regno); + if (bitmap_bit_p (live_regs, regno)) + RTVEC_ELT (vec, i + 1) = gen_rtx_SET (reg, reg); + else + RTVEC_ELT (vec, i + 1) = gen_rtx_CLOBBER (VOIDmode, reg); + } + XVEC (pattern, 0) = vec; + df_insn_rescan (insn); +} + +/* Walk the vzeroupper instructions in the function and annotate them + with the effect that they have on the SSE registers. */ + +static void +ix86_add_reg_usage_to_vzerouppers (void) +{ + basic_block bb; + rtx_insn *insn; + auto_bitmap live_regs; + + df_analyze (); + FOR_EACH_BB_FN (bb, cfun) + { + bitmap_copy (live_regs, df_get_live_out (bb)); + df_simulate_initialize_backwards (bb, live_regs); + FOR_BB_INSNS_REVERSE (bb, insn) + { + if (!NONDEBUG_INSN_P (insn)) + continue; + if (vzeroupper_pattern (PATTERN (insn), VOIDmode)) + ix86_add_reg_usage_to_vzeroupper (insn, live_regs); + df_simulate_one_insn_backwards (bb, insn, live_regs); + } + } +} + static unsigned int rest_of_handle_insert_vzeroupper (void) { @@ -1773,6 +1835,7 @@ rest_of_handle_insert_vzeroupper (void) /* Call optimize_mode_switching. */ g->get_passes ()->execute_pass_mode_switching (); + ix86_add_reg_usage_to_vzerouppers (); return 0; } diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md index 72f8e7e..31f1cea 100644 --- a/gcc/config/i386/predicates.md +++ b/gcc/config/i386/predicates.md @@ -1441,8 +1441,9 @@ ;; return true if OP is a vzeroupper pattern. (define_predicate "vzeroupper_pattern" - (and (match_code "unspec_volatile") - (match_test "XINT (op, 1) == UNSPECV_VZEROUPPER"))) + (and (match_code "parallel") + (match_code "unspec_volatile" "a") + (match_test "XINT (XVECEXP (op, 0, 0), 1) == UNSPECV_VZEROUPPER"))) ;; Return true if OP is an addsub vec_merge operation (define_predicate "addsub_vm_operator" diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index c7f539f..07922a1 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -19622,9 +19622,16 @@ (set_attr "mode" "OI")]) ;; Clear the upper 128bits of AVX registers, equivalent to a NOP -;; if the upper 128bits are unused. -(define_insn "avx_vzeroupper" - [(unspec_volatile [(const_int 0)] UNSPECV_VZEROUPPER)] +;; if the upper 128bits are unused. Initially we expand the instructions +;; as though they had no effect on the SSE registers, but later add SETs and +;; CLOBBERs to the PARALLEL to model the real effect. +(define_expand "avx_vzeroupper" + [(parallel [(unspec_volatile [(const_int 0)] UNSPECV_VZEROUPPER)])] + "TARGET_AVX") + +(define_insn "*avx_vzeroupper" + [(match_parallel 0 "vzeroupper_pattern" + [(unspec_volatile [(const_int 0)] UNSPECV_VZEROUPPER)])] "TARGET_AVX" "vzeroupper" [(set_attr "type" "sse") diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index e8e0060..01253be 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,8 @@ +2019-10-07 Richard Sandiford + + PR target/91994 + * gcc.target/i386/pr91994.c: New test. + 2019-10-07 Richard Biener PR tree-optimization/91975 diff --git a/gcc/testsuite/gcc.target/i386/pr91994.c b/gcc/testsuite/gcc.target/i386/pr91994.c new file mode 100644 index 0000000..033be68 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr91994.c @@ -0,0 +1,35 @@ +/* { dg-do run } */ +/* { dg-require-effective-target avx } */ +/* { dg-options "-O2 -mavx -mvzeroupper" } */ + +#include "avx-check.h" + +#include + +__m256i x1, x2, x3; + +__attribute__ ((noinline)) +static void +foo (void) +{ + x1 = x2; +} + +void +bar (void) +{ + __m256i x = x1; + foo (); + x3 = x; +} + +__attribute__ ((noinline)) +void +avx_test (void) +{ + __m256i x = _mm256_set1_epi8 (3); + x1 = x; + bar (); + if (__builtin_memcmp (&x3, &x, sizeof (x))) + __builtin_abort (); +} -- 2.7.4