From f1b13064609a41fcaf4d1859663453bba237e277 Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Wed, 3 Mar 2021 10:06:14 +0100 Subject: [PATCH] i386: Fix a peephole2 for -mavx512vl -mno-avx512bw [PR99321] As the testcase shows, the (define_peephole2 [(set (match_operand 0 "sse_reg_operand") (match_operand 1 "sse_reg_operand")) (set (match_dup 0) (match_operator 3 "commutative_operator" [(match_dup 0) (match_operand 2 "memory_operand")]))] peephole2 can for AVX512VL without AVX512BW (I guess it is a hyphothetical CPU, but unfortunately they are separate CPUID bits and we have separate options for them) turn something that is valid without that peephole2 into something that is invalid (and in this case ICEs). The problem is that the vpadd[bw], vpmullw, vpmin[su][bw] and vpmax[su][bw] instructions require both AVX512BW and AVX512VL when they have 16-byte or 32-byte operands. If operands[0] is %[xy]mm0 .. %[xy]mm15 but operands[1] is %[xy]mm16 .. %[xy]mm31, then before we have a vector move which uses vmovdqa{32,64} and doesn't need AVX512BW, AVX512VL is I think implied from HARD_REGNO_MODE_OK only supporting V{16Q,32Q,8H,16H}imode in EXT_REX_SSE_REGNO_P regs with AVX512VL, and then we have a commutative operation with that %[xy]mm0 .. %[xy]mm15 destination and one source and a memory operand, so VEX encoded operation. And, the peephole2 wants to replace it with a load into the destination register from memory (ok) and then the commutative arith instruction. But that needs EVEX encoding because of the high register and so requires AVX512BW which might not be enabled. The exception is and/ior/xor, because the hw doesn't have vp{and,or,xor}{b,w} instructions at all, it uses vp{and,or,xor}d instead and that of course doesn't need AVX512BW. BTW, there are other bugs I need to look at, while the vp{min,max}ub with 16-byte operands instruction properly requires avx512bw for v constraints and otherwise uses x, e.g. the vpadd[bw] etc. instructions don't. I'll try to handle that incrementally later this week. 2021-03-03 Jakub Jelinek PR target/99321 * config/i386/predicates.md (logic_operator): New define_predicate. * config/i386/i386.md (mov + mem using comm arith peephole2): Punt if operands[1] is EXT_REX_SSE_REGNO_P, AVX512BW is not enabled and the inner mode is [QH]Imode. * gcc.target/i386/pr99321.c: New test. --- gcc/config/i386/i386.md | 11 ++++++++- gcc/config/i386/predicates.md | 4 ++++ gcc/testsuite/gcc.target/i386/pr99321.c | 41 +++++++++++++++++++++++++++++++++ 3 files changed, 55 insertions(+), 1 deletion(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr99321.c diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 333dc9a..2820f6d 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -19843,7 +19843,16 @@ (match_operator 3 "commutative_operator" [(match_dup 0) (match_operand 2 "memory_operand")]))] - "REGNO (operands[0]) != REGNO (operands[1])" + "REGNO (operands[0]) != REGNO (operands[1]) + /* Punt if operands[1] is %[xy]mm16+ and AVX512BW is not enabled, + as EVEX encoded vpadd[bw], vpmullw, vpmin[su][bw] and vpmax[su][bw] + instructions require AVX512BW and AVX512VL, but with the original + instructions it might require just AVX512VL. + AVX512VL is implied from TARGET_HARD_REGNO_MODE_OK. */ + && (!EXT_REX_SSE_REGNO_P (REGNO (operands[1])) + || TARGET_AVX512BW + || GET_MODE_SIZE (GET_MODE_INNER (GET_MODE (operands[0]))) > 2 + || logic_operator (operands[3], VOIDmode))" [(set (match_dup 0) (match_dup 2)) (set (match_dup 0) (match_op_dup 3 [(match_dup 0) (match_dup 1)]))]) diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md index ee42ba2..b6dd5e9 100644 --- a/gcc/config/i386/predicates.md +++ b/gcc/config/i386/predicates.md @@ -1486,6 +1486,10 @@ (define_predicate "div_operator" (match_code "div")) +;; Return true if this is a and, ior or xor operation. +(define_predicate "logic_operator" + (match_code "and,ior,xor")) + ;; Return true if this is a plus, minus, and, ior or xor operation. (define_predicate "plusminuslogic_operator" (match_code "plus,minus,and,ior,xor")) diff --git a/gcc/testsuite/gcc.target/i386/pr99321.c b/gcc/testsuite/gcc.target/i386/pr99321.c new file mode 100644 index 0000000..d290b67 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr99321.c @@ -0,0 +1,41 @@ +/* PR target/99321 */ +/* { dg-do compile } */ +/* { dg-options "-O2 -mtune=btver2 -fno-tree-dce -mavx512vl -mno-avx512bw" } */ + +typedef unsigned __attribute__((__vector_size__ (8))) A; +typedef unsigned int __attribute__((__vector_size__ (8))) B; +typedef unsigned char __attribute__((__vector_size__ (16))) C; +typedef unsigned __attribute__((__vector_size__ (16))) D; +typedef unsigned int __attribute__((__vector_size__ (16))) E; +typedef unsigned __attribute__((__vector_size__ (16))) F; +typedef unsigned __attribute__((__vector_size__ (32))) G; +typedef int __attribute__((__vector_size__ (32))) H; +typedef unsigned int __attribute__((__vector_size__ (32))) I; +typedef char __attribute__((__vector_size__ (64))) J; +typedef unsigned int __attribute__((__vector_size__ (64))) K; +typedef unsigned long long __attribute__((__vector_size__ (64))) L; +unsigned char a; +unsigned b, c; +H d; +E e, f; +D g; +L h; + +A +foo0 (A i, C j, G k, B l, K m, B n, I o) +{ + J p, q = a != p; + F r = b << f; + int s = a * 15; + C t = (1 << ((C) ((C) { 80 } >=j) & sizeof (0)) | (j ^ (C) { 5 }) << (j & sizeof (0))) != 0; + L u = h; + H v = d - 40; + u ^= -(long long) n; + D w = (char) s > g; + o ^= c / o; + J x = p + q + (J) m + (J) u + (J) u; + G y = ((union { J a; G b;}) x).b + ((union { J a; G b[2];}) x).b[1] + k + v + o; + C z = ((union { G a; C b;}) y).b + ((union { G a; C b;}) y).b + j + t + (C) g + (C) w + (C) e + (C) f + (C) r; + A zz = ((union { C a; A b;}) z).b + i + l + n; + return zz; +} -- 2.7.4