From 46ca31d65092e5afcef292f807fcf14c5363280d Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Tue, 18 May 2021 17:25:54 +0200 Subject: [PATCH] i386: Implement 4-byte vector support [PR100637] MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Add infrastructure, logic and arithmetic support for 4-byte vectors. These can be used with SSE2 targets, where movd instructions from/to XMM registers are available. x86_64 ABI passes 4-byte vectors in integer registers, so also add logic operations with integer registers. 2021-05-18 Uroš Bizjak gcc/ PR target/100637 * config/i386/i386.h (VALID_SSE2_REG_MODE): Add V4QI and V2HI modes. (VALID_INT_MODE_P): Ditto. * config/i386/mmx.md (VI_32): New mode iterator. (mmxvecsize): Handle V4QI and V2HI. (Yv_Yw): Ditto. (mov<mode>): New expander. (*mov<mode>_internal): New insn pattern. (movmisalign<mode>): New expander. (neg<mode>2): New expander. (<insn><mode>3): New expander. (*<insn><mode>3): New insn pattern. (mulv2hi3): New expander. (*mulv2hi3): New insn pattern. (one_cmpl<mode>2): New expander. (*andnot<mode>3): New insn pattern. (<code><mode>3): New expander. (*<code><mode>3): New insn pattern. gcc/testsuite/ PR target/100637 * gcc.target/i386/pr100637-1b.c: New test. * gcc.target/i386/pr100637-1w.c: Ditto. * gcc.target/i386/pr92658-avx2-2.c: Do not XFAIL scan for pmovsxbq. * gcc.target/i386/pr92658-avx2.c: Do not XFAIL scan for pmovzxbq. * gcc.target/i386/pr92658-avx512vl.c: Do not XFAIL scan for vpmovdb. * gcc.target/i386/pr92658-sse4-2.c: Do not XFAIL scan for pmovsxbd and pmovsxwq. * gcc.target/i386/pr92658-sse4.c: Do not XFAIL scan for pmovzxbd and pmovzxwq. 
--- gcc/config/i386/i386.h | 15 +- gcc/config/i386/mmx.md | 195 ++++++++++++++++++++++- gcc/testsuite/gcc.target/i386/pr100637-1b.c | 25 +++ gcc/testsuite/gcc.target/i386/pr100637-1w.c | 28 ++++ gcc/testsuite/gcc.target/i386/pr92658-avx2-2.c | 2 +- gcc/testsuite/gcc.target/i386/pr92658-avx2.c | 2 +- gcc/testsuite/gcc.target/i386/pr92658-avx512vl.c | 2 +- gcc/testsuite/gcc.target/i386/pr92658-sse4-2.c | 4 +- gcc/testsuite/gcc.target/i386/pr92658-sse4.c | 4 +- 9 files changed, 260 insertions(+), 17 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr100637-1b.c create mode 100644 gcc/testsuite/gcc.target/i386/pr100637-1w.c diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 97d6f38..d15f9b2 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -1007,6 +1007,7 @@ extern const char *host_detect_local_cpu (int argc, const char **argv); #define VALID_SSE2_REG_MODE(MODE) \ ((MODE) == V16QImode || (MODE) == V8HImode || (MODE) == V2DFmode \ + || (MODE) == V4QImode || (MODE) == V2HImode \ || (MODE) == V2DImode || (MODE) == DFmode) #define VALID_SSE_REG_MODE(MODE) \ @@ -1034,12 +1035,14 @@ extern const char *host_detect_local_cpu (int argc, const char **argv); || (MODE) == SCmode || (MODE) == DCmode || (MODE) == XCmode) \ #define VALID_INT_MODE_P(MODE) \ - ((MODE) == QImode || (MODE) == HImode || (MODE) == SImode \ - || (MODE) == DImode \ - || (MODE) == CQImode || (MODE) == CHImode || (MODE) == CSImode \ - || (MODE) == CDImode \ - || (TARGET_64BIT && ((MODE) == TImode || (MODE) == CTImode \ - || (MODE) == TFmode || (MODE) == TCmode))) + ((MODE) == QImode || (MODE) == HImode \ + || (MODE) == SImode || (MODE) == DImode \ + || (MODE) == CQImode || (MODE) == CHImode \ + || (MODE) == CSImode || (MODE) == CDImode \ + || (TARGET_64BIT \ + && ((MODE) == TImode || (MODE) == CTImode \ + || (MODE) == TFmode || (MODE) == TCmode)) \ + || (MODE) == V4QImode || (MODE) == V2HImode) /* Return true for modes passed in SSE registers. 
*/ #define SSE_REG_MODE_P(MODE) \ diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md index 7fc2e5d..7806b62 100644 --- a/gcc/config/i386/mmx.md +++ b/gcc/config/i386/mmx.md @@ -57,11 +57,15 @@ (define_mode_iterator MMXMODE24 [V4HI V2SI]) (define_mode_iterator MMXMODE248 [V4HI V2SI V1DI]) +;; All 32bit integer vector modes +(define_mode_iterator VI_32 [V4QI V2HI]) + ;; All V2S* modes (define_mode_iterator V2FI [V2SF V2SI]) ;; Mapping from integer vector mode to mnemonic suffix -(define_mode_attr mmxvecsize [(V8QI "b") (V4HI "w") (V2SI "d") (V1DI "q")]) +(define_mode_attr mmxvecsize + [(V8QI "b") (V4QI "b") (V4HI "w") (V2HI "w") (V2SI "d") (V1DI "q")]) (define_mode_attr mmxdoublemode [(V8QI "V8HI") (V4HI "V4SI")]) @@ -74,7 +78,8 @@ [(V2SF "v2si") (V2SI "v2si") (V4HI "v4hi") (V8QI "v8qi")]) (define_mode_attr Yv_Yw - [(V8QI "Yw") (V4HI "Yw") (V2SI "Yv") (V1DI "Yv") (V2SF "Yv")]) + [(V8QI "Yw") (V4QI "Yw") (V4HI "Yw") (V2HI "Yw") + (V2SI "Yv") (V1DI "Yv") (V2SF "Yv")]) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; @@ -233,6 +238,80 @@ DONE; }) +(define_expand "mov" + [(set (match_operand:VI_32 0 "nonimmediate_operand") + (match_operand:VI_32 1 "nonimmediate_operand"))] + "TARGET_SSE2" +{ + ix86_expand_vector_move (mode, operands); + DONE; +}) + +(define_insn "*mov_internal" + [(set (match_operand:VI_32 0 "nonimmediate_operand" + "=r ,m ,v,v,v,m,r,v") + (match_operand:VI_32 1 "general_operand" + "rmC,rC,C,v,m,v,v,r"))] + "TARGET_SSE2 && + !(MEM_P (operands[0]) && MEM_P (operands[1]))" +{ + switch (get_attr_type (insn)) + { + case TYPE_IMOV: + return "mov{l}\t{%1, %0|%0, %1}"; + + case TYPE_SSELOG1: + return standard_sse_constant_opcode (insn, operands); + + case TYPE_SSEMOV: + return ix86_output_ssemov (insn, operands); + + default: + gcc_unreachable (); + } +} + [(set (attr "type") + (cond [(eq_attr "alternative" "2") + (const_string "sselog1") + (eq_attr "alternative" "3,4,5,6,7") + (const_string "ssemov") + ] + (const_string 
"imov"))) + (set (attr "prefix") + (if_then_else (eq_attr "type" "sselog1,ssemov") + (const_string "maybe_vex") + (const_string "orig"))) + (set (attr "prefix_data16") + (if_then_else (and (eq_attr "type" "ssemov") (eq_attr "mode" "SI")) + (const_string "1") + (const_string "*"))) + (set (attr "mode") + (cond [(eq_attr "alternative" "2,3") + (cond [(match_test "TARGET_AVX") + (const_string "TI") + (match_test "optimize_function_for_size_p (cfun)") + (const_string "V4SF") + ] + (const_string "TI")) + ] + (const_string "SI"))) + (set (attr "preferred_for_speed") + (cond [(eq_attr "alternative" "6") + (symbol_ref "TARGET_INTER_UNIT_MOVES_FROM_VEC") + (eq_attr "alternative" "7") + (symbol_ref "TARGET_INTER_UNIT_MOVES_TO_VEC") + ] + (symbol_ref "true")))]) + +(define_expand "movmisalign" + [(set (match_operand:VI_32 0 "nonimmediate_operand") + (match_operand:VI_32 1 "nonimmediate_operand"))] + "TARGET_SSE2" +{ + ix86_expand_vector_move (mode, operands); + DONE; +}) + (define_insn "sse_movntq" [(set (match_operand:DI 0 "memory_operand" "=m,m") (unspec:DI [(match_operand:DI 1 "register_operand" "y,r")] @@ -1229,6 +1308,14 @@ ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +(define_expand "neg2" + [(set (match_operand:MMXMODEI 0 "register_operand") + (minus:MMXMODEI + (match_dup 2) + (match_operand:MMXMODEI 1 "register_operand")))] + "TARGET_MMX_WITH_SSE" + "operands[2] = force_reg (mode, CONST0_RTX (mode));") + (define_expand "mmx_3" [(set (match_operand:MMXMODEI8 0 "register_operand") (plusminus:MMXMODEI8 @@ -1248,8 +1335,10 @@ (define_insn "*mmx_3" [(set (match_operand:MMXMODEI8 0 "register_operand" "=y,x,") (plusminus:MMXMODEI8 - (match_operand:MMXMODEI8 1 "register_mmxmem_operand" "0,0,") - (match_operand:MMXMODEI8 2 "register_mmxmem_operand" "ym,x,")))] + (match_operand:MMXMODEI8 1 "register_mmxmem_operand" + "0,0,") + (match_operand:MMXMODEI8 2 "register_mmxmem_operand" + "ym,x,")))] "(TARGET_MMX || TARGET_MMX_WITH_SSE) && 
ix86_binary_operator_ok (, mode, operands)" "@ @@ -1261,6 +1350,36 @@ (set_attr "type" "mmxadd,sseadd,sseadd") (set_attr "mode" "DI,TI,TI")]) +(define_expand "neg2" + [(set (match_operand:VI_32 0 "register_operand") + (minus:VI_32 + (match_dup 2) + (match_operand:VI_32 1 "register_operand")))] + "TARGET_SSE2" + "operands[2] = force_reg (mode, CONST0_RTX (mode));") + +(define_expand "3" + [(set (match_operand:VI_32 0 "register_operand") + (plusminus:VI_32 + (match_operand:VI_32 1 "register_operand") + (match_operand:VI_32 2 "register_operand")))] + "TARGET_SSE2" + "ix86_fixup_binary_operands_no_copy (, mode, operands);") + +(define_insn "*3" + [(set (match_operand:VI_32 0 "register_operand" "=x,") + (plusminus:VI_32 + (match_operand:VI_32 1 "register_operand" "0,") + (match_operand:VI_32 2 "register_operand" "x,")))] + "TARGET_SSE2 + && ix86_binary_operator_ok (, mode, operands)" + "@ + p\t{%2, %0|%0, %2} + vp\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "isa" "noavx,avx") + (set_attr "type" "sseadd") + (set_attr "mode" "TI")]) + (define_expand "mmx_3" [(set (match_operand:MMXMODE12 0 "register_operand") (sat_plusminus:MMXMODE12 @@ -1314,6 +1433,26 @@ (set_attr "type" "mmxmul,ssemul,ssemul") (set_attr "mode" "DI,TI,TI")]) +(define_expand "mulv2hi3" + [(set (match_operand:V2HI 0 "register_operand") + (mult:V2HI (match_operand:V2HI 1 "register_operand") + (match_operand:V2HI 2 "register_operand")))] + "TARGET_SSE2" + "ix86_fixup_binary_operands_no_copy (MULT, V2HImode, operands);") + +(define_insn "*mulv2hi3" + [(set (match_operand:V2HI 0 "register_operand" "=x,Yw") + (mult:V2HI (match_operand:V2HI 1 "register_operand" "%0,Yw") + (match_operand:V2HI 2 "register_operand" "x,Yw")))] + "TARGET_SSE2 + && ix86_binary_operator_ok (MULT, V2HImode, operands)" + "@ + pmullw\t{%2, %0|%0, %2} + vpmullw\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "isa" "noavx,avx") + (set_attr "type" "ssemul") + (set_attr "mode" "TI")]) + (define_expand "mmx_smulv4hi3_highpart" [(set (match_operand:V4HI 0 
"register_operand") (truncate:V4HI @@ -1839,6 +1978,14 @@ "TARGET_MMX_WITH_SSE" "operands[2] = force_reg (mode, CONSTM1_RTX (mode));") +(define_expand "one_cmpl2" + [(set (match_operand:VI_32 0 "register_operand") + (xor:VI_32 + (match_operand:VI_32 1 "register_operand") + (match_dup 2)))] + "TARGET_SSE2" + "operands[2] = force_reg (mode, CONSTM1_RTX (mode));") + (define_insn "mmx_andnot3" [(set (match_operand:MMXMODEI 0 "register_operand" "=y,x,x,v") (and:MMXMODEI @@ -1855,6 +2002,22 @@ (set_attr "type" "mmxadd,sselog,sselog,sselog") (set_attr "mode" "DI,TI,TI,TI")]) +(define_insn "*andnot3" + [(set (match_operand:VI_32 0 "register_operand" "=r,x,x,v") + (and:VI_32 + (not:VI_32 (match_operand:VI_32 1 "register_operand" "r,0,x,v")) + (match_operand:VI_32 2 "register_operand" "r,x,x,v")))] + "TARGET_SSE2" + "@ + andn\t{%2, %1, %0|%0, %1, %2} + pandn\t{%2, %0|%0, %2} + vpandn\t{%2, %1, %0|%0, %1, %2} + vpandnd\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "isa" "bmi,noavx,avx,avx512vl") + (set_attr "type" "bitmanip,sselog,sselog,sselog") + (set_attr "btver2_decode" "direct,*,*,*") + (set_attr "mode" "SI,TI,TI,TI")]) + (define_expand "mmx_3" [(set (match_operand:MMXMODEI 0 "register_operand") (any_logic:MMXMODEI @@ -1888,6 +2051,30 @@ (set_attr "type" "mmxadd,sselog,sselog,sselog") (set_attr "mode" "DI,TI,TI,TI")]) +(define_expand "3" + [(set (match_operand:VI_32 0 "register_operand") + (any_logic:VI_32 + (match_operand:VI_32 1 "register_operand") + (match_operand:VI_32 2 "register_operand")))] + "TARGET_SSE2" + "ix86_fixup_binary_operands_no_copy (, mode, operands);") + +(define_insn "*3" + [(set (match_operand:VI_32 0 "register_operand" "=r,x,x,v") + (any_logic:VI_32 + (match_operand:VI_32 1 "register_operand" "%0,0,x,v") + (match_operand:VI_32 2 "register_operand" "r,x,x,v")))] + "TARGET_SSE2 + && ix86_binary_operator_ok (, mode, operands)" + "@ + \t{%2, %0|%0, %2} + p\t{%2, %0|%0, %2} + vp\t{%2, %1, %0|%0, %1, %2} + vpd\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "isa" 
"*,noavx,avx,avx512vl") + (set_attr "type" "alu,sselog,sselog,sselog") + (set_attr "mode" "SI,TI,TI,TI")]) + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; ;; Parallel integral element swizzling diff --git a/gcc/testsuite/gcc.target/i386/pr100637-1b.c b/gcc/testsuite/gcc.target/i386/pr100637-1b.c new file mode 100644 index 0000000..3e7445a --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr100637-1b.c @@ -0,0 +1,25 @@ +/* PR target/100637 */ +/* { dg-do compile } */ +/* { dg-options "-O2 -msse2 -dp" } */ + +typedef char __v4qi __attribute__ ((__vector_size__ (4))); + +__v4qi and (__v4qi a, __v4qi b) { return a & b; }; +/* { dg-final { scan-assembler "andv4qi3" } } */ + +__v4qi andn (__v4qi a, __v4qi b) { return a & ~b; }; +/* { dg-final { scan-assembler "andnotv4qi3" } } */ + +__v4qi or (__v4qi a, __v4qi b) { return a | b; }; +/* { dg-final { scan-assembler "iorv4qi3" } } */ + +__v4qi xor (__v4qi a, __v4qi b) { return a ^ b; }; +__v4qi not (__v4qi a) { return ~a; }; +/* { dg-final { scan-assembler-times "xorv4qi3" 2 } } */ + +__v4qi plus (__v4qi a, __v4qi b) { return a + b; }; +/* { dg-final { scan-assembler "addv4qi3" } } */ + +__v4qi minus (__v4qi a, __v4qi b) { return a - b; }; +__v4qi neg (__v4qi a) { return -a; }; +/* { dg-final { scan-assembler-times "subv4qi3" 2 } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr100637-1w.c b/gcc/testsuite/gcc.target/i386/pr100637-1w.c new file mode 100644 index 0000000..ed1baeb --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr100637-1w.c @@ -0,0 +1,28 @@ +/* PR target/100637 */ +/* { dg-do compile } */ +/* { dg-options "-O2 -msse2 -dp" } */ + +typedef short __v2hi __attribute__ ((__vector_size__ (4))); + +__v2hi and (__v2hi a, __v2hi b) { return a & b; }; +/* { dg-final { scan-assembler "andv2hi3" } } */ + +__v2hi andn (__v2hi a, __v2hi b) { return a & ~b; }; +/* { dg-final { scan-assembler "andnotv2hi3" } } */ + +__v2hi or (__v2hi a, __v2hi b) { return a | b; }; +/* { dg-final { 
scan-assembler "iorv2hi3" } } */ + +__v2hi xor (__v2hi a, __v2hi b) { return a ^ b; }; +__v2hi not (__v2hi a) { return ~a; }; +/* { dg-final { scan-assembler-times "xorv2hi3" 2 } } */ + +__v2hi plus (__v2hi a, __v2hi b) { return a + b; }; +/* { dg-final { scan-assembler "addv2hi3" } } */ + +__v2hi minus (__v2hi a, __v2hi b) { return a - b; }; +__v2hi neg (__v2hi a) { return -a; }; +/* { dg-final { scan-assembler-times "subv2hi3" 2 } } */ + +__v2hi mul (__v2hi a, __v2hi b) { return a * b; }; +/* { dg-final { scan-assembler "mulv2hi3" } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr92658-avx2-2.c b/gcc/testsuite/gcc.target/i386/pr92658-avx2-2.c index 7aad858..6c30702 100644 --- a/gcc/testsuite/gcc.target/i386/pr92658-avx2-2.c +++ b/gcc/testsuite/gcc.target/i386/pr92658-avx2-2.c @@ -109,7 +109,7 @@ bar_s8_s64 (v4di * dst, v32qi src) dst[0] = *(v4di *) tem; } -/* { dg-final { scan-assembler-times "pmovsxbq" 2 { xfail *-*-* } } } */ +/* { dg-final { scan-assembler-times "pmovsxbq" 2 } } */ void foo_s16_s32 (v8si * dst, v16hi * __restrict src) diff --git a/gcc/testsuite/gcc.target/i386/pr92658-avx2.c b/gcc/testsuite/gcc.target/i386/pr92658-avx2.c index 21fa3e5..70b5bdb 100644 --- a/gcc/testsuite/gcc.target/i386/pr92658-avx2.c +++ b/gcc/testsuite/gcc.target/i386/pr92658-avx2.c @@ -109,7 +109,7 @@ bar_u8_u64 (v4di * dst, v32qi src) dst[0] = *(v4di *) tem; } -/* { dg-final { scan-assembler-times "pmovzxbq" 2 { xfail *-*-* } } } */ +/* { dg-final { scan-assembler-times "pmovzxbq" 2 } } */ void foo_u16_u32 (v8si * dst, v16hi * __restrict src) diff --git a/gcc/testsuite/gcc.target/i386/pr92658-avx512vl.c b/gcc/testsuite/gcc.target/i386/pr92658-avx512vl.c index dc50084..a9f7d7e 100644 --- a/gcc/testsuite/gcc.target/i386/pr92658-avx512vl.c +++ b/gcc/testsuite/gcc.target/i386/pr92658-avx512vl.c @@ -126,4 +126,4 @@ truncdb_128 (v16qi * dst, v4si * __restrict src) /* { dg-final { scan-assembler-times "vpmovqb" 2 { xfail *-*-* } } } */ /* { dg-final { scan-assembler-times 
"vpmovdw" 1 } } */ /* { dg-final { scan-assembler-times "vpmovdw" 2 { xfail *-*-* } } } */ -/* { dg-final { scan-assembler-times "vpmovdb" 2 { xfail *-*-* } } } */ +/* { dg-final { scan-assembler-times "vpmovdb" 2 } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr92658-sse4-2.c b/gcc/testsuite/gcc.target/i386/pr92658-sse4-2.c index ca174ce..53e89ad 100644 --- a/gcc/testsuite/gcc.target/i386/pr92658-sse4-2.c +++ b/gcc/testsuite/gcc.target/i386/pr92658-sse4-2.c @@ -61,7 +61,7 @@ bar_s8_s32 (v4si * dst, v16qi src) dst[0] = *(v4si *) tem; } -/* { dg-final { scan-assembler-times "pmovsxbd" 2 { xfail *-*-* } } } */ +/* { dg-final { scan-assembler-times "pmovsxbd" 2 } } */ void foo_s8_s64 (v2di * dst, v16qi * __restrict src) @@ -125,7 +125,7 @@ bar_s16_s64 (v2di * dst, v8hi src) dst[0] = *(v2di *) tem; } -/* { dg-final { scan-assembler-times "pmovsxwq" 2 { xfail *-*-* } } } */ +/* { dg-final { scan-assembler-times "pmovsxwq" 2 } } */ void foo_s32_s64 (v2di * dst, v4si * __restrict src) diff --git a/gcc/testsuite/gcc.target/i386/pr92658-sse4.c b/gcc/testsuite/gcc.target/i386/pr92658-sse4.c index e462629..e12e163 100644 --- a/gcc/testsuite/gcc.target/i386/pr92658-sse4.c +++ b/gcc/testsuite/gcc.target/i386/pr92658-sse4.c @@ -61,7 +61,7 @@ bar_u8_u32 (v4si * dst, v16qi src) dst[0] = *(v4si *) tem; } -/* { dg-final { scan-assembler-times "pmovzxbd" 2 { xfail *-*-* } } } */ +/* { dg-final { scan-assembler-times "pmovzxbd" 2 } } */ void foo_u8_u64 (v2di * dst, v16qi * __restrict src) @@ -125,7 +125,7 @@ bar_u16_u64 (v2di * dst, v8hi src) dst[0] = *(v2di *) tem; } -/* { dg-final { scan-assembler-times "pmovzxwq" 2 { xfail *-*-* } } } */ +/* { dg-final { scan-assembler-times "pmovzxwq" 2 } } */ void foo_u32_u64 (v2di * dst, v4si * __restrict src) -- 2.7.4