From 95e1eca43d106d821720744ac6ff1f5df41a1e78 Mon Sep 17 00:00:00 2001
From: liuhongt
Date: Wed, 11 Aug 2021 14:00:00 +0800
Subject: [PATCH] Combine avx_vec_concatv16si and avx512f_zero_extendv16hiv16si2_1 to avx512f_zero_extendv16hiv16si2_2.

Add define_insn_and_split to combine avx_vec_concatv16si/2 and
avx512f_zero_extendv16hiv16si2_1 since the latter already zero-extends
the upper bits; the same applies to the other patterns which are related
to pmovzx{bw,wd,dq}.

It will do optimizations like

-       vmovdqa %ymm0, %ymm0    # 7     [c=4 l=6]  avx_vec_concatv16si/2
        vpmovzxwd       %ymm0, %zmm0    # 22    [c=4 l=6]  avx512f_zero_extendv16hiv16si2
        ret             # 25    [c=0 l=1]  simple_return_internal

gcc/ChangeLog:

	PR target/101846
	* config/i386/sse.md (*avx2_zero_extendv16qiv16hi2_2): New
	post_reload define_insn_and_split.
	(*avx512bw_zero_extendv32qiv32hi2_2): Ditto.
	(*sse4_1_zero_extendv8qiv8hi2_4): Ditto.
	(*avx512f_zero_extendv16hiv16si2_2): Ditto.
	(*avx2_zero_extendv8hiv8si2_2): Ditto.
	(*sse4_1_zero_extendv4hiv4si2_4): Ditto.
	(*avx512f_zero_extendv8siv8di2_2): Ditto.
	(*avx2_zero_extendv4siv4di2_2): Ditto.
	(*sse4_1_zero_extendv2siv2di2_4): Ditto.
	(VI248_256, VI248_512, VI148_512, VI148_256, VI148_128): New
	mode iterators.

gcc/testsuite/ChangeLog:

	PR target/101846
	* gcc.target/i386/pr101846-1.c: New test.
---
 gcc/config/i386/sse.md                     | 219 +++++++++++++++++++++++++++++
 gcc/testsuite/gcc.target/i386/pr101846-1.c |  95 +++++++++++++
 2 files changed, 314 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101846-1.c

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 3957c86..3a7bbae 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -681,7 +681,12 @@
 (define_mode_iterator VI124_128 [V16QI V8HI V4SI])
 (define_mode_iterator VI24_128 [V8HI V4SI])
 (define_mode_iterator VI248_128 [V8HI V4SI V2DI])
+(define_mode_iterator VI248_256 [V16HI V8SI V4DI])
+(define_mode_iterator VI248_512 [V32HI V16SI V8DI])
 (define_mode_iterator VI48_128 [V4SI V2DI])
+(define_mode_iterator VI148_512 [V64QI V16SI V8DI])
+(define_mode_iterator VI148_256 [V32QI V8SI V4DI])
+(define_mode_iterator VI148_128 [V16QI V4SI V2DI])
 
 ;; Various 256bit and 512 vector integer mode combinations
 (define_mode_iterator VI124_256 [V32QI V16HI V8SI])
@@ -18603,6 +18608,26 @@
   operands[1] = lowpart_subreg (V16QImode, operands[1], V32QImode);
 })
 
+(define_insn_and_split "*avx2_zero_extendv16qiv16hi2_2"
+  [(set (match_operand:V32QI 0 "register_operand" "=v")
+        (vec_select:V32QI
+          (vec_concat:V64QI
+            (subreg:V32QI
+              (vec_concat:VI248_256
+                (match_operand:<ssehalfvecmode> 1 "nonimmediate_operand" "vm")
+                (match_operand:<ssehalfvecmode> 2 "const0_operand" "C")) 0)
+            (match_operand:V32QI 3 "const0_operand" "C"))
+          (match_parallel 4 "pmovzx_parallel"
+            [(match_operand 5 "const_int_operand" "n")])))]
+  "TARGET_AVX2"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0) (zero_extend:V16HI (match_dup 1)))]
+{
+  operands[0] = lowpart_subreg (V16HImode, operands[0], V32QImode);
+  operands[1] = lowpart_subreg (V16QImode, operands[1], <ssehalfvecmode>mode);
+})
+
 (define_expand "<insn>v16qiv16hi2"
   [(set (match_operand:V16HI 0 "register_operand")
         (any_extend:V16HI
@@ -18637,6 +18662,26 @@
   operands[1] = lowpart_subreg (V32QImode, operands[1], V64QImode);
 })
 
+(define_insn_and_split "*avx512bw_zero_extendv32qiv32hi2_2"
+  [(set (match_operand:V64QI 0 "register_operand" "=v")
+        (vec_select:V64QI
+          (vec_concat:V128QI
+            (subreg:V64QI
+              (vec_concat:VI248_512
+                (match_operand:<ssehalfvecmode> 1 "nonimmediate_operand" "vm")
+                (match_operand:<ssehalfvecmode> 2 "const0_operand" "C")) 0)
+            (match_operand:V64QI 3 "const0_operand" "C"))
+          (match_parallel 4 "pmovzx_parallel"
+            [(match_operand 5 "const_int_operand" "n")])))]
+  "TARGET_AVX512BW"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0) (zero_extend:V32HI (match_dup 1)))]
+{
+  operands[0] = lowpart_subreg (V32HImode, operands[0], V64QImode);
+  operands[1] = lowpart_subreg (V32QImode, operands[1], <ssehalfvecmode>mode);
+})
+
 (define_expand "<insn>v32qiv32hi2"
   [(set (match_operand:V32HI 0 "register_operand")
         (any_extend:V32HI
@@ -18723,6 +18768,41 @@
 }
   [(set_attr "isa" "noavx,noavx,avx")])
 
+(define_insn_and_split "*sse4_1_zero_extendv8qiv8hi2_4"
+  [(set (match_operand:V16QI 0 "register_operand" "=Yr,*x,Yw")
+        (vec_select:V16QI
+          (vec_concat:V32QI
+            (subreg:V16QI
+              (vec_concat:VI248_128
+                (match_operand:<ssehalfvecmode> 1 "vector_operand" "YrBm,*xBm,Ywm")
+                (match_operand:<ssehalfvecmode> 2 "const0_operand" "C,C,C")) 0)
+            (match_operand:V16QI 3 "const0_operand" "C,C,C"))
+          (match_parallel 4 "pmovzx_parallel"
+            [(match_operand 5 "const_int_operand" "n,n,n")])))]
+  "TARGET_SSE4_1"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0)
+        (zero_extend:V8HI
+          (vec_select:V8QI
+            (match_dup 1)
+            (parallel [(const_int 0) (const_int 1)
+                       (const_int 2) (const_int 3)
+                       (const_int 4) (const_int 5)
+                       (const_int 6) (const_int 7)]))))]
+{
+  operands[0] = lowpart_subreg (V8HImode, operands[0], V16QImode);
+  if (MEM_P (operands[1]))
+    {
+      operands[1] = lowpart_subreg (V8QImode, operands[1], <ssehalfvecmode>mode);
+      operands[1] = gen_rtx_ZERO_EXTEND (V8HImode, operands[1]);
+      emit_insn (gen_rtx_SET (operands[0], operands[1]));
+      DONE;
+    }
+  operands[1] = lowpart_subreg (V16QImode, operands[1], <ssehalfvecmode>mode);
+}
+  [(set_attr "isa" "noavx,noavx,avx")])
+
 (define_expand "<insn>v8qiv8hi2"
   [(set (match_operand:V8HI 0 "register_operand")
         (any_extend:V8HI
@@ -18913,6 +18993,26 @@
   operands[1] = lowpart_subreg (V16HImode, operands[1], V32HImode);
 })
 
+(define_insn_and_split "*avx512f_zero_extendv16hiv16si2_2"
+  [(set (match_operand:V32HI 0 "register_operand" "=v")
+        (vec_select:V32HI
+          (vec_concat:V64HI
+            (subreg:V32HI
+              (vec_concat:VI148_512
+                (match_operand:<ssehalfvecmode> 1 "nonimmediate_operand" "vm")
+                (match_operand:<ssehalfvecmode> 2 "const0_operand" "C")) 0)
+            (match_operand:V32HI 3 "const0_operand" "C"))
+          (match_parallel 4 "pmovzx_parallel"
+            [(match_operand 5 "const_int_operand" "n")])))]
+  "TARGET_AVX512F"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0) (zero_extend:V16SI (match_dup 1)))]
+{
+  operands[0] = lowpart_subreg (V16SImode, operands[0], V32HImode);
+  operands[1] = lowpart_subreg (V16HImode, operands[1], <ssehalfvecmode>mode);
+})
+
 (define_insn "avx2_<code>v8hiv8si2<mask_name>"
   [(set (match_operand:V8SI 0 "register_operand" "=v")
         (any_extend:V8SI
@@ -18947,6 +19047,27 @@
   operands[1] = lowpart_subreg (V8HImode, operands[1], V16HImode);
 })
 
+(define_insn_and_split "*avx2_zero_extendv8hiv8si2_2"
+  [(set (match_operand:V16HI 0 "register_operand" "=v")
+        (vec_select:V16HI
+          (vec_concat:V32HI
+            (subreg:V16HI
+              (vec_concat:VI148_256
+                (match_operand:<ssehalfvecmode> 1 "nonimmediate_operand" "vm")
+                (match_operand:<ssehalfvecmode> 2 "const0_operand" "C")) 0)
+            (match_operand:V16HI 3 "const0_operand" "C"))
+          (match_parallel 4 "pmovzx_parallel"
+            [(match_operand 5 "const_int_operand" "n")])))]
+  "TARGET_AVX2"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0) (zero_extend:V8SI (match_dup 1)))]
+{
+  operands[0] = lowpart_subreg (V8SImode, operands[0], V16HImode);
+  operands[1] = lowpart_subreg (V8HImode, operands[1], <ssehalfvecmode>mode);
+})
+
+
 (define_insn "sse4_1_<code>v4hiv4si2<mask_name>"
   [(set (match_operand:V4SI 0 "register_operand" "=Yr,*x,v")
         (any_extend:V4SI
@@ -19036,6 +19157,39 @@
 }
   [(set_attr "isa" "noavx,noavx,avx")])
 
+(define_insn_and_split "*sse4_1_zero_extendv4hiv4si2_4"
+  [(set (match_operand:V8HI 0 "register_operand" "=Yr,*x,v")
+        (vec_select:V8HI
+          (vec_concat:V16HI
+            (subreg:V8HI
+              (vec_concat:VI148_128
+                (match_operand:<ssehalfvecmode> 1 "vector_operand" "YrBm,*xBm,vm")
+                (match_operand:<ssehalfvecmode> 2 "const0_operand" "C,C,C")) 0)
+            (match_operand:V8HI 3 "const0_operand" "C,C,C"))
+          (match_parallel 4 "pmovzx_parallel"
+            [(match_operand 5 "const_int_operand" "n,n,n")])))]
+  "TARGET_SSE4_1"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0)
+        (zero_extend:V4SI
+          (vec_select:V4HI
+            (match_dup 1)
+            (parallel [(const_int 0) (const_int 1)
+                       (const_int 2) (const_int 3)]))))]
+{
+  operands[0] = lowpart_subreg (V4SImode, operands[0], V8HImode);
+  if (MEM_P (operands[1]))
+    {
+      operands[1] = lowpart_subreg (V4HImode, operands[1], <ssehalfvecmode>mode);
+      operands[1] = gen_rtx_ZERO_EXTEND (V4SImode, operands[1]);
+      emit_insn (gen_rtx_SET (operands[0], operands[1]));
+      DONE;
+    }
+  operands[1] = lowpart_subreg (V8HImode, operands[1], <ssehalfvecmode>mode);
+}
+  [(set_attr "isa" "noavx,noavx,avx")])
+
 (define_insn "avx512f_<code>v8qiv8di2<mask_name>"
   [(set (match_operand:V8DI 0 "register_operand" "=v")
         (any_extend:V8DI
@@ -19346,6 +19500,24 @@
   operands[1] = lowpart_subreg (V8SImode, operands[1], V16SImode);
 })
 
+(define_insn_and_split "*avx512f_zero_extendv8siv8di2_2"
+  [(set (match_operand:V16SI 0 "register_operand" "=v")
+        (vec_select:V16SI
+          (vec_concat:V32SI
+            (vec_concat:V16SI
+              (match_operand:V8SI 1 "nonimmediate_operand" "vm")
+              (match_operand:V8SI 2 "const0_operand" "C"))
+            (match_operand:V16SI 3 "const0_operand" "C"))
+          (match_parallel 4 "pmovzx_parallel"
+            [(match_operand 5 "const_int_operand" "n")])))]
+  "TARGET_AVX512F"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0) (zero_extend:V8DI (match_dup 1)))]
+{
+  operands[0] = lowpart_subreg (V8DImode, operands[0], V16SImode);
+})
+
 (define_expand "<insn>v8siv8di2"
   [(set (match_operand:V8DI 0 "register_operand" "=v")
         (any_extend:V8DI
@@ -19380,6 +19552,24 @@
   operands[1] = lowpart_subreg (V4SImode, operands[1], V8SImode);
 })
 
+(define_insn_and_split "*avx2_zero_extendv4siv4di2_2"
+  [(set (match_operand:V8SI 0 "register_operand" "=v")
+        (vec_select:V8SI
+          (vec_concat:V16SI
+            (vec_concat:V8SI
+              (match_operand:V4SI 1 "nonimmediate_operand" "vm")
+              (match_operand:V4SI 2 "const0_operand" "C"))
+            (match_operand:V8SI 3 "const0_operand" "C"))
+          (match_parallel 4 "pmovzx_parallel"
+            [(match_operand 5 "const_int_operand" "n")])))]
+  "TARGET_AVX2"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0) (zero_extend:V4DI (match_dup 1)))]
+{
+  operands[0] = lowpart_subreg (V4DImode, operands[0], V8SImode);
+})
+
 (define_expand "<insn>v4siv4di2"
   [(set (match_operand:V4DI 0 "register_operand")
         (any_extend:V4DI
@@ -19456,6 +19646,35 @@
 }
   [(set_attr "isa" "noavx,noavx,avx")])
 
+(define_insn_and_split "*sse4_1_zero_extendv2siv2di2_4"
+  [(set (match_operand:V4SI 0 "register_operand" "=Yr,*x,v")
+        (vec_select:V4SI
+          (vec_concat:V8SI
+            (vec_concat:V4SI
+              (match_operand:V2SI 1 "vector_operand" "YrBm, *xBm, vm")
+              (match_operand:V2SI 2 "const0_operand" "C,C,C"))
+            (match_operand:V4SI 3 "const0_operand" "C,C,C"))
+          (match_parallel 4 "pmovzx_parallel"
+            [(match_operand 5 "const_int_operand" "n,n,n")])))]
+  "TARGET_SSE4_1"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0)
+        (zero_extend:V2DI
+          (vec_select:V2SI (match_dup 1)
+            (parallel [(const_int 0) (const_int 1)]))))]
+{
+  operands[0] = lowpart_subreg (V2DImode, operands[0], V4SImode);
+  if (MEM_P (operands[1]))
+    {
+      operands[1] = gen_rtx_ZERO_EXTEND (V2DImode, operands[1]);
+      emit_insn (gen_rtx_SET (operands[0], operands[1]));
+      DONE;
+    }
+  operands[1] = lowpart_subreg (V4SImode, operands[1], V2SImode);
+}
+  [(set_attr "isa" "noavx,noavx,avx")])
+
 (define_expand "<insn>v2siv2di2"
   [(set (match_operand:V2DI 0 "register_operand")
         (any_extend:V2DI
diff --git a/gcc/testsuite/gcc.target/i386/pr101846-1.c b/gcc/testsuite/gcc.target/i386/pr101846-1.c
new file mode 100644
index 0000000..40d95bd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101846-1.c
@@ -0,0 +1,95 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512bw -mavx512vl -mavx512dq -O2" } */
+/* { dg-final { scan-assembler-not "vmov" } } */
+/* { dg-final { scan-assembler-times "vpmovzxbw" "3" } } */
+/* { dg-final { scan-assembler-times "vpmovzxwd" "3" } } */
+/* { dg-final { scan-assembler-times "vpmovzxdq" "3" } } */
+
+typedef short v4hi __attribute__((vector_size (8)));
+typedef short v8hi __attribute__((vector_size (16)));
+typedef short v16hi __attribute__((vector_size (32)));
+typedef short v32hi __attribute__((vector_size (64)));
+typedef char v8qi __attribute__((vector_size (8)));
+typedef char v16qi __attribute__((vector_size (16)));
+typedef char v32qi __attribute__((vector_size (32)));
+typedef char v64qi __attribute__((vector_size (64)));
+typedef int v2si __attribute__((vector_size (8)));
+typedef int v4si __attribute__((vector_size (16)));
+typedef int v8si __attribute__((vector_size (32)));
+typedef int v16si __attribute__((vector_size (64)));
+
+v32hi
+foo_zxwd_512 (v16hi x)
+{
+  return __builtin_shufflevector (x, (v16hi) {},
+                                  0, 16, 1, 17, 2, 18, 3, 19,
+                                  4, 20, 5, 21, 6, 22, 7, 23,
+                                  8, 24, 9, 25, 10, 26, 11, 27,
+                                  12, 28, 13, 29, 14, 30, 15, 31);
+}
+
+v16hi
+foo_zxwd_256 (v8hi x)
+{
+  return __builtin_shufflevector (x, (v8hi) {},
+                                  0, 8, 1, 9, 2, 10, 3, 11,
+                                  4, 12, 5, 13, 6, 14, 7, 15);
+}
+
+v8hi
+foo_zxwd_128 (v4hi x)
+{
+  return __builtin_shufflevector (x, (v4hi) {}, 0, 4, 1, 5, 2, 6, 3, 7);
+}
+
+v16si
+foo_zxdq_512 (v8si x)
+{
+  return __builtin_shufflevector (x, (v8si) {},
+                                  0, 8, 1, 9, 2, 10, 3, 11,
+                                  4, 12, 5, 13, 6, 14, 7, 15);
+}
+
+v8si
+foo_zxdq_256 (v4si x)
+{
+  return __builtin_shufflevector (x, (v4si) {}, 0, 4, 1, 5, 2, 6, 3, 7);
+}
+
+v4si
+foo_zxdq_128 (v2si x)
+{
+  return __builtin_shufflevector (x, (v2si) {}, 0, 2, 1, 3);
+}
+
+v64qi
+foo_zxbw_512 (v32qi x)
+{
+  return __builtin_shufflevector (x, (v32qi) {},
+                                  0, 32, 1, 33, 2, 34, 3, 35,
+                                  4, 36, 5, 37, 6, 38, 7, 39,
+                                  8, 40, 9, 41, 10, 42, 11, 43,
+                                  12, 44, 13, 45, 14, 46, 15, 47,
+                                  16, 48, 17, 49, 18, 50, 19, 51,
+                                  20, 52, 21, 53, 22, 54, 23, 55,
+                                  24, 56, 25, 57, 26, 58, 27, 59,
+                                  28, 60, 29, 61, 30, 62, 31, 63);
+}
+
+v32qi
+foo_zxbw_256 (v16qi x)
+{
+  return __builtin_shufflevector (x, (v16qi) {},
+                                  0, 16, 1, 17, 2, 18, 3, 19,
+                                  4, 20, 5, 21, 6, 22, 7, 23,
+                                  8, 24, 9, 25, 10, 26, 11, 27,
+                                  12, 28, 13, 29, 14, 30, 15, 31);
+}
+
+v16qi
+foo_zxbw_128 (v8qi x)
+{
+  return __builtin_shufflevector (x, (v8qi) {},
+                                  0, 8, 1, 9, 2, 10, 3, 11,
+                                  4, 12, 5, 13, 6, 14, 7, 15);
+}
-- 
2.7.4