From 57052c6ed59c1a2ee4a67982f960e08593956955 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Wed, 15 Mar 2023 20:33:48 +0100 Subject: [PATCH] i386: Fix blend vector permutation for 8-byte modes 8-byte modes should be processed only for TARGET_MMX_WITH_SSE. Handle V2SFmode and fix V2HImode handling. The resulting BLEND instructions are always faster than MOVSS/MOVSD, so prioritize them over MOVSS/MOVSD for TARGET_SSE4_1. gcc/ChangeLog: * config/i386/i386-expand.cc (expand_vec_perm_blend): Handle 8-byte modes only with TARGET_MMX_WITH_SSE. Handle V2SFmode and fix V2HImode handling. (expand_vec_perm_1): Try to emit BLEND instruction before MOVSS/MOVSD. * config/i386/mmx.md (*mmx_blendps): New insn pattern. gcc/testsuite/ChangeLog: * gcc.target/i386/merge-1.c (dg-options): Use -mno-sse4. * gcc.target/i386/sse2-mmx-21.c (dg-options): Ditto. * gcc.target/i386/sse-movss-4.c (dg-options): Use -mno-sse4. Simplify scan-assembler-not strings. * gcc.target/i386/sse2-movsd-3.c (dg-options): Ditto. * gcc.target/i386/sse2-mmx-movss-1.c: New test. 
--- gcc/config/i386/i386-expand.cc | 19 +++++++++++-------- gcc/config/i386/mmx.md | 19 +++++++++++++++++++ gcc/testsuite/gcc.target/i386/merge-1.c | 2 +- gcc/testsuite/gcc.target/i386/sse-movss-4.c | 6 +++--- gcc/testsuite/gcc.target/i386/sse2-mmx-21.c | 2 +- gcc/testsuite/gcc.target/i386/sse2-mmx-movss-1.c | 13 +++++++++++++ gcc/testsuite/gcc.target/i386/sse2-movsd-3.c | 10 +++++----- 7 files changed, 53 insertions(+), 18 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/sse2-mmx-movss-1.c diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index e89abf2..1545d43 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -19007,9 +19007,10 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d) ; else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode)) ; - else if (TARGET_SSE4_1 && (GET_MODE_SIZE (vmode) == 16 - || GET_MODE_SIZE (vmode) == 8 - || GET_MODE_SIZE (vmode) == 4)) + else if (TARGET_SSE4_1 + && (GET_MODE_SIZE (vmode) == 16 + || (TARGET_MMX_WITH_SSE && GET_MODE_SIZE (vmode) == 8) + || GET_MODE_SIZE (vmode) == 4)) ; else return false; @@ -19042,6 +19043,8 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d) case E_V8SFmode: case E_V2DFmode: case E_V4SFmode: + case E_V2SFmode: + case E_V2HImode: case E_V4HImode: case E_V8HImode: case E_V8SImode: @@ -19897,11 +19900,15 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d) } } + /* Try the SSE4.1 blend variable merge instructions. */ + if (expand_vec_perm_blend (d)) + return true; + /* Try movss/movsd instructions. */ if (expand_vec_perm_movs (d)) return true; - /* Finally, try the fully general two operand permute. */ + /* Try the fully general two operand permute. */ if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt, d->testing_p)) return true; @@ -19924,10 +19931,6 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d) return true; } - /* Try the SSE4.1 blend variable merge instructions. 
*/ - if (expand_vec_perm_blend (d)) - return true; - /* Try one of the AVX vpermil variable permutations. */ if (expand_vec_perm_vpermil (d)) return true; diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md index f9c6611..18dae03 100644 --- a/gcc/config/i386/mmx.md +++ b/gcc/config/i386/mmx.md @@ -1154,6 +1154,25 @@ DONE; }) +(define_insn "*mmx_blendps" + [(set (match_operand:V2SF 0 "register_operand" "=Yr,*x,x") + (vec_merge:V2SF + (match_operand:V2SF 2 "register_operand" "Yr,*x,x") + (match_operand:V2SF 1 "register_operand" "0,0,x") + (match_operand:SI 3 "const_0_to_3_operand")))] + "TARGET_SSE4_1 && TARGET_MMX_WITH_SSE" + "@ + blendps\t{%3, %2, %0|%0, %2, %3} + blendps\t{%3, %2, %0|%0, %2, %3} + vblendps\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "isa" "noavx,noavx,avx") + (set_attr "type" "ssemov") + (set_attr "length_immediate" "1") + (set_attr "prefix_data16" "1,1,*") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "orig,orig,vex") + (set_attr "mode" "V4SF")]) + (define_insn "mmx_blendvps" [(set (match_operand:V2SF 0 "register_operand" "=Yr,*x,x") (unspec:V2SF diff --git a/gcc/testsuite/gcc.target/i386/merge-1.c b/gcc/testsuite/gcc.target/i386/merge-1.c index d525685..b018eb1 100644 --- a/gcc/testsuite/gcc.target/i386/merge-1.c +++ b/gcc/testsuite/gcc.target/i386/merge-1.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O1 -msse2" } */ +/* { dg-options "-O1 -msse2 -mno-sse4" } */ #include diff --git a/gcc/testsuite/gcc.target/i386/sse-movss-4.c b/gcc/testsuite/gcc.target/i386/sse-movss-4.c index ec3019c..d8a8a03 100644 --- a/gcc/testsuite/gcc.target/i386/sse-movss-4.c +++ b/gcc/testsuite/gcc.target/i386/sse-movss-4.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O2 -msse" } */ +/* { dg-options "-O2 -msse -mno-sse4" } */ typedef unsigned int v4si __attribute__((vector_size(16))); typedef float v4sf __attribute__((vector_size(16))); @@ -7,7 +7,7 @@ typedef float v4sf __attribute__((vector_size(16))); v4si foo(v4si 
x,v4si y) { return (v4si){y[0],x[1],x[2],x[3]}; } v4sf bar(v4sf x,v4sf y) { return (v4sf){y[0],x[1],x[2],x[3]}; } -/* { dg-final { scan-assembler-times "\tv?movss\t" 2 } } */ +/* { dg-final { scan-assembler-times "\tmovss\t" 2 } } */ /* { dg-final { scan-assembler-not "movaps" } } */ /* { dg-final { scan-assembler-not "shufps" } } */ -/* { dg-final { scan-assembler-not "vpblendw" } } */ +/* { dg-final { scan-assembler-not "pblendw" } } */ diff --git a/gcc/testsuite/gcc.target/i386/sse2-mmx-21.c b/gcc/testsuite/gcc.target/i386/sse2-mmx-21.c index 8f5341e..7f8098a 100644 --- a/gcc/testsuite/gcc.target/i386/sse2-mmx-21.c +++ b/gcc/testsuite/gcc.target/i386/sse2-mmx-21.c @@ -1,5 +1,5 @@ /* { dg-do compile { target { ! ia32 } } } */ -/* { dg-options "-O2 -msse2 -mno-mmx" } */ +/* { dg-options "-O2 -msse2 -mno-mmx -mno-sse4" } */ /* { dg-final { scan-assembler-times "pshufd" 1 } } */ /* { dg-final { scan-assembler-times "movd" 1 } } */ /* { dg-final { scan-assembler-not "%mm" } } */ diff --git a/gcc/testsuite/gcc.target/i386/sse2-mmx-movss-1.c b/gcc/testsuite/gcc.target/i386/sse2-mmx-movss-1.c new file mode 100644 index 0000000..bb79628 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/sse2-mmx-movss-1.c @@ -0,0 +1,13 @@ +/* { dg-do compile { target { ! 
ia32 } } } */ +/* { dg-options "-O2 -msse2 -mno-sse4" } */ + +typedef unsigned int v2si __attribute__((vector_size(8))); +typedef float v2sf __attribute__((vector_size(8))); + +v2si foo(v2si x,v2si y) { return (v2si){y[0],x[1]}; } +v2sf bar(v2sf x,v2sf y) { return (v2sf){y[0],x[1]}; } + +/* { dg-final { scan-assembler-times "\tmovss\t" 2 } } */ +/* { dg-final { scan-assembler-not "movaps" } } */ +/* { dg-final { scan-assembler-not "shufps" } } */ +/* { dg-final { scan-assembler-not "pblendw" } } */ diff --git a/gcc/testsuite/gcc.target/i386/sse2-movsd-3.c b/gcc/testsuite/gcc.target/i386/sse2-movsd-3.c index fadbe2b..edd4a44 100644 --- a/gcc/testsuite/gcc.target/i386/sse2-movsd-3.c +++ b/gcc/testsuite/gcc.target/i386/sse2-movsd-3.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O2 -msse2" } */ +/* { dg-options "-O2 -msse2 -mno-sse4" } */ typedef unsigned long long v2di __attribute__((vector_size(16))); typedef double v2df __attribute__((vector_size(16))); @@ -7,9 +7,9 @@ typedef double v2df __attribute__((vector_size(16))); v2di foo(v2di x,v2di y) { return (v2di){y[0],x[1]}; } v2df bar(v2df x,v2df y) { return (v2df){y[0],x[1]}; } -/* { dg-final { scan-assembler-times "\tv?movsd\t" 2 } } */ -/* { dg-final { scan-assembler-not "v?shufpd" } } */ +/* { dg-final { scan-assembler-times "\tmovsd\t" 2 } } */ +/* { dg-final { scan-assembler-not "shufpd" } } */ /* { dg-final { scan-assembler-not "movdqa" } } */ /* { dg-final { scan-assembler-not "pshufd" } } */ -/* { dg-final { scan-assembler-not "v?punpckldq" } } */ -/* { dg-final { scan-assembler-not "v?movq" } } */ +/* { dg-final { scan-assembler-not "punpckldq" } } */ +/* { dg-final { scan-assembler-not "movq" } } */ -- 2.7.4