From 011464ede0edbcb5512bf24374fe8805f654d82e Mon Sep 17 00:00:00 2001 From: Hongtao Liu Date: Sat, 26 Oct 2019 02:34:34 +0000 Subject: [PATCH] Fix false dependence of scalar operation vrcp/vsqrt/vrsqrt/vrndscale For instructions with xmm operand: op %xmmN,%xmmQ,%xmmQ ----> op %xmmN, %xmmN, %xmmQ For instructions with mem operand or gpr operand: op mem/gpr, %xmmQ, %xmmQ ---> using pass rpad ----> xorps %xmmN, %xmmN, %xmmN op mem/gpr, %xmmN, %xmmQ Performance impact on SPEC2017 fprate, tested on SKX ---- 503.bwaves_r -0.03% 507.cactuBSSN_r -0.22% 508.namd_r -0.02% 510.parest_r 0.37% 511.povray_r 0.74% 519.lbm_r 0.24% 521.wrf_r 2.35% 526.blender_r 0.71% 527.cam4_r 0.65% 538.imagick_r 0.95% 544.nab_r -0.37% 549.fotonik3d_r 0.24% 554.roms_r 0.90% fprate geomean 0.50% ----- Changelog gcc/ * config/i386/i386.md (*rcpsf2_sse): Add avx_partial_xmm_update, prefer m constraint for TARGET_AVX. (*rsqrtsf2_sse): Ditto. (*sqrt2_sse): Ditto. (sse4_1_round2): Separate constraint vm, add avx_partial_xmm_update, prefer m constraint for TARGET_AVX. * config/i386/sse.md (*sse_vmrcpv4sf2): New define_insn used by pass rpad. (*_vmsqrt2*): Ditto. (*sse_vmrsqrtv4sf2): Ditto. (*avx512f_rndscale): Ditto. (*sse4_1_round): Ditto. (sse4_1_round): Add m constraint and pointer size modifier since vround supports a memory operand. gcc/testsuite * gcc.target/i386/pr87007-4.c: New test. * gcc.target/i386/pr87007-5.c: Ditto. 
From-SVN: r277469 --- gcc/ChangeLog | 19 ++++++ gcc/config/i386/i386.md | 67 +++++++++++---------- gcc/config/i386/sse.md | 97 ++++++++++++++++++++++++++++++- gcc/testsuite/ChangeLog | 6 ++ gcc/testsuite/gcc.target/i386/pr87007-4.c | 18 ++++++ gcc/testsuite/gcc.target/i386/pr87007-5.c | 18 ++++++ 6 files changed, 193 insertions(+), 32 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr87007-4.c create mode 100644 gcc/testsuite/gcc.target/i386/pr87007-5.c diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 555cf54..7015e1b 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,22 @@ +2019-10-26 Hongtao Liu + + PR target/89071 + * config/i386/i386.md (*rcpsf2_sse): Add + avx_partial_xmm_update, prefer m constraint for TARGET_AVX. + (*rsqrtsf2_sse): Ditto. + (*sqrt2_sse): Ditto. + (sse4_1_round2): separate constraint vm, add + avx_partail_xmm_update, prefer m constraint for TARGET_AVX. + * config/i386/sse.md (*sse_vmrcpv4sf2"): New define_insn used + by pass rpad. + (*_vmsqrt2*): + Ditto. + (*sse_vmrsqrtv4sf2): Ditto. + (*avx512f_rndscale): Ditto. + (*sse4_1_round): Ditto. + (sse4_1_round): Add m constraint and + pointer size modifier since vround support memory operand. 
+ 2019-10-18 Georg-Johann Lay PR target/85969 diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 5e07959..fb2235a 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -14843,13 +14843,14 @@ (set_attr "btver2_sse_attr" "rcp") (set_attr "prefix" "maybe_vex") (set_attr "mode" "SF") + (set_attr "avx_partial_xmm_update" "false,false,true") (set (attr "preferred_for_speed") - (cond [(eq_attr "alternative" "1") - (symbol_ref "TARGET_AVX || !TARGET_SSE_PARTIAL_REG_DEPENDENCY") - (eq_attr "alternative" "2") - (symbol_ref "!TARGET_SSE_PARTIAL_REG_DEPENDENCY") - ] - (symbol_ref "true")))]) + (cond [(match_test "TARGET_AVX") + (symbol_ref "true") + (eq_attr "alternative" "1,2") + (symbol_ref "!TARGET_SSE_PARTIAL_REG_DEPENDENCY") + ] + (symbol_ref "true")))]) (define_insn "*fop_xf_1_i387" [(set (match_operand:XF 0 "register_operand" "=f,f") @@ -15089,13 +15090,14 @@ (set_attr "btver2_sse_attr" "rcp") (set_attr "prefix" "maybe_vex") (set_attr "mode" "SF") + (set_attr "avx_partial_xmm_update" "false,false,true") (set (attr "preferred_for_speed") - (cond [(eq_attr "alternative" "1") - (symbol_ref "TARGET_AVX || !TARGET_SSE_PARTIAL_REG_DEPENDENCY") - (eq_attr "alternative" "2") - (symbol_ref "!TARGET_SSE_PARTIAL_REG_DEPENDENCY") - ] - (symbol_ref "true")))]) + (cond [(match_test "TARGET_AVX") + (symbol_ref "true") + (eq_attr "alternative" "1,2") + (symbol_ref "!TARGET_SSE_PARTIAL_REG_DEPENDENCY") + ] + (symbol_ref "true")))]) (define_expand "rsqrtsf2" [(set (match_operand:SF 0 "register_operand") @@ -15120,14 +15122,15 @@ (set_attr "atom_sse_attr" "sqrt") (set_attr "btver2_sse_attr" "sqrt") (set_attr "prefix" "maybe_vex") + (set_attr "avx_partial_xmm_update" "false,false,true") (set_attr "mode" "") (set (attr "preferred_for_speed") - (cond [(eq_attr "alternative" "1") - (symbol_ref "TARGET_AVX || !TARGET_SSE_PARTIAL_REG_DEPENDENCY") - (eq_attr "alternative" "2") - (symbol_ref "!TARGET_SSE_PARTIAL_REG_DEPENDENCY") - ] - (symbol_ref "true")))]) + 
(cond [(match_test "TARGET_AVX") + (symbol_ref "true") + (eq_attr "alternative" "1,2") + (symbol_ref "!TARGET_SSE_PARTIAL_REG_DEPENDENCY") + ] + (symbol_ref "true")))]) (define_expand "sqrt2" [(set (match_operand:MODEF 0 "register_operand") @@ -16261,30 +16264,32 @@ (define_insn "sse4_1_round2" - [(set (match_operand:MODEF 0 "register_operand" "=x,x,x,v") + [(set (match_operand:MODEF 0 "register_operand" "=x,x,x,v,v") (unspec:MODEF - [(match_operand:MODEF 1 "nonimmediate_operand" "0,x,m,vm") - (match_operand:SI 2 "const_0_to_15_operand" "n,n,n,n")] + [(match_operand:MODEF 1 "nonimmediate_operand" "0,x,m,v,m") + (match_operand:SI 2 "const_0_to_15_operand" "n,n,n,n,n")] UNSPEC_ROUND))] "TARGET_SSE4_1" "@ %vround\t{%2, %d1, %0|%0, %d1, %2} %vround\t{%2, %d1, %0|%0, %d1, %2} %vround\t{%2, %1, %d0|%d0, %1, %2} + vrndscale\t{%2, %d1, %0|%0, %d1, %2} vrndscale\t{%2, %1, %d0|%d0, %1, %2}" [(set_attr "type" "ssecvt") - (set_attr "prefix_extra" "1,1,1,*") - (set_attr "length_immediate" "*,*,*,1") - (set_attr "prefix" "maybe_vex,maybe_vex,maybe_vex,evex") - (set_attr "isa" "noavx512f,noavx512f,noavx512f,avx512f") + (set_attr "prefix_extra" "1,1,1,*,*") + (set_attr "length_immediate" "*,*,*,1,1") + (set_attr "prefix" "maybe_vex,maybe_vex,maybe_vex,evex,evex") + (set_attr "isa" "noavx512f,noavx512f,noavx512f,avx512f,avx512f") + (set_attr "avx_partial_xmm_update" "false,false,true,false,true") (set_attr "mode" "") (set (attr "preferred_for_speed") - (cond [(eq_attr "alternative" "1") - (symbol_ref "TARGET_AVX || !TARGET_SSE_PARTIAL_REG_DEPENDENCY") - (eq_attr "alternative" "2") - (symbol_ref "!TARGET_SSE_PARTIAL_REG_DEPENDENCY") - ] - (symbol_ref "true")))]) + (cond [(match_test "TARGET_AVX") + (symbol_ref "true") + (eq_attr "alternative" "1,2") + (symbol_ref "!TARGET_SSE_PARTIAL_REG_DEPENDENCY") + ] + (symbol_ref "true")))]) (define_insn "rintxf2" [(set (match_operand:XF 0 "register_operand" "=f") diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 
403e91d..ce0dccf 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -2035,6 +2035,25 @@ (set_attr "prefix" "orig,vex") (set_attr "mode" "SF")]) +(define_insn "*sse_vmrcpv4sf2" + [(set (match_operand:V4SF 0 "register_operand" "=x,x") + (vec_merge:V4SF + (vec_duplicate:V4SF + (unspec:SF [(match_operand:SF 1 "nonimmediate_operand" "xm,xm")] + UNSPEC_RCP)) + (match_operand:V4SF 2 "register_operand" "0,x") + (const_int 1)))] + "TARGET_SSE" + "@ + rcpss\t{%1, %0|%0, %1} + vrcpss\t{%1, %2, %0|%0, %2, %1}" + [(set_attr "isa" "noavx,avx") + (set_attr "type" "sse") + (set_attr "atom_sse_attr" "rcp") + (set_attr "btver2_sse_attr" "rcp") + (set_attr "prefix" "orig,vex") + (set_attr "mode" "SF")]) + (define_insn "rcp14" [(set (match_operand:VF_AVX512VL 0 "register_operand" "=v") (unspec:VF_AVX512VL @@ -2130,6 +2149,25 @@ (set_attr "btver2_sse_attr" "sqrt") (set_attr "mode" "")]) +(define_insn "*_vmsqrt2" + [(set (match_operand:VF_128 0 "register_operand" "=x,v") + (vec_merge:VF_128 + (vec_duplicate:VF_128 + (sqrt: + (match_operand: 1 "nonimmediate_operand" "xm,"))) + (match_operand:VF_128 2 "register_operand" "0,v") + (const_int 1)))] + "TARGET_SSE" + "@ + sqrt\t{%1, %0|%0, %1} + vsqrt\t{%1, %2, %0|%0, %2, %1}" + [(set_attr "isa" "noavx,avx") + (set_attr "type" "sse") + (set_attr "atom_sse_attr" "sqrt") + (set_attr "prefix" "") + (set_attr "btver2_sse_attr" "sqrt") + (set_attr "mode" "")]) + (define_expand "rsqrt2" [(set (match_operand:VF1_128_256 0 "register_operand") (unspec:VF1_128_256 @@ -2219,6 +2257,23 @@ (set_attr "prefix" "orig,vex") (set_attr "mode" "SF")]) +(define_insn "*sse_vmrsqrtv4sf2" + [(set (match_operand:V4SF 0 "register_operand" "=x,x") + (vec_merge:V4SF + (vec_duplicate:V4SF + (unspec:SF [(match_operand:SF 1 "nonimmediate_operand" "xm,xm")] + UNSPEC_RSQRT)) + (match_operand:V4SF 2 "register_operand" "0,x") + (const_int 1)))] + "TARGET_SSE" + "@ + rsqrtss\t{%1, %0|%0, %1} + vrsqrtss\t{%1, %2, %0|%0, %2, %1}" + [(set_attr "isa" "noavx,avx") + 
(set_attr "type" "sse") + (set_attr "prefix" "orig,vex") + (set_attr "mode" "SF")]) + (define_expand "3" [(set (match_operand:VF 0 "register_operand") (smaxmin:VF @@ -9709,6 +9764,22 @@ (set_attr "prefix" "evex") (set_attr "mode" "")]) +(define_insn "*avx512f_rndscale" + [(set (match_operand:VF_128 0 "register_operand" "=v") + (vec_merge:VF_128 + (vec_duplicate:VF_128 + (unspec: + [(match_operand: 2 "" "") + (match_operand:SI 3 "const_0_to_255_operand")] + UNSPEC_ROUND)) + (match_operand:VF_128 1 "register_operand" "v") + (const_int 1)))] + "TARGET_AVX512F" + "vrndscale\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "length_immediate" "1") + (set_attr "prefix" "evex") + (set_attr "mode" "")]) + ;; One bit in mask selects 2 elements. (define_insn "avx512f_shufps512_1" [(set (match_operand:V16SF 0 "register_operand" "=v") @@ -17954,13 +18025,37 @@ [(set (match_operand:VF_128 0 "register_operand" "=Yr,*x,x,v") (vec_merge:VF_128 (unspec:VF_128 - [(match_operand:VF_128 2 "register_operand" "Yr,*x,x,v") + [(match_operand:VF_128 2 "nonimmediate_operand" "Yrm,*xm,xm,vm") (match_operand:SI 3 "const_0_to_15_operand" "n,n,n,n")] UNSPEC_ROUND) (match_operand:VF_128 1 "register_operand" "0,0,x,v") (const_int 1)))] "TARGET_SSE4_1" "@ + round\t{%3, %2, %0|%0, %2, %3} + round\t{%3, %2, %0|%0, %2, %3} + vround\t{%3, %2, %1, %0|%0, %1, %2, %3} + vrndscale\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "isa" "noavx,noavx,avx,avx512f") + (set_attr "type" "ssecvt") + (set_attr "length_immediate" "1") + (set_attr "prefix_data16" "1,1,*,*") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "orig,orig,vex,evex") + (set_attr "mode" "")]) + +(define_insn "*sse4_1_round" + [(set (match_operand:VF_128 0 "register_operand" "=Yr,*x,x,v") + (vec_merge:VF_128 + (vec_duplicate:VF_128 + (unspec: + [(match_operand: 2 "nonimmediate_operand" "Yrm,*xm,xm,vm") + (match_operand:SI 3 "const_0_to_15_operand" "n,n,n,n")] + UNSPEC_ROUND)) + (match_operand:VF_128 1 "register_operand" "0,0,x,v") + 
(const_int 1)))] + "TARGET_SSE4_1" + "@ round\t{%3, %2, %0|%0, %2, %3} round\t{%3, %2, %0|%0, %2, %3} vround\t{%3, %2, %1, %0|%0, %1, %2, %3} diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 38ac851..c5900b2 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,9 @@ +2019-10-26 Hongtao Liu + + PR target/89071 + * gcc.target/i386/pr87007-4.c: New test. + * gcc.target/i386/pr87007-5.c: Ditto. + 2019-10-25 Marek Polacek PR c++/91581 - ICE in exception-specification of defaulted ctor. diff --git a/gcc/testsuite/gcc.target/i386/pr87007-4.c b/gcc/testsuite/gcc.target/i386/pr87007-4.c new file mode 100644 index 0000000..e91bdcb --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr87007-4.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-options "-Ofast -march=skylake-avx512 -mfpmath=sse" } */ + + +#include <math.h> + +extern double d1, d2, d3; +void +foo (int n, int k) +{ + for (int i = 0; i != n; i++) + if(i < k) + d1 = floor (d2); + else + d1 = ceil (d3); +} + +/* { dg-final { scan-assembler-times "vxorps\[^\n\r\]*xmm\[0-9\]" 1 } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr87007-5.c b/gcc/testsuite/gcc.target/i386/pr87007-5.c new file mode 100644 index 0000000..20d13cf --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr87007-5.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-options "-Ofast -march=skylake-avx512 -mfpmath=sse" } */ + + +#include <math.h> + +extern double d1, d2, d3; +void +foo (int n, int k) +{ + for (int i = 0; i != n; i++) + if(i < k) + d1 = sqrt (d2); + else + d1 = sqrt (d3); +} + +/* { dg-final { scan-assembler-times "vxorps\[^\n\r\]*xmm\[0-9\]" 1 } } */ -- 2.7.4