From 00193676a5a3e7e50e1fa6646bb5abb5a7b2acbb Mon Sep 17 00:00:00 2001
From: Roger Sayle
Date: Thu, 30 Jun 2022 11:00:03 +0100
Subject: [PATCH] Use xchg for DImode double word rotate by 32 bits with -m32 on x86.
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

This patch was motivated by the investigation of Linus Torvalds' spill
heavy cryptography kernels in PR 105930.  The <any_rotate>di3 expander
handles all rotations by an immediate constant for 1..63 bits with the
exception of 32 bits, which FAILs and is then split by the middle-end.
This patch makes these 32-bit doubleword rotations consistent with the
other DImode rotations during reload, which results in reduced register
pressure, fewer instructions and the use of x86's xchg instruction when
appropriate.  In theory, xchg can be handled by register renaming, but
even on micro-architectures where it's implemented by 3 uops (no worse
than a three instruction shuffle), avoiding nominating a "temporary"
register, reduces user-visible register pressure (and has obvious code
size benefits).

The effects are best shown with the new testcase:

unsigned long long bar();
unsigned long long foo()
{
  unsigned long long x = bar();
  return (x>>32) | (x<<32);
}

for which GCC with -m32 -O2 currently generates:

	subl	$12, %esp
	call	bar
	addl	$12, %esp
	movl	%eax, %ecx
	movl	%edx, %eax
	movl	%ecx, %edx
	ret

but with this patch now generates:

	subl	$12, %esp
	call	bar
	addl	$12, %esp
	xchgl	%edx, %eax
	ret

With this patch, the number of lines of assembly language generated
for the blake2b kernel (from the attachment to PR105930) decreases
from 5626 to 5404.  Although there's an impressive reduction in
instruction count, there's no change/reduction in stack frame size.

2022-06-30  Roger Sayle
	    Uroš Bizjak

gcc/ChangeLog
	* config/i386/i386.md (swap_mode): Rename from *swap<mode> to
	provide gen_swapsi.
	(<any_rotate>di3): Handle !TARGET_64BIT rotations by 32 bits
	via new gen_<insn>32di2_doubleword below.
	(<any_rotate>32di2_doubleword): New define_insn_and_split
	that splits after reload as either a pair of move instructions
	or an xchgl (using gen_swapsi).

gcc/testsuite/ChangeLog
	* gcc.target/i386/xchg-3.c: New test case.
---
 gcc/config/i386/i386.md                | 22 +++++++++++++++++++++-
 gcc/testsuite/gcc.target/i386/xchg-3.c | 12 ++++++++++++
 2 files changed, 33 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/xchg-3.c

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 125a3b4..04cd2bc 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -2966,7 +2966,7 @@
    (set_attr "memory" "load")
    (set_attr "mode" "<MODE>")])
 
-(define_insn "*swap<mode>"
+(define_insn "swap<mode>"
   [(set (match_operand:SWI48 0 "register_operand" "+r")
         (match_operand:SWI48 1 "register_operand" "+r"))
    (set (match_dup 1)
@@ -13673,6 +13673,8 @@
   else if (const_1_to_31_operand (operands[2], VOIDmode))
     emit_insn (gen_ix86_<insn>di3_doubleword
                 (operands[0], operands[1], operands[2]));
+  else if (CONST_INT_P (operands[2]) && INTVAL (operands[2]) == 32)
+    emit_insn (gen_<insn>32di2_doubleword (operands[0], operands[1]));
   else
     FAIL;
 
@@ -13845,6 +13847,24 @@
   split_double_mode (<DWI>mode, &operands[0], 1, &operands[4], &operands[5]);
 })
 
+(define_insn_and_split "<insn>32di2_doubleword"
+ [(set (match_operand:DI 0 "register_operand" "=r,r,r")
+       (any_rotate:DI (match_operand:DI 1 "nonimmediate_operand" "0,r,o")
+                      (const_int 32)))]
+ "!TARGET_64BIT"
+ "#"
+ "&& reload_completed"
+ [(set (match_dup 0) (match_dup 3))
+  (set (match_dup 2) (match_dup 1))]
+{
+  split_double_mode (DImode, &operands[0], 2, &operands[0], &operands[2]);
+  if (rtx_equal_p (operands[0], operands[1]))
+    {
+      emit_insn (gen_swapsi (operands[0], operands[2]));
+      DONE;
+    }
+})
+
 (define_mode_attr rorx_immediate_operand
   [(SI "const_0_to_31_operand")
    (DI "const_0_to_63_operand")])
diff --git a/gcc/testsuite/gcc.target/i386/xchg-3.c b/gcc/testsuite/gcc.target/i386/xchg-3.c
new file mode 100644
index 0000000..eec05f0
--- /dev/null
+++ 
b/gcc/testsuite/gcc.target/i386/xchg-3.c
@@ -0,0 +1,12 @@
+/* { dg-do compile { target ia32 } } */
+/* { dg-options "-O2" } */
+
+unsigned long long bar();
+
+unsigned long long foo()
+{
+  unsigned long long x = bar();
+  return (x>>32) | (x<<32);
+}
+
+/*{ dg-final { scan-assembler "xchgl" } } */
-- 
2.7.4