From 2a83259f837e5cbd39467a3faf954b51d9d13664 Mon Sep 17 00:00:00 2001
From: Roger Sayle
Date: Tue, 2 Nov 2021 21:58:32 +0000
Subject: [PATCH] x86_64: Improved implementation of TImode rotations.
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

This simple patch improves the implementation of 128-bit (TImode)
rotations on x86_64 (a missed optimization opportunity spotted
during the recent V1TImode improvements).

Currently, the function:

unsigned __int128 rotrti3(unsigned __int128 x, unsigned int i) {
  return (x >> i) | (x << (128-i));
}

produces:

rotrti3:
        movq    %rsi, %r8
        movq    %rdi, %r9
        movl    %edx, %ecx
        movq    %rdi, %rsi
        movq    %r9, %rax
        movq    %r8, %rdx
        movq    %r8, %rdi
        shrdq   %r8, %rax
        shrq    %cl, %rdx
        xorl    %r8d, %r8d
        testb   $64, %cl
        cmovne  %rdx, %rax
        cmovne  %r8, %rdx
        negl    %ecx
        andl    $127, %ecx
        shldq   %r9, %rdi
        salq    %cl, %rsi
        xorl    %r9d, %r9d
        testb   $64, %cl
        cmovne  %rsi, %rdi
        cmovne  %r9, %rsi
        orq     %rdi, %rdx
        orq     %rsi, %rax
        ret

With this patch, GCC will now generate the much nicer:

rotrti3:
        movl    %edx, %ecx
        movq    %rdi, %rdx
        shrdq   %rsi, %rdx
        shrdq   %rdi, %rsi
        andl    $64, %ecx
        movq    %rdx, %rax
        cmove   %rsi, %rdx
        cmovne  %rsi, %rax
        ret

Even I wasn't expecting the optimizer's choice of the final three
instructions; a thing of beauty.  For rotations larger than 64,
the lowpart and the highpart (%rax and %rdx) are transposed, and
it would be nice to have a conditional swap/exchange.  The inspired
solution the compiler comes up with is to store/duplicate the same
value in both %rax/%rdx, and then use complementary conditional moves
to either update the lowpart or highpart, which cleverly avoids the
potential decode-stage pipeline stall (on some microarchitectures)
from having multiple instructions conditional on the same condition.
See X86_TUNE_ONE_IF_CONV_INSN, and notice there are two such stalls
in the original expansion of rot[rl]ti3.

2021-11-02  Roger Sayle
            Uroš Bizjak

	* config/i386/i386.md (<any_rotate:insn>ti3): Provide expansion for
	rotations by non-constant amounts.
---
 gcc/config/i386/i386.md | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 11535df..6eb9de8 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -12589,8 +12589,24 @@
     emit_insn (gen_ix86_<insn>ti3_doubleword
                 (operands[0], operands[1], operands[2]));
   else
-    FAIL;
-
+    {
+      rtx amount = force_reg (QImode, operands[2]);
+      rtx src_lo = gen_lowpart (DImode, operands[1]);
+      rtx src_hi = gen_highpart (DImode, operands[1]);
+      rtx tmp_lo = gen_reg_rtx (DImode);
+      rtx tmp_hi = gen_reg_rtx (DImode);
+      emit_move_insn (tmp_lo, src_lo);
+      emit_move_insn (tmp_hi, src_hi);
+      rtx (*shiftd) (rtx, rtx, rtx)
+        = (<CODE> == ROTATE) ? gen_x86_64_shld : gen_x86_64_shrd;
+      emit_insn (shiftd (tmp_lo, src_hi, amount));
+      emit_insn (shiftd (tmp_hi, src_lo, amount));
+      rtx dst_lo = gen_lowpart (DImode, operands[0]);
+      rtx dst_hi = gen_highpart (DImode, operands[0]);
+      emit_move_insn (dst_lo, tmp_lo);
+      emit_move_insn (dst_hi, tmp_hi);
+      emit_insn (gen_x86_shiftdi_adj_1 (dst_lo, dst_hi, amount, tmp_lo));
+    }
   DONE;
 })
 
-- 
2.7.4