From 3b8794302b52a819ca3ea78238e9b5025d1c56dd Mon Sep 17 00:00:00 2001 From: Roger Sayle Date: Fri, 24 Jun 2022 07:15:08 +0100 Subject: [PATCH] PR target/105930: Split *xordi3_doubleword after reload on x86. MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit This patch addresses PR target/105930 which is an ia32 stack frame size regression in high-register pressure XOR-rich cryptography functions reported by Linus Torvalds. The underlying problem is once the limited number of registers on the x86 are exhausted, the register allocator has to decide which to spill, where some eviction choices lead to much poorer code, but these consequences are difficult to predict in advance. The patch below, which splits xordi3_doubleword and iordi3_doubleword after reload (instead of before), significantly reduces the amount of spill code and stack frame size, in what might appear to be an arbitrary choice. My explanation of this behaviour is that the mixing of pre-reload split SImode instructions and post-reload split DImode instructions is confusing some of the heuristics used by reload. One might think that splitting early gives the register allocator more freedom to use available registers, but in practice the constraint that double word values occupy consecutive registers (when ultimately used as a DImode value) is the greater constraint. Instead, I believe in this case, the pseudo registers used in memory addressing, appear to be double counted for split SImode instructions compared to unsplit DImode instructions. For the reduced test case in comment #13, this leads to %eax being used to hold the long-lived argument pointer "v", blocking the use of the ax:dx pair for processing double word values. The important lines are at the very top of the assembly output: GCC 11 [use %ecx to address memory, require a 24-byte stack frame] sub esp, 24 mov ecx, DWORD PTR [esp+40] GCC 12 [use %eax to address memory, require a 44-byte stack frame] sub esp, 44 mov eax, DWORD PTR [esp+64] 2022-06-24 Roger Sayle Uroš Bizjak gcc/ChangeLog PR target/105930 * config/i386/i386.md (*di3_doubleword): Split after reload. Use rtx_equal_p to avoid creating memory-to-memory moves, and emit NOTE_INSN_DELETED if operand[2] is zero (i.e. with -O0). --- gcc/config/i386/i386.md | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 3093cb5..dd173f7 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -10539,22 +10539,30 @@ "ix86_expand_binary_operator (, mode, operands); DONE;") (define_insn_and_split "*di3_doubleword" - [(set (match_operand:DI 0 "nonimmediate_operand") + [(set (match_operand:DI 0 "nonimmediate_operand" "=ro,r") (any_or:DI - (match_operand:DI 1 "nonimmediate_operand") - (match_operand:DI 2 "x86_64_szext_general_operand"))) + (match_operand:DI 1 "nonimmediate_operand" "0,0") + (match_operand:DI 2 "x86_64_szext_general_operand" "re,o"))) (clobber (reg:CC FLAGS_REG))] "!TARGET_64BIT - && ix86_binary_operator_ok (, DImode, operands) - && ix86_pre_reload_split ()" + && ix86_binary_operator_ok (, DImode, operands)" "#" - "&& 1" + "&& reload_completed" [(const_int 0)] { + /* This insn may disappear completely when operands[2] == const0_rtx + and operands[0] == operands[1], which requires a NOTE_INSN_DELETED. */ + bool emit_insn_deleted_note_p = false; + split_double_mode (DImode, &operands[0], 3, &operands[0], &operands[3]); if (operands[2] == const0_rtx) - emit_move_insn (operands[0], operands[1]); + { + if (!rtx_equal_p (operands[0], operands[1])) + emit_move_insn (operands[0], operands[1]); + else + emit_insn_deleted_note_p = true; + } else if (operands[2] == constm1_rtx) { if ( == IOR) @@ -10566,7 +10574,12 @@ ix86_expand_binary_operator (, SImode, &operands[0]); if (operands[5] == const0_rtx) - emit_move_insn (operands[3], operands[4]); + { + if (!rtx_equal_p (operands[3], operands[4])) + emit_move_insn (operands[3], operands[4]); + else if (emit_insn_deleted_note_p) + emit_note (NOTE_INSN_DELETED); + } else if (operands[5] == constm1_rtx) { if ( == IOR) -- 2.7.4