From c55899f763b6d4510fd77711af3b605fd444e7fa Mon Sep 17 00:00:00 2001 From: Filipp Zhinkin Date: Fri, 5 Aug 2022 16:56:45 -0400 Subject: [PATCH] [DAGCombiner] Hoist funnel shifts from logic operation Hoist funnel shift from logic op: logic_op (FSH x0, x1, s), (FSH y0, y1, s) --> FSH (logic_op x0, y0), (logic_op x1, y1), s The transformation improves code generated for some cases related to issue https://github.com/llvm/llvm-project/issues/49541. Reducing the number of funnel shifts can also improve throughput on x86 CPUs by utilizing more available ports: https://quick-bench.com/q/gC7AKkJJsDZzRrs_JWDzm9t_iDM Transformation correctness checks: https://alive2.llvm.org/ce/z/TKPULH https://alive2.llvm.org/ce/z/UvTd_9 https://alive2.llvm.org/ce/z/j8qW3_ https://alive2.llvm.org/ce/z/7Wq7gE https://alive2.llvm.org/ce/z/Xr5w8R https://alive2.llvm.org/ce/z/D5xe_E https://alive2.llvm.org/ce/z/2yBZiy Differential Revision: https://reviews.llvm.org/D130994 --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 15 +++++ llvm/test/CodeGen/X86/funnel-shift-logic-fold.ll | 67 +++++++++++----------- llvm/test/CodeGen/X86/icmp-shift-opt.ll | 71 ++++++++---------------- 3 files changed, 71 insertions(+), 82 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 1a37c45..6b46c28 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -5326,6 +5326,21 @@ SDValue DAGCombiner::hoistLogicOpWithSameOpcodeHands(SDNode *N) { return DAG.getNode(HandOpcode, DL, VT, Logic); } + // For funnel shifts FSHL/FSHR: + // logic_op (OP x, x1, s), (OP y, y1, s) --> + // OP (logic_op x, y), (logic_op x1, y1), s + if ((HandOpcode == ISD::FSHL || HandOpcode == ISD::FSHR) && + N0.getOperand(2) == N1.getOperand(2)) { + if (!N0.hasOneUse() || !N1.hasOneUse()) + return SDValue(); + SDValue X1 = N0.getOperand(1); + SDValue Y1 = N1.getOperand(1); + SDValue S = N0.getOperand(2); + SDValue 
Logic0 = DAG.getNode(LogicOpcode, DL, VT, X, Y); + SDValue Logic1 = DAG.getNode(LogicOpcode, DL, VT, X1, Y1); + return DAG.getNode(HandOpcode, DL, VT, Logic0, Logic1, S); + } + // Simplify xor/and/or (bitcast(A), bitcast(B)) -> bitcast(op (A,B)) // Only perform this optimization up until type legalization, before // LegalizeVectorOprs. LegalizeVectorOprs promotes vector operations by diff --git a/llvm/test/CodeGen/X86/funnel-shift-logic-fold.ll b/llvm/test/CodeGen/X86/funnel-shift-logic-fold.ll index b20cca6..fb87583 100644 --- a/llvm/test/CodeGen/X86/funnel-shift-logic-fold.ll +++ b/llvm/test/CodeGen/X86/funnel-shift-logic-fold.ll @@ -7,12 +7,11 @@ declare i64 @llvm.fshr.i64(i64, i64, i64) nounwind readnone define i64 @hoist_fshl_from_or(i64 %a, i64 %b, i64 %c, i64 %d, i64 %s) nounwind { ; X64-LABEL: hoist_fshl_from_or: ; X64: # %bb.0: -; X64-NEXT: movq %rcx, %rax +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: orq %rcx, %rsi +; X64-NEXT: orq %rdx, %rax ; X64-NEXT: movl %r8d, %ecx -; X64-NEXT: shldq %cl, %rsi, %rdi -; X64-NEXT: shldq %cl, %rax, %rdx -; X64-NEXT: orq %rdi, %rdx -; X64-NEXT: movq %rdx, %rax +; X64-NEXT: shldq %cl, %rsi, %rax ; X64-NEXT: retq %fshl.0 = call i64 @llvm.fshl.i64(i64 %a, i64 %b, i64 %s) %fshl.1 = call i64 @llvm.fshl.i64(i64 %c, i64 %d, i64 %s) @@ -23,12 +22,11 @@ define i64 @hoist_fshl_from_or(i64 %a, i64 %b, i64 %c, i64 %d, i64 %s) nounwind define i64 @hoist_fshl_from_and(i64 %a, i64 %b, i64 %c, i64 %d, i64 %s) nounwind { ; X64-LABEL: hoist_fshl_from_and: ; X64: # %bb.0: -; X64-NEXT: movq %rcx, %rax +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: andq %rcx, %rsi +; X64-NEXT: andq %rdx, %rax ; X64-NEXT: movl %r8d, %ecx -; X64-NEXT: shldq %cl, %rsi, %rdi -; X64-NEXT: shldq %cl, %rax, %rdx -; X64-NEXT: andq %rdi, %rdx -; X64-NEXT: movq %rdx, %rax +; X64-NEXT: shldq %cl, %rsi, %rax ; X64-NEXT: retq %fshl.0 = call i64 @llvm.fshl.i64(i64 %a, i64 %b, i64 %s) %fshl.1 = call i64 @llvm.fshl.i64(i64 %c, i64 %d, i64 %s) @@ -39,12 +37,11 @@ define i64 
@hoist_fshl_from_and(i64 %a, i64 %b, i64 %c, i64 %d, i64 %s) nounwind define i64 @hoist_fshl_from_xor(i64 %a, i64 %b, i64 %c, i64 %d, i64 %s) nounwind { ; X64-LABEL: hoist_fshl_from_xor: ; X64: # %bb.0: -; X64-NEXT: movq %rcx, %rax +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: xorq %rcx, %rsi +; X64-NEXT: xorq %rdx, %rax ; X64-NEXT: movl %r8d, %ecx -; X64-NEXT: shldq %cl, %rsi, %rdi -; X64-NEXT: shldq %cl, %rax, %rdx -; X64-NEXT: xorq %rdi, %rdx -; X64-NEXT: movq %rdx, %rax +; X64-NEXT: shldq %cl, %rsi, %rax ; X64-NEXT: retq %fshl.0 = call i64 @llvm.fshl.i64(i64 %a, i64 %b, i64 %s) %fshl.1 = call i64 @llvm.fshl.i64(i64 %c, i64 %d, i64 %s) @@ -69,10 +66,10 @@ define i64 @fshl_or_with_different_shift_value(i64 %a, i64 %b, i64 %c, i64 %d) n define i64 @hoist_fshl_from_or_const_shift(i64 %a, i64 %b, i64 %c, i64 %d) nounwind { ; X64-LABEL: hoist_fshl_from_or_const_shift: ; X64: # %bb.0: -; X64-NEXT: movq %rdx, %rax -; X64-NEXT: shldq $15, %rsi, %rdi -; X64-NEXT: shldq $15, %rcx, %rax -; X64-NEXT: orq %rdi, %rax +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: orq %rcx, %rsi +; X64-NEXT: orq %rdx, %rax +; X64-NEXT: shldq $15, %rsi, %rax ; X64-NEXT: retq %fshl.0 = call i64 @llvm.fshl.i64(i64 %a, i64 %b, i64 15) %fshl.1 = call i64 @llvm.fshl.i64(i64 %c, i64 %d, i64 15) @@ -83,11 +80,11 @@ define i64 @hoist_fshl_from_or_const_shift(i64 %a, i64 %b, i64 %c, i64 %d) nounw define i64 @hoist_fshr_from_or(i64 %a, i64 %b, i64 %c, i64 %d, i64 %s) nounwind { ; X64-LABEL: hoist_fshr_from_or: ; X64: # %bb.0: -; X64-NEXT: movq %rcx, %rax +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: orq %rdx, %rdi +; X64-NEXT: orq %rcx, %rax ; X64-NEXT: movl %r8d, %ecx -; X64-NEXT: shrdq %cl, %rdi, %rsi -; X64-NEXT: shrdq %cl, %rdx, %rax -; X64-NEXT: orq %rsi, %rax +; X64-NEXT: shrdq %cl, %rdi, %rax ; X64-NEXT: retq %fshr.0 = call i64 @llvm.fshr.i64(i64 %a, i64 %b, i64 %s) %fshr.1 = call i64 @llvm.fshr.i64(i64 %c, i64 %d, i64 %s) @@ -98,11 +95,11 @@ define i64 @hoist_fshr_from_or(i64 %a, i64 %b, i64 %c, i64 %d, i64 
%s) nounwind define i64 @hoist_fshr_from_and(i64 %a, i64 %b, i64 %c, i64 %d, i64 %s) nounwind { ; X64-LABEL: hoist_fshr_from_and: ; X64: # %bb.0: -; X64-NEXT: movq %rcx, %rax +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: andq %rdx, %rdi +; X64-NEXT: andq %rcx, %rax ; X64-NEXT: movl %r8d, %ecx -; X64-NEXT: shrdq %cl, %rdi, %rsi -; X64-NEXT: shrdq %cl, %rdx, %rax -; X64-NEXT: andq %rsi, %rax +; X64-NEXT: shrdq %cl, %rdi, %rax ; X64-NEXT: retq %fshr.0 = call i64 @llvm.fshr.i64(i64 %a, i64 %b, i64 %s) %fshr.1 = call i64 @llvm.fshr.i64(i64 %c, i64 %d, i64 %s) @@ -113,11 +110,11 @@ define i64 @hoist_fshr_from_and(i64 %a, i64 %b, i64 %c, i64 %d, i64 %s) nounwind define i64 @hoist_fshr_from_xor(i64 %a, i64 %b, i64 %c, i64 %d, i64 %s) nounwind { ; X64-LABEL: hoist_fshr_from_xor: ; X64: # %bb.0: -; X64-NEXT: movq %rcx, %rax +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: xorq %rdx, %rdi +; X64-NEXT: xorq %rcx, %rax ; X64-NEXT: movl %r8d, %ecx -; X64-NEXT: shrdq %cl, %rdi, %rsi -; X64-NEXT: shrdq %cl, %rdx, %rax -; X64-NEXT: xorq %rsi, %rax +; X64-NEXT: shrdq %cl, %rdi, %rax ; X64-NEXT: retq %fshr.0 = call i64 @llvm.fshr.i64(i64 %a, i64 %b, i64 %s) %fshr.1 = call i64 @llvm.fshr.i64(i64 %c, i64 %d, i64 %s) @@ -142,10 +139,10 @@ define i64 @fshr_or_with_different_shift_value(i64 %a, i64 %b, i64 %c, i64 %d) n define i64 @hoist_fshr_from_or_const_shift(i64 %a, i64 %b, i64 %c, i64 %d) nounwind { ; X64-LABEL: hoist_fshr_from_or_const_shift: ; X64: # %bb.0: -; X64-NEXT: movq %rdx, %rax -; X64-NEXT: shldq $49, %rsi, %rdi -; X64-NEXT: shldq $49, %rcx, %rax -; X64-NEXT: orq %rdi, %rax +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: orq %rcx, %rsi +; X64-NEXT: orl %edx, %eax +; X64-NEXT: shldq $49, %rsi, %rax ; X64-NEXT: retq %fshr.0 = call i64 @llvm.fshr.i64(i64 %a, i64 %b, i64 15) %fshr.1 = call i64 @llvm.fshr.i64(i64 %c, i64 %d, i64 15) diff --git a/llvm/test/CodeGen/X86/icmp-shift-opt.ll b/llvm/test/CodeGen/X86/icmp-shift-opt.ll index 23524d2..582fae8 100644 --- 
a/llvm/test/CodeGen/X86/icmp-shift-opt.ll +++ b/llvm/test/CodeGen/X86/icmp-shift-opt.ll @@ -25,12 +25,11 @@ define i128 @opt_setcc_lt_power_of_2(i128 %a) nounwind { ; X86-NEXT: adcl $0, %esi ; X86-NEXT: adcl $0, %edx ; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl %ecx, %ebx -; X86-NEXT: shldl $4, %edx, %ebx +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: orl %ecx, %ebx ; X86-NEXT: movl %esi, %ebp -; X86-NEXT: orl %ecx, %ebp -; X86-NEXT: shrdl $28, %edx, %ebp ; X86-NEXT: orl %ebx, %ebp +; X86-NEXT: shrdl $28, %ebx, %ebp ; X86-NEXT: jne .LBB0_1 ; X86-NEXT: # %bb.2: # %exit ; X86-NEXT: movl %edi, (%eax) @@ -73,19 +72,15 @@ exit: define i1 @opt_setcc_srl_eq_zero(i128 %a) nounwind { ; X86-LABEL: opt_setcc_srl_eq_zero: ; X86: # %bb.0: -; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: orl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: shrdl $17, %ecx, %eax -; X86-NEXT: orl %esi, %ecx -; X86-NEXT: shldl $15, %edx, %esi -; X86-NEXT: orl %esi, %eax -; X86-NEXT: shrdl $17, %edx, %ecx -; X86-NEXT: orl %eax, %ecx +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: orl %eax, %edx +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: shldl $15, %edx, %eax ; X86-NEXT: sete %al -; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; X64-LABEL: opt_setcc_srl_eq_zero: @@ -102,19 +97,15 @@ define i1 @opt_setcc_srl_eq_zero(i128 %a) nounwind { define i1 @opt_setcc_srl_ne_zero(i128 %a) nounwind { ; X86-LABEL: opt_setcc_srl_ne_zero: ; X86: # %bb.0: -; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: orl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: shrdl $17, %ecx, %eax -; X86-NEXT: orl %esi, %ecx -; X86-NEXT: shldl $15, %edx, %esi -; X86-NEXT: orl %esi, %eax -; X86-NEXT: shrdl $17, %edx, %ecx -; X86-NEXT: orl %eax, %ecx +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: orl 
%eax, %edx +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: shldl $15, %edx, %eax ; X86-NEXT: setne %al -; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; X64-LABEL: opt_setcc_srl_ne_zero: @@ -131,19 +122,13 @@ define i1 @opt_setcc_srl_ne_zero(i128 %a) nounwind { define i1 @opt_setcc_shl_eq_zero(i128 %a) nounwind { ; X86-LABEL: opt_setcc_shl_eq_zero: ; X86: # %bb.0: -; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: shldl $17, %edx, %esi -; X86-NEXT: orl %eax, %edx -; X86-NEXT: shldl $17, %ecx, %edx -; X86-NEXT: shldl $17, %eax, %ecx -; X86-NEXT: orl %esi, %ecx -; X86-NEXT: orl %ecx, %edx +; X86-NEXT: shll $17, %ecx +; X86-NEXT: orl {{[0-9]+}}(%esp), %eax +; X86-NEXT: orl {{[0-9]+}}(%esp), %eax +; X86-NEXT: orl %ecx, %eax ; X86-NEXT: sete %al -; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; X64-LABEL: opt_setcc_shl_eq_zero: @@ -160,19 +145,13 @@ define i1 @opt_setcc_shl_eq_zero(i128 %a) nounwind { define i1 @opt_setcc_shl_ne_zero(i128 %a) nounwind { ; X86-LABEL: opt_setcc_shl_ne_zero: ; X86: # %bb.0: -; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: shldl $17, %edx, %esi -; X86-NEXT: orl %eax, %edx -; X86-NEXT: shldl $17, %ecx, %edx -; X86-NEXT: shldl $17, %eax, %ecx -; X86-NEXT: orl %esi, %ecx -; X86-NEXT: orl %ecx, %edx +; X86-NEXT: shll $17, %ecx +; X86-NEXT: orl {{[0-9]+}}(%esp), %eax +; X86-NEXT: orl {{[0-9]+}}(%esp), %eax +; X86-NEXT: orl %ecx, %eax ; X86-NEXT: setne %al -; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; X64-LABEL: opt_setcc_shl_ne_zero: @@ -243,13 +222,11 @@ define i1 @opt_setcc_expanded_shl_correct_shifts(i64 %a, i64 %b) nounwind { ; X86-LABEL: opt_setcc_expanded_shl_correct_shifts: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: orl {{[0-9]+}}(%esp), 
%eax +; X86-NEXT: orl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: orl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: shldl $17, %ecx, %edx +; X86-NEXT: orl %eax, %ecx ; X86-NEXT: shldl $17, %eax, %ecx -; X86-NEXT: orl %edx, %ecx ; X86-NEXT: sete %al ; X86-NEXT: retl ; -- 2.7.4