From c55899f763b6d4510fd77711af3b605fd444e7fa Mon Sep 17 00:00:00 2001
From: Filipp Zhinkin <filipp.zhinkin@gmail.com>
Date: Fri, 5 Aug 2022 16:56:45 -0400
Subject: [PATCH] [DAGCombiner] Hoist funnel shifts from logic operation

Hoist funnel shift from logic op:
logic_op (FSH x0, x1, s), (FSH y0, y1, s) --> FSH (logic_op x0, y0), (logic_op x1, y1), s

The transformation improves code generated for some cases related to
issue https://github.com/llvm/llvm-project/issues/49541.

Reducing the number of funnel shifts can also improve throughput on x86 CPUs by utilizing more
available ports: https://quick-bench.com/q/gC7AKkJJsDZzRrs_JWDzm9t_iDM

Transformation correctness checks:
https://alive2.llvm.org/ce/z/TKPULH
https://alive2.llvm.org/ce/z/UvTd_9
https://alive2.llvm.org/ce/z/j8qW3_
https://alive2.llvm.org/ce/z/7Wq7gE
https://alive2.llvm.org/ce/z/Xr5w8R
https://alive2.llvm.org/ce/z/D5xe_E
https://alive2.llvm.org/ce/z/2yBZiy

Differential Revision: https://reviews.llvm.org/D130994
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp    | 15 +++++
 llvm/test/CodeGen/X86/funnel-shift-logic-fold.ll | 67 +++++++++++-----------
 llvm/test/CodeGen/X86/icmp-shift-opt.ll          | 71 ++++++++----------------
 3 files changed, 71 insertions(+), 82 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 1a37c45..6b46c28 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -5326,6 +5326,21 @@ SDValue DAGCombiner::hoistLogicOpWithSameOpcodeHands(SDNode *N) {
     return DAG.getNode(HandOpcode, DL, VT, Logic);
   }
 
+  // For funnel shifts FSHL/FSHR:
+  // logic_op (OP x, x1, s), (OP y, y1, s) -->
+  //   OP (logic_op x, y), (logic_op x1, y1), s
+  if ((HandOpcode == ISD::FSHL || HandOpcode == ISD::FSHR) &&
+      N0.getOperand(2) == N1.getOperand(2)) {
+    if (!N0.hasOneUse() || !N1.hasOneUse())
+      return SDValue();
+    SDValue X1 = N0.getOperand(1);
+    SDValue Y1 = N1.getOperand(1);
+    SDValue S = N0.getOperand(2);
+    SDValue Logic0 = DAG.getNode(LogicOpcode, DL, VT, X, Y);
+    SDValue Logic1 = DAG.getNode(LogicOpcode, DL, VT, X1, Y1);
+    return DAG.getNode(HandOpcode, DL, VT, Logic0, Logic1, S);
+  }
+
   // Simplify xor/and/or (bitcast(A), bitcast(B)) -> bitcast(op (A,B))
   // Only perform this optimization up until type legalization, before
   // LegalizeVectorOprs. LegalizeVectorOprs promotes vector operations by
diff --git a/llvm/test/CodeGen/X86/funnel-shift-logic-fold.ll b/llvm/test/CodeGen/X86/funnel-shift-logic-fold.ll
index b20cca6..fb87583 100644
--- a/llvm/test/CodeGen/X86/funnel-shift-logic-fold.ll
+++ b/llvm/test/CodeGen/X86/funnel-shift-logic-fold.ll
@@ -7,12 +7,11 @@ declare i64 @llvm.fshr.i64(i64, i64, i64) nounwind readnone
 define i64 @hoist_fshl_from_or(i64 %a, i64 %b, i64 %c, i64 %d, i64 %s) nounwind {
 ; X64-LABEL: hoist_fshl_from_or:
 ; X64:       # %bb.0:
-; X64-NEXT:    movq %rcx, %rax
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    orq %rcx, %rsi
+; X64-NEXT:    orq %rdx, %rax
 ; X64-NEXT:    movl %r8d, %ecx
-; X64-NEXT:    shldq %cl, %rsi, %rdi
-; X64-NEXT:    shldq %cl, %rax, %rdx
-; X64-NEXT:    orq %rdi, %rdx
-; X64-NEXT:    movq %rdx, %rax
+; X64-NEXT:    shldq %cl, %rsi, %rax
 ; X64-NEXT:    retq
   %fshl.0 = call i64 @llvm.fshl.i64(i64 %a, i64 %b, i64 %s)
   %fshl.1 = call i64 @llvm.fshl.i64(i64 %c, i64 %d, i64 %s)
@@ -23,12 +22,11 @@ define i64 @hoist_fshl_from_or(i64 %a, i64 %b, i64 %c, i64 %d, i64 %s) nounwind
 define i64 @hoist_fshl_from_and(i64 %a, i64 %b, i64 %c, i64 %d, i64 %s) nounwind {
 ; X64-LABEL: hoist_fshl_from_and:
 ; X64:       # %bb.0:
-; X64-NEXT:    movq %rcx, %rax
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    andq %rcx, %rsi
+; X64-NEXT:    andq %rdx, %rax
 ; X64-NEXT:    movl %r8d, %ecx
-; X64-NEXT:    shldq %cl, %rsi, %rdi
-; X64-NEXT:    shldq %cl, %rax, %rdx
-; X64-NEXT:    andq %rdi, %rdx
-; X64-NEXT:    movq %rdx, %rax
+; X64-NEXT:    shldq %cl, %rsi, %rax
 ; X64-NEXT:    retq
   %fshl.0 = call i64 @llvm.fshl.i64(i64 %a, i64 %b, i64 %s)
   %fshl.1 = call i64 @llvm.fshl.i64(i64 %c, i64 %d, i64 %s)
@@ -39,12 +37,11 @@ define i64 @hoist_fshl_from_and(i64 %a, i64 %b, i64 %c, i64 %d, i64 %s) nounwind
 define i64 @hoist_fshl_from_xor(i64 %a, i64 %b, i64 %c, i64 %d, i64 %s) nounwind {
 ; X64-LABEL: hoist_fshl_from_xor:
 ; X64:       # %bb.0:
-; X64-NEXT:    movq %rcx, %rax
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    xorq %rcx, %rsi
+; X64-NEXT:    xorq %rdx, %rax
 ; X64-NEXT:    movl %r8d, %ecx
-; X64-NEXT:    shldq %cl, %rsi, %rdi
-; X64-NEXT:    shldq %cl, %rax, %rdx
-; X64-NEXT:    xorq %rdi, %rdx
-; X64-NEXT:    movq %rdx, %rax
+; X64-NEXT:    shldq %cl, %rsi, %rax
 ; X64-NEXT:    retq
   %fshl.0 = call i64 @llvm.fshl.i64(i64 %a, i64 %b, i64 %s)
   %fshl.1 = call i64 @llvm.fshl.i64(i64 %c, i64 %d, i64 %s)
@@ -69,10 +66,10 @@ define i64 @fshl_or_with_different_shift_value(i64 %a, i64 %b, i64 %c, i64 %d) n
 define i64 @hoist_fshl_from_or_const_shift(i64 %a, i64 %b, i64 %c, i64 %d) nounwind {
 ; X64-LABEL: hoist_fshl_from_or_const_shift:
 ; X64:       # %bb.0:
-; X64-NEXT:    movq %rdx, %rax
-; X64-NEXT:    shldq $15, %rsi, %rdi
-; X64-NEXT:    shldq $15, %rcx, %rax
-; X64-NEXT:    orq %rdi, %rax
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    orq %rcx, %rsi
+; X64-NEXT:    orq %rdx, %rax
+; X64-NEXT:    shldq $15, %rsi, %rax
 ; X64-NEXT:    retq
   %fshl.0 = call i64 @llvm.fshl.i64(i64 %a, i64 %b, i64 15)
   %fshl.1 = call i64 @llvm.fshl.i64(i64 %c, i64 %d, i64 15)
@@ -83,11 +80,11 @@ define i64 @hoist_fshl_from_or_const_shift(i64 %a, i64 %b, i64 %c, i64 %d) nounw
 define i64 @hoist_fshr_from_or(i64 %a, i64 %b, i64 %c, i64 %d, i64 %s) nounwind {
 ; X64-LABEL: hoist_fshr_from_or:
 ; X64:       # %bb.0:
-; X64-NEXT:    movq %rcx, %rax
+; X64-NEXT:    movq %rsi, %rax
+; X64-NEXT:    orq %rdx, %rdi
+; X64-NEXT:    orq %rcx, %rax
 ; X64-NEXT:    movl %r8d, %ecx
-; X64-NEXT:    shrdq %cl, %rdi, %rsi
-; X64-NEXT:    shrdq %cl, %rdx, %rax
-; X64-NEXT:    orq %rsi, %rax
+; X64-NEXT:    shrdq %cl, %rdi, %rax
 ; X64-NEXT:    retq
   %fshr.0 = call i64 @llvm.fshr.i64(i64 %a, i64 %b, i64 %s)
   %fshr.1 = call i64 @llvm.fshr.i64(i64 %c, i64 %d, i64 %s)
@@ -98,11 +95,11 @@ define i64 @hoist_fshr_from_or(i64 %a, i64 %b, i64 %c, i64 %d, i64 %s) nounwind
 define i64 @hoist_fshr_from_and(i64 %a, i64 %b, i64 %c, i64 %d, i64 %s) nounwind {
 ; X64-LABEL: hoist_fshr_from_and:
 ; X64:       # %bb.0:
-; X64-NEXT:    movq %rcx, %rax
+; X64-NEXT:    movq %rsi, %rax
+; X64-NEXT:    andq %rdx, %rdi
+; X64-NEXT:    andq %rcx, %rax
 ; X64-NEXT:    movl %r8d, %ecx
-; X64-NEXT:    shrdq %cl, %rdi, %rsi
-; X64-NEXT:    shrdq %cl, %rdx, %rax
-; X64-NEXT:    andq %rsi, %rax
+; X64-NEXT:    shrdq %cl, %rdi, %rax
 ; X64-NEXT:    retq
   %fshr.0 = call i64 @llvm.fshr.i64(i64 %a, i64 %b, i64 %s)
   %fshr.1 = call i64 @llvm.fshr.i64(i64 %c, i64 %d, i64 %s)
@@ -113,11 +110,11 @@ define i64 @hoist_fshr_from_and(i64 %a, i64 %b, i64 %c, i64 %d, i64 %s) nounwind
 define i64 @hoist_fshr_from_xor(i64 %a, i64 %b, i64 %c, i64 %d, i64 %s) nounwind {
 ; X64-LABEL: hoist_fshr_from_xor:
 ; X64:       # %bb.0:
-; X64-NEXT:    movq %rcx, %rax
+; X64-NEXT:    movq %rsi, %rax
+; X64-NEXT:    xorq %rdx, %rdi
+; X64-NEXT:    xorq %rcx, %rax
 ; X64-NEXT:    movl %r8d, %ecx
-; X64-NEXT:    shrdq %cl, %rdi, %rsi
-; X64-NEXT:    shrdq %cl, %rdx, %rax
-; X64-NEXT:    xorq %rsi, %rax
+; X64-NEXT:    shrdq %cl, %rdi, %rax
 ; X64-NEXT:    retq
   %fshr.0 = call i64 @llvm.fshr.i64(i64 %a, i64 %b, i64 %s)
   %fshr.1 = call i64 @llvm.fshr.i64(i64 %c, i64 %d, i64 %s)
@@ -142,10 +139,10 @@ define i64 @fshr_or_with_different_shift_value(i64 %a, i64 %b, i64 %c, i64 %d) n
 define i64 @hoist_fshr_from_or_const_shift(i64 %a, i64 %b, i64 %c, i64 %d) nounwind {
 ; X64-LABEL: hoist_fshr_from_or_const_shift:
 ; X64:       # %bb.0:
-; X64-NEXT:    movq %rdx, %rax
-; X64-NEXT:    shldq $49, %rsi, %rdi
-; X64-NEXT:    shldq $49, %rcx, %rax
-; X64-NEXT:    orq %rdi, %rax
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    orq %rcx, %rsi
+; X64-NEXT:    orl %edx, %eax
+; X64-NEXT:    shldq $49, %rsi, %rax
 ; X64-NEXT:    retq
   %fshr.0 = call i64 @llvm.fshr.i64(i64 %a, i64 %b, i64 15)
   %fshr.1 = call i64 @llvm.fshr.i64(i64 %c, i64 %d, i64 15)
diff --git a/llvm/test/CodeGen/X86/icmp-shift-opt.ll b/llvm/test/CodeGen/X86/icmp-shift-opt.ll
index 23524d2..582fae8 100644
--- a/llvm/test/CodeGen/X86/icmp-shift-opt.ll
+++ b/llvm/test/CodeGen/X86/icmp-shift-opt.ll
@@ -25,12 +25,11 @@ define i128 @opt_setcc_lt_power_of_2(i128 %a) nounwind {
 ; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ecx, %ebx
-; X86-NEXT:    shldl $4, %edx, %ebx
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    orl %ecx, %ebx
 ; X86-NEXT:    movl %esi, %ebp
-; X86-NEXT:    orl %ecx, %ebp
-; X86-NEXT:    shrdl $28, %edx, %ebp
 ; X86-NEXT:    orl %ebx, %ebp
+; X86-NEXT:    shrdl $28, %ebx, %ebp
 ; X86-NEXT:    jne .LBB0_1
 ; X86-NEXT:  # %bb.2: # %exit
 ; X86-NEXT:    movl %edi, (%eax)
@@ -73,19 +72,15 @@ exit:
 define i1 @opt_setcc_srl_eq_zero(i128 %a) nounwind {
 ; X86-LABEL: opt_setcc_srl_eq_zero:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    shrdl $17, %ecx, %eax
-; X86-NEXT:    orl %esi, %ecx
-; X86-NEXT:    shldl $15, %edx, %esi
-; X86-NEXT:    orl %esi, %eax
-; X86-NEXT:    shrdl $17, %edx, %ecx
-; X86-NEXT:    orl %eax, %ecx
+; X86-NEXT:    orl %ecx, %edx
+; X86-NEXT:    orl %eax, %edx
+; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    shldl $15, %edx, %eax
 ; X86-NEXT:    sete %al
-; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: opt_setcc_srl_eq_zero:
@@ -102,19 +97,15 @@ define i1 @opt_setcc_srl_eq_zero(i128 %a) nounwind {
 define i1 @opt_setcc_srl_ne_zero(i128 %a) nounwind {
 ; X86-LABEL: opt_setcc_srl_ne_zero:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    shrdl $17, %ecx, %eax
-; X86-NEXT:    orl %esi, %ecx
-; X86-NEXT:    shldl $15, %edx, %esi
-; X86-NEXT:    orl %esi, %eax
-; X86-NEXT:    shrdl $17, %edx, %ecx
-; X86-NEXT:    orl %eax, %ecx
+; X86-NEXT:    orl %ecx, %edx
+; X86-NEXT:    orl %eax, %edx
+; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    shldl $15, %edx, %eax
 ; X86-NEXT:    setne %al
-; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: opt_setcc_srl_ne_zero:
@@ -131,19 +122,13 @@ define i1 @opt_setcc_srl_ne_zero(i128 %a) nounwind {
 define i1 @opt_setcc_shl_eq_zero(i128 %a) nounwind {
 ; X86-LABEL: opt_setcc_shl_eq_zero:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    shldl $17, %edx, %esi
-; X86-NEXT:    orl %eax, %edx
-; X86-NEXT:    shldl $17, %ecx, %edx
-; X86-NEXT:    shldl $17, %eax, %ecx
-; X86-NEXT:    orl %esi, %ecx
-; X86-NEXT:    orl %ecx, %edx
+; X86-NEXT:    shll $17, %ecx
+; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    sete %al
-; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: opt_setcc_shl_eq_zero:
@@ -160,19 +145,13 @@ define i1 @opt_setcc_shl_eq_zero(i128 %a) nounwind {
 define i1 @opt_setcc_shl_ne_zero(i128 %a) nounwind {
 ; X86-LABEL: opt_setcc_shl_ne_zero:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    shldl $17, %edx, %esi
-; X86-NEXT:    orl %eax, %edx
-; X86-NEXT:    shldl $17, %ecx, %edx
-; X86-NEXT:    shldl $17, %eax, %ecx
-; X86-NEXT:    orl %esi, %ecx
-; X86-NEXT:    orl %ecx, %edx
+; X86-NEXT:    shll $17, %ecx
+; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    setne %al
-; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: opt_setcc_shl_ne_zero:
@@ -243,13 +222,11 @@ define i1 @opt_setcc_expanded_shl_correct_shifts(i64 %a, i64 %b) nounwind {
 ; X86-LABEL: opt_setcc_expanded_shl_correct_shifts:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    orl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    orl %eax, %edx
-; X86-NEXT:    shldl $17, %ecx, %edx
+; X86-NEXT:    orl %eax, %ecx
 ; X86-NEXT:    shldl $17, %eax, %ecx
-; X86-NEXT:    orl %edx, %ecx
 ; X86-NEXT:    sete %al
 ; X86-NEXT:    retl
 ;
-- 
2.7.4