From 5f78ba001ca23ab826b9be823fc8ac0a0e5d2237 Mon Sep 17 00:00:00 2001
From: Roman Lebedev <lebedev.ri@gmail.com>
Date: Tue, 11 May 2021 19:35:41 +0300
Subject: [PATCH] [X86][Codegen] Shift amount mod: sh? i64 x, (32-y) --> sh?
 i64 x, -(y+32)

I've seen this in the RawSpeed's BitPumpMSB*::push() hotpath,
after fixing the buffer abstraction to a more sane one,
when looking into a +5% runtime regression.
I was hoping that this would fix it, but it does not look it does.

This seems to be at least not worse than the original pattern.
But i'm actually mainly interested in the case where we already
compute `(y+32)` (see last test),

https://alive2.llvm.org/ce/z/ZCzJio

Reviewed By: spatel

Differential Revision: https://reviews.llvm.org/D101944
---
 llvm/lib/Target/X86/X86ISelDAGToDAG.cpp            | 23 ++++++++++---
 .../test/CodeGen/X86/64-bit-shift-by-32-minus-y.ll | 38 ++++++++++++----------
 2 files changed, 39 insertions(+), 22 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index 54884d8..f3b4b8e 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -3854,14 +3854,29 @@ bool X86DAGToDAGISel::tryShiftAmountMod(SDNode *N) {
       // If we are shifting by N-X where N == 0 mod Size, then just shift by -X
       // to generate a NEG instead of a SUB of a constant.
     } else if (ShiftAmt->getOpcode() == ISD::SUB && Add0C &&
-               Add0C->getAPIntValue() != 0 &&
-               Add0C->getAPIntValue().urem(Size) == 0) {
+               Add0C->getZExtValue() != 0) {
+      EVT SubVT = ShiftAmt.getValueType();
+      SDValue X;
+      if (Add0C->getZExtValue() % Size == 0)
+        X = Add1;
+      else if (ShiftAmt.hasOneUse() && Size == 64 &&
+               Add0C->getZExtValue() % 32 == 0) {
+        // We have a 64-bit shift by (n*32-x), turn it into -(x+n*32).
+        // This is mainly beneficial if we already compute (x+n*32).
+        if (Add1.getOpcode() == ISD::TRUNCATE) {
+          Add1 = Add1.getOperand(0);
+          SubVT = Add1.getValueType();
+        }
+        X = CurDAG->getNode(ISD::ADD, DL, SubVT, Add1,
+                            CurDAG->getZExtOrTrunc(Add0, DL, SubVT));
+        insertDAGNode(*CurDAG, OrigShiftAmt, X);
+      } else
+        return false;
       // Insert a negate op.
       // TODO: This isn't guaranteed to replace the sub if there is a logic cone
       // that uses it that's not a shift.
-      EVT SubVT = ShiftAmt.getValueType();
       SDValue Zero = CurDAG->getConstant(0, DL, SubVT);
-      SDValue Neg = CurDAG->getNode(ISD::SUB, DL, SubVT, Zero, Add1);
+      SDValue Neg = CurDAG->getNode(ISD::SUB, DL, SubVT, Zero, X);
       NewShiftAmt = Neg;
 
       // Insert these operands into a valid topological order so they can
diff --git a/llvm/test/CodeGen/X86/64-bit-shift-by-32-minus-y.ll b/llvm/test/CodeGen/X86/64-bit-shift-by-32-minus-y.ll
index a69efc6..a8e433e 100644
--- a/llvm/test/CodeGen/X86/64-bit-shift-by-32-minus-y.ll
+++ b/llvm/test/CodeGen/X86/64-bit-shift-by-32-minus-y.ll
@@ -8,16 +8,17 @@ define i64 @t0(i64 %val, i64 %shamt) nounwind {
 ; X64-NOBMI2-LABEL: t0:
 ; X64-NOBMI2:       # %bb.0:
 ; X64-NOBMI2-NEXT:    movq %rdi, %rax
-; X64-NOBMI2-NEXT:    movb $32, %cl
-; X64-NOBMI2-NEXT:    subb %sil, %cl
+; X64-NOBMI2-NEXT:    leaq 32(%rsi), %rcx
+; X64-NOBMI2-NEXT:    negq %rcx
+; X64-NOBMI2-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; X64-NOBMI2-NEXT:    shlq %cl, %rax
 ; X64-NOBMI2-NEXT:    retq
 ;
 ; X64-BMI2-LABEL: t0:
 ; X64-BMI2:       # %bb.0:
-; X64-BMI2-NEXT:    movb $32, %al
-; X64-BMI2-NEXT:    subb %sil, %al
-; X64-BMI2-NEXT:    shlxq %rax, %rdi, %rax
+; X64-BMI2-NEXT:    addq $32, %rsi
+; X64-BMI2-NEXT:    negq %rsi
+; X64-BMI2-NEXT:    shlxq %rsi, %rdi, %rax
 ; X64-BMI2-NEXT:    retq
 ;
 ; X32-NOBMI2-LABEL: t0:
@@ -228,16 +229,17 @@ define i64 @t4(i64 %val, i64 %shamt) nounwind {
 ; X64-NOBMI2-LABEL: t4:
 ; X64-NOBMI2:       # %bb.0:
 ; X64-NOBMI2-NEXT:    movq %rdi, %rax
-; X64-NOBMI2-NEXT:    movb $96, %cl
-; X64-NOBMI2-NEXT:    subb %sil, %cl
+; X64-NOBMI2-NEXT:    leaq 96(%rsi), %rcx
+; X64-NOBMI2-NEXT:    negq %rcx
+; X64-NOBMI2-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; X64-NOBMI2-NEXT:    shlq %cl, %rax
 ; X64-NOBMI2-NEXT:    retq
 ;
 ; X64-BMI2-LABEL: t4:
 ; X64-BMI2:       # %bb.0:
-; X64-BMI2-NEXT:    movb $96, %al
-; X64-BMI2-NEXT:    subb %sil, %al
-; X64-BMI2-NEXT:    shlxq %rax, %rdi, %rax
+; X64-BMI2-NEXT:    addq $96, %rsi
+; X64-BMI2-NEXT:    negq %rsi
+; X64-BMI2-NEXT:    shlxq %rsi, %rdi, %rax
 ; X64-BMI2-NEXT:    retq
 ;
 ; X32-NOBMI2-LABEL: t4:
@@ -282,21 +284,21 @@ define i64 @t4(i64 %val, i64 %shamt) nounwind {
 define i64 @t5_cse(i64 %val, i64 %shamt, i64*%dst) nounwind {
 ; X64-NOBMI2-LABEL: t5_cse:
 ; X64-NOBMI2:       # %bb.0:
+; X64-NOBMI2-NEXT:    movq %rsi, %rcx
 ; X64-NOBMI2-NEXT:    movq %rdi, %rax
-; X64-NOBMI2-NEXT:    leaq 32(%rsi), %rcx
+; X64-NOBMI2-NEXT:    addq $32, %rcx
 ; X64-NOBMI2-NEXT:    movq %rcx, (%rdx)
-; X64-NOBMI2-NEXT:    movb $32, %cl
-; X64-NOBMI2-NEXT:    subb %sil, %cl
+; X64-NOBMI2-NEXT:    negq %rcx
+; X64-NOBMI2-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; X64-NOBMI2-NEXT:    shlq %cl, %rax
 ; X64-NOBMI2-NEXT:    retq
 ;
 ; X64-BMI2-LABEL: t5_cse:
 ; X64-BMI2:       # %bb.0:
-; X64-BMI2-NEXT:    leaq 32(%rsi), %rax
-; X64-BMI2-NEXT:    movq %rax, (%rdx)
-; X64-BMI2-NEXT:    movb $32, %al
-; X64-BMI2-NEXT:    subb %sil, %al
-; X64-BMI2-NEXT:    shlxq %rax, %rdi, %rax
+; X64-BMI2-NEXT:    addq $32, %rsi
+; X64-BMI2-NEXT:    movq %rsi, (%rdx)
+; X64-BMI2-NEXT:    negq %rsi
+; X64-BMI2-NEXT:    shlxq %rsi, %rdi, %rax
 ; X64-BMI2-NEXT:    retq
 ;
 ; X32-NOBMI2-LABEL: t5_cse:
-- 
2.7.4