From 2f0c7fd2dbd06ae5f25b0c72b2b8f2a1c5baeb72 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Fri, 13 Dec 2019 09:40:33 -0500
Subject: [PATCH] [DAGCombiner] fold shift-trunc-shift to shift-mask-trunc (2nd
 try)

The initial attempt (rG89633320) botched the logic by reversing
the source/dest types. This version adds x86 tests for additional
coverage. The vector tests show a remaining missed optimization
(the mask could be folded as a vector load instead of being
broadcast), but that's a known/existing problem.

This fold is done in IR by instcombine, and we have a special
form of it already here in DAGCombiner, but we want the more
general transform too:
https://rise4fun.com/Alive/3jZm

Name: general
Pre: (C1 + zext(C2) < 64)
%s = lshr i64 %x, C1
%t = trunc i64 %s to i16
%r = lshr i16 %t, C2
=>
%s2 = lshr i64 %x, C1 + zext(C2)
%a = and i64 %s2, zext((1 << (16 - C2)) - 1)
%r = trunc %a to i16

Name: special
Pre: C1 == 48
%s = lshr i64 %x, C1
%t = trunc i64 %s to i16
%r = lshr i16 %t, C2
=>
%s2 = lshr i64 %x, C1 + zext(C2)
%r = trunc %s2 to i16

We want the general transform in DAGCombiner because D58017
exposes a regression without this fold.
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp  | 14 +++++++++
 llvm/test/CodeGen/AArch64/shift-amount-mod.ll  |  3 +-
 llvm/test/CodeGen/PowerPC/trunc-srl-load.ll    |  3 +-
 llvm/test/CodeGen/X86/shift-amount-mod.ll      |  8 ++---
 llvm/test/CodeGen/X86/vector-shift-lshr-256.ll | 42 +++++++++++++++-----------
 5 files changed, 44 insertions(+), 26 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 15c2be1..95127d7 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -7943,6 +7943,20 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
                                        InnerShift.getOperand(0), NewShiftAmt);
         return DAG.getNode(ISD::TRUNCATE, DL, VT, NewShift);
       }
+      // In the more general case, we can clear the high bits after the shift:
+      // srl (trunc (srl x, c1)), c2 --> trunc (and (srl x, (c1+c2)), Mask)
+      if (N0.hasOneUse() && InnerShift.hasOneUse() &&
+          c1 + c2 < InnerShiftSize) {
+        SDLoc DL(N);
+        SDValue NewShiftAmt = DAG.getConstant(c1 + c2, DL, ShiftAmtVT);
+        SDValue NewShift = DAG.getNode(ISD::SRL, DL, InnerShiftVT,
+                                       InnerShift.getOperand(0), NewShiftAmt);
+        SDValue Mask = DAG.getConstant(APInt::getLowBitsSet(InnerShiftSize,
+                                                            OpSizeInBits - c2),
+                                       DL, InnerShiftVT);
+        SDValue And = DAG.getNode(ISD::AND, DL, InnerShiftVT, NewShift, Mask);
+        return DAG.getNode(ISD::TRUNCATE, DL, VT, And);
+      }
     }
   }
 
diff --git a/llvm/test/CodeGen/AArch64/shift-amount-mod.ll b/llvm/test/CodeGen/AArch64/shift-amount-mod.ll
index 4f6051e..4038390 100644
--- a/llvm/test/CodeGen/AArch64/shift-amount-mod.ll
+++ b/llvm/test/CodeGen/AArch64/shift-amount-mod.ll
@@ -670,8 +670,7 @@ define i64 @reg64_lshr_by_masked_negated_unfolded_add_b(i64 %val, i64 %a, i64 %b
 define i32 @t(i64 %x) {
 ; CHECK-LABEL: t:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    lsr x8, x0, #13
-; CHECK-NEXT:    ubfx x0, x8, #4, #28
+; CHECK-NEXT:    ubfx x0, x0, #17, #28
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; CHECK-NEXT:    ret
   %s = lshr i64 %x, 13
diff --git a/llvm/test/CodeGen/PowerPC/trunc-srl-load.ll b/llvm/test/CodeGen/PowerPC/trunc-srl-load.ll
index a1af256..5dc0534 100644
--- a/llvm/test/CodeGen/PowerPC/trunc-srl-load.ll
+++ b/llvm/test/CodeGen/PowerPC/trunc-srl-load.ll
@@ -25,8 +25,7 @@ cond.false:                                       ; preds = %entry
 define i32 @sh_trunc_sh(i64 %x) {
 ; CHECK-LABEL: sh_trunc_sh:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    rldicl 3, 3, 51, 13
-; CHECK-NEXT:    srwi 3, 3, 4
+; CHECK-NEXT:    rldicl 3, 3, 47, 36
 ; CHECK-NEXT:    blr
   %s = lshr i64 %x, 13
   %t = trunc i64 %s to i32
diff --git a/llvm/test/CodeGen/X86/shift-amount-mod.ll b/llvm/test/CodeGen/X86/shift-amount-mod.ll
index bccb360..6c0527c 100644
--- a/llvm/test/CodeGen/X86/shift-amount-mod.ll
+++ b/llvm/test/CodeGen/X86/shift-amount-mod.ll
@@ -1564,10 +1564,10 @@ define i16 @sh_trunc_sh(i64 %x) {
 ;
 ; X64-LABEL: sh_trunc_sh:
 ; X64:       # %bb.0:
-; X64-NEXT:    shrq $24, %rdi
-; X64-NEXT:    movzwl %di, %eax
-; X64-NEXT:    shrl $12, %eax
-; X64-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    shrq $36, %rax
+; X64-NEXT:    andl $15, %eax
+; X64-NEXT:    # kill: def $ax killed $ax killed $rax
 ; X64-NEXT:    retq
   %s = lshr i64 %x, 24
   %t = trunc i64 %s to i16
diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll
index c448921..24395c9 100644
--- a/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll
@@ -1399,71 +1399,77 @@ define <4 x i32> @sh_trunc_sh_vec(<4 x i64> %x) {
 ; AVX1-LABEL: sh_trunc_sh_vec:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vpsrlq $24, %xmm1, %xmm1
-; AVX1-NEXT:    vpsrlq $24, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlq $36, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlq $36, %xmm0, %xmm0
 ; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX1-NEXT:    vpsrld $12, %xmm0, %xmm0
+; AVX1-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm0
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: sh_trunc_sh_vec:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpsrlq $24, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlq $36, %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX2-NEXT:    vpsrld $12, %xmm0, %xmm0
+; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1048575,1048575,1048575,1048575]
+; AVX2-NEXT:    vandps %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
 ; XOPAVX1-LABEL: sh_trunc_sh_vec:
 ; XOPAVX1:       # %bb.0:
 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; XOPAVX1-NEXT:    vpperm {{.*#+}} xmm0 = xmm0[3,4,5,6,11,12,13,14],xmm1[3,4,5,6,11,12,13,14]
-; XOPAVX1-NEXT:    vpsrld $12, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpsrlq $36, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vpsrlq $36, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; XOPAVX1-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm0
 ; XOPAVX1-NEXT:    vzeroupper
 ; XOPAVX1-NEXT:    retq
 ;
 ; XOPAVX2-LABEL: sh_trunc_sh_vec:
 ; XOPAVX2:       # %bb.0:
-; XOPAVX2-NEXT:    vpsrlq $24, %ymm0, %ymm0
+; XOPAVX2-NEXT:    vpsrlq $36, %ymm0, %ymm0
 ; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; XOPAVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; XOPAVX2-NEXT:    vpsrld $12, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1048575,1048575,1048575,1048575]
+; XOPAVX2-NEXT:    vandps %xmm1, %xmm0, %xmm0
 ; XOPAVX2-NEXT:    vzeroupper
 ; XOPAVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: sh_trunc_sh_vec:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpsrlq $24, %ymm0, %ymm0
+; AVX512-NEXT:    vpsrlq $36, %ymm0, %ymm0
 ; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
-; AVX512-NEXT:    vpsrld $12, %xmm0, %xmm0
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [1048575,1048575,1048575,1048575]
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
 ;
 ; AVX512VL-LABEL: sh_trunc_sh_vec:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpsrlq $24, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpsrlq $36, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vpmovqd %ymm0, %xmm0
-; AVX512VL-NEXT:    vpsrld $12, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpandd {{.*}}(%rip){1to4}, %xmm0, %xmm0
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; X32-AVX1-LABEL: sh_trunc_sh_vec:
 ; X32-AVX1:       # %bb.0:
 ; X32-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; X32-AVX1-NEXT:    vpsrlq $24, %xmm1, %xmm1
-; X32-AVX1-NEXT:    vpsrlq $24, %xmm0, %xmm0
+; X32-AVX1-NEXT:    vpsrlq $36, %xmm1, %xmm1
+; X32-AVX1-NEXT:    vpsrlq $36, %xmm0, %xmm0
 ; X32-AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; X32-AVX1-NEXT:    vpsrld $12, %xmm0, %xmm0
+; X32-AVX1-NEXT:    vandps {{\.LCPI.*}}, %xmm0, %xmm0
 ; X32-AVX1-NEXT:    vzeroupper
 ; X32-AVX1-NEXT:    retl
 ;
 ; X32-AVX2-LABEL: sh_trunc_sh_vec:
 ; X32-AVX2:       # %bb.0:
-; X32-AVX2-NEXT:    vpsrlq $24, %ymm0, %ymm0
+; X32-AVX2-NEXT:    vpsrlq $36, %ymm0, %ymm0
 ; X32-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; X32-AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; X32-AVX2-NEXT:    vpsrld $12, %xmm0, %xmm0
+; X32-AVX2-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1048575,1048575,1048575,1048575]
+; X32-AVX2-NEXT:    vandps %xmm1, %xmm0, %xmm0
 ; X32-AVX2-NEXT:    vzeroupper
 ; X32-AVX2-NEXT:    retl
   %s = lshr <4 x i64> %x, <i64 24, i64 24, i64 24, i64 24>
-- 
2.7.4