From e59d06fe78fd3ec58a247b7af42b59013a61ac5e Mon Sep 17 00:00:00 2001
From: Tim Shen
Date: Wed, 3 May 2017 00:07:02 +0000
Subject: [PATCH] [PowerPC, DAGCombiner] Fold a << (b % (sizeof(a) * 8)) back
 to a single instruction

Summary: This is the corresponding llvm change to D28037 to ensure no performance regression.

Reviewers: bogner, kbarton, hfinkel, iteratee, echristo

Subscribers: nemanjai, llvm-commits

Differential Revision: https://reviews.llvm.org/D28329

llvm-svn: 301990
---
 llvm/include/llvm/Target/TargetLowering.h     |  8 ++++++
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 33 +++++++++++++++++++++++
 llvm/lib/Target/PowerPC/PPCISelLowering.h     |  8 ++++++
 llvm/test/CodeGen/PowerPC/shift_mask.ll       | 39 ---------------------------
 4 files changed, 49 insertions(+), 39 deletions(-)

diff --git a/llvm/include/llvm/Target/TargetLowering.h b/llvm/include/llvm/Target/TargetLowering.h
index aa92300..ced1838 100644
--- a/llvm/include/llvm/Target/TargetLowering.h
+++ b/llvm/include/llvm/Target/TargetLowering.h
@@ -2061,6 +2061,14 @@ public:
     return false;
   }
 
+  // Return true if the instruction that performs a << b actually performs
+  // a << (b % (sizeof(a) * 8)).
+  virtual bool supportsModuloShift(ISD::NodeType Inst, EVT ReturnType) const {
+    assert((Inst == ISD::SHL || Inst == ISD::SRA || Inst == ISD::SRL) &&
+           "Expect a shift instruction");
+    return false;
+  }
+
   //===--------------------------------------------------------------------===//
   // Runtime Library hooks
   //
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index dc0276d..3cd5573 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -5294,6 +5294,17 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
     }
   }
 
+  // If the target supports masking y in (shl, y),
+  // fold (shl x, (and y, ((1 << numbits(x)) - 1))) -> (shl x, y)
+  if (TLI.isOperationLegal(ISD::SHL, VT) &&
+      TLI.supportsModuloShift(ISD::SHL, VT) && N1->getOpcode() == ISD::AND) {
+    if (ConstantSDNode *Mask = isConstOrConstSplat(N1->getOperand(1))) {
+      if (Mask->getZExtValue() == OpSizeInBits - 1) {
+        return DAG.getNode(ISD::SHL, SDLoc(N), VT, N0, N1->getOperand(0));
+      }
+    }
+  }
+
   ConstantSDNode *N1C = isConstOrConstSplat(N1);
 
   // fold (shl c1, c2) -> c1<<c2
@@ -????,6 +????,17 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {
   EVT VT = N0.getValueType();
   unsigned OpSizeInBits = VT.getScalarSizeInBits();
 
+  // If the target supports masking y in (sra, y),
+  // fold (sra x, (and y, ((1 << numbits(x)) - 1))) -> (sra x, y)
+  if (TLI.isOperationLegal(ISD::SRA, VT) &&
+      TLI.supportsModuloShift(ISD::SRA, VT) && N1->getOpcode() == ISD::AND) {
+    if (ConstantSDNode *Mask = isConstOrConstSplat(N1->getOperand(1))) {
+      if (Mask->getZExtValue() == OpSizeInBits - 1) {
+        return DAG.getNode(ISD::SRA, SDLoc(N), VT, N0, N1->getOperand(0));
+      }
+    }
+  }
+
   // Arithmetic shifting an all-sign-bit value is a no-op.
   if (DAG.ComputeNumSignBits(N0) == OpSizeInBits)
     return N0;
@@ -5650,6 +5672,17 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
   EVT VT = N0.getValueType();
   unsigned OpSizeInBits = VT.getScalarSizeInBits();
 
+  // If the target supports masking y in (srl, y),
+  // fold (srl x, (and y, ((1 << numbits(x)) - 1))) -> (srl x, y)
+  if (TLI.isOperationLegal(ISD::SRL, VT) &&
+      TLI.supportsModuloShift(ISD::SRL, VT) && N1->getOpcode() == ISD::AND) {
+    if (ConstantSDNode *Mask = isConstOrConstSplat(N1->getOperand(1))) {
+      if (Mask->getZExtValue() == OpSizeInBits - 1) {
+        return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0, N1->getOperand(0));
+      }
+    }
+  }
+
   // fold vector ops
   if (VT.isVector())
     if (SDValue FoldedVOp = SimplifyVBinOp(N))
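A minimal standalone C++ sketch, not part of the patch (the helper name hw_shl32 is made up for illustration), of why the new combines are sound: when supportsModuloShift() holds, the target's shift instruction already reduces its amount modulo the bit width, so the IR-level AND with OpSizeInBits - 1 computes exactly what the hardware would compute anyway and can be dropped.

    #include <cassert>
    #include <cstdint>

    // Model of a 32-bit shift unit that masks its amount, which is the
    // guarantee supportsModuloShift(ISD::SHL, VT) expresses about the target.
    static uint32_t hw_shl32(uint32_t x, uint32_t amt) {
      return x << (amt & 31);
    }

    int main() {
      const uint32_t x = 0xdeadbeefu;
      for (uint32_t amt = 0; amt < 256; ++amt)
        // Left: the masking shift unit fed the unmasked amount.
        // Right: the pre-fold IR pattern, a shift by (amt & 31).
        assert(hw_shl32(x, amt) == (x << (amt & 31)));
      return 0;
    }

The same argument covers the SRA and SRL copies of the combine; each copy fires only when the shift is legal for VT and the AND mask is exactly OpSizeInBits - 1.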
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
index 5645fdc..3266109 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -1017,6 +1017,14 @@ namespace llvm {
     SDValue
     combineElementTruncationToVectorTruncation(SDNode *N,
                                                DAGCombinerInfo &DCI) const;
+
+    bool supportsModuloShift(ISD::NodeType Inst,
+                             EVT ReturnType) const override {
+      assert((Inst == ISD::SHL || Inst == ISD::SRA || Inst == ISD::SRL) &&
+             "Expect a shift instruction");
+      assert(isOperationLegal(Inst, ReturnType));
+      return ReturnType.isVector();
+    }
   };
 
   namespace PPC {
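A rough per-lane model of why the PowerPC override returns true only for vector types, written as a sketch under the assumption (not stated in the patch itself) that the Altivec element shifts such as vslw use only the low-order bits of each lane of the amount operand:

    #include <array>
    #include <cstdint>

    using V4i32 = std::array<uint32_t, 4>;

    // Assumed vslw-like behaviour: each 32-bit lane of A is shifted left by
    // the low 5 bits of the matching lane of B.
    static V4i32 vslw_model(const V4i32 &A, const V4i32 &B) {
      V4i32 R{};
      for (int i = 0; i < 4; ++i)
        R[i] = A[i] << (B[i] & 31); // the amount is masked lane by lane
      return R;
    }

Under that model, pre-masking the amount vector with a splat of 31 cannot change the result, which is what lets the combine drop the vspltis*/xxland sequences removed in the test diff below. The assert(isOperationLegal(Inst, ReturnType)) mirrors the DAGCombiner call sites, which check legality before querying the hook.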
diff --git a/llvm/test/CodeGen/PowerPC/shift_mask.ll b/llvm/test/CodeGen/PowerPC/shift_mask.ll
index 91226a3..e9ca9b0 100644
--- a/llvm/test/CodeGen/PowerPC/shift_mask.ll
+++ b/llvm/test/CodeGen/PowerPC/shift_mask.ll
@@ -49,8 +49,6 @@ define i64 @test003(i64 %a, i64 %b) {
 define <16 x i8> @test010(<16 x i8> %a, <16 x i8> %b) {
 ; CHECK-LABEL: test010:
 ; CHECK: # BB#0:
-; CHECK-NEXT: vspltisb 4, 7
-; CHECK-NEXT: xxland 35, 35, 36
 ; CHECK-NEXT: vslb 2, 2, 3
 ; CHECK-NEXT: blr
   %rem = and <16 x i8> %b, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
@@ -61,8 +59,6 @@ define <16 x i8> @test010(<16 x i8> %a, <16 x i8> %b) {
 define <8 x i16> @test011(<8 x i16> %a, <8 x i16> %b) {
 ; CHECK-LABEL: test011:
 ; CHECK: # BB#0:
-; CHECK-NEXT: vspltish 4, 15
-; CHECK-NEXT: xxland 35, 35, 36
 ; CHECK-NEXT: vslh 2, 2, 3
 ; CHECK-NEXT: blr
   %rem = and <8 x i16> %b, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
@@ -73,10 +69,6 @@ define <8 x i16> @test011(<8 x i16> %a, <8 x i16> %b) {
 define <4 x i32> @test012(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: test012:
 ; CHECK: # BB#0:
-; CHECK-NEXT: vspltisw 4, -16
-; CHECK-NEXT: vspltisw 5, 15
-; CHECK-NEXT: vsubuwm 4, 5, 4
-; CHECK-NEXT: xxland 35, 35, 36
 ; CHECK-NEXT: vslw 2, 2, 3
 ; CHECK-NEXT: blr
   %rem = and <4 x i32> %b, <i32 31, i32 31, i32 31, i32 31>
@@ -87,11 +79,6 @@ define <4 x i32> @test012(<4 x i32> %a, <4 x i32> %b) {
 define <2 x i64> @test013(<2 x i64> %a, <2 x i64> %b) {
 ; CHECK-LABEL: test013:
 ; CHECK: # BB#0:
-; CHECK-NEXT: addis 3, 2, .LCPI7_0@toc@ha
-; CHECK-NEXT: addi 3, 3, .LCPI7_0@toc@l
-; CHECK-NEXT: lxvd2x 0, 0, 3
-; CHECK-NEXT: xxswapd 36, 0
-; CHECK-NEXT: xxland 35, 35, 36
 ; CHECK-NEXT: vsld 2, 2, 3
 ; CHECK-NEXT: blr
   %rem = and <2 x i64> %b, <i64 63, i64 63>
@@ -148,8 +135,6 @@ define i64 @test103(i64 %a, i64 %b) {
 define <16 x i8> @test110(<16 x i8> %a, <16 x i8> %b) {
 ; CHECK-LABEL: test110:
 ; CHECK: # BB#0:
-; CHECK-NEXT: vspltisb 4, 7
-; CHECK-NEXT: xxland 35, 35, 36
 ; CHECK-NEXT: vsrb 2, 2, 3
 ; CHECK-NEXT: blr
   %rem = and <16 x i8> %b, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
@@ -160,8 +145,6 @@ define <16 x i8> @test110(<16 x i8> %a, <16 x i8> %b) {
 define <8 x i16> @test111(<8 x i16> %a, <8 x i16> %b) {
 ; CHECK-LABEL: test111:
 ; CHECK: # BB#0:
-; CHECK-NEXT: vspltish 4, 15
-; CHECK-NEXT: xxland 35, 35, 36
 ; CHECK-NEXT: vsrh 2, 2, 3
 ; CHECK-NEXT: blr
   %rem = and <8 x i16> %b, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
@@ -172,10 +155,6 @@ define <8 x i16> @test111(<8 x i16> %a, <8 x i16> %b) {
 define <4 x i32> @test112(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: test112:
 ; CHECK: # BB#0:
-; CHECK-NEXT: vspltisw 4, -16
-; CHECK-NEXT: vspltisw 5, 15
-; CHECK-NEXT: vsubuwm 4, 5, 4
-; CHECK-NEXT: xxland 35, 35, 36
 ; CHECK-NEXT: vsrw 2, 2, 3
 ; CHECK-NEXT: blr
   %rem = and <4 x i32> %b, <i32 31, i32 31, i32 31, i32 31>
@@ -186,11 +165,6 @@ define <4 x i32> @test112(<4 x i32> %a, <4 x i32> %b) {
 define <2 x i64> @test113(<2 x i64> %a, <2 x i64> %b) {
 ; CHECK-LABEL: test113:
 ; CHECK: # BB#0:
-; CHECK-NEXT: addis 3, 2, .LCPI15_0@toc@ha
-; CHECK-NEXT: addi 3, 3, .LCPI15_0@toc@l
-; CHECK-NEXT: lxvd2x 0, 0, 3
-; CHECK-NEXT: xxswapd 36, 0
-; CHECK-NEXT: xxland 35, 35, 36
 ; CHECK-NEXT: vsrd 2, 2, 3
 ; CHECK-NEXT: blr
   %rem = and <2 x i64> %b, <i64 63, i64 63>
@@ -247,8 +221,6 @@ define i64 @test203(i64 %a, i64 %b) {
 define <16 x i8> @test210(<16 x i8> %a, <16 x i8> %b) {
 ; CHECK-LABEL: test210:
 ; CHECK: # BB#0:
-; CHECK-NEXT: vspltisb 4, 7
-; CHECK-NEXT: xxland 35, 35, 36
 ; CHECK-NEXT: vsrab 2, 2, 3
 ; CHECK-NEXT: blr
   %rem = and <16 x i8> %b, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
@@ -259,8 +231,6 @@ define <16 x i8> @test210(<16 x i8> %a, <16 x i8> %b) {
 define <8 x i16> @test211(<8 x i16> %a, <8 x i16> %b) {
 ; CHECK-LABEL: test211:
 ; CHECK: # BB#0:
-; CHECK-NEXT: vspltish 4, 15
-; CHECK-NEXT: xxland 35, 35, 36
 ; CHECK-NEXT: vsrah 2, 2, 3
 ; CHECK-NEXT: blr
   %rem = and <8 x i16> %b, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
@@ -271,10 +241,6 @@ define <8 x i16> @test211(<8 x i16> %a, <8 x i16> %b) {
 define <4 x i32> @test212(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: test212:
 ; CHECK: # BB#0:
-; CHECK-NEXT: vspltisw 4, -16
-; CHECK-NEXT: vspltisw 5, 15
-; CHECK-NEXT: vsubuwm 4, 5, 4
-; CHECK-NEXT: xxland 35, 35, 36
 ; CHECK-NEXT: vsraw 2, 2, 3
 ; CHECK-NEXT: blr
   %rem = and <4 x i32> %b, <i32 31, i32 31, i32 31, i32 31>
@@ -285,11 +251,6 @@ define <4 x i32> @test212(<4 x i32> %a, <4 x i32> %b) {
 define <2 x i64> @test213(<2 x i64> %a, <2 x i64> %b) {
 ; CHECK-LABEL: test213:
 ; CHECK: # BB#0:
-; CHECK-NEXT: addis 3, 2, .LCPI23_0@toc@ha
-; CHECK-NEXT: addi 3, 3, .LCPI23_0@toc@l
-; CHECK-NEXT: lxvd2x 0, 0, 3
-; CHECK-NEXT: xxswapd 36, 0
-; CHECK-NEXT: xxland 35, 35, 36
 ; CHECK-NEXT: vsrad 2, 2, 3
 ; CHECK-NEXT: blr
   %rem = and <2 x i64> %b, <i64 63, i64 63>
-- 
2.7.4
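For context, a hypothetical source-level fragment (assuming Clang's GCC-style vector extensions with mixed vector/scalar operands; the names v16i8 and shift_left_mod are invented) of the kind of code that produces the pattern exercised by test010: under optimization the per-lane '% 8' should become an AND with a splat of 7, which the new combine folds away so that only vslb remains.

    // Hypothetical example; not part of the test file.
    typedef unsigned char v16i8 __attribute__((vector_size(16)));

    v16i8 shift_left_mod(v16i8 a, v16i8 b) {
      // Per-lane shift amount reduced modulo the 8-bit lane width.
      return a << (b % 8);
    }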