From 8f0c78309562cef845cf0e130c8be2ff2a19295b Mon Sep 17 00:00:00 2001
From: Stefan Pintilie <stefanp@ca.ibm.com>
Date: Tue, 24 Oct 2017 17:44:27 +0000
Subject: [PATCH] [PowerPC] Try to simplify a Swap if it feeds a Splat

If we have the situation where a Swap feeds a Splat we can sometimes change the
  index on the Splat and then remove the Swap instruction.

Fixed the test case that was failing and recommit after pulling the original
  commit.

  Original revision is here: https://reviews.llvm.org/D39009

llvm-svn: 316478
---
 llvm/lib/Target/PowerPC/PPCMIPeephole.cpp          |  47 ++++++++
 .../PowerPC/p8-scalar_vector_conversions.ll        |   4 +-
 llvm/test/CodeGen/PowerPC/ppc64-peephole-swap.ll   | 134 +++++++++++++++++++++
 3 files changed, 183 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/CodeGen/PowerPC/ppc64-peephole-swap.ll

diff --git a/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp b/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp
index 80b7ac2..beb40992 100644
--- a/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp
+++ b/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp
@@ -375,6 +375,53 @@ bool PPCMIPeephole::simplifyCode(void) {
             MI.getOperand(2).setImm(NewElem);
           }
         }
+
+        // Splat is fed by a SWAP which is a permute of this form
+        //  XXPERMDI %VA, %VA, 2
+        // Since the splat instruction can use any of the vector elements to do
+        //  the splat we do not have to rearrange the elements in the vector
+        //  with a swap before we do the splat. We can simply do the splat from
+        //  a different index.
+        // If the swap has only one use (the splat) then we can completely
+        //  remove the swap too.
+        if (DefOpcode == PPC::XXPERMDI && MI.getOperand(1).isImm()) {
+          unsigned SwapRes = DefMI->getOperand(0).getReg();
+          unsigned SwapOp1 = DefMI->getOperand(1).getReg();
+          unsigned SwapOp2 = DefMI->getOperand(2).getReg();
+          unsigned SwapImm = DefMI->getOperand(3).getImm();
+          unsigned SplatImm = MI.getOperand(1).getImm();
+
+          // Break if this permute is not a swap.
+          if (SwapOp1 != SwapOp2 || SwapImm != 2)
+            break;
+
+          unsigned NewElem = 0;
+          // Compute the new index to use for the splat.
+          if (MI.getOpcode() == PPC::VSPLTB)
+            NewElem = (SplatImm + 8) & 0xF;
+          else if (MI.getOpcode() == PPC::VSPLTH)
+            NewElem = (SplatImm + 4) & 0x7;
+          else if (MI.getOpcode() == PPC::XXSPLTW)
+            NewElem = (SplatImm + 2) & 0x3;
+          else {
+            DEBUG(dbgs() << "Unknown splat opcode.");
+            DEBUG(MI.dump());
+            break;
+          }
+
+          if (MRI->hasOneNonDBGUse(SwapRes)) {
+            DEBUG(dbgs() << "Removing redundant swap: ");
+            DEBUG(DefMI->dump());
+            ToErase = DefMI;
+          }
+          Simplified = true;
+          DEBUG(dbgs() << "Changing splat immediate from " << SplatImm <<
+                " to " << NewElem << " in instruction: ");
+          DEBUG(MI.dump());
+          MI.getOperand(1).setImm(NewElem);
+          MI.getOperand(2).setReg(SwapOp1);
+        }
+
         break;
       }
       case PPC::XVCVDPSP: {
diff --git a/llvm/test/CodeGen/PowerPC/p8-scalar_vector_conversions.ll b/llvm/test/CodeGen/PowerPC/p8-scalar_vector_conversions.ll
index 3c705bb..5b71ae8 100644
--- a/llvm/test/CodeGen/PowerPC/p8-scalar_vector_conversions.ll
+++ b/llvm/test/CodeGen/PowerPC/p8-scalar_vector_conversions.ll
@@ -16,7 +16,7 @@ entry:
 ; CHECK: sldi [[REG1:[0-9]+]], 3, 56
 ; CHECK: mtvsrd {{[0-9]+}}, [[REG1]]
 ; CHECK-LE: mtvsrd [[REG1:[0-9]+]], 3
-; CHECK-LE: xxswapd {{[0-9]+}}, [[REG1]]
+; CHECK-LE: vspltb {{[0-9]+}}, {{[0-9]+}}, 7
 }
 
 ; Function Attrs: norecurse nounwind readnone
@@ -28,7 +28,7 @@ entry:
 ; CHECK: sldi [[REG1:[0-9]+]], 3, 48
 ; CHECK: mtvsrd {{[0-9]+}}, [[REG1]]
 ; CHECK-LE: mtvsrd [[REG1:[0-9]+]], 3
-; CHECK-LE: xxswapd {{[0-9]+}}, [[REG1]]
+; CHECK-LE: vsplth {{[0-9]+}}, {{[0-9]+}}, 3
 }
 
 ; Function Attrs: norecurse nounwind readnone
diff --git a/llvm/test/CodeGen/PowerPC/ppc64-peephole-swap.ll b/llvm/test/CodeGen/PowerPC/ppc64-peephole-swap.ll
new file mode 100644
index 0000000..004a856
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/ppc64-peephole-swap.ll
@@ -0,0 +1,134 @@
+; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 -verify-machineinstrs | FileCheck %s -check-prefix=CHECK-PWR8
+
+; The strightforward expansion of this code will result in a swap followed by a
+;  splat. However, the swap is not needed since in this case the splat is the
+;  only use.
+; We want to check that we are not using the swap and that we have indexed the
+;  splat to the correct location.
+; 8 Bit Signed Version of the test.
+; Function Attrs: norecurse nounwind readnone
+define <16 x i8> @splat_8_plus(<16 x i8> %v, i8 signext %c) local_unnamed_addr {
+entry:
+  %splat.splatinsert.i = insertelement <16 x i8> undef, i8 %c, i32 0
+  %splat.splat.i = shufflevector <16 x i8> %splat.splatinsert.i, <16 x i8> undef, <16 x i32> zeroinitializer
+  %add = add <16 x i8> %splat.splat.i, %v
+  ret <16 x i8> %add
+; CHECK-LABEL: splat_8_plus
+; CHECK-NOT: xxswapd
+; CHECK: vspltb {{[0-9]+}}, {{[0-9]+}}, 7
+; CHECK: blr
+; CHECK-PWR8-LABEL: splat_8_plus
+; CHECK-PWR8-NOT: xxswapd
+; CHECK-PWR8: vspltb {{[0-9]+}}, {{[0-9]+}}, 7
+; CHECK-PWR8: blr
+}
+
+; 8 Bit Unsigned Version of the test.
+; Function Attrs: norecurse nounwind readnone
+define <16 x i8> @splat_u8_plus(<16 x i8> %v, i8 zeroext %c) local_unnamed_addr {
+entry:
+  %splat.splatinsert.i = insertelement <16 x i8> undef, i8 %c, i32 0
+  %splat.splat.i = shufflevector <16 x i8> %splat.splatinsert.i, <16 x i8> undef, <16 x i32> zeroinitializer
+  %add = add <16 x i8> %splat.splat.i, %v
+  ret <16 x i8> %add
+; CHECK-LABEL: splat_u8_plus
+; CHECK-NOT: xxswapd
+; CHECK: vspltb {{[0-9]+}}, {{[0-9]+}}, 7
+; CHECK: blr
+; CHECK-PWR8-LABEL: splat_u8_plus
+; CHECK-PWR8-NOT: xxswapd
+; CHECK-PWR8: vspltb {{[0-9]+}}, {{[0-9]+}}, 7
+; CHECK-PWR8: blr
+}
+
+; 16 Bit Signed Version of the test.
+; Function Attrs: norecurse nounwind readnone
+define <8 x i16> @splat_16_plus(<8 x i16> %v, i16 signext %c) local_unnamed_addr {
+entry:
+  %0 = shl i16 %c, 8
+  %conv.i = ashr exact i16 %0, 8
+  %splat.splatinsert.i = insertelement <8 x i16> undef, i16 %conv.i, i32 0
+  %splat.splat.i = shufflevector <8 x i16> %splat.splatinsert.i, <8 x i16> undef, <8 x i32> zeroinitializer
+  %add = add <8 x i16> %splat.splat.i, %v
+  ret <8 x i16> %add
+; CHECK-LABEL: splat_16_plus
+; CHECK-NOT: xxswapd
+; CHECK: vsplth {{[0-9]+}}, {{[0-9]+}}, 3
+; CHECK: blr
+; CHECK-PWR8-LABEL: splat_16_plus
+; CHECK-PWR8-NOT: xxswapd
+; CHECK-PWR8: vsplth {{[0-9]+}}, {{[0-9]+}}, 3
+; CHECK-PWR8: blr
+}
+
+; 16 Bit Unsigned Version of the test.
+; Function Attrs: norecurse nounwind readnone
+define <8 x i16> @splat_u16_plus(<8 x i16> %v, i16 zeroext %c) local_unnamed_addr {
+entry:
+  %0 = shl i16 %c, 8
+  %conv.i = ashr exact i16 %0, 8
+  %splat.splatinsert.i = insertelement <8 x i16> undef, i16 %conv.i, i32 0
+  %splat.splat.i = shufflevector <8 x i16> %splat.splatinsert.i, <8 x i16> undef, <8 x i32> zeroinitializer
+  %add = add <8 x i16> %splat.splat.i, %v
+  ret <8 x i16> %add
+; CHECK-LABEL: splat_u16_plus
+; CHECK-NOT: xxswapd
+; CHECK: vsplth {{[0-9]+}}, {{[0-9]+}}, 3
+; CHECK: blr
+; CHECK-PWR8-LABEL: splat_u16_plus
+; CHECK-PWR8-NOT: xxswapd
+; CHECK-PWR8: vsplth {{[0-9]+}}, {{[0-9]+}}, 3
+; CHECK-PWR8: blr
+}
+
+; 32 Bit Signed Version of the test.
+; The 32 bit examples work differently than the 8 and 16 bit versions of the
+;  test. On Power 9 we have the mtvsrws instruction that does both the move to
+;  register and the splat so it does not really test the newly implemented code.
+; On Power 9 for the 32 bit case we don't need the new simplification. It is
+;  just here for completeness.
+; Function Attrs: norecurse nounwind readnone
+define <4 x i32> @splat_32_plus(<4 x i32> %v, i32 signext %c) local_unnamed_addr {
+entry:
+  %sext = shl i32 %c, 24
+  %conv.i = ashr exact i32 %sext, 24
+  %splat.splatinsert.i = insertelement <4 x i32> undef, i32 %conv.i, i32 0
+  %splat.splat.i = shufflevector <4 x i32> %splat.splatinsert.i, <4 x i32> undef, <4 x i32> zeroinitializer
+  %add = add <4 x i32> %splat.splat.i, %v
+  ret <4 x i32> %add
+; CHECK-LABEL: splat_32_plus
+; CHECK-NOT: xxswapd
+; CHECK: mtvsrws {{[0-9]+}}, {{[0-9]+}}
+; CHECK: blr
+; CHECK-PWR8-LABEL: splat_32_plus
+; CHECK-PWR8-NOT: xxswapd
+; CHECK-PWR8: xxspltw {{[0-9]+}}, {{[0-9]+}}, 1
+; CHECK-PWR8: blr
+}
+
+; 32 Bit Unsigned Version of the test.
+; The 32 bit examples work differently than the 8 and 16 bit versions of the
+;  test. On Power 9 we have the mtvsrws instruction that does both the move to
+;  register and the splat so it does not really test the newly implemented code.
+; On Power 9 for the 32 bit case we don't need the new simplification. It is
+;  just here for completeness.
+; Function Attrs: norecurse nounwind readnone
+define <4 x i32> @splat_u32_plus(<4 x i32> %v, i32 zeroext %c) local_unnamed_addr {
+entry:
+  %sext = shl i32 %c, 24
+  %conv.i = ashr exact i32 %sext, 24
+  %splat.splatinsert.i = insertelement <4 x i32> undef, i32 %conv.i, i32 0
+  %splat.splat.i = shufflevector <4 x i32> %splat.splatinsert.i, <4 x i32> undef, <4 x i32> zeroinitializer
+  %add = add <4 x i32> %splat.splat.i, %v
+  ret <4 x i32> %add
+; CHECK-LABEL: splat_u32_plus
+; CHECK-NOT: xxswapd
+; CHECK: mtvsrws {{[0-9]+}}, {{[0-9]+}}
+; CHECK: blr
+; CHECK-PWR8-LABEL: splat_u32_plus
+; CHECK-PWR8-NOT: xxswapd
+; CHECK-PWR8: xxspltw {{[0-9]+}}, {{[0-9]+}}, 1
+; CHECK-PWR8: blr
+}
+
-- 
2.7.4