From d62d8b771d3cc51b0b9e095011c7a1a6f83ce477 Mon Sep 17 00:00:00 2001
From: Kyle Butt <kyle+llvm@iteratee.net>
Date: Wed, 3 Feb 2016 01:41:09 +0000
Subject: [PATCH] Codegen: [PPC] Fix PPCVSXFMAMutate to handle duplicates.

The purpose of PPCVSXFMAMutate is to elide copies by changing FMA forms
on PPC.

    %vreg6<def> = COPY %vreg96
    %vreg6<def,tied1> = XSMADDASP %vreg6<tied0>, %vreg5<kill>, %vreg7
    ;v6 = v6 + v5 * v7

is replaced by

    %vreg5<def,tied1> = XSMADDMSP %vreg5<tied0>, %vreg7, %vreg96
    ;v5 = v5 * v7 + v96

This was broken in the case where the target register was also used as a
multiplicand. Fix this case by checking for it and replacing both uses
with the source register of the copy.

    %vreg6<def> = COPY %vreg96
    %vreg6<def,tied1> = XSMADDASP %vreg6<tied0>, %vreg5<kill>, %vreg6
    ;v6 = v6 + v5 * v6

is replaced by

    %vreg5<def,tied1> = XSMADDMSP %vreg5<tied0>, %vreg96, %vreg96
    ;v5 = v5 * v96 + v96

llvm-svn: 259617
---
 llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp        | 51 ++++++++++++++--------
 .../CodeGen/PowerPC/fma-mutate-duplicate-vreg.ll   | 36 +++++++++++++++
 2 files changed, 68 insertions(+), 19 deletions(-)
 create mode 100644 llvm/test/CodeGen/PowerPC/fma-mutate-duplicate-vreg.ll
diff --git a/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp b/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp
index 6b19a2f..e90dc97 100644
--- a/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp
+++ b/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp
@@ -168,21 +168,32 @@ protected:
         if (OtherUsers || KillsAddendSrc)
           continue;
 
-        // Find one of the product operands that is killed by this instruction.
 
+        // The transformation doesn't work well with things like:
+        //    %vreg5 = A-form-op %vreg5, %vreg11, %vreg5;
+        // unless vreg11 is also killed, so skip such cases when it is not.
+        // Also check whether operand 3 is killed, in order to handle:
+        //   %vreg5 = A-form-op %vreg5, %vreg5, %vreg11;
+        // where vreg5 and vreg11 are both kills. This case would be skipped
+        // otherwise.
+        unsigned OldFMAReg = MI->getOperand(0).getReg();
+
+        // Find one of the product operands that is killed by this instruction.
         unsigned KilledProdOp = 0, OtherProdOp = 0;
-        if (LIS->getInterval(MI->getOperand(2).getReg())
-                     .Query(FMAIdx).isKill()) {
+        unsigned Reg2 = MI->getOperand(2).getReg();
+        unsigned Reg3 = MI->getOperand(3).getReg();
+        if (LIS->getInterval(Reg2).Query(FMAIdx).isKill()
+            && Reg2 != OldFMAReg) {
           KilledProdOp = 2;
           OtherProdOp  = 3;
-        } else if (LIS->getInterval(MI->getOperand(3).getReg())
-                     .Query(FMAIdx).isKill()) {
+        } else if (LIS->getInterval(Reg3).Query(FMAIdx).isKill()
+            && Reg3 != OldFMAReg) {
           KilledProdOp = 3;
           OtherProdOp  = 2;
         }
 
-        // If there are no killed product operands, then this transformation is
-        // likely not profitable.
+        // If there are no usable killed product operands, then this
+        // transformation is likely not profitable.
         if (!KilledProdOp)
           continue;
 
@@ -212,14 +223,6 @@ protected:
         bool KilledProdRegUndef = MI->getOperand(KilledProdOp).isUndef();
         bool OtherProdRegUndef  = MI->getOperand(OtherProdOp).isUndef();
 
-        unsigned OldFMAReg = MI->getOperand(0).getReg();
-
-        // The transformation doesn't work well with things like:
-        //    %vreg5 = A-form-op %vreg5, %vreg11, %vreg5;
-        // so leave such things alone.
-        if (OldFMAReg == KilledProdReg)
-          continue;
-
         // If there isn't a class that fits, we can't perform the transform.
         // This is needed for correctness with a mixture of VSX and Altivec
         // instructions to make sure that a low VSX register is not assigned to
@@ -236,23 +239,33 @@ protected:
         MI->getOperand(0).setReg(KilledProdReg);
         MI->getOperand(1).setReg(KilledProdReg);
         MI->getOperand(3).setReg(AddendSrcReg);
-        MI->getOperand(2).setReg(OtherProdReg);
 
         MI->getOperand(0).setSubReg(KilledProdSubReg);
         MI->getOperand(1).setSubReg(KilledProdSubReg);
         MI->getOperand(3).setSubReg(AddSubReg);
-        MI->getOperand(2).setSubReg(OtherProdSubReg);
 
         MI->getOperand(1).setIsKill(KilledProdRegKill);
         MI->getOperand(3).setIsKill(AddRegKill);
-        MI->getOperand(2).setIsKill(OtherProdRegKill);
 
         MI->getOperand(1).setIsUndef(KilledProdRegUndef);
         MI->getOperand(3).setIsUndef(AddRegUndef);
-        MI->getOperand(2).setIsUndef(OtherProdRegUndef);
 
         MI->setDesc(TII->get(AltOpc));
 
+        // If the addend is also a multiplicand, replace it with the addend
+        // source in both places.
+        if (OtherProdReg == AddendMI->getOperand(0).getReg()) {
+          MI->getOperand(2).setReg(AddendSrcReg);
+          MI->getOperand(2).setSubReg(AddSubReg);
+          MI->getOperand(2).setIsKill(AddRegKill);
+          MI->getOperand(2).setIsUndef(AddRegUndef);
+        } else {
+          MI->getOperand(2).setReg(OtherProdReg);
+          MI->getOperand(2).setSubReg(OtherProdSubReg);
+          MI->getOperand(2).setIsKill(OtherProdRegKill);
+          MI->getOperand(2).setIsUndef(OtherProdRegUndef);
+        }
+
         DEBUG(dbgs() << " -> " << *MI);
 
         // The killed product operand was killed here, so we can reuse it now
diff --git a/llvm/test/CodeGen/PowerPC/fma-mutate-duplicate-vreg.ll b/llvm/test/CodeGen/PowerPC/fma-mutate-duplicate-vreg.ll
new file mode 100644
index 0000000..08d1b1b
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/fma-mutate-duplicate-vreg.ll
@@ -0,0 +1,36 @@
+; RUN: llc -fp-contract=fast -O2 < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-grtev4-linux-gnu"
+
+; CHECK-LABEL: f
+; CHECK-NOT: xsmaddmsp [[REG:[0-9]+]], [[REG]], {{[0-9]+}}
+define float @f(float %xf) #0 {
+  %1 = fmul float %xf, %xf
+  %2 = fmul float %1, 0x3F43FB0140000000
+  %3 = fsub float 1.000000e+00, %2
+  %4 = fmul float %1, %3
+  %5 = fmul float %4, 0x3F461C5440000000
+  %6 = fsub float 1.000000e+00, %5
+  %7 = fmul float %1, %6
+  %8 = fmul float %7, 0x3F4899C100000000
+  %9 = fsub float 1.000000e+00, %8
+  %10 = fmul float %1, %9
+  %11 = fmul float %10, 0x3F4B894020000000
+  %12 = fsub float 1.000000e+00, %11
+  %13 = fmul float %1, %12
+  %14 = fmul float %13, 0x3F4F07C200000000
+  %15 = fsub float 1.000000e+00, %14
+  %16 = fmul float %1, %15
+  %17 = fmul float %16, 0x3F519E0120000000
+  %18 = fsub float 1.000000e+00, %17
+  %19 = fmul float %1, %18
+  %20 = fmul float %19, 0x3F542D6620000000
+  %21 = fsub float 1.000000e+00, %20
+  %22 = fmul float %1, %21
+  %23 = fmul float %22, 0x3F5756CAC0000000
+  %24 = fsub float 1.000000e+00, %23
+  %25 = fmul float %1, %24
+  ret float %25
+}
+
+attributes #0 = { norecurse nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pwr8" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+vsx,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" }
-- 
2.7.4