From 376e17364f2b3d95fbdcd985cfdc157fb0292ee6 Mon Sep 17 00:00:00 2001 From: Nemanja Ivanovic Date: Fri, 29 May 2015 17:13:25 +0000 Subject: [PATCH] Add support for VSX FMA single-precision instructions to the PPC back end This patch corresponds to review: http://reviews.llvm.org/D9941 It adds the various FMA instructions introduced in the version 2.07 of the ISA along with the testing for them. These are operations on single precision scalar values in VSX registers. llvm-svn: 238578 --- llvm/lib/Target/PowerPC/PPCFastISel.cpp | 27 +++-- llvm/lib/Target/PowerPC/PPCInstrVSX.td | 76 ++++++++++++++ llvm/test/CodeGen/PowerPC/fma.ll | 83 +++++++++++++++ llvm/test/CodeGen/PowerPC/vsx-fma-sp.ll | 167 ++++++++++++++++++++++++++++++ llvm/test/MC/Disassembler/PowerPC/vsx.txt | 24 +++++ llvm/test/MC/PowerPC/vsx.s | 24 +++++ 6 files changed, 392 insertions(+), 9 deletions(-) create mode 100644 llvm/test/CodeGen/PowerPC/vsx-fma-sp.ll diff --git a/llvm/lib/Target/PowerPC/PPCFastISel.cpp b/llvm/lib/Target/PowerPC/PPCFastISel.cpp index 0b8e23c..a561d5b 100644 --- a/llvm/lib/Target/PowerPC/PPCFastISel.cpp +++ b/llvm/lib/Target/PowerPC/PPCFastISel.cpp @@ -148,6 +148,9 @@ class PPCFastISel final : public FastISel { bool isVSFRCRegister(unsigned Register) const { return MRI.getRegClass(Register)->getID() == PPC::VSFRCRegClassID; } + bool isVSSRCRegister(unsigned Register) const { + return MRI.getRegClass(Register)->getID() == PPC::VSSRCRegClassID; + } bool PPCEmitCmp(const Value *Src1Value, const Value *Src2Value, bool isZExt, unsigned DestReg); bool PPCEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr, @@ -503,8 +506,11 @@ bool PPCFastISel::PPCEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr, // If this is a potential VSX load with an offset of 0, a VSX indexed load can // be used. + bool IsVSSRC = (ResultReg != 0) && isVSSRCRegister(ResultReg); bool IsVSFRC = (ResultReg != 0) && isVSFRCRegister(ResultReg); - if (IsVSFRC && (Opc == PPC::LFD) && + bool Is32VSXLoad = IsVSSRC && Opc == PPC::LFS; + bool Is64VSXLoad = IsVSSRC && Opc == PPC::LFD; + if ((Is32VSXLoad || Is64VSXLoad) && (Addr.BaseType != Address::FrameIndexBase) && UseOffset && (Addr.Offset == 0)) { UseOffset = false; @@ -518,7 +524,7 @@ bool PPCFastISel::PPCEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr, // into a RegBase. if (Addr.BaseType == Address::FrameIndexBase) { // VSX only provides an indexed load. - if (IsVSFRC && Opc == PPC::LFD) return false; + if (Is32VSXLoad || Is64VSXLoad) return false; MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand( @@ -532,7 +538,7 @@ bool PPCFastISel::PPCEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr, // Base reg with offset in range. } else if (UseOffset) { // VSX only provides an indexed load. - if (IsVSFRC && Opc == PPC::LFD) return false; + if (Is32VSXLoad || Is64VSXLoad) return false; BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) .addImm(Addr.Offset).addReg(Addr.Base.Reg); @@ -555,7 +561,7 @@ bool PPCFastISel::PPCEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr, case PPC::LWA: Opc = PPC::LWAX; break; case PPC::LWA_32: Opc = PPC::LWAX_32; break; case PPC::LD: Opc = PPC::LDX; break; - case PPC::LFS: Opc = PPC::LFSX; break; + case PPC::LFS: Opc = IsVSSRC ? PPC::LXSSPX : PPC::LFSX; break; case PPC::LFD: Opc = IsVSFRC ? 
PPC::LXSDX : PPC::LFDX; break; } BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) @@ -636,9 +642,12 @@ bool PPCFastISel::PPCEmitStore(MVT VT, unsigned SrcReg, Address &Addr) { // If this is a potential VSX store with an offset of 0, a VSX indexed store // can be used. + bool IsVSSRC = isVSSRCRegister(SrcReg); bool IsVSFRC = isVSFRCRegister(SrcReg); - if (IsVSFRC && (Opc == PPC::STFD) && - (Addr.BaseType != Address::FrameIndexBase) && UseOffset && + bool Is32VSXStore = IsVSSRC && Opc == PPC::STFS; + bool Is64VSXStore = IsVSFRC && Opc == PPC::STFD; + if ((Is32VSXStore || Is64VSXStore) && + (Addr.BaseType != Address::FrameIndexBase) && UseOffset && (Addr.Offset == 0)) { UseOffset = false; } @@ -648,7 +657,7 @@ bool PPCFastISel::PPCEmitStore(MVT VT, unsigned SrcReg, Address &Addr) { // into a RegBase. if (Addr.BaseType == Address::FrameIndexBase) { // VSX only provides an indexed store. - if (IsVSFRC && Opc == PPC::STFD) return false; + if (Is32VSXStore || Is64VSXStore) return false; MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand( @@ -665,7 +674,7 @@ bool PPCFastISel::PPCEmitStore(MVT VT, unsigned SrcReg, Address &Addr) { // Base reg with offset in range. } else if (UseOffset) { // VSX only provides an indexed store. - if (IsVSFRC && Opc == PPC::STFD) return false; + if (Is32VSXStore || Is64VSXStore) return false; BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc)) .addReg(SrcReg).addImm(Addr.Offset).addReg(Addr.Base.Reg); @@ -684,7 +693,7 @@ bool PPCFastISel::PPCEmitStore(MVT VT, unsigned SrcReg, Address &Addr) { case PPC::STH8: Opc = PPC::STHX8; break; case PPC::STW8: Opc = PPC::STWX8; break; case PPC::STD: Opc = PPC::STDX; break; - case PPC::STFS: Opc = PPC::STFSX; break; + case PPC::STFS: Opc = IsVSSRC ? PPC::STXSSPX : PPC::STFSX; break; case PPC::STFD: Opc = IsVSFRC ? PPC::STXSDX : PPC::STFDX; break; } diff --git a/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/llvm/lib/Target/PowerPC/PPCInstrVSX.td index 9685bac..d08b808 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrVSX.td +++ b/llvm/lib/Target/PowerPC/PPCInstrVSX.td @@ -1078,6 +1078,82 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns. 
(outs vssrc:$XT), (ins vssrc:$XA, vssrc:$XB), "xssubsp $XT, $XA, $XB", IIC_VecFP, [(set f32:$XT, (fsub f32:$XA, f32:$XB))]>; + + // FMA Instructions + let BaseName = "XSMADDASP" in { + let isCommutable = 1 in + def XSMADDASP : XX3Form<60, 1, + (outs vssrc:$XT), + (ins vssrc:$XTi, vssrc:$XA, vssrc:$XB), + "xsmaddasp $XT, $XA, $XB", IIC_VecFP, + [(set f32:$XT, (fma f32:$XA, f32:$XB, f32:$XTi))]>, + RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + AltVSXFMARel; + let IsVSXFMAAlt = 1 in + def XSMADDMSP : XX3Form<60, 9, + (outs vssrc:$XT), + (ins vssrc:$XTi, vssrc:$XA, vssrc:$XB), + "xsmaddmsp $XT, $XA, $XB", IIC_VecFP, []>, + RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + AltVSXFMARel; + } + + let BaseName = "XSMSUBASP" in { + let isCommutable = 1 in + def XSMSUBASP : XX3Form<60, 17, + (outs vssrc:$XT), + (ins vssrc:$XTi, vssrc:$XA, vssrc:$XB), + "xsmsubasp $XT, $XA, $XB", IIC_VecFP, + [(set f32:$XT, (fma f32:$XA, f32:$XB, + (fneg f32:$XTi)))]>, + RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + AltVSXFMARel; + let IsVSXFMAAlt = 1 in + def XSMSUBMSP : XX3Form<60, 25, + (outs vssrc:$XT), + (ins vssrc:$XTi, vssrc:$XA, vssrc:$XB), + "xsmsubmsp $XT, $XA, $XB", IIC_VecFP, []>, + RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + AltVSXFMARel; + } + + let BaseName = "XSNMADDASP" in { + let isCommutable = 1 in + def XSNMADDASP : XX3Form<60, 129, + (outs vssrc:$XT), + (ins vssrc:$XTi, vssrc:$XA, vssrc:$XB), + "xsnmaddasp $XT, $XA, $XB", IIC_VecFP, + [(set f32:$XT, (fneg (fma f32:$XA, f32:$XB, + f32:$XTi)))]>, + RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + AltVSXFMARel; + let IsVSXFMAAlt = 1 in + def XSNMADDMSP : XX3Form<60, 137, + (outs vssrc:$XT), + (ins vssrc:$XTi, vssrc:$XA, vssrc:$XB), + "xsnmaddmsp $XT, $XA, $XB", IIC_VecFP, []>, + RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + AltVSXFMARel; + } + + let BaseName = "XSNMSUBASP" in { + let isCommutable = 1 in + def XSNMSUBASP : XX3Form<60, 145, + (outs vssrc:$XT), + (ins vssrc:$XTi, vssrc:$XA, vssrc:$XB), + "xsnmsubasp $XT, $XA, $XB", IIC_VecFP, + [(set f32:$XT, (fneg (fma f32:$XA, f32:$XB, + (fneg f32:$XTi))))]>, + RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + AltVSXFMARel; + let IsVSXFMAAlt = 1 in + def XSNMSUBMSP : XX3Form<60, 153, + (outs vssrc:$XT), + (ins vssrc:$XTi, vssrc:$XA, vssrc:$XB), + "xsnmsubmsp $XT, $XA, $XB", IIC_VecFP, []>, + RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + AltVSXFMARel; + } } // AddedComplexity = 400 } // HasP8Vector diff --git a/llvm/test/CodeGen/PowerPC/fma.ll b/llvm/test/CodeGen/PowerPC/fma.ll index ab5251b..9cfef39 100644 --- a/llvm/test/CodeGen/PowerPC/fma.ll +++ b/llvm/test/CodeGen/PowerPC/fma.ll @@ -1,9 +1,12 @@ ; RUN: llc < %s -march=ppc32 -fp-contract=fast -mattr=-vsx | FileCheck %s ; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -fp-contract=fast -mattr=+vsx -mcpu=pwr7 | FileCheck -check-prefix=CHECK-VSX %s +; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -fp-contract=fast -mcpu=pwr8 | FileCheck -check-prefix=CHECK-P8 %s +; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -fp-contract=fast -mcpu=pwr8 | FileCheck -check-prefix=CHECK-P8 %s declare double @dummy1(double) #0 declare double @dummy2(double, double) #0 declare double @dummy3(double, double, double) #0 +declare float @dummy4(float, float) #0 define double @test_FMADD1(double %A, double %B, double %C) { %D = fmul double %A, %B ; [#uses=1] @@ -126,3 +129,83 @@ define float @test_FNMSUBS(float %A, float %B, float %C) { ; CHECK-VSX: fnmsubs ; CHECK-VSX-NEXT: blr } + +define float @test_XSMADDMSP(float %A, float %B, float %C) { 
+ %D = fmul float %A, %B ; [#uses=1] + %E = fadd float %C, %D ; [#uses=1] + ret float %E +; CHECK-P8-LABEL: test_XSMADDMSP: +; CHECK-P8: xsmaddmsp +; CHECK-P8-NEXT: blr +} + +define float @test_XSMSUBMSP(float %A, float %B, float %C) { + %D = fmul float %A, %B ; [#uses=1] + %E = fsub float %D, %C ; [#uses=1] + ret float %E +; CHECK-P8-LABEL: test_XSMSUBMSP: +; CHECK-P8: xsmsubmsp +; CHECK-P8-NEXT: blr +} + +define float @test_XSMADDASP(float %A, float %B, float %C, float %D) { + %E = fmul float %A, %B ; [#uses=2] + %F = fadd float %E, %C ; [#uses=1] + %G = fsub float %E, %D ; [#uses=1] + %H = call float @dummy4(float %F, float %G) ; [#uses=1] + ret float %H +; CHECK-P8-LABEL: test_XSMADDASP: +; CHECK-P8: xsmaddasp +; CHECK-P8-NEXT: xsmsubmsp +} + +define float @test_XSMSUBASP(float %A, float %B, float %C, float %D) { + %E = fmul float %A, %B ; [#uses=2] + %F = fsub float %E, %C ; [#uses=1] + %G = fsub float %E, %D ; [#uses=1] + %H = call float @dummy4(float %F, float %G) ; [#uses=1] + ret float %H +; CHECK-P8-LABEL: test_XSMSUBASP: +; CHECK-P8: xsmsubasp +; CHECK-P8-NEXT: xsmsubmsp +} + +define float @test_XSNMADDMSP(float %A, float %B, float %C) { + %D = fmul float %A, %B ; [#uses=1] + %E = fadd float %D, %C ; [#uses=1] + %F = fsub float -0.000000e+00, %E ; [#uses=1] + ret float %F +; CHECK-P8-LABEL: test_XSNMADDMSP: +; CHECK-P8: xsnmaddmsp +; CHECK-P8-NEXT: blr +} + +define float @test_XSNMSUBMSP(float %A, float %B, float %C) { + %D = fmul float %A, %B ; [#uses=1] + %E = fsub float %D, %C ; [#uses=1] + %F = fsub float -0.000000e+00, %E ; [#uses=1] + ret float %F +; CHECK-P8-LABEL: test_XSNMSUBMSP: +; CHECK-P8: xsnmsubmsp +; CHECK-P8-NEXT: blr +} + +define float @test_XSNMADDASP(float %A, float %B, float %C) { + %D = fmul float %A, %B ; [#uses=1] + %E = fadd float %D, %C ; [#uses=1] + %F = fsub float -0.000000e+00, %E ; [#uses=1] + %H = call float @dummy4(float %E, float %F) ; [#uses=1] + ret float %F +; CHECK-P8-LABEL: test_XSNMADDASP: +; CHECK-P8: xsnmaddasp +} + +define float @test_XSNMSUBASP(float %A, float %B, float %C) { + %D = fmul float %A, %B ; [#uses=1] + %E = fsub float %D, %C ; [#uses=1] + %F = fsub float -0.000000e+00, %E ; [#uses=1] + %H = call float @dummy4(float %E, float %F) ; [#uses=1] + ret float %F +; CHECK-P8-LABEL: test_XSNMSUBASP: +; CHECK-P8: xsnmsubasp +} diff --git a/llvm/test/CodeGen/PowerPC/vsx-fma-sp.ll b/llvm/test/CodeGen/PowerPC/vsx-fma-sp.ll new file mode 100644 index 0000000..1c3e457 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/vsx-fma-sp.ll @@ -0,0 +1,167 @@ +; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 -mattr=+vsx | FileCheck %s +; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 -mattr=+vsx -fast-isel -O0 | FileCheck -check-prefix=CHECK-FISL %s +define void @test1sp(float %a, float %b, float %c, float %e, float* nocapture %d) #0 { +entry: + %0 = tail call float @llvm.fma.f32(float %b, float %c, float %a) + store float %0, float* %d, align 4 + %1 = tail call float @llvm.fma.f32(float %b, float %e, float %a) + %arrayidx1 = getelementptr inbounds float, float* %d, i64 1 + store float %1, float* %arrayidx1, align 4 + ret void + +; CHECK-LABEL: @test1sp +; CHECK-DAG: li [[C1:[0-9]+]], 4 +; CHECK-DAG: xsmaddmsp 3, 2, 1 +; CHECK-DAG: xsmaddasp 1, 2, 4 +; CHECK-DAG: stxsspx 3, 0, 7 +; CHECK-DAG: stxsspx 1, 7, [[C1]] +; CHECK: blr + +; CHECK-FISL-LABEL: @test1sp +; CHECK-FISL-DAG: fmr 0, 1 +; CHECK-FISL-DAG: xsmaddasp 0, 2, 3 +; CHECK-FISL-DAG: stxsspx 0, 0, 7 +; CHECK-FISL-DAG: xsmaddasp 1, 2, 4 +; CHECK-FISL-DAG: li 
[[C1:[0-9]+]], 4 +; CHECK-FISL-DAG: stxsspx 1, 7, [[C1]] +; CHECK-FISL: blr +} + +define void @test2sp(float %a, float %b, float %c, float %e, float %f, float* nocapture %d) #0 { +entry: + %0 = tail call float @llvm.fma.f32(float %b, float %c, float %a) + store float %0, float* %d, align 4 + %1 = tail call float @llvm.fma.f32(float %b, float %e, float %a) + %arrayidx1 = getelementptr inbounds float, float* %d, i64 1 + store float %1, float* %arrayidx1, align 4 + %2 = tail call float @llvm.fma.f32(float %b, float %f, float %a) + %arrayidx2 = getelementptr inbounds float, float* %d, i64 2 + store float %2, float* %arrayidx2, align 4 + ret void + +; CHECK-LABEL: @test2sp +; CHECK-DAG: li [[C1:[0-9]+]], 4 +; CHECK-DAG: li [[C2:[0-9]+]], 8 +; CHECK-DAG: xsmaddmsp 3, 2, 1 +; CHECK-DAG: xsmaddmsp 4, 2, 1 +; CHECK-DAG: xsmaddasp 1, 2, 5 +; CHECK-DAG: stxsspx 3, 0, 8 +; CHECK-DAG: stxsspx 4, 8, [[C1]] +; CHECK-DAG: stxsspx 1, 8, [[C2]] +; CHECK: blr + +; CHECK-FISL-LABEL: @test2sp +; CHECK-FISL-DAG: fmr 0, 1 +; CHECK-FISL-DAG: xsmaddasp 0, 2, 3 +; CHECK-FISL-DAG: stxsspx 0, 0, 8 +; CHECK-FISL-DAG: fmr 0, 1 +; CHECK-FISL-DAG: xsmaddasp 0, 2, 4 +; CHECK-FISL-DAG: li [[C1:[0-9]+]], 4 +; CHECK-FISL-DAG: stxsspx 0, 8, [[C1]] +; CHECK-FISL-DAG: xsmaddasp 1, 2, 5 +; CHECK-FISL-DAG: li [[C2:[0-9]+]], 8 +; CHECK-FISL-DAG: stxsspx 1, 8, [[C2]] +; CHECK-FISL: blr +} + +define void @test3sp(float %a, float %b, float %c, float %e, float %f, float* nocapture %d) #0 { +entry: + %0 = tail call float @llvm.fma.f32(float %b, float %c, float %a) + store float %0, float* %d, align 4 + %1 = tail call float @llvm.fma.f32(float %b, float %e, float %a) + %2 = tail call float @llvm.fma.f32(float %b, float %c, float %1) + %arrayidx1 = getelementptr inbounds float, float* %d, i64 3 + store float %2, float* %arrayidx1, align 4 + %3 = tail call float @llvm.fma.f32(float %b, float %f, float %a) + %arrayidx2 = getelementptr inbounds float, float* %d, i64 2 + store float %3, float* %arrayidx2, align 4 + %arrayidx3 = getelementptr inbounds float, float* %d, i64 1 + store float %1, float* %arrayidx3, align 4 + ret void + +; CHECK-LABEL: @test3sp +; CHECK-DAG: fmr [[F1:[0-9]+]], 1 +; CHECK-DAG: li [[C1:[0-9]+]], 12 +; CHECK-DAG: li [[C2:[0-9]+]], 8 +; CHECK-DAG: li [[C3:[0-9]+]], 4 +; CHECK-DAG: xsmaddmsp 4, 2, 1 +; CHECK-DAG: xsmaddasp 1, 2, 5 + +; Note: We could convert this next FMA to M-type as well, but it would require +; re-ordering the instructions. 
+; CHECK-DAG: xsmaddasp [[F1]], 2, 3 + +; CHECK-DAG: xsmaddmsp 3, 2, 4 +; CHECK-DAG: stxsspx [[F1]], 0, 8 +; CHECK-DAG: stxsspx 3, 8, [[C1]] +; CHECK-DAG: stxsspx 1, 8, [[C2]] +; CHECK-DAG: stxsspx 4, 8, [[C3]] +; CHECK: blr + +; CHECK-FISL-LABEL: @test3sp +; CHECK-FISL-DAG: fmr [[F1:[0-9]+]], 1 +; CHECK-FISL-DAG: xsmaddasp [[F1]], 2, 4 +; CHECK-FISL-DAG: fmr 4, [[F1]] +; CHECK-FISL-DAG: xsmaddasp 4, 2, 3 +; CHECK-FISL-DAG: li [[C1:[0-9]+]], 12 +; CHECK-FISL-DAG: stxsspx 4, 8, [[C1]] +; CHECK-FISL-DAG: xsmaddasp 1, 2, 5 +; CHECK-FISL-DAG: li [[C2:[0-9]+]], 8 +; CHECK-FISL-DAG: stxsspx 1, 8, [[C2]] +; CHECK-FISL-DAG: li [[C3:[0-9]+]], 4 +; CHECK-FISL-DAG: stxsspx 0, 8, [[C3]] +; CHECK-FISL: blr +} + +define void @test4sp(float %a, float %b, float %c, float %e, float %f, float* nocapture %d) #0 { +entry: + %0 = tail call float @llvm.fma.f32(float %b, float %c, float %a) + store float %0, float* %d, align 4 + %1 = tail call float @llvm.fma.f32(float %b, float %e, float %a) + %arrayidx1 = getelementptr inbounds float, float* %d, i64 1 + store float %1, float* %arrayidx1, align 4 + %2 = tail call float @llvm.fma.f32(float %b, float %c, float %1) + %arrayidx3 = getelementptr inbounds float, float* %d, i64 3 + store float %2, float* %arrayidx3, align 4 + %3 = tail call float @llvm.fma.f32(float %b, float %f, float %a) + %arrayidx4 = getelementptr inbounds float, float* %d, i64 2 + store float %3, float* %arrayidx4, align 4 + ret void + +; CHECK-LABEL: @test4sp +; CHECK-DAG: fmr [[F1:[0-9]+]], 1 +; CHECK-DAG: li [[C1:[0-9]+]], 4 +; CHECK-DAG: li [[C2:[0-9]+]], 8 +; CHECK-DAG: xsmaddmsp 4, 2, 1 + +; Note: We could convert this next FMA to M-type as well, but it would require +; re-ordering the instructions. +; CHECK-DAG: xsmaddasp 1, 2, 5 + +; CHECK-DAG: xsmaddasp [[F1]], 2, 3 +; CHECK-DAG: stxsspx [[F1]], 0, 8 +; CHECK-DAG: stxsspx 4, 8, [[C1]] +; CHECK-DAG: li [[C3:[0-9]+]], 12 +; CHECK-DAG: xsmaddasp 4, 2, 3 +; CHECK-DAG: stxsspx 4, 8, [[C3]] +; CHECK-DAG: stxsspx 1, 8, [[C2]] +; CHECK: blr + +; CHECK-FISL-LABEL: @test4sp +; CHECK-FISL-DAG: fmr [[F1:[0-9]+]], 1 +; CHECK-FISL-DAG: xsmaddasp [[F1]], 2, 3 +; CHECK-FISL-DAG: stxsspx 0, 0, 8 +; CHECK-FISL-DAG: fmr [[F1]], 1 +; CHECK-FISL-DAG: xsmaddasp [[F1]], 2, 4 +; CHECK-FISL-DAG: li [[C3:[0-9]+]], 4 +; CHECK-FISL-DAG: stxsspx 0, 8, [[C3]] +; CHECK-FISL-DAG: xsmaddasp 0, 2, 3 +; CHECK-FISL-DAG: li [[C1:[0-9]+]], 12 +; CHECK-FISL-DAG: stxsspx 0, 8, [[C1]] +; CHECK-FISL-DAG: xsmaddasp 1, 2, 5 +; CHECK-FISL-DAG: li [[C2:[0-9]+]], 8 +; CHECK-FISL-DAG: stxsspx 1, 8, [[C2]] +; CHECK-FISL: blr +} + +declare float @llvm.fma.f32(float, float, float) #0 diff --git a/llvm/test/MC/Disassembler/PowerPC/vsx.txt b/llvm/test/MC/Disassembler/PowerPC/vsx.txt index 04b2eeb..6f4ba6f 100644 --- a/llvm/test/MC/Disassembler/PowerPC/vsx.txt +++ b/llvm/test/MC/Disassembler/PowerPC/vsx.txt @@ -90,6 +90,12 @@ # CHECK: xsmaddmdp 7, 63, 27 0xf0 0xff 0xd9 0x4c +# CHECK: xsmaddasp 7, 63, 27 +0xf0 0xff 0xd8 0x0c + +# CHECK: xsmaddmsp 7, 63, 27 +0xf0 0xff 0xd8 0x4c + # CHECK: xsmaxdp 7, 63, 27 0xf0 0xff 0xdd 0x04 @@ -102,6 +108,12 @@ # CHECK: xsmsubmdp 7, 63, 27 0xf0 0xff 0xd9 0xcc +# CHECK: xsmsubasp 7, 63, 27 +0xf0 0xff 0xd8 0x8c + +# CHECK: xsmsubmsp 7, 63, 27 +0xf0 0xff 0xd8 0xcc + # CHECK: xsmulsp 7, 63, 27 0xf0 0xff 0xd8 0x84 @@ -126,6 +138,18 @@ # CHECK: xsnmsubmdp 7, 63, 27 0xf0 0xff 0xdd 0xcc +# CHECK: xsnmaddasp 7, 63, 27 +0xf0 0xff 0xdc 0x0c + +# CHECK: xsnmaddmsp 7, 63, 27 +0xf0 0xff 0xdc 0x4c + +# CHECK: xsnmsubasp 7, 63, 27 +0xf0 0xff 0xdc 0x8c + +# CHECK: 
xsnmsubmsp 7, 63, 27 +0xf0 0xff 0xdc 0xcc + # CHECK: xsrdpi 7, 27 0xf0 0xe0 0xd9 0x24 diff --git a/llvm/test/MC/PowerPC/vsx.s b/llvm/test/MC/PowerPC/vsx.s index 773fc9e..352fc51 100644 --- a/llvm/test/MC/PowerPC/vsx.s +++ b/llvm/test/MC/PowerPC/vsx.s @@ -95,6 +95,12 @@ # CHECK-BE: xsmaddmdp 7, 63, 27 # encoding: [0xf0,0xff,0xd9,0x4c] # CHECK-LE: xsmaddmdp 7, 63, 27 # encoding: [0x4c,0xd9,0xff,0xf0] xsmaddmdp 7, 63, 27 +# CHECK-BE: xsmaddasp 7, 63, 27 # encoding: [0xf0,0xff,0xd8,0x0c] +# CHECK-LE: xsmaddasp 7, 63, 27 # encoding: [0x0c,0xd8,0xff,0xf0] + xsmaddasp 7, 63, 27 +# CHECK-BE: xsmaddmsp 7, 63, 27 # encoding: [0xf0,0xff,0xd8,0x4c] +# CHECK-LE: xsmaddmsp 7, 63, 27 # encoding: [0x4c,0xd8,0xff,0xf0] + xsmaddmsp 7, 63, 27 # CHECK-BE: xsmaxdp 7, 63, 27 # encoding: [0xf0,0xff,0xdd,0x04] # CHECK-LE: xsmaxdp 7, 63, 27 # encoding: [0x04,0xdd,0xff,0xf0] xsmaxdp 7, 63, 27 @@ -107,6 +113,12 @@ # CHECK-BE: xsmsubmdp 7, 63, 27 # encoding: [0xf0,0xff,0xd9,0xcc] # CHECK-LE: xsmsubmdp 7, 63, 27 # encoding: [0xcc,0xd9,0xff,0xf0] xsmsubmdp 7, 63, 27 +# CHECK-BE: xsmsubasp 7, 63, 27 # encoding: [0xf0,0xff,0xd8,0x8c] +# CHECK-LE: xsmsubasp 7, 63, 27 # encoding: [0x8c,0xd8,0xff,0xf0] + xsmsubasp 7, 63, 27 +# CHECK-BE: xsmsubmsp 7, 63, 27 # encoding: [0xf0,0xff,0xd8,0xcc] +# CHECK-LE: xsmsubmsp 7, 63, 27 # encoding: [0xcc,0xd8,0xff,0xf0] + xsmsubmsp 7, 63, 27 # CHECK-BE: xsmulsp 7, 63, 27 # encoding: [0xf0,0xff,0xd8,0x84] # CHECK-LE: xsmulsp 7, 63, 27 # encoding: [0x84,0xd8,0xff,0xf0] xsmulsp 7, 63, 27 @@ -131,6 +143,18 @@ # CHECK-BE: xsnmsubmdp 7, 63, 27 # encoding: [0xf0,0xff,0xdd,0xcc] # CHECK-LE: xsnmsubmdp 7, 63, 27 # encoding: [0xcc,0xdd,0xff,0xf0] xsnmsubmdp 7, 63, 27 +# CHECK-BE: xsnmaddasp 7, 63, 27 # encoding: [0xf0,0xff,0xdc,0x0c] +# CHECK-LE: xsnmaddasp 7, 63, 27 # encoding: [0x0c,0xdc,0xff,0xf0] + xsnmaddasp 7, 63, 27 +# CHECK-BE: xsnmaddmsp 7, 63, 27 # encoding: [0xf0,0xff,0xdc,0x4c] +# CHECK-LE: xsnmaddmsp 7, 63, 27 # encoding: [0x4c,0xdc,0xff,0xf0] + xsnmaddmsp 7, 63, 27 +# CHECK-BE: xsnmsubasp 7, 63, 27 # encoding: [0xf0,0xff,0xdc,0x8c] +# CHECK-LE: xsnmsubasp 7, 63, 27 # encoding: [0x8c,0xdc,0xff,0xf0] + xsnmsubasp 7, 63, 27 +# CHECK-BE: xsnmsubmsp 7, 63, 27 # encoding: [0xf0,0xff,0xdc,0xcc] +# CHECK-LE: xsnmsubmsp 7, 63, 27 # encoding: [0xcc,0xdc,0xff,0xf0] + xsnmsubmsp 7, 63, 27 # CHECK-BE: xsrdpi 7, 27 # encoding: [0xf0,0xe0,0xd9,0x24] # CHECK-LE: xsrdpi 7, 27 # encoding: [0x24,0xd9,0xe0,0xf0] xsrdpi 7, 27 -- 2.7.4
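
Note on the PPCFastISel.cpp changes: they mirror the existing double-precision (VSFRC) handling for the new single-precision (VSSRC) class. Because VSX provides only indexed (X-form) scalar loads and stores, a value allocated to vssrc cannot go through the D-form LFS/STFS paths: a zero offset is converted to the indexed form (LXSSPX/STXSSPX), and frame-index or nonzero-offset addressing bails out so SelectionDAG can handle it. One small inconsistency worth flagging: the store path keys Is64VSXStore off IsVSFRC, but the load path as written keys Is64VSXLoad off IsVSSRC; IsVSFRC appears to be what was intended. The condensed sketch below restates the load-side decision with IsVSFRC for the 64-bit check; the enum and helper names are illustrative stand-ins, not LLVM's.

  // Condensed sketch of the FastISel scalar FP load decision (illustrative names).
  enum Opc { LFS, LFD, LFSX, LFDX, LXSSPX, LXSDX };

  struct LoadPlan { bool Feasible; Opc Opcode; bool UseOffset; };

  LoadPlan planScalarFPLoad(Opc Opcode, bool IsVSSRC, bool IsVSFRC,
                            bool IsFrameIndexBase, bool UseOffset, long Offset) {
    bool Is32VSXLoad = IsVSSRC && Opcode == LFS;
    bool Is64VSXLoad = IsVSFRC && Opcode == LFD;  // store path keys off IsVSFRC too
    // VSX has no immediate-offset scalar load: with a zero offset we can simply
    // switch to the indexed form; with a frame index or a real offset, FastISel
    // must give up on this load.
    if ((Is32VSXLoad || Is64VSXLoad) && !IsFrameIndexBase && UseOffset && Offset == 0)
      UseOffset = false;
    if ((Is32VSXLoad || Is64VSXLoad) && (IsFrameIndexBase || UseOffset))
      return {false, Opcode, UseOffset};
    if (!IsFrameIndexBase && !UseOffset) {
      // Indexed (reg+reg) path: pick the VSX X-form when the destination is a
      // VSX register class, otherwise the classic FP X-form.
      if (Opcode == LFS) Opcode = IsVSSRC ? LXSSPX : LFSX;
      if (Opcode == LFD) Opcode = IsVSFRC ? LXSDX : LFDX;
    }
    return {true, Opcode, UseOffset};
  }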
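
Note on the new PPCInstrVSX.td patterns: the four A-type definitions select directly on the obvious fma/fneg combinations for f32, while the M-type twins carry no pattern and exist for the FMA-mutation machinery (see the next note). A minimal C++ restatement of what each selected pattern computes, using std::fma from <cmath> purely as notation (rounding and sign-of-zero/NaN corner cases follow the ISA, not this sketch):

  #include <cmath>

  // XSMADDASP:  (fma XA, XB, XTi)
  float xsmaddasp(float XA, float XB, float XTi)  { return std::fma(XA, XB, XTi); }
  // XSMSUBASP:  (fma XA, XB, (fneg XTi))
  float xsmsubasp(float XA, float XB, float XTi)  { return std::fma(XA, XB, -XTi); }
  // XSNMADDASP: (fneg (fma XA, XB, XTi))
  float xsnmaddasp(float XA, float XB, float XTi) { return -std::fma(XA, XB, XTi); }
  // XSNMSUBASP: (fneg (fma XA, XB, (fneg XTi)))
  float xsnmsubasp(float XA, float XB, float XTi) { return -std::fma(XA, XB, -XTi); }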
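
Note on the A-type/M-type pairing and the RegConstraint/AltVSXFMARel machinery: both forms tie $XTi to $XT, but they differ in which input occupies the tied register. As with the existing double-precision definitions, the A-type form overwrites the addend, while the M-type form overwrites a multiplicand and takes the addend in XB; the VSX FMA mutation pass uses the M-type alternatives (reached through the AltVSXFMARel relation) to avoid copies when the addend must stay live, which is what the "could convert this next FMA to M-type" comments in vsx-fma-sp.ll refer to. Operand roles, sketched in C++ (the function and parameter names are mine):

  // A-type: XT <- XA * XB + XT   (the tied register holds the addend)
  float madd_a_type(float XT_addend, float XA, float XB) {
    return XA * XB + XT_addend;
  }
  // M-type: XT <- XA * XT + XB   (the tied register holds a multiplicand, so the
  // addend in XB survives and can feed further FMAs without an extra copy)
  float madd_m_type(float XT_mulend, float XA, float XB_addend) {
    return XA * XT_mulend + XB_addend;
  }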
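
Note on the fma.ll run lines: the new pwr8 runs rely on -fp-contract=fast, since a separate fmul and fadd/fsub on float may only be fused under fast contraction; the checks then confirm that the fused result is the VSX single-precision form rather than fmadds. The source-level shape being exercised is simply the following (hypothetical function name; with fast contraction on a pwr8+VSX target this is expected to fuse, which mirrors what test_XSMADDMSP checks at the IR level):

  // The multiply and add may be contracted into one fused multiply-add.
  float madd_contract(float A, float B, float C) {
    return A * B + C;
  }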
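
Note on the MC tests: the encodings follow from the XX3 form used by these definitions -- primary opcode 60, the 8-bit extended opcodes given above (1, 9, 17, 25, 129, 137, 145, 153), and split VSX register numbers (low five bits in the T/A/B fields, high bit in TX/AX/BX). The standalone sketch below reproduces the big-endian byte patterns in the disassembler test from that field layout; it is written from the description here, not against any LLVM header.

  #include <cstdint>
  #include <cstdio>

  // Assemble an XX3-form word: PO(6) T(5) A(5) B(5) XO(8) AX BX TX.
  // XT/XA/XB are full 6-bit VSX register numbers (0-63).
  static uint32_t encodeXX3(unsigned PO, unsigned XO,
                            unsigned XT, unsigned XA, unsigned XB) {
    uint32_t Insn = 0;
    Insn |= (PO & 0x3F) << 26;
    Insn |= (XT & 0x1F) << 21;
    Insn |= (XA & 0x1F) << 16;
    Insn |= (XB & 0x1F) << 11;
    Insn |= (XO & 0xFF) << 3;
    Insn |= ((XA >> 5) & 1) << 2;  // AX
    Insn |= ((XB >> 5) & 1) << 1;  // BX
    Insn |= (XT >> 5) & 1;         // TX
    return Insn;
  }

  int main() {
    // xsmaddasp 7, 63, 27  -> 0xF0FFD80C (bytes 0xf0 0xff 0xd8 0x0c above)
    printf("%08x\n", encodeXX3(60, 1, 7, 63, 27));
    // xsnmsubmsp 7, 63, 27 -> 0xF0FFDCCC (bytes 0xf0 0xff 0xdc 0xcc above)
    printf("%08x\n", encodeXX3(60, 153, 7, 63, 27));
    return 0;
  }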