"spill on ppc"),
cl::Hidden, cl::init(100));
+// Copies/moves of physical accumulators are expensive operations
+// that should be avoided whenever possible. MMA instructions are
+// meant to be used in performance-sensitive computational kernels.
+// This option is provided, at least for the time being, to give the
+// user a tool to detect these expensive operations and either rework
+// their code or report a compiler bug if the compiler turns out to
+// be the cause.
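+// The option is compiled in only when assertions are enabled and is
+// hidden from -help; a hypothetical invocation would look like:
+//   llc -mcpu=pwr10 -ppc-report-acc-moves input.ll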
+#ifndef NDEBUG
+static cl::opt<bool>
+    ReportAccMoves("ppc-report-acc-moves",
+                   cl::desc("Emit information about accumulator register spills "
+                            "and copies"),
+                   cl::Hidden, cl::init(false));
+#endif
+
static unsigned offsetMinAlignForOpcode(unsigned OpC);
PPCRegisterInfo::PPCRegisterInfo(const PPCTargetMachine &TM)
MBB.erase(II);
}
+void PPCRegisterInfo::emitAccCopyInfo(MachineBasicBlock &MBB,
+                                      MCRegister DestReg, MCRegister SrcReg) {
+#ifdef NDEBUG
+  return;
+#else
+  if (ReportAccMoves) {
+    std::string Dest = PPC::ACCRCRegClass.contains(DestReg) ? "acc" : "uacc";
+    std::string Src = PPC::ACCRCRegClass.contains(SrcReg) ? "acc" : "uacc";
+    dbgs() << "Emitting copy from " << Src << " to " << Dest << ":\n";
+    MBB.dump();
+  }
+#endif
+}
+
+static void emitAccSpillRestoreInfo(MachineBasicBlock &MBB, bool IsPrimed,
+                                    bool IsRestore) {
+#ifdef NDEBUG
+  return;
+#else
+  if (ReportAccMoves) {
+    dbgs() << "Emitting " << (IsPrimed ? "acc" : "uacc") << " register "
+           << (IsRestore ? "restore" : "spill") << ":\n";
+    MBB.dump();
+  }
+#endif
+}
+
+/// lowerACCSpilling - Generate the code for spilling the accumulator register.
+/// As with other spills/reloads that use pseudo-ops, we do not actually
+/// eliminate the FrameIndex here nor compute the stack offset. We simply
+/// create a real instruction with an FI and rely on eliminateFrameIndex to
+/// handle the FI elimination.
+void PPCRegisterInfo::lowerACCSpilling(MachineBasicBlock::iterator II,
+                                       unsigned FrameIndex) const {
+  MachineInstr &MI = *II; // SPILL_ACC <SrcReg>, <offset>
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineFunction &MF = *MBB.getParent();
+  const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
+  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+  DebugLoc DL = MI.getDebugLoc();
+  Register SrcReg = MI.getOperand(0).getReg();
+  bool IsKilled = MI.getOperand(0).isKill();
+
+  bool IsPrimed = PPC::ACCRCRegClass.contains(SrcReg);
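+  // Each accumulator overlaps two consecutive VSR pairs, so map ACC0-ACC7
+  // (or UACC0-UACC7) to the first of its two underlying VSRp registers.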
+  Register Reg =
+      PPC::VSRp0 + (SrcReg - (IsPrimed ? PPC::ACC0 : PPC::UACC0)) * 2;
+  bool IsLittleEndian = Subtarget.isLittleEndian();
+
+  emitAccSpillRestoreInfo(MBB, IsPrimed, false);
+
+  // De-prime the register being spilled, create two stores for the pair
+  // subregisters accounting for endianness (the first pair goes at offset 32
+  // on little-endian targets and at offset 0 on big-endian ones), and then
+  // re-prime the register if it isn't killed. This uses the Offset parameter
+  // of addFrameReference() to place each store within the 64-byte stack slot.
+  if (IsPrimed)
+    BuildMI(MBB, II, DL, TII.get(PPC::XXMFACC), SrcReg).addReg(SrcReg);
+  addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::STXVP))
+                        .addReg(Reg, getKillRegState(IsKilled)),
+                    FrameIndex, IsLittleEndian ? 32 : 0);
+  addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::STXVP))
+                        .addReg(Reg + 1, getKillRegState(IsKilled)),
+                    FrameIndex, IsLittleEndian ? 0 : 32);
+  if (IsPrimed && !IsKilled)
+    BuildMI(MBB, II, DL, TII.get(PPC::XXMTACC), SrcReg).addReg(SrcReg);
+
+  // Discard the pseudo instruction.
+  MBB.erase(II);
+}
+
+/// lowerACCRestore - Generate the code to restore the accumulator register.
+void PPCRegisterInfo::lowerACCRestore(MachineBasicBlock::iterator II,
+                                      unsigned FrameIndex) const {
+  MachineInstr &MI = *II; // <DestReg> = RESTORE_ACC <offset>
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineFunction &MF = *MBB.getParent();
+  const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
+  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+  DebugLoc DL = MI.getDebugLoc();
+
+  Register DestReg = MI.getOperand(0).getReg();
+  assert(MI.definesRegister(DestReg) &&
+         "RESTORE_ACC does not define its destination");
+
+  bool IsPrimed = PPC::ACCRCRegClass.contains(DestReg);
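+  // As in lowerACCSpilling, map the accumulator to the first of its two
+  // overlapping VSR pairs.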
+  Register Reg =
+      PPC::VSRp0 + (DestReg - (IsPrimed ? PPC::ACC0 : PPC::UACC0)) * 2;
+  bool IsLittleEndian = Subtarget.isLittleEndian();
+
+  emitAccSpillRestoreInfo(MBB, IsPrimed, true);
+
+  // Create two loads for the pair subregisters accounting for endianness and
+  // then prime the accumulator register being restored.
+  addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::LXVP), Reg),
+                    FrameIndex, IsLittleEndian ? 32 : 0);
+  addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::LXVP), Reg + 1),
+                    FrameIndex, IsLittleEndian ? 0 : 32);
+  if (IsPrimed)
+    BuildMI(MBB, II, DL, TII.get(PPC::XXMTACC), DestReg).addReg(DestReg);
+
+  // Discard the pseudo instruction.
+  MBB.erase(II);
+}
+
bool PPCRegisterInfo::hasReservedSpillSlot(const MachineFunction &MF,
Register Reg, int &FrameIdx) const {
// For the nonvolatile condition registers (CR2, CR3, CR4) return true to
} else if (OpC == PPC::RESTORE_CRBIT) {
lowerCRBitRestore(II, FrameIndex);
return;
+  } else if (OpC == PPC::SPILL_ACC || OpC == PPC::SPILL_UACC) {
+    lowerACCSpilling(II, FrameIndex);
+    return;
+  } else if (OpC == PPC::RESTORE_ACC || OpC == PPC::RESTORE_UACC) {
+    lowerACCRestore(II, FrameIndex);
+    return;
}
// Replace the FrameIndex with base register with GPR1 (SP) or GPR31 (FP).
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names \
+; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
+; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names \
+; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE
+
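+; Test that accumulator registers are spilled around calls and correctly
+; restored afterwards (intrinsics1 below carries an accumulator across the
+; call to @foo).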
+declare <512 x i1> @llvm.ppc.mma.xvf16ger2pp(<512 x i1>, <16 x i8>, <16 x i8>)
+declare <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>)
+declare void @foo()
+define void @intrinsics1(<16 x i8> %vc1, <16 x i8> %vc2, <16 x i8> %vc3, <16 x i8> %vc4, i8* %ptr) {
+; CHECK-LABEL: intrinsics1:
+; CHECK: .localentry intrinsics1, 1
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: mflr r0
+; CHECK-NEXT: .cfi_def_cfa_offset 176
+; CHECK-NEXT: .cfi_offset lr, 16
+; CHECK-NEXT: .cfi_offset r30, -16
+; CHECK-NEXT: std r30, -16(r1) # 8-byte Folded Spill
+; CHECK-NEXT: std r0, 16(r1)
+; CHECK-NEXT: stdu r1, -176(r1)
+; CHECK-NEXT: li r3, 128
+; CHECK-NEXT: # kill: def $v5 killed $v5 killed $vsrp18 def $vsrp18
+; CHECK-NEXT: # kill: def $v4 killed $v4 killed $vsrp18 def $vsrp18
+; CHECK-NEXT: # kill: def $v3 killed $v3 killed $vsrp17 def $vsrp17
+; CHECK-NEXT: # kill: def $v2 killed $v2 killed $vsrp17 def $vsrp17
+; CHECK-NEXT: xxlor vs0, v2, v2
+; CHECK-NEXT: xxlor vs1, v3, v3
+; CHECK-NEXT: ld r30, 272(r1)
+; CHECK-NEXT: stxvp vsp34, r1(r3) # 32-byte Folded Spill
+; CHECK-NEXT: li r3, 96
+; CHECK-NEXT: xxlor vs2, v4, v4
+; CHECK-NEXT: xxlor vs3, v5, v5
+; CHECK-NEXT: stxvp vsp36, r1(r3) # 32-byte Folded Spill
+; CHECK-NEXT: xxmtacc acc0
+; CHECK-NEXT: li r3, 64
+; CHECK-NEXT: xvf16ger2pp acc0, v2, v4
+; CHECK-NEXT: xxmfacc acc0
+; CHECK-NEXT: stxvp vsp0, r1(r3)
+; CHECK-NEXT: li r3, 32
+; CHECK-NEXT: stxvp vsp2, r1(r3)
+; CHECK-NEXT: bl foo@notoc
+; CHECK-NEXT: li r3, 64
+; CHECK-NEXT: lxvp vsp0, r1(r3)
+; CHECK-NEXT: li r3, 32
+; CHECK-NEXT: lxvp vsp2, r1(r3)
+; CHECK-NEXT: li r3, 128
+; CHECK-NEXT: lxvp vsp34, r1(r3) # 32-byte Folded Reload
+; CHECK-NEXT: li r3, 96
+; CHECK-NEXT: lxvp vsp36, r1(r3) # 32-byte Folded Reload
+; CHECK-NEXT: xxmtacc acc0
+; CHECK-NEXT: xvf16ger2pp acc0, v2, v4
+; CHECK-NEXT: xxmfacc acc0
+; CHECK-NEXT: stxv vs0, 48(r30)
+; CHECK-NEXT: stxv vs1, 32(r30)
+; CHECK-NEXT: stxv vs2, 16(r30)
+; CHECK-NEXT: stxvx vs3, 0, r30
+; CHECK-NEXT: addi r1, r1, 176
+; CHECK-NEXT: ld r0, 16(r1)
+; CHECK-NEXT: ld r30, -16(r1) # 8-byte Folded Reload
+; CHECK-NEXT: mtlr r0
+; CHECK-NEXT: blr
+;
+; CHECK-BE-LABEL: intrinsics1:
+; CHECK-BE: # %bb.0:
+; CHECK-BE-NEXT: mflr r0
+; CHECK-BE-NEXT: std r0, 16(r1)
+; CHECK-BE-NEXT: stdu r1, -256(r1)
+; CHECK-BE-NEXT: .cfi_def_cfa_offset 256
+; CHECK-BE-NEXT: .cfi_offset lr, 16
+; CHECK-BE-NEXT: .cfi_offset r30, -16
+; CHECK-BE-NEXT: li r3, 208
+; CHECK-BE-NEXT: std r30, 240(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT: # kill: def $v5 killed $v5 killed $vsrp18 def $vsrp18
+; CHECK-BE-NEXT: # kill: def $v4 killed $v4 killed $vsrp18 def $vsrp18
+; CHECK-BE-NEXT: # kill: def $v3 killed $v3 killed $vsrp17 def $vsrp17
+; CHECK-BE-NEXT: # kill: def $v2 killed $v2 killed $vsrp17 def $vsrp17
+; CHECK-BE-NEXT: xxlor vs0, v2, v2
+; CHECK-BE-NEXT: ld r30, 368(r1)
+; CHECK-BE-NEXT: stxvp vsp34, r1(r3) # 32-byte Folded Spill
+; CHECK-BE-NEXT: xxlor vs1, v3, v3
+; CHECK-BE-NEXT: li r3, 176
+; CHECK-BE-NEXT: xxlor vs2, v4, v4
+; CHECK-BE-NEXT: xxlor vs3, v5, v5
+; CHECK-BE-NEXT: stxvp vsp36, r1(r3) # 32-byte Folded Spill
+; CHECK-BE-NEXT: xxmtacc acc0
+; CHECK-BE-NEXT: li r3, 112
+; CHECK-BE-NEXT: xvf16ger2pp acc0, v2, v4
+; CHECK-BE-NEXT: xxmfacc acc0
+; CHECK-BE-NEXT: stxvp vsp0, r1(r3)
+; CHECK-BE-NEXT: li r3, 144
+; CHECK-BE-NEXT: stxvp vsp2, r1(r3)
+; CHECK-BE-NEXT: bl foo
+; CHECK-BE-NEXT: nop
+; CHECK-BE-NEXT: li r3, 112
+; CHECK-BE-NEXT: lxvp vsp0, r1(r3)
+; CHECK-BE-NEXT: li r3, 144
+; CHECK-BE-NEXT: lxvp vsp2, r1(r3)
+; CHECK-BE-NEXT: li r3, 208
+; CHECK-BE-NEXT: lxvp vsp34, r1(r3) # 32-byte Folded Reload
+; CHECK-BE-NEXT: li r3, 176
+; CHECK-BE-NEXT: lxvp vsp36, r1(r3) # 32-byte Folded Reload
+; CHECK-BE-NEXT: xxmtacc acc0
+; CHECK-BE-NEXT: xvf16ger2pp acc0, v2, v4
+; CHECK-BE-NEXT: xxmfacc acc0
+; CHECK-BE-NEXT: stxv vs1, 16(r30)
+; CHECK-BE-NEXT: stxvx vs0, 0, r30
+; CHECK-BE-NEXT: stxv vs3, 48(r30)
+; CHECK-BE-NEXT: stxv vs2, 32(r30)
+; CHECK-BE-NEXT: ld r30, 240(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT: addi r1, r1, 256
+; CHECK-BE-NEXT: ld r0, 16(r1)
+; CHECK-BE-NEXT: mtlr r0
+; CHECK-BE-NEXT: blr
+ %1 = tail call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> %vc1, <16 x i8> %vc2, <16 x i8> %vc3, <16 x i8> %vc4)
+ %2 = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2pp(<512 x i1> %1, <16 x i8> %vc1, <16 x i8> %vc3)
+ tail call void @foo()
+ %3 = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2pp(<512 x i1> %2, <16 x i8> %vc1, <16 x i8> %vc3)
+ %4 = bitcast i8* %ptr to <512 x i1>*
+ store <512 x i1> %3, <512 x i1>* %4, align 64
+ ret void
+}
; CHECK-BE-NEXT: stxv vs3, 48(r3)
; CHECK-BE-NEXT: stxv vs2, 32(r3)
; CHECK-BE-NEXT: blr
-; CHECK-O0-LABEL: ass_acc:
-; CHECK-O0: # %bb.0: # %entry
-; CHECK-BE-O0-LABEL: ass_acc:
-; CHECK-BE-O0: # %bb.0: # %entry
entry:
%0 = tail call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> %vc, <16 x i8> %vc, <16 x i8> %vc, <16 x i8> %vc)
store <512 x i1> %0, <512 x i1>* %ptr, align 64
ret void
}
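+; testBranch: an accumulator is produced by xxsetaccz on one path and by an
+; MMA update on the other, then merged by a phi and stored at the join point.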
+declare <512 x i1> @llvm.ppc.mma.xvi4ger8pp(<512 x i1>, <16 x i8>, <16 x i8>)
+define void @testBranch(<512 x i1>* %ptr, <16 x i8> %vc, i32 %val) {
+; CHECK-LABEL: testBranch:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: cmplwi r7, 0
+; CHECK-NEXT: beq cr0, .LBB7_2
+; CHECK-NEXT: # %bb.1: # %if.then
+; CHECK-NEXT: xxsetaccz acc0
+; CHECK-NEXT: b .LBB7_3
+; CHECK-NEXT: .LBB7_2: # %if.else
+; CHECK-NEXT: lxv vs1, 32(r3)
+; CHECK-NEXT: lxv vs0, 48(r3)
+; CHECK-NEXT: lxv vs3, 0(r3)
+; CHECK-NEXT: lxv vs2, 16(r3)
+; CHECK-NEXT: xxmtacc acc0
+; CHECK-NEXT: xvi4ger8pp acc0, v2, v2
+; CHECK-NEXT: .LBB7_3: # %if.end
+; CHECK-NEXT: xxmfacc acc0
+; CHECK-NEXT: xxmtacc acc0
+; CHECK-NEXT: xxmfacc acc0
+; CHECK-NEXT: stxv vs0, 48(r3)
+; CHECK-NEXT: stxv vs1, 32(r3)
+; CHECK-NEXT: stxv vs2, 16(r3)
+; CHECK-NEXT: stxv vs3, 0(r3)
+; CHECK-NEXT: blr
+;
+; CHECK-BE-LABEL: testBranch:
+; CHECK-BE: # %bb.0: # %entry
+; CHECK-BE-NEXT: cmplwi r7, 0
+; CHECK-BE-NEXT: beq cr0, .LBB7_2
+; CHECK-BE-NEXT: # %bb.1: # %if.then
+; CHECK-BE-NEXT: xxsetaccz acc0
+; CHECK-BE-NEXT: b .LBB7_3
+; CHECK-BE-NEXT: .LBB7_2: # %if.else
+; CHECK-BE-NEXT: lxv vs1, 16(r3)
+; CHECK-BE-NEXT: lxv vs0, 0(r3)
+; CHECK-BE-NEXT: lxv vs3, 48(r3)
+; CHECK-BE-NEXT: lxv vs2, 32(r3)
+; CHECK-BE-NEXT: xxmtacc acc0
+; CHECK-BE-NEXT: xvi4ger8pp acc0, v2, v2
+; CHECK-BE-NEXT: .LBB7_3: # %if.end
+; CHECK-BE-NEXT: xxmfacc acc0
+; CHECK-BE-NEXT: xxmtacc acc0
+; CHECK-BE-NEXT: xxmfacc acc0
+; CHECK-BE-NEXT: stxv vs1, 16(r3)
+; CHECK-BE-NEXT: stxv vs0, 0(r3)
+; CHECK-BE-NEXT: stxv vs3, 48(r3)
+; CHECK-BE-NEXT: stxv vs2, 32(r3)
+; CHECK-BE-NEXT: blr
+entry:
+ %tobool = icmp eq i32 %val, 0
+ br i1 %tobool, label %if.else, label %if.then
+
+if.then:
+ %0 = tail call <512 x i1> @llvm.ppc.mma.xxsetaccz()
+ br label %if.end
+
+if.else:
+ %1 = load <512 x i1>, <512 x i1>* %ptr, align 64
+ %2 = tail call <512 x i1> @llvm.ppc.mma.xvi4ger8pp(<512 x i1> %1, <16 x i8> %vc, <16 x i8> %vc)
+ br label %if.end
+
+if.end:
+ %vq1.0 = phi <512 x i1> [ %0, %if.then ], [ %2, %if.else ]
+ store <512 x i1> %vq1.0, <512 x i1>* %ptr, align 64
+ ret void
+}
+
+; The following test cases check that the xxsetaccz instruction is correctly
+; rematerialized.
+declare <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1>, <16 x i8>, <16 x i8>)
+declare <512 x i1> @llvm.ppc.mma.xvf32gerpn(<512 x i1>, <16 x i8>, <16 x i8>)
+declare <512 x i1> @llvm.ppc.mma.xvf32gernp(<512 x i1>, <16 x i8>, <16 x i8>)
+
+define void @testcse(<512 x i1>* %res, <16 x i8> %vc) {
+; CHECK-LABEL: testcse:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xxsetaccz acc0
+; CHECK-NEXT: xvf32gerpp acc0, v2, v2
+; CHECK-NEXT: xxmfacc acc0
+; CHECK-NEXT: stxv vs0, 48(r3)
+; CHECK-NEXT: stxv vs1, 32(r3)
+; CHECK-NEXT: stxv vs2, 16(r3)
+; CHECK-NEXT: stxv vs3, 0(r3)
+; CHECK-NEXT: stxv vs0, 112(r3)
+; CHECK-NEXT: stxv vs1, 96(r3)
+; CHECK-NEXT: stxv vs2, 80(r3)
+; CHECK-NEXT: stxv vs3, 64(r3)
+; CHECK-NEXT: blr
+;
+; CHECK-BE-LABEL: testcse:
+; CHECK-BE: # %bb.0: # %entry
+; CHECK-BE-NEXT: xxsetaccz acc0
+; CHECK-BE-NEXT: xvf32gerpp acc0, v2, v2
+; CHECK-BE-NEXT: xxmfacc acc0
+; CHECK-BE-NEXT: stxv vs1, 16(r3)
+; CHECK-BE-NEXT: stxv vs0, 0(r3)
+; CHECK-BE-NEXT: stxv vs3, 48(r3)
+; CHECK-BE-NEXT: stxv vs2, 32(r3)
+; CHECK-BE-NEXT: stxv vs1, 80(r3)
+; CHECK-BE-NEXT: stxv vs0, 64(r3)
+; CHECK-BE-NEXT: stxv vs3, 112(r3)
+; CHECK-BE-NEXT: stxv vs2, 96(r3)
+; CHECK-BE-NEXT: blr
+entry:
+ %0 = call <512 x i1> @llvm.ppc.mma.xxsetaccz()
+ %1 = call <512 x i1> @llvm.ppc.mma.xxsetaccz()
+ %2 = call <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc)
+ %3 = call <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1> %1, <16 x i8> %vc, <16 x i8> %vc)
+ %4 = getelementptr inbounds <512 x i1>, <512 x i1>* %res, i64 0
+ %5 = getelementptr inbounds <512 x i1>, <512 x i1>* %res, i64 1
+ store <512 x i1> %2, <512 x i1>* %4, align 64
+ store <512 x i1> %3, <512 x i1>* %5, align 64
+ ret void
+}
+
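+; Here the two zeroed accumulators feed different instructions (xvf32gerpp
+; and xvf32gerpn), so both are live at once and both xxsetaccz instructions
+; are emitted (acc0 and acc1).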
+define void @testcse2(<512 x i1>* %res, <16 x i8> %vc) {
+; CHECK-LABEL: testcse2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xxsetaccz acc0
+; CHECK-NEXT: xxsetaccz acc1
+; CHECK-NEXT: xvf32gerpp acc1, v2, v2
+; CHECK-NEXT: xvf32gerpn acc0, v2, v2
+; CHECK-NEXT: xxmfacc acc1
+; CHECK-NEXT: xxmfacc acc0
+; CHECK-NEXT: stxv vs4, 48(r3)
+; CHECK-NEXT: stxv vs5, 32(r3)
+; CHECK-NEXT: stxv vs6, 16(r3)
+; CHECK-NEXT: stxv vs7, 0(r3)
+; CHECK-NEXT: stxv vs0, 112(r3)
+; CHECK-NEXT: stxv vs1, 96(r3)
+; CHECK-NEXT: stxv vs2, 80(r3)
+; CHECK-NEXT: stxv vs3, 64(r3)
+; CHECK-NEXT: blr
+;
+; CHECK-BE-LABEL: testcse2:
+; CHECK-BE: # %bb.0: # %entry
+; CHECK-BE-NEXT: xxsetaccz acc0
+; CHECK-BE-NEXT: xxsetaccz acc1
+; CHECK-BE-NEXT: xvf32gerpp acc1, v2, v2
+; CHECK-BE-NEXT: xvf32gerpn acc0, v2, v2
+; CHECK-BE-NEXT: xxmfacc acc1
+; CHECK-BE-NEXT: xxmfacc acc0
+; CHECK-BE-NEXT: stxv vs5, 16(r3)
+; CHECK-BE-NEXT: stxv vs4, 0(r3)
+; CHECK-BE-NEXT: stxv vs7, 48(r3)
+; CHECK-BE-NEXT: stxv vs6, 32(r3)
+; CHECK-BE-NEXT: stxv vs1, 80(r3)
+; CHECK-BE-NEXT: stxv vs0, 64(r3)
+; CHECK-BE-NEXT: stxv vs3, 112(r3)
+; CHECK-BE-NEXT: stxv vs2, 96(r3)
+; CHECK-BE-NEXT: blr
+entry:
+ %0 = call <512 x i1> @llvm.ppc.mma.xxsetaccz()
+ %1 = call <512 x i1> @llvm.ppc.mma.xxsetaccz()
+ %2 = call <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc)
+ %3 = call <512 x i1> @llvm.ppc.mma.xvf32gerpn(<512 x i1> %1, <16 x i8> %vc, <16 x i8> %vc)
+ %4 = getelementptr inbounds <512 x i1>, <512 x i1>* %res, i64 0
+ %5 = getelementptr inbounds <512 x i1>, <512 x i1>* %res, i64 1
+ store <512 x i1> %2, <512 x i1>* %4, align 64
+ store <512 x i1> %3, <512 x i1>* %5, align 64
+ ret void
+}
+
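+; A single xxsetaccz result feeds two accumulating instructions; the output
+; shows a second xxsetaccz rather than a copy of the accumulator.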
+define void @testcse3(<512 x i1>* %res, <16 x i8> %vc) {
+; CHECK-LABEL: testcse3:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xxsetaccz acc0
+; CHECK-NEXT: xxsetaccz acc1
+; CHECK-NEXT: xvf32gerpp acc1, v2, v2
+; CHECK-NEXT: xvf32gerpn acc0, v2, v2
+; CHECK-NEXT: xxmfacc acc1
+; CHECK-NEXT: xxmfacc acc0
+; CHECK-NEXT: stxv vs4, 48(r3)
+; CHECK-NEXT: stxv vs5, 32(r3)
+; CHECK-NEXT: stxv vs6, 16(r3)
+; CHECK-NEXT: stxv vs7, 0(r3)
+; CHECK-NEXT: stxv vs0, 112(r3)
+; CHECK-NEXT: stxv vs1, 96(r3)
+; CHECK-NEXT: stxv vs2, 80(r3)
+; CHECK-NEXT: stxv vs3, 64(r3)
+; CHECK-NEXT: blr
+;
+; CHECK-BE-LABEL: testcse3:
+; CHECK-BE: # %bb.0: # %entry
+; CHECK-BE-NEXT: xxsetaccz acc0
+; CHECK-BE-NEXT: xxsetaccz acc1
+; CHECK-BE-NEXT: xvf32gerpp acc1, v2, v2
+; CHECK-BE-NEXT: xvf32gerpn acc0, v2, v2
+; CHECK-BE-NEXT: xxmfacc acc1
+; CHECK-BE-NEXT: xxmfacc acc0
+; CHECK-BE-NEXT: stxv vs5, 16(r3)
+; CHECK-BE-NEXT: stxv vs4, 0(r3)
+; CHECK-BE-NEXT: stxv vs7, 48(r3)
+; CHECK-BE-NEXT: stxv vs6, 32(r3)
+; CHECK-BE-NEXT: stxv vs1, 80(r3)
+; CHECK-BE-NEXT: stxv vs0, 64(r3)
+; CHECK-BE-NEXT: stxv vs3, 112(r3)
+; CHECK-BE-NEXT: stxv vs2, 96(r3)
+; CHECK-BE-NEXT: blr
+entry:
+ %0 = call <512 x i1> @llvm.ppc.mma.xxsetaccz()
+ %1 = call <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc)
+ %2 = call <512 x i1> @llvm.ppc.mma.xvf32gerpn(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc)
+ %3 = getelementptr inbounds <512 x i1>, <512 x i1>* %res, i64 0
+ %4 = getelementptr inbounds <512 x i1>, <512 x i1>* %res, i64 1
+ store <512 x i1> %1, <512 x i1>* %3, align 64
+ store <512 x i1> %2, <512 x i1>* %4, align 64
+ ret void
+}
+
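+; Loop version of the above: the zeroing instructions appear inside the loop
+; body on every iteration rather than being hoisted out of the loop.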
+define void @testcse4(<512 x i1>* %res, i32 %lim, <16 x i8>* %vc) {
+; CHECK-LABEL: testcse4:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: cmpwi r4, 1
+; CHECK-NEXT: bltlr cr0
+; CHECK-NEXT: # %bb.1: # %for.body.preheader
+; CHECK-NEXT: clrldi r4, r4, 32
+; CHECK-NEXT: li r6, 0
+; CHECK-NEXT: mtctr r4
+; CHECK-NEXT: li r4, 0
+; CHECK-NEXT: .p2align 4
+; CHECK-NEXT: .LBB11_2: # %for.body
+; CHECK-NEXT: #
+; CHECK-NEXT: rldic r7, r6, 4, 28
+; CHECK-NEXT: addi r6, r6, 6
+; CHECK-NEXT: xxsetaccz acc2
+; CHECK-NEXT: xxsetaccz acc1
+; CHECK-NEXT: lxvx vs0, r5, r7
+; CHECK-NEXT: add r7, r5, r7
+; CHECK-NEXT: lxv vs1, 16(r7)
+; CHECK-NEXT: xvf32gerpp acc2, vs0, vs1
+; CHECK-NEXT: lxv vs0, 32(r7)
+; CHECK-NEXT: lxv vs1, 48(r7)
+; CHECK-NEXT: xxmfacc acc2
+; CHECK-NEXT: xvf32gerpn acc1, vs0, vs1
+; CHECK-NEXT: lxv vs12, 64(r7)
+; CHECK-NEXT: lxv vs13, 80(r7)
+; CHECK-NEXT: rldic r7, r4, 6, 26
+; CHECK-NEXT: addi r4, r4, 3
+; CHECK-NEXT: xxsetaccz acc0
+; CHECK-NEXT: xxmfacc acc1
+; CHECK-NEXT: xvf32gernp acc0, vs12, vs13
+; CHECK-NEXT: stxvx vs11, r3, r7
+; CHECK-NEXT: add r7, r3, r7
+; CHECK-NEXT: xxmfacc acc0
+; CHECK-NEXT: stxv vs8, 48(r7)
+; CHECK-NEXT: stxv vs9, 32(r7)
+; CHECK-NEXT: stxv vs10, 16(r7)
+; CHECK-NEXT: stxv vs4, 112(r7)
+; CHECK-NEXT: stxv vs5, 96(r7)
+; CHECK-NEXT: stxv vs6, 80(r7)
+; CHECK-NEXT: stxv vs7, 64(r7)
+; CHECK-NEXT: stxv vs0, 176(r7)
+; CHECK-NEXT: stxv vs1, 160(r7)
+; CHECK-NEXT: stxv vs2, 144(r7)
+; CHECK-NEXT: stxv vs3, 128(r7)
+; CHECK-NEXT: bdnz .LBB11_2
+; CHECK-NEXT: # %bb.3: # %for.cond.cleanup
+; CHECK-NEXT: blr
+;
+; CHECK-BE-LABEL: testcse4:
+; CHECK-BE: # %bb.0: # %entry
+; CHECK-BE-NEXT: cmpwi r4, 1
+; CHECK-BE-NEXT: bltlr cr0
+; CHECK-BE-NEXT: # %bb.1: # %for.body.preheader
+; CHECK-BE-NEXT: clrldi r4, r4, 32
+; CHECK-BE-NEXT: li r6, 0
+; CHECK-BE-NEXT: mtctr r4
+; CHECK-BE-NEXT: li r4, 0
+; CHECK-BE-NEXT: .p2align 4
+; CHECK-BE-NEXT: .LBB11_2: # %for.body
+; CHECK-BE-NEXT: #
+; CHECK-BE-NEXT: rldic r7, r6, 4, 28
+; CHECK-BE-NEXT: addi r6, r6, 6
+; CHECK-BE-NEXT: xxsetaccz acc2
+; CHECK-BE-NEXT: xxsetaccz acc1
+; CHECK-BE-NEXT: lxvx vs0, r5, r7
+; CHECK-BE-NEXT: add r7, r5, r7
+; CHECK-BE-NEXT: lxv vs1, 16(r7)
+; CHECK-BE-NEXT: xvf32gerpp acc2, vs0, vs1
+; CHECK-BE-NEXT: lxv vs0, 32(r7)
+; CHECK-BE-NEXT: lxv vs1, 48(r7)
+; CHECK-BE-NEXT: xxmfacc acc2
+; CHECK-BE-NEXT: xvf32gerpn acc1, vs0, vs1
+; CHECK-BE-NEXT: lxv vs12, 64(r7)
+; CHECK-BE-NEXT: lxv vs13, 80(r7)
+; CHECK-BE-NEXT: rldic r7, r4, 6, 26
+; CHECK-BE-NEXT: addi r4, r4, 3
+; CHECK-BE-NEXT: xxsetaccz acc0
+; CHECK-BE-NEXT: xxmfacc acc1
+; CHECK-BE-NEXT: xvf32gernp acc0, vs12, vs13
+; CHECK-BE-NEXT: stxvx vs8, r3, r7
+; CHECK-BE-NEXT: add r7, r3, r7
+; CHECK-BE-NEXT: xxmfacc acc0
+; CHECK-BE-NEXT: stxv vs9, 16(r7)
+; CHECK-BE-NEXT: stxv vs11, 48(r7)
+; CHECK-BE-NEXT: stxv vs10, 32(r7)
+; CHECK-BE-NEXT: stxv vs5, 80(r7)
+; CHECK-BE-NEXT: stxv vs4, 64(r7)
+; CHECK-BE-NEXT: stxv vs7, 112(r7)
+; CHECK-BE-NEXT: stxv vs6, 96(r7)
+; CHECK-BE-NEXT: stxv vs1, 144(r7)
+; CHECK-BE-NEXT: stxv vs0, 128(r7)
+; CHECK-BE-NEXT: stxv vs3, 176(r7)
+; CHECK-BE-NEXT: stxv vs2, 160(r7)
+; CHECK-BE-NEXT: bdnz .LBB11_2
+; CHECK-BE-NEXT: # %bb.3: # %for.cond.cleanup
+; CHECK-BE-NEXT: blr
+entry:
+ %cmp55 = icmp sgt i32 %lim, 0
+ br i1 %cmp55, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %lim to i64
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %for.body, %for.body.preheader
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+ %0 = tail call <512 x i1> @llvm.ppc.mma.xxsetaccz()
+ %1 = tail call <512 x i1> @llvm.ppc.mma.xxsetaccz()
+ %2 = tail call <512 x i1> @llvm.ppc.mma.xxsetaccz()
+ %3 = trunc i64 %indvars.iv to i32
+ %mul = mul nsw i32 %3, 6
+ %idxprom = zext i32 %mul to i64
+ %arrayidx = getelementptr inbounds <16 x i8>, <16 x i8>* %vc, i64 %idxprom
+ %4 = load <16 x i8>, <16 x i8>* %arrayidx, align 16
+ %add2 = or i32 %mul, 1
+ %idxprom3 = zext i32 %add2 to i64
+ %arrayidx4 = getelementptr inbounds <16 x i8>, <16 x i8>* %vc, i64 %idxprom3
+ %5 = load <16 x i8>, <16 x i8>* %arrayidx4, align 16
+ %6 = tail call <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1> %0, <16 x i8> %4, <16 x i8> %5)
+ %add6 = add nuw nsw i32 %mul, 2
+ %idxprom7 = zext i32 %add6 to i64
+ %arrayidx8 = getelementptr inbounds <16 x i8>, <16 x i8>* %vc, i64 %idxprom7
+ %7 = load <16 x i8>, <16 x i8>* %arrayidx8, align 16
+ %add10 = add nuw nsw i32 %mul, 3
+ %idxprom11 = zext i32 %add10 to i64
+ %arrayidx12 = getelementptr inbounds <16 x i8>, <16 x i8>* %vc, i64 %idxprom11
+ %8 = load <16 x i8>, <16 x i8>* %arrayidx12, align 16
+ %9 = tail call <512 x i1> @llvm.ppc.mma.xvf32gerpn(<512 x i1> %1, <16 x i8> %7, <16 x i8> %8)
+ %add14 = add nuw nsw i32 %mul, 4
+ %idxprom15 = zext i32 %add14 to i64
+ %arrayidx16 = getelementptr inbounds <16 x i8>, <16 x i8>* %vc, i64 %idxprom15
+ %10 = load <16 x i8>, <16 x i8>* %arrayidx16, align 16
+ %add18 = add nuw nsw i32 %mul, 5
+ %idxprom19 = zext i32 %add18 to i64
+ %arrayidx20 = getelementptr inbounds <16 x i8>, <16 x i8>* %vc, i64 %idxprom19
+ %11 = load <16 x i8>, <16 x i8>* %arrayidx20, align 16
+ %12 = tail call <512 x i1> @llvm.ppc.mma.xvf32gernp(<512 x i1> %2, <16 x i8> %10, <16 x i8> %11)
+ %mul21 = mul i64 %indvars.iv, 3
+ %idx.ext = and i64 %mul21, 4294967295
+ %add.ptr = getelementptr inbounds <512 x i1>, <512 x i1>* %res, i64 %idx.ext
+ store <512 x i1> %6, <512 x i1>* %add.ptr, align 64
+ %add.ptr26 = getelementptr inbounds <512 x i1>, <512 x i1>* %add.ptr, i64 1
+ store <512 x i1> %9, <512 x i1>* %add.ptr26, align 64
+ %add.ptr30 = getelementptr inbounds <512 x i1>, <512 x i1>* %add.ptr, i64 2
+ store <512 x i1> %12, <512 x i1>* %add.ptr30, align 64
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
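+; The xxsetaccz result is both stored to memory (unprimed form) and
+; accumulated into across a call (primed form); check the resulting
+; prime/unprime traffic.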
+declare i32 @testRedundantPrimeUnprimeF()
+define void @testRedundantPrimeUnprime(<512 x i1>* %dst, <16 x i8> %vc) nounwind {
+; CHECK-LABEL: testRedundantPrimeUnprime:
+; CHECK: .localentry testRedundantPrimeUnprime, 1
+; CHECK-NEXT: # %bb.0: # %entry
+; CHECK-NEXT: mflr r0
+; CHECK-NEXT: std r30, -16(r1) # 8-byte Folded Spill
+; CHECK-NEXT: std r0, 16(r1)
+; CHECK-NEXT: stdu r1, -112(r1)
+; CHECK-NEXT: xxsetaccz acc0
+; CHECK-NEXT: xxsetaccz acc1
+; CHECK-NEXT: mr r30, r3
+; CHECK-NEXT: xxmfacc acc0
+; CHECK-NEXT: stxv vs0, 48(r3)
+; CHECK-NEXT: stxv vs1, 32(r3)
+; CHECK-NEXT: stxv vs2, 16(r3)
+; CHECK-NEXT: stxv vs3, 0(r3)
+; CHECK-NEXT: xvf32gerpp acc1, v2, v2
+; CHECK-NEXT: li r3, 64
+; CHECK-NEXT: xxmfacc acc1
+; CHECK-NEXT: stxvp vsp4, r1(r3)
+; CHECK-NEXT: li r3, 32
+; CHECK-NEXT: stxvp vsp6, r1(r3)
+; CHECK-NEXT: bl testRedundantPrimeUnprimeF@notoc
+; CHECK-NEXT: li r3, 64
+; CHECK-NEXT: lxvp vsp0, r1(r3)
+; CHECK-NEXT: li r3, 32
+; CHECK-NEXT: lxvp vsp2, r1(r3)
+; CHECK-NEXT: xxmtacc acc0
+; CHECK-NEXT: xxmfacc acc0
+; CHECK-NEXT: stxv vs0, 112(r30)
+; CHECK-NEXT: stxv vs1, 96(r30)
+; CHECK-NEXT: stxv vs2, 80(r30)
+; CHECK-NEXT: stxv vs3, 64(r30)
+; CHECK-NEXT: addi r1, r1, 112
+; CHECK-NEXT: ld r0, 16(r1)
+; CHECK-NEXT: ld r30, -16(r1) # 8-byte Folded Reload
+; CHECK-NEXT: mtlr r0
+; CHECK-NEXT: blr
+;
+; CHECK-BE-LABEL: testRedundantPrimeUnprime:
+; CHECK-BE: # %bb.0: # %entry
+; CHECK-BE-NEXT: mflr r0
+; CHECK-BE-NEXT: std r0, 16(r1)
+; CHECK-BE-NEXT: stdu r1, -192(r1)
+; CHECK-BE-NEXT: xxsetaccz acc0
+; CHECK-BE-NEXT: xxsetaccz acc1
+; CHECK-BE-NEXT: std r30, 176(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT: mr r30, r3
+; CHECK-BE-NEXT: xxmfacc acc0
+; CHECK-BE-NEXT: stxv vs1, 16(r3)
+; CHECK-BE-NEXT: stxv vs0, 0(r3)
+; CHECK-BE-NEXT: stxv vs3, 48(r3)
+; CHECK-BE-NEXT: stxv vs2, 32(r3)
+; CHECK-BE-NEXT: xvf32gerpp acc1, v2, v2
+; CHECK-BE-NEXT: li r3, 112
+; CHECK-BE-NEXT: xxmfacc acc1
+; CHECK-BE-NEXT: stxvp vsp4, r1(r3)
+; CHECK-BE-NEXT: li r3, 144
+; CHECK-BE-NEXT: stxvp vsp6, r1(r3)
+; CHECK-BE-NEXT: bl testRedundantPrimeUnprimeF
+; CHECK-BE-NEXT: nop
+; CHECK-BE-NEXT: li r3, 112
+; CHECK-BE-NEXT: lxvp vsp0, r1(r3)
+; CHECK-BE-NEXT: li r3, 144
+; CHECK-BE-NEXT: lxvp vsp2, r1(r3)
+; CHECK-BE-NEXT: xxmtacc acc0
+; CHECK-BE-NEXT: xxmfacc acc0
+; CHECK-BE-NEXT: stxv vs3, 112(r30)
+; CHECK-BE-NEXT: stxv vs2, 96(r30)
+; CHECK-BE-NEXT: stxv vs1, 80(r30)
+; CHECK-BE-NEXT: stxv vs0, 64(r30)
+; CHECK-BE-NEXT: ld r30, 176(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT: addi r1, r1, 192
+; CHECK-BE-NEXT: ld r0, 16(r1)
+; CHECK-BE-NEXT: mtlr r0
+; CHECK-BE-NEXT: blr
+entry:
+ %0 = tail call <512 x i1> @llvm.ppc.mma.xxsetaccz()
+ store <512 x i1> %0, <512 x i1>* %dst, align 64
+ %1 = tail call <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc)
+ %call = tail call signext i32 bitcast (i32 ()* @testRedundantPrimeUnprimeF to i32 ()*)()
+ %add.ptr1 = getelementptr inbounds <512 x i1>, <512 x i1>* %dst, i64 1
+ store <512 x i1> %1, <512 x i1>* %add.ptr1, align 64
+ ret void
+}
+
+declare <512 x i1> @llvm.ppc.mma.pmxvf64gernn(<512 x i1>, <256 x i1>, <16 x i8>, i32, i32)
+declare <512 x i1> @llvm.ppc.mma.xvf64gernp(<512 x i1>, <256 x i1>, <16 x i8>)