Register Target);
bool ReduceOldVCCRValueUses(MachineBasicBlock &MBB);
bool ReplaceVCMPsByVPNOTs(MachineBasicBlock &MBB);
+ bool ReplaceConstByVPNOTs(MachineBasicBlock &MBB, MachineDominatorTree *DT);
bool ConvertVPSEL(MachineBasicBlock &MBB);
};
return !DeadInstructions.empty();
}
+bool MVEVPTOptimisations::ReplaceConstByVPNOTs(MachineBasicBlock &MBB,
+ MachineDominatorTree *DT) {
+  // Scan through the block, looking for instructions that use constant moves
+  // into VPR which are the bitwise-complement of one another. These are
+  // expected to be COPYs into VCCRRegClass, from a t2MOVi or t2MOVi16. The
+  // last seen constant mask is remembered, and VPNOTs of it are added or
+  // reused as we scan through the block.
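+  //
+  // For example (illustrative virtual register numbers, not taken from a
+  // real test case), a pair of predicates built from complementary constants:
+  //   %1:rgpr = t2MOVi16 1234, ...
+  //   %2:vccr = COPY %1:rgpr
+  //   %3:rgpr = t2MOVi16 64301, ...   ; 64301 == (~1234 & 0xffff)
+  //   %4:vccr = COPY %3:rgpr
+  // can have the second constant materialization replaced by
+  //   %4:vccr = MVE_VPNOT %2:vccr, ...
+  // which later allows the two predicates to be folded into VPT blocks.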
+ unsigned LastVPTImm = 0;
+ Register LastVPTReg = 0;
+ SmallSet<MachineInstr *, 4> DeadInstructions;
+
+ for (MachineInstr &Instr : MBB.instrs()) {
+ // Look for predicated MVE instructions.
+ int PIdx = llvm::findFirstVPTPredOperandIdx(Instr);
+ if (PIdx == -1)
+ continue;
+ Register VPR = Instr.getOperand(PIdx + 1).getReg();
+ if (!VPR.isVirtual())
+ continue;
+
+    // Given that, we are looking for a copy of the form
+    // %11:vccr = COPY %9:rgpr.
+ MachineInstr *Copy = MRI->getVRegDef(VPR);
+ if (!Copy || Copy->getOpcode() != TargetOpcode::COPY ||
+ !Copy->getOperand(1).getReg().isVirtual() ||
+ MRI->getRegClass(Copy->getOperand(1).getReg()) == &ARM::VCCRRegClass) {
+ LastVPTReg = 0;
+ continue;
+ }
+ Register GPR = Copy->getOperand(1).getReg();
+
+    // Find the immediate used by the copy.
+ auto getImm = [&](Register GPR) -> unsigned {
+ MachineInstr *Def = MRI->getVRegDef(GPR);
+ if (Def && (Def->getOpcode() == ARM::t2MOVi ||
+ Def->getOpcode() == ARM::t2MOVi16))
+ return Def->getOperand(1).getImm();
+ return -1U;
+ };
+ unsigned Imm = getImm(GPR);
+ if (Imm == -1U) {
+ LastVPTReg = 0;
+ continue;
+ }
+
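+    // VPR.P0 holds a 16-bit lane mask, so take the complement within 16 bits.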
+ unsigned NotImm = ~Imm & 0xffff;
+ if (LastVPTReg != 0 && LastVPTReg != VPR && LastVPTImm == Imm) {
+ Instr.getOperand(PIdx + 1).setReg(LastVPTReg);
+ if (MRI->use_empty(VPR)) {
+ DeadInstructions.insert(Copy);
+ if (MRI->hasOneUse(GPR))
+ DeadInstructions.insert(MRI->getVRegDef(GPR));
+ }
+ LLVM_DEBUG(dbgs() << "Reusing predicate: in " << Instr);
+ } else if (LastVPTReg != 0 && LastVPTImm == NotImm) {
+      // We have found the complement of a previous constant. Create a VPNOT
+      // of the earlier predicate register and use it instead of the copy.
+ Register NewVPR = MRI->createVirtualRegister(&ARM::VCCRRegClass);
+ auto VPNot = BuildMI(MBB, &Instr, Instr.getDebugLoc(),
+ TII->get(ARM::MVE_VPNOT), NewVPR)
+ .addReg(LastVPTReg);
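+      // The VPNOT computes the complement of the whole mask, and is itself
+      // left unpredicated.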
+ addUnpredicatedMveVpredNOp(VPNot);
+
+ // Use the new register and check if the def is now dead.
+ Instr.getOperand(PIdx + 1).setReg(NewVPR);
+ if (MRI->use_empty(VPR)) {
+ DeadInstructions.insert(Copy);
+ if (MRI->hasOneUse(GPR))
+ DeadInstructions.insert(MRI->getVRegDef(GPR));
+ }
+ LLVM_DEBUG(dbgs() << "Adding VPNot: " << *VPNot << " to replace use at "
+ << Instr);
+ VPR = NewVPR;
+ }
+
+ LastVPTImm = Imm;
+ LastVPTReg = VPR;
+ }
+
+ for (MachineInstr *DI : DeadInstructions)
+ DI->eraseFromParent();
+
+ return !DeadInstructions.empty();
+}
+
// Replace VPSEL with a predicated VMOV in blocks with a VCTP. This is a
// somewhat blunt approximation to allow tail predication with vpsel
// instructions. We turn a vselect into a VPSEL in ISEL, but they have slightly
}
for (MachineBasicBlock &MBB : Fn) {
+ Modified |= ReplaceConstByVPNOTs(MBB, DT);
Modified |= ReplaceVCMPsByVPNOTs(MBB);
Modified |= ReduceOldVCCRValueUses(MBB);
Modified |= ConvertVPSEL(MBB);
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r6, r7, lr}
; CHECK-NEXT: push {r4, r6, r7, lr}
-; CHECK-NEXT: .pad #8
-; CHECK-NEXT: sub sp, #8
; CHECK-NEXT: movw r1, #52428
; CHECK-NEXT: vmsr p0, r1
-; CHECK-NEXT: movw r1, #13107
-; CHECK-NEXT: vstr p0, [sp, #4] @ 4-byte Spill
-; CHECK-NEXT: vpst
+; CHECK-NEXT: vpstete
; CHECK-NEXT: vaddvt.s16 r12, q1
-; CHECK-NEXT: vmsr p0, r1
-; CHECK-NEXT: vstr p0, [sp] @ 4-byte Spill
-; CHECK-NEXT: vpst
-; CHECK-NEXT: vaddvt.s16 r2, q1
-; CHECK-NEXT: vldr p0, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT: vpst
+; CHECK-NEXT: vaddve.s16 r2, q1
; CHECK-NEXT: vaddvt.s16 r4, q0
-; CHECK-NEXT: vldr p0, [sp] @ 4-byte Reload
-; CHECK-NEXT: vpst
-; CHECK-NEXT: vaddvt.s16 r6, q0
+; CHECK-NEXT: vaddve.s16 r6, q0
; CHECK-NEXT: strd r6, r4, [r0]
; CHECK-NEXT: strd r2, r12, [r0, #8]
-; CHECK-NEXT: add sp, #8
; CHECK-NEXT: pop {r4, r6, r7, pc}
entry:
%0 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 13107)
; CHECK: @ %bb.0:
; CHECK-NEXT: movs r1, #1
; CHECK-NEXT: vmsr p0, r1
-; CHECK-NEXT: vpstt
-; CHECK-NEXT: vaddvat.s32 r0, q0
-; CHECK-NEXT: vaddvat.s32 r0, q1
-; CHECK-NEXT: movw r1, #65534
-; CHECK-NEXT: vmsr p0, r1
-; CHECK-NEXT: vpstt
+; CHECK-NEXT: vpsttee
; CHECK-NEXT: vaddvat.s32 r0, q0
; CHECK-NEXT: vaddvat.s32 r0, q1
+; CHECK-NEXT: vaddvae.s32 r0, q0
+; CHECK-NEXT: vaddvae.s32 r0, q1
; CHECK-NEXT: bx lr
%4 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 1)
%5 = tail call i32 @llvm.arm.mve.addv.predicated.v4i32.v4i1(<4 x i32> %0, i32 0, <4 x i1> %4)
; CHECK: @ %bb.0:
; CHECK-NEXT: movw r1, #1234
; CHECK-NEXT: vmsr p0, r1
-; CHECK-NEXT: vpstt
-; CHECK-NEXT: vaddvat.s32 r0, q0
-; CHECK-NEXT: vaddvat.s32 r0, q1
-; CHECK-NEXT: movw r1, #64301
-; CHECK-NEXT: vmsr p0, r1
-; CHECK-NEXT: vpstt
+; CHECK-NEXT: vpsttee
; CHECK-NEXT: vaddvat.s32 r0, q0
; CHECK-NEXT: vaddvat.s32 r0, q1
+; CHECK-NEXT: vaddvae.s32 r0, q0
+; CHECK-NEXT: vaddvae.s32 r0, q1
; CHECK-NEXT: bx lr
%4 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 1234)
%5 = tail call i32 @llvm.arm.mve.addv.predicated.v4i32.v4i1(<4 x i32> %0, i32 0, <4 x i1> %4)
define arm_aapcs_vfpcc i32 @const_mask_abab(<4 x i32> %0, <4 x i32> %1, i32 %2) {
; CHECK-LABEL: const_mask_abab:
; CHECK: @ %bb.0:
-; CHECK-NEXT: .pad #8
-; CHECK-NEXT: sub sp, #8
; CHECK-NEXT: movw r1, #1234
; CHECK-NEXT: vmsr p0, r1
-; CHECK-NEXT: movw r1, #64301
-; CHECK-NEXT: vstr p0, [sp, #4] @ 4-byte Spill
-; CHECK-NEXT: vpst
+; CHECK-NEXT: vpstete
; CHECK-NEXT: vaddvat.s32 r0, q0
-; CHECK-NEXT: vmsr p0, r1
-; CHECK-NEXT: vstr p0, [sp] @ 4-byte Spill
-; CHECK-NEXT: vpst
-; CHECK-NEXT: vaddvat.s32 r0, q1
-; CHECK-NEXT: vldr p0, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT: vpst
+; CHECK-NEXT: vaddvae.s32 r0, q1
; CHECK-NEXT: vaddvat.s32 r0, q1
-; CHECK-NEXT: vldr p0, [sp] @ 4-byte Reload
-; CHECK-NEXT: vpst
-; CHECK-NEXT: vaddvat.s32 r0, q0
-; CHECK-NEXT: add sp, #8
+; CHECK-NEXT: vaddvae.s32 r0, q0
; CHECK-NEXT: bx lr
%4 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 1234)
%5 = tail call i32 @llvm.arm.mve.addv.predicated.v4i32.v4i1(<4 x i32> %0, i32 0, <4 x i1> %4)
define arm_aapcs_vfpcc i32 @const_mask_abbreakab(<4 x i32> %0, <4 x i32> %1, i32 %2) {
; CHECK-LABEL: const_mask_abbreakab:
; CHECK: @ %bb.0:
-; CHECK-NEXT: .pad #8
-; CHECK-NEXT: sub sp, #8
; CHECK-NEXT: movw r1, #1234
; CHECK-NEXT: vmsr p0, r1
-; CHECK-NEXT: movw r1, #64301
-; CHECK-NEXT: vstr p0, [sp, #4] @ 4-byte Spill
-; CHECK-NEXT: vpst
+; CHECK-NEXT: vpste
; CHECK-NEXT: vaddvat.s32 r0, q0
-; CHECK-NEXT: vmsr p0, r1
-; CHECK-NEXT: vstr p0, [sp] @ 4-byte Spill
-; CHECK-NEXT: vpst
-; CHECK-NEXT: vaddvat.s32 r0, q1
+; CHECK-NEXT: vaddvae.s32 r0, q1
; CHECK-NEXT: vadd.i32 q1, q0, r0
-; CHECK-NEXT: vldr p0, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT: vpst
+; CHECK-NEXT: vpnot
+; CHECK-NEXT: vpste
; CHECK-NEXT: vaddvat.s32 r0, q1
-; CHECK-NEXT: vldr p0, [sp] @ 4-byte Reload
-; CHECK-NEXT: vpst
-; CHECK-NEXT: vaddvat.s32 r0, q0
-; CHECK-NEXT: add sp, #8
+; CHECK-NEXT: vaddvae.s32 r0, q0
; CHECK-NEXT: bx lr
%4 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 1234)
%5 = tail call i32 @llvm.arm.mve.addv.predicated.v4i32.v4i1(<4 x i32> %0, i32 0, <4 x i1> %4)
; CHECK-NEXT: vpstt
; CHECK-NEXT: vaddvat.s32 r0, q0
; CHECK-NEXT: vaddvat.s32 r0, q1
-; CHECK-NEXT: movw r1, #64301
; CHECK-NEXT: vadd.i32 q1, q0, r0
-; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpnot
; CHECK-NEXT: vpstt
; CHECK-NEXT: vaddvat.s32 r0, q1
; CHECK-NEXT: vaddvat.s32 r0, q0
define arm_aapcs_vfpcc i32 @const_mask_threepredabab(<4 x i32> %0, <4 x i32> %1, i32 %2) {
; CHECK-LABEL: const_mask_threepredabab:
; CHECK: @ %bb.0:
-; CHECK-NEXT: .pad #8
-; CHECK-NEXT: sub sp, #8
+; CHECK-NEXT: .pad #4
+; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: movw r1, #1234
; CHECK-NEXT: vmsr p0, r1
-; CHECK-NEXT: movw r1, #64301
-; CHECK-NEXT: vstr p0, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT: vstr p0, [sp] @ 4-byte Spill
; CHECK-NEXT: vpst
; CHECK-NEXT: vaddvat.s32 r0, q0
-; CHECK-NEXT: vmsr p0, r1
-; CHECK-NEXT: vstr p0, [sp] @ 4-byte Spill
+; CHECK-NEXT: vldr p0, [sp] @ 4-byte Reload
+; CHECK-NEXT: vpnot
; CHECK-NEXT: vpst
; CHECK-NEXT: vaddvat.s32 r0, q1
; CHECK-NEXT: vpt.s32 gt, q1, q0
; CHECK-NEXT: vaddvat.s32 r0, q1
-; CHECK-NEXT: vldr p0, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT: vpst
-; CHECK-NEXT: vaddvat.s32 r0, q1
; CHECK-NEXT: vldr p0, [sp] @ 4-byte Reload
-; CHECK-NEXT: vpst
-; CHECK-NEXT: vaddvat.s32 r0, q0
-; CHECK-NEXT: add sp, #8
+; CHECK-NEXT: vpste
+; CHECK-NEXT: vaddvat.s32 r0, q1
+; CHECK-NEXT: vaddvae.s32 r0, q0
+; CHECK-NEXT: add sp, #4
; CHECK-NEXT: bx lr
%4 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 1234)
%5 = tail call i32 @llvm.arm.mve.addv.predicated.v4i32.v4i1(<4 x i32> %0, i32 0, <4 x i1> %4)