Register Target);
  bool ReduceOldVCCRValueUses(MachineBasicBlock &MBB);
  bool ReplaceVCMPsByVPNOTs(MachineBasicBlock &MBB);
+  bool ConvertVPSEL(MachineBasicBlock &MBB);
};
char MVEVPTOptimisations::ID = 0;
}
  for (MachineInstr *DeadInstruction : DeadInstructions)
-    DeadInstruction->removeFromParent();
+    DeadInstruction->eraseFromParent();
  return Modified;
}
}
  for (MachineInstr *DeadInstruction : DeadInstructions)
-    DeadInstruction->removeFromParent();
+    DeadInstruction->eraseFromParent();
+
+  return !DeadInstructions.empty();
+}
+
+// Replace VPSEL with a predicated VMOV in blocks with a VCTP. This is a
+// somewhat blunt approximation that allows tail predication of loops
+// containing VPSEL instructions. We turn a vselect into a VPSEL in ISel, but
+// the two have slightly different semantics under tail predication. Until
+// that is modelled, we just convert to a VMOVT (via a predicated VORR)
+// instead.
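+//
+// For example, after this change a
+//   vpsel q0, q0, q1
+// in a block with a VCTP is emitted as
+//   vmovt q1, q0
+//   vmov q0, q1
+// where the trailing vmov is the copy out of the VORR's tied inactive
+// operand after register allocation (see the updated tests below).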
+bool MVEVPTOptimisations::ConvertVPSEL(MachineBasicBlock &MBB) {
+  bool HasVCTP = false;
+  SmallVector<MachineInstr *, 4> DeadInstructions;
+
+  for (MachineInstr &MI : MBB.instrs()) {
+    if (isVCTP(&MI)) {
+      HasVCTP = true;
+      continue;
+    }
+
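+    // Only VPSELs that appear after the VCTP are assumed to execute under
+    // the tail predicate and get converted.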
+    if (!HasVCTP || MI.getOpcode() != ARM::MVE_VPSEL)
+      continue;
+
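+    // A predicated VORR of the VPSEL's true operand with itself acts as a
+    // VMOVT. The VPSEL's predicate register (operand 4) provides the Then
+    // condition and its false operand (operand 2) becomes the tied
+    // inactive-lane value.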
+    MachineInstrBuilder MIBuilder =
+        BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(ARM::MVE_VORR))
+            .add(MI.getOperand(0))
+            .add(MI.getOperand(1))
+            .add(MI.getOperand(1))
+            .addImm(ARMVCC::Then)
+            .add(MI.getOperand(4))
+            .add(MI.getOperand(2));
+    LLVM_DEBUG(dbgs() << "Replacing VPSEL: "; MI.dump();
+               dbgs() << " with VMOVT: "; MIBuilder.getInstr()->dump());
+    DeadInstructions.push_back(&MI);
+  }
+
+  for (MachineInstr *DeadInstruction : DeadInstructions)
+    DeadInstruction->eraseFromParent();
  return !DeadInstructions.empty();
}
  for (MachineBasicBlock &MBB : Fn) {
    Modified |= ReplaceVCMPsByVPNOTs(MBB);
    Modified |= ReduceOldVCCRValueUses(MBB);
+    Modified |= ConvertVPSEL(MBB);
  }
LLVM_DEBUG(dbgs() << "**************************************\n");
; CHECK-LABEL: icmp_slt_v4i32_y:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vctp.32 r0
-; CHECK-NEXT: vpst
+; CHECK-NEXT: vpstt
; CHECK-NEXT: vcmpt.s32 gt, q1, q0
-; CHECK-NEXT: vpsel q0, q0, q1
+; CHECK-NEXT: vmovt q1, q0
+; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: bx lr
entry:
%c = call <4 x i1> @llvm.arm.mve.vctp32(i32 %n)
; CHECK-LABEL: icmp_slt_v8i16_y:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vctp.16 r0
-; CHECK-NEXT: vpst
+; CHECK-NEXT: vpstt
; CHECK-NEXT: vcmpt.s16 gt, q1, q0
-; CHECK-NEXT: vpsel q0, q0, q1
+; CHECK-NEXT: vmovt q1, q0
+; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: bx lr
entry:
%c = call <8 x i1> @llvm.arm.mve.vctp16(i32 %n)
; CHECK-LABEL: icmp_slt_v16i8_y:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vctp.8 r0
-; CHECK-NEXT: vpst
+; CHECK-NEXT: vpstt
; CHECK-NEXT: vcmpt.s8 gt, q1, q0
-; CHECK-NEXT: vpsel q0, q0, q1
+; CHECK-NEXT: vmovt q1, q0
+; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: bx lr
entry:
%c = call <16 x i1> @llvm.arm.mve.vctp8(i32 %n)
; CHECK-LABEL: icmp_sgt_v4i32_y:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vctp.32 r0
-; CHECK-NEXT: vpst
+; CHECK-NEXT: vpstt
; CHECK-NEXT: vcmpt.s32 gt, q0, q1
-; CHECK-NEXT: vpsel q0, q0, q1
+; CHECK-NEXT: vmovt q1, q0
+; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: bx lr
entry:
%c = call <4 x i1> @llvm.arm.mve.vctp32(i32 %n)
; CHECK-LABEL: icmp_sgt_v8i16_y:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vctp.16 r0
-; CHECK-NEXT: vpst
+; CHECK-NEXT: vpstt
; CHECK-NEXT: vcmpt.s16 gt, q0, q1
-; CHECK-NEXT: vpsel q0, q0, q1
+; CHECK-NEXT: vmovt q1, q0
+; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: bx lr
entry:
%c = call <8 x i1> @llvm.arm.mve.vctp16(i32 %n)
; CHECK-LABEL: icmp_sgt_v16i8_y:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vctp.8 r0
-; CHECK-NEXT: vpst
+; CHECK-NEXT: vpstt
; CHECK-NEXT: vcmpt.s8 gt, q0, q1
-; CHECK-NEXT: vpsel q0, q0, q1
+; CHECK-NEXT: vmovt q1, q0
+; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: bx lr
entry:
%c = call <16 x i1> @llvm.arm.mve.vctp8(i32 %n)
; CHECK-LABEL: icmp_ult_v4i32_y:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vctp.32 r0
-; CHECK-NEXT: vpst
+; CHECK-NEXT: vpstt
; CHECK-NEXT: vcmpt.u32 hi, q1, q0
-; CHECK-NEXT: vpsel q0, q0, q1
+; CHECK-NEXT: vmovt q1, q0
+; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: bx lr
entry:
%c = call <4 x i1> @llvm.arm.mve.vctp32(i32 %n)
; CHECK-LABEL: icmp_ult_v8i16_y:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vctp.16 r0
-; CHECK-NEXT: vpst
+; CHECK-NEXT: vpstt
; CHECK-NEXT: vcmpt.u16 hi, q1, q0
-; CHECK-NEXT: vpsel q0, q0, q1
+; CHECK-NEXT: vmovt q1, q0
+; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: bx lr
entry:
%c = call <8 x i1> @llvm.arm.mve.vctp16(i32 %n)
; CHECK-LABEL: icmp_ult_v16i8_y:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vctp.8 r0
-; CHECK-NEXT: vpst
+; CHECK-NEXT: vpstt
; CHECK-NEXT: vcmpt.u8 hi, q1, q0
-; CHECK-NEXT: vpsel q0, q0, q1
+; CHECK-NEXT: vmovt q1, q0
+; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: bx lr
entry:
%c = call <16 x i1> @llvm.arm.mve.vctp8(i32 %n)
; CHECK-LABEL: icmp_ugt_v4i32_y:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vctp.32 r0
-; CHECK-NEXT: vpst
+; CHECK-NEXT: vpstt
; CHECK-NEXT: vcmpt.u32 hi, q0, q1
-; CHECK-NEXT: vpsel q0, q0, q1
+; CHECK-NEXT: vmovt q1, q0
+; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: bx lr
entry:
%c = call <4 x i1> @llvm.arm.mve.vctp32(i32 %n)
; CHECK-LABEL: icmp_ugt_v8i16_y:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vctp.16 r0
-; CHECK-NEXT: vpst
+; CHECK-NEXT: vpstt
; CHECK-NEXT: vcmpt.u16 hi, q0, q1
-; CHECK-NEXT: vpsel q0, q0, q1
+; CHECK-NEXT: vmovt q1, q0
+; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: bx lr
entry:
%c = call <8 x i1> @llvm.arm.mve.vctp16(i32 %n)
; CHECK-LABEL: icmp_ugt_v16i8_y:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vctp.8 r0
-; CHECK-NEXT: vpst
+; CHECK-NEXT: vpstt
; CHECK-NEXT: vcmpt.u8 hi, q0, q1
-; CHECK-NEXT: vpsel q0, q0, q1
+; CHECK-NEXT: vmovt q1, q0
+; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: bx lr
entry:
%c = call <16 x i1> @llvm.arm.mve.vctp8(i32 %n)
; CHECK-LABEL: fcmp_fast_olt_v4f32_y:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vctp.32 r0
-; CHECK-NEXT: vpst
+; CHECK-NEXT: vpstt
; CHECK-NEXT: vcmpt.f32 gt, q1, q0
-; CHECK-NEXT: vpsel q0, q0, q1
+; CHECK-NEXT: vmovt q1, q0
+; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: bx lr
entry:
%c = call <4 x i1> @llvm.arm.mve.vctp32(i32 %n)
; CHECK-LABEL: fcmp_fast_olt_v8f16_y:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vctp.16 r0
-; CHECK-NEXT: vpst
+; CHECK-NEXT: vpstt
; CHECK-NEXT: vcmpt.f16 gt, q1, q0
-; CHECK-NEXT: vpsel q0, q0, q1
+; CHECK-NEXT: vmovt q1, q0
+; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: bx lr
entry:
%c = call <8 x i1> @llvm.arm.mve.vctp16(i32 %n)
; CHECK-LABEL: fcmp_fast_ogt_v4f32_y:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vctp.32 r0
-; CHECK-NEXT: vpst
+; CHECK-NEXT: vpstt
; CHECK-NEXT: vcmpt.f32 gt, q0, q1
-; CHECK-NEXT: vpsel q0, q0, q1
+; CHECK-NEXT: vmovt q1, q0
+; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: bx lr
entry:
%c = call <4 x i1> @llvm.arm.mve.vctp32(i32 %n)
; CHECK-LABEL: fcmp_fast_ogt_v8f16_y:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vctp.16 r0
-; CHECK-NEXT: vpst
+; CHECK-NEXT: vpstt
; CHECK-NEXT: vcmpt.f16 gt, q0, q1
-; CHECK-NEXT: vpsel q0, q0, q1
+; CHECK-NEXT: vmovt q1, q0
+; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: bx lr
entry:
%c = call <8 x i1> @llvm.arm.mve.vctp16(i32 %n)
; CHECK-NEXT: .vsave {d8, d9}
; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: movs r6, #0
-; CHECK-NEXT: mov r12, r1
; CHECK-NEXT: vidup.u32 q2, r6, #1
-; CHECK-NEXT: cmp r1, #4
-; CHECK-NEXT: it ge
-; CHECK-NEXT: movge.w r12, #4
-; CHECK-NEXT: sub.w r6, r1, r12
-; CHECK-NEXT: adds r6, #3
-; CHECK-NEXT: mov.w lr, #1
; CHECK-NEXT: adr r4, .LCPI0_0
; CHECK-NEXT: vmov.i32 q0, #0x0
-; CHECK-NEXT: add.w lr, lr, r6, lsr #2
; CHECK-NEXT: vldrw.u32 q1, [r4]
; CHECK-NEXT: vmov.i32 q3, #0x4
; CHECK-NEXT: mov r12, r1
-; CHECK-NEXT: dls lr, lr
+; CHECK-NEXT: dlstp.32 lr, r12
; CHECK-NEXT: .LBB0_1: @ %do.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vctp.32 r12
-; CHECK-NEXT: sub.w r12, r12, #4
+; CHECK-NEXT: vldrw.u32 q4, [r0], #16
+; CHECK-NEXT: vcmp.f32 ge, q1, q4
; CHECK-NEXT: vpstt
-; CHECK-NEXT: vldrwt.u32 q4, [r0], #16
-; CHECK-NEXT: vcmpt.f32 ge, q1, q4
-; CHECK-NEXT: vpsel q0, q2, q0
-; CHECK-NEXT: vpsel q1, q4, q1
+; CHECK-NEXT: vmovt q1, q4
+; CHECK-NEXT: vmovt q0, q2
; CHECK-NEXT: vadd.i32 q2, q2, q3
-; CHECK-NEXT: le lr, .LBB0_1
+; CHECK-NEXT: letp lr, .LBB0_1
; CHECK-NEXT: @ %bb.2: @ %do.end
; CHECK-NEXT: vldr s8, .LCPI0_1
; CHECK-NEXT: vdup.32 q3, r1
define void @vctp8(i32 %arg, <16 x i8> *%in, <16 x i8>* %out) {
; CHECK-LABEL: vctp8:
; CHECK: @ %bb.0:
-; CHECK-NEXT: vldrw.u32 q1, [r1]
-; CHECK-NEXT: vctp.8 r0
; CHECK-NEXT: vmov.i32 q0, #0x0
-; CHECK-NEXT: vpsel q0, q1, q0
+; CHECK-NEXT: vctp.8 r0
+; CHECK-NEXT: vldrw.u32 q1, [r1]
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vmovt q0, q1
; CHECK-NEXT: vstrw.32 q0, [r2]
; CHECK-NEXT: bx lr
%pred = call <16 x i1> @llvm.arm.mve.vctp8(i32 %arg)
define void @vctp16(i32 %arg, <8 x i16> *%in, <8 x i16>* %out) {
; CHECK-LABEL: vctp16:
; CHECK: @ %bb.0:
-; CHECK-NEXT: vldrw.u32 q1, [r1]
-; CHECK-NEXT: vctp.16 r0
; CHECK-NEXT: vmov.i32 q0, #0x0
-; CHECK-NEXT: vpsel q0, q1, q0
+; CHECK-NEXT: vctp.16 r0
+; CHECK-NEXT: vldrw.u32 q1, [r1]
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vmovt q0, q1
; CHECK-NEXT: vstrw.32 q0, [r2]
; CHECK-NEXT: bx lr
%pred = call <8 x i1> @llvm.arm.mve.vctp16(i32 %arg)
define void @vctp32(i32 %arg, <4 x i32> *%in, <4 x i32>* %out) {
; CHECK-LABEL: vctp32:
; CHECK: @ %bb.0:
-; CHECK-NEXT: vldrw.u32 q1, [r1]
-; CHECK-NEXT: vctp.32 r0
; CHECK-NEXT: vmov.i32 q0, #0x0
-; CHECK-NEXT: vpsel q0, q1, q0
+; CHECK-NEXT: vctp.32 r0
+; CHECK-NEXT: vldrw.u32 q1, [r1]
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vmovt q0, q1
; CHECK-NEXT: vstrw.32 q0, [r2]
; CHECK-NEXT: bx lr
%pred = call <4 x i1> @llvm.arm.mve.vctp32(i32 %arg)