#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
/// intrinsic. E.g., check that the loop induction variable and the element
/// count are of the form we expect, and also perform overflow checks for
/// the new expressions that are created.
- bool IsSafeActiveMask(IntrinsicInst *ActiveLaneMask, Value *TripCount);
+ const SCEV *IsSafeActiveMask(IntrinsicInst *ActiveLaneMask, Value *TripCount);
/// Insert the intrinsic to represent the effect of tail predication.
- void InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask, Value *TripCount);
+ void InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask, Value *Start);
/// Rematerialize the iteration count in exit blocks, which enables
/// ARMLowOverheadLoops to better optimise away loop update statements inside
// (((ElementCount + (VectorWidth - 1)) / VectorWidth) - TripCount
// 3) The IV must be an induction phi with an increment equal to the
// vector width.
-bool MVETailPredication::IsSafeActiveMask(IntrinsicInst *ActiveLaneMask,
- Value *TripCount) {
+const SCEV *MVETailPredication::IsSafeActiveMask(IntrinsicInst *ActiveLaneMask,
+ Value *TripCount) {
bool ForceTailPredication =
EnableTailPredication == TailPredication::ForceEnabledNoReductions ||
EnableTailPredication == TailPredication::ForceEnabled;
Value *ElemCount = ActiveLaneMask->getOperand(1);
bool Changed = false;
if (!L->makeLoopInvariant(ElemCount, Changed))
- return false;
+ return nullptr;
auto *EC= SE->getSCEV(ElemCount);
auto *TC = SE->getSCEV(TripCount);
cast<FixedVectorType>(ActiveLaneMask->getType())->getNumElements();
if (VectorWidth != 2 && VectorWidth != 4 && VectorWidth != 8 &&
VectorWidth != 16)
- return false;
+ return nullptr;
ConstantInt *ConstElemCount = nullptr;
// 1) Smoke tests that the original scalar loop TripCount (TC) belongs to
// processed by the loop, so we will refer to that from this point on.
if (!SE->isLoopInvariant(EC, L)) {
LLVM_DEBUG(dbgs() << "ARM TP: element count must be loop invariant.\n");
- return false;
+ return nullptr;
+ }
+
+ // 2) Find out if IV is an induction phi. Note that we can't use Loop
+ // helpers here to get the induction variable, because the hardware loop is
+ // no longer in loopsimplify form, and also the hwloop intrinsic uses a
+ // different counter. Using SCEV, we check that the induction is of the
+ // form i = i + 4, where the increment must be equal to the VectorWidth.
+ auto *IV = ActiveLaneMask->getOperand(0);
+ auto *IVExpr = SE->getSCEV(IV);
+ auto *AddExpr = dyn_cast<SCEVAddRecExpr>(IVExpr);
+
+ if (!AddExpr) {
+ LLVM_DEBUG(dbgs() << "ARM TP: induction not an add expr: "; IVExpr->dump());
+ return nullptr;
+ }
+ // Check that this AddRec is associated with this loop.
+ if (AddExpr->getLoop() != L) {
+ LLVM_DEBUG(dbgs() << "ARM TP: phi not part of this loop\n");
+ return nullptr;
+ }
+ auto *Step = dyn_cast<SCEVConstant>(AddExpr->getOperand(1));
+ if (!Step) {
+ LLVM_DEBUG(dbgs() << "ARM TP: induction step is not a constant: ";
+ AddExpr->getOperand(1)->dump());
+ return nullptr;
+ }
+ auto StepValue = Step->getValue()->getSExtValue();
+ if (VectorWidth != StepValue) {
+ LLVM_DEBUG(dbgs() << "ARM TP: Step value " << StepValue
+ << " doesn't match vector width " << VectorWidth << "\n");
+ return nullptr;
}
if ((ConstElemCount = dyn_cast<ConstantInt>(ElemCount))) {
if (!TC) {
LLVM_DEBUG(dbgs() << "ARM TP: Constant tripcount expected in "
"set.loop.iterations\n");
- return false;
+ return nullptr;
}
// Calculate 2 tripcount values and check that they are consistent with
LLVM_DEBUG(dbgs() << "ARM TP: inconsistent constant tripcount values: "
<< TC1 << " from set.loop.iterations, and "
<< TC2 << " from get.active.lane.mask\n");
- return false;
+ return nullptr;
}
} else if (!ForceTailPredication) {
- // 2) We need to prove that the sub expression that we create in the
+ // 3) We need to prove that the sub expression that we create in the
// tail-predicated loop body, which calculates the remaining elements to be
// processed, is non-negative, i.e. it doesn't overflow:
//
//
auto *VW = SE->getSCEV(ConstantInt::get(TripCount->getType(), VectorWidth));
// ElementCount + (VW-1):
+ auto *Start = AddExpr->getStart();
auto *ECPlusVWMinus1 = SE->getAddExpr(EC,
SE->getSCEV(ConstantInt::get(TripCount->getType(), VectorWidth - 1)));
// Prevent unused variable warnings with TC
(void)TC;
- LLVM_DEBUG(
+ LLVM_DEBUG({
dbgs() << "ARM TP: Analysing overflow behaviour for:\n";
- dbgs() << "ARM TP: - TripCount = "; TC->dump();
- dbgs() << "ARM TP: - ElemCount = "; EC->dump();
+ dbgs() << "ARM TP: - TripCount = " << *TC << "\n";
+ dbgs() << "ARM TP: - ElemCount = " << *EC << "\n";
+ dbgs() << "ARM TP: - Start = " << *Start << "\n";
+ dbgs() << "ARM TP: - BETC = " << *SE->getBackedgeTakenCount(L) << "\n";
dbgs() << "ARM TP: - VecWidth = " << VectorWidth << "\n";
- dbgs() << "ARM TP: - (ElemCount+VW-1) / VW = "; Ceil->dump();
- );
+ dbgs() << "ARM TP: - (ElemCount+VW-1) / VW = " << *Ceil << "\n";
+ });
// As an example, almost all the tripcount expressions (produced by the
// vectoriser) look like this:
//
- // TC = ((-4 + (4 * ((3 + %N) /u 4))<nuw>) /u 4)
+ // TC = ((-4 + (4 * ((3 + %N) /u 4))<nuw> - start) /u 4)
//
// and "ElementCount + (VW-1) / VW":
//
// Check for equality of TC and Ceil by calculating SCEV expression
// TC - Ceil and test it for zero.
//
- const SCEV *Sub =
- SE->getMinusSCEV(SE->getBackedgeTakenCount(L),
- SE->getUDivExpr(SE->getAddExpr(SE->getMulExpr(Ceil, VW),
- SE->getNegativeSCEV(VW)),
- VW));
+ const SCEV *Div = SE->getUDivExpr(
+ SE->getAddExpr(SE->getMulExpr(Ceil, VW), SE->getNegativeSCEV(VW),
+ SE->getNegativeSCEV(Start)),
+ VW);
+ const SCEV *Sub = SE->getMinusSCEV(SE->getBackedgeTakenCount(L), Div);
+ LLVM_DEBUG(dbgs() << "ARM TP: - Sub = "; Sub->dump());
// Use context sensitive facts about the path to the loop to refine. This
// comes up as the backedge taken count can incorporate context sensitive
// reasoning, and our RHS just above doesn't.
Sub = SE->applyLoopGuards(Sub, L);
+ LLVM_DEBUG(dbgs() << "ARM TP: - (Guarded) = "; Sub->dump());
if (!Sub->isZero()) {
LLVM_DEBUG(dbgs() << "ARM TP: possible overflow in sub expression.\n");
- return false;
+ return nullptr;
}
}
- // 3) Find out if IV is an induction phi. Note that we can't use Loop
- // helpers here to get the induction variable, because the hardware loop is
- // no longer in loopsimplify form, and also the hwloop intrinsic uses a
- // different counter. Using SCEV, we check that the induction is of the
- // form i = i + 4, where the increment must be equal to the VectorWidth.
- auto *IV = ActiveLaneMask->getOperand(0);
- auto *IVExpr = SE->getSCEV(IV);
- auto *AddExpr = dyn_cast<SCEVAddRecExpr>(IVExpr);
-
- if (!AddExpr) {
- LLVM_DEBUG(dbgs() << "ARM TP: induction not an add expr: "; IVExpr->dump());
- return false;
+ // Check that the start value is a multiple of the VectorWidth.
+ // TODO: This could do with a method to check if the SCEV is a multiple of
+ // VectorWidth. For the moment we just check for constants, muls and unknowns
+ // (which use MaskedValueIsZero and seem to be the most common).
+ if (auto *BaseC = dyn_cast<SCEVConstant>(AddExpr->getStart())) {
+ if (BaseC->getAPInt().urem(VectorWidth) == 0)
+ return SE->getMinusSCEV(EC, BaseC);
+ } else if (auto *BaseV = dyn_cast<SCEVUnknown>(AddExpr->getStart())) {
+ Type *Ty = BaseV->getType();
+ APInt Mask = APInt::getLowBitsSet(Ty->getPrimitiveSizeInBits(),
+ Log2_64(VectorWidth));
+ if (MaskedValueIsZero(BaseV->getValue(), Mask,
+ L->getHeader()->getModule()->getDataLayout()))
+ return SE->getMinusSCEV(EC, BaseV);
+ } else if (auto *BaseMul = dyn_cast<SCEVMulExpr>(AddExpr->getStart())) {
+ if (auto *BaseC = dyn_cast<SCEVConstant>(BaseMul->getOperand(0)))
+ if (BaseC->getAPInt().urem(VectorWidth) == 0)
+ return SE->getMinusSCEV(EC, BaseC);
+ if (auto *BaseC = dyn_cast<SCEVConstant>(BaseMul->getOperand(1)))
+ if (BaseC->getAPInt().urem(VectorWidth) == 0)
+ return SE->getMinusSCEV(EC, BaseC);
}
- // Check that this AddRec is associated with this loop.
- if (AddExpr->getLoop() != L) {
- LLVM_DEBUG(dbgs() << "ARM TP: phi not part of this loop\n");
- return false;
- }
- auto *Base = dyn_cast<SCEVConstant>(AddExpr->getOperand(0));
- if (!Base || !Base->isZero()) {
- LLVM_DEBUG(dbgs() << "ARM TP: induction base is not 0\n");
- return false;
- }
- auto *Step = dyn_cast<SCEVConstant>(AddExpr->getOperand(1));
- if (!Step) {
- LLVM_DEBUG(dbgs() << "ARM TP: induction step is not a constant: ";
- AddExpr->getOperand(1)->dump());
- return false;
- }
- auto StepValue = Step->getValue()->getSExtValue();
- if (VectorWidth == StepValue)
- return true;
-
- LLVM_DEBUG(dbgs() << "ARM TP: Step value " << StepValue
- << " doesn't match vector width " << VectorWidth << "\n");
- return false;
+ LLVM_DEBUG(
+ dbgs() << "ARM TP: induction base is not know to be a multiple of VF: "
+ << *AddExpr->getOperand(0) << "\n");
+ return nullptr;
}
void MVETailPredication::InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask,
- Value *TripCount) {
+ Value *Start) {
IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
Module *M = L->getHeader()->getModule();
Type *Ty = IntegerType::get(M->getContext(), 32);
// Insert a phi to count the number of elements processed by the loop.
Builder.SetInsertPoint(L->getHeader()->getFirstNonPHI());
PHINode *Processed = Builder.CreatePHI(Ty, 2);
- Processed->addIncoming(ActiveLaneMask->getOperand(1), L->getLoopPreheader());
+ Processed->addIncoming(Start, L->getLoopPreheader());
// Replace @llvm.get.active.mask() with the ARM specific VCTP intrinsic, and
// thus represent the effect of tail predication.
LLVM_DEBUG(dbgs() << "ARM TP: Found active lane mask: "
<< *ActiveLaneMask << "\n");
- if (!IsSafeActiveMask(ActiveLaneMask, TripCount)) {
+ const SCEV *StartSCEV = IsSafeActiveMask(ActiveLaneMask, TripCount);
+ if (!StartSCEV) {
LLVM_DEBUG(dbgs() << "ARM TP: Not safe to insert VCTP.\n");
return false;
}
- LLVM_DEBUG(dbgs() << "ARM TP: Safe to insert VCTP.\n");
- InsertVCTPIntrinsic(ActiveLaneMask, TripCount);
+ LLVM_DEBUG(dbgs() << "ARM TP: Safe to insert VCTP. Start is " << *StartSCEV
+ << "\n");
+ SCEVExpander Expander(*SE, L->getHeader()->getModule()->getDataLayout(),
+ "start");
+ Instruction *Ins = L->getLoopPreheader()->getTerminator();
+ Value *Start = Expander.expandCodeFor(StartSCEV, StartSCEV->getType(), Ins);
+ LLVM_DEBUG(dbgs() << "ARM TP: Created start value " << *Start << "\n");
+ InsertVCTPIntrinsic(ActiveLaneMask, Start);
}
// Remove dead instructions and now dead phis.
; ENABLED-NEXT: .LBB0_4: @ %for.body
; ENABLED-NEXT: @ =>This Loop Header: Depth=1
; ENABLED-NEXT: @ Child Loop BB0_6 Depth 2
-; ENABLED-NEXT: cmp r2, r8
-; ENABLED-NEXT: ble .LBB0_2
+; ENABLED-NEXT: cmp r8, r2
+; ENABLED-NEXT: bge .LBB0_2
; ENABLED-NEXT: @ %bb.5: @ %vector.ph
; ENABLED-NEXT: @ in Loop: Header=BB0_4 Depth=1
; ENABLED-NEXT: bic r0, r9, #3
; NOREDUCTIONS-NEXT: .LBB0_4: @ %for.body
; NOREDUCTIONS-NEXT: @ =>This Loop Header: Depth=1
; NOREDUCTIONS-NEXT: @ Child Loop BB0_6 Depth 2
-; NOREDUCTIONS-NEXT: cmp r2, r8
-; NOREDUCTIONS-NEXT: ble .LBB0_2
+; NOREDUCTIONS-NEXT: cmp r8, r2
+; NOREDUCTIONS-NEXT: bge .LBB0_2
; NOREDUCTIONS-NEXT: @ %bb.5: @ %vector.ph
; NOREDUCTIONS-NEXT: @ in Loop: Header=BB0_4 Depth=1
; NOREDUCTIONS-NEXT: bic r0, r9, #3
; CHECK-NEXT: .pad #12
; CHECK-NEXT: sub sp, #12
; CHECK-NEXT: cmp r3, #1
-; CHECK-NEXT: strd r0, r1, [sp, #4] @ 8-byte Folded Spill
+; CHECK-NEXT: strd r0, r1, [sp] @ 8-byte Folded Spill
+; CHECK-NEXT: mov r1, r3
+; CHECK-NEXT: str r3, [sp, #8] @ 4-byte Spill
; CHECK-NEXT: blt .LBB4_12
; CHECK-NEXT: @ %bb.1: @ %for.body.lr.ph
-; CHECK-NEXT: ldr r7, [sp, #48]
-; CHECK-NEXT: mov r0, r3
-; CHECK-NEXT: ldr.w r9, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT: adds r3, r2, #3
-; CHECK-NEXT: mov.w r11, #0
-; CHECK-NEXT: mov r10, r2
-; CHECK-NEXT: uxth.w r12, r7
-; CHECK-NEXT: adr r7, .LCPI4_0
-; CHECK-NEXT: vldrw.u32 q0, [r7]
-; CHECK-NEXT: str r2, [sp] @ 4-byte Spill
+; CHECK-NEXT: ldr r1, [sp, #48]
+; CHECK-NEXT: add.w r12, r2, #3
+; CHECK-NEXT: ldr.w r11, [sp] @ 4-byte Reload
+; CHECK-NEXT: mov.w r10, #0
+; CHECK-NEXT: mov r8, r2
+; CHECK-NEXT: mov r0, r2
+; CHECK-NEXT: uxth r3, r1
; CHECK-NEXT: b .LBB4_4
; CHECK-NEXT: .LBB4_2: @ in Loop: Header=BB4_4 Depth=1
; CHECK-NEXT: movs r6, #0
; CHECK-NEXT: .LBB4_3: @ %for.end
; CHECK-NEXT: @ in Loop: Header=BB4_4 Depth=1
-; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload
-; CHECK-NEXT: lsrs r1, r6, #16
-; CHECK-NEXT: subs r3, #1
-; CHECK-NEXT: add.w r9, r9, #2
-; CHECK-NEXT: sub.w r10, r10, #1
-; CHECK-NEXT: strh.w r1, [r7, r11, lsl #1]
-; CHECK-NEXT: add.w r11, r11, #1
-; CHECK-NEXT: cmp r11, r0
+; CHECK-NEXT: ldr r7, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: lsrs r2, r6, #16
+; CHECK-NEXT: sub.w r12, r12, #1
+; CHECK-NEXT: add.w r11, r11, #2
+; CHECK-NEXT: sub.w r8, r8, #1
+; CHECK-NEXT: strh.w r2, [r7, r10, lsl #1]
+; CHECK-NEXT: add.w r10, r10, #1
+; CHECK-NEXT: ldr r2, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT: cmp r10, r2
+; CHECK-NEXT: mov r2, r0
; CHECK-NEXT: beq .LBB4_12
; CHECK-NEXT: .LBB4_4: @ %for.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB4_8 Depth 2
; CHECK-NEXT: @ Child Loop BB4_11 Depth 2
-; CHECK-NEXT: cmp r2, r11
+; CHECK-NEXT: cmp r2, r10
; CHECK-NEXT: ble .LBB4_2
; CHECK-NEXT: @ %bb.5: @ %vector.main.loop.iter.check
; CHECK-NEXT: @ in Loop: Header=BB4_4 Depth=1
-; CHECK-NEXT: sub.w r8, r2, r11
-; CHECK-NEXT: cmp.w r8, #8
+; CHECK-NEXT: sub.w r4, r2, r10
+; CHECK-NEXT: cmp r4, #8
; CHECK-NEXT: bhs .LBB4_7
; CHECK-NEXT: @ %bb.6: @ in Loop: Header=BB4_4 Depth=1
; CHECK-NEXT: movs r6, #0
-; CHECK-NEXT: movs r7, #0
+; CHECK-NEXT: mov.w r9, #0
; CHECK-NEXT: b .LBB4_10
; CHECK-NEXT: .LBB4_7: @ %vector.ph
; CHECK-NEXT: @ in Loop: Header=BB4_4 Depth=1
-; CHECK-NEXT: bic r7, r10, #7
-; CHECK-NEXT: movs r1, #1
-; CHECK-NEXT: subs r7, #8
+; CHECK-NEXT: bic r2, r8, #7
+; CHECK-NEXT: movs r7, #1
+; CHECK-NEXT: subs r2, #8
+; CHECK-NEXT: bic r9, r4, #7
; CHECK-NEXT: movs r6, #0
-; CHECK-NEXT: mov r5, r9
-; CHECK-NEXT: add.w lr, r1, r7, lsr #3
-; CHECK-NEXT: bic r7, r8, #7
-; CHECK-NEXT: ldr r4, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: mov r5, r11
+; CHECK-NEXT: add.w lr, r7, r2, lsr #3
+; CHECK-NEXT: ldr r2, [sp] @ 4-byte Reload
; CHECK-NEXT: .LBB4_8: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB4_4 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT: vldrh.u16 q1, [r4], #16
-; CHECK-NEXT: vldrh.u16 q2, [r5], #16
-; CHECK-NEXT: rsb.w r1, r12, #0
-; CHECK-NEXT: vmullb.s16 q3, q2, q1
-; CHECK-NEXT: vmullt.s16 q1, q2, q1
-; CHECK-NEXT: vshl.s32 q3, r1
-; CHECK-NEXT: vshl.s32 q1, r1
-; CHECK-NEXT: vaddva.u32 r6, q3
-; CHECK-NEXT: vaddva.u32 r6, q1
+; CHECK-NEXT: vldrh.u16 q0, [r2], #16
+; CHECK-NEXT: vldrh.u16 q1, [r5], #16
+; CHECK-NEXT: rsbs r7, r3, #0
+; CHECK-NEXT: vmullb.s16 q2, q1, q0
+; CHECK-NEXT: vmullt.s16 q0, q1, q0
+; CHECK-NEXT: vshl.s32 q2, r7
+; CHECK-NEXT: vshl.s32 q0, r7
+; CHECK-NEXT: vaddva.u32 r6, q2
+; CHECK-NEXT: vaddva.u32 r6, q0
; CHECK-NEXT: le lr, .LBB4_8
; CHECK-NEXT: @ %bb.9: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB4_4 Depth=1
-; CHECK-NEXT: cmp r8, r7
+; CHECK-NEXT: cmp r4, r9
; CHECK-NEXT: beq .LBB4_3
; CHECK-NEXT: .LBB4_10: @ %vec.epilog.ph
; CHECK-NEXT: @ in Loop: Header=BB4_4 Depth=1
-; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT: add.w r5, r7, r11
-; CHECK-NEXT: bic lr, r3, #3
-; CHECK-NEXT: mov r2, r0
-; CHECK-NEXT: add.w r4, r1, r7, lsl #1
-; CHECK-NEXT: add.w r5, r1, r5, lsl #1
-; CHECK-NEXT: sub.w r1, lr, r7
-; CHECK-NEXT: movs r0, #1
-; CHECK-NEXT: subs r1, #4
-; CHECK-NEXT: add.w lr, r0, r1, lsr #2
-; CHECK-NEXT: mov r0, r2
-; CHECK-NEXT: ldr r2, [sp] @ 4-byte Reload
+; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload
+; CHECK-NEXT: add.w r2, r9, r10
+; CHECK-NEXT: sub.w r5, r8, r9
+; CHECK-NEXT: add.w r7, r1, r9, lsl #1
+; CHECK-NEXT: add.w r2, r1, r2, lsl #1
+; CHECK-NEXT: dlstp.32 lr, r5
; CHECK-NEXT: .LBB4_11: @ %vec.epilog.vector.body
; CHECK-NEXT: @ Parent Loop BB4_4 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT: vqadd.u32 q1, q0, r7
-; CHECK-NEXT: vdup.32 q2, r8
-; CHECK-NEXT: rsb.w r1, r12, #0
-; CHECK-NEXT: vptt.u32 hi, q2, q1
-; CHECK-NEXT: vldrht.s32 q1, [r4], #8
-; CHECK-NEXT: vldrht.s32 q2, [r5], #8
-; CHECK-NEXT: adds r7, #4
-; CHECK-NEXT: vmul.i32 q1, q2, q1
-; CHECK-NEXT: vshl.s32 q1, r1
-; CHECK-NEXT: vpst
-; CHECK-NEXT: vaddvat.u32 r6, q1
-; CHECK-NEXT: le lr, .LBB4_11
+; CHECK-NEXT: rsbs r4, r3, #0
+; CHECK-NEXT: vldrh.s32 q0, [r7], #8
+; CHECK-NEXT: vldrh.s32 q1, [r2], #8
+; CHECK-NEXT: vmul.i32 q0, q1, q0
+; CHECK-NEXT: vshl.s32 q0, r4
+; CHECK-NEXT: vaddva.u32 r6, q0
+; CHECK-NEXT: letp lr, .LBB4_11
; CHECK-NEXT: b .LBB4_3
; CHECK-NEXT: .LBB4_12: @ %for.end17
; CHECK-NEXT: add sp, #12
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
-; CHECK-NEXT: .p2align 4
-; CHECK-NEXT: @ %bb.13:
-; CHECK-NEXT: .LCPI4_0:
-; CHECK-NEXT: .long 0 @ 0x0
-; CHECK-NEXT: .long 1 @ 0x1
-; CHECK-NEXT: .long 2 @ 0x2
-; CHECK-NEXT: .long 3 @ 0x3
entry:
%conv = sext i16 %Ls to i32
%cmp31 = icmp sgt i16 %Ls, 0
define arm_aapcs_vfpcc void @start12(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, float %a, i32 %n) {
; CHECK-LABEL: start12:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, r5, r7, lr}
-; CHECK-NEXT: push {r4, r5, r7, lr}
+; CHECK-NEXT: .save {r4, lr}
+; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: cmp r3, #1
-; CHECK-NEXT: blt .LBB0_3
-; CHECK-NEXT: @ %bb.1: @ %vector.ph
+; CHECK-NEXT: it lt
+; CHECK-NEXT: poplt {r4, pc}
+; CHECK-NEXT: .LBB0_1: @ %vector.ph
; CHECK-NEXT: vmov r12, s0
-; CHECK-NEXT: adds r4, r3, #3
-; CHECK-NEXT: bic r4, r4, #3
-; CHECK-NEXT: adr r5, .LCPI0_0
-; CHECK-NEXT: sub.w lr, r4, #16
-; CHECK-NEXT: movs r4, #1
+; CHECK-NEXT: subs r3, #12
; CHECK-NEXT: adds r0, #48
; CHECK-NEXT: adds r1, #48
-; CHECK-NEXT: add.w lr, r4, lr, lsr #2
; CHECK-NEXT: adds r2, #48
-; CHECK-NEXT: vldrw.u32 q0, [r5]
-; CHECK-NEXT: movs r4, #12
-; CHECK-NEXT: vdup.32 q1, r3
+; CHECK-NEXT: dlstp.32 lr, r3
; CHECK-NEXT: .LBB0_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vqadd.u32 q2, q0, r4
-; CHECK-NEXT: adds r4, #4
-; CHECK-NEXT: vptt.u32 hi, q1, q2
-; CHECK-NEXT: vldrwt.u32 q2, [r1], #16
-; CHECK-NEXT: vldrwt.u32 q3, [r0], #16
-; CHECK-NEXT: vfmas.f32 q3, q2, r12
-; CHECK-NEXT: vpst
-; CHECK-NEXT: vstrwt.32 q3, [r2], #16
-; CHECK-NEXT: le lr, .LBB0_2
-; CHECK-NEXT: .LBB0_3: @ %for.cond.cleanup
-; CHECK-NEXT: pop {r4, r5, r7, pc}
-; CHECK-NEXT: .p2align 4
-; CHECK-NEXT: @ %bb.4:
-; CHECK-NEXT: .LCPI0_0:
-; CHECK-NEXT: .long 0 @ 0x0
-; CHECK-NEXT: .long 1 @ 0x1
-; CHECK-NEXT: .long 2 @ 0x2
-; CHECK-NEXT: .long 3 @ 0x3
+; CHECK-NEXT: vldrw.u32 q0, [r1], #16
+; CHECK-NEXT: vldrw.u32 q1, [r0], #16
+; CHECK-NEXT: vfmas.f32 q1, q0, r12
+; CHECK-NEXT: vstrw.32 q1, [r2], #16
+; CHECK-NEXT: letp lr, .LBB0_2
+; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
+; CHECK-NEXT: pop {r4, pc}
entry:
%cmp8 = icmp sgt i32 %n, 0
br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
define arm_aapcs_vfpcc void @startSmod4(i32 %S, ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, float %a, i32 %n) {
; CHECK-LABEL: startSmod4:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, r5, r6, lr}
-; CHECK-NEXT: push {r4, r5, r6, lr}
-; CHECK-NEXT: ldr r6, [sp, #16]
-; CHECK-NEXT: cmp r6, #1
-; CHECK-NEXT: blt .LBB3_3
-; CHECK-NEXT: @ %bb.1: @ %vector.ph
+; CHECK-NEXT: .save {r4, lr}
+; CHECK-NEXT: push {r4, lr}
+; CHECK-NEXT: ldr.w lr, [sp, #8]
+; CHECK-NEXT: cmp.w lr, #1
+; CHECK-NEXT: it lt
+; CHECK-NEXT: poplt {r4, pc}
+; CHECK-NEXT: .LBB3_1: @ %vector.ph
; CHECK-NEXT: vmov r12, s0
; CHECK-NEXT: mvn r4, #12
; CHECK-NEXT: and.w r4, r4, r0, lsl #2
-; CHECK-NEXT: bic r0, r0, #3
; CHECK-NEXT: add r1, r4
; CHECK-NEXT: add r2, r4
; CHECK-NEXT: add r3, r4
-; CHECK-NEXT: adds r4, r6, #3
-; CHECK-NEXT: bic r4, r4, #3
-; CHECK-NEXT: movs r5, #1
-; CHECK-NEXT: subs r4, r4, r0
-; CHECK-NEXT: vdup.32 q1, r6
-; CHECK-NEXT: subs r4, #4
-; CHECK-NEXT: add.w lr, r5, r4, lsr #2
-; CHECK-NEXT: adr r4, .LCPI3_0
-; CHECK-NEXT: vldrw.u32 q0, [r4]
+; CHECK-NEXT: sub.w r0, lr, #4
+; CHECK-NEXT: dlstp.32 lr, r0
; CHECK-NEXT: .LBB3_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vqadd.u32 q2, q0, r0
-; CHECK-NEXT: adds r0, #4
-; CHECK-NEXT: vptt.u32 hi, q1, q2
-; CHECK-NEXT: vldrwt.u32 q2, [r2], #16
-; CHECK-NEXT: vldrwt.u32 q3, [r1], #16
-; CHECK-NEXT: vfmas.f32 q3, q2, r12
-; CHECK-NEXT: vpst
-; CHECK-NEXT: vstrwt.32 q3, [r3], #16
-; CHECK-NEXT: le lr, .LBB3_2
-; CHECK-NEXT: .LBB3_3: @ %for.cond.cleanup
-; CHECK-NEXT: pop {r4, r5, r6, pc}
-; CHECK-NEXT: .p2align 4
-; CHECK-NEXT: @ %bb.4:
-; CHECK-NEXT: .LCPI3_0:
-; CHECK-NEXT: .long 0 @ 0x0
-; CHECK-NEXT: .long 1 @ 0x1
-; CHECK-NEXT: .long 2 @ 0x2
-; CHECK-NEXT: .long 3 @ 0x3
+; CHECK-NEXT: vldrw.u32 q0, [r2], #16
+; CHECK-NEXT: vldrw.u32 q1, [r1], #16
+; CHECK-NEXT: vfmas.f32 q1, q0, r12
+; CHECK-NEXT: vstrw.32 q1, [r3], #16
+; CHECK-NEXT: letp lr, .LBB3_2
+; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
+; CHECK-NEXT: pop {r4, pc}
entry:
%cmp8 = icmp sgt i32 %n, 0
br i1 %cmp8, label %vector.ph, label %for.cond.cleanup