// for now LSR only handles innermost loops).
if (AR->getLoop() != L) {
// If the AddRec exists, consider it's register free and leave it alone.
- if (isExistingPhi(AR, *SE))
+ if (isExistingPhi(AR, *SE) && !TTI->shouldFavorPostInc())
return;
// It is bad to allow LSR for current loop to add induction variables
void NarrowSearchSpaceByCollapsingUnrolledCode();
void NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
void NarrowSearchSpaceByFilterFormulaWithSameScaledReg();
+ void NarrowSearchSpaceByFilterPostInc();
void NarrowSearchSpaceByDeletingCostlyFormulas();
void NarrowSearchSpaceByPickingWinnerRegs();
void NarrowSearchSpaceUsingHeuristics();
});
}
+/// If over the complexity limit on a post-inc favoring target, keep only the
+/// fewest-register formulae of each address use with legal post-inc access.
+void LSRInstance::NarrowSearchSpaceByFilterPostInc() {
+ if (!TTI.shouldFavorPostInc()) // Only for targets that favor post-inc.
+ return;
+ if (EstimateSearchSpaceComplexity() < ComplexityLimit) // Nothing to prune.
+ return;
+
+ LLVM_DEBUG(dbgs() << "The search space is too complex.\n"
+ "Narrowing the search space by choosing the lowest "
+ "register Formula for PostInc Uses.\n");
+
+ for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+ LSRUse &LU = Uses[LUIdx];
+
+ if (LU.Kind != LSRUse::Address) // Only address uses are filtered.
+ continue;
+ if (!TTI.isIndexedLoadLegal(TTI.MIM_PostInc, LU.AccessTy.getType()) &&
+ !TTI.isIndexedStoreLegal(TTI.MIM_PostInc, LU.AccessTy.getType()))
+ continue; // Neither a post-inc load nor store is legal for this type.
+
+ size_t MinRegs = std::numeric_limits<size_t>::max(); // Min over LU.Formulae.
+ for (const Formula &F : LU.Formulae)
+ MinRegs = std::min(F.getNumRegs(), MinRegs);
+
+ bool Any = false; // Set once at least one formula has been deleted.
+ for (size_t FIdx = 0, NumForms = LU.Formulae.size(); FIdx != NumForms;
+ ++FIdx) {
+ Formula &F = LU.Formulae[FIdx];
+ if (F.getNumRegs() > MinRegs) { // Uses more registers than the minimum.
+ LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
+ dbgs() << "\n");
+ LU.DeleteFormula(F);
+ --FIdx; // Undo the upcoming ++FIdx so this index is examined again.
+ --NumForms; // One fewer formula left to visit.
+ Any = true;
+ }
+ }
+ if (Any) // Recompute registers only when a formula was actually removed.
+ LU.RecomputeRegs(LUIdx, RegUses);
+
+ if (EstimateSearchSpaceComplexity() < ComplexityLimit) // Stop once under.
+ break;
+ }
+
+ LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
+}
+
/// The function delete formulas with high registers number expectation.
/// Assuming we don't know the value of each formula (already delete
/// all inefficient), generate probability of not selecting for each
NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
if (FilterSameScaledReg)
NarrowSearchSpaceByFilterFormulaWithSameScaledReg();
+ NarrowSearchSpaceByFilterPostInc();
if (LSRExpNarrow)
NarrowSearchSpaceByDeletingCostlyFormulas();
else
define dso_local void @test_nested(half* noalias nocapture %pInT1, half* noalias nocapture readonly %pOutT1, half* noalias nocapture readonly %pPRT_in, half* noalias nocapture readnone %pPRT_pDst, i32 %numRows, i32 %numCols, i32 %l, half *%ina) local_unnamed_addr #0 {
; CHECK-LABEL: test_nested:
; CHECK: @ %bb.0: @ %for.body.us.preheader
-; CHECK-NEXT: .save {r4, r5, r6, r7, lr}
-; CHECK-NEXT: push {r4, r5, r6, r7, lr}
-; CHECK-NEXT: ldrd lr, r12, [sp, #20]
+; CHECK-NEXT: .save {r4, r5, r6, lr}
+; CHECK-NEXT: push {r4, r5, r6, lr}
+; CHECK-NEXT: ldrd lr, r12, [sp, #16]
; CHECK-NEXT: lsl.w r3, r12, #1
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB14_1: @ %for.body.us
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB14_2 Depth 2
; CHECK-NEXT: ldrh r4, [r1]
-; CHECK-NEXT: mov r5, r12
+; CHECK-NEXT: mov r5, r2
+; CHECK-NEXT: mov r6, r12
; CHECK-NEXT: vdup.16 q0, r4
-; CHECK-NEXT: movs r4, #0
+; CHECK-NEXT: mov r4, r0
; CHECK-NEXT: .LBB14_2: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB14_1 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT: adds r6, r0, r4
-; CHECK-NEXT: adds r7, r2, r4
-; CHECK-NEXT: vldrw.u32 q1, [r7]
-; CHECK-NEXT: vldrw.u32 q2, [r6]
-; CHECK-NEXT: adds r4, #16
-; CHECK-NEXT: subs r5, #8
+; CHECK-NEXT: vldrw.u32 q1, [r5], #16
+; CHECK-NEXT: vldrw.u32 q2, [r4]
+; CHECK-NEXT: subs r6, #8
; CHECK-NEXT: vfms.f16 q2, q1, q0
-; CHECK-NEXT: vstrw.32 q2, [r6]
+; CHECK-NEXT: vstrb.8 q2, [r4], #16
; CHECK-NEXT: bne .LBB14_2
; CHECK-NEXT: @ %bb.3: @ %for.cond6.for.end_crit_edge.us
; CHECK-NEXT: @ in Loop: Header=BB14_1 Depth=1
; CHECK-NEXT: adds r1, #2
; CHECK-NEXT: le lr, .LBB14_1
; CHECK-NEXT: @ %bb.4: @ %for.end14
-; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
+; CHECK-NEXT: pop {r4, r5, r6, pc}
for.body.us.preheader:
%in = load half, half* %ina
%cmp = icmp sgt i32 %numRows, 0
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: .pad #16
; CHECK-NEXT: sub sp, #16
-; CHECK-NEXT: ldrh.w r10, [r0]
-; CHECK-NEXT: mov r5, r3
+; CHECK-NEXT: ldrh r5, [r0]
; CHECK-NEXT: ldr.w r12, [r0, #4]
-; CHECK-NEXT: sub.w r6, r10, #1
+; CHECK-NEXT: subs r6, r5, #1
; CHECK-NEXT: cmp r6, #3
; CHECK-NEXT: bhi .LBB15_6
; CHECK-NEXT: @ %bb.1: @ %if.then
; CHECK-NEXT: ldr r7, [r0, #8]
; CHECK-NEXT: add.w r4, r12, r6, lsl #1
-; CHECK-NEXT: lsr.w lr, r5, #2
-; CHECK-NEXT: ldrh r3, [r7, #6]
+; CHECK-NEXT: lsr.w lr, r3, #2
+; CHECK-NEXT: ldrh.w r8, [r7, #6]
; CHECK-NEXT: ldrh.w r9, [r7, #4]
-; CHECK-NEXT: ldrh.w r8, [r7, #2]
+; CHECK-NEXT: ldrh r6, [r7, #2]
; CHECK-NEXT: ldrh r7, [r7]
; CHECK-NEXT: wls lr, lr, .LBB15_5
; CHECK-NEXT: @ %bb.2: @ %while.body.lr.ph
-; CHECK-NEXT: strd r5, r10, [sp, #8] @ 8-byte Folded Spill
-; CHECK-NEXT: bic r5, r5, #3
-; CHECK-NEXT: str r0, [sp] @ 4-byte Spill
-; CHECK-NEXT: mov.w r10, #0
-; CHECK-NEXT: mov r0, r5
+; CHECK-NEXT: str r5, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT: bic r5, r3, #3
+; CHECK-NEXT: add.w r10, r12, #2
+; CHECK-NEXT: str r5, [sp] @ 4-byte Spill
; CHECK-NEXT: add.w r5, r2, r5, lsl #1
; CHECK-NEXT: str r5, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill
; CHECK-NEXT: .LBB15_3: @ %while.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: add.w r11, r1, r10
-; CHECK-NEXT: add.w r5, r4, r10
+; CHECK-NEXT: vldrw.u32 q0, [r1], #8
+; CHECK-NEXT: sub.w r11, r10, #2
+; CHECK-NEXT: add.w r5, r10, #2
+; CHECK-NEXT: vstrb.8 q0, [r4], #8
; CHECK-NEXT: vldrw.u32 q0, [r11]
-; CHECK-NEXT: vstrw.32 q0, [r5]
-; CHECK-NEXT: add.w r5, r12, r10
-; CHECK-NEXT: vldrw.u32 q0, [r5]
-; CHECK-NEXT: adds r6, r5, #2
-; CHECK-NEXT: vldrw.u32 q1, [r6]
+; CHECK-NEXT: vldrw.u32 q1, [r10]
; CHECK-NEXT: vmul.f16 q0, q0, r7
-; CHECK-NEXT: vfma.f16 q0, q1, r8
-; CHECK-NEXT: vldrw.u32 q1, [r5, #4]
-; CHECK-NEXT: adds r5, #6
-; CHECK-NEXT: vfma.f16 q0, q1, r9
+; CHECK-NEXT: vfma.f16 q0, q1, r6
; CHECK-NEXT: vldrw.u32 q1, [r5]
-; CHECK-NEXT: add.w r5, r2, r10
+; CHECK-NEXT: vfma.f16 q0, q1, r9
+; CHECK-NEXT: vldrw.u32 q1, [r10, #4]
; CHECK-NEXT: add.w r10, r10, #8
-; CHECK-NEXT: vfma.f16 q0, q1, r3
-; CHECK-NEXT: vstrw.32 q0, [r5]
+; CHECK-NEXT: vfma.f16 q0, q1, r8
+; CHECK-NEXT: vstrb.8 q0, [r2], #8
; CHECK-NEXT: le lr, .LBB15_3
; CHECK-NEXT: @ %bb.4: @ %while.end.loopexit
-; CHECK-NEXT: add r4, r10
-; CHECK-NEXT: add.w r12, r12, r0, lsl #1
-; CHECK-NEXT: add.w r1, r1, r0, lsl #1
-; CHECK-NEXT: ldm.w sp, {r0, r2, r5, r10} @ 16-byte Folded Reload
+; CHECK-NEXT: ldr r2, [sp] @ 4-byte Reload
+; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT: ldr r5, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT: add.w r12, r12, r2, lsl #1
+; CHECK-NEXT: add.w r1, r1, r2, lsl #1
+; CHECK-NEXT: ldr r2, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: .LBB15_5: @ %while.end
-; CHECK-NEXT: and r6, r5, #3
+; CHECK-NEXT: and lr, r3, #3
; CHECK-NEXT: vldrw.u32 q0, [r1]
-; CHECK-NEXT: vctp.16 r6
+; CHECK-NEXT: vctp.16 lr
; CHECK-NEXT: vpst
; CHECK-NEXT: vstrht.16 q0, [r4]
; CHECK-NEXT: vldrw.u32 q0, [r12]
; CHECK-NEXT: vldrw.u32 q1, [r1]
; CHECK-NEXT: add.w r1, r12, #6
; CHECK-NEXT: vmul.f16 q0, q0, r7
-; CHECK-NEXT: vfma.f16 q0, q1, r8
+; CHECK-NEXT: vfma.f16 q0, q1, r6
; CHECK-NEXT: vldrw.u32 q1, [r12, #4]
; CHECK-NEXT: vfma.f16 q0, q1, r9
; CHECK-NEXT: vldrw.u32 q1, [r1]
-; CHECK-NEXT: vfma.f16 q0, q1, r3
+; CHECK-NEXT: vfma.f16 q0, q1, r8
; CHECK-NEXT: vpst
; CHECK-NEXT: vstrht.16 q0, [r2]
; CHECK-NEXT: ldr.w r12, [r0, #4]
; CHECK-NEXT: .LBB15_6: @ %if.end
-; CHECK-NEXT: add.w r0, r12, r5, lsl #1
-; CHECK-NEXT: lsr.w lr, r10, #2
+; CHECK-NEXT: add.w r0, r12, r3, lsl #1
+; CHECK-NEXT: lsr.w lr, r5, #2
; CHECK-NEXT: wls lr, lr, .LBB15_10
; CHECK-NEXT: @ %bb.7: @ %while.body51.preheader
-; CHECK-NEXT: bic r2, r10, #3
+; CHECK-NEXT: bic r2, r5, #3
+; CHECK-NEXT: adds r1, r2, r3
; CHECK-NEXT: mov r3, r12
-; CHECK-NEXT: adds r1, r2, r5
; CHECK-NEXT: add.w r1, r12, r1, lsl #1
; CHECK-NEXT: .LBB15_8: @ %while.body51
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: add.w r12, r12, r2, lsl #1
; CHECK-NEXT: mov r0, r1
; CHECK-NEXT: .LBB15_10: @ %while.end55
-; CHECK-NEXT: ands r1, r10, #3
+; CHECK-NEXT: ands r1, r5, #3
; CHECK-NEXT: beq .LBB15_12
; CHECK-NEXT: @ %bb.11: @ %if.then59
; CHECK-NEXT: vldrw.u32 q0, [r0]
define dso_local void @test_nested(float* noalias nocapture %pInT1, float* noalias nocapture readonly %pOutT1, float* noalias nocapture readonly %pPRT_in, float* noalias nocapture readnone %pPRT_pDst, i32 %numRows, i32 %numCols, i32 %l, float %in) local_unnamed_addr #0 {
; CHECK-LABEL: test_nested:
; CHECK: @ %bb.0: @ %for.body.us.preheader
-; CHECK-NEXT: .save {r4, r5, r6, r7, lr}
-; CHECK-NEXT: push {r4, r5, r6, r7, lr}
-; CHECK-NEXT: ldrd lr, r12, [sp, #20]
+; CHECK-NEXT: .save {r4, r5, r6, lr}
+; CHECK-NEXT: push {r4, r5, r6, lr}
+; CHECK-NEXT: ldrd lr, r12, [sp, #16]
; CHECK-NEXT: lsl.w r3, r12, #2
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB14_1: @ %for.body.us
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB14_2 Depth 2
; CHECK-NEXT: ldr r4, [r1]
-; CHECK-NEXT: mov r5, r12
+; CHECK-NEXT: mov r5, r2
+; CHECK-NEXT: mov r6, r12
; CHECK-NEXT: vdup.32 q0, r4
-; CHECK-NEXT: movs r4, #0
+; CHECK-NEXT: mov r4, r0
; CHECK-NEXT: .LBB14_2: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB14_1 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT: adds r6, r0, r4
-; CHECK-NEXT: adds r7, r2, r4
-; CHECK-NEXT: vldrw.u32 q1, [r7]
-; CHECK-NEXT: vldrw.u32 q2, [r6]
-; CHECK-NEXT: adds r4, #16
-; CHECK-NEXT: subs r5, #4
+; CHECK-NEXT: vldrw.u32 q1, [r5], #16
+; CHECK-NEXT: vldrw.u32 q2, [r4]
+; CHECK-NEXT: subs r6, #4
; CHECK-NEXT: vfms.f32 q2, q1, q0
-; CHECK-NEXT: vstrw.32 q2, [r6]
+; CHECK-NEXT: vstrb.8 q2, [r4], #16
; CHECK-NEXT: bne .LBB14_2
; CHECK-NEXT: @ %bb.3: @ %for.cond6.for.end_crit_edge.us
; CHECK-NEXT: @ in Loop: Header=BB14_1 Depth=1
; CHECK-NEXT: adds r1, #4
; CHECK-NEXT: le lr, .LBB14_1
; CHECK-NEXT: @ %bb.4: @ %for.end14
-; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
+; CHECK-NEXT: pop {r4, r5, r6, pc}
for.body.us.preheader:
%cmp = icmp sgt i32 %numRows, 0
tail call void @llvm.assume(i1 %cmp)
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
-; CHECK-NEXT: .pad #16
-; CHECK-NEXT: sub sp, #16
-; CHECK-NEXT: ldrh r5, [r0]
-; CHECK-NEXT: mov r6, r3
+; CHECK-NEXT: .pad #8
+; CHECK-NEXT: sub sp, #8
+; CHECK-NEXT: ldrh.w r9, [r0]
+; CHECK-NEXT: mov r11, r1
; CHECK-NEXT: ldr.w r12, [r0, #4]
-; CHECK-NEXT: sub.w lr, r5, #1
-; CHECK-NEXT: cmp.w lr, #3
+; CHECK-NEXT: sub.w r1, r9, #1
+; CHECK-NEXT: cmp r1, #3
; CHECK-NEXT: bhi .LBB15_6
; CHECK-NEXT: @ %bb.1: @ %if.then
; CHECK-NEXT: ldr r4, [r0, #8]
-; CHECK-NEXT: ldr r3, [r4, #12]
-; CHECK-NEXT: ldm.w r4, {r7, r8, r9}
-; CHECK-NEXT: add.w r4, r12, lr, lsl #2
-; CHECK-NEXT: lsr.w lr, r6, #2
+; CHECK-NEXT: lsr.w lr, r3, #2
+; CHECK-NEXT: ldrd r7, r6, [r4]
+; CHECK-NEXT: ldrd r5, r8, [r4, #8]
+; CHECK-NEXT: add.w r4, r12, r1, lsl #2
; CHECK-NEXT: wls lr, lr, .LBB15_5
; CHECK-NEXT: @ %bb.2: @ %while.body.lr.ph
-; CHECK-NEXT: strd r6, r5, [sp, #8] @ 8-byte Folded Spill
-; CHECK-NEXT: bic r5, r6, #3
-; CHECK-NEXT: str r0, [sp] @ 4-byte Spill
-; CHECK-NEXT: mov.w r10, #0
-; CHECK-NEXT: mov r0, r5
-; CHECK-NEXT: add.w r5, r2, r5, lsl #2
-; CHECK-NEXT: str r5, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT: bic r1, r3, #3
+; CHECK-NEXT: str r1, [sp] @ 4-byte Spill
+; CHECK-NEXT: add.w r10, r12, #4
+; CHECK-NEXT: add.w r1, r2, r1, lsl #2
+; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT: mov r1, r11
; CHECK-NEXT: .LBB15_3: @ %while.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: add.w r11, r1, r10
-; CHECK-NEXT: add.w r5, r4, r10
-; CHECK-NEXT: vldrw.u32 q0, [r11]
-; CHECK-NEXT: add.w r6, r12, r10
-; CHECK-NEXT: vstrw.32 q0, [r5]
-; CHECK-NEXT: add.w r5, r2, r10
-; CHECK-NEXT: vldrw.u32 q0, [r6]
-; CHECK-NEXT: vldrw.u32 q1, [r6, #4]
-; CHECK-NEXT: vldrw.u32 q2, [r6, #12]
-; CHECK-NEXT: add.w r10, r10, #16
+; CHECK-NEXT: vldrw.u32 q0, [r1], #16
+; CHECK-NEXT: vstrb.8 q0, [r4], #16
+; CHECK-NEXT: vldrw.u32 q0, [r10, #-4]
+; CHECK-NEXT: vldrw.u32 q1, [r10], #16
; CHECK-NEXT: vmul.f32 q0, q0, r7
-; CHECK-NEXT: vfma.f32 q0, q1, r8
-; CHECK-NEXT: vldrw.u32 q1, [r6, #8]
-; CHECK-NEXT: vfma.f32 q0, q1, r9
-; CHECK-NEXT: vfma.f32 q0, q2, r3
-; CHECK-NEXT: vstrw.32 q0, [r5]
+; CHECK-NEXT: vldrw.u32 q2, [r10, #-8]
+; CHECK-NEXT: vfma.f32 q0, q1, r6
+; CHECK-NEXT: vldrw.u32 q1, [r10, #-12]
+; CHECK-NEXT: vfma.f32 q0, q1, r5
+; CHECK-NEXT: vfma.f32 q0, q2, r8
+; CHECK-NEXT: vstrb.8 q0, [r2], #16
; CHECK-NEXT: le lr, .LBB15_3
; CHECK-NEXT: @ %bb.4: @ %while.end.loopexit
-; CHECK-NEXT: ldr r5, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT: add r4, r10
-; CHECK-NEXT: add.w r12, r12, r0, lsl #2
-; CHECK-NEXT: add.w r1, r1, r0, lsl #2
-; CHECK-NEXT: ldm.w sp, {r0, r2, r6} @ 12-byte Folded Reload
+; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload
+; CHECK-NEXT: ldr r2, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: add.w r12, r12, r1, lsl #2
+; CHECK-NEXT: add.w r11, r11, r1, lsl #2
; CHECK-NEXT: .LBB15_5: @ %while.end
-; CHECK-NEXT: and lr, r6, #3
-; CHECK-NEXT: vldrw.u32 q0, [r1]
-; CHECK-NEXT: vctp.32 lr
+; CHECK-NEXT: and r1, r3, #3
+; CHECK-NEXT: vldrw.u32 q0, [r11]
+; CHECK-NEXT: vctp.32 r1
; CHECK-NEXT: vpst
; CHECK-NEXT: vstrwt.32 q0, [r4]
; CHECK-NEXT: vldrw.u32 q0, [r12]
; CHECK-NEXT: vldrw.u32 q1, [r12, #4]
; CHECK-NEXT: vmul.f32 q0, q0, r7
-; CHECK-NEXT: vfma.f32 q0, q1, r8
+; CHECK-NEXT: vfma.f32 q0, q1, r6
; CHECK-NEXT: vldrw.u32 q1, [r12, #8]
-; CHECK-NEXT: vfma.f32 q0, q1, r9
+; CHECK-NEXT: vfma.f32 q0, q1, r5
; CHECK-NEXT: vldrw.u32 q1, [r12, #12]
-; CHECK-NEXT: vfma.f32 q0, q1, r3
+; CHECK-NEXT: vfma.f32 q0, q1, r8
; CHECK-NEXT: vpst
; CHECK-NEXT: vstrwt.32 q0, [r2]
; CHECK-NEXT: ldr.w r12, [r0, #4]
; CHECK-NEXT: .LBB15_6: @ %if.end
-; CHECK-NEXT: add.w r0, r12, r6, lsl #2
-; CHECK-NEXT: lsr.w lr, r5, #2
+; CHECK-NEXT: add.w r0, r12, r3, lsl #2
+; CHECK-NEXT: lsr.w lr, r9, #2
; CHECK-NEXT: wls lr, lr, .LBB15_10
; CHECK-NEXT: @ %bb.7: @ %while.body51.preheader
-; CHECK-NEXT: bic r2, r5, #3
+; CHECK-NEXT: bic r2, r9, #3
+; CHECK-NEXT: adds r1, r2, r3
; CHECK-NEXT: mov r3, r12
-; CHECK-NEXT: adds r1, r2, r6
; CHECK-NEXT: add.w r1, r12, r1, lsl #2
; CHECK-NEXT: .LBB15_8: @ %while.body51
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: add.w r12, r12, r2, lsl #2
; CHECK-NEXT: mov r0, r1
; CHECK-NEXT: .LBB15_10: @ %while.end55
-; CHECK-NEXT: ands r1, r5, #3
+; CHECK-NEXT: ands r1, r9, #3
; CHECK-NEXT: beq .LBB15_12
; CHECK-NEXT: @ %bb.11: @ %if.then59
; CHECK-NEXT: vldrw.u32 q0, [r0]
; CHECK-NEXT: vpst
; CHECK-NEXT: vstrwt.32 q0, [r12]
; CHECK-NEXT: .LBB15_12: @ %if.end61
-; CHECK-NEXT: add sp, #16
+; CHECK-NEXT: add sp, #8
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
entry:
%pState1 = getelementptr inbounds %struct.arm_fir_instance_f32, %struct.arm_fir_instance_f32* %S, i32 0, i32 1