/// Searches for an increment or decrement of \p Reg after \p MBBI.
///
/// Scans forward from the instruction following \p MBBI, skipping debug
/// instructions, and asks isIncrementOrDecrement() about each candidate using
/// \p Pred and \p PredReg. The scan gives up at the first non-matching
/// instruction when \p Reg is SP, or at the first non-matching instruction
/// that reads or defines \p Reg (queried through \p TRI).
///
/// \param [out] Offset reset to 0 on entry; on a match it receives the
/// non-zero value reported by isIncrementOrDecrement().
/// \returns an iterator to the matching instruction, or the end iterator of
/// the block containing \p MBBI when no match is found.
///
/// NOTE(review): the match result is held in an unsigned local before being
/// stored into the int \p Offset, while call sites compare Offset against
/// negative values -- presumably the round-trip preserves decrements; confirm
/// against the return type of isIncrementOrDecrement().
static MachineBasicBlock::iterator
findIncDecAfter(MachineBasicBlock::iterator MBBI, Register Reg,
- ARMCC::CondCodes Pred, Register PredReg, int &Offset) {
+ ARMCC::CondCodes Pred, Register PredReg, int &Offset,
+ const TargetRegisterInfo *TRI) {
Offset = 0;
MachineBasicBlock &MBB = *MBBI->getParent();
MachineBasicBlock::iterator EndMBBI = MBB.end();
MachineBasicBlock::iterator NextMBBI = std::next(MBBI);
- // Skip debug values.
- while (NextMBBI != EndMBBI && NextMBBI->isDebugInstr())
- ++NextMBBI;
- if (NextMBBI == EndMBBI)
- return EndMBBI;
+ while (NextMBBI != EndMBBI) {
+ // Skip debug values.
+ while (NextMBBI != EndMBBI && NextMBBI->isDebugInstr())
+ ++NextMBBI;
+ if (NextMBBI == EndMBBI)
+ return EndMBBI;
+
+ unsigned Off = isIncrementOrDecrement(*NextMBBI, Reg, Pred, PredReg);
+ if (Off) {
+ Offset = Off;
+ return NextMBBI;
+ }
- Offset = isIncrementOrDecrement(*NextMBBI, Reg, Pred, PredReg);
- return Offset == 0 ? EndMBBI : NextMBBI;
+ // SP can only be combined if it is the next instruction after the original
+ // MBBI, otherwise we may be incrementing the stack pointer (invalidating
+ // anything below the new pointer) when its frame elements are still in
+ // use. Other registers can attempt to look further, until a different use
+ // or def of the register is found.
+ if (Reg == ARM::SP || NextMBBI->readsRegister(Reg, TRI) ||
+ NextMBBI->definesRegister(Reg, TRI))
+ return EndMBBI;
+
+ ++NextMBBI;
+ }
+ return EndMBBI;
}
/// Fold preceding/trailing inc/dec of base register into the
} else if (Mode == ARM_AM::ib && Offset == -Bytes) {
Mode = ARM_AM::da;
} else {
- MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset);
+ MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset, TRI);
if (((Mode != ARM_AM::ia && Mode != ARM_AM::ib) || Offset != Bytes) &&
((Mode != ARM_AM::da && Mode != ARM_AM::db) || Offset != -Bytes)) {
} else if (Offset == -Bytes) {
NewOpc = getPreIndexedLoadStoreOpcode(Opcode, ARM_AM::sub);
} else {
- MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset);
+ MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset, TRI);
if (Offset == Bytes) {
NewOpc = getPostIndexedLoadStoreOpcode(Opcode, ARM_AM::add);
} else if (!isAM5 && Offset == -Bytes) {
if (Offset == 8 || Offset == -8) {
NewOpc = Opcode == ARM::t2LDRDi8 ? ARM::t2LDRD_PRE : ARM::t2STRD_PRE;
} else {
- MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset);
+ MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset, TRI);
if (Offset == 8 || Offset == -8) {
NewOpc = Opcode == ARM::t2LDRDi8 ? ARM::t2LDRD_POST : ARM::t2STRD_POST;
} else
define i32* @post_inc_ldrd(i32* %base, i32* %addr.3) {
; CHECK-V8M-LABEL: post_inc_ldrd:
; CHECK-V8M: @ %bb.0:
-; CHECK-V8M-NEXT: ldrd r2, r3, [r0]
-; CHECK-V8M-NEXT: adds r0, #8
+; CHECK-V8M-NEXT: ldrd r2, r3, [r0], #8
; CHECK-V8M-NEXT: add r2, r3
; CHECK-V8M-NEXT: str r2, [r1]
; CHECK-V8M-NEXT: bx lr
;
; CHECK-V8A-LABEL: post_inc_ldrd:
; CHECK-V8A: @ %bb.0:
-; CHECK-V8A-NEXT: ldm r0, {r2, r3}
-; CHECK-V8A-NEXT: add r0, r0, #8
+; CHECK-V8A-NEXT: ldm r0!, {r2, r3}
; CHECK-V8A-NEXT: add r2, r2, r3
; CHECK-V8A-NEXT: str r2, [r1]
; CHECK-V8A-NEXT: bx lr
; CHECK-NEXT: add.w r0, r0, r3, lsl #2
; CHECK-NEXT: .LBB0_10: @ %for.body.epil
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldr s0, [r1]
-; CHECK-NEXT: adds r1, #4
-; CHECK-NEXT: vldr s2, [r2]
-; CHECK-NEXT: adds r2, #4
+; CHECK-NEXT: vldmia r1!, {s0}
+; CHECK-NEXT: vldmia r2!, {s2}
; CHECK-NEXT: vmul.f32 s0, s2, s0
-; CHECK-NEXT: vstr s0, [r0]
-; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: vstmia r0!, {s0}
; CHECK-NEXT: le lr, .LBB0_10
; CHECK-NEXT: .LBB0_11: @ %for.cond.cleanup
; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
; CHECK-NEXT: add.w r7, r2, r12, lsl #2
; CHECK-NEXT: .LBB0_6: @ %for.body.prol
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldr s0, [r6]
-; CHECK-NEXT: adds r6, #4
-; CHECK-NEXT: vldr s2, [r5]
-; CHECK-NEXT: adds r5, #4
+; CHECK-NEXT: vldmia r6!, {s0}
; CHECK-NEXT: add.w r12, r12, #1
+; CHECK-NEXT: vldmia r5!, {s2}
; CHECK-NEXT: vmul.f32 s0, s2, s0
-; CHECK-NEXT: vstr s0, [r7]
-; CHECK-NEXT: adds r7, #4
+; CHECK-NEXT: vstmia r7!, {s0}
; CHECK-NEXT: le lr, .LBB0_6
; CHECK-NEXT: .LBB0_7: @ %for.body.prol.loopexit
; CHECK-NEXT: cmp r4, #3
; CHECK-NEXT: add.w r7, r2, r12, lsl #2
; CHECK-NEXT: .LBB1_6: @ %for.body.prol
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldr s0, [r6]
-; CHECK-NEXT: adds r6, #4
-; CHECK-NEXT: vldr s2, [r5]
-; CHECK-NEXT: adds r5, #4
+; CHECK-NEXT: vldmia r6!, {s0}
; CHECK-NEXT: add.w r12, r12, #1
+; CHECK-NEXT: vldmia r5!, {s2}
; CHECK-NEXT: vadd.f32 s0, s2, s0
-; CHECK-NEXT: vstr s0, [r7]
-; CHECK-NEXT: adds r7, #4
+; CHECK-NEXT: vstmia r7!, {s0}
; CHECK-NEXT: le lr, .LBB1_6
; CHECK-NEXT: .LBB1_7: @ %for.body.prol.loopexit
; CHECK-NEXT: cmp r4, #3
; CHECK-NEXT: add.w r7, r2, r12, lsl #2
; CHECK-NEXT: .LBB2_6: @ %for.body.prol
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldr s0, [r6]
-; CHECK-NEXT: adds r6, #4
-; CHECK-NEXT: vldr s2, [r5]
-; CHECK-NEXT: adds r5, #4
+; CHECK-NEXT: vldmia r6!, {s0}
; CHECK-NEXT: add.w r12, r12, #1
+; CHECK-NEXT: vldmia r5!, {s2}
; CHECK-NEXT: vsub.f32 s0, s2, s0
-; CHECK-NEXT: vstr s0, [r7]
-; CHECK-NEXT: adds r7, #4
+; CHECK-NEXT: vstmia r7!, {s0}
; CHECK-NEXT: le lr, .LBB2_6
; CHECK-NEXT: .LBB2_7: @ %for.body.prol.loopexit
; CHECK-NEXT: cmp r4, #3
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r4, [r6], #4
; CHECK-NEXT: add.w r12, r12, #1
-; CHECK-NEXT: vldr s2, [r5]
-; CHECK-NEXT: adds r5, #4
+; CHECK-NEXT: vldmia r5!, {s2}
; CHECK-NEXT: vmov s0, r4
; CHECK-NEXT: vcvt.f32.s32 s0, s0
; CHECK-NEXT: vmul.f32 s0, s2, s0
-; CHECK-NEXT: vstr s0, [r7]
-; CHECK-NEXT: adds r7, #4
+; CHECK-NEXT: vstmia r7!, {s0}
; CHECK-NEXT: le lr, .LBB3_9
; CHECK-NEXT: .LBB3_10: @ %for.body.prol.loopexit
; CHECK-NEXT: cmp.w r8, #3
; CHECK-NEXT: adds r1, #2
; CHECK-NEXT: vmul.f16 s0, s2, s0
; CHECK-NEXT: vcvtb.f32.f16 s0, s0
-; CHECK-NEXT: vstr s0, [r2]
-; CHECK-NEXT: adds r2, #4
+; CHECK-NEXT: vstmia r2!, {s0}
; CHECK-NEXT: le lr, .LBB5_7
; CHECK-NEXT: .LBB5_8: @ %for.cond.cleanup
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
; CHECK-NEXT: adds r1, #2
; CHECK-NEXT: vadd.f16 s0, s2, s0
; CHECK-NEXT: vcvtb.f32.f16 s0, s0
-; CHECK-NEXT: vstr s0, [r2]
-; CHECK-NEXT: adds r2, #4
+; CHECK-NEXT: vstmia r2!, {s0}
; CHECK-NEXT: le lr, .LBB6_7
; CHECK-NEXT: .LBB6_8: @ %for.cond.cleanup
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
; CHECK-NEXT: adds r1, #2
; CHECK-NEXT: vsub.f16 s0, s2, s0
; CHECK-NEXT: vcvtb.f32.f16 s0, s0
-; CHECK-NEXT: vstr s0, [r2]
-; CHECK-NEXT: adds r2, #4
+; CHECK-NEXT: vstmia r2!, {s0}
; CHECK-NEXT: le lr, .LBB7_7
; CHECK-NEXT: .LBB7_8: @ %for.cond.cleanup
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
; CHECK-NEXT: vcvt.f16.s32 s2, s2
; CHECK-NEXT: vmul.f16 s0, s0, s2
; CHECK-NEXT: vcvtb.f32.f16 s0, s0
-; CHECK-NEXT: vstr s0, [r2]
-; CHECK-NEXT: adds r2, #4
+; CHECK-NEXT: vstmia r2!, {s0}
; CHECK-NEXT: le lr, .LBB8_7
; CHECK-NEXT: .LBB8_8: @ %for.cond.cleanup
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc}
; CHECK-NEXT: adds r1, #8
; CHECK-NEXT: vfma.f32 q5, q4, r5
; CHECK-NEXT: vfma.f32 q3, q5, q2
-; CHECK-NEXT: vstmia r7, {s20, s21}
-; CHECK-NEXT: adds r7, #8
+; CHECK-NEXT: vstmia r7!, {s20, s21}
; CHECK-NEXT: vfma.f32 q3, q4, q1
; CHECK-NEXT: vstrw.32 q3, [r4]
; CHECK-NEXT: le lr, .LBB17_3
; CHECK-NEXT: .LBB20_5: @ %while.body
; CHECK-NEXT: @ Parent Loop BB20_3 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT: ldrd r7, r4, [r1]
-; CHECK-NEXT: adds r1, #8
+; CHECK-NEXT: ldrd r7, r4, [r1], #8
; CHECK-NEXT: vfma.f32 q6, q3, r7
; CHECK-NEXT: vmov r7, s24
; CHECK-NEXT: vmov q1, q6
; CHECK-NEXT: add.w r2, r2, r12, lsl #2
; CHECK-NEXT: .LBB2_7: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldr s0, [r0]
-; CHECK-NEXT: adds r0, #4
-; CHECK-NEXT: vldr s2, [r1]
-; CHECK-NEXT: adds r1, #4
+; CHECK-NEXT: vldmia r0!, {s0}
+; CHECK-NEXT: vldmia r1!, {s2}
; CHECK-NEXT: vldr s4, [r2]
; CHECK-NEXT: vfma.f32 s4, s2, s0
-; CHECK-NEXT: vstr s4, [r2]
-; CHECK-NEXT: adds r2, #4
+; CHECK-NEXT: vstmia r2!, {s4}
; CHECK-NEXT: le lr, .LBB2_7
; CHECK-NEXT: .LBB2_8: @ %for.cond.cleanup
; CHECK-NEXT: pop {r4, r5, r6, pc}
; CHECK-NEXT: add.w r2, r2, r12, lsl #2
; CHECK-NEXT: .LBB0_7: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldr s0, [r0]
-; CHECK-NEXT: adds r0, #4
-; CHECK-NEXT: vldr s2, [r1]
-; CHECK-NEXT: adds r1, #4
+; CHECK-NEXT: vldmia r0!, {s0}
+; CHECK-NEXT: vldmia r1!, {s2}
; CHECK-NEXT: vldr s4, [r2]
; CHECK-NEXT: vfma.f32 s4, s2, s0
-; CHECK-NEXT: vstr s4, [r2]
-; CHECK-NEXT: adds r2, #4
+; CHECK-NEXT: vstmia r2!, {s4}
; CHECK-NEXT: le lr, .LBB0_7
; CHECK-NEXT: .LBB0_8: @ %for.cond.cleanup
; CHECK-NEXT: pop {r4, r5, r6, pc}
; CHECK-NEXT: vmvn.i32 q1, #0x80000000
; CHECK-NEXT: .LBB0_4: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ldrd r5, r4, [r0]
+; CHECK-NEXT: ldrd r5, r4, [r0], #8
; CHECK-NEXT: mov.w r3, #-1
-; CHECK-NEXT: ldrd r8, r7, [r1]
-; CHECK-NEXT: adds r0, #8
+; CHECK-NEXT: ldrd r8, r7, [r1], #8
; CHECK-NEXT: smull r4, r7, r7, r4
-; CHECK-NEXT: adds r1, #8
; CHECK-NEXT: asrl r4, r7, #31
; CHECK-NEXT: smull r6, r5, r8, r5
; CHECK-NEXT: rsbs.w r9, r4, #-2147483648
; CHECK-NEXT: vorr q2, q2, q4
; CHECK-NEXT: vmov r3, s10
; CHECK-NEXT: vmov r4, s8
-; CHECK-NEXT: strd r4, r3, [r2]
-; CHECK-NEXT: adds r2, #8
+; CHECK-NEXT: strd r4, r3, [r2], #8
; CHECK-NEXT: le lr, .LBB0_4
; CHECK-NEXT: @ %bb.5: @ %middle.block
; CHECK-NEXT: ldrd r7, r3, [sp] @ 8-byte Folded Reload
; CHECK-NEXT: add.w r12, r0, r5, lsl #2
; CHECK-NEXT: .LBB3_4: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ldrd r4, r7, [r0]
-; CHECK-NEXT: adds r0, #8
-; CHECK-NEXT: ldrd r5, r10, [r1]
-; CHECK-NEXT: adds r1, #8
+; CHECK-NEXT: ldrd r4, r7, [r0], #8
+; CHECK-NEXT: ldrd r5, r10, [r1], #8
; CHECK-NEXT: umull r4, r5, r5, r4
; CHECK-NEXT: lsrl r4, r5, #31
; CHECK-NEXT: subs.w r6, r4, #-1
; CHECK-NEXT: vorn q0, q1, q0
; CHECK-NEXT: vmov r4, s2
; CHECK-NEXT: vmov r5, s0
-; CHECK-NEXT: strd r5, r4, [r2]
-; CHECK-NEXT: adds r2, #8
+; CHECK-NEXT: strd r5, r4, [r2], #8
; CHECK-NEXT: le lr, .LBB3_4
; CHECK-NEXT: @ %bb.5: @ %middle.block
; CHECK-NEXT: ldr r7, [sp] @ 4-byte Reload
; CHECK-NEXT: add.w r0, r0, r2, lsl #2
; CHECK-NEXT: .LBB5_8: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldr s2, [r0]
-; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: vldmia r0!, {s2}
; CHECK-NEXT: vadd.f32 s0, s2, s0
; CHECK-NEXT: le lr, .LBB5_8
; CHECK-NEXT: .LBB5_9: @ %for.cond.cleanup
; CHECK-NEXT: add.w r0, r0, r2, lsl #2
; CHECK-NEXT: .LBB6_8: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldr s2, [r0]
-; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: vldmia r0!, {s2}
; CHECK-NEXT: vmul.f32 s0, s2, s0
; CHECK-NEXT: le lr, .LBB6_8
; CHECK-NEXT: .LBB6_9: @ %for.cond.cleanup
; CHECK-NEXT: adds r3, #8
; CHECK-NEXT: vmul.f32 s0, s0, s0
; CHECK-NEXT: vfma.f32 s0, s2, s2
-; CHECK-NEXT: vstr s0, [r12]
-; CHECK-NEXT: add.w r12, r12, #4
+; CHECK-NEXT: vstmia r12!, {s0}
; CHECK-NEXT: le lr, .LBB1_7
; CHECK-NEXT: .LBB1_8: @ %while.end
; CHECK-NEXT: pop {r4, r5, r7, pc}