OffsetOpnd.setImm(OffsetOpnd.getImm() + LocalStackSize / 8);
}
+static void adaptForLdStOpt(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator FirstSPPopI,
+ MachineBasicBlock::iterator LastPopI) {
+ // Sometimes (when we restore in the same order as we save), we can end up
+ // with code like this:
+ //
+ // ldp x26, x25, [sp]
+ // ldp x24, x23, [sp, #16]
+ // ldp x22, x21, [sp, #32]
+ // ldp x20, x19, [sp, #48]
+ // add sp, sp, #64
+ //
+ // In this case, it is always better to put the first ldp at the end, so
+ // that the load-store optimizer can run and merge the ldp and the add into
+ // a post-index ldp.
+ // If ReverseCSRRestoreSeq is set, splice LastPopI in before FirstSPPopI.
+ if (ReverseCSRRestoreSeq)
+ MBB.splice(FirstSPPopI, &MBB, LastPopI);
+ // We should end up with something like this now:
+ //
+ // ldp x24, x23, [sp, #16]
+ // ldp x22, x21, [sp, #32]
+ // ldp x20, x19, [sp, #48]
+ // ldp x26, x25, [sp]
+ // add sp, sp, #64
+ //
+ // and the load-store optimizer can merge the last two instructions into:
+ //
+ // ldp x26, x25, [sp], #64
+ //
+}
+
void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
MachineBasicBlock::iterator MBBI = MBB.begin();
int StackRestoreBytes = RedZone ? 0 : NumBytes;
if (NoCalleeSaveRestore)
StackRestoreBytes += AfterCSRPopSize;
- emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
- StackRestoreBytes, TII, MachineInstr::FrameDestroy);
+
// If we were able to combine the local stack pop with the argument pop,
// then we're done.
- if (NoCalleeSaveRestore || AfterCSRPopSize == 0)
+ bool Done = NoCalleeSaveRestore || AfterCSRPopSize == 0;
+
+ // If we're done after this, make sure to help the load store optimizer.
+ if (Done)
+ adaptForLdStOpt(MBB, MBB.getFirstTerminator(), LastPopI);
+
+ emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
+ StackRestoreBytes, TII, MachineInstr::FrameDestroy);
+ if (Done)
return;
+
NumBytes = 0;
}
FirstSPPopI = Prev;
}
- // Sometimes (when we restore in the same order as we save), we can end up
- // with code like this:
- //
- // ldp x26, x25, [sp]
- // ldp x24, x23, [sp, #16]
- // ldp x22, x21, [sp, #32]
- // ldp x20, x19, [sp, #48]
- // add sp, sp, #64
- //
- // In this case, it is always better to put the first ldp at the end, so
- // that the load-store optimizer can run and merge the ldp and the add into
- // a post-index ldp.
- // If we managed to grab the first pop instruction, move it to the end.
- if (LastPopI != Begin)
- MBB.splice(FirstSPPopI, &MBB, LastPopI);
- // We should end up with something like this now:
- //
- // ldp x24, x23, [sp, #16]
- // ldp x22, x21, [sp, #32]
- // ldp x20, x19, [sp, #48]
- // ldp x26, x25, [sp]
- // add sp, sp, #64
- //
- // and the load-store optimizer can merge the last two instructions into:
- //
- // ldp x26, x25, [sp], #64
- //
+ adaptForLdStOpt(MBB, FirstSPPopI, LastPopI);
+
emitFrameOffset(MBB, FirstSPPopI, DL, AArch64::SP, AArch64::SP,
AfterCSRPopSize, TII, MachineInstr::FrameDestroy);
}
define void @bar() nounwind { entry: unreachable }
+ define void @baz() nounwind { entry: unreachable }
+
...
---
name: foo
; CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 80, 0
RET_ReallyLR
...
+---
+# Check that the load from offset 0 is moved to the end even when hasFP is
+# false.
+name: baz
+# CHECK-LABEL: name: baz
+alignment: 2
+tracksRegLiveness: true
+frameInfo:
+ adjustsStack: true
+ hasCalls: true
+body: |
+ bb.0:
+ successors: %bb.1
+
+ $x0 = IMPLICIT_DEF
+ $x20 = IMPLICIT_DEF
+ $x21 = IMPLICIT_DEF
+
+ ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+ BL @foo, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit-def $sp, implicit-def $x0
+ ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+ B %bb.1
+
+ bb.1:
+ ; CHECK: $x20, $lr = frame-destroy LDPXi $sp, 2
+ ; BEFORELDSTOPT-NEXT: $x21 = frame-destroy LDRXui $sp, 0
+ ; BEFORELDSTOPT-NEXT: $sp = frame-destroy ADDXri $sp, 32, 0
+
+ ; AFTERLDSTOPT-NEXT: early-clobber $sp, $x21 = frame-destroy LDRXpost $sp, 32
+ RET_ReallyLR
+...