From 228c943f316eea630dfb94e270af7e342bd5dd56 Mon Sep 17 00:00:00 2001 From: Tim Northover Date: Wed, 5 Nov 2014 00:27:13 +0000 Subject: [PATCH] ARM/Dwarf: correctly align stack before callee-saved VPRs We were making an attempt to do this by adding an extra callee-saved GPR (so that there was an even number in the list), but when that failed we went ahead and pushed anyway. This had a couple of potential issues: + The .cfi directives we emit misplaced dN because they were based on PrologEpilogInserter's calculation. + Unaligned stores can be less efficient. + Unaligned stores can actually fault (likely only an issue in niche cases, but possible). This adds a final explicit stack adjustment if all other options fail, so that the actual locations of the registers match up with where they should be. llvm-svn: 221320 --- llvm/lib/Target/ARM/ARMFrameLowering.cpp | 26 +++++++++-- llvm/lib/Target/ARM/ARMMachineFunctionInfo.h | 5 +- llvm/test/CodeGen/ARM/dwarf-unwind.ll | 68 ++++++++++++++++++++++++++++ 3 files changed, 94 insertions(+), 5 deletions(-) create mode 100644 llvm/test/CodeGen/ARM/dwarf-unwind.ll diff --git a/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/llvm/lib/Target/ARM/ARMFrameLowering.cpp index 74f6865..4589799 100644 --- a/llvm/lib/Target/ARM/ARMFrameLowering.cpp +++ b/llvm/lib/Target/ARM/ARMFrameLowering.cpp @@ -260,10 +260,11 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const { // Determine starting offsets of spill areas. bool HasFP = hasFP(MF); - unsigned DPRCSOffset = NumBytes - (ArgRegsSaveSize + GPRCS1Size - + GPRCS2Size + DPRCSSize); - unsigned GPRCS2Offset = DPRCSOffset + DPRCSSize; - unsigned GPRCS1Offset = GPRCS2Offset + GPRCS2Size; + unsigned GPRCS1Offset = NumBytes - ArgRegsSaveSize - GPRCS1Size; + unsigned GPRCS2Offset = GPRCS1Offset - GPRCS2Size; + unsigned DPRAlign = DPRCSSize ? std::min(8U, Align) : 4U; + unsigned DPRGapSize = (GPRCS1Size + GPRCS2Size + ArgRegsSaveSize) % DPRAlign; + unsigned DPRCSOffset = GPRCS2Offset - DPRGapSize - DPRCSSize; int FramePtrOffsetInPush = 0; if (HasFP) { FramePtrOffsetInPush = MFI->getObjectOffset(FramePtrSpillFI) @@ -279,6 +280,15 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const { if (GPRCS2Size > 0) GPRCS2Push = LastPush = MBBI++; + // Prolog/epilog inserter assumes we correctly align DPRs on the stack, so our + // .cfi_offset operations will reflect that. + if (DPRGapSize) { + assert(DPRGapSize == 4 && "unexpected alignment requirements for DPRs"); + if (!tryFoldSPUpdateIntoPushPop(STI, MF, LastPush, DPRGapSize)) + emitSPUpdate(isARM, MBB, MBBI, dl, TII, -DPRGapSize, + MachineInstr::FrameSetup); + } + // Move past area 3. if (DPRCSSize > 0) { DPRCSPush = MBBI; @@ -508,6 +518,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const { AFI->setGPRCalleeSavedArea1Size(GPRCS1Size); AFI->setGPRCalleeSavedArea2Size(GPRCS2Size); + AFI->setDPRCalleeSavedGapSize(DPRGapSize); AFI->setDPRCalleeSavedAreaSize(DPRCSSize); // If we need dynamic stack realignment, do it here. Be paranoid and make @@ -613,6 +624,7 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF, NumBytes -= (ArgRegsSaveSize + AFI->getGPRCalleeSavedArea1Size() + AFI->getGPRCalleeSavedArea2Size() + + AFI->getDPRCalleeSavedGapSize() + AFI->getDPRCalleeSavedAreaSize()); // Reset SP based on frame pointer only if the stack frame extends beyond @@ -661,6 +673,12 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF, while (MBBI->getOpcode() == ARM::VLDMDIA_UPD) MBBI++; } + if (AFI->getDPRCalleeSavedGapSize()) { + assert(AFI->getDPRCalleeSavedGapSize() == 4 && + "unexpected DPR alignment gap"); + emitSPUpdate(isARM, MBB, MBBI, dl, TII, AFI->getDPRCalleeSavedGapSize()); + } + if (AFI->getGPRCalleeSavedArea2Size()) MBBI++; if (AFI->getGPRCalleeSavedArea1Size()) MBBI++; } diff --git a/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h b/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h index 4f9ea7a..4e67fa1 100644 --- a/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h +++ b/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h @@ -86,6 +86,7 @@ class ARMFunctionInfo : public MachineFunctionInfo { /// areas. unsigned GPRCS1Size; unsigned GPRCS2Size; + unsigned DPRCSAlignGapSize; unsigned DPRCSSize; /// NumAlignedDPRCS2Regs - The number of callee-saved DPRs that are saved in @@ -134,7 +135,7 @@ public: RestoreSPFromFP(false), LRSpilledForFarJump(false), FramePtrSpillOffset(0), GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0), - GPRCS1Size(0), GPRCS2Size(0), DPRCSSize(0), + GPRCS1Size(0), GPRCS2Size(0), DPRCSAlignGapSize(0), DPRCSSize(0), NumAlignedDPRCS2Regs(0), JumpTableUId(0), PICLabelUId(0), VarArgsFrameIndex(0), HasITBlocks(false), GlobalBaseReg(0) {} @@ -183,10 +184,12 @@ public: unsigned getGPRCalleeSavedArea1Size() const { return GPRCS1Size; } unsigned getGPRCalleeSavedArea2Size() const { return GPRCS2Size; } + unsigned getDPRCalleeSavedGapSize() const { return DPRCSAlignGapSize; } unsigned getDPRCalleeSavedAreaSize() const { return DPRCSSize; } void setGPRCalleeSavedArea1Size(unsigned s) { GPRCS1Size = s; } void setGPRCalleeSavedArea2Size(unsigned s) { GPRCS2Size = s; } + void setDPRCalleeSavedGapSize(unsigned s) { DPRCSAlignGapSize = s; } void setDPRCalleeSavedAreaSize(unsigned s) { DPRCSSize = s; } unsigned getArgumentStackSize() const { return ArgumentStackSize; } diff --git a/llvm/test/CodeGen/ARM/dwarf-unwind.ll b/llvm/test/CodeGen/ARM/dwarf-unwind.ll new file mode 100644 index 0000000..58f486d --- /dev/null +++ b/llvm/test/CodeGen/ARM/dwarf-unwind.ll @@ -0,0 +1,68 @@ +; RUN: llc -mtriple=thumbv7-netbsd-eabi -o - %s | FileCheck %s +declare void @bar() + +; ARM's frame lowering attempts to tack another callee-saved register onto the +; list when it detects a potential misaligned VFP store. However, if there are +; none available it used to just vpush anyway and misreport the location of the +; registers in unwind info. Since there are benefits to aligned stores, it's +; better to correct the code than the .cfi_offset directive. + +define void @test_dpr_align(i8 %l, i8 %r) { +; CHECK-LABEL: test_dpr_align: +; CHECK: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK: sub sp, #4 +; CHECK: vpush {d8} +; CHECK: .cfi_offset d8, -48 +; CHECK-NOT: sub sp +; [...] +; CHECK: bl bar +; CHECK-NOT: add sp +; CHECK: vpop {d8} +; CHECK: add sp, #4 +; CHECK: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} + call void asm sideeffect "", "~{r4},~{r5},~{r6},~{r7},~{r8},~{r9},~{r10},~{r11},~{d8}"() + call void @bar() + ret void +} + +; The prologue (but not the epilogue) can be made more space efficient by +; chucking an argument register into the list. Not worth it in general though, +; "sub sp, #4" is likely faster. +define void @test_dpr_align_tiny(i8 %l, i8 %r) minsize { +; CHECK-LABEL: test_dpr_align_tiny: +; CHECK: push.w {r3, r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NOT: sub sp +; CHECK: vpush {d8} +; CHECK: .cfi_offset d8, -48 +; CHECK-NOT: sub sp +; [...] +; CHECK: bl bar +; CHECK-NOT: add sp +; CHECK: vpop {d8} +; CHECK: add sp, #4 +; CHECK: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} + call void asm sideeffect "", "~{r4},~{r5},~{r6},~{r7},~{r8},~{r9},~{r10},~{r11},~{d8}"() + call void @bar() + ret void +} + + +; However, we shouldn't do a 2-step align/adjust if there are no DPRs to be +; saved. +define void @test_nodpr_noalign(i8 %l, i8 %r) { +; CHECK-LABEL: test_nodpr_noalign: +; CHECK: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NOT: sub sp +; CHECK: sub sp, #12 +; CHECK-NOT: sub sp +; [...] +; CHECK: bl bar +; CHECK-NOT: add sp +; CHECK: add sp, #12 +; CHECK-NOT: add sp +; CHECK: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} + alloca i64 + call void asm sideeffect "", "~{r4},~{r5},~{r6},~{r7},~{r8},~{r9},~{r10},~{r11}"() + call void @bar() + ret void +} -- 2.7.4