From 7d07405761aec8434a0cdb1c5644823a394f7def Mon Sep 17 00:00:00 2001 From: =?utf8?q?Martin=20Storsj=C3=B6?= Date: Wed, 23 Sep 2020 15:00:53 +0300 Subject: [PATCH] [AArch64] Prefer prologues with sp adjustments merged into stp/ldp for WinCFI, if optimizing for size This makes the prologue match the Windows canonical layout, for cases without a frame pointer. This can potentially be slower (a longer dependency chain of the sp register, and potentially one arithmetic operation more on some cores), but gives notable size improvements. The previous two commits shrink a 166 KB xdata section by 49 KB, and if the change from this commit is enabled, it shrinks the xdata section by another 25 KB. In total, since the start of the recent arm64 unwind info cleanups and optimizations (since before commit 37ef743cbf3), the xdata+pdata sections of the same test DLL have shrunk from 407 KB in total originally, to 163 KB now. Differential Revision: https://reviews.llvm.org/D88701 --- llvm/lib/Target/AArch64/AArch64FrameLowering.cpp | 24 ++++++-- .../CodeGen/AArch64/wineh-frame-predecrement.mir | 70 ++++++++++++++++++++++ 2 files changed, 88 insertions(+), 6 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/wineh-frame-predecrement.mir diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index d33ebdd..0d52b00 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -579,6 +579,12 @@ static bool windowsRequiresStackProbe(MachineFunction &MF, !F.hasFnAttribute("no-stack-arg-probe"); } +static bool needsWinCFI(const MachineFunction &MF) { + const Function &F = MF.getFunction(); + return MF.getTarget().getMCAsmInfo()->usesWindowsCFI() && + F.needsUnwindTableEntry(); +} + bool AArch64FrameLowering::shouldCombineCSRLocalStackBump( MachineFunction &MF, uint64_t StackBumpBytes) const { AArch64FunctionInfo *AFI = MF.getInfo(); @@ -589,6 +595,18 @@ bool 
AArch64FrameLowering::shouldCombineCSRLocalStackBump( if (AFI->getLocalStackSize() == 0) return false; + // For WinCFI, if optimizing for size, prefer to not combine the stack bump + // (to force a stp with predecrement) to match the packed unwind format, + // provided that there actually are any callee saved registers to merge the + // decrement with. + // This is potentially marginally slower, but allows using the packed + // unwind format for functions that both have a local area and callee saved + // registers. Using the packed unwind format notably reduces the size of + // the unwind info. + if (needsWinCFI(MF) && AFI->getCalleeSavedStackSize() > 0 && + MF.getFunction().hasOptSize()) + return false; + // 512 is the maximum immediate for stp/ldp that will be used for // callee-save save/restores if (StackBumpBytes >= 512 || windowsRequiresStackProbe(MF, StackBumpBytes)) @@ -982,12 +1000,6 @@ static void adaptForLdStOpt(MachineBasicBlock &MBB, // } -static bool needsWinCFI(const MachineFunction &MF) { - const Function &F = MF.getFunction(); - return MF.getTarget().getMCAsmInfo()->usesWindowsCFI() && - F.needsUnwindTableEntry(); -} - static bool isTargetWindows(const MachineFunction &MF) { return MF.getSubtarget().isTargetWindows(); } diff --git a/llvm/test/CodeGen/AArch64/wineh-frame-predecrement.mir b/llvm/test/CodeGen/AArch64/wineh-frame-predecrement.mir new file mode 100644 index 0000000..1bed8f6 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/wineh-frame-predecrement.mir @@ -0,0 +1,70 @@ +# RUN: llc -o - %s -mtriple=aarch64-windows -start-before=prologepilog \ +# RUN: -stop-after=prologepilog | FileCheck %s + +# Check that the callee-saved registers are saved starting with a STP +# with predecrement, followed by a separate stack adjustment later, +# if the optsize attribute is set. 
+ +# CHECK: early-clobber $sp = frame-setup STPXpre killed $x19, killed $x20, $sp, -2 +# CHECK-NEXT: frame-setup SEH_SaveRegP_X 19, 20, -16 +# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 16, 0 +# CHECK-NEXT: frame-setup SEH_StackAlloc 16 +# CHECK-NEXT: frame-setup SEH_PrologEnd + +--- | + + define dso_local i32 @func(i32 %a) optsize { ret i32 %a } + +... +--- +name: func +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +registers: [] +liveins: [] +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 4 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 0 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + localFrameSize: 4 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: + - { id: 0, name: '', type: default, offset: 0, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + local-offset: -4, debug-info-variable: '', debug-info-expression: '', + debug-info-location: '' } +callSites: [] +constants: [] +machineFunctionInfo: {} +body: | + bb.0: + liveins: $x0, $x19, $x20 + + renamable $x8 = ADDXri %stack.0, 0, 0 + $x19 = ADDXrr $x0, $x8 + $x20 = ADDXrr $x19, $x0 + $x0 = ADDXrr $x0, killed $x20 + + RET_ReallyLR + +... -- 2.7.4