/// Set of functions that require the stack to be 16B aligned
DenseSet<const BinaryFunction *> FunctionsRequireAlignment;
+ /// Set of functions that performs computations with stack addresses and
+ /// complicates our understanding of aliasing of stack spaces.
+ DenseSet<const BinaryFunction *> FunctionsWithStackArithmetic;
+
/// Owns ArgAccesses for all instructions. References to elements are
/// attached to instructions as indexes to this vector, in MCAnnotations.
std::vector<ArgAccesses> ArgAccessesVector;
return FunctionsRequireAlignment.count(&Func);
}
+ /// Return true if \p Func does computation with the address of any stack
+ /// position, meaning we have limited alias analysis on this function.
+ bool hasStackArithmetic(const BinaryFunction &Func) const {
+ return FunctionsWithStackArithmetic.count(&Func);
+ }
+
/// Functions for retrieving our specific MCAnnotation data from instructions
ErrorOr<ArgAccesses &> getArgAccessesFor(const MCInst &Inst);
static cl::opt<bool> TimeFA("time-fa", cl::desc("time frame analysis steps"),
cl::ReallyHidden, cl::cat(BoltOptCategory));
+static cl::opt<bool>
+ ExperimentalSW("experimental-shrink-wrapping",
+ cl::desc("process functions with stack pointer arithmetic"),
+ cl::ReallyHidden, cl::ZeroOrMore, cl::cat(BoltOptCategory));
+
bool shouldFrameOptimize(const llvm::bolt::BinaryFunction &Function) {
if (Function.hasUnknownControlFlow())
return false;
const MCInst *Prev{nullptr};
/// Info about the last frame access
bool IsValidAccess{false};
+ bool EscapesStackAddress{false};
FrameIndexEntry FIE;
bool decodeFrameAccess(const MCInst &Inst) {
return true;
}
- if (IsIndexed || FIE.Size == 0) {
+ if (IsIndexed || (!FIE.Size && (FIE.IsLoad || FIE.IsStore))) {
LLVM_DEBUG(dbgs() << "Giving up on indexed memory access/unknown size\n");
LLVM_DEBUG(dbgs() << "Blame insn: ");
+ LLVM_DEBUG(BC.printInstruction(outs(), Inst, 0, &BF, true, false, false));
LLVM_DEBUG(Inst.dump());
return false;
}
- assert(FIE.Size != 0);
+ assert(FIE.Size != 0 || (!FIE.IsLoad && !FIE.IsStore));
FIE.RegOrImm = SrcImm;
if (FIE.IsLoad || FIE.IsStoreFromReg)
const FrameIndexEntry &getFIE() const { return FIE; }
int getSPOffset() const { return SPOffset; }
bool isValidAccess() const { return IsValidAccess; }
+ bool doesEscapeStackAddress() const { return EscapesStackAddress; }
bool doNext(const BinaryBasicBlock &BB, const MCInst &Inst) {
IsValidAccess = false;
+ EscapesStackAddress = false;
std::tie(SPOffset, FPOffset) =
Prev ? *SPT.getStateAt(*Prev) : *SPT.getStateAt(BB);
Prev = &Inst;
}
if (BC.MIB->escapesVariable(Inst, SPT.HasFramePointer)) {
- LLVM_DEBUG(
- dbgs() << "Leaked stack address, giving up on this function.\n");
- LLVM_DEBUG(dbgs() << "Blame insn: ");
- LLVM_DEBUG(Inst.dump());
- return false;
+ EscapesStackAddress = true;
+ if (!opts::ExperimentalSW) {
+ LLVM_DEBUG(
+ dbgs() << "Leaked stack address, giving up on this function.\n");
+ LLVM_DEBUG(dbgs() << "Blame insn: ");
+ LLVM_DEBUG(Inst.dump());
+ return false;
+ }
}
return decodeFrameAccess(Inst);
FAA.enterNewBB();
for (MCInst &Inst : *BB) {
- if (!FAA.doNext(*BB, Inst)) {
+ if (!FAA.doNext(*BB, Inst) || FAA.doesEscapeStackAddress()) {
ArgsTouchedMap[&BF].emplace(std::make_pair(-1, 0));
NoInfo = true;
break;
dbgs() << "\t\t\tSP offset is " << FAA.getSPOffset() << "\n";
});
+ if (FAA.doesEscapeStackAddress()) {
+ if (!FunctionsWithStackArithmetic.count(&BF))
+ FunctionsWithStackArithmetic.insert(&BF);
+ continue;
+ }
+
if (!FAA.isValidAccess())
continue;
{
NamedRegionTimer T1("removeloads", "remove loads", "FOP", "FOP breakdown",
opts::TimeOpts);
- removeUnnecessaryLoads(*RA, *FA, I.second);
+ if (!FA->hasStackArithmetic(I.second))
+ removeUnnecessaryLoads(*RA, *FA, I.second);
}
if (opts::RemoveStores) {
NamedRegionTimer T1("removestores", "remove stores", "FOP",
"FOP breakdown", opts::TimeOpts);
- removeUnusedStores(*FA, I.second);
+ if (!FA->hasStackArithmetic(I.second))
+ removeUnusedStores(*FA, I.second);
}
// Don't even start shrink wrapping if no profiling info is available
if (I.second.getKnownExecutionCount() == 0)
});
return false;
}
+ if (FA.hasStackArithmetic(BF)) {
+ LLVM_DEBUG({
+ dbgs() << "Reg " << CSR
+ << " is not using push/pops due to function "
+ "taking the address of a stack position.\n";
+ });
+ return false;
+ }
for (MCInst *Save : CSA.getSavesByReg(CSR)) {
if (!SLM.canCollapseRegion(Save)) {
LLVM_DEBUG(dbgs() << "Reg " << CSR << " cannot collapse region.\n");
bool IsLoad = MCII.mayLoad();
bool IsStore = MCII.mayStore();
// Is it LEA? (deals with memory but is not loading nor storing)
- if (!IsLoad && !IsStore)
- return false;
+ if (!IsLoad && !IsStore) {
+ I = {0, IsLoad, IsStore, false, false};
+ break;
+ }
uint8_t Sz = getMemDataSize(Inst, MemOpNo);
I = {Sz, IsLoad, IsStore, false, false};
break;
--- /dev/null
+# This checks that frame optimizer does not try to optimize away caller-saved
+# regs when we do not have complete aliasing info (when there is an LEA
+# instruction and the function does arithmetic with stack addresses).
+
+
+# REQUIRES: system-linux
+
+# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown \
+# RUN: %s -o %t.o
+# RUN: link_fdata %s %t.o %t.fdata
+# RUN: llvm-strip --strip-unneeded %t.o
+# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q -nostdlib
+# RUN: llvm-bolt %t.exe -relocs -o %t.out -data %t.fdata \
+# RUN: -frame-opt=all -simplify-conditional-tail-calls=false \
+# RUN: -lite=0 -eliminate-unreachable=false | FileCheck %s
+# RUN: llvm-objdump -d %t.out --print-imm-hex | \
+# RUN: FileCheck --check-prefix CHECK-OBJDUMP %s
+
+ .globl foo
+ .type foo, %function
+foo:
+ .cfi_startproc
+ movq $0, (%rsi)
+ ret
+ .cfi_endproc
+ .size foo, .-foo
+
+
+ .globl _start
+ .type _start, %function
+_start:
+ .cfi_startproc
+# FDATA: 0 [unknown] 0 1 _start 0 0 1
+ push %rbp
+ mov %rsp, %rbp
+ subq $0x20, %rsp
+ je b
+c:
+ addq $0x20, %rsp
+ pop %rbp
+ ret
+b:
+ movq %rdi, %r13
+ movq %r13, -0x08(%rbp)
+ leaq -0x08(%rbp), %rsi
+ callq foo
+ movq -0x08(%rbp), %r13
+ jmp c
+ .cfi_endproc
+ .size _start, .-_start
+
+
+# CHECK: BOLT-INFO: FOP deleted 0 load(s) (dyn count: 0) and 0 store(s)
+
+# CHECK-OBJDUMP: <_start>:
+# CHECK-OBJDUMP: movq %rdi, %r13
+# CHECK-OBJDUMP-NEXT: movq %r13, -0x8(%rbp)
+# CHECK-OBJDUMP-NEXT: leaq
+# CHECK-OBJDUMP-NEXT: callq
+# CHECK-OBJDUMP-NEXT: movq -0x8(%rbp), %r13
--- /dev/null
+# This checks that shrink wrapping correctly drops moving push/pops when
+# there is an LEA instruction.
+
+
+# REQUIRES: system-linux
+
+# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown \
+# RUN: %s -o %t.o
+# RUN: link_fdata %s %t.o %t.fdata
+# RUN: llvm-strip --strip-unneeded %t.o
+# RUN: ld.lld %t.o -o %t.exe -q
+# RUN: llvm-bolt %t.exe -relocs -o %t.out -data %t.fdata \
+# RUN: -frame-opt=all -simplify-conditional-tail-calls=false \
+# RUN: -experimental-shrink-wrapping \
+# RUN: -eliminate-unreachable=false | FileCheck %s
+# RUN: llvm-objdump -d %t.out --print-imm-hex | \
+# RUN: FileCheck --check-prefix CHECK-OBJDUMP %s
+
+ .globl _start
+ .type _start, %function
+_start:
+ .cfi_startproc
+# FDATA: 0 [unknown] 0 1 _start 0 0 1
+ push %rbp
+ mov %rsp, %rbp
+ push %rbx
+ push %r14
+ subq $0x20, %rsp
+ je b
+c:
+ addq $0x20, %rsp
+ pop %r14
+ pop %rbx
+ pop %rbp
+ ret
+b:
+ je f
+ jmp *JT(,%rdi,8)
+d:
+ mov %r14, %rdi
+ mov %rbx, %rdi
+ leaq -0x20(%rbp), %r14
+ movq -0x20(%rbp), %rdi
+f:
+ addq $0x20, %rsp
+ pop %r14
+ pop %rbx
+ pop %rbp
+ ret
+ .cfi_endproc
+ .size _start, .-_start
+ .data
+JT:
+ .quad c
+ .quad d
+ .quad f
+
+
+# CHECK: BOLT-INFO: Shrink wrapping moved 2 spills inserting load/stores and 0 spills inserting push/pops
+
+# Checks that offsets of instructions accessing the stack were not changed
+# CHECK-OBJDUMP: <_start>:
+# CHECK-OBJDUMP: movq %rbx, %rdi
+# CHECK-OBJDUMP-NEXT: leaq -0x20(%rbp), %r14
+# CHECK-OBJDUMP: movq -0x20(%rbp), %rdi
--- /dev/null
+# This checks that shrink wrapping correctly drops moving push/pops when
+# there is a MOV instruction loading the value of the stack pointer in
+# order to do pointer arithmetic with a stack address.
+
+
+# REQUIRES: system-linux
+
+# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown \
+# RUN: %s -o %t.o
+# RUN: link_fdata %s %t.o %t.fdata
+# RUN: llvm-strip --strip-unneeded %t.o
+# RUN: ld.lld %t.o -o %t.exe -q
+# RUN: llvm-bolt %t.exe -relocs -o %t.out -data %t.fdata \
+# RUN: -frame-opt=all -simplify-conditional-tail-calls=false \
+# RUN: -experimental-shrink-wrapping \
+# RUN: -eliminate-unreachable=false | FileCheck %s
+
+ .globl _start
+ .type _start, %function
+_start:
+ .cfi_startproc
+# FDATA: 0 [unknown] 0 1 _start 0 0 1
+ push %rbp
+ mov %rsp, %rbp
+ push %rbx
+ push %r14
+ subq $0x20, %rsp
+ je b
+c:
+ addq $0x20, %rsp
+ pop %r14
+ pop %rbx
+ pop %rbp
+ ret
+b:
+ je f
+ jmp *JT(,%rdi,8)
+d:
+ mov %r14, %rdi
+ mov %rbx, %rdi
+ mov %rbp, %rdi
+ sub 0x20, %rdi
+f:
+ addq $0x20, %rsp
+ pop %r14
+ pop %rbx
+ pop %rbp
+ ret
+ .cfi_endproc
+ .size _start, .-_start
+ .data
+JT:
+ .quad c
+ .quad d
+ .quad f
+
+
+# CHECK: BOLT-INFO: Shrink wrapping moved 2 spills inserting load/stores and 0 spills inserting push/pops
pushf
push %rbx
push %rbp
+ addq $0x50, -0x30(%rbp)
pop %rbp
pop %rbx
popf
--- /dev/null
+# This checks that shrink wrapping uses the red zone defined in the X86 ABI by
+# placing restores that access elements already deallocated by the stack.
+
+# REQUIRES: system-linux
+
+# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown \
+# RUN: %s -o %t.o
+# RUN: link_fdata %s %t.o %t.fdata
+# RUN: llvm-strip --strip-unneeded %t.o
+# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q -nostdlib
+# RUN: llvm-bolt -relocs %t.exe -o %t.out -data %t.fdata \
+# RUN: -frame-opt=all -simplify-conditional-tail-calls=false \
+# RUN: -experimental-shrink-wrapping \
+# RUN: -eliminate-unreachable=false | FileCheck %s
+# RUN: llvm-objdump -d %t.out --print-imm-hex | \
+# RUN: FileCheck --check-prefix CHECK-OBJDUMP %s
+
+
+# Here we create a CFG where the restore position matches the previous (deleted)
+# restore position. Shrink wrapping then will put a stack access to an element
+# that was deallocated at the previously deleted POP, which falls in the red
+# zone and should be safe for X86 Linux ABI.
+ .globl _start
+ .type _start, %function
+_start:
+ .cfi_startproc
+# FDATA: 0 [unknown] 0 1 _start 0 0 1
+ push %rbp
+ mov %rsp, %rbp
+ push %rbx
+ push %r14
+ subq $0x20, %rsp
+b: je hot_path
+# FDATA: 1 _start #b# 1 _start #hot_path# 0 1
+cold_path:
+ mov %r14, %rdi
+ mov %rbx, %rdi
+ movq rel(%rip), %rdi # Add this to create a relocation and run bolt w/ relocs
+ leaq -0x20(%rbp), %r14
+ movq -0x20(%rbp), %rdi
+ leaq -0x10(%rbp), %rsp
+ pop %r14
+ pop %rbx
+ pop %rbp
+ ret
+hot_path:
+ addq $0x20, %rsp
+ pop %r14
+ pop %rbx
+ pop %rbp
+ ret
+ .cfi_endproc
+ .size _start, .-_start
+
+ .data
+rel: .quad cold_path
+
+# CHECK: BOLT-INFO: Shrink wrapping moved 2 spills inserting load/stores and 0 spills inserting push/pops
+
+# CHECK-OBJDUMP: <_start>:
+# CHECK-OBJDUMP: leaq (%rbp), %rsp
+# CHECK-OBJDUMP-NEXT: popq %rbp
+# CHECK-OBJDUMP-NEXT: movq -0x10(%rsp), %rbx
+# CHECK-OBJDUMP-NEXT: movq -0x18(%rsp), %r14
+# CHECK-OBJDUMP-NEXT: retq