From ed74304506a0c97860bad5c91b0b7ccb59ab181d Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Fri, 24 Jun 2022 16:51:46 -0700 Subject: [PATCH] [BOLT] Fix EH trampoline backout code When SplitFunctions pass adds a trampoline code for exception landing pads (limited to shared objects), it may increase the size of the hot fragment making it larger than the whole function pre-split. When this happens, the pass reverts the splitting action by restoring the original block order and marking all blocks hot. However, if createEHTrampolines() added new blocks to the CFG and modified invoke instructions, simply restoring the original block layout will not suffice as the new CFG has more blocks. For proper backout of the split, modify the original layout by merging in trampoline blocks immediately before their matching targets. As a result, the number of blocks increases, but the number of instructions and the function size remains the same as pre-split. Add an assertion for the number of blocks when updating a function layout. Reviewed By: rafauler Differential Revision: https://reviews.llvm.org/D128696 --- bolt/include/bolt/Core/BinaryFunction.h | 3 +- bolt/include/bolt/Passes/SplitFunctions.h | 14 +- bolt/lib/Passes/SplitFunctions.cpp | 43 ++- .../X86/Inputs/pie-exceptions-failed-split.s | 293 +++++++++++++++++++++ .../runtime/X86/pie-exceptions-failed-split.test | 36 +++ 5 files changed, 379 insertions(+), 10 deletions(-) create mode 100644 bolt/test/runtime/X86/Inputs/pie-exceptions-failed-split.s create mode 100644 bolt/test/runtime/X86/pie-exceptions-failed-split.test diff --git a/bolt/include/bolt/Core/BinaryFunction.h b/bolt/include/bolt/Core/BinaryFunction.h index 3a84b26..dbea712 100644 --- a/bolt/include/bolt/Core/BinaryFunction.h +++ b/bolt/include/bolt/Core/BinaryFunction.h @@ -887,8 +887,9 @@ public: /// Update layout of basic blocks used for output. void updateBasicBlockLayout(BasicBlockOrderType &NewLayout) { - BasicBlocksPreviousLayout = BasicBlocksLayout; + assert(NewLayout.size() == BasicBlocks.size() && "Layout size mismatch."); + BasicBlocksPreviousLayout = BasicBlocksLayout; if (NewLayout != BasicBlocksLayout) { ModifiedLayout = true; BasicBlocksLayout.clear(); diff --git a/bolt/include/bolt/Passes/SplitFunctions.h b/bolt/include/bolt/Passes/SplitFunctions.h index 287ca58..84ff02a 100644 --- a/bolt/include/bolt/Passes/SplitFunctions.h +++ b/bolt/include/bolt/Passes/SplitFunctions.h @@ -23,12 +23,24 @@ private: template void splitFunction(BinaryFunction &Function, SplitStrategy Strategy = {}); + /// Map basic block labels to their trampoline block labels. + using TrampolineSetType = DenseMap; + + using BasicBlockOrderType = BinaryFunction::BasicBlockOrderType; + /// Create trampoline landing pads for exception handling code to guarantee /// that every landing pad is placed in the same function fragment as the /// corresponding thrower block. The trampoline landing pad, when created, /// will redirect the execution to the real landing pad in a different /// fragment. - void createEHTrampolines(BinaryFunction &Function) const; + TrampolineSetType createEHTrampolines(BinaryFunction &Function) const; + + /// Merge trampolines into \p Layout without trampolines. The merge will place + /// a trampoline immediately before its destination. Used to revert the effect + /// of trampolines after createEHTrampolines(). + BasicBlockOrderType + mergeEHTrampolines(BinaryFunction &BF, BasicBlockOrderType &Layout, + const TrampolineSetType &Trampolines) const; std::atomic SplitBytesHot{0ull}; std::atomic SplitBytesCold{0ull}; diff --git a/bolt/lib/Passes/SplitFunctions.cpp b/bolt/lib/Passes/SplitFunctions.cpp index 4132bdf..a0cba7c 100644 --- a/bolt/lib/Passes/SplitFunctions.cpp +++ b/bolt/lib/Passes/SplitFunctions.cpp @@ -281,11 +281,12 @@ void SplitFunctions::splitFunction(BinaryFunction &BF, SplitStrategy Strategy) { // Separate hot from cold starting from the bottom. Strategy.partition(BF.layout_rbegin(), BF.layout_rend()); - // For shared objects, place invoke instructions and corresponding landing - // pads in the same fragment. To reduce hot code size, create trampoline - // landing pads that will redirect the execution to the real LP. + // For shared objects, invoke instructions and corresponding landing pads + // have to be placed in the same fragment. When we split them, create + // trampoline landing pads that will redirect the execution to real LPs. + TrampolineSetType Trampolines; if (!BC.HasFixedLoadAddress && BF.hasEHRanges() && BF.isSplit()) - createEHTrampolines(BF); + Trampolines = createEHTrampolines(BF); // Check the new size to see if it's worth splitting the function. if (BC.isX86() && BF.isSplit()) { @@ -300,6 +301,12 @@ void SplitFunctions::splitFunction(BinaryFunction &BF, SplitStrategy Strategy) { << Twine::utohexstr(ColdSize) << " -> 0x" << Twine::utohexstr(OriginalHotSize) << '\n'); + // Reverse the action of createEHTrampolines(). The trampolines will be + // placed immediately before the matching destination resulting in no + // extra code. + if (PreSplitLayout.size() != BF.size()) + PreSplitLayout = mergeEHTrampolines(BF, PreSplitLayout, Trampolines); + BF.updateBasicBlockLayout(PreSplitLayout); for (BinaryBasicBlock &BB : BF) BB.setIsCold(false); @@ -310,11 +317,12 @@ void SplitFunctions::splitFunction(BinaryFunction &BF, SplitStrategy Strategy) { } } -void SplitFunctions::createEHTrampolines(BinaryFunction &BF) const { +SplitFunctions::TrampolineSetType +SplitFunctions::createEHTrampolines(BinaryFunction &BF) const { const auto &MIB = BF.getBinaryContext().MIB; // Map real landing pads to the corresponding trampolines. - std::unordered_map LPTrampolines; + TrampolineSetType LPTrampolines; // Iterate over the copy of basic blocks since we are adding new blocks to the // function which will invalidate its iterators. @@ -344,7 +352,7 @@ void SplitFunctions::createEHTrampolines(BinaryFunction &BF) const { TrampolineBB->addSuccessor(LPBlock, TrampolineBB->getExecutionCount()); TrampolineBB->setCFIState(LPBlock->getCFIState()); TrampolineLabel = TrampolineBB->getLabel(); - LPTrampolines.emplace(std::make_pair(LPLabel, TrampolineLabel)); + LPTrampolines.insert(std::make_pair(LPLabel, TrampolineLabel)); } // Substitute the landing pad with the trampoline. @@ -354,7 +362,7 @@ void SplitFunctions::createEHTrampolines(BinaryFunction &BF) const { } if (LPTrampolines.empty()) - return; + return LPTrampolines; // All trampoline blocks were added to the end of the function. Place them at // the end of corresponding fragments. @@ -368,6 +376,25 @@ void SplitFunctions::createEHTrampolines(BinaryFunction &BF) const { // Update exception-handling CFG for the function. BF.recomputeLandingPads(); + + return LPTrampolines; +} + +SplitFunctions::BasicBlockOrderType SplitFunctions::mergeEHTrampolines( + BinaryFunction &BF, SplitFunctions::BasicBlockOrderType &Layout, + const SplitFunctions::TrampolineSetType &Trampolines) const { + BasicBlockOrderType MergedLayout; + for (BinaryBasicBlock *BB : Layout) { + auto Iter = Trampolines.find(BB->getLabel()); + if (Iter != Trampolines.end()) { + BinaryBasicBlock *LPBlock = BF.getBasicBlockForLabel(Iter->second); + assert(LPBlock && "Could not find matching landing pad block."); + MergedLayout.push_back(LPBlock); + } + MergedLayout.push_back(BB); + } + + return MergedLayout; } } // namespace bolt diff --git a/bolt/test/runtime/X86/Inputs/pie-exceptions-failed-split.s b/bolt/test/runtime/X86/Inputs/pie-exceptions-failed-split.s new file mode 100644 index 0000000..1ead293 --- /dev/null +++ b/bolt/test/runtime/X86/Inputs/pie-exceptions-failed-split.s @@ -0,0 +1,293 @@ +# Assembly generated from building the followingC++ code with the following +# command using trunk clang. Then, basic block at .LBB1_7 was moved before the +# landing pad. +# +# clang --target=x86_64-linux -O2 -fPIC -fno-inline exceptions-failed-split.cpp +# +# #include +# #include +# +# void bar(int a) { +# if (a > 2 && a % 2) +# throw new int(); +# } +# +# uint64_t throw_test(int argc, char **argv) { +# uint64_t rv = 0; +# +# if (argc == 99) +# return 0; +# +# uint64_t limit = (argc >= 2 ? 10 : 5000); +# for (uint64_t i = 0; i < limit; ++i) { +# rv += i; +# try { +# bar(argc); +# } catch (...) { +# } +# } +# +# if (argc == 5) +# return 0; +# +# if (argc == 7) +# return 0; +# +# if (argc >= 103 && argc <= 203) +# return 0; +# +# if (*argv == 0) +# return 0; +# +# if (argc >= 13 && argc <= 23) +# return 0; +# +# return rv; +# } +# +# int main(int argc, char **argv) { +# return !throw_test(argc, argv); +# } + + .text + .file "exceptions-failed-split.cpp" + .globl _Z3bari # -- Begin function _Z3bari + .p2align 4, 0x90 + .type _Z3bari,@function +_Z3bari: # @_Z3bari +.Lfunc_begin0: + .cfi_startproc + .cfi_personality 155, DW.ref.__gxx_personality_v0 + .cfi_lsda 27, .Lexception0 +# %bb.0: # %entry + pushq %r14 + .cfi_def_cfa_offset 16 + pushq %rbx + .cfi_def_cfa_offset 24 + pushq %rax + .cfi_def_cfa_offset 32 + .cfi_offset %rbx, -24 + .cfi_offset %r14, -16 + cmpl $3, %edi + jl .LBB0_5 +# %bb.1: # %entry + andl $1, %edi + jne .LBB0_2 +.LBB0_5: # %if.end + addq $8, %rsp + .cfi_def_cfa_offset 24 + popq %rbx + .cfi_def_cfa_offset 16 + popq %r14 + .cfi_def_cfa_offset 8 + retq +.LBB0_2: # %if.then + .cfi_def_cfa_offset 32 + movl $8, %edi + callq __cxa_allocate_exception@PLT + movq %rax, %rbx +.Ltmp0: + movl $4, %edi + callq _Znwm@PLT +.Ltmp1: +# %bb.3: # %invoke.cont + movl $0, (%rax) + movq %rax, (%rbx) + movq _ZTIPi@GOTPCREL(%rip), %rsi + movq %rbx, %rdi + xorl %edx, %edx + callq __cxa_throw@PLT +.LBB0_4: # %lpad +.Ltmp2: + movq %rax, %r14 + movq %rbx, %rdi + callq __cxa_free_exception@PLT + movq %r14, %rdi + callq _Unwind_Resume@PLT +.Lfunc_end0: + .size _Z3bari, .Lfunc_end0-_Z3bari + .cfi_endproc + .section .gcc_except_table,"a",@progbits + .p2align 2 +GCC_except_table0: +.Lexception0: + .byte 255 # @LPStart Encoding = omit + .byte 255 # @TType Encoding = omit + .byte 1 # Call site Encoding = uleb128 + .uleb128 .Lcst_end0-.Lcst_begin0 +.Lcst_begin0: + .uleb128 .Lfunc_begin0-.Lfunc_begin0 # >> Call Site 1 << + .uleb128 .Ltmp0-.Lfunc_begin0 # Call between .Lfunc_begin0 and .Ltmp0 + .byte 0 # has no landing pad + .byte 0 # On action: cleanup + .uleb128 .Ltmp0-.Lfunc_begin0 # >> Call Site 2 << + .uleb128 .Ltmp1-.Ltmp0 # Call between .Ltmp0 and .Ltmp1 + .uleb128 .Ltmp2-.Lfunc_begin0 # jumps to .Ltmp2 + .byte 0 # On action: cleanup + .uleb128 .Ltmp1-.Lfunc_begin0 # >> Call Site 3 << + .uleb128 .Lfunc_end0-.Ltmp1 # Call between .Ltmp1 and .Lfunc_end0 + .byte 0 # has no landing pad + .byte 0 # On action: cleanup +.Lcst_end0: + .p2align 2 + # -- End function + .text + .globl _Z10throw_testiPPc # -- Begin function _Z10throw_testiPPc + .p2align 4, 0x90 + .type _Z10throw_testiPPc,@function +_Z10throw_testiPPc: # @_Z10throw_testiPPc +.Lfunc_begin1: + .cfi_startproc + .cfi_personality 155, DW.ref.__gxx_personality_v0 + .cfi_lsda 27, .Lexception1 +# %bb.0: # %entry + pushq %r15 + .cfi_def_cfa_offset 16 + pushq %r14 + .cfi_def_cfa_offset 24 + pushq %r13 + .cfi_def_cfa_offset 32 + pushq %r12 + .cfi_def_cfa_offset 40 + pushq %rbx + .cfi_def_cfa_offset 48 + .cfi_offset %rbx, -48 + .cfi_offset %r12, -40 + .cfi_offset %r13, -32 + .cfi_offset %r14, -24 + .cfi_offset %r15, -16 + cmpl $99, %edi + je .LBB1_7 +# %bb.2: # %if.end + movq %rsi, %r15 + movl %edi, %r14d + cmpl $2, %edi + movl $10, %eax + movl $5000, %r12d # imm = 0x1388 + cmovgeq %rax, %r12 + xorl %r13d, %r13d + xorl %ebx, %ebx + .p2align 4, 0x90 +.LBB1_3: # %for.body + # =>This Inner Loop Header: Depth=1 +.Ltmp3: + movl %r14d, %edi + callq _Z3bari@PLT +.Ltmp4: +.LBB1_4: # %for.inc + # in Loop: Header=BB1_3 Depth=1 + addq %rbx, %r13 + incq %rbx + cmpq %rbx, %r12 + jne .LBB1_3 + jmp .LBB1_6 +.LBB1_7: + xorl %r13d, %r13d + jmp .LBB1_8 +.LBB1_5: # %lpad + # in Loop: Header=BB1_3 Depth=1 +.Ltmp5: + movq %rax, %rdi + callq __cxa_begin_catch@PLT + callq __cxa_end_catch@PLT + jmp .LBB1_4 +.LBB1_6: # %for.cond.cleanup + movl %r14d, %eax + orl $2, %eax + cmpl $7, %eax + jne .LBB1_9 + jmp .LBB1_7 +.LBB1_8: # %cleanup21 + movq %r13, %rax + popq %rbx + .cfi_def_cfa_offset 40 + popq %r12 + .cfi_def_cfa_offset 32 + popq %r13 + .cfi_def_cfa_offset 24 + popq %r14 + .cfi_def_cfa_offset 16 + popq %r15 + .cfi_def_cfa_offset 8 + retq +.LBB1_9: # %if.end8 + .cfi_def_cfa_offset 48 + leal -103(%r14), %eax + cmpl $101, %eax + jb .LBB1_7 +# %bb.11: # %if.end12 + cmpq $0, (%r15) + je .LBB1_7 +# %bb.12: # %if.end15 + addl $-13, %r14d + xorl %eax, %eax + cmpl $11, %r14d + cmovbq %rax, %r13 + jmp .LBB1_8 +.Lfunc_end1: + .size _Z10throw_testiPPc, .Lfunc_end1-_Z10throw_testiPPc + .cfi_endproc + .section .gcc_except_table,"a",@progbits + .p2align 2 +GCC_except_table1: +.Lexception1: + .byte 255 # @LPStart Encoding = omit + .byte 155 # @TType Encoding = indirect pcrel sdata4 + .uleb128 .Lttbase0-.Lttbaseref0 +.Lttbaseref0: + .byte 1 # Call site Encoding = uleb128 + .uleb128 .Lcst_end1-.Lcst_begin1 +.Lcst_begin1: + .uleb128 .Ltmp3-.Lfunc_begin1 # >> Call Site 1 << + .uleb128 .Ltmp4-.Ltmp3 # Call between .Ltmp3 and .Ltmp4 + .uleb128 .Ltmp5-.Lfunc_begin1 # jumps to .Ltmp5 + .byte 1 # On action: 1 + .uleb128 .Ltmp4-.Lfunc_begin1 # >> Call Site 2 << + .uleb128 .Lfunc_end1-.Ltmp4 # Call between .Ltmp4 and .Lfunc_end1 + .byte 0 # has no landing pad + .byte 0 # On action: cleanup +.Lcst_end1: + .byte 1 # >> Action Record 1 << + # Catch TypeInfo 1 + .byte 0 # No further actions + .p2align 2 + # >> Catch TypeInfos << + .long 0 # TypeInfo 1 +.Lttbase0: + .p2align 2 + # -- End function + .text + .globl main # -- Begin function main + .p2align 4, 0x90 + .type main,@function +main: # @main + .cfi_startproc +# %bb.0: # %entry + pushq %rax + .cfi_def_cfa_offset 16 + callq _Z10throw_testiPPc@PLT + xorl %ecx, %ecx + testq %rax, %rax + sete %cl + movl %ecx, %eax + popq %rcx + .cfi_def_cfa_offset 8 + retq +.Lfunc_end2: + .size main, .Lfunc_end2-main + .cfi_endproc + # -- End function + .hidden DW.ref.__gxx_personality_v0 + .weak DW.ref.__gxx_personality_v0 + .section .data.DW.ref.__gxx_personality_v0,"aGw",@progbits,DW.ref.__gxx_personality_v0,comdat + .p2align 3 + .type DW.ref.__gxx_personality_v0,@object + .size DW.ref.__gxx_personality_v0, 8 +DW.ref.__gxx_personality_v0: + .quad __gxx_personality_v0 + .ident "clang version 15.0.0" + .section ".note.GNU-stack","",@progbits + .addrsig + .addrsig_sym __gxx_personality_v0 + .addrsig_sym _Unwind_Resume + .addrsig_sym _ZTIPi diff --git a/bolt/test/runtime/X86/pie-exceptions-failed-split.test b/bolt/test/runtime/X86/pie-exceptions-failed-split.test new file mode 100644 index 0000000..847647e --- /dev/null +++ b/bolt/test/runtime/X86/pie-exceptions-failed-split.test @@ -0,0 +1,36 @@ +REQUIRES: system-linux + +RUN: %clangxx %cxxflags -pie -fPIC %p/Inputs/pie-exceptions-failed-split.s \ +RUN: -Wl,-q -o %t +RUN: llvm-bolt %t -o %t.instr --instrument --instrumentation-file=%t.fdata + +## Record profile with invocation that does not throw exceptions. +RUN: %t.instr + +RUN: llvm-bolt %t -o %t.bolt --data %t.fdata --reorder-blocks=ext-tsp \ +RUN: --split-functions --split-eh --print-after-lowering \ +RUN: --print-only=_Z10throw_testiPPc 2>&1 | FileCheck %s + +## Hot code in the test case gets larger after splitting because of jump +## instruction relaxation. Check that BOLT reverts the split correctly. +CHECK: Binary Function "_Z10throw_testiPPc" +CHECK: IsSplit : +CHECK-SAME: 0 + +## Check that the landing pad trampoline was created, but contains no +## instructions and falls to the real landing pad. +CHECK: {{^[^[:space:]]+}} (0 instructions +CHECK-NEXT: Landing Pad{{$}} +CHECK: Exec Count +CHECK-SAME: : 0 +CHECK: Successors: +CHECK-SAME: [[LP:[^[:space:]]+]] +CHECK-EMPTY: +CHECK-NEXT: [[LP]] +CHECK-DAG: Exec Count +CHECK-NOT: Exec Count +CHECK-DAG: callq __cxa_begin_catch + +## Verify the output still executes correctly when the exception path is being +## taken. +RUN: %t.bolt arg1 arg2 arg3 -- 2.7.4