[BOLT] Fix EH trampoline backout code
authorMaksim Panchenko <maks@fb.com>
Fri, 24 Jun 2022 23:51:46 +0000 (16:51 -0700)
committerMaksim Panchenko <maks@fb.com>
Wed, 29 Jun 2022 21:35:57 +0000 (14:35 -0700)
When SplitFunctions pass adds a trampoline code for exception landing
pads (limited to shared objects), it may increase the size of the hot
fragment making it larger than the whole function pre-split. When this
happens, the pass reverts the splitting action by restoring the original
block order and marking all blocks hot.

However, if createEHTrampolines() added new blocks to the CFG and
modified invoke instructions, simply restoring the original block layout
will not suffice as the new CFG has more blocks.

For proper backout of the split, modify the original layout by merging
in trampoline blocks immediately before their matching targets. As a
result, the number of blocks increases, but the number of instructions
and the function size remains the same as pre-split.

Add an assertion for the number of blocks when updating a function
layout.

Reviewed By: rafauler

Differential Revision: https://reviews.llvm.org/D128696

bolt/include/bolt/Core/BinaryFunction.h
bolt/include/bolt/Passes/SplitFunctions.h
bolt/lib/Passes/SplitFunctions.cpp
bolt/test/runtime/X86/Inputs/pie-exceptions-failed-split.s [new file with mode: 0644]
bolt/test/runtime/X86/pie-exceptions-failed-split.test [new file with mode: 0644]

index 3a84b26..dbea712 100644 (file)
@@ -887,8 +887,9 @@ public:
 
   /// Update layout of basic blocks used for output.
   void updateBasicBlockLayout(BasicBlockOrderType &NewLayout) {
-    BasicBlocksPreviousLayout = BasicBlocksLayout;
+    assert(NewLayout.size() == BasicBlocks.size() && "Layout size mismatch.");
 
+    BasicBlocksPreviousLayout = BasicBlocksLayout;
     if (NewLayout != BasicBlocksLayout) {
       ModifiedLayout = true;
       BasicBlocksLayout.clear();
index 287ca58..84ff02a 100644 (file)
@@ -23,12 +23,24 @@ private:
   template <typename SplitStrategy>
   void splitFunction(BinaryFunction &Function, SplitStrategy Strategy = {});
 
+  /// Map basic block labels to their trampoline block labels.
+  using TrampolineSetType = DenseMap<const MCSymbol *, const MCSymbol *>;
+
+  using BasicBlockOrderType = BinaryFunction::BasicBlockOrderType;
+
   /// Create trampoline landing pads for exception handling code to guarantee
   /// that every landing pad is placed in the same function fragment as the
   /// corresponding thrower block. The trampoline landing pad, when created,
   /// will redirect the execution to the real landing pad in a different
   /// fragment.
-  void createEHTrampolines(BinaryFunction &Function) const;
+  TrampolineSetType createEHTrampolines(BinaryFunction &Function) const;
+
+  /// Merge trampolines into \p Layout without trampolines. The merge will place
+  /// a trampoline immediately before its destination. Used to revert the effect
+  /// of trampolines after createEHTrampolines().
+  BasicBlockOrderType
+  mergeEHTrampolines(BinaryFunction &BF, BasicBlockOrderType &Layout,
+                     const TrampolineSetType &Trampolines) const;
 
   std::atomic<uint64_t> SplitBytesHot{0ull};
   std::atomic<uint64_t> SplitBytesCold{0ull};
index 4132bdf..a0cba7c 100644 (file)
@@ -281,11 +281,12 @@ void SplitFunctions::splitFunction(BinaryFunction &BF, SplitStrategy Strategy) {
   // Separate hot from cold starting from the bottom.
   Strategy.partition(BF.layout_rbegin(), BF.layout_rend());
 
-  // For shared objects, place invoke instructions and corresponding landing
-  // pads in the same fragment. To reduce hot code size, create trampoline
-  // landing pads that will redirect the execution to the real LP.
+  // For shared objects, invoke instructions and corresponding landing pads
+  // have to be placed in the same fragment. When we split them, create
+  // trampoline landing pads that will redirect the execution to real LPs.
+  TrampolineSetType Trampolines;
   if (!BC.HasFixedLoadAddress && BF.hasEHRanges() && BF.isSplit())
-    createEHTrampolines(BF);
+    Trampolines = createEHTrampolines(BF);
 
   // Check the new size to see if it's worth splitting the function.
   if (BC.isX86() && BF.isSplit()) {
@@ -300,6 +301,12 @@ void SplitFunctions::splitFunction(BinaryFunction &BF, SplitStrategy Strategy) {
                         << Twine::utohexstr(ColdSize) << " -> 0x"
                         << Twine::utohexstr(OriginalHotSize) << '\n');
 
+      // Reverse the action of createEHTrampolines(). The trampolines will be
+      // placed immediately before the matching destination resulting in no
+      // extra code.
+      if (PreSplitLayout.size() != BF.size())
+        PreSplitLayout = mergeEHTrampolines(BF, PreSplitLayout, Trampolines);
+
       BF.updateBasicBlockLayout(PreSplitLayout);
       for (BinaryBasicBlock &BB : BF)
         BB.setIsCold(false);
@@ -310,11 +317,12 @@ void SplitFunctions::splitFunction(BinaryFunction &BF, SplitStrategy Strategy) {
   }
 }
 
-void SplitFunctions::createEHTrampolines(BinaryFunction &BF) const {
+SplitFunctions::TrampolineSetType
+SplitFunctions::createEHTrampolines(BinaryFunction &BF) const {
   const auto &MIB = BF.getBinaryContext().MIB;
 
   // Map real landing pads to the corresponding trampolines.
-  std::unordered_map<const MCSymbol *, const MCSymbol *> LPTrampolines;
+  TrampolineSetType LPTrampolines;
 
   // Iterate over the copy of basic blocks since we are adding new blocks to the
   // function which will invalidate its iterators.
@@ -344,7 +352,7 @@ void SplitFunctions::createEHTrampolines(BinaryFunction &BF) const {
         TrampolineBB->addSuccessor(LPBlock, TrampolineBB->getExecutionCount());
         TrampolineBB->setCFIState(LPBlock->getCFIState());
         TrampolineLabel = TrampolineBB->getLabel();
-        LPTrampolines.emplace(std::make_pair(LPLabel, TrampolineLabel));
+        LPTrampolines.insert(std::make_pair(LPLabel, TrampolineLabel));
       }
 
       // Substitute the landing pad with the trampoline.
@@ -354,7 +362,7 @@ void SplitFunctions::createEHTrampolines(BinaryFunction &BF) const {
   }
 
   if (LPTrampolines.empty())
-    return;
+    return LPTrampolines;
 
   // All trampoline blocks were added to the end of the function. Place them at
   // the end of corresponding fragments.
@@ -368,6 +376,25 @@ void SplitFunctions::createEHTrampolines(BinaryFunction &BF) const {
 
   // Update exception-handling CFG for the function.
   BF.recomputeLandingPads();
+
+  return LPTrampolines;
+}
+
+SplitFunctions::BasicBlockOrderType SplitFunctions::mergeEHTrampolines(
+    BinaryFunction &BF, SplitFunctions::BasicBlockOrderType &Layout,
+    const SplitFunctions::TrampolineSetType &Trampolines) const {
+  BasicBlockOrderType MergedLayout;
+  for (BinaryBasicBlock *BB : Layout) {
+    auto Iter = Trampolines.find(BB->getLabel());
+    if (Iter != Trampolines.end()) {
+      BinaryBasicBlock *LPBlock = BF.getBasicBlockForLabel(Iter->second);
+      assert(LPBlock && "Could not find matching landing pad block.");
+      MergedLayout.push_back(LPBlock);
+    }
+    MergedLayout.push_back(BB);
+  }
+
+  return MergedLayout;
 }
 
 } // namespace bolt
diff --git a/bolt/test/runtime/X86/Inputs/pie-exceptions-failed-split.s b/bolt/test/runtime/X86/Inputs/pie-exceptions-failed-split.s
new file mode 100644 (file)
index 0000000..1ead293
--- /dev/null
@@ -0,0 +1,293 @@
+# Assembly generated from building the followingC++ code with the following
+# command using trunk clang. Then, basic block at .LBB1_7 was moved before the
+# landing pad.
+#
+# clang --target=x86_64-linux -O2 -fPIC -fno-inline exceptions-failed-split.cpp
+#
+#              #include <stdio.h>
+#              #include <stdint.h>
+#
+#              void bar(int a) {
+#                      if (a > 2 && a % 2)
+#                              throw new int();
+#              }
+#
+#              uint64_t throw_test(int argc, char **argv) {
+#                      uint64_t rv = 0;
+#
+#                      if (argc == 99)
+#                              return 0;
+#
+#                      uint64_t limit = (argc >= 2 ? 10 : 5000);
+#                      for (uint64_t i = 0; i < limit; ++i) {
+#                              rv += i;
+#                              try  {
+#                                      bar(argc);
+#                              } catch (...) {
+#                              }
+#                      }
+#
+#                      if (argc == 5)
+#                              return 0;
+#
+#                      if (argc == 7)
+#                              return 0;
+#
+#                      if (argc >= 103 && argc <= 203)
+#                              return 0;
+#
+#                      if (*argv == 0)
+#                              return 0;
+#
+#                      if (argc >= 13 && argc <= 23)
+#                              return 0;
+#
+#                      return rv;
+#              }
+#
+#              int main(int argc, char **argv) {
+#                      return !throw_test(argc, argv);
+#              }
+
+       .text
+       .file   "exceptions-failed-split.cpp"
+       .globl  _Z3bari                         # -- Begin function _Z3bari
+       .p2align        4, 0x90
+       .type   _Z3bari,@function
+_Z3bari:                                # @_Z3bari
+.Lfunc_begin0:
+       .cfi_startproc
+       .cfi_personality 155, DW.ref.__gxx_personality_v0
+       .cfi_lsda 27, .Lexception0
+# %bb.0:                                # %entry
+       pushq   %r14
+       .cfi_def_cfa_offset 16
+       pushq   %rbx
+       .cfi_def_cfa_offset 24
+       pushq   %rax
+       .cfi_def_cfa_offset 32
+       .cfi_offset %rbx, -24
+       .cfi_offset %r14, -16
+       cmpl    $3, %edi
+       jl      .LBB0_5
+# %bb.1:                                # %entry
+       andl    $1, %edi
+       jne     .LBB0_2
+.LBB0_5:                                # %if.end
+       addq    $8, %rsp
+       .cfi_def_cfa_offset 24
+       popq    %rbx
+       .cfi_def_cfa_offset 16
+       popq    %r14
+       .cfi_def_cfa_offset 8
+       retq
+.LBB0_2:                                # %if.then
+       .cfi_def_cfa_offset 32
+       movl    $8, %edi
+       callq   __cxa_allocate_exception@PLT
+       movq    %rax, %rbx
+.Ltmp0:
+       movl    $4, %edi
+       callq   _Znwm@PLT
+.Ltmp1:
+# %bb.3:                                # %invoke.cont
+       movl    $0, (%rax)
+       movq    %rax, (%rbx)
+       movq    _ZTIPi@GOTPCREL(%rip), %rsi
+       movq    %rbx, %rdi
+       xorl    %edx, %edx
+       callq   __cxa_throw@PLT
+.LBB0_4:                                # %lpad
+.Ltmp2:
+       movq    %rax, %r14
+       movq    %rbx, %rdi
+       callq   __cxa_free_exception@PLT
+       movq    %r14, %rdi
+       callq   _Unwind_Resume@PLT
+.Lfunc_end0:
+       .size   _Z3bari, .Lfunc_end0-_Z3bari
+       .cfi_endproc
+       .section        .gcc_except_table,"a",@progbits
+       .p2align        2
+GCC_except_table0:
+.Lexception0:
+       .byte   255                             # @LPStart Encoding = omit
+       .byte   255                             # @TType Encoding = omit
+       .byte   1                               # Call site Encoding = uleb128
+       .uleb128 .Lcst_end0-.Lcst_begin0
+.Lcst_begin0:
+       .uleb128 .Lfunc_begin0-.Lfunc_begin0    # >> Call Site 1 <<
+       .uleb128 .Ltmp0-.Lfunc_begin0           #   Call between .Lfunc_begin0 and .Ltmp0
+       .byte   0                               #     has no landing pad
+       .byte   0                               #   On action: cleanup
+       .uleb128 .Ltmp0-.Lfunc_begin0           # >> Call Site 2 <<
+       .uleb128 .Ltmp1-.Ltmp0                  #   Call between .Ltmp0 and .Ltmp1
+       .uleb128 .Ltmp2-.Lfunc_begin0           #     jumps to .Ltmp2
+       .byte   0                               #   On action: cleanup
+       .uleb128 .Ltmp1-.Lfunc_begin0           # >> Call Site 3 <<
+       .uleb128 .Lfunc_end0-.Ltmp1             #   Call between .Ltmp1 and .Lfunc_end0
+       .byte   0                               #     has no landing pad
+       .byte   0                               #   On action: cleanup
+.Lcst_end0:
+       .p2align        2
+                                        # -- End function
+       .text
+       .globl  _Z10throw_testiPPc              # -- Begin function _Z10throw_testiPPc
+       .p2align        4, 0x90
+       .type   _Z10throw_testiPPc,@function
+_Z10throw_testiPPc:                     # @_Z10throw_testiPPc
+.Lfunc_begin1:
+       .cfi_startproc
+       .cfi_personality 155, DW.ref.__gxx_personality_v0
+       .cfi_lsda 27, .Lexception1
+# %bb.0:                                # %entry
+       pushq   %r15
+       .cfi_def_cfa_offset 16
+       pushq   %r14
+       .cfi_def_cfa_offset 24
+       pushq   %r13
+       .cfi_def_cfa_offset 32
+       pushq   %r12
+       .cfi_def_cfa_offset 40
+       pushq   %rbx
+       .cfi_def_cfa_offset 48
+       .cfi_offset %rbx, -48
+       .cfi_offset %r12, -40
+       .cfi_offset %r13, -32
+       .cfi_offset %r14, -24
+       .cfi_offset %r15, -16
+       cmpl    $99, %edi
+       je      .LBB1_7
+# %bb.2:                                # %if.end
+       movq    %rsi, %r15
+       movl    %edi, %r14d
+       cmpl    $2, %edi
+       movl    $10, %eax
+       movl    $5000, %r12d                    # imm = 0x1388
+       cmovgeq %rax, %r12
+       xorl    %r13d, %r13d
+       xorl    %ebx, %ebx
+       .p2align        4, 0x90
+.LBB1_3:                                # %for.body
+                                        # =>This Inner Loop Header: Depth=1
+.Ltmp3:
+       movl    %r14d, %edi
+       callq   _Z3bari@PLT
+.Ltmp4:
+.LBB1_4:                                # %for.inc
+                                        #   in Loop: Header=BB1_3 Depth=1
+       addq    %rbx, %r13
+       incq    %rbx
+       cmpq    %rbx, %r12
+       jne     .LBB1_3
+       jmp     .LBB1_6
+.LBB1_7:
+       xorl    %r13d, %r13d
+  jmp .LBB1_8
+.LBB1_5:                                # %lpad
+                                        #   in Loop: Header=BB1_3 Depth=1
+.Ltmp5:
+       movq    %rax, %rdi
+       callq   __cxa_begin_catch@PLT
+       callq   __cxa_end_catch@PLT
+       jmp     .LBB1_4
+.LBB1_6:                                # %for.cond.cleanup
+       movl    %r14d, %eax
+       orl     $2, %eax
+       cmpl    $7, %eax
+       jne     .LBB1_9
+  jmp .LBB1_7
+.LBB1_8:                                # %cleanup21
+       movq    %r13, %rax
+       popq    %rbx
+       .cfi_def_cfa_offset 40
+       popq    %r12
+       .cfi_def_cfa_offset 32
+       popq    %r13
+       .cfi_def_cfa_offset 24
+       popq    %r14
+       .cfi_def_cfa_offset 16
+       popq    %r15
+       .cfi_def_cfa_offset 8
+       retq
+.LBB1_9:                                # %if.end8
+       .cfi_def_cfa_offset 48
+       leal    -103(%r14), %eax
+       cmpl    $101, %eax
+       jb      .LBB1_7
+# %bb.11:                               # %if.end12
+       cmpq    $0, (%r15)
+       je      .LBB1_7
+# %bb.12:                               # %if.end15
+       addl    $-13, %r14d
+       xorl    %eax, %eax
+       cmpl    $11, %r14d
+       cmovbq  %rax, %r13
+       jmp     .LBB1_8
+.Lfunc_end1:
+       .size   _Z10throw_testiPPc, .Lfunc_end1-_Z10throw_testiPPc
+       .cfi_endproc
+       .section        .gcc_except_table,"a",@progbits
+       .p2align        2
+GCC_except_table1:
+.Lexception1:
+       .byte   255                             # @LPStart Encoding = omit
+       .byte   155                             # @TType Encoding = indirect pcrel sdata4
+       .uleb128 .Lttbase0-.Lttbaseref0
+.Lttbaseref0:
+       .byte   1                               # Call site Encoding = uleb128
+       .uleb128 .Lcst_end1-.Lcst_begin1
+.Lcst_begin1:
+       .uleb128 .Ltmp3-.Lfunc_begin1           # >> Call Site 1 <<
+       .uleb128 .Ltmp4-.Ltmp3                  #   Call between .Ltmp3 and .Ltmp4
+       .uleb128 .Ltmp5-.Lfunc_begin1           #     jumps to .Ltmp5
+       .byte   1                               #   On action: 1
+       .uleb128 .Ltmp4-.Lfunc_begin1           # >> Call Site 2 <<
+       .uleb128 .Lfunc_end1-.Ltmp4             #   Call between .Ltmp4 and .Lfunc_end1
+       .byte   0                               #     has no landing pad
+       .byte   0                               #   On action: cleanup
+.Lcst_end1:
+       .byte   1                               # >> Action Record 1 <<
+                                        #   Catch TypeInfo 1
+       .byte   0                               #   No further actions
+       .p2align        2
+                                        # >> Catch TypeInfos <<
+       .long   0                               # TypeInfo 1
+.Lttbase0:
+       .p2align        2
+                                        # -- End function
+       .text
+       .globl  main                            # -- Begin function main
+       .p2align        4, 0x90
+       .type   main,@function
+main:                                   # @main
+       .cfi_startproc
+# %bb.0:                                # %entry
+       pushq   %rax
+       .cfi_def_cfa_offset 16
+       callq   _Z10throw_testiPPc@PLT
+       xorl    %ecx, %ecx
+       testq   %rax, %rax
+       sete    %cl
+       movl    %ecx, %eax
+       popq    %rcx
+       .cfi_def_cfa_offset 8
+       retq
+.Lfunc_end2:
+       .size   main, .Lfunc_end2-main
+       .cfi_endproc
+                                        # -- End function
+       .hidden DW.ref.__gxx_personality_v0
+       .weak   DW.ref.__gxx_personality_v0
+       .section        .data.DW.ref.__gxx_personality_v0,"aGw",@progbits,DW.ref.__gxx_personality_v0,comdat
+       .p2align        3
+       .type   DW.ref.__gxx_personality_v0,@object
+       .size   DW.ref.__gxx_personality_v0, 8
+DW.ref.__gxx_personality_v0:
+       .quad   __gxx_personality_v0
+       .ident  "clang version 15.0.0"
+       .section        ".note.GNU-stack","",@progbits
+       .addrsig
+       .addrsig_sym __gxx_personality_v0
+       .addrsig_sym _Unwind_Resume
+       .addrsig_sym _ZTIPi
diff --git a/bolt/test/runtime/X86/pie-exceptions-failed-split.test b/bolt/test/runtime/X86/pie-exceptions-failed-split.test
new file mode 100644 (file)
index 0000000..847647e
--- /dev/null
@@ -0,0 +1,36 @@
+REQUIRES: system-linux
+
+RUN: %clangxx %cxxflags -pie -fPIC %p/Inputs/pie-exceptions-failed-split.s \
+RUN:   -Wl,-q -o %t
+RUN: llvm-bolt %t -o %t.instr --instrument --instrumentation-file=%t.fdata
+
+## Record profile with invocation that does not throw exceptions.
+RUN: %t.instr
+
+RUN: llvm-bolt %t -o %t.bolt --data %t.fdata --reorder-blocks=ext-tsp \
+RUN:   --split-functions --split-eh --print-after-lowering \
+RUN:   --print-only=_Z10throw_testiPPc 2>&1 | FileCheck %s
+
+## Hot code in the test case gets larger after splitting because of jump
+## instruction relaxation. Check that BOLT reverts the split correctly.
+CHECK: Binary Function "_Z10throw_testiPPc"
+CHECK: IsSplit :
+CHECK-SAME: 0
+
+## Check that the landing pad trampoline was created, but contains no
+## instructions and falls to the real landing pad.
+CHECK: {{^[^[:space:]]+}} (0 instructions
+CHECK-NEXT: Landing Pad{{$}}
+CHECK: Exec Count
+CHECK-SAME: : 0
+CHECK: Successors:
+CHECK-SAME: [[LP:[^[:space:]]+]]
+CHECK-EMPTY:
+CHECK-NEXT: [[LP]]
+CHECK-DAG: Exec Count
+CHECK-NOT: Exec Count
+CHECK-DAG: callq   __cxa_begin_catch
+
+## Verify the output still executes correctly when the exception path is being
+## taken.
+RUN: %t.bolt arg1 arg2 arg3