[JITLink] Optimize GOTPCRELX Relocations
author    luxufan <932494295@qq.com>  Thu, 19 Aug 2021 02:13:40 +0000 (10:13 +0800)
committer luxufan <932494295@qq.com>  Thu, 19 Aug 2021 02:30:22 +0000 (10:30 +0800)
This patch optimizes GOTPCRELX relocations, as described in chapter B.2 of the
x86-64 psABI. Not all of the optimizations from that chapter are implemented:

1. Converting call and jmp has been implemented.
2. Converting mov has been implemented, but the further optimization of
   turning the `mov` memory operand into an immediate operand when the symbol
   is defined in the lower 32-bit address space has not.
3. Converting test and binop has not been implemented.

A new test file, ELF_x86-64_got_plt_optimizations.s, has been added, and some
test cases covering GOT/PLT optimizations have been moved from
ELF_x86-64_small_pic_relocations.s to the new file.

Following lld, the call and jmp conversion is not exactly what the psABI
describes; the deviation is explained in a comment in the code and illustrated
in the sketch below.
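
For reference, a sketch of the byte-level rewrites this patch performs (opcode
bytes as they appear in the code below; the addr32 form for the call rewrite
follows lld, as noted above):

    # mov:  "mov  foo@GOTPCREL(%rip), %reg"  ->  "lea foo(%rip), %reg"
    #        [REX] 8b ModRM <disp32>             [REX] 8d ModRM <disp32>
    # call: "call *foo@GOTPCREL(%rip)"       ->  "addr32 call foo"
    #        ff 15 <disp32>                      67 e8 <imm32>
    # jmp:  "jmp  *foo@GOTPCREL(%rip)"       ->  "jmp foo ; nop"
    #        ff 25 <disp32>                      e9 <imm32> 90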

Reviewed By: lhames

Differential Revision: https://reviews.llvm.org/D108280

llvm/include/llvm/ExecutionEngine/JITLink/ELF_x86_64.h
llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp
llvm/lib/ExecutionEngine/JITLink/x86_64.cpp
llvm/test/ExecutionEngine/JITLink/X86/ELF_x86-64_common.s
llvm/test/ExecutionEngine/JITLink/X86/ELF_x86-64_got_plt_optimizations.s [new file with mode: 0644]
llvm/test/ExecutionEngine/JITLink/X86/ELF_x86-64_small_pic_relocations.s

diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/ELF_x86_64.h b/llvm/include/llvm/ExecutionEngine/JITLink/ELF_x86_64.h
index dbd8866..36e346c 100644
@@ -24,7 +24,8 @@ enum ELFX86RelocationKind : Edge::Kind {
   Pointer64,
   PCRel32,
   PCRel32GOTLoad,
-  PCRel32REXGOTLoad,
+  PCRel32GOTLoadRelaxable,
+  PCRel32REXGOTLoadRelaxable,
   PCRel64GOT,
   GOTOFF64,
   GOT64,
diff --git a/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp b/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp
index ec0077c..736bcad 100644
@@ -184,10 +184,11 @@ private:
     case ELF::R_X86_64_64:
       return ELF_x86_64_Edges::ELFX86RelocationKind::Pointer64;
     case ELF::R_X86_64_GOTPCREL:
-    case ELF::R_X86_64_GOTPCRELX:
       return ELF_x86_64_Edges::ELFX86RelocationKind::PCRel32GOTLoad;
+    case ELF::R_X86_64_GOTPCRELX:
+      return ELF_x86_64_Edges::ELFX86RelocationKind::PCRel32GOTLoadRelaxable;
     case ELF::R_X86_64_REX_GOTPCRELX:
-      return ELF_x86_64_Edges::ELFX86RelocationKind::PCRel32REXGOTLoad;
+      return ELF_x86_64_Edges::ELFX86RelocationKind::PCRel32REXGOTLoadRelaxable;
     case ELF::R_X86_64_GOTPCREL64:
       return ELF_x86_64_Edges::ELFX86RelocationKind::PCRel64GOT;
     case ELF::R_X86_64_GOT64:
@@ -301,15 +302,19 @@ private:
           Kind = x86_64::Pointer64;
           break;
         case PCRel32GOTLoad: {
-          Kind = x86_64::RequestGOTAndTransformToPCRel32GOTLoadRelaxable;
-          Addend = 0;
+          Kind = x86_64::RequestGOTAndTransformToDelta32;
           break;
         }
-        case PCRel32REXGOTLoad: {
+        case PCRel32REXGOTLoadRelaxable: {
           Kind = x86_64::RequestGOTAndTransformToPCRel32GOTLoadREXRelaxable;
           Addend = 0;
           break;
         }
+        case PCRel32GOTLoadRelaxable: {
+          Kind = x86_64::RequestGOTAndTransformToPCRel32GOTLoadRelaxable;
+          Addend = 0;
+          break;
+        }
         case PCRel64GOT: {
           Kind = x86_64::RequestGOTAndTransformToDelta64;
           break;
@@ -498,7 +503,9 @@ const char *getELFX86RelocationKindName(Edge::Kind R) {
     return "PCRel32";
   case PCRel32GOTLoad:
     return "PCRel32GOTLoad";
-  case PCRel32REXGOTLoad:
+  case PCRel32GOTLoadRelaxable:
+    return "PCRel32GOTLoadRelaxable";
+  case PCRel32REXGOTLoadRelaxable:
     return "PCRel32REXGOTLoad";
   case PCRel64GOT:
     return "PCRel64GOT";
diff --git a/llvm/lib/ExecutionEngine/JITLink/x86_64.cpp b/llvm/lib/ExecutionEngine/JITLink/x86_64.cpp
index 354442c..a95b34b 100644
@@ -73,42 +73,79 @@ Error optimize_x86_64_GOTAndStubs(LinkGraph &G) {
   LLVM_DEBUG(dbgs() << "Optimizing GOT entries and stubs:\n");
 
   for (auto *B : G.blocks())
-    for (auto &E : B->edges())
-      if (E.getKind() == x86_64::PCRel32GOTLoadREXRelaxable) {
-        // Replace GOT load with LEA only for MOVQ instructions.
-        assert(E.getOffset() >= 3 && "GOT edge occurs too early in block");
-
-        constexpr uint8_t MOVQRIPRel[] = {0x48, 0x8b};
-        if (strncmp(B->getContent().data() + E.getOffset() - 3,
-                    reinterpret_cast<const char *>(MOVQRIPRel), 2) != 0)
-          continue;
-
-        auto &GOTBlock = E.getTarget().getBlock();
-        assert(GOTBlock.getSize() == G.getPointerSize() &&
+    for (auto &E : B->edges()) {
+      if (E.getKind() == x86_64::PCRel32GOTLoadRelaxable ||
+          E.getKind() == x86_64::PCRel32GOTLoadREXRelaxable) {
+        bool REXPrefix = E.getKind() == x86_64::PCRel32GOTLoadREXRelaxable;
+        assert(E.getOffset() >= (REXPrefix ? 3 : 2) &&
+               "GOT edge occurs too early in block");
+        auto *FixupData = reinterpret_cast<uint8_t *>(
+                              const_cast<char *>(B->getContent().data())) +
+                          E.getOffset();
+        const uint8_t Op = FixupData[-2];
+        const uint8_t ModRM = FixupData[-1];
+
+        auto &GOTEntryBlock = E.getTarget().getBlock();
+        assert(GOTEntryBlock.getSize() == G.getPointerSize() &&
                "GOT entry block should be pointer sized");
-        assert(GOTBlock.edges_size() == 1 &&
+        assert(GOTEntryBlock.edges_size() == 1 &&
                "GOT entry should only have one outgoing edge");
-
-        auto &GOTTarget = GOTBlock.edges().begin()->getTarget();
-        JITTargetAddress EdgeAddr = B->getAddress() + E.getOffset();
+        auto &GOTTarget = GOTEntryBlock.edges().begin()->getTarget();
         JITTargetAddress TargetAddr = GOTTarget.getAddress();
-
+        JITTargetAddress EdgeAddr = B->getFixupAddress(E);
         int64_t Displacement = TargetAddr - EdgeAddr + 4;
-        if (isInRangeForImmS32(Displacement)) {
-          // Change the edge kind as we don't go through GOT anymore. This is
-          // for formal correctness only. Technically, the two relocation kinds
-          // are resolved the same way.
+        bool TargetInRangeForImmU32 = isInRangeForImmU32(TargetAddr);
+        bool DisplacementInRangeForImmS32 = isInRangeForImmS32(Displacement);
+
+        // If both the target address and the displacement are out of range,
+        // then there is no optimization opportunity.
+        if (!(TargetInRangeForImmU32 || DisplacementInRangeForImmS32))
+          continue;
+
+        // Transform "mov foo@GOTPCREL(%rip),%reg" to "lea foo(%rip),%reg".
+        if (Op == 0x8b && DisplacementInRangeForImmS32) {
+          FixupData[-2] = 0x8d;
           E.setKind(x86_64::Delta32);
           E.setTarget(GOTTarget);
           E.setAddend(E.getAddend() - 4);
-          auto *BlockData = reinterpret_cast<uint8_t *>(
-              const_cast<char *>(B->getContent().data()));
-          BlockData[E.getOffset() - 2] = 0x8d;
           LLVM_DEBUG({
             dbgs() << "  Replaced GOT load wih LEA:\n    ";
             printEdge(dbgs(), *B, E, getEdgeKindName(E.getKind()));
             dbgs() << "\n";
           });
+          continue;
+        }
+
+        // Transform call/jmp instructions
+        if (Op == 0xff && TargetInRangeForImmU32) {
+          if (ModRM == 0x15) {
+            // The ABI says we can convert "call *foo@GOTPCREL(%rip)" to
+            // "nop; call foo", but lld converts it to "addr32 call foo"
+            // because that keeps the result a single instruction.
+            FixupData[-2] = 0x67;
+            FixupData[-1] = 0xe8;
+            LLVM_DEBUG({
+              dbgs() << "  replaced call instruction's memory operand wih imm "
+                        "operand:\n    ";
+              printEdge(dbgs(), *B, E, getEdgeKindName(E.getKind()));
+              dbgs() << "\n";
+            });
+          } else {
+            // Transform "jmp *foo@GOTPCREL(%rip)" to "jmp foo; nop"
+            assert(ModRM == 0x25 && "Invalid ModRM for call/jmp instructions");
+            FixupData[-2] = 0xe9;
+            FixupData[3] = 0x90;
+            E.setOffset(E.getOffset() - 1);
+            LLVM_DEBUG({
+              dbgs() << "  replaced jmp instruction's memory operand wih imm "
+                        "operand:\n    ";
+              printEdge(dbgs(), *B, E, getEdgeKindName(E.getKind()));
+              dbgs() << "\n";
+            });
+          }
+          E.setKind(x86_64::Pointer32);
+          E.setTarget(GOTTarget);
+          continue;
         }
       } else if (E.getKind() == x86_64::BranchPCRel32ToPtrJumpStubBypassable) {
         auto &StubBlock = E.getTarget().getBlock();
@@ -138,6 +175,7 @@ Error optimize_x86_64_GOTAndStubs(LinkGraph &G) {
           });
         }
       }
+    }
 
   return Error::success();
 }
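
The relaxation decisions above hinge on two range predicates from
llvm/include/llvm/ExecutionEngine/JITLink/x86_64.h. A minimal sketch of their
intended semantics, assuming the usual two's-complement limits (an
illustration, not necessarily the verbatim upstream bodies):

    // Sketch of the x86_64.h helpers used in optimize_x86_64_GOTAndStubs.
    #include <cstdint>
    #include <limits>

    // Fits in a signed 32-bit immediate, e.g. a rel32 displacement. Decides
    // whether the mov -> lea rewrite can reach the target RIP-relatively.
    inline bool isInRangeForImmS32(int64_t Value) {
      return Value >= std::numeric_limits<int32_t>::min() &&
             Value <= std::numeric_limits<int32_t>::max();
    }

    // Fits in an unsigned 32-bit immediate, i.e. an absolute address in the
    // low 4GiB. Decides whether call/jmp can take the target directly.
    inline bool isInRangeForImmU32(uint64_t Value) {
      return Value <= std::numeric_limits<uint32_t>::max();
    }
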
diff --git a/llvm/test/ExecutionEngine/JITLink/X86/ELF_x86-64_common.s b/llvm/test/ExecutionEngine/JITLink/X86/ELF_x86-64_common.s
index 33be9d5..de464c7 100644
@@ -1,5 +1,5 @@
 # RUN: rm -rf %t && mkdir -p %t
-# RUN: llvm-mc -triple=x86_64-unknown-linux -position-independent -filetype=obj -o %t/elf_common.o %s
+# RUN: llvm-mc -triple=x86_64-unknown-linux -relax-relocations=false -position-independent -filetype=obj -o %t/elf_common.o %s
 # RUN: llvm-jitlink -entry=load_common -noexec -check %s %t/elf_common.o
 
         .text
diff --git a/llvm/test/ExecutionEngine/JITLink/X86/ELF_x86-64_got_plt_optimizations.s b/llvm/test/ExecutionEngine/JITLink/X86/ELF_x86-64_got_plt_optimizations.s
new file mode 100644
index 0000000..d74c56a
--- /dev/null
@@ -0,0 +1,63 @@
+# RUN: rm -rf %t && mkdir -p %t
+# RUN: llvm-mc -triple=x86_64-unknown-linux -position-independent -filetype=obj \
+# RUN:         -o %t/elf_sm_pic_reloc.o %s
+# RUN: llvm-jitlink -noexec -slab-allocate 100Kb -slab-address 0xfff00000 \
+# RUN:              -define-abs extern_in_range32=0xffe00000 \
+# RUN:              -check %s %t/elf_sm_pic_reloc.o
+#
+
+
+        .text
+        .file   "testcase.c"
+
+# Empty main entry point.
+        .globl  main
+        .p2align        4, 0x90
+        .type   main,@function
+main:
+        retq
+
+        .size   main, .-main
+
+# Test optimization of transforming "call *foo@GOTPCREL(%rip)" to "addr call foo"
+# We need check both the target address and the instruction opcodes
+# jitlink-check: decode_operand(test_call_gotpcrelx, 0)[31:0] = extern_in_range32
+# jitlink-check: *{1}test_call_gotpcrelx = 0x67
+# jitlink-check: *{1}test_call_gotpcrelx+1 = 0xe8
+        .globl test_call_gotpcrelx
+        .p2align      4, 0x90
+        .type   test_call_gotpcrelx,@function
+test_call_gotpcrelx:
+       call    *extern_in_range32@GOTPCREL(%rip)
+
+        .size   test_call_gotpcrelx, .-test_call_gotpcrelx
+
+
+# Test optimization of transforming "jmp *foo@GOTPCREL(%rip)" to "jmp foo ; nop"
+# We need check both the target address and the instruction opcodes
+# jitlink-check: decode_operand(test_call_gotpcrelx, 0)[31:0] = extern_in_range32
+# jitlink-check: *{1}test_jmp_gotpcrelx = 0xe9
+# jitlink-check: *{1}test_jmp_gotpcrelx+5 = 0x90
+        .globl test_jmp_gotpcrelx
+        .p2align      4, 0x90
+        .type   test_jmp_gotpcrelx,@function
+test_jmp_gotpcrelx:
+       jmp    *extern_in_range32@GOTPCREL(%rip)
+
+        .size   test_jmp_gotpcrelx, .-test_jmp_gotpcrelx
+
+# Check R_X86_64_PLT32 handling with a call to an external. This produces a
+# Branch32ToStub edge, because externals are not defined locally. During
+# resolution, the target turns out to be in-range from the callsite and so the
+# edge is relaxed in post-allocation optimization.
+#
+# jitlink-check: decode_operand(test_call_extern, 0) = \
+# jitlink-check:     extern_in_range32 - next_pc(test_call_extern)
+        .globl  test_call_extern
+        .p2align       4, 0x90
+        .type   test_call_extern,@function
+test_call_extern:
+        callq   extern_in_range32@plt
+
+        .size   test_call_extern, .-test_call_extern
+
diff --git a/llvm/test/ExecutionEngine/JITLink/X86/ELF_x86-64_small_pic_relocations.s b/llvm/test/ExecutionEngine/JITLink/X86/ELF_x86-64_small_pic_relocations.s
index 147f2b4..b195693 100644
@@ -3,7 +3,6 @@
 # RUN:         -o %t/elf_sm_pic_reloc.o %s
 # RUN: llvm-jitlink -noexec -slab-allocate 100Kb -slab-address 0xfff00000 \
 # RUN:              -define-abs external_data=0x1 \
-# RUN:              -define-abs extern_in_range32=0xffe00000 \
 # RUN:              -define-abs extern_out_of_range32=0x7fff00000000 \
 # RUN:              -check %s %t/elf_sm_pic_reloc.o
 #
@@ -51,21 +50,6 @@ test_call_local:
 
         .size   test_call_local, .-test_call_local
 
-# Check R_X86_64_PLT32 handling with a call to an external. This produces a
-# Branch32ToStub edge, because externals are not defined locally. During
-# resolution, the target turns out to be in-range from the callsite and so the
-# edge is relaxed in post-allocation optimization.
-#
-# jitlink-check: decode_operand(test_call_extern, 0) = \
-# jitlink-check:     extern_in_range32 - next_pc(test_call_extern)
-        .globl  test_call_extern
-        .p2align       4, 0x90
-        .type   test_call_extern,@function
-test_call_extern:
-        callq   extern_in_range32@plt
-
-        .size   test_call_extern, .-test_call_extern
-
 # Check R_X86_64_PLT32 handling with a call to an external via PLT. This
 # produces a Branch32ToStub edge, because externals are not defined locally.
 # As the target is out-of-range from the callsite, the edge keeps using its PLT
@@ -85,7 +69,9 @@ test_call_extern_plt:
         .size   test_call_extern_plt, .-test_call_extern_plt
 
 # Test GOTPCREL handling. We want to check both the offset to the GOT entry and its
-# contents.
+# contents. "movl" will be optimized to "leal" and a non-got access if the pc relative
+# offset to named_data is in range of 32 bits signed immediate. So use "leal" here to
+# suppress optimization
 # jitlink-check: decode_operand(test_gotpcrel, 4) = \
 # jitlink-check:     got_addr(elf_sm_pic_reloc.o, named_data) - next_pc(test_gotpcrel)
 # jitlink-check: *{8}(got_addr(elf_sm_pic_reloc.o, named_data)) = named_data
@@ -94,7 +80,7 @@ test_call_extern_plt:
         .p2align      4, 0x90
         .type   test_gotpcrel,@function
 test_gotpcrel:
-       movl    named_data@GOTPCREL(%rip), %eax
+       leal    named_data@GOTPCREL(%rip), %eax
 
         .size   test_gotpcrel, .-test_gotpcrel