[JITLink] Optimize GOTPCRELX Relocations
author    luxufan <932494295@qq.com>  Thu, 19 Aug 2021 02:13:40 +0000 (10:13 +0800)
committer luxufan <932494295@qq.com>  Thu, 19 Aug 2021 02:30:22 +0000 (10:30 +0800)
This patch optimizes GOTPCRELX relocations, as described in chapter B.2 of the
x86-64 psABI. Not all of the optimizations from that chapter are implemented:

1. Converting call and jmp has been implemented.
2. Converting mov has been implemented, but the further optimization of
   turning the `mov` memory operand into an immediate operand when the symbol
   is defined in the lower 32-bit address space has not.
3. Converting test and binop has not been implemented.

A new test file, ELF_x86-64_got_plt_optimizations.s, has been added, and some
test cases covering GOT/PLT optimizations have been moved from
ELF_x86-64_small_pic_relocations.s to the new file.

Following lld, the call and jmp conversion is not exactly what the psABI
describes; the deviation is explained in a comment in the code and illustrated
in the sketch below.
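
For reference, a sketch of the byte-level rewrites this patch performs (opcode
bytes as they appear in the code below; the addr32 form for the call rewrite
follows lld, as noted above):

    # mov:  "mov  foo@GOTPCREL(%rip), %reg"  ->  "lea foo(%rip), %reg"
    #        [REX] 8b ModRM <disp32>             [REX] 8d ModRM <disp32>
    # call: "call *foo@GOTPCREL(%rip)"       ->  "addr32 call foo"
    #        ff 15 <disp32>                      67 e8 <imm32>
    # jmp:  "jmp  *foo@GOTPCREL(%rip)"       ->  "jmp foo ; nop"
    #        ff 25 <disp32>                      e9 <imm32> 90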

Reviewed By: lhames

Differential Revision: https://reviews.llvm.org/D108280

llvm/include/llvm/ExecutionEngine/JITLink/ELF_x86_64.h
llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp
llvm/lib/ExecutionEngine/JITLink/x86_64.cpp
llvm/test/ExecutionEngine/JITLink/X86/ELF_x86-64_common.s
llvm/test/ExecutionEngine/JITLink/X86/ELF_x86-64_got_plt_optimizations.s [new file with mode: 0644]
llvm/test/ExecutionEngine/JITLink/X86/ELF_x86-64_small_pic_relocations.s

diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/ELF_x86_64.h b/llvm/include/llvm/ExecutionEngine/JITLink/ELF_x86_64.h
index dbd8866..36e346c 100644
@@ -24,7 +24,8 @@ enum ELFX86RelocationKind : Edge::Kind {
   Pointer64,
   PCRel32,
   PCRel32GOTLoad,
-  PCRel32REXGOTLoad,
+  PCRel32GOTLoadRelaxable,
+  PCRel32REXGOTLoadRelaxable,
   PCRel64GOT,
   GOTOFF64,
   GOT64,
diff --git a/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp b/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp
index ec0077c..736bcad 100644
@@ -184,10 +184,11 @@ private:
     case ELF::R_X86_64_64:
       return ELF_x86_64_Edges::ELFX86RelocationKind::Pointer64;
     case ELF::R_X86_64_GOTPCREL:
-    case ELF::R_X86_64_GOTPCRELX:
       return ELF_x86_64_Edges::ELFX86RelocationKind::PCRel32GOTLoad;
+    case ELF::R_X86_64_GOTPCRELX:
+      return ELF_x86_64_Edges::ELFX86RelocationKind::PCRel32GOTLoadRelaxable;
     case ELF::R_X86_64_REX_GOTPCRELX:
-      return ELF_x86_64_Edges::ELFX86RelocationKind::PCRel32REXGOTLoad;
+      return ELF_x86_64_Edges::ELFX86RelocationKind::PCRel32REXGOTLoadRelaxable;
     case ELF::R_X86_64_GOTPCREL64:
       return ELF_x86_64_Edges::ELFX86RelocationKind::PCRel64GOT;
     case ELF::R_X86_64_GOT64:
@@ -301,15 +302,19 @@ private:
           Kind = x86_64::Pointer64;
           break;
         case PCRel32GOTLoad: {
-          Kind = x86_64::RequestGOTAndTransformToPCRel32GOTLoadRelaxable;
-          Addend = 0;
+          Kind = x86_64::RequestGOTAndTransformToDelta32;
           break;
         }
-        case PCRel32REXGOTLoad: {
+        case PCRel32REXGOTLoadRelaxable: {
           Kind = x86_64::RequestGOTAndTransformToPCRel32GOTLoadREXRelaxable;
           Addend = 0;
           break;
         }
+        case PCRel32GOTLoadRelaxable: {
+          Kind = x86_64::RequestGOTAndTransformToPCRel32GOTLoadRelaxable;
+          Addend = 0;
+          break;
+        }
         case PCRel64GOT: {
           Kind = x86_64::RequestGOTAndTransformToDelta64;
           break;
@@ -498,7 +503,9 @@ const char *getELFX86RelocationKindName(Edge::Kind R) {
     return "PCRel32";
   case PCRel32GOTLoad:
     return "PCRel32GOTLoad";
-  case PCRel32REXGOTLoad:
+  case PCRel32GOTLoadRelaxable:
+    return "PCRel32GOTLoadRelaxable";
+  case PCRel32REXGOTLoadRelaxable:
     return "PCRel32REXGOTLoad";
   case PCRel64GOT:
     return "PCRel64GOT";
diff --git a/llvm/lib/ExecutionEngine/JITLink/x86_64.cpp b/llvm/lib/ExecutionEngine/JITLink/x86_64.cpp
index 354442c..a95b34b 100644
@@ -73,42 +73,79 @@ Error optimize_x86_64_GOTAndStubs(LinkGraph &G) {
   LLVM_DEBUG(dbgs() << "Optimizing GOT entries and stubs:\n");
 
   for (auto *B : G.blocks())
-    for (auto &E : B->edges())
-      if (E.getKind() == x86_64::PCRel32GOTLoadREXRelaxable) {
-        // Replace GOT load with LEA only for MOVQ instructions.
-        assert(E.getOffset() >= 3 && "GOT edge occurs too early in block");
-
-        constexpr uint8_t MOVQRIPRel[] = {0x48, 0x8b};
-        if (strncmp(B->getContent().data() + E.getOffset() - 3,
-                    reinterpret_cast<const char *>(MOVQRIPRel), 2) != 0)
-          continue;
-
-        auto &GOTBlock = E.getTarget().getBlock();
-        assert(GOTBlock.getSize() == G.getPointerSize() &&
+    for (auto &E : B->edges()) {
+      if (E.getKind() == x86_64::PCRel32GOTLoadRelaxable ||
+          E.getKind() == x86_64::PCRel32GOTLoadREXRelaxable) {
+        bool REXPrefix = E.getKind() == x86_64::PCRel32GOTLoadREXRelaxable;
+        assert(E.getOffset() >= (REXPrefix ? 3 : 2) &&
+               "GOT edge occurs too early in block");
+        auto *FixupData = reinterpret_cast<uint8_t *>(
+                              const_cast<char *>(B->getContent().data())) +
+                          E.getOffset();
+        const uint8_t Op = FixupData[-2];
+        const uint8_t ModRM = FixupData[-1];
+
+        auto &GOTEntryBlock = E.getTarget().getBlock();
+        assert(GOTEntryBlock.getSize() == G.getPointerSize() &&
                "GOT entry block should be pointer sized");
-        assert(GOTBlock.edges_size() == 1 &&
+        assert(GOTEntryBlock.edges_size() == 1 &&
                "GOT entry should only have one outgoing edge");
-
-        auto &GOTTarget = GOTBlock.edges().begin()->getTarget();
-        JITTargetAddress EdgeAddr = B->getAddress() + E.getOffset();
+        auto &GOTTarget = GOTEntryBlock.edges().begin()->getTarget();
         JITTargetAddress TargetAddr = GOTTarget.getAddress();
-
+        JITTargetAddress EdgeAddr = B->getFixupAddress(E);
         int64_t Displacement = TargetAddr - EdgeAddr + 4;
-        if (isInRangeForImmS32(Displacement)) {
-          // Change the edge kind as we don't go through GOT anymore. This is
-          // for formal correctness only. Technically, the two relocation kinds
-          // are resolved the same way.
+        bool TargetInRangeForImmU32 = isInRangeForImmU32(TargetAddr);
+        bool DisplacementInRangeForImmS32 = isInRangeForImmS32(Displacement);
+
+        // If both the target address and the displacement are out of range,
+        // then there is no optimization opportunity.
+        if (!(TargetInRangeForImmU32 || DisplacementInRangeForImmS32))
+          continue;
+
+        // Transform "mov foo@GOTPCREL(%rip),%reg" to "lea foo(%rip),%reg".
+        if (Op == 0x8b && DisplacementInRangeForImmS32) {
+          FixupData[-2] = 0x8d;
           E.setKind(x86_64::Delta32);
           E.setTarget(GOTTarget);
           E.setAddend(E.getAddend() - 4);
-          auto *BlockData = reinterpret_cast<uint8_t *>(
-              const_cast<char *>(B->getContent().data()));
-          BlockData[E.getOffset() - 2] = 0x8d;
           LLVM_DEBUG({
             dbgs() << "  Replaced GOT load wih LEA:\n    ";
             printEdge(dbgs(), *B, E, getEdgeKindName(E.getKind()));
             dbgs() << "\n";
           });
+          continue;
+        }
+
+        // Transform call/jmp instructions
+        if (Op == 0xff && TargetInRangeForImmU32) {
+          if (ModRM == 0x15) {
+            // The ABI says we can convert "call *foo@GOTPCREL(%rip)" to
+            // "nop; call foo", but lld converts it to "addr32 call foo"
+            // because that keeps the result a single instruction.
+            FixupData[-2] = 0x67;
+            FixupData[-1] = 0xe8;
+            LLVM_DEBUG({
+              dbgs() << "  replaced call instruction's memory operand wih imm "
+                        "operand:\n    ";
+              printEdge(dbgs(), *B, E, getEdgeKindName(E.getKind()));
+              dbgs() << "\n";
+            });
+          } else {
+            // Transform "jmp *foo@GOTPCREL(%rip)" to "jmp foo; nop"
+            assert(ModRM == 0x25 && "Invalid ModRM for call/jmp instructions");
+            FixupData[-2] = 0xe9;
+            FixupData[3] = 0x90;
+            E.setOffset(E.getOffset() - 1);
+            LLVM_DEBUG({
+              dbgs() << "  replaced jmp instruction's memory operand wih imm "
+                        "operand:\n    ";
+              printEdge(dbgs(), *B, E, getEdgeKindName(E.getKind()));
+              dbgs() << "\n";
+            });
+          }
+          E.setKind(x86_64::Pointer32);
+          E.setTarget(GOTTarget);
+          continue;
         }
       } else if (E.getKind() == x86_64::BranchPCRel32ToPtrJumpStubBypassable) {
         auto &StubBlock = E.getTarget().getBlock();
@@ -138,6 +175,7 @@ Error optimize_x86_64_GOTAndStubs(LinkGraph &G) {
           });
         }
       }
+    }
 
   return Error::success();
 }
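
The relaxation decisions above hinge on two range predicates from
llvm/include/llvm/ExecutionEngine/JITLink/x86_64.h. A minimal sketch of their
intended semantics, assuming the usual two's-complement limits (an
illustration, not necessarily the verbatim upstream bodies):

    // Sketch of the x86_64.h helpers used in optimize_x86_64_GOTAndStubs.
    #include <cstdint>
    #include <limits>

    // Fits in a signed 32-bit immediate, e.g. a rel32 displacement. Decides
    // whether the mov -> lea rewrite can reach the target RIP-relatively.
    inline bool isInRangeForImmS32(int64_t Value) {
      return Value >= std::numeric_limits<int32_t>::min() &&
             Value <= std::numeric_limits<int32_t>::max();
    }

    // Fits in an unsigned 32-bit immediate, i.e. an absolute address in the
    // low 4GiB. Decides whether call/jmp can take the target directly.
    inline bool isInRangeForImmU32(uint64_t Value) {
      return Value <= std::numeric_limits<uint32_t>::max();
    }
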
diff --git a/llvm/test/ExecutionEngine/JITLink/X86/ELF_x86-64_common.s b/llvm/test/ExecutionEngine/JITLink/X86/ELF_x86-64_common.s
index 33be9d5..de464c7 100644
@@ -1,5 +1,5 @@
 # RUN: rm -rf %t && mkdir -p %t
-# RUN: llvm-mc -triple=x86_64-unknown-linux -position-independent -filetype=obj -o %t/elf_common.o %s
+# RUN: llvm-mc -triple=x86_64-unknown-linux -relax-relocations=false -position-independent -filetype=obj -o %t/elf_common.o %s
 # RUN: llvm-jitlink -entry=load_common -noexec -check %s %t/elf_common.o
 
         .text
diff --git a/llvm/test/ExecutionEngine/JITLink/X86/ELF_x86-64_got_plt_optimizations.s b/llvm/test/ExecutionEngine/JITLink/X86/ELF_x86-64_got_plt_optimizations.s
new file mode 100644
index 0000000..d74c56a
--- /dev/null
@@ -0,0 +1,63 @@
+# RUN: rm -rf %t && mkdir -p %t
+# RUN: llvm-mc -triple=x86_64-unknown-linux -position-independent -filetype=obj \
+# RUN:         -o %t/elf_sm_pic_reloc.o %s
+# RUN: llvm-jitlink -noexec -slab-allocate 100Kb -slab-address 0xfff00000 \
+# RUN:              -define-abs extern_in_range32=0xffe00000 \
+# RUN:              -check %s %t/elf_sm_pic_reloc.o
+#
+
+
+        .text
+        .file   "testcase.c"
+
+# Empty main entry point.
+        .globl  main
+        .p2align        4, 0x90
+        .type   main,@function
+main:
+        retq
+
+        .size   main, .-main
+
+# Test optimization of transforming "call *foo@GOTPCREL(%rip)" to "addr call foo"
+# We need check both the target address and the instruction opcodes
+# jitlink-check: decode_operand(test_call_gotpcrelx, 0)[31:0] = extern_in_range32
+# jitlink-check: *{1}test_call_gotpcrelx = 0x67
+# jitlink-check: *{1}test_call_gotpcrelx+1 = 0xe8
+        .globl test_call_gotpcrelx
+        .p2align      4, 0x90
+        .type   test_call_gotpcrelx,@function
+test_call_gotpcrelx:
+       call    *extern_in_range32@GOTPCREL(%rip)
+
+        .size   test_call_gotpcrelx, .-test_call_gotpcrelx
+
+
+# Test optimization of transforming "jmp *foo@GOTPCREL(%rip)" to "jmp foo ; nop"
+# We need check both the target address and the instruction opcodes
+# jitlink-check: decode_operand(test_call_gotpcrelx, 0)[31:0] = extern_in_range32
+# jitlink-check: *{1}test_jmp_gotpcrelx = 0xe9
+# jitlink-check: *{1}test_jmp_gotpcrelx+5 = 0x90
+        .globl test_jmp_gotpcrelx
+        .p2align      4, 0x90
+        .type   test_jmp_gotpcrelx,@function
+test_jmp_gotpcrelx:
+       jmp    *extern_in_range32@GOTPCREL(%rip)
+
+        .size   test_jmp_gotpcrelx, .-test_jmp_gotpcrelx
+
+# Check R_X86_64_PLT32 handling with a call to an external. This produces a
+# Branch32ToStub edge, because externals are not defined locally. During
+# resolution, the target turns out to be in-range from the callsite and so the
+# edge is relaxed in post-allocation optimization.
+#
+# jitlink-check: decode_operand(test_call_extern, 0) = \
+# jitlink-check:     extern_in_range32 - next_pc(test_call_extern)
+        .globl  test_call_extern
+        .p2align       4, 0x90
+        .type   test_call_extern,@function
+test_call_extern:
+        callq   extern_in_range32@plt
+
+        .size   test_call_extern, .-test_call_extern
+
diff --git a/llvm/test/ExecutionEngine/JITLink/X86/ELF_x86-64_small_pic_relocations.s b/llvm/test/ExecutionEngine/JITLink/X86/ELF_x86-64_small_pic_relocations.s
index 147f2b4..b195693 100644
@@ -3,7 +3,6 @@
 # RUN:         -o %t/elf_sm_pic_reloc.o %s
 # RUN: llvm-jitlink -noexec -slab-allocate 100Kb -slab-address 0xfff00000 \
 # RUN:              -define-abs external_data=0x1 \
-# RUN:              -define-abs extern_in_range32=0xffe00000 \
 # RUN:              -define-abs extern_out_of_range32=0x7fff00000000 \
 # RUN:              -check %s %t/elf_sm_pic_reloc.o
 #
@@ -51,21 +50,6 @@ test_call_local:
 
         .size   test_call_local, .-test_call_local
 
-# Check R_X86_64_PLT32 handling with a call to an external. This produces a
-# Branch32ToStub edge, because externals are not defined locally. During
-# resolution, the target turns out to be in-range from the callsite and so the
-# edge is relaxed in post-allocation optimization.
-#
-# jitlink-check: decode_operand(test_call_extern, 0) = \
-# jitlink-check:     extern_in_range32 - next_pc(test_call_extern)
-        .globl  test_call_extern
-        .p2align       4, 0x90
-        .type   test_call_extern,@function
-test_call_extern:
-        callq   extern_in_range32@plt
-
-        .size   test_call_extern, .-test_call_extern
-
 # Check R_X86_64_PLT32 handling with a call to an external via PLT. This
 # produces a Branch32ToStub edge, because externals are not defined locally.
 # As the target is out-of-range from the callsite, the edge keeps using its PLT
@@ -85,7 +69,9 @@ test_call_extern_plt:
         .size   test_call_extern_plt, .-test_call_extern_plt
 
 # Test GOTPCREL handling. We want to check both the offset to the GOT entry and its
-# contents.
+# contents. "movl" will be optimized to "leal" and a non-got access if the pc relative
+# offset to named_data is in range of 32 bits signed immediate. So use "leal" here to
+# suppress optimization
 # jitlink-check: decode_operand(test_gotpcrel, 4) = \
 # jitlink-check:     got_addr(elf_sm_pic_reloc.o, named_data) - next_pc(test_gotpcrel)
 # jitlink-check: *{8}(got_addr(elf_sm_pic_reloc.o, named_data)) = named_data
@@ -94,7 +80,7 @@ test_call_extern_plt:
         .p2align      4, 0x90
         .type   test_gotpcrel,@function
 test_gotpcrel:
-       movl    named_data@GOTPCREL(%rip), %eax
+       leal    named_data@GOTPCREL(%rip), %eax
 
         .size   test_gotpcrel, .-test_gotpcrel