[ORC] Add lazy jit support for LoongArch64
author wanglei <wanglei@loongson.cn>
Sat, 21 Jan 2023 05:31:33 +0000 (13:31 +0800)
committer wanglei <wanglei@loongson.cn>
Sat, 21 Jan 2023 09:49:36 +0000 (17:49 +0800)
This patch adds resolver, indirection and trampoline stubs for
loongarch64, allowing lazy compilation to work.

It assumes that the hard-float feature is available.

Depends on D141036

Reviewed By: lhames

Differential Revision: https://reviews.llvm.org/D141102

llvm/include/llvm/ExecutionEngine/Orc/OrcABISupport.h
llvm/lib/ExecutionEngine/Orc/EPCIndirectionUtils.cpp
llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp
llvm/lib/ExecutionEngine/Orc/LazyReexports.cpp
llvm/lib/ExecutionEngine/Orc/OrcABISupport.cpp

index c5c2780..3048547 100644 (file)
@@ -369,6 +369,46 @@ public:
       JITTargetAddress PointersBlockTargetAddress, unsigned NumStubs);
 };
 
+// @brief loongarch64 support.
+//
+// LoongArch 64 supports lazy JITing.
+class OrcLoongArch64 {
+public:
+  static constexpr unsigned PointerSize = 8;
+  static constexpr unsigned TrampolineSize = 16;
+  static constexpr unsigned StubSize = 16;
+  static constexpr unsigned StubToPointerMaxDisplacement = 1 << 31;
+  static constexpr unsigned ResolverCodeSize = 0xc8;
+
+  /// Write the resolver code into the given memory. The user is
+  /// responsible for allocating the memory and setting permissions.
+  ///
+  /// ReentryFnAddr should be the address of a function whose signature matches
+  /// void* (*)(void *TrampolineAddr, void *ReentryCtxAddr). The ReentryCtxAddr
+  /// argument of writeResolverCode will be passed as the second argument to
+  /// the function at ReentryFnAddr.
+  static void writeResolverCode(char *ResolverWorkingMem,
+                                JITTargetAddress ResolverTargetAddress,
+                                JITTargetAddress ReentryFnAddr,
+                                JITTargetAddress ReentryCtxAddr);
+
+  /// Write the requested number of trampolines into the given memory,
+  /// which must be big enough to hold 1 pointer, plus NumTrampolines
+  /// trampolines.
+  static void writeTrampolines(char *TrampolineBlockWorkingMem,
+                               JITTargetAddress TrampolineBlockTargetAddress,
+                               JITTargetAddress ResolverFnAddr,
+                               unsigned NumTrampolines);
+
+  /// Write NumStubs indirect stubs to working memory at StubsBlockWorkingMem.
+  /// Stubs will be written as if linked at StubsBlockTargetAddress, with the
+  /// Nth stub using the Nth pointer in memory starting at
+  /// PointersBlockTargetAddress.
+  static void writeIndirectStubsBlock(
+      char *StubsBlockWorkingMem, JITTargetAddress StubsBlockTargetAddress,
+      JITTargetAddress PointersBlockTargetAddress, unsigned NumStubs);
+};
+
 } // end namespace orc
 } // end namespace llvm
 
index 48aaab9..ddfb305 100644 (file)
@@ -250,6 +250,9 @@ EPCIndirectionUtils::Create(ExecutorProcessControl &EPC) {
   case Triple::x86:
     return CreateWithABI<OrcI386>(EPC);
 
+  case Triple::loongarch64:
+    return CreateWithABI<OrcLoongArch64>(EPC);
+
   case Triple::mips:
     return CreateWithABI<OrcMips32Be>(EPC);
 
index 6ebed60..989bb09 100644 (file)
@@ -137,6 +137,11 @@ createLocalCompileCallbackManager(const Triple &T, ExecutionSession &ES,
       return CCMgrT::Create(ES, ErrorHandlerAddress);
     }
 
+    case Triple::loongarch64: {
+      typedef orc::LocalJITCompileCallbackManager<orc::OrcLoongArch64> CCMgrT;
+      return CCMgrT::Create(ES, ErrorHandlerAddress);
+    }
+
     case Triple::mips: {
       typedef orc::LocalJITCompileCallbackManager<orc::OrcMips32Be> CCMgrT;
       return CCMgrT::Create(ES, ErrorHandlerAddress);
@@ -192,6 +197,12 @@ createLocalIndirectStubsManagerBuilder(const Triple &T) {
                        orc::LocalIndirectStubsManager<orc::OrcI386>>();
       };
 
+    case Triple::loongarch64:
+      return []() {
+        return std::make_unique<
+            orc::LocalIndirectStubsManager<orc::OrcLoongArch64>>();
+      };
+
     case Triple::mips:
       return [](){
           return std::make_unique<
index 20b655b..c0a740d 100644 (file)
@@ -119,6 +119,10 @@ createLocalLazyCallThroughManager(const Triple &T, ExecutionSession &ES,
   case Triple::x86:
     return LocalLazyCallThroughManager::Create<OrcI386>(ES, ErrorHandlerAddr);
 
+  case Triple::loongarch64:
+    return LocalLazyCallThroughManager::Create<OrcLoongArch64>(
+        ES, ErrorHandlerAddr);
+
   case Triple::mips:
     return LocalLazyCallThroughManager::Create<OrcMips32Be>(ES,
                                                             ErrorHandlerAddr);
index da8aaad..48dd0df 100644 (file)
@@ -1077,5 +1077,158 @@ void OrcRiscv64::writeIndirectStubsBlock(
   }
 }
 
+void OrcLoongArch64::writeResolverCode(char *ResolverWorkingMem,
+                                       JITTargetAddress ResolverTargetAddress,
+                                       JITTargetAddress ReentryFnAddr,
+                                       JITTargetAddress ReentryCtxAddr) {
+
+  LLVM_DEBUG({
+    dbgs() << "Writing resolver code to "
+           << formatv("{0:x16}", ResolverTargetAddress) << "\n";
+  });
+
+  const uint32_t ResolverCode[] = {
+      0x02fde063, // 0x0: addi.d $sp, $sp, -136(0xf78)
+      0x29c00061, // 0x4: st.d $ra, $sp, 0
+      0x29c02064, // 0x8: st.d $a0, $sp, 8(0x8)
+      0x29c04065, // 0xc: st.d $a1, $sp, 16(0x10)
+      0x29c06066, // 0x10: st.d $a2, $sp, 24(0x18)
+      0x29c08067, // 0x14: st.d $a3, $sp, 32(0x20)
+      0x29c0a068, // 0x18: st.d $a4, $sp, 40(0x28)
+      0x29c0c069, // 0x1c: st.d $a5, $sp, 48(0x30)
+      0x29c0e06a, // 0x20: st.d $a6, $sp, 56(0x38)
+      0x29c1006b, // 0x24: st.d $a7, $sp, 64(0x40)
+      0x2bc12060, // 0x28: fst.d $fa0, $sp, 72(0x48)
+      0x2bc14061, // 0x2c: fst.d $fa1, $sp, 80(0x50)
+      0x2bc16062, // 0x30: fst.d $fa2, $sp, 88(0x58)
+      0x2bc18063, // 0x34: fst.d $fa3, $sp, 96(0x60)
+      0x2bc1a064, // 0x38: fst.d $fa4, $sp, 104(0x68)
+      0x2bc1c065, // 0x3c: fst.d $fa5, $sp, 112(0x70)
+      0x2bc1e066, // 0x40: fst.d $fa6, $sp, 120(0x78)
+      0x2bc20067, // 0x44: fst.d $fa7, $sp, 128(0x80)
+      0x1c000004, // 0x48: pcaddu12i $a0, 0
+      0x28c1c084, // 0x4c: ld.d $a0, $a0, 112(0x70)
+      0x001501a5, // 0x50: move $a1, $t1
+      0x02ffd0a5, // 0x54: addi.d $a1, $a1, -12(0xff4)
+      0x1c000006, // 0x58: pcaddu12i $a2, 0
+      0x28c1a0c6, // 0x5c: ld.d $a2, $a2, 104(0x68)
+      0x4c0000c1, // 0x60: jirl $ra, $a2, 0
+      0x0015008c, // 0x64: move $t0, $a0
+      0x2b820067, // 0x68: fld.d $fa7, $sp, 128(0x80)
+      0x2b81e066, // 0x6c: fld.d $fa6, $sp, 120(0x78)
+      0x2b81c065, // 0x70: fld.d $fa5, $sp, 112(0x70)
+      0x2b81a064, // 0x74: fld.d $fa4, $sp, 104(0x68)
+      0x2b818063, // 0x78: fld.d $fa3, $sp, 96(0x60)
+      0x2b816062, // 0x7c: fld.d $fa2, $sp, 88(0x58)
+      0x2b814061, // 0x80: fld.d $fa1, $sp, 80(0x50)
+      0x2b812060, // 0x84: fld.d $fa0, $sp, 72(0x48)
+      0x28c1006b, // 0x88: ld.d $a7, $sp, 64(0x40)
+      0x28c0e06a, // 0x8c: ld.d $a6, $sp, 56(0x38)
+      0x28c0c069, // 0x90: ld.d $a5, $sp, 48(0x30)
+      0x28c0a068, // 0x94: ld.d $a4, $sp, 40(0x28)
+      0x28c08067, // 0x98: ld.d $a3, $sp, 32(0x20)
+      0x28c06066, // 0x9c: ld.d $a2, $sp, 24(0x18)
+      0x28c04065, // 0xa0: ld.d $a1, $sp, 16(0x10)
+      0x28c02064, // 0xa4: ld.d $a0, $sp, 8(0x8)
+      0x28c00061, // 0xa8: ld.d $ra, $sp, 0
+      0x02c22063, // 0xac: addi.d $sp, $sp, 136(0x88)
+      0x4c000180, // 0xb0: jr $t0
+      0x00000000, // 0xb4: padding to align at 8 bytes
+      0x01234567, // 0xb8: Lreentry_ctx_ptr:
+      0xdeedbeef, // 0xbc:      .dword 0
+      0x98765432, // 0xc0: Lreentry_fn_ptr:
+      0xcafef00d, // 0xc4:      .dword 0
+  };
+
+  const unsigned ReentryCtxAddrOffset = 0xb8;
+  const unsigned ReentryFnAddrOffset = 0xc0;
+
+  memcpy(ResolverWorkingMem, ResolverCode, sizeof(ResolverCode));
+  memcpy(ResolverWorkingMem + ReentryFnAddrOffset, &ReentryFnAddr,
+         sizeof(uint64_t));
+  memcpy(ResolverWorkingMem + ReentryCtxAddrOffset, &ReentryCtxAddr,
+         sizeof(uint64_t));
+}
+
+void OrcLoongArch64::writeTrampolines(
+    char *TrampolineBlockWorkingMem,
+    JITTargetAddress TrampolineBlockTargetAddress,
+    JITTargetAddress ResolverAddr, unsigned NumTrampolines) {
+
+  LLVM_DEBUG({
+    dbgs() << "Writing trampoline code to "
+           << formatv("{0:x16}", TrampolineBlockTargetAddress) << "\n";
+  });
+
+  unsigned OffsetToPtr = alignTo(NumTrampolines * TrampolineSize, 8);
+
+  memcpy(TrampolineBlockWorkingMem + OffsetToPtr, &ResolverAddr,
+         sizeof(uint64_t));
+
+  uint32_t *Trampolines =
+      reinterpret_cast<uint32_t *>(TrampolineBlockWorkingMem);
+  for (unsigned I = 0; I < NumTrampolines; ++I, OffsetToPtr -= TrampolineSize) {
+    uint32_t Hi20 = (OffsetToPtr + 0x800) & 0xfffff000;
+    uint32_t Lo12 = OffsetToPtr - Hi20;
+    Trampolines[4 * I + 0] =
+        0x1c00000c |
+        (((Hi20 >> 12) & 0xfffff) << 5); // pcaddu12i $t0, %pc_hi20(Lptr)
+    Trampolines[4 * I + 1] =
+        0x28c0018c | ((Lo12 & 0xfff) << 10); // ld.d $t0, $t0, %pc_lo12(Lptr)
+    Trampolines[4 * I + 2] = 0x4c00018d;     // jirl $t1, $t0, 0
+    Trampolines[4 * I + 3] = 0x0;            // padding
+  }
+}
+
+void OrcLoongArch64::writeIndirectStubsBlock(
+    char *StubsBlockWorkingMem, JITTargetAddress StubsBlockTargetAddress,
+    JITTargetAddress PointersBlockTargetAddress, unsigned NumStubs) {
+  // Stub format is:
+  //
+  // .section __orc_stubs
+  // stub1:
+  //        pcaddu12i $t0, %pc_hi20(ptr1)      ; PC-rel load of ptr1
+  //        ld.d      $t0, $t0, %pc_lo12(ptr1)
+  //        jr        $t0                      ; Jump to resolver
+  //        .dword    0                        ; Pad to 16 bytes
+  // stub2:
+  //        pcaddu12i $t0, %pc_hi20(ptr2)      ; PC-rel load of ptr2
+  //        ld.d      $t0, $t0, %pc_lo12(ptr2)
+  //        jr        $t0                      ; Jump to resolver
+  //        .dword    0                        ; Pad to 16 bytes
+  // ...
+  //
+  // .section __orc_ptrs
+  // ptr1:
+  //        .dword 0x0
+  // ptr2:
+  //        .dword 0x0
+  // ...
+  LLVM_DEBUG({
+    dbgs() << "Writing stubs code to "
+           << formatv("{0:x16}", StubsBlockTargetAddress) << "\n";
+  });
+  assert(stubAndPointerRangesOk<OrcLoongArch64>(
+             StubsBlockTargetAddress, PointersBlockTargetAddress, NumStubs) &&
+         "PointersBlock is out of range");
+
+  uint32_t *Stub = reinterpret_cast<uint32_t *>(StubsBlockWorkingMem);
+
+  for (unsigned I = 0; I < NumStubs; ++I) {
+    uint64_t PtrDisplacement =
+        PointersBlockTargetAddress - StubsBlockTargetAddress;
+    uint32_t Hi20 = (PtrDisplacement + 0x800) & 0xfffff000;
+    uint32_t Lo12 = PtrDisplacement - Hi20;
+    Stub[4 * I + 0] = 0x1c00000c | (((Hi20 >> 12) & 0xfffff)
+                                    << 5); // pcaddu12i $t0, %pc_hi20(Lptr)
+    Stub[4 * I + 1] =
+        0x28c0018c | ((Lo12 & 0xfff) << 10); // ld.d $t0, $t0, %pc_lo12(Lptr)
+    Stub[4 * I + 2] = 0x4c000180;            // jr $t0
+    Stub[4 * I + 3] = 0x0;                   // padding
+    PointersBlockTargetAddress += PointerSize;
+    StubsBlockTargetAddress += StubSize;
+  }
+}
+
 } // End namespace orc.
 } // End namespace llvm.