From 5a3a527f8ae220583981a122fda2a2c2eefa1a41 Mon Sep 17 00:00:00 2001
From: Shilei Tian
Date: Tue, 27 Dec 2022 22:18:57 -0500
Subject: [PATCH] [OpenMP] Introduce basic JIT support to OpenMP target
 offloading

This patch adds basic JIT support for OpenMP target offloading. Currently it
only works on NVIDIA GPUs. Support for AMDGPU can be added easily by
implementing three interface functions. However, the infrastructure needs one
small extra extension (a pre-processing hook) to be portable to AMDGPU,
because the AMDGPU backend reads target features from functions.
https://github.com/shiltian/llvm-project/commit/02bc7effccc6ff2f5ab3fe5218336094c0485766#diff-321c2038035972ad4994ff9d85b29950ba72c08a79891db5048b8f5d46915314R432
shows roughly how that works.

As for testing, even though the corresponding code has been added to the CMake
files, the test cannot be triggered yet because some code is still missing from
the new plugin's CMake file. That issue is unrelated to this patch and will be
fixed later.

To enable JIT mode, `-foffload-lto` is needed when compiling, and
`-foffload-lto -Wl,--embed-bitcode` is needed when linking. This implies that
LTO is required to enable JIT mode.

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D139287
---
 openmp/libomptarget/CMakeLists.txt                 |   3 +-
 openmp/libomptarget/plugins-nextgen/CMakeLists.txt |   3 +
 .../common/PluginInterface/CMakeLists.txt          |  35 +-
 .../plugins-nextgen/common/PluginInterface/JIT.cpp | 375 +++++++++++++++++++++
 .../plugins-nextgen/common/PluginInterface/JIT.h   |  50 +++
 .../common/PluginInterface/PluginInterface.cpp     |  38 ++-
 .../common/PluginInterface/PluginInterface.h       |  17 +
 .../libomptarget/plugins-nextgen/cuda/src/rtl.cpp  |  25 ++
 .../plugins-nextgen/generic-elf-64bit/src/rtl.cpp  |   4 +
 openmp/libomptarget/test/lit.cfg                   |  16 +
 10 files changed, 561 insertions(+), 5 deletions(-)
 create mode 100644 openmp/libomptarget/plugins-nextgen/common/PluginInterface/JIT.cpp
 create mode 100644 openmp/libomptarget/plugins-nextgen/common/PluginInterface/JIT.h

diff --git a/openmp/libomptarget/CMakeLists.txt b/openmp/libomptarget/CMakeLists.txt
index 21f211a..bc6e615 100644
--- a/openmp/libomptarget/CMakeLists.txt
+++ b/openmp/libomptarget/CMakeLists.txt
@@ -50,6 +50,7 @@ set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} x86_64-pc-linux-gnu")
 set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} x86_64-pc-linux-gnu-LTO")
 set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} nvptx64-nvidia-cuda")
 set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} nvptx64-nvidia-cuda-LTO")
+set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} nvptx64-nvidia-cuda-JIT-LTO")
 
 # Once the plugins for the different targets are validated, they will be added to
 # the list of supported targets in the current system.
@@ -73,7 +74,7 @@ endif()
 # Follow host OMPT support and check if host support has been requested.
 # LIBOMP_HAVE_OMPT_SUPPORT indicates whether host OMPT support has been implemented.
 # LIBOMP_OMPT_SUPPORT indicates whether host OMPT support has been requested (default is ON).
-# LIBOMPTARGET_OMPT_SUPPORT indicates whether target OMPT support has been requested (default is ON).
+# LIBOMPTARGET_OMPT_SUPPORT indicates whether target OMPT support has been requested (default is ON).
 set(OMPT_TARGET_DEFAULT FALSE)
 if ((LIBOMP_HAVE_OMPT_SUPPORT) AND (LIBOMP_OMPT_SUPPORT) AND (NOT WIN32))
   set (OMPT_TARGET_DEFAULT TRUE)
diff --git a/openmp/libomptarget/plugins-nextgen/CMakeLists.txt b/openmp/libomptarget/plugins-nextgen/CMakeLists.txt
index e775123..95e359c 100644
--- a/openmp/libomptarget/plugins-nextgen/CMakeLists.txt
+++ b/openmp/libomptarget/plugins-nextgen/CMakeLists.txt
@@ -33,6 +33,9 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "${tmachine}$")
   # Define macro with the ELF ID for this target.
   add_definitions("-DTARGET_ELF_ID=${elf_machine_id}")
 
+  # Define the target triple.
+  add_definitions("-DLIBOMPTARGET_NEXTGEN_GENERIC_PLUGIN_TRIPLE=${tmachine}")
+
   add_llvm_library("omptarget.rtl.${tmachine_libname}.nextgen"
     SHARED
 
diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/CMakeLists.txt b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/CMakeLists.txt
index e8df5ff..9abd430 100644
--- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/CMakeLists.txt
+++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/CMakeLists.txt
@@ -12,7 +12,14 @@
 
 # NOTE: Don't try to build `PluginInterface` using `add_llvm_library` because we
 # don't want to export `PluginInterface` while `add_llvm_library` requires that.
-add_library(PluginInterface OBJECT PluginInterface.cpp GlobalHandler.cpp)
+add_library(PluginInterface OBJECT
+  PluginInterface.cpp GlobalHandler.cpp JIT.cpp)
+
+# Only enable JIT for those targets that LLVM can support.
+string(TOUPPER "${LLVM_TARGETS_TO_BUILD}" TargetsSupported)
+foreach(Target ${TargetsSupported})
+  target_compile_definitions(PluginInterface PRIVATE "LIBOMPTARGET_JIT_${Target}")
+endforeach()
 
 # This is required when using LLVM libraries.
 llvm_update_compile_flags(PluginInterface)
@@ -20,7 +27,31 @@ llvm_update_compile_flags(PluginInterface)
 if (LLVM_LINK_LLVM_DYLIB)
   set(llvm_libs LLVM)
 else()
-  llvm_map_components_to_libnames(llvm_libs Support)
+  llvm_map_components_to_libnames(llvm_libs
+    ${LLVM_TARGETS_TO_BUILD}
+    AggressiveInstCombine
+    Analysis
+    BinaryFormat
+    BitReader
+    BitWriter
+    CodeGen
+    Core
+    Extensions
+    InstCombine
+    Instrumentation
+    IPO
+    IRReader
+    Linker
+    MC
+    Object
+    Passes
+    Remarks
+    ScalarOpts
+    Support
+    Target
+    TransformUtils
+    Vectorize
+    )
 endif()
 
 target_link_libraries(PluginInterface
diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/JIT.cpp b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/JIT.cpp
new file mode 100644
index 0000000..0d42c6d
--- /dev/null
+++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/JIT.cpp
@@ -0,0 +1,375 @@
+//===- JIT.cpp - Target independent JIT infrastructure --------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#include "JIT.h"
+#include "Debug.h"
+
+#include "omptarget.h"
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/CommandFlags.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/LLVMRemarkStreamer.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IRReader/IRReader.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/MC/SubtargetFeature.h"
+#include "llvm/MC/TargetRegistry.h"
+#include "llvm/Object/IRObjectFile.h"
+#include "llvm/Passes/OptimizationLevel.h"
+#include "llvm/Passes/PassBuilder.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/TargetSelect.h"
+#include "llvm/Support/TimeProfiler.h"
+#include "llvm/Support/ToolOutputFile.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+
+#include <mutex>
+
+using namespace llvm;
+using namespace llvm::object;
+using namespace omp;
+
+static codegen::RegisterCodeGenFlags RCGF;
+
+namespace {
+std::once_flag InitFlag;
+
+void init(Triple TT) {
+  bool JITTargetInitialized = false;
+#ifdef LIBOMPTARGET_JIT_NVPTX
+  if (TT.isNVPTX()) {
+    LLVMInitializeNVPTXTargetInfo();
+    LLVMInitializeNVPTXTarget();
+    LLVMInitializeNVPTXTargetMC();
+    LLVMInitializeNVPTXAsmPrinter();
+    JITTargetInitialized = true;
+  }
+#endif
+#ifdef LIBOMPTARGET_JIT_AMDGPU
+  if (TT.isAMDGPU()) {
+    LLVMInitializeAMDGPUTargetInfo();
+    LLVMInitializeAMDGPUTarget();
+    LLVMInitializeAMDGPUTargetMC();
+    LLVMInitializeAMDGPUAsmPrinter();
+    JITTargetInitialized = true;
+  }
+#endif
+  if (!JITTargetInitialized) {
+    FAILURE_MESSAGE("unsupported JIT target");
+    abort();
+  }
+
+  // Initialize passes
+  PassRegistry &Registry = *PassRegistry::getPassRegistry();
+  initializeCore(Registry);
+  initializeScalarOpts(Registry);
+  initializeVectorization(Registry);
+  initializeIPO(Registry);
+  initializeAnalysis(Registry);
+  initializeTransformUtils(Registry);
+  initializeInstCombine(Registry);
+  initializeTarget(Registry);
+
+  initializeExpandLargeDivRemLegacyPassPass(Registry);
+  initializeExpandLargeFpConvertLegacyPassPass(Registry);
+  initializeExpandMemCmpPassPass(Registry);
+  initializeScalarizeMaskedMemIntrinLegacyPassPass(Registry);
+  initializeSelectOptimizePass(Registry);
+  initializeCodeGenPreparePass(Registry);
+  initializeAtomicExpandPass(Registry);
+  initializeRewriteSymbolsLegacyPassPass(Registry);
+  initializeWinEHPreparePass(Registry);
+  initializeDwarfEHPrepareLegacyPassPass(Registry);
+  initializeSafeStackLegacyPassPass(Registry);
+  initializeSjLjEHPreparePass(Registry);
+  initializePreISelIntrinsicLoweringLegacyPassPass(Registry);
+  initializeGlobalMergePass(Registry);
+  initializeIndirectBrExpandPassPass(Registry);
+  initializeInterleavedLoadCombinePass(Registry);
+  initializeInterleavedAccessPass(Registry);
+  initializeUnreachableBlockElimLegacyPassPass(Registry);
+  initializeExpandReductionsPass(Registry);
+  initializeExpandVectorPredicationPass(Registry);
+  initializeWasmEHPreparePass(Registry);
+  initializeWriteBitcodePassPass(Registry);
+  initializeHardwareLoopsPass(Registry);
+  initializeTypePromotionPass(Registry);
+  initializeReplaceWithVeclibLegacyPass(Registry);
+  initializeJMCInstrumenterPass(Registry);
+}
+
+Expected<std::unique_ptr<Module>>
+createModuleFromImage(__tgt_device_image *Image, LLVMContext &Context) {
+  StringRef Data((const char *)Image->ImageStart,
+                 (char *)Image->ImageEnd - (char *)Image->ImageStart);
+  std::unique_ptr<MemoryBuffer> MB = MemoryBuffer::getMemBuffer(
+      Data, /* BufferName */ "", /* RequiresNullTerminator */ false);
+  SMDiagnostic Err;
+  auto Mod = parseIR(*MB, Err, Context);
+  if (!Mod)
+    return make_error<StringError>("Failed to create module",
+                                   inconvertibleErrorCode());
+  return Mod;
+}
+
+CodeGenOpt::Level getCGOptLevel(unsigned OptLevel) {
+  switch (OptLevel) {
+  case 0:
+    return CodeGenOpt::None;
+  case 1:
+    return CodeGenOpt::Less;
+  case 2:
+    return CodeGenOpt::Default;
+  case 3:
+    return CodeGenOpt::Aggressive;
+  }
+  llvm_unreachable("Invalid optimization level");
+}
+
+OptimizationLevel getOptLevel(unsigned OptLevel) {
+  switch (OptLevel) {
+  case 0:
+    return OptimizationLevel::O0;
+  case 1:
+    return OptimizationLevel::O1;
+  case 2:
+    return OptimizationLevel::O2;
+  case 3:
+    return OptimizationLevel::O3;
+  }
+  llvm_unreachable("Invalid optimization level");
+}
+
+Expected<std::unique_ptr<TargetMachine>>
+createTargetMachine(Module &M, std::string CPU, unsigned OptLevel) {
+  Triple TT(M.getTargetTriple());
+  CodeGenOpt::Level CGOptLevel = getCGOptLevel(OptLevel);
+
+  std::string Msg;
+  const Target *T = TargetRegistry::lookupTarget(M.getTargetTriple(), Msg);
+  if (!T)
+    return make_error<StringError>(Msg, inconvertibleErrorCode());
+
+  SubtargetFeatures Features;
+  Features.getDefaultSubtargetFeatures(TT);
+
+  std::optional<Reloc::Model> RelocModel;
+  if (M.getModuleFlag("PIC Level"))
+    RelocModel =
+        M.getPICLevel() == PICLevel::NotPIC ? Reloc::Static : Reloc::PIC_;
+
+  std::optional<CodeModel::Model> CodeModel = M.getCodeModel();
+
+  TargetOptions Options = codegen::InitTargetOptionsFromCodeGenFlags(TT);
+
+  std::unique_ptr<TargetMachine> TM(
+      T->createTargetMachine(M.getTargetTriple(), CPU, Features.getString(),
+                             Options, RelocModel, CodeModel, CGOptLevel));
+  if (!TM)
+    return make_error<StringError>("Failed to create target machine",
+                                   inconvertibleErrorCode());
+  return TM;
+}
+
+/// A simple JIT engine that parses, optimizes, and compiles a bitcode image
+/// into a device image for the target \p TT.
+class JITEngine {
+public:
+  JITEngine(Triple::ArchType TA, std::string MCpu)
+      : TT(Triple::getArchTypeName(TA)), CPU(MCpu) {
+    std::call_once(InitFlag, init, TT);
+  }
+
+  /// Run jit compilation. It is expected to get a memory buffer containing the
+  /// generated device image that could be loaded to the device directly.
+  Expected<std::unique_ptr<MemoryBuffer>>
+  run(__tgt_device_image *Image, unsigned OptLevel,
+      jit::PostProcessingFn PostProcessing);
+
+private:
+  /// Run backend, which contains optimization and code generation.
+  Expected<std::unique_ptr<MemoryBuffer>> backend(Module &M, unsigned OptLevel);
+
+  /// Run optimization pipeline.
+  void opt(TargetMachine *TM, TargetLibraryInfoImpl *TLII, Module &M,
+           unsigned OptLevel);
+
+  /// Run code generation.
+  void codegen(TargetMachine *TM, TargetLibraryInfoImpl *TLII, Module &M,
+               raw_pwrite_stream &OS);
+
+  LLVMContext Context;
+  const Triple TT;
+  const std::string CPU;
+};
+
+void JITEngine::opt(TargetMachine *TM, TargetLibraryInfoImpl *TLII, Module &M,
+                    unsigned OptLevel) {
+  PipelineTuningOptions PTO;
+  std::optional<PGOOptions> PGOOpt;
+
+  LoopAnalysisManager LAM;
+  FunctionAnalysisManager FAM;
+  CGSCCAnalysisManager CGAM;
+  ModuleAnalysisManager MAM;
+  ModulePassManager MPM;
+
+  PassBuilder PB(TM, PTO, PGOOpt, nullptr);
+
+  FAM.registerPass([&] { return TargetLibraryAnalysis(*TLII); });
+
+  // Register all the basic analyses with the managers.
+  PB.registerModuleAnalyses(MAM);
+  PB.registerCGSCCAnalyses(CGAM);
+  PB.registerFunctionAnalyses(FAM);
+  PB.registerLoopAnalyses(LAM);
+  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);
+
+  MPM.addPass(PB.buildPerModuleDefaultPipeline(getOptLevel(OptLevel)));
+
+  MPM.run(M, MAM);
+}
+
+void JITEngine::codegen(TargetMachine *TM, TargetLibraryInfoImpl *TLII,
+                        Module &M, raw_pwrite_stream &OS) {
+  legacy::PassManager PM;
+  PM.add(new TargetLibraryInfoWrapperPass(*TLII));
+  MachineModuleInfoWrapperPass *MMIWP = new MachineModuleInfoWrapperPass(
+      reinterpret_cast<LLVMTargetMachine *>(TM));
+  TM->addPassesToEmitFile(PM, OS, nullptr,
+                          TT.isNVPTX() ? CGFT_AssemblyFile : CGFT_ObjectFile,
+                          /* DisableVerify */ false, MMIWP);
+
+  PM.run(M);
+}
+
+Expected<std::unique_ptr<MemoryBuffer>> JITEngine::backend(Module &M,
+                                                           unsigned OptLevel) {
+
+  auto RemarksFileOrErr = setupLLVMOptimizationRemarks(
+      Context, /* RemarksFilename */ "", /* RemarksPasses */ "",
+      /* RemarksFormat */ "", /* RemarksWithHotness */ false);
+  if (Error E = RemarksFileOrErr.takeError())
+    return std::move(E);
+  if (*RemarksFileOrErr)
+    (*RemarksFileOrErr)->keep();
+
+  auto TMOrErr = createTargetMachine(M, CPU, OptLevel);
+  if (!TMOrErr)
+    return TMOrErr.takeError();
+
+  std::unique_ptr<TargetMachine> TM = std::move(*TMOrErr);
+  TargetLibraryInfoImpl TLII(TT);
+
+  opt(TM.get(), &TLII, M, OptLevel);
+
+  // Prepare the output buffer and stream for codegen.
+  SmallVector<char> CGOutputBuffer;
+  raw_svector_ostream OS(CGOutputBuffer);
+
+  codegen(TM.get(), &TLII, M, OS);
+
+  return MemoryBuffer::getMemBufferCopy(OS.str());
+}
+
+Expected<std::unique_ptr<MemoryBuffer>>
+JITEngine::run(__tgt_device_image *Image, unsigned OptLevel,
+               jit::PostProcessingFn PostProcessing) {
+  auto ModOrErr = createModuleFromImage(Image, Context);
+  if (!ModOrErr)
+    return ModOrErr.takeError();
+
+  auto Mod = std::move(*ModOrErr);
+
+  auto MBOrError = backend(*Mod, OptLevel);
+  if (!MBOrError)
+    return MBOrError.takeError();
+
+  return PostProcessing(std::move(*MBOrError));
+}
+
+/// A map from a bitcode image start address to its corresponding triple. If
+/// the image is not in the map, it is not a bitcode image.
+DenseMap<void *, Triple::ArchType> BitcodeImageMap;
+
+/// Output images generated from LLVM backend.
+SmallVector<std::unique_ptr<MemoryBuffer>, 4> JITImages;
+
+/// A list of __tgt_device_image images.
+std::list<__tgt_device_image> TgtImages;
+} // namespace
+
+namespace llvm {
+namespace omp {
+namespace jit {
+bool checkBitcodeImage(__tgt_device_image *Image, Triple::ArchType TA) {
+  TimeTraceScope TimeScope("Check bitcode image");
+
+  {
+    auto Itr = BitcodeImageMap.find(Image->ImageStart);
+    if (Itr != BitcodeImageMap.end() && Itr->second == TA)
+      return true;
+  }
+
+  StringRef Data(reinterpret_cast<const char *>(Image->ImageStart),
+                 reinterpret_cast<const char *>(Image->ImageEnd) -
+                     reinterpret_cast<const char *>(Image->ImageStart));
+  std::unique_ptr<MemoryBuffer> MB = MemoryBuffer::getMemBuffer(
+      Data, /* BufferName */ "", /* RequiresNullTerminator */ false);
+  if (!MB)
+    return false;
+
+  Expected<object::IRSymtabFile> FOrErr = object::readIRSymtab(*MB);
+  if (!FOrErr) {
+    consumeError(FOrErr.takeError());
+    return false;
+  }
+
+  auto ActualTriple = FOrErr->TheReader.getTargetTriple();
+
+  if (Triple(ActualTriple).getArch() == TA) {
+    BitcodeImageMap[Image->ImageStart] = TA;
+    return true;
+  }
+
+  return false;
+}
+
+Expected<__tgt_device_image *> compile(__tgt_device_image *Image,
+                                       Triple::ArchType TA, std::string MCPU,
+                                       unsigned OptLevel,
+                                       PostProcessingFn PostProcessing) {
+  JITEngine J(TA, MCPU);
+
+  auto ImageMBOrErr = J.run(Image, OptLevel, PostProcessing);
+  if (!ImageMBOrErr)
+    return ImageMBOrErr.takeError();
+
+  JITImages.push_back(std::move(*ImageMBOrErr));
+  TgtImages.push_back(*Image);
+
+  auto &ImageMB = JITImages.back();
+  auto *NewImage = &TgtImages.back();
+
+  NewImage->ImageStart = (void *)ImageMB->getBufferStart();
+  NewImage->ImageEnd = (void *)ImageMB->getBufferEnd();
+
+  return NewImage;
+}
+
+} // namespace jit
+} // namespace omp
+} // namespace llvm
diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/JIT.h b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/JIT.h
new file mode 100644
index 0000000..73483ce
--- /dev/null
+++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/JIT.h
@@ -0,0 +1,50 @@
+//===- JIT.h - Target independent JIT infrastructure ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_JIT_H
+#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_JIT_H
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Support/Error.h"
+
+#include <functional>
+#include <memory>
+#include <string>
+
+struct __tgt_device_image;
+
+namespace llvm {
+class MemoryBuffer;
+
+namespace omp {
+namespace jit {
+
+/// Function type for a callback that will be called after the backend has
+/// run.
+using PostProcessingFn = std::function<Expected<std::unique_ptr<MemoryBuffer>>(
+    std::unique_ptr<MemoryBuffer>)>;
+
+/// Check if \p Image contains bitcode whose target matches the architecture
+/// \p TA.
+bool checkBitcodeImage(__tgt_device_image *Image, Triple::ArchType TA);
+
+/// Compile the bitcode image \p Image and generate a binary image that can be
+/// loaded onto the target device with architecture \p TA and processor
+/// \p MCpu. \p PostProcessing will be called after codegen to handle cases
+/// such as running an external assembler.
+Expected<__tgt_device_image *> compile(__tgt_device_image *Image,
+                                       Triple::ArchType TA, std::string MCpu,
+                                       unsigned OptLevel,
+                                       PostProcessingFn PostProcessing);
+} // namespace jit
+} // namespace omp
+} // namespace llvm
+
+#endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_JIT_H
diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
index 309e7e0..4d73ab6 100644
--- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
+++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
@@ -11,6 +11,7 @@
 #include "PluginInterface.h"
 #include "Debug.h"
 #include "GlobalHandler.h"
+#include "JIT.h"
 #include "elf_common.h"
 #include "omptarget.h"
 #include "omptargetplugin.h"
@@ -629,7 +630,10 @@ int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *TgtImage) {
   if (!Plugin::isActive())
     return false;
 
-  return elf_check_machine(TgtImage, Plugin::get().getMagicElfBits());
+  if (elf_check_machine(TgtImage, Plugin::get().getMagicElfBits()))
+    return true;
+
+  return jit::checkBitcodeImage(TgtImage, Plugin::get().getTripleArch());
 }
 
 int32_t __tgt_rtl_is_valid_binary_info(__tgt_device_image *TgtImage,
@@ -700,7 +704,37 @@ int32_t __tgt_rtl_is_data_exchangable(int32_t SrcDeviceId,
 __tgt_target_table *__tgt_rtl_load_binary(int32_t DeviceId,
                                           __tgt_device_image *TgtImage) {
   GenericPluginTy &Plugin = Plugin::get();
-  auto TableOrErr = Plugin.getDevice(DeviceId).loadBinary(Plugin, TgtImage);
+  GenericDeviceTy &Device = Plugin.getDevice(DeviceId);
+
+  // If it is a bitcode image, we have to jit the binary image before loading
+  // it to the device.
+  {
+    UInt32Envar JITOptLevel("LIBOMPTARGET_JIT_OPT_LEVEL", 3);
+    Triple::ArchType TA = Plugin.getTripleArch();
+    std::string Arch = Device.getArch();
+
+    jit::PostProcessingFn PostProcessing =
+        [&Device](std::unique_ptr<MemoryBuffer> MB)
+        -> Expected<std::unique_ptr<MemoryBuffer>> {
+      return Device.doJITPostProcessing(std::move(MB));
+    };
+
+    if (jit::checkBitcodeImage(TgtImage, TA)) {
+      auto TgtImageOrErr =
+          jit::compile(TgtImage, TA, Arch, JITOptLevel, PostProcessing);
+      if (!TgtImageOrErr) {
+        auto Err = TgtImageOrErr.takeError();
+        REPORT("Failure to jit binary image from bitcode image %p on device "
+               "%d: %s\n",
+               TgtImage, DeviceId, toString(std::move(Err)).data());
+        return nullptr;
+      }
+
+      TgtImage = *TgtImageOrErr;
+    }
+  }
+
+  auto TableOrErr = Device.loadBinary(Plugin, TgtImage);
   if (!TableOrErr) {
     auto Err = TableOrErr.takeError();
     REPORT("Failure to load binary image %p on device %d: %s\n", TgtImage,
diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
index 45b0fb0..9f25301 100644
--- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
+++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
@@ -26,6 +26,7 @@
 #include "omptarget.h"
 
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Triple.h"
 #include "llvm/Frontend/OpenMP/OMPConstants.h"
 #include "llvm/Frontend/OpenMP/OMPGridValues.h"
 #include "llvm/Support/Allocator.h"
@@ -377,6 +378,17 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
   }
   uint32_t getDynamicMemorySize() const { return OMPX_SharedMemorySize; }
 
+  /// Get target architecture.
+  virtual std::string getArch() const {
+    llvm_unreachable("device doesn't support JIT");
+  }
+
+  /// Post processing after the JIT backend. Ownership of \p MB will be taken.
+  virtual Expected<std::unique_ptr<MemoryBuffer>>
+  doJITPostProcessing(std::unique_ptr<MemoryBuffer> MB) const {
+    return MB;
+  }
+
 private:
   /// Register offload entry for global variable.
   Error registerGlobalOffloadEntry(DeviceImageTy &DeviceImage,
@@ -526,6 +538,11 @@ struct GenericPluginTy {
   /// Get the ELF code to recognize the binary image of this plugin.
   virtual uint16_t getMagicElfBits() const = 0;
 
+  /// Get the target triple of this plugin.
+  virtual Triple::ArchType getTripleArch() const {
+    llvm_unreachable("target doesn't support jit");
+  }
+
   /// Allocate a structure using the internal allocator.
   template <typename Ty> Ty *allocate() {
     return reinterpret_cast<Ty *>(Allocator.Allocate(sizeof(Ty), alignof(Ty)));
diff --git a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
index 8ab6089..5b9fc77 100644
--- a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
@@ -278,6 +278,14 @@ struct CUDADeviceTy : public GenericDeviceTy {
                                          GridValues.GV_Warp_Size))
       return Err;
 
+    if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
+                                 ComputeCapability.Major))
+      return Err;
+
+    if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
+                                 ComputeCapability.Minor))
+      return Err;
+
     return Plugin::success();
   }
 
@@ -794,6 +802,9 @@ struct CUDADeviceTy : public GenericDeviceTy {
     return Plugin::check(Res, "Error in cuDeviceGetAttribute: %s");
   }
 
+  /// See GenericDeviceTy::getArch().
+  std::string getArch() const override { return ComputeCapability.str(); }
+
 private:
   using CUDAStreamManagerTy = GenericDeviceResourceManagerTy<CUDAStreamRef>;
   using CUDAEventManagerTy = GenericDeviceResourceManagerTy<CUDAEventRef>;
@@ -810,6 +821,15 @@ private:
 
   /// The CUDA device handler.
   CUdevice Device = CU_DEVICE_INVALID;
+
+  /// The compute capability of the corresponding CUDA device.
+  struct ComputeCapabilityTy {
+    uint32_t Major;
+    uint32_t Minor;
+    std::string str() const {
+      return "sm_" + std::to_string(Major * 10 + Minor);
+    }
+  } ComputeCapability;
 };
 
 Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
@@ -908,6 +928,11 @@ struct CUDAPluginTy final : public GenericPluginTy {
   /// Get the ELF code for recognizing the compatible image binary.
   uint16_t getMagicElfBits() const override { return ELF::EM_CUDA; }
 
+  Triple::ArchType getTripleArch() const override {
+    // TODO: I think we can drop the support for 32-bit NVPTX devices.
+    return Triple::nvptx64;
+  }
+
   /// Check whether the image is compatible with the available CUDA devices.
   Expected<bool> isImageCompatible(__tgt_image_info *Info) const override {
     for (int32_t DevId = 0; DevId < getNumDevices(); ++DevId) {
diff --git a/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp
index 56ed371..ed6897a 100644
--- a/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp
@@ -364,6 +364,10 @@ struct GenELF64PluginTy final : public GenericPluginTy {
   Expected<bool> isImageCompatible(__tgt_image_info *Info) const override {
     return true;
   }
+
+  Triple::ArchType getTripleArch() const override {
+    return Triple::LIBOMPTARGET_NEXTGEN_GENERIC_PLUGIN_TRIPLE;
+  }
 };
 
 GenericPluginTy *Plugin::createPlugin() { return new GenELF64PluginTy(); }
diff --git a/openmp/libomptarget/test/lit.cfg b/openmp/libomptarget/test/lit.cfg
index e000d23..e359f9c 100644
--- a/openmp/libomptarget/test/lit.cfg
+++ b/openmp/libomptarget/test/lit.cfg
@@ -34,6 +34,15 @@ def append_dynamic_library_path(name, value, sep):
     else:
         config.environment[name] = value
 
+# Evaluate a string environment variable as a boolean value.
+def evaluate_bool_env(env):
+    env = env.lower()
+    possible_true_values = ["on", "true", "1"]
+    for v in possible_true_values:
+        if env == v:
+            return True
+    return False
+
 # name: The name of this test suite.
 config.name = 'libomptarget :: ' + config.libomptarget_current_target
 
@@ -111,10 +120,17 @@ else: # Unices
         config.test_flags += " --libomptarget-nvptx-bc-path=" + config.library_dir
     if config.libomptarget_current_target.endswith('-LTO'):
         config.test_flags += " -foffload-lto"
+    if config.libomptarget_current_target.endswith('-JIT-LTO') and evaluate_bool_env(
+        config.environment['LIBOMPTARGET_NEXTGEN_PLUGINS']
+    ):
+        config.test_flags += " -foffload-lto"
+        config.test_flags += " -Wl,--embed-bitcode"
 
 def remove_suffix_if_present(name):
-    if name.endswith('-LTO'):
-        return name[:-4]
+    if name.endswith('-JIT-LTO'):
+        return name[:-8]
+    elif name.endswith('-LTO'):
+        return name[:-4]
     else:
         return name
-- 
2.7.4
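
For reference, the "three interface functions" mentioned in the commit message
are GenericPluginTy::getTripleArch(), GenericDeviceTy::getArch(), and
GenericDeviceTy::doJITPostProcessing(), all introduced in PluginInterface.h by
this patch. Below is a minimal sketch (not part of the patch) of how another
plugin could provide them, modeled on the CUDA plugin above. The AMDGPU type
names and the "gfx90a" architecture string are hypothetical placeholders; a
real plugin would query the architecture from the device and may do actual
work in doJITPostProcessing().

    // Sketch only (hypothetical names): an AMDGPU-style plugin overriding the
    // three JIT hooks. Assumes it lives in the same namespace as the other
    // nextgen plugins and that all other mandatory overrides exist elsewhere.
    #include "PluginInterface.h"

    struct AMDGPUPluginTy : public GenericPluginTy {
      // Tell the common JIT code which LLVM target to initialize and compile
      // for when a bitcode image is encountered.
      Triple::ArchType getTripleArch() const override { return Triple::amdgcn; }
      // ... remaining GenericPluginTy overrides ...
    };

    struct AMDGPUDeviceTy : public GenericDeviceTy {
      // The CPU string handed to the JIT's createTargetMachine(); a real
      // plugin would query it from the device (cf. ComputeCapability in the
      // CUDA plugin). "gfx90a" is only an illustrative value.
      std::string getArch() const override { return "gfx90a"; }

      // Post-process the backend output into a loadable image. Returning MB
      // unchanged assumes the backend already emitted a loadable binary; an
      // external assembler/linker step would go here otherwise.
      Expected<std::unique_ptr<MemoryBuffer>>
      doJITPostProcessing(std::unique_ptr<MemoryBuffer> MB) const override {
        return MB;
      }
      // ... remaining GenericDeviceTy overrides ...
    };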