[OpenMP] Support kernel record and replay
author: Giorgis Georgakoudis <georgakoudis1@llnl.gov>
Tue, 17 Jan 2023 23:35:44 +0000 (15:35 -0800)
committer: Giorgis Georgakoudis <georgakoudis1@llnl.gov>
Wed, 18 Jan 2023 00:29:03 +0000 (16:29 -0800)
This patch adds functionality for recording and replaying the execution of OpenMP offload kernels, based on an original implementation by Steve Rangel. The patch extends libomptarget to extract a json description of the kernel, the device image binary, and a device memory snapshot before and after the execution of a recorded kernel. Kernel recording/replaying in libomptarget is controlled through env vars (LIBOMPTARGET_RECORD, LIBOMPTARGET_REPLAY). It provides a tool, llvm-omp-kernel-replay, for replaying a kernel using the extracted information with the ability to verify replayed execution using the post-execution device memory snapshot, also supporting changing the number of teams/threads for replaying.

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D138931

openmp/libomptarget/include/omptarget.h
openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
openmp/libomptarget/src/exports
openmp/libomptarget/src/interface.cpp
openmp/libomptarget/src/omptarget.cpp
openmp/libomptarget/src/private.h
openmp/libomptarget/tools/CMakeLists.txt
openmp/libomptarget/tools/kernelreplay/CMakeLists.txt [new file with mode: 0644]
openmp/libomptarget/tools/kernelreplay/llvm-omp-kernel-replay.cpp [new file with mode: 0644]

index 93f6800..76b6867 100644 (file)
@@ -416,6 +416,14 @@ int __tgt_target_kernel_nowait(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
 // data.
 void __tgt_target_nowait_query(void **AsyncHandle);
 
+/// Executes a target kernel by replaying recorded kernel arguments and
+/// device memory.
+/// \p HostPtr identifies the kernel, \p DeviceMemory points to
+/// \p DeviceMemorySize bytes of recorded device memory, and \p TgtArgs and
+/// \p TgtOffsets hold the \p NumArgs recorded kernel arguments and argument
+/// offsets. \p NumTeams, \p ThreadLimit and \p LoopTripCount give the launch
+/// configuration to use for the replayed execution.
+int __tgt_target_kernel_replay(ident_t *Loc, int64_t DeviceId, void *HostPtr,
+                               void *DeviceMemory, int64_t DeviceMemorySize,
+                               void **TgtArgs, ptrdiff_t *TgtOffsets,
+                               int32_t NumArgs, int32_t NumTeams,
+                               int32_t ThreadLimit, uint64_t LoopTripCount);
+
 void __tgt_set_info_flag(uint32_t);
 
 int __tgt_print_device_info(int64_t DeviceId);
index 5d160ad..d20fbf6 100644 (file)
@@ -18,6 +18,8 @@
 
 #include "llvm/Frontend/OpenMP/OMPConstants.h"
 #include "llvm/Support/Error.h"
+#include "llvm/Support/JSON.h"
+#include "llvm/Support/MemoryBuffer.h"
 
 #include <cstdint>
 #include <limits>
@@ -29,6 +31,168 @@ using namespace plugin;
 
 GenericPluginTy *Plugin::SpecificPlugin = nullptr;
 
+// Implements kernel record and replay support for the plugin. alloc() carves
+// allocations out of a single pre-allocated device memory region, so the used
+// part of that region can be dumped to a file as one contiguous snapshot.
+// TODO: Fix any thread safety issues for multi-threaded kernel recording.
+struct RecordReplayTy {
+private:
+  // Memory pointers for recording, replaying memory.
+  // Start of the pre-allocated device memory region.
+  void *MemoryStart = nullptr;
+  // Next unused address inside the pre-allocated region.
+  void *MemoryPtr = nullptr;
+  // Number of bytes handed out by alloc() so far; this is also the size of
+  // the memory snapshots written by dumpDeviceMemory().
+  size_t MemorySize = 0;
+  // Capacity in bytes of the pre-allocated region.
+  size_t MemoryCapacity = 0;
+  // Device used to allocate, retrieve and free the record/replay memory.
+  GenericDeviceTy *Device = nullptr;
+  // Serializes the bump allocation performed in alloc().
+  std::mutex AllocationLock;
+
+  // Environment variables for record and replay.
+  // Enables recording kernels if set.
+  BoolEnvar OMPX_RecordKernel;
+  // Enables replaying a kernel if set.
+  BoolEnvar OMPX_ReplayKernel;
+  // Enables saving the device memory kernel output post execution if set.
+  BoolEnvar OMPX_ReplaySaveOutput;
+  // Sets the maximum to pre-allocate device memory (in GB).
+  UInt32Envar OMPX_DeviceMemorySize;
+
+  // Record/replay pre-allocates the largest possible device memory using the
+  // default kind.
+  // TODO: Expand allocation to include other kinds (device, host, shared) and
+  // possibly use a MemoryManager to track (de-)allocations for
+  // storing/retrieving when recording/replaying.
+  Error preallocateDeviceMemory() {
+    // Pre-allocate memory on device. Starts with the size requested through
+    // LIBOMPTARGET_RR_DEVMEM_SIZE (in GB, 64 by default) and subtracts in
+    // steps of 1GB until allocation succeeds.
+    constexpr size_t STEP = 1024 * 1024 * 1024ULL;
+    // Widen before multiplying: the envar holds a 32-bit value, and doing the
+    // first multiplications in 32 bits wraps for sizes of 4096GB and above.
+    const uint64_t RequestedGB = OMPX_DeviceMemorySize;
+    const size_t MAX_MEMORY_ALLOCATION = RequestedGB * STEP;
+    MemoryStart = nullptr;
+    MemoryCapacity = 0;
+    for (size_t Try = MAX_MEMORY_ALLOCATION; Try > 0; Try -= STEP) {
+      MemoryStart =
+          Device->allocate(Try, /* HstPtr */ nullptr, TARGET_ALLOC_DEFAULT);
+      if (MemoryStart) {
+        // Remember how much was actually obtained so alloc() can refuse to
+        // hand out addresses past the end of the region.
+        MemoryCapacity = Try;
+        break;
+      }
+    }
+
+    if (!MemoryStart)
+      return Plugin::error("Allocating record/replay memory");
+
+    MemoryPtr = MemoryStart;
+    MemorySize = 0;
+
+    return Plugin::success();
+  }
+
+  // Copies the used part of the pre-allocated region (MemorySize bytes
+  // starting at MemoryStart) from the device into a host buffer and writes it
+  // to Filename. Any failure is fatal.
+  // NOTE(review): the retrieve is issued through AsyncInfoWrapper; if that
+  // makes the copy asynchronous, the buffer may be written to the file before
+  // the data has arrived -- confirm.
+  void dumpDeviceMemory(StringRef Filename,
+                        AsyncInfoWrapperTy &AsyncInfoWrapper) {
+    // getNewUninitMemBuffer returns a plain (nullable) unique_ptr. Wrapping it
+    // in an ErrorOr would always look successful, so test the pointer itself.
+    std::unique_ptr<WritableMemoryBuffer> DeviceMemoryMB =
+        WritableMemoryBuffer::getNewUninitMemBuffer(MemorySize);
+    if (!DeviceMemoryMB)
+      report_fatal_error("Error creating MemoryBuffer for device memory");
+
+    auto Err = Device->dataRetrieve(DeviceMemoryMB->getBufferStart(),
+                                    MemoryStart, MemorySize, AsyncInfoWrapper);
+    if (Err)
+      report_fatal_error("Error retrieving data for target pointer");
+
+    StringRef DeviceMemory(DeviceMemoryMB->getBufferStart(), MemorySize);
+    std::error_code EC;
+    raw_fd_ostream OS(Filename, EC);
+    if (EC)
+      report_fatal_error("Error dumping memory to file " + Filename + " :" +
+                         EC.message());
+    OS << DeviceMemory;
+    OS.close();
+  }
+
+public:
+  bool isRecording() const { return OMPX_RecordKernel; }
+  bool isReplaying() const { return OMPX_ReplayKernel; }
+  bool isRecordingOrReplaying() const {
+    return (OMPX_RecordKernel || OMPX_ReplayKernel);
+  }
+  bool isSaveOutputEnabled() const { return OMPX_ReplaySaveOutput; }
+
+  RecordReplayTy()
+      : OMPX_RecordKernel("LIBOMPTARGET_RECORD"),
+        OMPX_ReplayKernel("LIBOMPTARGET_REPLAY"),
+        OMPX_ReplaySaveOutput("LIBOMPTARGET_RR_SAVE_OUTPUT"),
+        OMPX_DeviceMemorySize("LIBOMPTARGET_RR_DEVMEM_SIZE",
+                              /* Default in GB */ 64) {}
+
+  // Writes the device image binary to "<Name>.image". Failure to open the
+  // file is fatal.
+  void saveImage(const char *Name, DeviceImageTy &Image) {
+    // Materialize the file name right away; a Twine must not be kept in a
+    // local variable since it only refers to its operands.
+    const std::string ImageName = (Twine(Name) + ".image").str();
+    std::error_code EC;
+    raw_fd_ostream OS(ImageName, EC);
+    if (EC)
+      report_fatal_error("Error saving image : " + StringRef(EC.message()));
+    OS << Image.getMemoryBuffer().getBuffer();
+    OS.close();
+  }
+
+  // Records the launch of kernel Name: dumps the device memory snapshot to
+  // "<Name>.memory" and writes the kernel description (name, launch
+  // configuration, argument pointers and offsets, snapshot size, device id)
+  // to "<Name>.json". Any failure is fatal.
+  void saveKernelInputInfo(const char *Name, void **ArgPtrs,
+                           ptrdiff_t *ArgOffsets, int32_t NumArgs,
+                           uint64_t NumTeamsClause, uint32_t ThreadLimitClause,
+                           uint64_t LoopTripCount,
+                           AsyncInfoWrapperTy &AsyncInfoWrapper) {
+    json::Object JsonKernelInfo;
+    JsonKernelInfo["Name"] = Name;
+    JsonKernelInfo["NumArgs"] = NumArgs;
+    JsonKernelInfo["NumTeamsClause"] = NumTeamsClause;
+    JsonKernelInfo["ThreadLimitClause"] = ThreadLimitClause;
+    JsonKernelInfo["LoopTripCount"] = LoopTripCount;
+    JsonKernelInfo["DeviceMemorySize"] = MemorySize;
+    JsonKernelInfo["DeviceId"] = Device->getDeviceId();
+
+    // Argument pointers are stored as integers and offsets as-is.
+    json::Array JsonArgPtrs;
+    json::Array JsonArgOffsets;
+    for (int32_t I = 0; I < NumArgs; ++I) {
+      JsonArgPtrs.push_back(reinterpret_cast<intptr_t>(ArgPtrs[I]));
+      JsonArgOffsets.push_back(ArgOffsets[I]);
+    }
+    JsonKernelInfo["ArgPtrs"] = json::Value(std::move(JsonArgPtrs));
+    JsonKernelInfo["ArgOffsets"] = json::Value(std::move(JsonArgOffsets));
+
+    const std::string MemoryFilename = (Twine(Name) + ".memory").str();
+    dumpDeviceMemory(MemoryFilename, AsyncInfoWrapper);
+
+    const std::string JsonFilename = (Twine(Name) + ".json").str();
+    std::error_code EC;
+    raw_fd_ostream JsonOS(JsonFilename, EC);
+    if (EC)
+      report_fatal_error("Error saving kernel json file : " +
+                         StringRef(EC.message()));
+    JsonOS << json::Value(std::move(JsonKernelInfo));
+    JsonOS.close();
+  }
+
+  // Dumps the device memory snapshot to "<Name>.original.output" when
+  // recording and to "<Name>.replay.output" otherwise.
+  void saveKernelOutputInfo(const char *Name,
+                            AsyncInfoWrapperTy &AsyncInfoWrapper) {
+    const char *Suffix = isRecording() ? ".original.output" : ".replay.output";
+    const std::string OutputFilename = (Twine(Name) + Suffix).str();
+    dumpDeviceMemory(OutputFilename, AsyncInfoWrapper);
+  }
+
+  // Returns the next Size bytes (rounded up to a multiple of 16) of the
+  // pre-allocated region. Running out of pre-allocated memory is fatal since
+  // handing out addresses past the region would corrupt device memory.
+  void *alloc(uint64_t Size) {
+    assert(MemoryStart && "Expected memory has been pre-allocated");
+    constexpr uint64_t ALIGN = 16;
+    // Assumes alignment is a power of 2.
+    const uint64_t AlignedSize = (Size + (ALIGN - 1)) & ~(ALIGN - 1);
+    std::lock_guard<std::mutex> LG(AllocationLock);
+    // AlignedSize < Size catches wrap-around of the rounding above.
+    if (AlignedSize < Size || AlignedSize > MemoryCapacity - MemorySize)
+      report_fatal_error("Record/replay pre-allocated device memory exhausted");
+    void *Alloc = MemoryPtr;
+    MemoryPtr = (char *)MemoryPtr + AlignedSize;
+    MemorySize += AlignedSize;
+    return Alloc;
+  }
+
+  // Binds record/replay to Device and pre-allocates its memory region.
+  Error init(GenericDeviceTy *Device) {
+    this->Device = Device;
+    return preallocateDeviceMemory();
+  }
+
+  // Releases the pre-allocated memory region.
+  void deinit() { Device->free(MemoryStart); }
+
+} RecordReplay;
+
 AsyncInfoWrapperTy::~AsyncInfoWrapperTy() {
   // If we used a local async info object we want synchronous behavior.
   // In that case, and assuming the current status code is OK, we will
@@ -45,6 +209,9 @@ Error GenericKernelTy::init(GenericDeviceTy &GenericDevice,
 
   DynamicMemorySize = GenericDevice.getDynamicMemorySize();
 
+  if (RecordReplay.isRecording())
+    RecordReplay.saveImage(Name, Image);
+
   return initImpl(GenericDevice, Image);
 }
 
@@ -197,6 +364,10 @@ Error GenericDeviceTy::init(GenericPluginTy &Plugin) {
   if (EnableMM)
     MemoryManager = new MemoryManagerTy(*this, ThresholdMM);
 
+  if (RecordReplay.isRecordingOrReplaying())
+    if (auto Err = RecordReplay.init(this))
+      return Err;
+
   return Plugin::success();
 }
 
@@ -207,6 +378,9 @@ Error GenericDeviceTy::deinit() {
     delete MemoryManager;
   MemoryManager = nullptr;
 
+  if (RecordReplay.isRecordingOrReplaying())
+    RecordReplay.deinit();
+
   return deinitImpl();
 }
 
@@ -437,6 +611,9 @@ Expected<void *> GenericDeviceTy::dataAlloc(int64_t Size, void *HostPtr,
                                             TargetAllocTy Kind) {
   void *Alloc = nullptr;
 
+  if (RecordReplay.isRecordingOrReplaying())
+    return RecordReplay.alloc(Size);
+
   switch (Kind) {
   case TARGET_ALLOC_DEFAULT:
   case TARGET_ALLOC_DEVICE:
@@ -469,6 +646,10 @@ Expected<void *> GenericDeviceTy::dataAlloc(int64_t Size, void *HostPtr,
 }
 
 Error GenericDeviceTy::dataDelete(void *TgtPtr, TargetAllocTy Kind) {
+  // Free is a noop when recording or replaying.
+  if (RecordReplay.isRecordingOrReplaying())
+    return Plugin::success();
+
   int Res;
   if (MemoryManager)
     Res = MemoryManager->free(TgtPtr);
@@ -521,9 +702,20 @@ Error GenericDeviceTy::runTargetTeamRegion(
   GenericKernelTy &GenericKernel =
       *reinterpret_cast<GenericKernelTy *>(EntryPtr);
 
+  if (RecordReplay.isRecording())
+    RecordReplay.saveKernelInputInfo(
+        GenericKernel.getName(), ArgPtrs, ArgOffsets, NumArgs, NumTeamsClause,
+        ThreadLimitClause, LoopTripCount, AsyncInfoWrapper);
+
   Err =
       GenericKernel.launch(*this, ArgPtrs, ArgOffsets, NumArgs, NumTeamsClause,
                            ThreadLimitClause, LoopTripCount, AsyncInfoWrapper);
+
+  if (RecordReplay.isRecordingOrReplaying() &&
+      RecordReplay.isSaveOutputEnabled())
+    RecordReplay.saveKernelOutputInfo(GenericKernel.getName(),
+                                      AsyncInfoWrapper);
+
   return Err;
 }
 
index 5fa013d..42682ab 100644 (file)
@@ -27,6 +27,7 @@ VERS1.0 {
     __tgt_target_kernel;
     __tgt_target_kernel_nowait;
     __tgt_target_nowait_query;
+    __tgt_target_kernel_replay;
     __tgt_mapper_num_components;
     __tgt_push_mapper_component;
     __kmpc_push_target_tripcount;
index bee3d5b..3871e68 100644 (file)
@@ -265,6 +265,48 @@ EXTERN int __tgt_target_kernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
                                    HostPtr, Args);
 }
 
+/// Target kernel entry point that replays a pre-recorded kernel.
+/// \param Loc Source location associated with this target region.
+/// \param DeviceId The device identifier to execute the target region.
+/// \param HostPtr A pointer to an address that uniquely identifies the kernel.
+/// \param DeviceMemory A pointer to the recorded device memory data to move
+///                     to the device prior to kernel execution.
+/// \param DeviceMemorySize The size of the above device memory data in bytes.
+/// \param TgtArgs An array of the pre-recorded target kernel argument
+///                pointers.
+/// \param TgtOffsets An array of the pre-recorded target kernel argument
+///                   offsets.
+/// \param NumArgs The number of kernel arguments.
+/// \param NumTeams Number of teams to launch the target region with.
+/// \param ThreadLimit Limit to the number of threads to use in kernel
+///                    execution.
+/// \param LoopTripCount The pre-recorded value of the loop tripcount, if any.
+/// \return OMP_TGT_FAIL if the device cannot be used for offloading,
+///         OMP_TGT_SUCCESS otherwise; a failing replay is reported through
+///         handleTargetOutcome and the assertion below.
+EXTERN int __tgt_target_kernel_replay(ident_t *Loc, int64_t DeviceId,
+                                      void *HostPtr, void *DeviceMemory,
+                                      int64_t DeviceMemorySize, void **TgtArgs,
+                                      ptrdiff_t *TgtOffsets, int32_t NumArgs,
+                                      int32_t NumTeams, int32_t ThreadLimit,
+                                      uint64_t LoopTripCount) {
+  const bool CannotOffload = checkDeviceAndCtors(DeviceId, Loc);
+  if (CannotOffload) {
+    DP("Not offloading to device %" PRId64 "\n", DeviceId);
+    return OMP_TGT_FAIL;
+  }
+
+  DeviceTy &Device = *PM->Devices[DeviceId];
+  AsyncInfoTy AsyncInfo(Device);
+
+  int Status = target_replay(Loc, Device, HostPtr, DeviceMemory,
+                             DeviceMemorySize, TgtArgs, TgtOffsets, NumArgs,
+                             NumTeams, ThreadLimit, LoopTripCount, AsyncInfo);
+  // Synchronize only when the replay itself was issued successfully.
+  if (Status == OFFLOAD_SUCCESS)
+    Status = AsyncInfo.synchronize();
+
+  const bool Succeeded = (Status == OFFLOAD_SUCCESS);
+  handleTargetOutcome(Succeeded, Loc);
+  assert(Succeeded && "__tgt_target_kernel_replay unexpected failure!");
+  return OMP_TGT_SUCCESS;
+}
+
 EXTERN int __tgt_target_kernel_nowait(
     ident_t *Loc, int64_t DeviceId, int32_t NumTeams, int32_t ThreadLimit,
     void *HostPtr, __tgt_kernel_arguments *Args, int32_t DepNum, void *DepList,
index 3476e2d..27eca02 100644 (file)
@@ -1714,3 +1714,53 @@ int target(ident_t *Loc, DeviceTy &Device, void *HostPtr, int32_t ArgNum,
 
   return OFFLOAD_SUCCESS;
 }
+
+/// Executes a kernel using pre-recorded information for loading to
+/// device memory to launch the target kernel with the pre-recorded
+/// configuration.
+/// \param Loc Source location (not used by this function).
+/// \param Device The device to replay the kernel on.
+/// \param HostPtr Host address used to look up the kernel in the table map.
+/// \param DeviceMemory Host buffer holding the recorded device memory.
+/// \param DeviceMemorySize Size in bytes of \p DeviceMemory.
+/// \param TgtArgs The recorded kernel argument pointers.
+/// \param TgtOffsets The recorded kernel argument offsets.
+/// \param NumArgs The number of kernel arguments.
+/// \param NumTeams, ThreadLimit, LoopTripCount The launch configuration.
+/// \param AsyncInfo Async info used for the data submission and the launch.
+/// \return OFFLOAD_SUCCESS on success, OFFLOAD_FAIL if the kernel cannot be
+///         found, the device memory cannot be allocated or populated, or the
+///         launch fails.
+int target_replay(ident_t *Loc, DeviceTy &Device, void *HostPtr,
+                  void *DeviceMemory, int64_t DeviceMemorySize, void **TgtArgs,
+                  ptrdiff_t *TgtOffsets, int32_t NumArgs, int32_t NumTeams,
+                  int32_t ThreadLimit, uint64_t LoopTripCount,
+                  AsyncInfoTy &AsyncInfo) {
+  int32_t DeviceId = Device.DeviceID;
+  TableMap *TM = getTableMap(HostPtr);
+  // Fail if the table map fails to find the target kernel pointer for the
+  // provided host pointer.
+  if (!TM) {
+    REPORT("Host ptr " DPxMOD " does not have a matching target pointer.\n",
+           DPxPTR(HostPtr));
+    return OFFLOAD_FAIL;
+  }
+
+  // Retrieve the target table of offloading entries.
+  __tgt_target_table *TargetTable = nullptr;
+  {
+    std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx);
+    assert(TM->Table->TargetsTable.size() > (size_t)DeviceId &&
+           "Not expecting a device ID outside the table's bounds!");
+    TargetTable = TM->Table->TargetsTable[DeviceId];
+  }
+  assert(TargetTable && "Global data has not been mapped\n");
+
+  // Retrieve the target kernel pointer, allocate and store the recorded device
+  // memory data, and launch device execution.
+  void *TgtEntryPtr = TargetTable->EntriesBegin[TM->Index].addr;
+  DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n",
+     TargetTable->EntriesBegin[TM->Index].name, DPxPTR(TgtEntryPtr), TM->Index);
+
+  // NOTE(review): TgtPtr is not released by this function -- confirm the
+  // memory is reclaimed elsewhere (e.g., when the device is deinitialized).
+  void *TgtPtr = Device.allocData(DeviceMemorySize, /* HstPtr */ nullptr,
+                                  TARGET_ALLOC_DEFAULT);
+  if (!TgtPtr) {
+    REPORT("Failed to allocate device memory for the recorded data.\n");
+    return OFFLOAD_FAIL;
+  }
+
+  // Launching the kernel on memory that was not populated would silently
+  // produce garbage, so do not ignore a failing submission.
+  if (Device.submitData(TgtPtr, DeviceMemory, DeviceMemorySize, AsyncInfo) !=
+      OFFLOAD_SUCCESS) {
+    REPORT("Failed to copy the recorded data to device memory.\n");
+    return OFFLOAD_FAIL;
+  }
+
+  int Ret =
+      Device.runTeamRegion(TgtEntryPtr, TgtArgs, TgtOffsets, NumArgs, NumTeams,
+                           ThreadLimit, LoopTripCount, AsyncInfo);
+
+  if (Ret != OFFLOAD_SUCCESS) {
+    REPORT("Executing target region abort target.\n");
+    return OFFLOAD_FAIL;
+  }
+
+  return OFFLOAD_SUCCESS;
+}
index 6fc47f8..521f39f 100644 (file)
@@ -45,6 +45,12 @@ extern int target(ident_t *Loc, DeviceTy &Device, void *HostPtr, int32_t ArgNum,
                   uint64_t Tripcount, int IsTeamConstruct,
                   AsyncInfoTy &AsyncInfo);
 
+/// Replays a recorded kernel on \p Device: loads the recorded device memory
+/// (\p DeviceMemory, \p DeviceMemorySize bytes) onto the device and launches
+/// the kernel identified by \p HostPtr with the recorded arguments and the
+/// given launch configuration. Returns OFFLOAD_SUCCESS or OFFLOAD_FAIL.
+extern int target_replay(ident_t *Loc, DeviceTy &Device, void *HostPtr,
+                         void *DeviceMemory, int64_t DeviceMemorySize,
+                         void **TgtArgs, ptrdiff_t *TgtOffsets, int32_t NumArgs,
+                         int32_t NumTeams, int32_t ThreadLimit,
+                         uint64_t LoopTripCount, AsyncInfoTy &AsyncInfo);
+
 extern void handleTargetOutcome(bool Success, ident_t *Loc);
 extern bool checkDeviceAndCtors(int64_t &DeviceID, ident_t *Loc);
 extern void *targetAllocExplicit(size_t Size, int DeviceNum, int Kind,
index 9237035..a850647 100644 (file)
@@ -25,3 +25,4 @@ macro(add_openmp_tool_symlink name)
 endmacro()
 
 add_subdirectory(deviceinfo)
+add_subdirectory(kernelreplay)
diff --git a/openmp/libomptarget/tools/kernelreplay/CMakeLists.txt b/openmp/libomptarget/tools/kernelreplay/CMakeLists.txt
new file mode 100644 (file)
index 0000000..6f3dc33
--- /dev/null
@@ -0,0 +1,26 @@
+##===----------------------------------------------------------------------===##
+#
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+##===----------------------------------------------------------------------===##
+#
+# Build llvm-omp-kernel-replay tool
+#
+##===----------------------------------------------------------------------===##
+
+libomptarget_say("Building the llvm-omp-kernel-replay tool")
+
+add_openmp_tool(llvm-omp-kernel-replay llvm-omp-kernel-replay.cpp)
+
+llvm_update_compile_flags(llvm-omp-kernel-replay)
+
+# Add the libomptarget include directory to the tool's header search path.
+target_include_directories(llvm-omp-kernel-replay PRIVATE
+  ${LIBOMPTARGET_INCLUDE_DIR}
+)
+# The tool uses LLVM's command-line, JSON and MemoryBuffer utilities and calls
+# the __tgt_* entry points exported by libomptarget.
+target_link_libraries(llvm-omp-kernel-replay PRIVATE
+  LLVMSupport
+  omp
+  omptarget
+)
diff --git a/openmp/libomptarget/tools/kernelreplay/llvm-omp-kernel-replay.cpp b/openmp/libomptarget/tools/kernelreplay/llvm-omp-kernel-replay.cpp
new file mode 100644 (file)
index 0000000..1348e0f
--- /dev/null
@@ -0,0 +1,179 @@
+//===- llvm-omp-kernel-replay.cpp - Replay OpenMP offload kernel ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a command line utility to replay the execution of recorded OpenMP
+// offload kernels.
+//
+//===----------------------------------------------------------------------===//
+
+#include "omptargetplugin.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/JSON.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include <cstdlib>
+
+using namespace llvm;
+
+// Category grouping the tool's own options; main() hides every option that is
+// not part of it.
+cl::OptionCategory ReplayOptions("llvm-omp-kernel-replay Options");
+
+// InputFilename - The filename to read the json description of the kernel.
+static cl::opt<std::string> InputFilename(cl::Positional,
+                                          cl::desc("<input kernel json file>"),
+                                          cl::Required);
+
+// VerifyOpt - Compare the replayed device memory output against the output
+// saved for the original execution. Also enables saving the replay output.
+static cl::opt<bool> VerifyOpt(
+    "verify",
+    cl::desc(
+        "Verify device memory post execution against the original output."),
+    cl::init(false), cl::cat(ReplayOptions));
+
+// SaveOutputOpt - Save the device memory output of the replayed execution.
+static cl::opt<bool> SaveOutputOpt(
+    "save-output",
+    cl::desc("Save the device memory output of the replayed kernel execution."),
+    cl::init(false), cl::cat(ReplayOptions));
+
+// NumTeamsOpt - Overrides the recorded number of teams; 0 (the default) keeps
+// the recorded value.
+static cl::opt<unsigned> NumTeamsOpt("num-teams",
+                                     cl::desc("Set the number of teams."),
+                                     cl::init(0), cl::cat(ReplayOptions));
+
+// NumThreadsOpt - Overrides the recorded thread limit; 0 (the default) keeps
+// the recorded value.
+static cl::opt<unsigned> NumThreadsOpt("num-threads",
+                                       cl::desc("Set the number of threads."),
+                                       cl::init(0), cl::cat(ReplayOptions));
+
+// DeviceIdOpt - Overrides the recorded device id; a negative value (the
+// default is -1) keeps the recorded value.
+static cl::opt<int32_t> DeviceIdOpt("device-id", cl::desc("Set the device id."),
+                                    cl::init(-1), cl::cat(ReplayOptions));
+
+// Replays a recorded kernel. Reads the kernel description from the json file
+// given on the command line and the "<kernel>.image" / "<kernel>.memory"
+// files, registers the image with libomptarget, launches the kernel through
+// __tgt_target_kernel_replay and, with --verify, compares
+// "<kernel>.replay.output" against "<kernel>.original.output".
+// Malformed or missing inputs are reported through report_fatal_error.
+int main(int argc, char **argv) {
+  cl::HideUnrelatedOptions(ReplayOptions);
+  cl::ParseCommandLineOptions(argc, argv, "llvm-omp-kernel-replay\n");
+
+  ErrorOr<std::unique_ptr<MemoryBuffer>> KernelInfoMB =
+      MemoryBuffer::getFile(InputFilename, /* isText */ true,
+                            /* RequiresNullTerminator */ true);
+  if (!KernelInfoMB)
+    report_fatal_error("Error reading the kernel info json file");
+  Expected<json::Value> JsonKernelInfo =
+      json::parse(KernelInfoMB.get()->getBuffer());
+  if (auto Err = JsonKernelInfo.takeError())
+    report_fatal_error("Cannot parse the kernel info json file");
+
+  // The top-level json value must be an object; getAsObject() returns null
+  // for anything else.
+  const json::Object *KernelInfo = JsonKernelInfo->getAsObject();
+  if (!KernelInfo)
+    report_fatal_error("The kernel info json file does not hold an object");
+
+  // Reads a mandatory integer field, failing with a clear message instead of
+  // accessing an empty optional.
+  auto GetRequiredInteger = [KernelInfo](StringRef Key) -> int64_t {
+    auto Field = KernelInfo->getInteger(Key);
+    if (!Field)
+      report_fatal_error("Missing or non-integer kernel info json field: " +
+                         Key);
+    return *Field;
+  };
+
+  // The recorded values are only needed when not overridden on the command
+  // line.
+  unsigned NumTeams = NumTeamsOpt;
+  if (NumTeams == 0)
+    NumTeams = GetRequiredInteger("NumTeamsClause");
+  unsigned NumThreads = NumThreadsOpt;
+  if (NumThreads == 0)
+    NumThreads = GetRequiredInteger("ThreadLimitClause");
+  // TODO: Print a warning if number of teams/threads is explicitly set in the
+  // kernel info but overriden through command line options.
+  const int64_t LoopTripCount = GetRequiredInteger("LoopTripCount");
+  auto KernelFunc = KernelInfo->getString("Name");
+  if (!KernelFunc)
+    report_fatal_error("Missing kernel info json field: Name");
+
+  SmallVector<void *> TgtArgs;
+  SmallVector<ptrdiff_t> TgtArgOffsets;
+  const int64_t NumArgs = GetRequiredInteger("NumArgs");
+  const json::Array *TgtArgsArray = KernelInfo->getArray("ArgPtrs");
+  if (!TgtArgsArray)
+    report_fatal_error("Missing kernel info json field: ArgPtrs");
+  for (const json::Value &Arg : *TgtArgsArray) {
+    auto ArgPtr = Arg.getAsInteger();
+    if (!ArgPtr)
+      report_fatal_error("Non-integer entry in kernel info json ArgPtrs");
+    TgtArgs.push_back(reinterpret_cast<void *>(*ArgPtr));
+  }
+  const json::Array *TgtArgOffsetsArray = KernelInfo->getArray("ArgOffsets");
+  if (!TgtArgOffsetsArray)
+    report_fatal_error("Missing kernel info json field: ArgOffsets");
+  for (const json::Value &Offset : *TgtArgOffsetsArray) {
+    auto ArgOffset = Offset.getAsInteger();
+    if (!ArgOffset)
+      report_fatal_error("Non-integer entry in kernel info json ArgOffsets");
+    // Integer-to-integer conversion: static_cast, not reinterpret_cast.
+    TgtArgOffsets.push_back(static_cast<ptrdiff_t>(*ArgOffset));
+  }
+  // NumArgs entries of both arrays are handed to the runtime below; make sure
+  // they are really there.
+  if (NumArgs < 0 || TgtArgs.size() != static_cast<size_t>(NumArgs) ||
+      TgtArgOffsets.size() != static_cast<size_t>(NumArgs))
+    report_fatal_error("NumArgs does not match the number of entries in "
+                       "ArgPtrs/ArgOffsets in the kernel info json file");
+
+  __tgt_offload_entry KernelEntry = {nullptr, nullptr, 0, 0, 0};
+  std::string KernelEntryName = KernelFunc->str();
+  KernelEntry.name = const_cast<char *>(KernelEntryName.c_str());
+  // Anything non-zero works to uniquely identify the kernel.
+  KernelEntry.addr = (void *)0x1;
+
+  ErrorOr<std::unique_ptr<MemoryBuffer>> ImageMB =
+      MemoryBuffer::getFile(KernelEntryName + ".image", /* isText */ false,
+                            /* RequiresNullTerminator */ false);
+  if (!ImageMB)
+    report_fatal_error("Error reading the kernel image.");
+
+  __tgt_device_image DeviceImage;
+  DeviceImage.ImageStart = const_cast<char *>(ImageMB.get()->getBufferStart());
+  DeviceImage.ImageEnd = const_cast<char *>(ImageMB.get()->getBufferEnd());
+  DeviceImage.EntriesBegin = &KernelEntry;
+  DeviceImage.EntriesEnd = &KernelEntry + 1;
+
+  __tgt_bin_desc Desc;
+  Desc.NumDeviceImages = 1;
+  Desc.HostEntriesBegin = &KernelEntry;
+  Desc.HostEntriesEnd = &KernelEntry + 1;
+  Desc.DeviceImages = &DeviceImage;
+
+  ErrorOr<std::unique_ptr<MemoryBuffer>> DeviceMemoryMB =
+      MemoryBuffer::getFile(KernelEntryName + ".memory", /* isText */ false,
+                            /* RequiresNullTerminator */ false);
+  if (!DeviceMemoryMB)
+    report_fatal_error("Error reading the kernel input device memory.");
+
+  setenv("LIBOMPTARGET_REPLAY", "1", 1);
+  if (VerifyOpt || SaveOutputOpt)
+    setenv("LIBOMPTARGET_RR_SAVE_OUTPUT", "1", 1);
+
+  // Set device memory size to the ceiling of GB granularity, using integer
+  // arithmetic. Request at least 1GB: a size of 0 would leave the runtime
+  // with nothing to pre-allocate.
+  constexpr uint64_t GB = 1024ULL * 1024ULL * 1024ULL;
+  const uint64_t RecordedMemorySize =
+      static_cast<uint64_t>(GetRequiredInteger("DeviceMemorySize"));
+  uint64_t DeviceMemorySize = (RecordedMemorySize + GB - 1) / GB;
+  if (DeviceMemorySize == 0)
+    DeviceMemorySize = 1;
+
+  setenv("LIBOMPTARGET_RR_DEVMEM_SIZE",
+         std::to_string(DeviceMemorySize).c_str(), 1);
+
+  // TODO: Print warning if the user overrides the device id in the json file.
+  int32_t DeviceId = DeviceIdOpt;
+  if (DeviceId < 0)
+    DeviceId = GetRequiredInteger("DeviceId");
+
+  // TODO: do we need requires?
+  //__tgt_register_requires(/* Flags */1);
+
+  __tgt_init_all_rtls();
+
+  __tgt_register_lib(&Desc);
+
+  __tgt_target_kernel_replay(
+      /* Loc */ nullptr, DeviceId, KernelEntry.addr,
+      const_cast<char *>(DeviceMemoryMB.get()->getBuffer().data()),
+      DeviceMemoryMB.get()->getBufferSize(), TgtArgs.data(),
+      TgtArgOffsets.data(), static_cast<int32_t>(NumArgs), NumTeams, NumThreads,
+      LoopTripCount);
+
+  if (VerifyOpt) {
+    ErrorOr<std::unique_ptr<MemoryBuffer>> OriginalOutputMB =
+        MemoryBuffer::getFile(KernelEntryName + ".original.output",
+                              /* isText */ false,
+                              /* RequiresNullTerminator */ false);
+    if (!OriginalOutputMB)
+      report_fatal_error("Error reading the kernel original output file, make "
+                         "sure LIBOMPTARGET_RR_SAVE_OUTPUT is set when "
+                         "recording");
+    ErrorOr<std::unique_ptr<MemoryBuffer>> ReplayOutputMB =
+        MemoryBuffer::getFile(KernelEntryName + ".replay.output",
+                              /* isText */ false,
+                              /* RequiresNullTerminator */ false);
+    if (!ReplayOutputMB)
+      report_fatal_error("Error reading the kernel replay output file");
+
+    StringRef OriginalOutput = OriginalOutputMB.get()->getBuffer();
+    StringRef ReplayOutput = ReplayOutputMB.get()->getBuffer();
+    if (OriginalOutput == ReplayOutput)
+      outs() << "[llvm-omp-kernel-replay] Replay device memory verified!\n";
+    else
+      outs() << "[llvm-omp-kernel-replay] Replay device memory failed to "
+                "verify!\n";
+  }
+  // TODO: calling unregister lib causes plugin deinit error for nextgen
+  // plugins.
+  //__tgt_unregister_lib(&Desc);
+
+  return 0;
+}