This patch adds functionality for recording and replaying the execution of OpenMP offload kernels, based on an original implementation by Steve Rangel. The patch extends libomptarget to extract a json description of the kernel, the device image binary, and a device memory snapshot before and after the execution of a recorded kernel. Kernel recording/replaying in libomptarget is controlled through env vars (LIBOMPTARGET_RECORD, LIBOMPTARGET_REPLAY). It provides a tool, llvm-omp-kernel-replay, for replaying a kernel using the extracted information with the ability to verify replayed execution using the post-execution device memory snapshot, also supporting changing the number of teams/threads for replaying.
Reviewed By: jdoerfert
Differential Revision: https://reviews.llvm.org/D138931
// data.
void __tgt_target_nowait_query(void **AsyncHandle);
+/// Replays a recorded kernel: uploads \p DeviceMemory (\p DeviceMemorySize
+/// bytes) to device \p DeviceId, then launches the kernel keyed by \p HostPtr.
+int __tgt_target_kernel_replay(ident_t *Loc, int64_t DeviceId, void *HostPtr,
+ void *DeviceMemory, int64_t DeviceMemorySize,
+ void **TgtArgs, ptrdiff_t *TgtOffsets,
+ int32_t NumArgs, int32_t NumTeams,
+ int32_t ThreadLimit, uint64_t LoopTripCount);
+
void __tgt_set_info_flag(uint32_t);
int __tgt_print_device_info(int64_t DeviceId);
#include "llvm/Frontend/OpenMP/OMPConstants.h"
#include "llvm/Support/Error.h"
+#include "llvm/Support/JSON.h"
+#include "llvm/Support/MemoryBuffer.h"
#include <cstdint>
#include <limits>
GenericPluginTy *Plugin::SpecificPlugin = nullptr;
+// Support for recording the execution of a kernel and replaying it later.
+// While recording or replaying, device allocations are served from a single
+// pre-allocated device region so that the region can be dumped to a file as
+// one contiguous snapshot.
+// TODO: Fix any thread safety issues for multi-threaded kernel recording.
+struct RecordReplayTy {
+private:
+  // Start of the pre-allocated device memory region.
+  void *MemoryStart = nullptr;
+  // First unused byte of the region; advanced by alloc().
+  void *MemoryPtr = nullptr;
+  // Number of bytes of the region handed out so far.
+  size_t MemorySize = 0;
+  // Number of bytes that were actually pre-allocated at MemoryStart.
+  size_t TotalSize = 0;
+  GenericDeviceTy *Device = nullptr;
+  std::mutex AllocationLock;
+
+  // Environment variables for record and replay.
+  // Enables recording kernels if set.
+  BoolEnvar OMPX_RecordKernel;
+  // Enables replaying a kernel if set.
+  BoolEnvar OMPX_ReplayKernel;
+  // Enables saving the device memory kernel output post execution if set.
+  BoolEnvar OMPX_ReplaySaveOutput;
+  // Sets the maximum to pre-allocate device memory, in GB.
+  UInt32Envar OMPX_DeviceMemorySize;
+
+  // Record/replay pre-allocates the largest possible device memory using the
+  // default kind.
+  // TODO: Expand allocation to include other kinds (device, host, shared) and
+  // possibly use a MemoryManager to track (de-)allocations for
+  // storing/retrieving when recording/replaying.
+  Error preallocateDeviceMemory() {
+    // Pre-allocate memory on device. Starts with the configured maximum
+    // (64GB by default) and subtracts in steps of 1GB until allocation
+    // succeeds.
+    constexpr size_t STEP = 1024 * 1024 * 1024ULL;
+    // Widen the GB count before multiplying so that large values do not wrap
+    // around in 32-bit arithmetic.
+    const size_t MAX_MEMORY_ALLOCATION =
+        static_cast<size_t>(OMPX_DeviceMemorySize) * STEP;
+    MemoryStart = nullptr;
+    TotalSize = 0;
+    for (size_t Try = MAX_MEMORY_ALLOCATION; Try > 0; Try -= STEP) {
+      MemoryStart =
+          Device->allocate(Try, /* HstPtr */ nullptr, TARGET_ALLOC_DEFAULT);
+      if (MemoryStart) {
+        // Remember the capacity so that alloc() can refuse to overrun it.
+        TotalSize = Try;
+        break;
+      }
+    }
+
+    if (!MemoryStart)
+      return Plugin::error("Allocating record/replay memory");
+
+    MemoryPtr = MemoryStart;
+    MemorySize = 0;
+
+    return Plugin::success();
+  }
+
+  // Copies the used part of the pre-allocated region (MemorySize bytes from
+  // MemoryStart) to the host and writes it to Filename. Any failure is fatal.
+  void dumpDeviceMemory(StringRef Filename,
+                        AsyncInfoWrapperTy &AsyncInfoWrapper) {
+    // getNewUninitMemBuffer returns a null pointer on failure; it does not
+    // return an ErrorOr, so the pointer itself must be tested.
+    std::unique_ptr<WritableMemoryBuffer> DeviceMemoryMB =
+        WritableMemoryBuffer::getNewUninitMemBuffer(MemorySize);
+    if (!DeviceMemoryMB)
+      report_fatal_error("Error creating MemoryBuffer for device memory");
+
+    // NOTE(review): the retrieval is issued through AsyncInfoWrapper; confirm
+    // that the copy has completed before the buffer is written below.
+    auto Err = Device->dataRetrieve(DeviceMemoryMB->getBufferStart(),
+                                    MemoryStart, MemorySize, AsyncInfoWrapper);
+    if (Err)
+      report_fatal_error("Error retrieving data for target pointer");
+
+    StringRef DeviceMemory(DeviceMemoryMB->getBufferStart(), MemorySize);
+    std::error_code EC;
+    raw_fd_ostream OS(Filename, EC);
+    if (EC)
+      report_fatal_error("Error dumping memory to file " + Filename + " :" +
+                         EC.message());
+    OS << DeviceMemory;
+    OS.close();
+  }
+
+public:
+  bool isRecording() const { return OMPX_RecordKernel; }
+  bool isReplaying() const { return OMPX_ReplayKernel; }
+  bool isRecordingOrReplaying() const {
+    return (OMPX_RecordKernel || OMPX_ReplayKernel);
+  }
+  bool isSaveOutputEnabled() const { return OMPX_ReplaySaveOutput; }
+
+  RecordReplayTy()
+      : OMPX_RecordKernel("LIBOMPTARGET_RECORD"),
+        OMPX_ReplayKernel("LIBOMPTARGET_REPLAY"),
+        OMPX_ReplaySaveOutput("LIBOMPTARGET_RR_SAVE_OUTPUT"),
+        OMPX_DeviceMemorySize("LIBOMPTARGET_RR_DEVMEM_SIZE",
+                              /* Default in GB */ 64) {}
+
+  // Writes the device image to "<Name>.image". Any failure is fatal.
+  void saveImage(const char *Name, DeviceImageTy &Image) {
+    // Materialize the name right away; a Twine must not be kept in a local.
+    const std::string ImageName = (Twine(Name) + ".image").str();
+    std::error_code EC;
+    raw_fd_ostream OS(ImageName, EC);
+    if (EC)
+      report_fatal_error("Error saving image : " + StringRef(EC.message()));
+    OS << Image.getMemoryBuffer().getBuffer();
+    OS.close();
+  }
+
+  // Writes the pre-execution state of a kernel launch: the device memory
+  // snapshot goes to "<Name>.memory" and the launch description (arguments,
+  // offsets, teams/threads clauses, trip count, memory size, device id) goes
+  // to "<Name>.json". Any failure is fatal.
+  void saveKernelInputInfo(const char *Name, void **ArgPtrs,
+                           ptrdiff_t *ArgOffsets, int32_t NumArgs,
+                           uint64_t NumTeamsClause, uint32_t ThreadLimitClause,
+                           uint64_t LoopTripCount,
+                           AsyncInfoWrapperTy &AsyncInfoWrapper) {
+    json::Object JsonKernelInfo;
+    JsonKernelInfo["Name"] = Name;
+    JsonKernelInfo["NumArgs"] = NumArgs;
+    JsonKernelInfo["NumTeamsClause"] = NumTeamsClause;
+    JsonKernelInfo["ThreadLimitClause"] = ThreadLimitClause;
+    JsonKernelInfo["LoopTripCount"] = LoopTripCount;
+    JsonKernelInfo["DeviceMemorySize"] = MemorySize;
+    JsonKernelInfo["DeviceId"] = Device->getDeviceId();
+
+    // Argument pointers are stored as integers.
+    json::Array JsonArgPtrs;
+    for (int32_t I = 0; I < NumArgs; ++I)
+      JsonArgPtrs.push_back(reinterpret_cast<intptr_t>(ArgPtrs[I]));
+    JsonKernelInfo["ArgPtrs"] = json::Value(std::move(JsonArgPtrs));
+
+    json::Array JsonArgOffsets;
+    for (int32_t I = 0; I < NumArgs; ++I)
+      JsonArgOffsets.push_back(ArgOffsets[I]);
+    JsonKernelInfo["ArgOffsets"] = json::Value(std::move(JsonArgOffsets));
+
+    const std::string MemoryFilename = (Twine(Name) + ".memory").str();
+    dumpDeviceMemory(MemoryFilename, AsyncInfoWrapper);
+
+    const std::string JsonFilename = (Twine(Name) + ".json").str();
+    std::error_code EC;
+    raw_fd_ostream JsonOS(JsonFilename, EC);
+    if (EC)
+      report_fatal_error("Error saving kernel json file : " +
+                         StringRef(EC.message()));
+    JsonOS << json::Value(std::move(JsonKernelInfo));
+    JsonOS.close();
+  }
+
+  // Writes the post-execution device memory snapshot to
+  // "<Name>.original.output" when recording and to "<Name>.replay.output"
+  // otherwise.
+  void saveKernelOutputInfo(const char *Name,
+                            AsyncInfoWrapperTy &AsyncInfoWrapper) {
+    const std::string OutputFilename =
+        (Twine(Name) +
+         (isRecording() ? ".original.output" : ".replay.output"))
+            .str();
+    dumpDeviceMemory(OutputFilename, AsyncInfoWrapper);
+  }
+
+  // Hands out Size bytes (rounded up to a multiple of 16) from the
+  // pre-allocated region. Returns nullptr if the request does not fit in the
+  // remaining part of the region.
+  void *alloc(uint64_t Size) {
+    assert(MemoryStart && "Expected memory has been pre-allocated");
+    constexpr uint64_t ALIGN = 16;
+    static_assert((ALIGN & (ALIGN - 1)) == 0,
+                  "Alignment must be a power of 2");
+    const uint64_t AlignedSize = (Size + (ALIGN - 1)) & ~(ALIGN - 1);
+    // Rounding up wrapped around: the request cannot be satisfied.
+    if (AlignedSize < Size)
+      return nullptr;
+
+    std::lock_guard<std::mutex> LG(AllocationLock);
+    // MemorySize never exceeds TotalSize, so the subtraction cannot wrap.
+    if (AlignedSize > TotalSize - MemorySize)
+      return nullptr;
+
+    void *Alloc = MemoryPtr;
+    MemoryPtr = static_cast<char *>(MemoryPtr) + AlignedSize;
+    MemorySize += AlignedSize;
+    return Alloc;
+  }
+
+  // Binds this object to Device and pre-allocates the device memory region.
+  Error init(GenericDeviceTy *Device) {
+    this->Device = Device;
+    return preallocateDeviceMemory();
+  }
+
+  // Releases the pre-allocated device memory region.
+  void deinit() { Device->free(MemoryStart); }
+
+} RecordReplay;
+
AsyncInfoWrapperTy::~AsyncInfoWrapperTy() {
// If we used a local async info object we want synchronous behavior.
// In that case, and assuming the current status code is OK, we will
DynamicMemorySize = GenericDevice.getDynamicMemorySize();
+ if (RecordReplay.isRecording())
+ RecordReplay.saveImage(Name, Image);
+
return initImpl(GenericDevice, Image);
}
if (EnableMM)
MemoryManager = new MemoryManagerTy(*this, ThresholdMM);
+ if (RecordReplay.isRecordingOrReplaying())
+ if (auto Err = RecordReplay.init(this))
+ return Err;
+
return Plugin::success();
}
delete MemoryManager;
MemoryManager = nullptr;
+ if (RecordReplay.isRecordingOrReplaying())
+ RecordReplay.deinit();
+
return deinitImpl();
}
TargetAllocTy Kind) {
void *Alloc = nullptr;
+ if (RecordReplay.isRecordingOrReplaying())
+ return RecordReplay.alloc(Size);
+
switch (Kind) {
case TARGET_ALLOC_DEFAULT:
case TARGET_ALLOC_DEVICE:
}
Error GenericDeviceTy::dataDelete(void *TgtPtr, TargetAllocTy Kind) {
+ // Free is a noop when recording or replaying.
+ if (RecordReplay.isRecordingOrReplaying())
+ return Plugin::success();
+
int Res;
if (MemoryManager)
Res = MemoryManager->free(TgtPtr);
GenericKernelTy &GenericKernel =
*reinterpret_cast<GenericKernelTy *>(EntryPtr);
+ if (RecordReplay.isRecording())
+ RecordReplay.saveKernelInputInfo(
+ GenericKernel.getName(), ArgPtrs, ArgOffsets, NumArgs, NumTeamsClause,
+ ThreadLimitClause, LoopTripCount, AsyncInfoWrapper);
+
Err =
GenericKernel.launch(*this, ArgPtrs, ArgOffsets, NumArgs, NumTeamsClause,
ThreadLimitClause, LoopTripCount, AsyncInfoWrapper);
+
+ if (RecordReplay.isRecordingOrReplaying() &&
+ RecordReplay.isSaveOutputEnabled())
+ RecordReplay.saveKernelOutputInfo(GenericKernel.getName(),
+ AsyncInfoWrapper);
+
return Err;
}
__tgt_target_kernel;
__tgt_target_kernel_nowait;
__tgt_target_nowait_query;
+ __tgt_target_kernel_replay;
__tgt_mapper_num_components;
__tgt_push_mapper_component;
__kmpc_push_target_tripcount;
HostPtr, Args);
}
+/// Implements a target kernel entry that replays a pre-recorded kernel.
+/// \param Loc Source location associated with this target region (unused).
+/// \param DeviceId The device identifier to execute the target region.
+/// \param HostPtr A pointer to an address that uniquely identifies the kernel.
+/// \param DeviceMemory A pointer to an array storing device memory data to move
+/// prior to kernel execution.
+/// \param DeviceMemorySize The size of the above device memory data in bytes.
+/// \param TgtArgs An array of pointers of the pre-recorded target kernel
+/// arguments.
+/// \param TgtOffsets An array of the pre-recorded target kernel argument
+/// offsets.
+/// \param NumArgs The number of kernel arguments.
+/// \param NumTeams Number of teams to launch the target region with.
+/// \param ThreadLimit Limit to the number of threads to use in kernel
+/// execution.
+/// \param LoopTripCount The pre-recorded value of the loop tripcount, if any.
+/// \return OMP_TGT_SUCCESS on success, OMP_TGT_FAIL on failure.
+EXTERN int __tgt_target_kernel_replay(ident_t *Loc, int64_t DeviceId,
+                                      void *HostPtr, void *DeviceMemory,
+                                      int64_t DeviceMemorySize, void **TgtArgs,
+                                      ptrdiff_t *TgtOffsets, int32_t NumArgs,
+                                      int32_t NumTeams, int32_t ThreadLimit,
+                                      uint64_t LoopTripCount) {
+
+  if (checkDeviceAndCtors(DeviceId, Loc)) {
+    DP("Not offloading to device %" PRId64 "\n", DeviceId);
+    return OMP_TGT_FAIL;
+  }
+  DeviceTy &Device = *PM->Devices[DeviceId];
+
+  AsyncInfoTy AsyncInfo(Device);
+  int Rc = target_replay(Loc, Device, HostPtr, DeviceMemory, DeviceMemorySize,
+                         TgtArgs, TgtOffsets, NumArgs, NumTeams, ThreadLimit,
+                         LoopTripCount, AsyncInfo);
+  // Only wait for the queued work if everything was submitted successfully.
+  if (Rc == OFFLOAD_SUCCESS)
+    Rc = AsyncInfo.synchronize();
+  handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc);
+  assert(Rc == OFFLOAD_SUCCESS &&
+         "__tgt_target_kernel_replay unexpected failure!");
+  // The assert is compiled out in release builds, so report the real outcome
+  // instead of returning success unconditionally.
+  return Rc == OFFLOAD_SUCCESS ? OMP_TGT_SUCCESS : OMP_TGT_FAIL;
+}
+
EXTERN int __tgt_target_kernel_nowait(
ident_t *Loc, int64_t DeviceId, int32_t NumTeams, int32_t ThreadLimit,
void *HostPtr, __tgt_kernel_arguments *Args, int32_t DepNum, void *DepList,
return OFFLOAD_SUCCESS;
}
+
+/// Executes a kernel using pre-recorded information for loading to
+/// device memory to launch the target kernel with the pre-recorded
+/// configuration.
+///
+/// Looks up the target entry registered for \p HostPtr, allocates
+/// \p DeviceMemorySize bytes on \p Device, copies \p DeviceMemory into that
+/// allocation, and launches the entry with \p TgtArgs / \p TgtOffsets.
+/// \return OFFLOAD_SUCCESS if the kernel was launched, OFFLOAD_FAIL if the
+/// host pointer is unknown, the device memory could not be allocated or
+/// submitted, or the launch failed.
+int target_replay(ident_t *Loc, DeviceTy &Device, void *HostPtr,
+                  void *DeviceMemory, int64_t DeviceMemorySize, void **TgtArgs,
+                  ptrdiff_t *TgtOffsets, int32_t NumArgs, int32_t NumTeams,
+                  int32_t ThreadLimit, uint64_t LoopTripCount,
+                  AsyncInfoTy &AsyncInfo) {
+  int32_t DeviceId = Device.DeviceID;
+  TableMap *TM = getTableMap(HostPtr);
+  // Fail if the table map fails to find the target kernel pointer for the
+  // provided host pointer.
+  if (!TM) {
+    REPORT("Host ptr " DPxMOD " does not have a matching target pointer.\n",
+           DPxPTR(HostPtr));
+    return OFFLOAD_FAIL;
+  }
+
+  // Retrieve the target table of offloading entries.
+  __tgt_target_table *TargetTable = nullptr;
+  {
+    std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx);
+    assert(TM->Table->TargetsTable.size() > (size_t)DeviceId &&
+           "Not expecting a device ID outside the table's bounds!");
+    TargetTable = TM->Table->TargetsTable[DeviceId];
+  }
+  assert(TargetTable && "Global data has not been mapped\n");
+
+  // Retrieve the target kernel pointer, allocate and store the recorded device
+  // memory data, and launch device execution.
+  void *TgtEntryPtr = TargetTable->EntriesBegin[TM->Index].addr;
+  DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n",
+     TargetTable->EntriesBegin[TM->Index].name, DPxPTR(TgtEntryPtr), TM->Index);
+
+  void *TgtPtr = Device.allocData(DeviceMemorySize, /* HstPtr */ nullptr,
+                                  TARGET_ALLOC_DEFAULT);
+  // Launching without the recorded memory in place would run the kernel on
+  // garbage, so stop here if the allocation failed.
+  if (!TgtPtr) {
+    REPORT("Failed to allocate device memory for replaying.\n");
+    return OFFLOAD_FAIL;
+  }
+
+  if (Device.submitData(TgtPtr, DeviceMemory, DeviceMemorySize, AsyncInfo) !=
+      OFFLOAD_SUCCESS) {
+    REPORT("Failed to submit the recorded device memory for replaying.\n");
+    return OFFLOAD_FAIL;
+  }
+
+  int Ret =
+      Device.runTeamRegion(TgtEntryPtr, TgtArgs, TgtOffsets, NumArgs, NumTeams,
+                           ThreadLimit, LoopTripCount, AsyncInfo);
+
+  if (Ret != OFFLOAD_SUCCESS) {
+    REPORT("Executing target region abort target.\n");
+    return OFFLOAD_FAIL;
+  }
+
+  return OFFLOAD_SUCCESS;
+}
uint64_t Tripcount, int IsTeamConstruct,
AsyncInfoTy &AsyncInfo);
+extern int target_replay(ident_t *Loc, DeviceTy &Device, void *HostPtr,
+ void *DeviceMemory, int64_t DeviceMemorySize,
+ void **TgtArgs, ptrdiff_t *TgtOffsets, int32_t NumArgs,
+ int32_t NumTeams, int32_t ThreadLimit,
+ uint64_t LoopTripCount, AsyncInfoTy &AsyncInfo);
+
extern void handleTargetOutcome(bool Success, ident_t *Loc);
extern bool checkDeviceAndCtors(int64_t &DeviceID, ident_t *Loc);
extern void *targetAllocExplicit(size_t Size, int DeviceNum, int Kind,
endmacro()
add_subdirectory(deviceinfo)
+add_subdirectory(kernelreplay)
--- /dev/null
+##===----------------------------------------------------------------------===##
+#
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+##===----------------------------------------------------------------------===##
+# Build the llvm-omp-kernel-replay tool from llvm-omp-kernel-replay.cpp.
+# ${LIBOMPTARGET_INCLUDE_DIR} is added to its private include path and it is
+# linked against LLVMSupport, omp, and omptarget.
+##===----------------------------------------------------------------------===##
+
+libomptarget_say("Building the llvm-omp-kernel-replay tool")
+
+add_openmp_tool(llvm-omp-kernel-replay llvm-omp-kernel-replay.cpp)
+
+llvm_update_compile_flags(llvm-omp-kernel-replay)
+
+target_include_directories(llvm-omp-kernel-replay PRIVATE
+ ${LIBOMPTARGET_INCLUDE_DIR}
+)
+target_link_libraries(llvm-omp-kernel-replay PRIVATE
+ LLVMSupport
+ omp
+ omptarget
+)
--- /dev/null
+//===- llvm-omp-kernel-replay.cpp - Replay OpenMP offload kernel ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a command line utility to replay the execution of recorded OpenMP
+// offload kernels.
+//
+//===----------------------------------------------------------------------===//
+
+#include "omptargetplugin.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/JSON.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include <cstdlib>
+
+using namespace llvm;
+
+// Category grouping the tool-specific options; main() hides every option that
+// does not belong to it.
+cl::OptionCategory ReplayOptions("llvm-omp-kernel-replay Options");
+
+// Positional argument: the file to read the json description of the kernel
+// from.
+static cl::opt<std::string>
+    InputFilename(cl::Positional, cl::Required,
+                  cl::desc("<input kernel json file>"));
+
+// --verify: compare the replayed device memory with the recorded output.
+static cl::opt<bool>
+    VerifyOpt("verify", cl::cat(ReplayOptions), cl::init(false),
+              cl::desc("Verify device memory post execution against the "
+                       "original output."));
+
+// --save-output: keep the device memory snapshot taken after the replay.
+static cl::opt<bool>
+    SaveOutputOpt("save-output", cl::cat(ReplayOptions), cl::init(false),
+                  cl::desc("Save the device memory output of the replayed "
+                           "kernel execution."));
+
+// --num-teams: overrides the recorded number of teams when greater than 0.
+static cl::opt<unsigned> NumTeamsOpt("num-teams", cl::cat(ReplayOptions),
+                                     cl::init(0),
+                                     cl::desc("Set the number of teams."));
+
+// --num-threads: overrides the recorded thread limit when greater than 0.
+static cl::opt<unsigned> NumThreadsOpt("num-threads", cl::cat(ReplayOptions),
+                                       cl::init(0),
+                                       cl::desc("Set the number of threads."));
+
+// --device-id: overrides the recorded device id when greater than -1.
+static cl::opt<int32_t> DeviceIdOpt("device-id", cl::cat(ReplayOptions),
+                                    cl::init(-1),
+                                    cl::desc("Set the device id."));
+
+// Replays a recorded kernel. Reads the kernel description from the json file
+// given on the command line, loads "<kernel>.image" and "<kernel>.memory",
+// configures libomptarget for replaying through environment variables,
+// launches the kernel, and optionally compares "<kernel>.replay.output" with
+// "<kernel>.original.output". Malformed or missing input is a fatal error.
+int main(int argc, char **argv) {
+  cl::HideUnrelatedOptions(ReplayOptions);
+  cl::ParseCommandLineOptions(argc, argv, "llvm-omp-kernel-replay\n");
+
+  ErrorOr<std::unique_ptr<MemoryBuffer>> KernelInfoMB =
+      MemoryBuffer::getFile(InputFilename, /* isText */ true,
+                            /* RequiresNullTerminator */ true);
+  if (!KernelInfoMB)
+    report_fatal_error("Error reading the kernel info json file");
+  Expected<json::Value> JsonKernelInfo =
+      json::parse(KernelInfoMB.get()->getBuffer());
+  if (auto Err = JsonKernelInfo.takeError())
+    report_fatal_error("Cannot parse the kernel info json file");
+
+  // The top-level value must be an object; getAsObject() yields null for
+  // anything else.
+  const json::Object *KernelInfo = JsonKernelInfo->getAsObject();
+  if (!KernelInfo)
+    report_fatal_error("The kernel info json file does not hold an object");
+
+  // Fetches a mandatory integer field, failing with a readable message
+  // instead of dereferencing an empty optional.
+  auto GetRequiredInteger = [&](StringRef Key) -> int64_t {
+    auto Field = KernelInfo->getInteger(Key);
+    if (!Field)
+      report_fatal_error("Missing or invalid integer in kernel info: " + Key);
+    return *Field;
+  };
+
+  // Command line values greater than zero take precedence over the recorded
+  // ones.
+  // TODO: Print a warning if number of teams/threads is explicitly set in the
+  // kernel info but overriden through command line options.
+  const unsigned NumTeams =
+      NumTeamsOpt > 0
+          ? static_cast<unsigned>(NumTeamsOpt)
+          : static_cast<unsigned>(GetRequiredInteger("NumTeamsClause"));
+  const unsigned NumThreads =
+      NumThreadsOpt > 0
+          ? static_cast<unsigned>(NumThreadsOpt)
+          : static_cast<unsigned>(GetRequiredInteger("ThreadLimitClause"));
+  const uint64_t LoopTripCount =
+      static_cast<uint64_t>(GetRequiredInteger("LoopTripCount"));
+
+  auto KernelFunc = KernelInfo->getString("Name");
+  if (!KernelFunc)
+    report_fatal_error("Missing kernel name in kernel info");
+
+  const json::Array *TgtArgsArray = KernelInfo->getArray("ArgPtrs");
+  const json::Array *TgtArgOffsetsArray = KernelInfo->getArray("ArgOffsets");
+  if (!TgtArgsArray || !TgtArgOffsetsArray)
+    report_fatal_error("Missing kernel argument arrays in kernel info");
+
+  // Argument pointers are stored as integers in the json file.
+  SmallVector<void *> TgtArgs;
+  for (const json::Value &Arg : *TgtArgsArray) {
+    auto Ptr = Arg.getAsInteger();
+    if (!Ptr)
+      report_fatal_error("Invalid kernel argument pointer in kernel info");
+    TgtArgs.push_back(reinterpret_cast<void *>(static_cast<intptr_t>(*Ptr)));
+  }
+  SmallVector<ptrdiff_t> TgtArgOffsets;
+  for (const json::Value &Arg : *TgtArgOffsetsArray) {
+    auto Offset = Arg.getAsInteger();
+    if (!Offset)
+      report_fatal_error("Invalid kernel argument offset in kernel info");
+    TgtArgOffsets.push_back(static_cast<ptrdiff_t>(*Offset));
+  }
+
+  // The runtime reads NumArgs entries from both arrays; make sure they are
+  // really there.
+  const int64_t NumArgs = GetRequiredInteger("NumArgs");
+  if (NumArgs < 0 || static_cast<uint64_t>(NumArgs) != TgtArgs.size() ||
+      TgtArgs.size() != TgtArgOffsets.size())
+    report_fatal_error("Inconsistent number of kernel arguments in kernel "
+                       "info");
+
+  __tgt_offload_entry KernelEntry = {nullptr, nullptr, 0, 0, 0};
+  std::string KernelEntryName = KernelFunc->str();
+  KernelEntry.name = const_cast<char *>(KernelEntryName.c_str());
+  // Anything non-zero works to uniquely identify the kernel.
+  KernelEntry.addr = (void *)0x1;
+
+  ErrorOr<std::unique_ptr<MemoryBuffer>> ImageMB =
+      MemoryBuffer::getFile(KernelEntryName + ".image", /* isText */ false,
+                            /* RequiresNullTerminator */ false);
+  if (!ImageMB)
+    report_fatal_error("Error reading the kernel image.");
+
+  __tgt_device_image DeviceImage;
+  DeviceImage.ImageStart = (void *)ImageMB.get()->getBufferStart();
+  DeviceImage.ImageEnd = (void *)ImageMB.get()->getBufferEnd();
+  DeviceImage.EntriesBegin = &KernelEntry;
+  DeviceImage.EntriesEnd = &KernelEntry + 1;
+
+  __tgt_bin_desc Desc;
+  Desc.NumDeviceImages = 1;
+  Desc.HostEntriesBegin = &KernelEntry;
+  Desc.HostEntriesEnd = &KernelEntry + 1;
+  Desc.DeviceImages = &DeviceImage;
+
+  ErrorOr<std::unique_ptr<MemoryBuffer>> DeviceMemoryMB =
+      MemoryBuffer::getFile(KernelEntryName + ".memory", /* isText */ false,
+                            /* RequiresNullTerminator */ false);
+  if (!DeviceMemoryMB)
+    report_fatal_error("Error reading the kernel input device memory.");
+
+  setenv("LIBOMPTARGET_REPLAY", "1", 1);
+  if (VerifyOpt || SaveOutputOpt)
+    setenv("LIBOMPTARGET_RR_SAVE_OUTPUT", "1", 1);
+
+  const int64_t RecordedMemorySize = GetRequiredInteger("DeviceMemorySize");
+  if (RecordedMemorySize < 0)
+    report_fatal_error("Invalid device memory size in kernel info");
+  // Set device memory size to the ceiling of GB granularity. At least 1GB is
+  // requested because the runtime cannot pre-allocate an empty region.
+  constexpr uint64_t GB = 1024ULL * 1024ULL * 1024ULL;
+  uint64_t DeviceMemorySize =
+      (static_cast<uint64_t>(RecordedMemorySize) + GB - 1) / GB;
+  if (DeviceMemorySize == 0)
+    DeviceMemorySize = 1;
+
+  setenv("LIBOMPTARGET_RR_DEVMEM_SIZE",
+         std::to_string(DeviceMemorySize).c_str(), 1);
+
+  // TODO: Print warning if the user overrides the device id in the json file.
+  const int32_t DeviceId =
+      DeviceIdOpt > -1 ? static_cast<int32_t>(DeviceIdOpt)
+                       : static_cast<int32_t>(GetRequiredInteger("DeviceId"));
+
+  // TODO: do we need requires?
+  //__tgt_register_requires(/* Flags */1);
+
+  __tgt_init_all_rtls();
+
+  __tgt_register_lib(&Desc);
+
+  const int Rc = __tgt_target_kernel_replay(
+      /* Loc */ nullptr, DeviceId, KernelEntry.addr,
+      (void *)DeviceMemoryMB.get()->getBuffer().data(),
+      DeviceMemoryMB.get()->getBufferSize(), TgtArgs.data(),
+      TgtArgOffsets.data(), static_cast<int32_t>(NumArgs), NumTeams,
+      NumThreads, LoopTripCount);
+  // Do not verify against an output file left over from an earlier run.
+  if (Rc != OMP_TGT_SUCCESS)
+    report_fatal_error("Error replaying the kernel");
+
+  if (VerifyOpt) {
+    ErrorOr<std::unique_ptr<MemoryBuffer>> OriginalOutputMB =
+        MemoryBuffer::getFile(KernelEntryName + ".original.output",
+                              /* isText */ false,
+                              /* RequiresNullTerminator */ false);
+    if (!OriginalOutputMB)
+      report_fatal_error("Error reading the kernel original output file, make "
+                         "sure LIBOMPTARGET_RR_SAVE_OUTPUT is set when "
+                         "recording");
+    ErrorOr<std::unique_ptr<MemoryBuffer>> ReplayOutputMB =
+        MemoryBuffer::getFile(KernelEntryName + ".replay.output",
+                              /* isText */ false,
+                              /* RequiresNullTerminator */ false);
+    if (!ReplayOutputMB)
+      report_fatal_error("Error reading the kernel replay output file");
+
+    StringRef OriginalOutput = OriginalOutputMB.get()->getBuffer();
+    StringRef ReplayOutput = ReplayOutputMB.get()->getBuffer();
+    if (OriginalOutput == ReplayOutput)
+      outs() << "[llvm-omp-kernel-replay] Replay device memory verified!\n";
+    else
+      outs() << "[llvm-omp-kernel-replay] Replay device memory failed to "
+                "verify!\n";
+  }
+  // TODO: calling unregister lib causes plugin deinit error for nextgen
+  // plugins.
+  //__tgt_unregister_lib(&Desc);
+
+  return 0;
+}