Currently, the only way to obtain pinned memory with libomptarget is to use the custom allocator llvm_omp_target_alloc_host.
This mirrors the CUDA implementation of libomptarget well, but it does not correctly expose the AMDGPU runtime API,
where any system allocated page can be locked/unlocked through a call to hsa_amd_memory_lock/unlock.
This patch enables users to allocate memory through malloc (mmap, sbrk) and then pin the related memory pages
with a libomptarget special call. It is a base support in the amdgpu libomptarget plugin to enable users to prelock
their host memory pages so that the runtime doesn't need to lock them itself for asynchronous memory transfers.
Reviewed By: jdoerfert, ye-luo
Differential Revision: https://reviews.llvm.org/D139208
int32_t __tgt_rtl_init_device_info(int32_t ID, __tgt_device_info *DeviceInfoPtr,
const char **ErrStr);
+// lock/pin host memory
+int32_t __tgt_rtl_data_lock(int32_t ID, void *HstPtr, int64_t Size,
+ void **LockedPtr);
+
+// unlock/unpin host memory
+int32_t __tgt_rtl_data_unlock(int32_t ID, void *HstPtr);
+
#ifdef __cplusplus
}
#endif
typedef int32_t(init_async_info_ty)(int32_t, __tgt_async_info **);
typedef int64_t(init_device_into_ty)(int64_t, __tgt_device_info *,
const char **);
+ typedef int32_t(data_lock_ty)(int32_t, void *, int64_t, void **);
+ typedef int32_t(data_unlock_ty)(int32_t, void *);
int32_t Idx = -1; // RTL index, index is the number of devices
// of other RTLs that were registered before,
init_async_info_ty *init_async_info = nullptr;
init_device_into_ty *init_device_info = nullptr;
release_async_info_ty *release_async_info = nullptr;
+ data_lock_ty *data_lock = nullptr;
+ data_unlock_ty *data_unlock = nullptr;
// Are there images associated with this RTL.
bool IsUsed = false;
* Data
*/
+// Query ROCr for information about `ptr`. On success, if `ptr` falls inside a
+// host range previously pinned via hsa_amd_memory_lock, *agentBaseAddress is
+// set to the matching agent-accessible address (shifted by ptr's offset into
+// the locked range, since ROCr reports only the base of the range); otherwise
+// *agentBaseAddress is set to nullptr. Returns the HSA status of the query.
+hsa_status_t is_locked(void *ptr, void **agentBaseAddress) {
+ hsa_status_t err = HSA_STATUS_SUCCESS;
+ hsa_amd_pointer_info_t info;
+ // ROCr requires the size field to be initialized before the query.
+ info.size = sizeof(hsa_amd_pointer_info_t);
+ err = hsa_amd_pointer_info(ptr, &info, /*alloc=*/nullptr,
+ /*num_agents_accessible=*/nullptr,
+ /*accessible=*/nullptr);
+ if (err != HSA_STATUS_SUCCESS) {
+ DP("Error when getting pointer info\n");
+ return err;
+ }
+
+ if (info.type == HSA_EXT_POINTER_TYPE_LOCKED) {
+ // When user passes in a basePtr+offset we need to fix the
+ // locked pointer to include the offset: ROCr always returns
+ // the base locked address, not the shifted one.
+ if ((char *)info.hostBaseAddress <= (char *)ptr &&
+ (char *)ptr < (char *)info.hostBaseAddress + info.sizeInBytes)
+ *agentBaseAddress =
+ (void *)((uint64_t)info.agentBaseAddress + (uint64_t)ptr -
+ (uint64_t)info.hostBaseAddress);
+ else // address is already device-agent accessible, no need to compute
+ // offset
+ *agentBaseAddress = ptr;
+ } else
+ *agentBaseAddress = nullptr;
+
+ return HSA_STATUS_SUCCESS;
+}
+
// host pointer (either src or dest) must be locked via hsa_amd_memory_lock
static hsa_status_t invoke_hsa_copy(hsa_signal_t signal, void *dest,
hsa_agent_t agent, const void *src,
hsa_signal_t signal, void *dest,
hsa_agent_t agent, void *src,
void *lockingPtr, size_t size) {
- hsa_status_t err;
-
void *lockedPtr = nullptr;
- err = hsa_amd_memory_lock(lockingPtr, size, nullptr, 0, (void **)&lockedPtr);
+ hsa_status_t err = is_locked(lockingPtr, &lockedPtr);
+ bool HostPtrIsLocked = true;
if (err != HSA_STATUS_SUCCESS)
return err;
+ if (!lockedPtr) { // not locked
+ HostPtrIsLocked = false;
+ hsa_agent_t agents[1] = {agent};
+ err = hsa_amd_memory_lock(lockingPtr, size, agents, /*num_agent=*/1,
+ (void **)&lockedPtr);
+ if (err != HSA_STATUS_SUCCESS)
+ return err;
+ DP("locking_async_memcpy: lockingPtr=%p lockedPtr=%p Size = %lu\n",
+ lockingPtr, lockedPtr, size);
+ }
switch (direction) {
case H2D:
break;
}
- if (err != HSA_STATUS_SUCCESS) {
+ if (err != HSA_STATUS_SUCCESS && !HostPtrIsLocked) {
// do not leak locked host pointers, but discard potential error message
+ // because the initial error was in the copy function
hsa_amd_memory_unlock(lockingPtr);
return err;
}
- err = hsa_amd_memory_unlock(lockingPtr);
+ // unlock only if not user locked
+ if (!HostPtrIsLocked)
+ err = hsa_amd_memory_unlock(lockingPtr);
if (err != HSA_STATUS_SUCCESS)
return err;
extern "C" {
+// Check if pointer ptr is already locked
+hsa_status_t is_locked(void *ptr, void **agentBaseAddress);
+
hsa_status_t impl_module_register_from_memory_to_place(
void *module_bytes, size_t module_size, int DeviceId,
hsa_status_t (*on_deserialized_data)(void *data, size_t size,
return (Rc == 0) && (SI.Addr != nullptr);
}
+// Pin the host range [HostPtr, HostPtr+Size) for access by Agent and return
+// the agent-accessible address in *LockedHostPtr. If the range is already
+// locked (e.g. by the user), the existing agent address is returned and no
+// additional lock is taken.
+hsa_status_t lock_memory(void *HostPtr, size_t Size, hsa_agent_t Agent,
+ void **LockedHostPtr) {
+ hsa_status_t err = is_locked(HostPtr, LockedHostPtr);
+ if (err != HSA_STATUS_SUCCESS)
+ return err;
+
+ // HostPtr is already locked, just return it
+ if (*LockedHostPtr)
+ return HSA_STATUS_SUCCESS;
+
+ hsa_agent_t Agents[1] = {Agent};
+ return hsa_amd_memory_lock(HostPtr, Size, Agents, /*num_agent=*/1,
+ LockedHostPtr);
+}
+
+// Unpin host memory previously locked with hsa_amd_memory_lock. A pointer
+// that is not currently locked is a no-op success rather than an error, so
+// callers need not track lock state themselves.
+hsa_status_t unlock_memory(void *HostPtr) {
+ void *LockedHostPtr = nullptr;
+ hsa_status_t err = is_locked(HostPtr, &LockedHostPtr);
+ if (err != HSA_STATUS_SUCCESS)
+ return err;
+
+ // if LockedHostPtr is nullptr, then HostPtr was not locked
+ if (!LockedHostPtr)
+ return HSA_STATUS_SUCCESS;
+
+ err = hsa_amd_memory_unlock(HostPtr);
+ return err;
+}
+
} // namespace
namespace core {
DeviceInfo().printDeviceInfo(DeviceId, DeviceInfo().HSAAgents[DeviceId]);
}
+// Plugin entry point: pin Size bytes of host memory at HostPtr for the agent
+// of DeviceId; on success *LockedHostPtr receives the agent-accessible
+// address. Returns OFFLOAD_SUCCESS or OFFLOAD_FAIL.
+int32_t __tgt_rtl_data_lock(int32_t DeviceId, void *HostPtr, int64_t Size,
+ void **LockedHostPtr) {
+ assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
+
+ hsa_agent_t Agent = DeviceInfo().HSAAgents[DeviceId];
+ hsa_status_t err = lock_memory(HostPtr, Size, Agent, LockedHostPtr);
+ if (err != HSA_STATUS_SUCCESS) {
+ DP("Error in tgt_rtl_data_lock\n");
+ return OFFLOAD_FAIL;
+ }
+ DP("Tgt lock host data %ld bytes, (HostPtr:%016llx).\n", Size,
+ (long long unsigned)(Elf64_Addr)*LockedHostPtr);
+ return OFFLOAD_SUCCESS;
+}
+
+// Plugin entry point: unpin host memory previously locked through
+// __tgt_rtl_data_lock (a pointer that is not locked is treated as success by
+// unlock_memory). Returns OFFLOAD_SUCCESS or OFFLOAD_FAIL.
+// NOTE: the first parameter is int32_t to match the declared plugin
+// interface (__tgt_rtl_data_unlock(int32_t, void *)) and its typedef.
+int32_t __tgt_rtl_data_unlock(int32_t DeviceId, void *HostPtr) {
+  assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
+  hsa_status_t err = unlock_memory(HostPtr);
+  if (err != HSA_STATUS_SUCCESS) {
+    DP("Error in tgt_rtl_data_unlock\n");
+    return OFFLOAD_FAIL;
+  }
+
+  DP("Tgt unlock data (tgt:%016llx).\n",
+     (long long unsigned)(Elf64_Addr)HostPtr);
+  return OFFLOAD_SUCCESS;
+}
+
} // extern "C"
EXTERN void *llvm_omp_target_dynamic_shared_alloc() { return nullptr; }
EXTERN void *llvm_omp_get_dynamic_shared() { return nullptr; }
+// Public API: pin Size bytes of host memory at Ptr for DeviceNum and return
+// the device-accessible pointer, or NULL on failure. Marked [[nodiscard]]
+// because locking can fail or be unsupported by the active plugin.
+EXTERN [[nodiscard]] void *llvm_omp_target_lock_mem(void *Ptr, size_t Size,
+ int DeviceNum) {
+ return targetLockExplicit(Ptr, Size, DeviceNum, __func__);
+}
+
+// Public API: unpin host memory previously pinned with
+// llvm_omp_target_lock_mem. Failures are reported via debug output only.
+EXTERN void llvm_omp_target_unlock_mem(void *Ptr, int DeviceNum) {
+ targetUnlockExplicit(Ptr, DeviceNum, __func__);
+}
+
EXTERN int omp_target_is_present(const void *Ptr, int DeviceNum) {
TIMESCOPE();
DP("Call to omp_target_is_present for device %d and address " DPxMOD "\n",
llvm_omp_target_free_shared;
llvm_omp_target_free_device;
llvm_omp_target_dynamic_shared_alloc;
+ llvm_omp_target_lock_mem;
+ llvm_omp_target_unlock_mem;
__tgt_set_info_flag;
__tgt_print_device_info;
omp_get_interop_ptr;
DP("omp_target_free deallocated device ptr\n");
}
+// Lock/pin Size bytes of host memory at HostPtr for device DeviceNum via the
+// plugin's optional data_lock hook. Returns the device-accessible pointer on
+// success, nullptr on failure (zero size, device not ready/available, plugin
+// without lock support, or a plugin-level error). Name is the user-visible
+// entry point used in debug output.
+void *targetLockExplicit(void *HostPtr, size_t Size, int DeviceNum,
+                         const char *Name) {
+  TIMESCOPE();
+  DP("Call to %s for device %d locking %zu bytes\n", Name, DeviceNum, Size);
+
+  // Size is a size_t, so it can never be negative; only zero is invalid.
+  // (The previous `Size <= 0` was a tautological unsigned comparison.)
+  if (Size == 0) {
+    DP("Call to %s with zero length\n", Name);
+    return nullptr;
+  }
+
+  void *rc = nullptr;
+
+  if (!deviceIsReady(DeviceNum)) {
+    DP("%s returns NULL ptr\n", Name);
+    return nullptr;
+  }
+
+  DeviceTy *DevicePtr = nullptr;
+  {
+    // Hold RTLsMtx only while reading the device table; the lock call itself
+    // happens outside the lock to avoid serializing plugin work.
+    std::lock_guard<decltype(PM->RTLsMtx)> LG(PM->RTLsMtx);
+
+    if (!PM->Devices[DeviceNum]) {
+      DP("%s returns, device %d not available\n", Name, DeviceNum);
+      return nullptr;
+    }
+
+    DevicePtr = PM->Devices[DeviceNum].get();
+  }
+
+  int32_t err = 0;
+  // data_lock is optional: plugins without pinning support leave it null and
+  // this function then returns nullptr.
+  if (DevicePtr->RTL->data_lock) {
+    err = DevicePtr->RTL->data_lock(DeviceNum, HostPtr, Size, &rc);
+    if (err) {
+      DP("Could not lock ptr %p\n", HostPtr);
+      return nullptr;
+    }
+  }
+  DP("%s returns device ptr " DPxMOD "\n", Name, DPxPTR(rc));
+  return rc;
+}
+
+// Unlock/unpin host memory previously locked with targetLockExplicit, via the
+// plugin's optional data_unlock hook. Deliberately avoids deviceIsReady so a
+// call during process teardown cannot (re)initialize a device; see the
+// comment inside.
+void targetUnlockExplicit(void *HostPtr, int DeviceNum, const char *Name) {
+  TIMESCOPE();
+  DP("Call to %s for device %d unlocking\n", Name, DeviceNum);
+
+  DeviceTy *DevicePtr = nullptr;
+  {
+    std::lock_guard<decltype(PM->RTLsMtx)> LG(PM->RTLsMtx);
+
+    // Don't check deviceIsReady as it can initialize the device if needed.
+    // Just check if DeviceNum exists as targetUnlockExplicit can be called
+    // during process exit/free (and it may have been already destroyed) and
+    // targetAllocExplicit will have already checked deviceIsReady anyway.
+    size_t DevicesSize = PM->Devices.size();
+
+    if (DevicesSize <= (size_t)DeviceNum) {
+      DP("Device ID %d does not have a matching RTL\n", DeviceNum);
+      return;
+    }
+
+    if (!PM->Devices[DeviceNum]) {
+      DP("%s returns, device %d not available\n", Name, DeviceNum);
+      return;
+    }
+
+    DevicePtr = PM->Devices[DeviceNum].get();
+  } // unlock RTLsMtx
+
+  // data_unlock is optional; plugins without pinning support leave it null.
+  if (DevicePtr->RTL->data_unlock)
+    DevicePtr->RTL->data_unlock(DeviceNum, HostPtr);
+
+  DP("%s returns\n", Name);
+}
+
/// Call the user-defined mapper function followed by the appropriate
// targetData* function (targetData{Begin,End,Update}).
int targetDataMapper(ident_t *Loc, DeviceTy &Device, void *ArgBase, void *Arg,
const char *Name);
extern void targetFreeExplicit(void *DevicePtr, int DeviceNum, int Kind,
const char *Name);
+extern void *targetLockExplicit(void *HostPtr, size_t Size, int DeviceNum,
+ const char *Name);
+extern void targetUnlockExplicit(void *HostPtr, int DeviceNum,
+ const char *Name);
// This structure stores information of a mapped memory region.
struct MapComponentInfoTy {
DynLibrary->getAddressOfSymbol("__tgt_rtl_init_async_info");
*((void **)&RTL.init_device_info) =
DynLibrary->getAddressOfSymbol("__tgt_rtl_init_device_info");
+ *((void **)&RTL.data_lock) =
+ DynLibrary->getAddressOfSymbol("__tgt_rtl_data_lock");
+ *((void **)&RTL.data_unlock) =
+ DynLibrary->getAddressOfSymbol("__tgt_rtl_data_unlock");
RTL.LibraryHandler = std::move(DynLibrary);
--- /dev/null
+// RUN: %libomptarget-compilexx-run-and-check-generic
+
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: nvptx64-nvidia-cuda
+// UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+#include <cstdio>
+
+#include <omp.h>
+
+extern "C" {
+void *llvm_omp_target_lock_mem(void *ptr, size_t size, int device_num);
+void llvm_omp_target_unlock_mem(void *ptr, int device_num);
+}
+
+// Lock a host buffer, run kernels through both the original host pointer and
+// the locked (device-accessible) alias, then verify every element received
+// the expected number of increments.
+int main() {
+  int n = 100;
+  int *unlocked = new int[n];
+
+  for (int i = 0; i < n; i++)
+    unlocked[i] = i;
+
+  // `locked` is the device-accessible alias of `unlocked`.
+  int *locked = (int *)llvm_omp_target_lock_mem(unlocked, n * sizeof(int),
+                                                omp_get_default_device());
+  if (!locked) {
+    delete[] unlocked;
+    return 0;
+  }
+
+#pragma omp target teams distribute parallel for map(tofrom : unlocked[ : n])
+  for (int i = 0; i < n; i++)
+    unlocked[i] += 1;
+
+#pragma omp target teams distribute parallel for map(tofrom : unlocked[10 : 10])
+  for (int i = 10; i < 20; i++)
+    unlocked[i] += 1;
+
+#pragma omp target teams distribute parallel for map(tofrom : locked[ : n])
+  for (int i = 0; i < n; i++)
+    locked[i] += 1;
+
+#pragma omp target teams distribute parallel for map(tofrom : locked[10 : 10])
+  for (int i = 10; i < 20; i++)
+    locked[i] += 1;
+
+  llvm_omp_target_unlock_mem(unlocked, omp_get_default_device());
+
+  int err = 0;
+  for (int i = 0; i < n; i++) {
+    if (i < 10 || i > 19) {
+      // Elements outside [10,20) are touched by the two full-range kernels
+      // only: expected value is i + 2 (the diagnostic previously printed the
+      // wrong expected value i + 1).
+      if (unlocked[i] != i + 2) {
+        printf("Err at %d, got %d, expected %d\n", i, unlocked[i], i + 2);
+        err++;
+      }
+    } else if (unlocked[i] != i + 4) {
+      // Elements in [10,20) are also hit by the two sub-range kernels:
+      // expected value is i + 4 (previously printed i + 2).
+      printf("Err at %d, got %d, expected %d\n", i, unlocked[i], i + 4);
+      err++;
+    }
+  }
+
+  delete[] unlocked;
+
+  // CHECK: PASS
+  if (err == 0)
+    printf("PASS\n");
+
+  return err;
+}
kmp_target_alloc_host && kmp_target_alloc_shared &&
kmp_target_alloc_device && kmp_target_free_host &&
kmp_target_free_shared && kmp_target_free_device;
+ // lock/pin and unlock/unpin target calls
+ *(void **)(&kmp_target_lock_mem) = KMP_DLSYM("llvm_omp_target_lock_mem");
+ *(void **)(&kmp_target_unlock_mem) = KMP_DLSYM("llvm_omp_target_unlock_mem");
}
omp_allocator_handle_t __kmpc_init_allocator(int gtid, omp_memspace_handle_t ms,