From aa616efbb34e6321c0f24f61e017efdcf398ba04 Mon Sep 17 00:00:00 2001 From: Pushpinder Singh Date: Tue, 20 Oct 2020 05:09:29 -0400 Subject: [PATCH] [libomptarget][AMDGPU][NFC] Split atmi_memcpy for h2d and d2h The calls to atmi_memcpy presently determine the direction of copy (host to device or device to host) by storing pointers in a map during malloc and looking up the pointers during memcpy. As each call site already knows the direction, this stash+lookup can be eliminated. This NFC will be followed by a functional one that deletes those map lookups. Reviewed By: JonChesterfield Differential Revision: https://reviews.llvm.org/D89776 Change-Id: I1d9089bc1e56b3a9a30e334735fa07dee1f84990 --- openmp/libomptarget/plugins/amdgpu/impl/atmi.cpp | 19 ++++++++ .../plugins/amdgpu/impl/atmi_runtime.h | 50 ++-------------------- openmp/libomptarget/plugins/amdgpu/src/rtl.cpp | 31 ++++++++++---- 3 files changed, 45 insertions(+), 55 deletions(-) diff --git a/openmp/libomptarget/plugins/amdgpu/impl/atmi.cpp b/openmp/libomptarget/plugins/amdgpu/impl/atmi.cpp index 6a6bc8c..0586cd3 100644 --- a/openmp/libomptarget/plugins/amdgpu/impl/atmi.cpp +++ b/openmp/libomptarget/plugins/amdgpu/impl/atmi.cpp @@ -35,9 +35,28 @@ atmi_status_t atmi_module_register_from_memory_to_place( */ atmi_status_t atmi_memcpy(hsa_signal_t sig, void *dest, const void *src, size_t size) { + hsa_status_t rc = hsa_memory_copy(dest, src, size); + + // hsa_memory_copy sometimes fails in situations where + // allocate + copy succeeds. Looks like it might be related to + // locking part of a read only segment. Fall back for now. + if (rc == HSA_STATUS_SUCCESS) { + return ATMI_STATUS_SUCCESS; + } + return core::Runtime::Memcpy(sig, dest, src, size); } +atmi_status_t atmi_memcpy_h2d(hsa_signal_t sig, void *device_dest, + const void *host_src, size_t size) { + return atmi_memcpy(sig, device_dest, host_src, size); +} + +atmi_status_t atmi_memcpy_d2h(hsa_signal_t sig, void *host_dest, + const void *device_src, size_t size) { + return atmi_memcpy(sig, host_dest, device_src, size); +} + atmi_status_t atmi_free(void *ptr) { return core::Runtime::Memfree(ptr); } atmi_status_t atmi_malloc(void **ptr, size_t size, atmi_mem_place_t place) { diff --git a/openmp/libomptarget/plugins/amdgpu/impl/atmi_runtime.h b/openmp/libomptarget/plugins/amdgpu/impl/atmi_runtime.h index 8ee78df..a935b6a 100644 --- a/openmp/libomptarget/plugins/amdgpu/impl/atmi_runtime.h +++ b/openmp/libomptarget/plugins/amdgpu/impl/atmi_runtime.h @@ -155,53 +155,11 @@ atmi_status_t atmi_malloc(void **ptr, size_t size, atmi_mem_place_t place); */ atmi_status_t atmi_free(void *ptr); -/** - * @brief Syncrhonously copy memory from the source to destination memory - * locations. - * - * @detail This function assumes that the source and destination regions are - * non-overlapping. The runtime determines the memory place of the source and - * the - * destination and executes the appropriate optimized data movement methodology. - * - * @param[in] dest The destination pointer previously allocated by a system - * allocator or @p atmi_malloc. - * - * @param[in] src The source pointer previously allocated by a system - * allocator or @p atmi_malloc. - * - * @param[in] size The size of the data to be copied in bytes. - * - * @retval ::ATMI_STATUS_SUCCESS The function has executed successfully. - * - * @retval ::ATMI_STATUS_ERROR The function encountered errors. - * - * @retval ::ATMI_STATUS_UNKNOWN The function encountered errors. - * - */ -atmi_status_t atmi_memcpy(hsa_signal_t sig, void *dest, const void *src, - size_t size); +atmi_status_t atmi_memcpy_h2d(hsa_signal_t sig, void *device_dest, + const void *host_src, size_t size); -static inline atmi_status_t atmi_memcpy_no_signal(void *dest, const void *src, - size_t size) { - hsa_signal_t sig; - hsa_status_t err = hsa_signal_create(0, 0, NULL, &sig); - if (err != HSA_STATUS_SUCCESS) { - return ATMI_STATUS_ERROR; - } - - atmi_status_t r = atmi_memcpy(sig, dest, src, size); - hsa_status_t rc = hsa_signal_destroy(sig); - - if (r != ATMI_STATUS_SUCCESS) { - return r; - } - if (rc != HSA_STATUS_SUCCESS) { - return ATMI_STATUS_ERROR; - } - - return ATMI_STATUS_SUCCESS; -} +atmi_status_t atmi_memcpy_d2h(hsa_signal_t sig, void *host_dest, + const void *device_src, size_t size); /** @} */ diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp index e0509a5..8fd7b7e 100644 --- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp @@ -335,17 +335,29 @@ public: static const int Default_WG_Size = llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Default_WG_Size]; - atmi_status_t freesignalpool_memcpy(void *dest, const void *src, - size_t size) { + using MemcpyFunc = atmi_status_t(hsa_signal_t, void *, const void *, + size_t size); + atmi_status_t freesignalpool_memcpy(void *dest, const void *src, size_t size, + MemcpyFunc Func) { hsa_signal_t s = FreeSignalPool.pop(); if (s.handle == 0) { return ATMI_STATUS_ERROR; } - atmi_status_t r = atmi_memcpy(s, dest, src, size); + atmi_status_t r = Func(s, dest, src, size); FreeSignalPool.push(s); return r; } + atmi_status_t freesignalpool_memcpy_d2h(void *dest, const void *src, + size_t size) { + return freesignalpool_memcpy(dest, src, size, atmi_memcpy_d2h); + } + + atmi_status_t freesignalpool_memcpy_h2d(void *dest, const void *src, + size_t size) { + return freesignalpool_memcpy(dest, src, size, atmi_memcpy_h2d); + } + // Record entry point associated with device void addOffloadEntry(int32_t device_id, __tgt_offload_entry entry) { assert(device_id < (int32_t)FuncGblEntries.size() && @@ -538,7 +550,7 @@ int32_t dataRetrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr, int64_t Size, (long long unsigned)(Elf64_Addr)TgtPtr, (long long unsigned)(Elf64_Addr)HstPtr); - err = DeviceInfo.freesignalpool_memcpy(HstPtr, TgtPtr, (size_t)Size); + err = DeviceInfo.freesignalpool_memcpy_d2h(HstPtr, TgtPtr, (size_t)Size); if (err != ATMI_STATUS_SUCCESS) { DP("Error when copying data from device to host. Pointers: " @@ -564,7 +576,7 @@ int32_t dataSubmit(int32_t DeviceId, void *TgtPtr, void *HstPtr, int64_t Size, DP("Submit data %ld bytes, (hst:%016llx) -> (tgt:%016llx).\n", Size, (long long unsigned)(Elf64_Addr)HstPtr, (long long unsigned)(Elf64_Addr)TgtPtr); - err = DeviceInfo.freesignalpool_memcpy(TgtPtr, HstPtr, (size_t)Size); + err = DeviceInfo.freesignalpool_memcpy_h2d(TgtPtr, HstPtr, (size_t)Size); if (err != ATMI_STATUS_SUCCESS) { DP("Error when copying data from host to device. Pointers: " "host = 0x%016lx, device = 0x%016lx, size = %lld\n", @@ -1021,7 +1033,7 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t device_id, } // write ptr to device memory so it can be used by later kernels - err = DeviceInfo.freesignalpool_memcpy(state_ptr, &ptr, sizeof(void *)); + err = DeviceInfo.freesignalpool_memcpy_h2d(state_ptr, &ptr, sizeof(void *)); if (err != ATMI_STATUS_SUCCESS) { fprintf(stderr, "memcpy install of state_ptr failed\n"); return NULL; @@ -1090,7 +1102,8 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t device_id, // If unified memory is present any target link variables // can access host addresses directly. There is no longer a // need for device copies. - err = DeviceInfo.freesignalpool_memcpy(varptr, e->addr, sizeof(void *)); + err = DeviceInfo.freesignalpool_memcpy_h2d(varptr, e->addr, + sizeof(void *)); if (err != ATMI_STATUS_SUCCESS) DP("Error when copying USM\n"); DP("Copy linked variable host address (" DPxMOD ")" @@ -1518,8 +1531,8 @@ static void *AllocateNestedParallelCallMemory(int MaxParLevel, int NumGroups, void *TgtPtr = NULL; atmi_status_t err = atmi_malloc(&TgtPtr, NestedMemSize, get_gpu_mem_place(device_id)); - err = - DeviceInfo.freesignalpool_memcpy(CallStackAddr, &TgtPtr, sizeof(void *)); + err = DeviceInfo.freesignalpool_memcpy_h2d(CallStackAddr, &TgtPtr, + sizeof(void *)); if (print_kernel_trace > 2) fprintf(stderr, "CallSck %lx TgtPtr %lx *TgtPtr %lx \n", (long)CallStackAddr, (long)&TgtPtr, (long)TgtPtr); -- 2.7.4