From 14ff611fe12f84324febbf94cb1d93de7a5eb95d Mon Sep 17 00:00:00 2001 From: Jon Chesterfield Date: Wed, 8 Dec 2021 08:23:12 +0000 Subject: [PATCH] Revert "[OpenMP][AMDGPU] Switch host-device memory copy to asynchronous version" This reverts commit 6de698bf10996b532632bb9dfa9fd420c5af62af. It didn't build in the dynamic_hsa configuration --- openmp/libomptarget/plugins/amdgpu/impl/impl.cpp | 116 ++++++++------------- .../plugins/amdgpu/impl/impl_runtime.h | 9 +- openmp/libomptarget/plugins/amdgpu/src/rtl.cpp | 15 +-- 3 files changed, 54 insertions(+), 86 deletions(-) diff --git a/openmp/libomptarget/plugins/amdgpu/impl/impl.cpp b/openmp/libomptarget/plugins/amdgpu/impl/impl.cpp index d41ca97..7f841aa 100644 --- a/openmp/libomptarget/plugins/amdgpu/impl/impl.cpp +++ b/openmp/libomptarget/plugins/amdgpu/impl/impl.cpp @@ -5,6 +5,9 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// +#include "hsa_api.h" +#include "impl_runtime.h" +#include "internal.h" #include "rt.h" #include @@ -12,27 +15,29 @@ * Data */ -// host pointer (either src or dest) must be locked via hsa_amd_memory_lock -static hsa_status_t invoke_hsa_copy(hsa_signal_t signal, void *dest, - hsa_agent_t agent, const void *src, - size_t size) { +static hsa_status_t invoke_hsa_copy(hsa_signal_t sig, void *dest, + const void *src, size_t size, + hsa_agent_t agent) { const hsa_signal_value_t init = 1; const hsa_signal_value_t success = 0; - hsa_signal_store_screlease(signal, init); + hsa_signal_store_screlease(sig, init); - hsa_status_t err = hsa_amd_memory_async_copy(dest, agent, src, agent, size, 0, - nullptr, signal); - if (err != HSA_STATUS_SUCCESS) + hsa_status_t err = + hsa_amd_memory_async_copy(dest, agent, src, agent, size, 0, NULL, sig); + if (err != HSA_STATUS_SUCCESS) { return err; + } // async_copy reports success by decrementing and failure by setting to < 0 hsa_signal_value_t got = init; - while (got == init) - got = hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_NE, init, + while (got == init) { + got = hsa_signal_wait_scacquire(sig, HSA_SIGNAL_CONDITION_NE, init, UINT64_MAX, HSA_WAIT_STATE_BLOCKED); + } - if (got != success) + if (got != success) { return HSA_STATUS_ERROR; + } return err; } @@ -43,58 +48,19 @@ struct implFreePtrDeletor { } }; -enum CopyDirection { H2D, D2H }; - -static hsa_status_t locking_async_memcpy(enum CopyDirection direction, - hsa_signal_t signal, void *dest, - hsa_agent_t agent, void *src, - void *lockingPtr, size_t size) { - hsa_status_t err; - - void *lockedPtr = nullptr; - err = hsa_amd_memory_lock(lockingPtr, size, nullptr, 0, (void **)&lockedPtr); - if (err != HSA_STATUS_SUCCESS) - return err; - - switch (direction) { - case H2D: - err = invoke_hsa_copy(signal, dest, agent, lockedPtr, size); - break; - case D2H: - err = invoke_hsa_copy(signal, lockedPtr, agent, src, size); - break; - default: - err = HSA_STATUS_ERROR; // fall into unlock before returning - } - - if (err != HSA_STATUS_SUCCESS) { - // do not leak locked host pointers, but discard potential error message - hsa_amd_memory_unlock(lockingPtr); - return err; - } - - err = hsa_amd_memory_unlock(lockingPtr); - if (err != HSA_STATUS_SUCCESS) - return err; - - return HSA_STATUS_SUCCESS; -} - hsa_status_t impl_memcpy_h2d(hsa_signal_t signal, void *deviceDest, - void *hostSrc, size_t size, - hsa_agent_t device_agent, + const void *hostSrc, size_t size, + hsa_agent_t agent, hsa_amd_memory_pool_t MemoryPool) { - hsa_status_t err; - - err = locking_async_memcpy(CopyDirection::H2D, signal, deviceDest, - device_agent, hostSrc, hostSrc, size); - - if (err == HSA_STATUS_SUCCESS) - return err; + hsa_status_t rc = hsa_memory_copy(deviceDest, hostSrc, size); - // async memcpy sometimes fails in situations where + // hsa_memory_copy sometimes fails in situations where // allocate + copy succeeds. Looks like it might be related to // locking part of a read only segment. Fall back for now. + if (rc == HSA_STATUS_SUCCESS) { + return HSA_STATUS_SUCCESS; + } + void *tempHostPtr; hsa_status_t ret = core::Runtime::HostMalloc(&tempHostPtr, size, MemoryPool); if (ret != HSA_STATUS_SUCCESS) { @@ -104,26 +70,26 @@ hsa_status_t impl_memcpy_h2d(hsa_signal_t signal, void *deviceDest, std::unique_ptr del(tempHostPtr); memcpy(tempHostPtr, hostSrc, size); - return locking_async_memcpy(CopyDirection::H2D, signal, deviceDest, - device_agent, tempHostPtr, tempHostPtr, size); + if (invoke_hsa_copy(signal, deviceDest, tempHostPtr, size, agent) != + HSA_STATUS_SUCCESS) { + return HSA_STATUS_ERROR; + } + return HSA_STATUS_SUCCESS; } -hsa_status_t impl_memcpy_d2h(hsa_signal_t signal, void *hostDest, - void *deviceSrc, size_t size, - hsa_agent_t deviceAgent, +hsa_status_t impl_memcpy_d2h(hsa_signal_t signal, void *dest, + const void *deviceSrc, size_t size, + hsa_agent_t agent, hsa_amd_memory_pool_t MemoryPool) { - hsa_status_t err; - - // device has always visibility over both pointers, so use that - err = locking_async_memcpy(CopyDirection::D2H, signal, hostDest, deviceAgent, - deviceSrc, hostDest, size); - - if (err == HSA_STATUS_SUCCESS) - return err; + hsa_status_t rc = hsa_memory_copy(dest, deviceSrc, size); // hsa_memory_copy sometimes fails in situations where // allocate + copy succeeds. Looks like it might be related to // locking part of a read only segment. Fall back for now. + if (rc == HSA_STATUS_SUCCESS) { + return HSA_STATUS_SUCCESS; + } + void *tempHostPtr; hsa_status_t ret = core::Runtime::HostMalloc(&tempHostPtr, size, MemoryPool); if (ret != HSA_STATUS_SUCCESS) { @@ -132,11 +98,11 @@ hsa_status_t impl_memcpy_d2h(hsa_signal_t signal, void *hostDest, } std::unique_ptr del(tempHostPtr); - err = locking_async_memcpy(CopyDirection::D2H, signal, tempHostPtr, - deviceAgent, deviceSrc, tempHostPtr, size); - if (err != HSA_STATUS_SUCCESS) + if (invoke_hsa_copy(signal, tempHostPtr, deviceSrc, size, agent) != + HSA_STATUS_SUCCESS) { return HSA_STATUS_ERROR; + } - memcpy(hostDest, tempHostPtr, size); + memcpy(dest, tempHostPtr, size); return HSA_STATUS_SUCCESS; } diff --git a/openmp/libomptarget/plugins/amdgpu/impl/impl_runtime.h b/openmp/libomptarget/plugins/amdgpu/impl/impl_runtime.h index c99b3e7..52efaff 100644 --- a/openmp/libomptarget/plugins/amdgpu/impl/impl_runtime.h +++ b/openmp/libomptarget/plugins/amdgpu/impl/impl_runtime.h @@ -19,12 +19,13 @@ hsa_status_t impl_module_register_from_memory_to_place( void *cb_state); hsa_status_t impl_memcpy_h2d(hsa_signal_t signal, void *deviceDest, - void *hostSrc, size_t size, - hsa_agent_t device_agent, + const void *hostSrc, size_t size, + hsa_agent_t agent, hsa_amd_memory_pool_t MemoryPool); -hsa_status_t impl_memcpy_d2h(hsa_signal_t sig, void *hostDest, void *deviceSrc, - size_t size, hsa_agent_t device_agent, +hsa_status_t impl_memcpy_d2h(hsa_signal_t sig, void *hostDest, + const void *deviceSrc, size_t size, + hsa_agent_t agent, hsa_amd_memory_pool_t MemoryPool); } diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp index 2da012d..5434692 100644 --- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp @@ -464,9 +464,10 @@ public: ""); static const int Default_WG_Size = getGridValue<64>().GV_Default_WG_Size; - using MemcpyFunc = hsa_status_t (*)(hsa_signal_t, void *, void *, size_t size, - hsa_agent_t, hsa_amd_memory_pool_t); - hsa_status_t freesignalpool_memcpy(void *dest, void *src, size_t size, + using MemcpyFunc = hsa_status_t (*)(hsa_signal_t, void *, const void *, + size_t size, hsa_agent_t, + hsa_amd_memory_pool_t); + hsa_status_t freesignalpool_memcpy(void *dest, const void *src, size_t size, MemcpyFunc Func, int32_t deviceId) { hsa_agent_t agent = HSAAgents[deviceId]; hsa_signal_t s = FreeSignalPool.pop(); @@ -478,13 +479,13 @@ public: return r; } - hsa_status_t freesignalpool_memcpy_d2h(void *dest, void *src, size_t size, - int32_t deviceId) { + hsa_status_t freesignalpool_memcpy_d2h(void *dest, const void *src, + size_t size, int32_t deviceId) { return freesignalpool_memcpy(dest, src, size, impl_memcpy_d2h, deviceId); } - hsa_status_t freesignalpool_memcpy_h2d(void *dest, void *src, size_t size, - int32_t deviceId) { + hsa_status_t freesignalpool_memcpy_h2d(void *dest, const void *src, + size_t size, int32_t deviceId) { return freesignalpool_memcpy(dest, src, size, impl_memcpy_h2d, deviceId); } -- 2.7.4