From 622627025332af4bfa01057210f315b2278a9439 Mon Sep 17 00:00:00 2001 From: Dhruva Chakrabarti Date: Tue, 28 Sep 2021 22:44:14 -0700 Subject: [PATCH] [libomptarget] [amdgpu] After a kernel dispatch packet is published, its contents must not be accessed. Fixes: SWDEV-275232 (With contributions from Ammar Elwazir, Laurent Morichetti, and Tony Tye) The current code is racy. After the packet is submitted, the GPU will increment the read index. If this wraps around before the memory is read from it'll refer to a signal from an unrelated packet. Change avoids reading from the packet post-submission. Reviewed By: JonChesterfield Differential Revision: https://reviews.llvm.org/D110679 --- openmp/libomptarget/plugins/amdgpu/src/rtl.cpp | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp index 767a0eb..1d2ed42 100644 --- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp @@ -2185,6 +2185,7 @@ int32_t __tgt_rtl_run_target_team_region_locked( packet->completion_signal = {0}; // may want a pool of signals KernelArgPool *ArgPool = nullptr; + void *kernarg = nullptr; { auto it = KernelArgPoolMap.find(std::string(KernelInfo->Name)); if (it != KernelArgPoolMap.end()) { @@ -2196,7 +2197,6 @@ int32_t __tgt_rtl_run_target_team_region_locked( device_id); } { - void *kernarg = nullptr; if (ArgPool) { assert(ArgPool->kernarg_segment_size == (arg_num * sizeof(void *))); kernarg = ArgPool->allocate(arg_num); @@ -2240,29 +2240,29 @@ int32_t __tgt_rtl_run_target_team_region_locked( packet->kernarg_address = kernarg; } - { - hsa_signal_t s = DeviceInfo.FreeSignalPool.pop(); - if (s.handle == 0) { - DP("Failed to get signal instance\n"); - return OFFLOAD_FAIL; - } - packet->completion_signal = s; - hsa_signal_store_relaxed(packet->completion_signal, 1); + hsa_signal_t s = DeviceInfo.FreeSignalPool.pop(); + if (s.handle == 0) { + DP("Failed to get signal instance\n"); + return OFFLOAD_FAIL; } + packet->completion_signal = s; + hsa_signal_store_relaxed(packet->completion_signal, 1); + // Publish the packet indicating it is ready to be processed core::packet_store_release(reinterpret_cast(packet), core::create_header(), packet->setup); + // Since the packet is already published, its contents must not be + // accessed any more hsa_signal_store_relaxed(queue->doorbell_signal, packet_id); - while (hsa_signal_wait_scacquire(packet->completion_signal, - HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX, + while (hsa_signal_wait_scacquire(s, HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX, HSA_WAIT_STATE_BLOCKED) != 0) ; assert(ArgPool); - ArgPool->deallocate(packet->kernarg_address); - DeviceInfo.FreeSignalPool.push(packet->completion_signal); + ArgPool->deallocate(kernarg); + DeviceInfo.FreeSignalPool.push(s); } DP("Kernel completed\n"); -- 2.7.4