[libomptarget] [amdgpu] After a kernel dispatch packet is published, its contents...
authorDhruva Chakrabarti <Dhruva.Chakrabarti@amd.com>
Wed, 29 Sep 2021 05:44:14 +0000 (22:44 -0700)
committerDhruva Chakrabarti <Dhruva.Chakrabarti@amd.com>
Wed, 29 Sep 2021 16:22:07 +0000 (09:22 -0700)
Fixes: SWDEV-275232 (With contributions from Ammar Elwazir, Laurent Morichetti, and Tony Tye)

The current code is racy. After the packet is submitted, the GPU will increment the read index. If this wraps around before the memory is read from it'll refer to a signal from an unrelated packet. Change avoids reading from the packet post-submission.

Reviewed By: JonChesterfield

Differential Revision: https://reviews.llvm.org/D110679

openmp/libomptarget/plugins/amdgpu/src/rtl.cpp

index 767a0eb..1d2ed42 100644 (file)
@@ -2185,6 +2185,7 @@ int32_t __tgt_rtl_run_target_team_region_locked(
     packet->completion_signal = {0}; // may want a pool of signals
 
     KernelArgPool *ArgPool = nullptr;
+    void *kernarg = nullptr;
     {
       auto it = KernelArgPoolMap.find(std::string(KernelInfo->Name));
       if (it != KernelArgPoolMap.end()) {
@@ -2196,7 +2197,6 @@ int32_t __tgt_rtl_run_target_team_region_locked(
          device_id);
     }
     {
-      void *kernarg = nullptr;
       if (ArgPool) {
         assert(ArgPool->kernarg_segment_size == (arg_num * sizeof(void *)));
         kernarg = ArgPool->allocate(arg_num);
@@ -2240,29 +2240,29 @@ int32_t __tgt_rtl_run_target_team_region_locked(
       packet->kernarg_address = kernarg;
     }
 
-    {
-      hsa_signal_t s = DeviceInfo.FreeSignalPool.pop();
-      if (s.handle == 0) {
-        DP("Failed to get signal instance\n");
-        return OFFLOAD_FAIL;
-      }
-      packet->completion_signal = s;
-      hsa_signal_store_relaxed(packet->completion_signal, 1);
+    hsa_signal_t s = DeviceInfo.FreeSignalPool.pop();
+    if (s.handle == 0) {
+      DP("Failed to get signal instance\n");
+      return OFFLOAD_FAIL;
     }
+    packet->completion_signal = s;
+    hsa_signal_store_relaxed(packet->completion_signal, 1);
 
+    // Publish the packet indicating it is ready to be processed
     core::packet_store_release(reinterpret_cast<uint32_t *>(packet),
                                core::create_header(), packet->setup);
 
+    // Since the packet is already published, its contents must not be
+    // accessed any more
     hsa_signal_store_relaxed(queue->doorbell_signal, packet_id);
 
-    while (hsa_signal_wait_scacquire(packet->completion_signal,
-                                     HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX,
+    while (hsa_signal_wait_scacquire(s, HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX,
                                      HSA_WAIT_STATE_BLOCKED) != 0)
       ;
 
     assert(ArgPool);
-    ArgPool->deallocate(packet->kernarg_address);
-    DeviceInfo.FreeSignalPool.push(packet->completion_signal);
+    ArgPool->deallocate(kernarg);
+    DeviceInfo.FreeSignalPool.push(s);
   }
 
   DP("Kernel completed\n");