From f5f329a37175c3a30e9d8b20b95fced96b9dde8c Mon Sep 17 00:00:00 2001
From: Pushpinder Singh <Pushpinder.Singh@amd.com>
Date: Thu, 3 Jun 2021 08:58:10 +0000
Subject: [PATCH] [AMDGPU][Libomptarget] Rework logic for locating kernarg
 pools

Previous logic was to always use the first kernarg pool found to allocate
kernel args. This patch changes this to use only the kernarg pool which
has non-zero size. This logic is also reworked to not use any globals.

Reviewed By: JonChesterfield

Differential Revision: https://reviews.llvm.org/D103600
---
 openmp/libomptarget/plugins/amdgpu/impl/internal.h |  2 -
 openmp/libomptarget/plugins/amdgpu/impl/system.cpp |  6 --
 openmp/libomptarget/plugins/amdgpu/src/rtl.cpp     | 94 ++++++++++++++++++++--
 3 files changed, 86 insertions(+), 16 deletions(-)
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/internal.h b/openmp/libomptarget/plugins/amdgpu/impl/internal.h
index 7a8f0fd..0cdd5ee 100644
--- a/openmp/libomptarget/plugins/amdgpu/impl/internal.h
+++ b/openmp/libomptarget/plugins/amdgpu/impl/internal.h
@@ -181,8 +181,6 @@ private:
   };
 };
 
-extern std::vector<hsa_amd_memory_pool_t> atl_gpu_kernarg_pools;
-
 namespace core {
 hsa_status_t atl_init_gpu_context();
 
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/system.cpp b/openmp/libomptarget/plugins/amdgpu/impl/system.cpp
index 85e79dc..9377e3a 100644
--- a/openmp/libomptarget/plugins/amdgpu/impl/system.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/impl/system.cpp
@@ -142,8 +142,6 @@ static const std::map<std::string, KernelArgMD::ValueKind> ArgValueKind = {
 
 ATLMachine g_atl_machine;
 
-std::vector<hsa_amd_memory_pool_t> atl_gpu_kernarg_pools;
-
 /*
    atlc is all internal global values.
    The structure atl_context_t is defined in atl_internal.h
@@ -198,10 +196,6 @@ static hsa_status_t get_memory_pool_info(hsa_amd_memory_pool_t memory_pool,
     if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED & global_flag) {
       ATLMemory new_mem(memory_pool, *proc, ATMI_MEMTYPE_FINE_GRAINED);
       proc->addMemory(new_mem);
-      if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT & global_flag) {
-        DEBUG_PRINT("GPU kernel args pool handle: %lu\n", memory_pool.handle);
-        atl_gpu_kernarg_pools.push_back(memory_pool);
-      }
     } else {
       ATLMemory new_mem(memory_pool, *proc, ATMI_MEMTYPE_COARSE_GRAINED);
       proc->addMemory(new_mem);
diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
index 344c1be..89345b6 100644
--- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
@@ -37,6 +37,7 @@
 
 #include "Debug.h"
 #include "get_elf_mach_gfx_name.h"
+#include "machine.h"
 #include "omptargetplugin.h"
 #include "print_tracing.h"
 
@@ -136,15 +137,15 @@ public:
   KernelArgPool(const KernelArgPool &) = delete;
   KernelArgPool(KernelArgPool &&) = delete;
 
-  KernelArgPool(uint32_t kernarg_segment_size)
+  KernelArgPool(uint32_t kernarg_segment_size,
+                hsa_amd_memory_pool_t &memory_pool)
       : kernarg_segment_size(kernarg_segment_size) {
 
     // atmi uses one pool per kernel for all gpus, with a fixed upper size
     // preserving that exact scheme here, including the queue<int>
 
     hsa_status_t err = hsa_amd_memory_pool_allocate(
-        atl_gpu_kernarg_pools[0],
-        kernarg_size_including_implicit() * MAX_NUM_KERNELS, 0,
+        memory_pool, kernarg_size_including_implicit() * MAX_NUM_KERNELS, 0,
         &kernarg_region);
 
     if (err != HSA_STATUS_SUCCESS) {
@@ -224,7 +225,8 @@ struct KernelTy {
 
   KernelTy(int8_t _ExecutionMode, int16_t _ConstWGSize, int32_t _device_id,
            void *_CallStackAddr, const char *_Name,
-           uint32_t _kernarg_segment_size)
+           uint32_t _kernarg_segment_size,
+           hsa_amd_memory_pool_t &KernArgMemoryPool)
       : ExecutionMode(_ExecutionMode), ConstWGSize(_ConstWGSize),
         device_id(_device_id), CallStackAddr(_CallStackAddr), Name(_Name) {
     DP("Construct kernelinfo: ExecMode %d\n", ExecutionMode);
@@ -232,8 +234,8 @@ struct KernelTy {
     std::string N(_Name);
     if (KernelArgPoolMap.find(N) == KernelArgPoolMap.end()) {
       KernelArgPoolMap.insert(
-          std::make_pair(N, std::unique_ptr<KernelArgPool>(
-                                new KernelArgPool(_kernarg_segment_size))));
+          std::make_pair(N, std::unique_ptr<KernelArgPool>(new KernelArgPool(
+                                _kernarg_segment_size, KernArgMemoryPool))));
     }
   }
 };
@@ -297,6 +299,74 @@ uint16_t create_header() {
   header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
   return header;
 }
+
+hsa_status_t addKernArgPool(hsa_amd_memory_pool_t MemoryPool, void *Data) {
+  std::vector<hsa_amd_memory_pool_t> *Result =
+      static_cast<std::vector<hsa_amd_memory_pool_t> *>(Data);
+  bool AllocAllowed = false;
+  hsa_status_t err = hsa_amd_memory_pool_get_info(
+      MemoryPool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED,
+      &AllocAllowed);
+  if (err != HSA_STATUS_SUCCESS) {
+    fprintf(stderr, "Alloc allowed in memory pool check failed: %s\n",
+            get_error_string(err));
+    return err;
+  }
+
+  if (!AllocAllowed) {
+    // nothing needs to be done here.
+    return HSA_STATUS_SUCCESS;
+  }
+
+  uint32_t GlobalFlags = 0;
+  err = hsa_amd_memory_pool_get_info(
+      MemoryPool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags);
+  if (err != HSA_STATUS_SUCCESS) {
+    fprintf(stderr, "Get memory pool info failed: %s\n", get_error_string(err));
+    return err;
+  }
+
+  fprintf(stderr, "Flags : %d\n", GlobalFlags);
+  if ((GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) &&
+      (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT)) {
+    size_t size = 0;
+    err = hsa_amd_memory_pool_get_info(MemoryPool,
+                                       HSA_AMD_MEMORY_POOL_INFO_SIZE, &size);
+    if (err != HSA_STATUS_SUCCESS) {
+      fprintf(stderr, "Get memory pool size failed: %s\n",
+              get_error_string(err));
+      return err;
+    }
+    if (size > 0)
+      Result->push_back(MemoryPool);
+  }
+
+  return HSA_STATUS_SUCCESS;
+}
+
+std::pair<hsa_status_t, hsa_amd_memory_pool_t>
+FindKernargPool(const std::vector<hsa_agent_t> &HSAAgents) {
+  std::vector<hsa_amd_memory_pool_t> KernArgPools;
+  for (const auto &processor : g_atl_machine.processors<ATLCPUProcessor>()) {
+    hsa_agent_t Agent = processor.agent();
+    hsa_status_t err = HSA_STATUS_SUCCESS;
+    err = hsa_amd_agent_iterate_memory_pools(
+        Agent, addKernArgPool, static_cast<void *>(&KernArgPools));
+    if (err != HSA_STATUS_SUCCESS) {
+      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
+             "Iterate all memory pools", get_error_string(err));
+      return {err, hsa_amd_memory_pool_t{}};
+    }
+  }
+
+  if (KernArgPools.empty()) {
+    fprintf(stderr, "Unable to find any valid kernarg pool\n");
+    return {HSA_STATUS_ERROR, hsa_amd_memory_pool_t{}};
+  }
+
+  return {HSA_STATUS_SUCCESS, KernArgPools[0]};
+}
+
 } // namespace
 } // namespace core
 
@@ -344,6 +414,8 @@ public:
   std::vector<std::map<std::string, atl_kernel_info_t>> KernelInfoTable;
   std::vector<std::map<std::string, atl_symbol_info_t>> SymbolInfoTable;
 
+  hsa_amd_memory_pool_t KernArgPool;
+
   struct atmiFreePtrDeletor {
     void operator()(void *p) {
       core::Runtime::Memfree(p); // ignore failure to free
@@ -477,6 +549,12 @@ public:
       DP("There are %d devices supporting HSA.\n", NumberOfDevices);
     }
 
+    std::tie(err, KernArgPool) = core::FindKernargPool(HSAAgents);
+    if (err != HSA_STATUS_SUCCESS) {
+      DP("Error when reading memory pools\n");
+      return;
+    }
+
     // Init the device info
     HSAQueues.resize(NumberOfDevices);
     FuncGblEntries.resize(NumberOfDevices);
@@ -1543,8 +1621,8 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t device_id,
     }
 
     KernelsList.push_back(KernelTy(ExecModeVal, WGSizeVal, device_id,
-                                   CallStackAddr, e->name,
-                                   kernarg_segment_size));
+                                   CallStackAddr, e->name, kernarg_segment_size,
+                                   DeviceInfo.KernArgPool));
     __tgt_offload_entry entry = *e;
     entry.addr = (void *)&KernelsList.back();
     DeviceInfo.addOffloadEntry(device_id, entry);
-- 
2.7.4