[AMDGPU][Libomptarget] Rework logic for locating kernarg pools
author     Pushpinder Singh <Pushpinder.Singh@amd.com>
           Thu, 3 Jun 2021 08:58:10 +0000 (08:58 +0000)
committer  Pushpinder Singh <Pushpinder.Singh@amd.com>
           Mon, 7 Jun 2021 06:41:37 +0000 (06:41 +0000)
The previous logic always used the first kernarg pool found to allocate
kernel args. This patch changes it to use only a kernarg pool that has a
non-zero size. The logic is also reworked to not rely on any globals.
A sketch of the resulting selection checks follows the file list below.

Reviewed By: JonChesterfield

Differential Revision: https://reviews.llvm.org/D103600

openmp/libomptarget/plugins/amdgpu/impl/internal.h
openmp/libomptarget/plugins/amdgpu/impl/system.cpp
openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
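
The selection mentioned above reduces to three hsa_amd_memory_pool_get_info
queries per pool. The following is a minimal standalone sketch, assuming a
typical ROCm include path; isUsableKernargPool is a hypothetical name used
only for illustration, not a symbol introduced by this patch:

#include <cstddef>
#include <cstdint>
#include <hsa/hsa_ext_amd.h>

// Minimal sketch (not part of the patch): the checks FindKernargPool
// applies to each memory pool, collected into one free function.
static bool isUsableKernargPool(hsa_amd_memory_pool_t Pool) {
  // The runtime must be allowed to allocate from the pool at all.
  bool AllocAllowed = false;
  if (hsa_amd_memory_pool_get_info(
          Pool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED,
          &AllocAllowed) != HSA_STATUS_SUCCESS ||
      !AllocAllowed)
    return false;

  // Kernarg pools are fine grained and carry the KERNARG_INIT flag.
  uint32_t GlobalFlags = 0;
  if (hsa_amd_memory_pool_get_info(Pool,
                                   HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS,
                                   &GlobalFlags) != HSA_STATUS_SUCCESS)
    return false;
  if (!(GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) ||
      !(GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT))
    return false;

  // The non-zero size requirement is what this patch adds over the old
  // "first pool found wins" behaviour.
  size_t Size = 0;
  if (hsa_amd_memory_pool_get_info(Pool, HSA_AMD_MEMORY_POOL_INFO_SIZE,
                                   &Size) != HSA_STATUS_SUCCESS)
    return false;
  return Size > 0;
}

FindKernargPool in the diff below applies equivalent checks while iterating
the agents' pools and keeps the first pool that passes.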

diff --git a/openmp/libomptarget/plugins/amdgpu/impl/internal.h b/openmp/libomptarget/plugins/amdgpu/impl/internal.h
index 7a8f0fd..0cdd5ee 100644
--- a/openmp/libomptarget/plugins/amdgpu/impl/internal.h
+++ b/openmp/libomptarget/plugins/amdgpu/impl/internal.h
@@ -181,8 +181,6 @@ private:
   };
 };
 
-extern std::vector<hsa_amd_memory_pool_t> atl_gpu_kernarg_pools;
-
 namespace core {
 hsa_status_t atl_init_gpu_context();
 
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/system.cpp b/openmp/libomptarget/plugins/amdgpu/impl/system.cpp
index 85e79dc..9377e3a 100644
--- a/openmp/libomptarget/plugins/amdgpu/impl/system.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/impl/system.cpp
@@ -142,8 +142,6 @@ static const std::map<std::string, KernelArgMD::ValueKind> ArgValueKind = {
 
 ATLMachine g_atl_machine;
 
-std::vector<hsa_amd_memory_pool_t> atl_gpu_kernarg_pools;
-
 /*
    atlc is all internal global values.
    The structure atl_context_t is defined in atl_internal.h
@@ -198,10 +196,6 @@ static hsa_status_t get_memory_pool_info(hsa_amd_memory_pool_t memory_pool,
     if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED & global_flag) {
       ATLMemory new_mem(memory_pool, *proc, ATMI_MEMTYPE_FINE_GRAINED);
       proc->addMemory(new_mem);
-      if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT & global_flag) {
-        DEBUG_PRINT("GPU kernel args pool handle: %lu\n", memory_pool.handle);
-        atl_gpu_kernarg_pools.push_back(memory_pool);
-      }
     } else {
       ATLMemory new_mem(memory_pool, *proc, ATMI_MEMTYPE_COARSE_GRAINED);
       proc->addMemory(new_mem);
diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
index 344c1be..89345b6 100644
--- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
@@ -37,6 +37,7 @@
 
 #include "Debug.h"
 #include "get_elf_mach_gfx_name.h"
+#include "machine.h"
 #include "omptargetplugin.h"
 #include "print_tracing.h"
 
@@ -136,15 +137,15 @@ public:
   KernelArgPool(const KernelArgPool &) = delete;
   KernelArgPool(KernelArgPool &&) = delete;
 
-  KernelArgPool(uint32_t kernarg_segment_size)
+  KernelArgPool(uint32_t kernarg_segment_size,
+                hsa_amd_memory_pool_t &memory_pool)
       : kernarg_segment_size(kernarg_segment_size) {
 
     // atmi uses one pool per kernel for all gpus, with a fixed upper size
     // preserving that exact scheme here, including the queue<int>
 
     hsa_status_t err = hsa_amd_memory_pool_allocate(
-        atl_gpu_kernarg_pools[0],
-        kernarg_size_including_implicit() * MAX_NUM_KERNELS, 0,
+        memory_pool, kernarg_size_including_implicit() * MAX_NUM_KERNELS, 0,
         &kernarg_region);
 
     if (err != HSA_STATUS_SUCCESS) {
@@ -224,7 +225,8 @@ struct KernelTy {
 
   KernelTy(int8_t _ExecutionMode, int16_t _ConstWGSize, int32_t _device_id,
            void *_CallStackAddr, const char *_Name,
-           uint32_t _kernarg_segment_size)
+           uint32_t _kernarg_segment_size,
+           hsa_amd_memory_pool_t &KernArgMemoryPool)
       : ExecutionMode(_ExecutionMode), ConstWGSize(_ConstWGSize),
         device_id(_device_id), CallStackAddr(_CallStackAddr), Name(_Name) {
     DP("Construct kernelinfo: ExecMode %d\n", ExecutionMode);
@@ -232,8 +234,8 @@ struct KernelTy {
     std::string N(_Name);
     if (KernelArgPoolMap.find(N) == KernelArgPoolMap.end()) {
       KernelArgPoolMap.insert(
-          std::make_pair(N, std::unique_ptr<KernelArgPool>(
-                                new KernelArgPool(_kernarg_segment_size))));
+          std::make_pair(N, std::unique_ptr<KernelArgPool>(new KernelArgPool(
+                                _kernarg_segment_size, KernArgMemoryPool))));
     }
   }
 };
@@ -297,6 +299,73 @@ uint16_t create_header() {
   header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
   return header;
 }
+
+hsa_status_t addKernArgPool(hsa_amd_memory_pool_t MemoryPool, void *Data) {
+  std::vector<hsa_amd_memory_pool_t> *Result =
+      static_cast<std::vector<hsa_amd_memory_pool_t> *>(Data);
+  bool AllocAllowed = false;
+  hsa_status_t err = hsa_amd_memory_pool_get_info(
+      MemoryPool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED,
+      &AllocAllowed);
+  if (err != HSA_STATUS_SUCCESS) {
+    fprintf(stderr, "Alloc allowed in memory pool check failed: %s\n",
+            get_error_string(err));
+    return err;
+  }
+
+  if (!AllocAllowed) {
+    // nothing needs to be done here.
+    return HSA_STATUS_SUCCESS;
+  }
+
+  uint32_t GlobalFlags = 0;
+  err = hsa_amd_memory_pool_get_info(
+      MemoryPool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags);
+  if (err != HSA_STATUS_SUCCESS) {
+    fprintf(stderr, "Get memory pool info failed: %s\n", get_error_string(err));
+    return err;
+  }
+
+  fprintf(stderr, "Flags : %d\n", GlobalFlags);
+  if ((GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) &&
+      (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT)) {
+    size_t size = 0;
+    err = hsa_amd_memory_pool_get_info(MemoryPool,
+                                       HSA_AMD_MEMORY_POOL_INFO_SIZE, &size);
+    if (err != HSA_STATUS_SUCCESS) {
+      fprintf(stderr, "Get memory pool size failed: %s\n",
+              get_error_string(err));
+      return err;
+    }
+    if (size > 0)
+      Result->push_back(MemoryPool);
+  }
+
+  return HSA_STATUS_SUCCESS;
+}
+
+std::pair<hsa_status_t, hsa_amd_memory_pool_t>
+FindKernargPool(const std::vector<hsa_agent_t> &HSAAgents) {
+  std::vector<hsa_amd_memory_pool_t> KernArgPools;
+  for (const auto &processor : g_atl_machine.processors<ATLCPUProcessor>()) {
+    hsa_agent_t Agent = processor.agent();
+    hsa_status_t err = HSA_STATUS_SUCCESS;
+    err = hsa_amd_agent_iterate_memory_pools(
+        Agent, addKernArgPool, static_cast<void *>(&KernArgPools));
+    if (err != HSA_STATUS_SUCCESS) {
+      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
+             "Iterate all memory pools", get_error_string(err));
+      return {err, hsa_amd_memory_pool_t{}};
+    }
+  }
+
+  if (KernArgPools.empty()) {
+    fprintf(stderr, "Unable to find any valid kernarg pool\n");
+    return {HSA_STATUS_ERROR, hsa_amd_memory_pool_t{}};
+  }
+
+  return {HSA_STATUS_SUCCESS, KernArgPools[0]};
+}
+
 } // namespace
 } // namespace core
 
@@ -344,6 +413,8 @@ public:
   std::vector<std::map<std::string, atl_kernel_info_t>> KernelInfoTable;
   std::vector<std::map<std::string, atl_symbol_info_t>> SymbolInfoTable;
 
+  hsa_amd_memory_pool_t KernArgPool;
+
   struct atmiFreePtrDeletor {
     void operator()(void *p) {
       core::Runtime::Memfree(p); // ignore failure to free
@@ -477,6 +548,12 @@ public:
       DP("There are %d devices supporting HSA.\n", NumberOfDevices);
     }
 
+    std::tie(err, KernArgPool) = core::FindKernargPool(HSAAgents);
+    if (err != HSA_STATUS_SUCCESS) {
+      DP("Error when reading memory pools\n");
+      return;
+    }
+
     // Init the device info
     HSAQueues.resize(NumberOfDevices);
     FuncGblEntries.resize(NumberOfDevices);
@@ -1543,8 +1620,8 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t device_id,
     }
 
     KernelsList.push_back(KernelTy(ExecModeVal, WGSizeVal, device_id,
-                                   CallStackAddr, e->name,
-                                   kernarg_segment_size));
+                                   CallStackAddr, e->name, kernarg_segment_size,
+                                   DeviceInfo.KernArgPool));
     __tgt_offload_entry entry = *e;
     entry.addr = (void *)&KernelsList.back();
     DeviceInfo.addOffloadEntry(device_id, entry);
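
Usage note: once FindKernargPool has selected a pool, each KernelArgPool
allocates its slab directly from it via hsa_amd_memory_pool_allocate, as in
the constructor above. A minimal sketch under the same assumptions as the
earlier one (allocateKernargSlab is a hypothetical helper; the Bytes
parameter stands in for kernarg_size_including_implicit() * MAX_NUM_KERNELS):

#include <cstdio>
#include <hsa/hsa_ext_amd.h>

// Illustrative only: mirrors the hsa_amd_memory_pool_allocate call made in
// the KernelArgPool constructor above. Returns nullptr on failure.
static void *allocateKernargSlab(hsa_amd_memory_pool_t KernargPool,
                                 size_t Bytes) {
  void *Region = nullptr;
  hsa_status_t Err =
      hsa_amd_memory_pool_allocate(KernargPool, Bytes, /*flags=*/0, &Region);
  if (Err != HSA_STATUS_SUCCESS) {
    fprintf(stderr, "kernarg slab allocation failed\n");
    return nullptr;
  }
  return Region;
}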