From f5f329a37175c3a30e9d8b20b95fced96b9dde8c Mon Sep 17 00:00:00 2001 From: Pushpinder Singh Date: Thu, 3 Jun 2021 08:58:10 +0000 Subject: [PATCH] [AMDGPU][Libomptarget] Rework logic for locating kernarg pools Previous logic was to always use the first kernarg pool found to allocate kernel args. This patch changes this to use only the kernarg pool which has non-zero size. This logic is also reworked to not use any globals. Reviewed By: JonChesterfield Differential Revision: https://reviews.llvm.org/D103600 --- openmp/libomptarget/plugins/amdgpu/impl/internal.h | 2 - openmp/libomptarget/plugins/amdgpu/impl/system.cpp | 6 -- openmp/libomptarget/plugins/amdgpu/src/rtl.cpp | 94 ++++++++++++++++++++-- 3 files changed, 86 insertions(+), 16 deletions(-) diff --git a/openmp/libomptarget/plugins/amdgpu/impl/internal.h b/openmp/libomptarget/plugins/amdgpu/impl/internal.h index 7a8f0fd..0cdd5ee 100644 --- a/openmp/libomptarget/plugins/amdgpu/impl/internal.h +++ b/openmp/libomptarget/plugins/amdgpu/impl/internal.h @@ -181,8 +181,6 @@ private: }; }; -extern std::vector atl_gpu_kernarg_pools; - namespace core { hsa_status_t atl_init_gpu_context(); diff --git a/openmp/libomptarget/plugins/amdgpu/impl/system.cpp b/openmp/libomptarget/plugins/amdgpu/impl/system.cpp index 85e79dc..9377e3a 100644 --- a/openmp/libomptarget/plugins/amdgpu/impl/system.cpp +++ b/openmp/libomptarget/plugins/amdgpu/impl/system.cpp @@ -142,8 +142,6 @@ static const std::map ArgValueKind = { ATLMachine g_atl_machine; -std::vector atl_gpu_kernarg_pools; - /* atlc is all internal global values. The structure atl_context_t is defined in atl_internal.h @@ -198,10 +196,6 @@ static hsa_status_t get_memory_pool_info(hsa_amd_memory_pool_t memory_pool, if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED & global_flag) { ATLMemory new_mem(memory_pool, *proc, ATMI_MEMTYPE_FINE_GRAINED); proc->addMemory(new_mem); - if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT & global_flag) { - DEBUG_PRINT("GPU kernel args pool handle: %lu\n", memory_pool.handle); - atl_gpu_kernarg_pools.push_back(memory_pool); - } } else { ATLMemory new_mem(memory_pool, *proc, ATMI_MEMTYPE_COARSE_GRAINED); proc->addMemory(new_mem); diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp index 344c1be..89345b6 100644 --- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp @@ -37,6 +37,7 @@ #include "Debug.h" #include "get_elf_mach_gfx_name.h" +#include "machine.h" #include "omptargetplugin.h" #include "print_tracing.h" @@ -136,15 +137,15 @@ public: KernelArgPool(const KernelArgPool &) = delete; KernelArgPool(KernelArgPool &&) = delete; - KernelArgPool(uint32_t kernarg_segment_size) + KernelArgPool(uint32_t kernarg_segment_size, + hsa_amd_memory_pool_t &memory_pool) : kernarg_segment_size(kernarg_segment_size) { // atmi uses one pool per kernel for all gpus, with a fixed upper size // preserving that exact scheme here, including the queue hsa_status_t err = hsa_amd_memory_pool_allocate( - atl_gpu_kernarg_pools[0], - kernarg_size_including_implicit() * MAX_NUM_KERNELS, 0, + memory_pool, kernarg_size_including_implicit() * MAX_NUM_KERNELS, 0, &kernarg_region); if (err != HSA_STATUS_SUCCESS) { @@ -224,7 +225,8 @@ struct KernelTy { KernelTy(int8_t _ExecutionMode, int16_t _ConstWGSize, int32_t _device_id, void *_CallStackAddr, const char *_Name, - uint32_t _kernarg_segment_size) + uint32_t _kernarg_segment_size, + hsa_amd_memory_pool_t &KernArgMemoryPool) : ExecutionMode(_ExecutionMode), ConstWGSize(_ConstWGSize), device_id(_device_id), CallStackAddr(_CallStackAddr), Name(_Name) { DP("Construct kernelinfo: ExecMode %d\n", ExecutionMode); @@ -232,8 +234,8 @@ struct KernelTy { std::string N(_Name); if (KernelArgPoolMap.find(N) == KernelArgPoolMap.end()) { KernelArgPoolMap.insert( - std::make_pair(N, std::unique_ptr( - new KernelArgPool(_kernarg_segment_size)))); + std::make_pair(N, std::unique_ptr(new KernelArgPool( + _kernarg_segment_size, KernArgMemoryPool)))); } } }; @@ -297,6 +299,74 @@ uint16_t create_header() { header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE; return header; } + +hsa_status_t addKernArgPool(hsa_amd_memory_pool_t MemoryPool, void *Data) { + std::vector *Result = + static_cast *>(Data); + bool AllocAllowed = false; + hsa_status_t err = hsa_amd_memory_pool_get_info( + MemoryPool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED, + &AllocAllowed); + if (err != HSA_STATUS_SUCCESS) { + fprintf(stderr, "Alloc allowed in memory pool check failed: %s\n", + get_error_string(err)); + return err; + } + + if (!AllocAllowed) { + // nothing needs to be done here. + return HSA_STATUS_SUCCESS; + } + + uint32_t GlobalFlags = 0; + err = hsa_amd_memory_pool_get_info( + MemoryPool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags); + if (err != HSA_STATUS_SUCCESS) { + fprintf(stderr, "Get memory pool info failed: %s\n", get_error_string(err)); + return err; + } + + fprintf(stderr, "Flags : %d\n", GlobalFlags); + if ((GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) && + (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT)) { + size_t size = 0; + err = hsa_amd_memory_pool_get_info(MemoryPool, + HSA_AMD_MEMORY_POOL_INFO_SIZE, &size); + if (err != HSA_STATUS_SUCCESS) { + fprintf(stderr, "Get memory pool size failed: %s\n", + get_error_string(err)); + return err; + } + if (size > 0) + Result->push_back(MemoryPool); + } + + return HSA_STATUS_SUCCESS; +} + +std::pair +FindKernargPool(const std::vector &HSAAgents) { + std::vector KernArgPools; + for (const auto &processor : g_atl_machine.processors()) { + hsa_agent_t Agent = processor.agent(); + hsa_status_t err = HSA_STATUS_SUCCESS; + err = hsa_amd_agent_iterate_memory_pools( + Agent, addKernArgPool, static_cast(&KernArgPools)); + if (err != HSA_STATUS_SUCCESS) { + printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, + "Iterate all memory pools", get_error_string(err)); + return {err, hsa_amd_memory_pool_t{}}; + } + } + + if (KernArgPools.empty()) { + fprintf(stderr, "Unable to find any valid kernarg pool\n"); + return {HSA_STATUS_ERROR, hsa_amd_memory_pool_t{}}; + } + + return {HSA_STATUS_SUCCESS, KernArgPools[0]}; +} + } // namespace } // namespace core @@ -344,6 +414,8 @@ public: std::vector> KernelInfoTable; std::vector> SymbolInfoTable; + hsa_amd_memory_pool_t KernArgPool; + struct atmiFreePtrDeletor { void operator()(void *p) { core::Runtime::Memfree(p); // ignore failure to free @@ -477,6 +549,12 @@ public: DP("There are %d devices supporting HSA.\n", NumberOfDevices); } + std::tie(err, KernArgPool) = core::FindKernargPool(HSAAgents); + if (err != HSA_STATUS_SUCCESS) { + DP("Error when reading memory pools\n"); + return; + } + // Init the device info HSAQueues.resize(NumberOfDevices); FuncGblEntries.resize(NumberOfDevices); @@ -1543,8 +1621,8 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t device_id, } KernelsList.push_back(KernelTy(ExecModeVal, WGSizeVal, device_id, - CallStackAddr, e->name, - kernarg_segment_size)); + CallStackAddr, e->name, kernarg_segment_size, + DeviceInfo.KernArgPool)); __tgt_offload_entry entry = *e; entry.addr = (void *)&KernelsList.back(); DeviceInfo.addOffloadEntry(device_id, entry); -- 2.7.4