From 3153bdd547c393b6b58e7cfaa5a40b246bf4218e Mon Sep 17 00:00:00 2001 From: Jon Chesterfield Date: Thu, 2 Sep 2021 11:02:37 +0100 Subject: [PATCH] [libomptarget][amdgpu] Drop env variables Use the same debug print as the rest of libomptarget plugins with the same environment control. Also drop the max queue size debugging hook as I don't believe it is still in use, can bring it back near the rest of the env handling in rtl.cpp if someone objects. That makes most of rt.h and all of utils.cpp unused. Clean that up and simplify control flow in a couple of places. Behaviour change is that debug prints that used to use the old environment variable now use the new one and print in slightly different format, and the removal of the max queue size variable. Reviewed By: pdhaliwal Differential Revision: https://reviews.llvm.org/D108784 --- openmp/libomptarget/plugins/amdgpu/CMakeLists.txt | 1 - openmp/libomptarget/plugins/amdgpu/impl/data.cpp | 20 +++--- openmp/libomptarget/plugins/amdgpu/impl/impl.cpp | 10 ++- openmp/libomptarget/plugins/amdgpu/impl/internal.h | 43 ++++-------- openmp/libomptarget/plugins/amdgpu/impl/rt.h | 80 +++++----------------- openmp/libomptarget/plugins/amdgpu/impl/system.cpp | 43 ++++++------ openmp/libomptarget/plugins/amdgpu/impl/utils.cpp | 39 ----------- openmp/libomptarget/plugins/amdgpu/src/rtl.cpp | 30 +++----- 8 files changed, 69 insertions(+), 197 deletions(-) delete mode 100644 openmp/libomptarget/plugins/amdgpu/impl/utils.cpp diff --git a/openmp/libomptarget/plugins/amdgpu/CMakeLists.txt b/openmp/libomptarget/plugins/amdgpu/CMakeLists.txt index 72cca26..29887c2 100644 --- a/openmp/libomptarget/plugins/amdgpu/CMakeLists.txt +++ b/openmp/libomptarget/plugins/amdgpu/CMakeLists.txt @@ -67,7 +67,6 @@ add_library(omptarget.rtl.amdgpu SHARED impl/data.cpp impl/get_elf_mach_gfx_name.cpp impl/system.cpp - impl/utils.cpp impl/msgpack.cpp src/rtl.cpp ${LIBOMPTARGET_EXTRA_SOURCE} diff --git a/openmp/libomptarget/plugins/amdgpu/impl/data.cpp b/openmp/libomptarget/plugins/amdgpu/impl/data.cpp index 7cd7e4d..67942a8 100644 --- a/openmp/libomptarget/plugins/amdgpu/impl/data.cpp +++ b/openmp/libomptarget/plugins/amdgpu/impl/data.cpp @@ -17,23 +17,21 @@ using core::TaskImpl; namespace core { - -hsa_status_t Runtime::HostMalloc(void **ptr, size_t size, - hsa_amd_memory_pool_t MemoryPool) { +namespace Runtime { +hsa_status_t HostMalloc(void **ptr, size_t size, + hsa_amd_memory_pool_t MemoryPool) { hsa_status_t err = hsa_amd_memory_pool_allocate(MemoryPool, size, 0, ptr); - DEBUG_PRINT("Malloced %p\n", *ptr); - + DP("Malloced %p\n", *ptr); if (err == HSA_STATUS_SUCCESS) { err = core::allow_access_to_all_gpu_agents(*ptr); } - return (err == HSA_STATUS_SUCCESS) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR; + return err; } -hsa_status_t Runtime::Memfree(void *ptr) { +hsa_status_t Memfree(void *ptr) { hsa_status_t err = hsa_amd_memory_pool_free(ptr); - DEBUG_PRINT("Freed %p\n", ptr); - - return (err == HSA_STATUS_SUCCESS) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR; + DP("Freed %p\n", ptr); + return err; } - +} // namespace Runtime } // namespace core diff --git a/openmp/libomptarget/plugins/amdgpu/impl/impl.cpp b/openmp/libomptarget/plugins/amdgpu/impl/impl.cpp index e3f9f71..7f841aa 100644 --- a/openmp/libomptarget/plugins/amdgpu/impl/impl.cpp +++ b/openmp/libomptarget/plugins/amdgpu/impl/impl.cpp @@ -5,8 +5,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -#include "impl_runtime.h" #include "hsa_api.h" +#include "impl_runtime.h" #include "internal.h" #include "rt.h" #include @@ -32,7 +32,7 @@ static hsa_status_t invoke_hsa_copy(hsa_signal_t sig, void *dest, hsa_signal_value_t got = init; while (got == init) { got = hsa_signal_wait_scacquire(sig, HSA_SIGNAL_CONDITION_NE, init, - UINT64_MAX, ATMI_WAIT_STATE); + UINT64_MAX, HSA_WAIT_STATE_BLOCKED); } if (got != success) { @@ -64,8 +64,7 @@ hsa_status_t impl_memcpy_h2d(hsa_signal_t signal, void *deviceDest, void *tempHostPtr; hsa_status_t ret = core::Runtime::HostMalloc(&tempHostPtr, size, MemoryPool); if (ret != HSA_STATUS_SUCCESS) { - DEBUG_PRINT("HostMalloc: Unable to alloc %zu bytes for temp scratch\n", - size); + DP("HostMalloc: Unable to alloc %zu bytes for temp scratch\n", size); return ret; } std::unique_ptr del(tempHostPtr); @@ -94,8 +93,7 @@ hsa_status_t impl_memcpy_d2h(hsa_signal_t signal, void *dest, void *tempHostPtr; hsa_status_t ret = core::Runtime::HostMalloc(&tempHostPtr, size, MemoryPool); if (ret != HSA_STATUS_SUCCESS) { - DEBUG_PRINT("HostMalloc: Unable to alloc %zu bytes for temp scratch\n", - size); + DP("HostMalloc: Unable to alloc %zu bytes for temp scratch\n", size); return ret; } std::unique_ptr del(tempHostPtr); diff --git a/openmp/libomptarget/plugins/amdgpu/impl/internal.h b/openmp/libomptarget/plugins/amdgpu/impl/internal.h index e5cf047..8bca165 100644 --- a/openmp/libomptarget/plugins/amdgpu/impl/internal.h +++ b/openmp/libomptarget/plugins/amdgpu/impl/internal.h @@ -24,7 +24,12 @@ #include "hsa_api.h" #include "impl_runtime.h" -#include "rt.h" + +#ifndef TARGET_NAME +#error "Missing TARGET_NAME macro" +#endif +#define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL" +#include "Debug.h" #define MAX_NUM_KERNELS (1024 * 16) @@ -41,34 +46,6 @@ typedef struct impl_implicit_args_s { unsigned long kernarg_template_ptr; } impl_implicit_args_t; -extern "C" { - -#ifdef DEBUG -#define DEBUG_PRINT(fmt, ...) \ - if (core::Runtime::getInstance().getDebugMode()) { \ - fprintf(stderr, "[%s:%d] " fmt, __FILE__, __LINE__, ##__VA_ARGS__); \ - } -#else -#define DEBUG_PRINT(...) \ - do { \ - } while (false) -#endif - -#ifndef HSA_RUNTIME_INC_HSA_H_ -typedef struct hsa_signal_s { - uint64_t handle; -} hsa_signal_t; -#endif - -} - -/* --------------------------------------------------------------------------------- - * Simulated CPU Data Structures and API - * --------------------------------------------------------------------------------- - */ - -#define ATMI_WAIT_STATE HSA_WAIT_STATE_BLOCKED - // ---------------------- Kernel Start ------------- typedef struct atl_kernel_info_s { uint64_t kernel_object; @@ -110,7 +87,7 @@ struct SignalPoolT { state.pop(); hsa_status_t rc = hsa_signal_destroy(signal); if (rc != HSA_STATUS_SUCCESS) { - DEBUG_PRINT("Signal pool destruction failed\n"); + DP("Signal pool destruction failed\n"); } } } @@ -183,6 +160,10 @@ bool handle_group_signal(hsa_signal_value_t value, void *arg); hsa_status_t allow_access_to_all_gpu_agents(void *ptr); } // namespace core -const char *get_error_string(hsa_status_t err); +inline const char *get_error_string(hsa_status_t err) { + const char *res; + hsa_status_t rc = hsa_status_string(err, &res); + return (rc == HSA_STATUS_SUCCESS) ? res : "HSA_STATUS UNKNOWN."; +} #endif // SRC_RUNTIME_INCLUDE_INTERNAL_H_ diff --git a/openmp/libomptarget/plugins/amdgpu/impl/rt.h b/openmp/libomptarget/plugins/amdgpu/impl/rt.h index 9d98735..8b09a84 100644 --- a/openmp/libomptarget/plugins/amdgpu/impl/rt.h +++ b/openmp/libomptarget/plugins/amdgpu/impl/rt.h @@ -8,74 +8,26 @@ #ifndef SRC_RUNTIME_INCLUDE_RT_H_ #define SRC_RUNTIME_INCLUDE_RT_H_ -#include "impl_runtime.h" #include "hsa_api.h" +#include "impl_runtime.h" +#include "internal.h" + #include namespace core { - -#define DEFAULT_MAX_QUEUE_SIZE 4096 -#define DEFAULT_DEBUG_MODE 0 -class Environment { -public: - Environment() - : max_queue_size_(DEFAULT_MAX_QUEUE_SIZE), - debug_mode_(DEFAULT_DEBUG_MODE) { - GetEnvAll(); - } - - void GetEnvAll(); - - int getMaxQueueSize() const { return max_queue_size_; } - int getDebugMode() const { return debug_mode_; } - -private: - std::string GetEnv(const char *name) { - char *env = getenv(name); - std::string ret; - if (env) { - ret = env; - } - return ret; - } - - int max_queue_size_; - int debug_mode_; -}; - -class Runtime final { -public: - static Runtime &getInstance() { - static Runtime instance; - return instance; - } - - // modules - static hsa_status_t RegisterModuleFromMemory( - void *, size_t, hsa_agent_t agent, - hsa_status_t (*on_deserialized_data)(void *data, size_t size, - void *cb_state), - void *cb_state, std::vector &HSAExecutables); - - // data - static hsa_status_t Memcpy(hsa_signal_t, void *, const void *, size_t); - static hsa_status_t Memfree(void *); - static hsa_status_t HostMalloc(void **ptr, size_t size, - hsa_amd_memory_pool_t MemoryPool); - - int getMaxQueueSize() const { return env_.getMaxQueueSize(); } - int getDebugMode() const { return env_.getDebugMode(); } - -protected: - Runtime() = default; - ~Runtime() = default; - Runtime(const Runtime &) = delete; - Runtime &operator=(const Runtime &) = delete; - -protected: - // variable to track environment variables - Environment env_; -}; +namespace Runtime { +hsa_status_t Memfree(void *); +hsa_status_t HostMalloc(void **ptr, size_t size, + hsa_amd_memory_pool_t MemoryPool); + +} // namespace Runtime +hsa_status_t RegisterModuleFromMemory( + std::map &KernelInfoTable, + std::map &SymbolInfoTable, + void *module_bytes, size_t module_size, hsa_agent_t agent, + hsa_status_t (*on_deserialized_data)(void *data, size_t size, + void *cb_state), + void *cb_state, std::vector &HSAExecutables); } // namespace core diff --git a/openmp/libomptarget/plugins/amdgpu/impl/system.cpp b/openmp/libomptarget/plugins/amdgpu/impl/system.cpp index 06a8d34..fd10bef 100644 --- a/openmp/libomptarget/plugins/amdgpu/impl/system.cpp +++ b/openmp/libomptarget/plugins/amdgpu/impl/system.cpp @@ -525,8 +525,8 @@ static hsa_status_t get_code_object_custom_metadata( size_t padding = new_offset - offset; offset = new_offset; info.arg_offsets.push_back(lcArg.offset_); - DEBUG_PRINT("Arg[%lu] \"%s\" (%u, %u)\n", i, lcArg.name_.c_str(), - lcArg.size_, lcArg.offset_); + DP("Arg[%lu] \"%s\" (%u, %u)\n", i, lcArg.name_.c_str(), lcArg.size_, + lcArg.offset_); offset += lcArg.size_; // check if the arg is a hidden/implicit arg @@ -541,13 +541,13 @@ static hsa_status_t get_code_object_custom_metadata( } // add size of implicit args, e.g.: offset x, y and z and pipe pointer, but - // in ATMI, do not count the compiler set implicit args, but set your own - // implicit args by discounting the compiler set implicit args + // do not count the compiler set implicit args, but set your own implicit + // args by discounting the compiler set implicit args info.kernel_segment_size = (hasHiddenArgs ? kernel_explicit_args_size : kernel_segment_size) + sizeof(impl_implicit_args_t); - DEBUG_PRINT("[%s: kernarg seg size] (%lu --> %u)\n", kernelName.c_str(), - kernel_segment_size, info.kernel_segment_size); + DP("[%s: kernarg seg size] (%lu --> %u)\n", kernelName.c_str(), + kernel_segment_size, info.kernel_segment_size); // kernel received, now add it to the kernel info table KernelInfoTable[kernelName] = info; @@ -571,7 +571,7 @@ populate_InfoTables(hsa_executable_symbol_t symbol, "Symbol info extraction", get_error_string(err)); return err; } - DEBUG_PRINT("Exec Symbol type: %d\n", type); + DP("Exec Symbol type: %d\n", type); if (type == HSA_SYMBOL_KIND_KERNEL) { err = hsa_executable_symbol_get_info( symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &name_length); @@ -636,11 +636,10 @@ populate_InfoTables(hsa_executable_symbol_t symbol, return err; } - DEBUG_PRINT( - "Kernel %s --> %lx symbol %u group segsize %u pvt segsize %u bytes " - "kernarg\n", - kernelName.c_str(), info.kernel_object, info.group_segment_size, - info.private_segment_size, info.kernel_segment_size); + DP("Kernel %s --> %lx symbol %u group segsize %u pvt segsize %u bytes " + "kernarg\n", + kernelName.c_str(), info.kernel_object, info.group_segment_size, + info.private_segment_size, info.kernel_segment_size); // assign it back to the kernel info table KernelInfoTable[kernelName] = info; @@ -681,12 +680,11 @@ populate_InfoTables(hsa_executable_symbol_t symbol, return err; } - DEBUG_PRINT("Symbol %s = %p (%u bytes)\n", name, (void *)info.addr, - info.size); + DP("Symbol %s = %p (%u bytes)\n", name, (void *)info.addr, info.size); SymbolInfoTable[std::string(name)] = info; free(name); } else { - DEBUG_PRINT("Symbol is an indirect function\n"); + DP("Symbol is an indirect function\n"); } return HSA_STATUS_SUCCESS; } @@ -730,9 +728,8 @@ hsa_status_t RegisterModuleFromMemory( err = get_code_object_custom_metadata(module_bytes, module_size, KernelInfoTable); if (err != HSA_STATUS_SUCCESS) { - DEBUG_PRINT("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Getting custom code object metadata", - get_error_string(err)); + DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, + "Getting custom code object metadata", get_error_string(err)); continue; } @@ -741,8 +738,8 @@ hsa_status_t RegisterModuleFromMemory( err = hsa_code_object_deserialize(module_bytes, module_size, NULL, &code_object); if (err != HSA_STATUS_SUCCESS) { - DEBUG_PRINT("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Code Object Deserialization", get_error_string(err)); + DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, + "Code Object Deserialization", get_error_string(err)); continue; } assert(0 != code_object.handle); @@ -763,8 +760,8 @@ hsa_status_t RegisterModuleFromMemory( err = hsa_executable_load_code_object(executable, agent, code_object, NULL); if (err != HSA_STATUS_SUCCESS) { - DEBUG_PRINT("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Loading the code object", get_error_string(err)); + DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, + "Loading the code object", get_error_string(err)); continue; } @@ -772,7 +769,7 @@ hsa_status_t RegisterModuleFromMemory( } module_load_success = true; } while (0); - DEBUG_PRINT("Modules loaded successful? %d\n", module_load_success); + DP("Modules loaded successful? %d\n", module_load_success); if (module_load_success) { /* Freeze the executable; it can now be queried for symbols. */ err = hsa_executable_freeze(executable, ""); diff --git a/openmp/libomptarget/plugins/amdgpu/impl/utils.cpp b/openmp/libomptarget/plugins/amdgpu/impl/utils.cpp deleted file mode 100644 index a0a9d41..0000000 --- a/openmp/libomptarget/plugins/amdgpu/impl/utils.cpp +++ /dev/null @@ -1,39 +0,0 @@ -//===--- amdgpu/impl/utils.cpp ------------------------------------ C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -#include "internal.h" -#include "rt.h" - -#include -#include - -const char *get_error_string(hsa_status_t err) { - const char *res; - hsa_status_t rc = hsa_status_string(err, &res); - return (rc == HSA_STATUS_SUCCESS) ? res : "HSA_STATUS UNKNOWN."; -} - -namespace core { -/* - * Environment variables - */ -void Environment::GetEnvAll() { - std::string var = GetEnv("ATMI_HELP"); - if (!var.empty()) { - printf("ATMI_MAX_HSA_QUEUE_SIZE : positive integer\n" - "ATMI_DEBUG : 1 for printing out trace/debug info\n"); - } - - var = GetEnv("ATMI_MAX_HSA_QUEUE_SIZE"); - if (!var.empty()) - max_queue_size_ = std::stoi(var); - - var = GetEnv("ATMI_DEBUG"); - if (!var.empty()) - debug_mode_ = std::stoi(var); -} -} // namespace core diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp index 2b131a2..bbf73b1 100644 --- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp @@ -28,18 +28,14 @@ #include "impl_runtime.h" #include "internal.h" +#include "rt.h" -#include "Debug.h" #include "get_elf_mach_gfx_name.h" #include "omptargetplugin.h" #include "print_tracing.h" #include "llvm/Frontend/OpenMP/OMPGridValues.h" -#ifndef TARGET_NAME -#error "Missing TARGET_NAME macro" -#endif -#define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL" // hostrpc interface, FIXME: consider moving to its own include these are // statically linked into amdgpu/plugin if present from hostrpc_services.a, @@ -84,16 +80,6 @@ int print_kernel_trace; #include "elf_common.h" -namespace core { -hsa_status_t RegisterModuleFromMemory( - std::map &KernelInfo, - std::map &SymbolInfoTable, void *, size_t, - hsa_agent_t agent, - hsa_status_t (*on_deserialized_data)(void *data, size_t size, - void *cb_state), - void *cb_state, std::vector &HSAExecutables); -} - namespace hsa { template hsa_status_t iterate_agents(C cb) { auto L = [](hsa_agent_t agent, void *data) -> hsa_status_t { @@ -713,7 +699,7 @@ public: return; } - // Init hostcall soon after initializing ATMI + // Init hostcall soon after initializing hsa hostrpc_init(); err = FindAgents([&](hsa_device_type_t DeviceType, hsa_agent_t Agent) { @@ -775,8 +761,9 @@ public: DP("HSA query QUEUE_MAX_SIZE failed for agent %d\n", i); return; } - if (queue_size > core::Runtime::getInstance().getMaxQueueSize()) { - queue_size = core::Runtime::getInstance().getMaxQueueSize(); + enum { MaxQueueSize = 4096 }; + if (queue_size > MaxQueueSize) { + queue_size = MaxQueueSize; } } @@ -819,7 +806,7 @@ public: // impl_finalize removes access to it deviceStateStore.clear(); KernelArgPoolMap.clear(); - // Terminate hostrpc before finalizing ATMI + // Terminate hostrpc before finalizing hsa hostrpc_terminate(); hsa_status_t Err; @@ -1530,7 +1517,7 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t device_id, } } - DP("ATMI module successfully loaded!\n"); + DP("AMDGPU module successfully loaded!\n"); { // the device_State array is either large value in bss or a void* that @@ -2197,8 +2184,7 @@ int32_t __tgt_rtl_run_target_team_region_locked( memcpy((char *)kernarg + sizeof(void *) * i, args[i], sizeof(void *)); } - // Initialize implicit arguments. ATMI seems to leave most fields - // uninitialized + // Initialize implicit arguments. TODO: Which of these can be dropped impl_implicit_args_t *impl_args = reinterpret_cast( static_cast(kernarg) + ArgPool->kernarg_segment_size); -- 2.7.4