From 5db39796bf08abdc5644ad99c5810321a4e8cfcf Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Tue, 4 Jul 2023 09:25:25 -0500 Subject: [PATCH] [libc] Support timing information in libc tests This patch adds the necessary support to provide timing information in `libc` tests. This is useful for determining which tests take what amount of time. We also can use this as a test basis for providing more fine-grained timing when implementing things on the GPU. The main difficulty with this is the fact that the AMDGPU fixed frequency clock operates at an unknown frequency. We need to read this on a per-card basis from the driver and then copy it in. NVPTX on the other hand has a fixed clock at a resolution of 1ns. I have also increased the resolution of the print-outs as the majority of these are below a millisecond for me. Reviewed By: JonChesterfield Differential Revision: https://reviews.llvm.org/D154446 --- libc/src/__support/GPU/amdgpu/utils.h | 2 +- libc/src/__support/GPU/generic/utils.h | 2 +- libc/src/__support/GPU/nvptx/utils.h | 2 +- libc/startup/gpu/amdgpu/start.cpp | 6 +++ libc/test/UnitTest/LibcTest.cpp | 24 ++++++++++-- libc/utils/gpu/loader/amdgpu/Loader.cpp | 67 ++++++++++++++++++++++++++------- 6 files changed, 83 insertions(+), 20 deletions(-) diff --git a/libc/src/__support/GPU/amdgpu/utils.h b/libc/src/__support/GPU/amdgpu/utils.h index f8835da..78e3866 100644 --- a/libc/src/__support/GPU/amdgpu/utils.h +++ b/libc/src/__support/GPU/amdgpu/utils.h @@ -158,7 +158,7 @@ LIBC_INLINE uint64_t processor_clock() { /// Returns a fixed-frequency timestamp. The actual frequency is dependent on /// the card and can only be queried via the driver. 
-LIBC_INLINE uint64_t fixed_frequrency_clock() { +LIBC_INLINE uint64_t fixed_frequency_clock() { if constexpr (LIBC_HAS_BUILTIN(__builtin_amdgcn_s_sendmsg_rtnl)) return __builtin_amdgcn_s_sendmsg_rtnl(0x83); else if constexpr (LIBC_HAS_BUILTIN(__builtin_amdgcn_s_memrealtime)) diff --git a/libc/src/__support/GPU/generic/utils.h b/libc/src/__support/GPU/generic/utils.h index 93cda53..71cc796 100644 --- a/libc/src/__support/GPU/generic/utils.h +++ b/libc/src/__support/GPU/generic/utils.h @@ -69,7 +69,7 @@ LIBC_INLINE void sync_lane(uint64_t) {} LIBC_INLINE uint64_t processor_clock() { return 0; } -LIBC_INLINE uint64_t fixed_frequrency_clock() { return 0; } +LIBC_INLINE uint64_t fixed_frequency_clock() { return 0; } } // namespace gpu } // namespace __llvm_libc diff --git a/libc/src/__support/GPU/nvptx/utils.h b/libc/src/__support/GPU/nvptx/utils.h index 2bc3527..a419e2b 100644 --- a/libc/src/__support/GPU/nvptx/utils.h +++ b/libc/src/__support/GPU/nvptx/utils.h @@ -142,7 +142,7 @@ LIBC_INLINE uint64_t processor_clock() { } /// Returns a global fixed-frequency timer at nanosecond frequency. -LIBC_INLINE uint64_t fixed_frequrency_clock() { +LIBC_INLINE uint64_t fixed_frequency_clock() { uint64_t nsecs; LIBC_INLINE_ASM("mov.u64 %0, %%globaltimer;" : "=l"(nsecs)); return nsecs; diff --git a/libc/startup/gpu/amdgpu/start.cpp b/libc/startup/gpu/amdgpu/start.cpp index 9a99955..7081ae6 100644 --- a/libc/startup/gpu/amdgpu/start.cpp +++ b/libc/startup/gpu/amdgpu/start.cpp @@ -15,6 +15,12 @@ extern "C" int main(int argc, char **argv, char **envp); namespace __llvm_libc { +// The AMDGPU architecture provides a fixed frequency clock used for obtaining +// real time. However, the frequency of this clock varies between cards and can +// only be obtained via the driver. The loader will set this so we can use it. 
+extern "C" [[gnu::visibility("protected")]] uint64_t + [[clang::address_space(4)]] __llvm_libc_clock_freq = 0; + extern "C" uintptr_t __init_array_start[]; extern "C" uintptr_t __init_array_end[]; extern "C" uintptr_t __fini_array_start[]; diff --git a/libc/test/UnitTest/LibcTest.cpp b/libc/test/UnitTest/LibcTest.cpp index 7e9008f..e1887c4 100644 --- a/libc/test/UnitTest/LibcTest.cpp +++ b/libc/test/UnitTest/LibcTest.cpp @@ -15,6 +15,16 @@ #if __STDC_HOSTED__ #include <time.h> +#elif defined(LIBC_TARGET_ARCH_IS_GPU) +#include "src/__support/GPU/utils.h" +static long clock() { return __llvm_libc::gpu::fixed_frequency_clock(); } +#if LIBC_TARGET_ARCH_IS_NVPTX +uint64_t CLOCKS_PER_SEC = 1000000000UL; +#else +// The AMDGPU loader needs to initialize this at runtime by querying the driver. +extern "C" [[gnu::visibility("protected")]] uint64_t __llvm_libc_clock_freq; +uint64_t CLOCKS_PER_SEC = __llvm_libc_clock_freq; +#endif #else static long clock() { return 0; } #define CLOCKS_PER_SEC 1 @@ -136,14 +146,22 @@ int Test::runTests(const char *TestFilter) { break; case RunContext::RunResult::Pass: tlog << GREEN << "[ OK ] " << RESET << TestName; -#if __STDC_HOSTED__ +#if __STDC_HOSTED__ || defined(LIBC_TARGET_ARCH_IS_GPU) tlog << " (took "; if (start_time > end_time) { tlog << "unknown - try rerunning)\n"; } else { const auto duration = end_time - start_time; - const uint64_t duration_ms = duration * 1000 / CLOCKS_PER_SEC; - tlog << duration_ms << " ms)\n"; + const uint64_t duration_ms = (duration * 1000) / CLOCKS_PER_SEC; + const uint64_t duration_us = (duration * 1000 * 1000) / CLOCKS_PER_SEC; + const uint64_t duration_ns = + (duration * 1000 * 1000 * 1000) / CLOCKS_PER_SEC; + if (duration_ms != 0) + tlog << duration_ms << " ms)\n"; + else if (duration_us != 0) + tlog << duration_us << " us)\n"; + else + tlog << duration_ns << " ns)\n"; } #else tlog << '\n'; diff --git a/libc/utils/gpu/loader/amdgpu/Loader.cpp b/libc/utils/gpu/loader/amdgpu/Loader.cpp index ca6f6ab..b49d576 
100644 --- a/libc/utils/gpu/loader/amdgpu/Loader.cpp +++ b/libc/utils/gpu/loader/amdgpu/Loader.cpp @@ -264,6 +264,30 @@ hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable, return HSA_STATUS_SUCCESS; } +/// Copies data from the source agent to the destination agent. The source +/// memory must first be pinned explicitly or allocated via HSA. +static hsa_status_t hsa_memcpy(void *dst, hsa_agent_t dst_agent, + const void *src, hsa_agent_t src_agent, + uint64_t size) { + // Create a memory signal to copy information between the host and device. + hsa_signal_t memory_signal; + if (hsa_status_t err = hsa_signal_create(1, 0, nullptr, &memory_signal)) + return err; + + if (hsa_status_t err = hsa_amd_memory_async_copy( + dst, dst_agent, src, src_agent, size, 0, nullptr, memory_signal)) + return err; + + while (hsa_signal_wait_scacquire(memory_signal, HSA_SIGNAL_CONDITION_EQ, 0, + UINT64_MAX, HSA_WAIT_STATE_ACTIVE) != 0) + ; + + if (hsa_status_t err = hsa_signal_destroy(memory_signal)) + return err; + + return HSA_STATUS_SUCCESS; +} + int load(int argc, char **argv, char **envp, void *image, size_t size, const LaunchParameters &params) { // Initialize the HSA runtime used to communicate with the device. @@ -388,6 +412,34 @@ int load(int argc, char **argv, char **envp, void *image, size_t size, wavefront_size, rpc_alloc, &tuple)) handle_error(err); + // Obtain the GPU's fixed-frequency clock rate and copy it to the GPU. 
+ void *host_clock_freq; + if (hsa_status_t err = + hsa_amd_memory_pool_allocate(finegrained_pool, sizeof(uint64_t), + /*flags=*/0, &host_clock_freq)) + handle_error(err); + hsa_amd_agents_allow_access(1, &dev_agent, nullptr, host_clock_freq); + + if (hsa_status_t err = hsa_agent_get_info( + dev_agent, + static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_TIMESTAMP_FREQUENCY), + host_clock_freq)) + handle_error(err); + + hsa_executable_symbol_t freq_sym; + if (hsa_status_t err = hsa_executable_get_symbol_by_name( + executable, "__llvm_libc_clock_freq", &dev_agent, &freq_sym)) + handle_error(err); + + void *freq_addr; + if (hsa_status_t err = hsa_executable_symbol_get_info( + freq_sym, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &freq_addr)) + handle_error(err); + + if (hsa_status_t err = hsa_memcpy(freq_addr, dev_agent, host_clock_freq, + host_agent, sizeof(uint64_t))) + handle_error(err); + // Obtain a queue with the minimum (power of two) size, used to send commands // to the HSA runtime and launch execution on the device. uint64_t queue_size; @@ -414,12 +466,6 @@ int load(int argc, char **argv, char **envp, void *image, size_t size, coarsegrained_pool, queue, params, "_start.kd", args)) handle_error(err); - // Create a memory signal and copy the return value back from the device into - // a new buffer. 
- hsa_signal_t memory_signal; - if (hsa_status_t err = hsa_signal_create(1, 0, nullptr, &memory_signal)) - handle_error(err); - void *host_ret; if (hsa_status_t err = hsa_amd_memory_pool_allocate(finegrained_pool, sizeof(int), @@ -428,14 +474,9 @@ int load(int argc, char **argv, char **envp, void *image, size_t size, hsa_amd_agents_allow_access(1, &dev_agent, nullptr, host_ret); if (hsa_status_t err = - hsa_amd_memory_async_copy(host_ret, host_agent, dev_ret, dev_agent, - sizeof(int), 0, nullptr, memory_signal)) + hsa_memcpy(host_ret, host_agent, dev_ret, dev_agent, sizeof(int))) handle_error(err); - while (hsa_signal_wait_scacquire(memory_signal, HSA_SIGNAL_CONDITION_EQ, 0, - UINT64_MAX, HSA_WAIT_STATE_ACTIVE) != 0) - ; - // Save the return value and perform basic clean-up. int ret = *static_cast<int *>(host_ret); @@ -458,8 +499,6 @@ int load(int argc, char **argv, char **envp, void *image, size_t size, if (hsa_status_t err = hsa_amd_memory_pool_free(host_ret)) handle_error(err); - if (hsa_status_t err = hsa_signal_destroy(memory_signal)) - handle_error(err); if (hsa_status_t err = hsa_queue_destroy(queue)) handle_error(err); -- 2.7.4