From 6764301a6bd3ffe80f3469005cbe5dffd41ba073 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Tue, 4 Jul 2023 12:31:28 -0500 Subject: [PATCH] [Libomptarget] Correctly implement `getWTime` on AMDGPU AMDGPU provides a fixed frequency clock since some generations back. However, the frequency is variable by card and must be looked up at runtime. This patch adds a new device environment line for the clock frequency so that we can use it in the same way as NVPTX. This is the correct implementation and the version in ASO should be replaced. Reviewed By: tianshilei1992 Differential Revision: https://reviews.llvm.org/D154456 --- .../libomptarget/DeviceRTL/include/Configuration.h | 3 +++ .../libomptarget/DeviceRTL/src/Configuration.cpp | 4 ++++ openmp/libomptarget/DeviceRTL/src/Misc.cpp | 22 ++++++++++++++++------ openmp/libomptarget/include/DeviceEnvironment.h | 1 + .../plugins-nextgen/amdgpu/src/rtl.cpp | 11 +++++++++++ .../common/PluginInterface/PluginInterface.cpp | 1 + .../common/PluginInterface/PluginInterface.h | 1 + .../libomptarget/plugins-nextgen/cuda/src/rtl.cpp | 3 +++ openmp/libomptarget/test/offloading/wtime.c | 15 +++++++-------- 9 files changed, 47 insertions(+), 14 deletions(-) diff --git a/openmp/libomptarget/DeviceRTL/include/Configuration.h b/openmp/libomptarget/DeviceRTL/include/Configuration.h index 09bce10..068c016 100644 --- a/openmp/libomptarget/DeviceRTL/include/Configuration.h +++ b/openmp/libomptarget/DeviceRTL/include/Configuration.h @@ -37,6 +37,9 @@ uint32_t getDebugKind(); /// Return the amount of dynamic shared memory that was allocated at launch. uint64_t getDynamicMemorySize(); +/// Returns the cycles per second of the device's fixed frequency clock. +uint64_t getClockFrequency(); + /// Return if debugging is enabled for the given debug kind. bool isDebugMode(DebugKind Level); diff --git a/openmp/libomptarget/DeviceRTL/src/Configuration.cpp b/openmp/libomptarget/DeviceRTL/src/Configuration.cpp index ceccef6..994ff2b 100644 --- a/openmp/libomptarget/DeviceRTL/src/Configuration.cpp +++ b/openmp/libomptarget/DeviceRTL/src/Configuration.cpp @@ -46,6 +46,10 @@ uint64_t config::getDynamicMemorySize() { return __omp_rtl_device_environment.DynamicMemSize; } +uint64_t config::getClockFrequency() { + return __omp_rtl_device_environment.ClockFrequency; +} + bool config::isDebugMode(config::DebugKind Kind) { return config::getDebugKind() & Kind; } diff --git a/openmp/libomptarget/DeviceRTL/src/Misc.cpp b/openmp/libomptarget/DeviceRTL/src/Misc.cpp index 68ce445..a19a263 100644 --- a/openmp/libomptarget/DeviceRTL/src/Misc.cpp +++ b/openmp/libomptarget/DeviceRTL/src/Misc.cpp @@ -9,6 +9,7 @@ // //===----------------------------------------------------------------------===// +#include "Configuration.h" #include "Types.h" #include "Debug.h" @@ -27,14 +28,23 @@ double getWTime(); ///{ #pragma omp begin declare variant match(device = {arch(amdgcn)}) -double getWTick() { return ((double)1E-9); } +double getWTick() { + // The number of ticks per second for the AMDGPU clock varies by card and can + // only be retrived by querying the driver. We rely on the device environment + // to inform us what the proper frequency is. + return 1.0 / config::getClockFrequency(); +} double getWTime() { - // The intrinsics for measuring time have undocumented frequency - // This will probably need to be found by measurement on a number of - // architectures. Until then, return 0, which is very inaccurate as a - // timer but resolves the undefined symbol at link time. - return 0; + uint64_t NumTicks = 0; + if constexpr (__has_builtin(__builtin_amdgcn_s_sendmsg_rtnl)) + NumTicks = __builtin_amdgcn_s_sendmsg_rtnl(0x83); + else if constexpr (__has_builtin(__builtin_amdgcn_s_memrealtime)) + NumTicks = __builtin_amdgcn_s_memrealtime(); + else if constexpr (__has_builtin(__builtin_amdgcn_s_memtime)) + NumTicks = __builtin_amdgcn_s_memtime(); + + return static_cast(NumTicks) * getWTick(); } #pragma omp end declare variant diff --git a/openmp/libomptarget/include/DeviceEnvironment.h b/openmp/libomptarget/include/DeviceEnvironment.h index 231492c..4260002 100644 --- a/openmp/libomptarget/include/DeviceEnvironment.h +++ b/openmp/libomptarget/include/DeviceEnvironment.h @@ -20,6 +20,7 @@ struct DeviceEnvironmentTy { uint32_t NumDevices; uint32_t DeviceNum; uint32_t DynamicMemSize; + uint64_t ClockFrequency; }; #endif diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp index 21436ba..1fcbcf2 100644 --- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp @@ -1596,6 +1596,11 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { return Err; GridValues.GV_Warp_Size = WavefrontSize; + // Get the frequency of the steady clock. + if (auto Err = getDeviceAttr(HSA_AMD_AGENT_INFO_TIMESTAMP_FREQUENCY, + ClockFrequency)) + return Err; + // Load the grid values dependending on the wavefront. if (WavefrontSize == 32) GridValues = getAMDGPUGridValues<32>(); @@ -1757,6 +1762,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { /// See GenericDeviceTy::getComputeUnitKind(). std::string getComputeUnitKind() const override { return ComputeUnitKind; } + /// Returns the clock frequency for the given AMDGPU device. + uint64_t getClockFrequency() const override { return ClockFrequency; } + /// Allocate and construct an AMDGPU kernel. Expected constructKernelEntry(const __tgt_offload_entry &KernelEntry, @@ -2417,6 +2425,9 @@ private: /// The GPU architecture. std::string ComputeUnitKind; + /// The frequency of the steady clock inside the device. + uint64_t ClockFrequency; + /// Reference to the host device. AMDHostDeviceTy &HostDevice; diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp index c86b2eb..9eaaaf8 100644 --- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp +++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp @@ -509,6 +509,7 @@ Error GenericDeviceTy::setupDeviceEnvironment(GenericPluginTy &Plugin, // TODO: The device ID used here is not the real device ID used by OpenMP. DeviceEnvironment.DeviceNum = DeviceId; DeviceEnvironment.DynamicMemSize = OMPX_SharedMemorySize; + DeviceEnvironment.ClockFrequency = getClockFrequency(); // Create the metainfo of the device environment global. GlobalTy DevEnvGlobal("__omp_rtl_device_environment", diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h index 189406a..8fe615b 100644 --- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h +++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h @@ -732,6 +732,7 @@ struct GenericDeviceTy : public DeviceAllocatorTy { return GridValues.GV_Default_Num_Teams; } uint32_t getDynamicMemorySize() const { return OMPX_SharedMemorySize; } + virtual uint64_t getClockFrequency() const { return CLOCKS_PER_SEC; } /// Get target compute unit kind (e.g., sm_80, or gfx908). virtual std::string getComputeUnitKind() const { return "unknown"; } diff --git a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp index d823cbe..c165b58 100644 --- a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp +++ b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp @@ -829,6 +829,9 @@ struct CUDADeviceTy : public GenericDeviceTy { return ComputeCapability.str(); } + /// Returns the clock frequency for the given NVPTX device. + uint64_t getClockFrequency() const override { return 1000000000; } + private: using CUDAStreamManagerTy = GenericDeviceResourceManagerTy; using CUDAEventManagerTy = GenericDeviceResourceManagerTy; diff --git a/openmp/libomptarget/test/offloading/wtime.c b/openmp/libomptarget/test/offloading/wtime.c index 230d67b..2ba60aa 100644 --- a/openmp/libomptarget/test/offloading/wtime.c +++ b/openmp/libomptarget/test/offloading/wtime.c @@ -1,7 +1,6 @@ -// RUN: %libomptarget-compileopt-run-and-check-generic - -// UNSUPPORTED: amdgcn-amd-amdhsa +// RUN: %libomptarget-compileopt-and-run-generic +#include #include #include #include @@ -10,17 +9,17 @@ int main(int argc, char *argv[]) { int *data = (int *)malloc(N * sizeof(int)); -#pragma omp target map(from : data[0 : N]) + double duration = 0.0; + +#pragma omp target map(from : data[0 : N]) map(from : duration) { double start = omp_get_wtime(); for (int i = 0; i < N; ++i) data[i] = i; double end = omp_get_wtime(); - double duration = end - start; - printf("duration: %lfs\n", duration); + duration = end - start; } + assert(duration > 0.0); free(data); return 0; } - -// CHECK: duration: {{.+[1-9]+}} -- 2.7.4