From 421b1f55c6e21ab7eadc7737c1757b06577f1d71 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Tue, 31 May 2022 10:08:25 -0400 Subject: [PATCH] [Libomptarget] Do not use retaining attributes for the static library When we build the libomptarget device runtime library targeting bitcode, we need special care to make sure that certain functions are not optimized out. This is because we manually internalize and optimize these definitions, ignoring their standard linkage semantics. When we build with the static library, we can maintain these semantics and we do not need these definitions to be kept alive. Furthermore, keeping them alive prevents them from being removed during LTO, which in turn prevents us from completely internalizing `IsSPMDMode` and removing several other functions. This patch removes the retaining attributes for the static library target by placing them behind a macro that is only defined when building the bitcode library. Reviewed By: JonChesterfield Differential Revision: https://reviews.llvm.org/D126701 --- openmp/libomptarget/DeviceRTL/CMakeLists.txt | 4 ++-- openmp/libomptarget/DeviceRTL/include/Types.h | 7 +++++++ openmp/libomptarget/DeviceRTL/src/Mapping.cpp | 2 +- openmp/libomptarget/DeviceRTL/src/Utils.cpp | 2 +- 4 files changed, 11 insertions(+), 4 deletions(-) diff --git a/openmp/libomptarget/DeviceRTL/CMakeLists.txt b/openmp/libomptarget/DeviceRTL/CMakeLists.txt index 1a38492..540139d 100644 --- a/openmp/libomptarget/DeviceRTL/CMakeLists.txt +++ b/openmp/libomptarget/DeviceRTL/CMakeLists.txt @@ -234,12 +234,12 @@ endfunction() # Generate a Bitcode library for all the compute capabilities the user requested add_custom_target(omptarget.devicertl.nvptx) foreach(sm ${nvptx_sm_list}) - compileDeviceRTLLibrary(sm_${sm} nvptx -target nvptx64-nvidia-cuda -Xclang -target-feature -Xclang +ptx61 "-D__CUDA_ARCH__=${sm}0") + compileDeviceRTLLibrary(sm_${sm} nvptx -target nvptx64-nvidia-cuda -DLIBOMPTARGET_BC_TARGET -Xclang -target-feature -Xclang +ptx61 "-D__CUDA_ARCH__=${sm}0") endforeach() 
add_custom_target(omptarget.devicertl.amdgpu) foreach(mcpu ${amdgpu_mcpus}) - compileDeviceRTLLibrary(${mcpu} amdgpu -target amdgcn-amd-amdhsa -D__AMDGCN__ -nogpulib) + compileDeviceRTLLibrary(${mcpu} amdgpu -target amdgcn-amd-amdhsa -DLIBOMPTARGET_BC_TARGET -D__AMDGCN__ -nogpulib) endforeach() set(LIBOMPTARGET_LLVM_VERSION "${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}.${LLVM_VERSION_PATCH}") diff --git a/openmp/libomptarget/DeviceRTL/include/Types.h b/openmp/libomptarget/DeviceRTL/include/Types.h index 34f3e92..fcb75e3 100644 --- a/openmp/libomptarget/DeviceRTL/include/Types.h +++ b/openmp/libomptarget/DeviceRTL/include/Types.h @@ -209,6 +209,13 @@ enum OMPTgtExecModeFlags : int8_t { #define CONSTANT(NAME) \ NAME [[clang::loader_uninitialized, clang::address_space(4)]] +// Attribute to keep alive certain definition for the bitcode library. +#ifdef LIBOMPTARGET_BC_TARGET +#define KEEP_ALIVE __attribute__((used, retain)) +#else +#define KEEP_ALIVE +#endif + ///} #endif diff --git a/openmp/libomptarget/DeviceRTL/src/Mapping.cpp b/openmp/libomptarget/DeviceRTL/src/Mapping.cpp index 12ef58f..172bbbf 100644 --- a/openmp/libomptarget/DeviceRTL/src/Mapping.cpp +++ b/openmp/libomptarget/DeviceRTL/src/Mapping.cpp @@ -276,7 +276,7 @@ uint32_t mapping::getNumberOfProcessorElements() { // TODO: This is a workaround for initialization coming from kernels outside of // the TU. We will need to solve this more correctly in the future. 
-int __attribute__((used, retain, weak)) SHARED(IsSPMDMode); +int __attribute__((weak)) KEEP_ALIVE SHARED(IsSPMDMode); void mapping::init(bool IsSPMD) { if (mapping::isInitialThreadInLevel0(IsSPMD)) diff --git a/openmp/libomptarget/DeviceRTL/src/Utils.cpp b/openmp/libomptarget/DeviceRTL/src/Utils.cpp index e6bcba8..453d131 100644 --- a/openmp/libomptarget/DeviceRTL/src/Utils.cpp +++ b/openmp/libomptarget/DeviceRTL/src/Utils.cpp @@ -21,7 +21,7 @@ using namespace _OMP; namespace _OMP { /// Helper to keep code alive without introducing a performance penalty. -__attribute__((used, retain, weak, optnone, cold)) void keepAlive() { +__attribute__((weak, optnone, cold)) KEEP_ALIVE void keepAlive() { __kmpc_get_hardware_thread_id_in_block(); __kmpc_get_hardware_num_threads_in_block(); __kmpc_get_warp_size(); -- 2.7.4