From 77579b99e9ce1638ca696fa7c3872ae8668d997d Mon Sep 17 00:00:00 2001 From: Jon Chesterfield Date: Thu, 19 Aug 2021 13:25:41 +0100 Subject: [PATCH] [openmp][nfc] Replace OMPGridValues array with struct [nfc] Replaces enum indices into an array with a struct. Named the fields to match the enum, leaves memory layout and initialization unchanged. Motivation is to later safely remove dead fields and replace redundant ones with (compile time) computation. It should also be possible to factor some common fields into a base and introduce a gfx10 amdgpu instance with less duplication than the arrays of integers require. Reviewed By: ronlieb Differential Revision: https://reviews.llvm.org/D108339 --- clang/include/clang/Basic/TargetInfo.h | 10 ++--- clang/lib/Basic/Targets/AMDGPU.cpp | 2 +- clang/lib/Basic/Targets/NVPTX.cpp | 2 +- clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp | 3 +- clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp | 13 +++--- clang/lib/CodeGen/CGOpenMPRuntimeGPU.h | 1 - llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h | 51 ++++++++++++----------- openmp/libomptarget/plugins/amdgpu/src/rtl.cpp | 16 +++---- 8 files changed, 47 insertions(+), 51 deletions(-) diff --git a/clang/include/clang/Basic/TargetInfo.h b/clang/include/clang/Basic/TargetInfo.h index 21289b0..ab85594 100644 --- a/clang/include/clang/Basic/TargetInfo.h +++ b/clang/include/clang/Basic/TargetInfo.h @@ -210,8 +210,8 @@ protected: unsigned char RegParmMax, SSERegParmMax; TargetCXXABI TheCXXABI; const LangASMap *AddrSpaceMap; - const unsigned *GridValues = - nullptr; // Array of target-specific GPU grid values that must be + const llvm::omp::GV *GridValues = + nullptr; // target-specific GPU grid values that must be // consistent between host RTL (plugin), device RTL, and clang. mutable StringRef PlatformName; @@ -1410,10 +1410,10 @@ public: return LangAS::Default; } - /// Return a target-specific GPU grid value based on the GVIDX enum \p gv - unsigned getGridValue(llvm::omp::GVIDX gv) const { + /// Return a target-specific GPU grid values + const llvm::omp::GV &getGridValue() const { assert(GridValues != nullptr && "GridValues not initialized"); - return GridValues[gv]; + return *GridValues; } /// Retrieve the name of the platform as it is used in the diff --git a/clang/lib/Basic/Targets/AMDGPU.cpp b/clang/lib/Basic/Targets/AMDGPU.cpp index fac786d..cebb19e 100644 --- a/clang/lib/Basic/Targets/AMDGPU.cpp +++ b/clang/lib/Basic/Targets/AMDGPU.cpp @@ -335,7 +335,7 @@ AMDGPUTargetInfo::AMDGPUTargetInfo(const llvm::Triple &Triple, llvm::AMDGPU::getArchAttrR600(GPUKind)) { resetDataLayout(isAMDGCN(getTriple()) ? DataLayoutStringAMDGCN : DataLayoutStringR600); - GridValues = llvm::omp::AMDGPUGpuGridValues; + GridValues = &llvm::omp::AMDGPUGridValues; setAddressSpaceMap(Triple.getOS() == llvm::Triple::Mesa3D || !isAMDGCN(Triple)); diff --git a/clang/lib/Basic/Targets/NVPTX.cpp b/clang/lib/Basic/Targets/NVPTX.cpp index 56f8a17..d1a34e4 100644 --- a/clang/lib/Basic/Targets/NVPTX.cpp +++ b/clang/lib/Basic/Targets/NVPTX.cpp @@ -65,7 +65,7 @@ NVPTXTargetInfo::NVPTXTargetInfo(const llvm::Triple &Triple, TLSSupported = false; VLASupported = false; AddrSpaceMap = &NVPTXAddrSpaceMap; - GridValues = llvm::omp::NVPTXGpuGridValues; + GridValues = &llvm::omp::NVPTXGridValues; UseAddrSpaceMapMangling = true; // Define available target features diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp index 33d4ab8..cac5faa 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp @@ -20,6 +20,7 @@ #include "clang/AST/StmtVisitor.h" #include "clang/Basic/Cuda.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/Frontend/OpenMP/OMPGridValues.h" #include "llvm/IR/IntrinsicsAMDGPU.h" using namespace clang; @@ -35,7 +36,7 @@ CGOpenMPRuntimeAMDGCN::CGOpenMPRuntimeAMDGCN(CodeGenModule &CGM) llvm::Value *CGOpenMPRuntimeAMDGCN::getGPUWarpSize(CodeGenFunction &CGF) { CGBuilderTy &Bld = CGF.Builder; // return constant compile-time target-specific warp size - unsigned WarpSize = CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Size); + unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size; return Bld.getInt32(WarpSize); } diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp index 63feced..b13d559 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp @@ -339,7 +339,7 @@ class CheckVarsEscapingDeclContext final assert(!GlobalizedRD && "Record for globalized variables is built already."); ArrayRef EscapedDeclsForParallel, EscapedDeclsForTeams; - unsigned WarpSize = CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Size); + unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size; if (IsInTTDRegion) EscapedDeclsForTeams = EscapedDecls.getArrayRef(); else @@ -535,8 +535,7 @@ public: /// on the NVPTX device, to generate more efficient code. static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) { CGBuilderTy &Bld = CGF.Builder; - unsigned LaneIDBits = - CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Size_Log2); + unsigned LaneIDBits = CGF.getTarget().getGridValue().GV_Warp_Size_Log2; auto &RT = static_cast(CGF.CGM.getOpenMPRuntime()); return Bld.CreateAShr(RT.getGPUThreadID(CGF), LaneIDBits, "nvptx_warp_id"); } @@ -546,8 +545,8 @@ static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) { /// on the NVPTX device, to generate more efficient code. static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) { CGBuilderTy &Bld = CGF.Builder; - unsigned LaneIDMask = CGF.getContext().getTargetInfo().getGridValue( - llvm::omp::GV_Warp_Size_Log2_Mask); + unsigned LaneIDMask = + CGF.getContext().getTargetInfo().getGridValue().GV_Warp_Size_Log2_Mask; auto &RT = static_cast(CGF.CGM.getOpenMPRuntime()); return Bld.CreateAnd(RT.getGPUThreadID(CGF), Bld.getInt32(LaneIDMask), "nvptx_lane_id"); @@ -1308,7 +1307,7 @@ llvm::Function *CGOpenMPRuntimeGPU::emitTeamsOutlinedFunction( const RecordDecl *GlobalizedRD = nullptr; llvm::SmallVector LastPrivatesReductions; llvm::SmallDenseMap MappedDeclsFields; - unsigned WarpSize = CGM.getTarget().getGridValue(llvm::omp::GV_Warp_Size); + unsigned WarpSize = CGM.getTarget().getGridValue().GV_Warp_Size; // Globalize team reductions variable unconditionally in all modes. if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD) getTeamsReductionVars(CGM.getContext(), D, LastPrivatesReductions); @@ -2089,7 +2088,7 @@ static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM, "__openmp_nvptx_data_transfer_temporary_storage"; llvm::GlobalVariable *TransferMedium = M.getGlobalVariable(TransferMediumName); - unsigned WarpSize = CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Size); + unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size; if (!TransferMedium) { auto *Ty = llvm::ArrayType::get(CGM.Int32Ty, WarpSize); unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared); diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h index b5f1b84..5d3b711 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h @@ -17,7 +17,6 @@ #include "CGOpenMPRuntime.h" #include "CodeGenFunction.h" #include "clang/AST/StmtOpenMP.h" -#include "llvm/Frontend/OpenMP/OMPGridValues.h" namespace clang { namespace CodeGen { diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h b/llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h index 0b6aed1..1d7735e 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h @@ -29,68 +29,69 @@ namespace omp { /// /// Example usage in clang: /// const unsigned slot_size = -/// ctx.GetTargetInfo().getGridValue(llvm::omp::GVIDX::GV_Warp_Size); +/// ctx.GetTargetInfo().getGridValue().GV_Warp_Size; /// /// Example usage in libomptarget/deviceRTLs: /// #include "llvm/Frontend/OpenMP/OMPGridValues.h" /// #ifdef __AMDGPU__ -/// #define GRIDVAL AMDGPUGpuGridValues +/// #define GRIDVAL AMDGPUGridValues /// #else -/// #define GRIDVAL NVPTXGpuGridValues +/// #define GRIDVAL NVPTXGridValues /// #endif /// ... Then use this reference for GV_Warp_Size in the deviceRTL source. -/// llvm::omp::GRIDVAL[llvm::omp::GVIDX::GV_Warp_Size] +/// llvm::omp::GRIDVAL().GV_Warp_Size /// /// Example usage in libomptarget hsa plugin: /// #include "llvm/Frontend/OpenMP/OMPGridValues.h" -/// #define GRIDVAL AMDGPUGpuGridValues +/// #define GRIDVAL AMDGPUGridValues /// ... Then use this reference to access GV_Warp_Size in the hsa plugin. -/// llvm::omp::GRIDVAL[llvm::omp::GVIDX::GV_Warp_Size] +/// llvm::omp::GRIDVAL().GV_Warp_Size /// /// Example usage in libomptarget cuda plugin: /// #include "llvm/Frontend/OpenMP/OMPGridValues.h" -/// #define GRIDVAL NVPTXGpuGridValues +/// #define GRIDVAL NVPTXGridValues /// ... Then use this reference to access GV_Warp_Size in the cuda plugin. -/// llvm::omp::GRIDVAL[llvm::omp::GVIDX::GV_Warp_Size] +/// llvm::omp::GRIDVAL().GV_Warp_Size /// -enum GVIDX { + +struct GV { /// The maximum number of workers in a kernel. /// (THREAD_ABSOLUTE_LIMIT) - (GV_Warp_Size), might be issue for blockDim.z - GV_Threads, + const unsigned GV_Threads; /// The size reserved for data in a shared memory slot. - GV_Slot_Size, + const unsigned GV_Slot_Size; /// The default value of maximum number of threads in a worker warp. - GV_Warp_Size, + const unsigned GV_Warp_Size; /// Alternate warp size for some AMDGCN architectures. Same as GV_Warp_Size /// for NVPTX. - GV_Warp_Size_32, + const unsigned GV_Warp_Size_32; /// The number of bits required to represent the max number of threads in warp - GV_Warp_Size_Log2, + const unsigned GV_Warp_Size_Log2; /// GV_Warp_Size * GV_Slot_Size, - GV_Warp_Slot_Size, + const unsigned GV_Warp_Slot_Size; /// the maximum number of teams. - GV_Max_Teams, + const unsigned GV_Max_Teams; /// Global Memory Alignment - GV_Mem_Align, + const unsigned GV_Mem_Align; /// (~0u >> (GV_Warp_Size - GV_Warp_Size_Log2)) - GV_Warp_Size_Log2_Mask, + const unsigned GV_Warp_Size_Log2_Mask; // An alternative to the heavy data sharing infrastructure that uses global // memory is one that uses device __shared__ memory. The amount of such space // (in bytes) reserved by the OpenMP runtime is noted here. - GV_SimpleBufferSize, + const unsigned GV_SimpleBufferSize; // The absolute maximum team size for a working group - GV_Max_WG_Size, + const unsigned GV_Max_WG_Size; // The default maximum team size for a working group - GV_Default_WG_Size, + const unsigned GV_Default_WG_Size; // This is GV_Max_WG_Size / GV_WarpSize. 32 for NVPTX and 16 for AMDGCN. - GV_Max_Warp_Number, + const unsigned GV_Max_Warp_Number; /// The slot size that should be reserved for a working warp. /// (~0u >> (GV_Warp_Size - GV_Warp_Size_Log2)) - GV_Warp_Size_Log2_MaskL + const unsigned GV_Warp_Size_Log2_MaskL; }; /// For AMDGPU GPUs -static constexpr unsigned AMDGPUGpuGridValues[] = { +static constexpr GV AMDGPUGridValues = { 448, // GV_Threads 256, // GV_Slot_Size 64, // GV_Warp_Size @@ -108,7 +109,7 @@ static constexpr unsigned AMDGPUGpuGridValues[] = { }; /// For Nvidia GPUs -static constexpr unsigned NVPTXGpuGridValues[] = { +static constexpr GV NVPTXGridValues = { 992, // GV_Threads 256, // GV_Slot_Size 32, // GV_Warp_Size diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp index 4865ef6..31adc72 100644 --- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp @@ -501,14 +501,11 @@ public: static const unsigned HardTeamLimit = (1 << 16) - 1; // 64K needed to fit in uint16 static const int DefaultNumTeams = 128; - static const int Max_Teams = - llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Max_Teams]; - static const int Warp_Size = - llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Warp_Size]; - static const int Max_WG_Size = - llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Max_WG_Size]; + static const int Max_Teams = llvm::omp::AMDGPUGridValues.GV_Max_Teams; + static const int Warp_Size = llvm::omp::AMDGPUGridValues.GV_Warp_Size; + static const int Max_WG_Size = llvm::omp::AMDGPUGridValues.GV_Max_WG_Size; static const int Default_WG_Size = - llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Default_WG_Size]; + llvm::omp::AMDGPUGridValues.GV_Default_WG_Size; using MemcpyFunc = hsa_status_t (*)(hsa_signal_t, void *, const void *, size_t size, hsa_agent_t); @@ -1058,9 +1055,8 @@ int32_t __tgt_rtl_init_device(int device_id) { DeviceInfo.WarpSize[device_id] = wavefront_size; } else { DP("Default wavefront size: %d\n", - llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Warp_Size]); - DeviceInfo.WarpSize[device_id] = - llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Warp_Size]; + llvm::omp::AMDGPUGridValues.GV_Warp_Size); + DeviceInfo.WarpSize[device_id] = llvm::omp::AMDGPUGridValues.GV_Warp_Size; } // Adjust teams to the env variables -- 2.7.4