From 6a1d1f7eefe81faa1f7c6c47e8b9da0bfeb8c2e8 Mon Sep 17 00:00:00 2001
From: Dhruva Chakrabarti
Date: Fri, 30 Jun 2023 14:02:12 -0400
Subject: [PATCH] [OpenMP] Added memory scope to atomic::inc API and used the
 device scope in reduction.

With https://reviews.llvm.org/D137524, memory scope and ordering attributes
are being used to generate the required instructions for atomic inc/dec on
AMDGPU. This patch adds the memory scope attribute to the atomic::inc API
and uses the device scope in reduction. Without the device scope in
atomic_inc, the default system scope leads to unnecessary L2
write-backs/invalidates.

Reviewed By: arsenm

Differential Revision: https://reviews.llvm.org/D154172
---
 .../DeviceRTL/include/Synchronization.h            |  9 +++-
 openmp/libomptarget/DeviceRTL/src/Reduction.cpp    |  3 +-
 .../libomptarget/DeviceRTL/src/Synchronization.cpp | 51 ++++++++++++++--------
 3 files changed, 43 insertions(+), 20 deletions(-)

diff --git a/openmp/libomptarget/DeviceRTL/include/Synchronization.h b/openmp/libomptarget/DeviceRTL/include/Synchronization.h
index 130578e..b31238f 100644
--- a/openmp/libomptarget/DeviceRTL/include/Synchronization.h
+++ b/openmp/libomptarget/DeviceRTL/include/Synchronization.h
@@ -26,8 +26,15 @@ enum OrderingTy {
   seq_cst = __ATOMIC_SEQ_CST,
 };
 
+enum MemScopeTy {
+  all,    // All threads on all devices
+  device, // All threads on the device
+  cgroup  // All threads in the contention group, e.g. the team
+};
+
 /// Atomically increment \p *Addr and wrap at \p V with \p Ordering semantics.
-uint32_t inc(uint32_t *Addr, uint32_t V, OrderingTy Ordering);
+uint32_t inc(uint32_t *Addr, uint32_t V, OrderingTy Ordering,
+             MemScopeTy MemScope = MemScopeTy::all);
 
 /// Atomically perform <op> on \p V and \p *Addr with \p Ordering semantics. The
 /// result is stored in \p *Addr;
diff --git a/openmp/libomptarget/DeviceRTL/src/Reduction.cpp b/openmp/libomptarget/DeviceRTL/src/Reduction.cpp
index db40185..f544928 100644
--- a/openmp/libomptarget/DeviceRTL/src/Reduction.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Reduction.cpp
@@ -228,7 +228,8 @@ int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
     // Increment team counter.
     // This counter is incremented by all teams in the current
     // BUFFER_SIZE chunk.
-    ChunkTeamCount = atomic::inc(&Cnt, num_of_records - 1u, atomic::seq_cst);
+    ChunkTeamCount = atomic::inc(&Cnt, num_of_records - 1u, atomic::seq_cst,
+                                 atomic::MemScopeTy::device);
   }
   // Synchronize
   if (mapping::isSPMDMode())
diff --git a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
index 5325b9f..550d961 100644
--- a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
@@ -29,8 +29,8 @@ namespace impl {
 ///
 ///{
 /// NOTE: This function needs to be implemented by every target.
-uint32_t atomicInc(uint32_t *Address, uint32_t Val,
-                   atomic::OrderingTy Ordering);
+uint32_t atomicInc(uint32_t *Address, uint32_t Val, atomic::OrderingTy Ordering,
+                   atomic::MemScopeTy MemScope);
 
 template <typename Ty>
 Ty atomicAdd(Ty *Address, Ty Val, atomic::OrderingTy Ordering) {
@@ -116,7 +116,8 @@ uint32_t atomicExchange(uint32_t *Address, uint32_t Val,
 ///}
 
 // Forward declarations defined to be defined for AMDGCN and NVPTX.
-uint32_t atomicInc(uint32_t *A, uint32_t V, atomic::OrderingTy Ordering);
+uint32_t atomicInc(uint32_t *A, uint32_t V, atomic::OrderingTy Ordering,
+                   atomic::MemScopeTy MemScope);
 void namedBarrierInit();
 void namedBarrier();
 void fenceTeam(atomic::OrderingTy Ordering);
@@ -138,22 +139,35 @@ void setCriticalLock(omp_lock_t *);
 ///{
 #pragma omp begin declare variant match(device = {arch(amdgcn)})
 
-uint32_t atomicInc(uint32_t *A, uint32_t V, atomic::OrderingTy Ordering) {
+uint32_t atomicInc(uint32_t *A, uint32_t V, atomic::OrderingTy Ordering,
+                   atomic::MemScopeTy MemScope) {
   // builtin_amdgcn_atomic_inc32 should expand to this switch when
   // passed a runtime value, but does not do so yet. Workaround here.
+
+#define ScopeSwitch(ORDER)                                                     \
+  switch (MemScope) {                                                          \
+  case atomic::MemScopeTy::all:                                                \
+    return __builtin_amdgcn_atomic_inc32(A, V, ORDER, "");                     \
+  case atomic::MemScopeTy::device:                                             \
+    return __builtin_amdgcn_atomic_inc32(A, V, ORDER, "agent");                \
+  case atomic::MemScopeTy::cgroup:                                             \
+    return __builtin_amdgcn_atomic_inc32(A, V, ORDER, "workgroup");            \
+  }
+
+#define Case(ORDER)                                                            \
+  case ORDER:                                                                  \
+    ScopeSwitch(ORDER)
+
   switch (Ordering) {
   default:
     __builtin_unreachable();
-  case atomic::relaxed:
-    return __builtin_amdgcn_atomic_inc32(A, V, atomic::relaxed, "");
-  case atomic::aquire:
-    return __builtin_amdgcn_atomic_inc32(A, V, atomic::aquire, "");
-  case atomic::release:
-    return __builtin_amdgcn_atomic_inc32(A, V, atomic::release, "");
-  case atomic::acq_rel:
-    return __builtin_amdgcn_atomic_inc32(A, V, atomic::acq_rel, "");
-  case atomic::seq_cst:
-    return __builtin_amdgcn_atomic_inc32(A, V, atomic::seq_cst, "");
+    Case(atomic::relaxed);
+    Case(atomic::aquire);
+    Case(atomic::release);
+    Case(atomic::acq_rel);
+    Case(atomic::seq_cst);
+#undef Case
+#undef ScopeSwitch
   }
 }
 
@@ -308,8 +322,8 @@ void setCriticalLock(omp_lock_t *Lock) {
     device = {arch(nvptx, nvptx64)},                                           \
         implementation = {extension(match_any)})
 
-uint32_t atomicInc(uint32_t *Address, uint32_t Val,
-                   atomic::OrderingTy Ordering) {
+uint32_t atomicInc(uint32_t *Address, uint32_t Val, atomic::OrderingTy Ordering,
+                   atomic::MemScopeTy MemScope) {
   return __nvvm_atom_inc_gen_ui(Address, Val);
 }
 
@@ -480,8 +494,9 @@ ATOMIC_FP_OP(double, int64_t, uint64_t)
 #undef ATOMIC_INT_OP
 #undef ATOMIC_FP_OP
 
-uint32_t atomic::inc(uint32_t *Addr, uint32_t V, atomic::OrderingTy Ordering) {
-  return impl::atomicInc(Addr, V, Ordering);
+uint32_t atomic::inc(uint32_t *Addr, uint32_t V, atomic::OrderingTy Ordering,
+                     atomic::MemScopeTy MemScope) {
+  return impl::atomicInc(Addr, V, Ordering, MemScope);
 }
 
 void unsetCriticalLock(omp_lock_t *Lock) { impl::unsetLock(Lock); }
-- 
2.7.4
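
Note on the AMDGCN macro workaround: per the in-source comment,
__builtin_amdgcn_atomic_inc32 does not yet expand a runtime ordering/scope
value itself, so the nested Case/ScopeSwitch macros enumerate every
(ordering, scope) pair as compile-time constants. As a sketch, a single
Case(atomic::relaxed) invocation preprocesses to roughly:

  case atomic::relaxed:
    switch (MemScope) {
    case atomic::MemScopeTy::all:
      return __builtin_amdgcn_atomic_inc32(A, V, atomic::relaxed, "");
    case atomic::MemScopeTy::device:
      return __builtin_amdgcn_atomic_inc32(A, V, atomic::relaxed, "agent");
    case atomic::MemScopeTy::cgroup:
      return __builtin_amdgcn_atomic_inc32(A, V, atomic::relaxed, "workgroup");
    }

i.e. the outer switch pins the ordering constant and the inner switch pins
the sync-scope string, where "" is system scope, "agent" is the device, and
"workgroup" is the contention group, matching the MemScopeTy comments above.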
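
For context on the operation itself: increment-with-wrap, as provided by
both __builtin_amdgcn_atomic_inc32 and __nvvm_atom_inc_gen_ui, atomically
performs the equivalent of the following sketch (not the actual
implementation, which is a single hardware atomic):

  uint32_t old = *Addr;
  *Addr = (old >= V) ? 0u : old + 1u;
  return old;

This is why the reduction passes num_of_records - 1u as the bound: Cnt
cycles through [0, num_of_records) as successive teams arrive, and each
team's returned old value becomes its ChunkTeamCount.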
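
On the API change: existing callers compile unchanged because the new
parameter defaults to MemScopeTy::all (system scope); only call sites that
know the location is not accessed from other devices should narrow the
scope, as the teams reduction now does. A minimal hypothetical usage sketch
(Counter and Bound are illustrative names, not from the patch):

  uint32_t Counter = 0;    // illustrative device-resident counter
  uint32_t Bound = 1023u;  // illustrative wrap bound

  // Old behavior, still the default: system scope.
  uint32_t A0 = atomic::inc(&Counter, Bound, atomic::seq_cst);

  // New: device (agent) scope; on AMDGPU this avoids the L2
  // write-backs/invalidates that system scope forces.
  uint32_t A1 = atomic::inc(&Counter, Bound, atomic::seq_cst,
                            atomic::MemScopeTy::device);

Note that the NVPTX variant above ignores MemScope entirely and always
calls __nvvm_atom_inc_gen_ui, so the scope hint is a no-op there.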