From: Johannes Doerfert <jdoerfert@llnl.gov>
Date: Sun, 2 Oct 2022 16:34:41 +0000 (-0700)
Subject: [OpenMP] Introduce more atomic operations into the runtime
X-Git-Tag: upstream/17.0.6~31575
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=b113965073596347fa4eb6819e0ddf624b03a808;p=platform%2Fupstream%2Fllvm.git

[OpenMP] Introduce more atomic operations into the runtime

We should use OpenMP atomics but they don't take variable orderings.
Maybe we should expose all of this in the header but that solves only
part of the problem anyway.

Differential Revision: https://reviews.llvm.org/D135036
---

diff --git a/openmp/libomptarget/DeviceRTL/include/Synchronization.h b/openmp/libomptarget/DeviceRTL/include/Synchronization.h
index 438b022..0740a65 100644
--- a/openmp/libomptarget/DeviceRTL/include/Synchronization.h
+++ b/openmp/libomptarget/DeviceRTL/include/Synchronization.h
@@ -54,20 +54,60 @@ enum OrderingTy {
   seq_cst = __ATOMIC_SEQ_CST,
 };
 
-/// Atomically load \p Addr with \p Ordering semantics.
-uint32_t load(uint32_t *Addr, atomic::OrderingTy Ordering);
-
-/// Atomically store \p V to \p Addr with \p Ordering semantics.
-void store(uint32_t *Addr, uint32_t V, atomic::OrderingTy Ordering);
-
 /// Atomically increment \p *Addr and wrap at \p V with \p Ordering semantics.
-uint32_t inc(uint32_t *Addr, uint32_t V, atomic::OrderingTy Ordering);
+uint32_t inc(uint32_t *Addr, uint32_t V, OrderingTy Ordering);
+
+/// Atomically perform <op> on \p V and \p *Addr with \p Ordering semantics. The
+/// result is stored in \p *Addr;
+///{
+
+#define ATOMIC_COMMON_OP(TY)                                                   \
+  TY add(TY *Addr, TY V, OrderingTy Ordering);                                 \
+  TY mul(TY *Addr, TY V, OrderingTy Ordering);                                 \
+  TY load(TY *Addr, OrderingTy Ordering);                                      \
+  void store(TY *Addr, TY V, OrderingTy Ordering);                             \
+  bool cas(TY *Addr, TY ExpectedV, TY DesiredV, OrderingTy OrderingSucc,       \
+           OrderingTy OrderingFail);
+
+#define ATOMIC_FP_ONLY_OP(TY)                                                  \
+  TY min(TY *Addr, TY V, OrderingTy Ordering);                                 \
+  TY max(TY *Addr, TY V, OrderingTy Ordering);
+
+#define ATOMIC_INT_ONLY_OP(TY)                                                 \
+  TY min(TY *Addr, TY V, OrderingTy Ordering);                                 \
+  TY max(TY *Addr, TY V, OrderingTy Ordering);                                 \
+  TY bit_or(TY *Addr, TY V, OrderingTy Ordering);                              \
+  TY bit_and(TY *Addr, TY V, OrderingTy Ordering);                             \
+  TY bit_xor(TY *Addr, TY V, OrderingTy Ordering);
+
+#define ATOMIC_FP_OP(TY)                                                       \
+  ATOMIC_FP_ONLY_OP(TY)                                                        \
+  ATOMIC_COMMON_OP(TY)
+
+#define ATOMIC_INT_OP(TY)                                                      \
+  ATOMIC_INT_ONLY_OP(TY)                                                       \
+  ATOMIC_COMMON_OP(TY)
+
+// This needs to be kept in sync with the header. Also the reason we don't use
+// templates here.
+ATOMIC_INT_OP(int8_t)
+ATOMIC_INT_OP(int16_t)
+ATOMIC_INT_OP(int32_t)
+ATOMIC_INT_OP(int64_t)
+ATOMIC_INT_OP(uint8_t)
+ATOMIC_INT_OP(uint16_t)
+ATOMIC_INT_OP(uint32_t)
+ATOMIC_INT_OP(uint64_t)
+ATOMIC_FP_OP(float)
+ATOMIC_FP_OP(double)
+
+#undef ATOMIC_INT_ONLY_OP
+#undef ATOMIC_FP_ONLY_OP
+#undef ATOMIC_COMMON_OP
+#undef ATOMIC_INT_OP
+#undef ATOMIC_FP_OP
 
-/// Atomically add \p V to \p *Addr with \p Ordering semantics.
-uint32_t add(uint32_t *Addr, uint32_t V, atomic::OrderingTy Ordering);
-
-/// Atomically add \p V to \p *Addr with \p Ordering semantics.
-uint64_t add(uint64_t *Addr, uint64_t V, atomic::OrderingTy Ordering);
+///}
 
 } // namespace atomic
 
diff --git a/openmp/libomptarget/DeviceRTL/include/Utils.h b/openmp/libomptarget/DeviceRTL/include/Utils.h
index 178c001..84ffea6 100644
--- a/openmp/libomptarget/DeviceRTL/include/Utils.h
+++ b/openmp/libomptarget/DeviceRTL/include/Utils.h
@@ -77,6 +77,11 @@ template <typename Ty1, typename Ty2> inline Ty1 align_down(Ty1 V, Ty2 Align) {
 /// Return true iff \p Ptr is pointing into shared (local) memory (AS(3)).
 bool isSharedMemPtr(void *Ptr);
 
+/// Return \p V typed punned as \p DstTy.
+template <typename DstTy, typename SrcTy> inline DstTy convertViaPun(SrcTy V) {
+  return *((DstTy *)(&V));
+}
+
 /// A pointer variable that has by design an `undef` value. Use with care.
 __attribute__((loader_uninitialized)) static void *const UndefPtr;
 
diff --git a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
index a155641..85d75bf 100644
--- a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
@@ -32,40 +32,87 @@ namespace impl {
 uint32_t atomicInc(uint32_t *Address, uint32_t Val,
                    atomic::OrderingTy Ordering);
 
-uint32_t atomicLoad(uint32_t *Address, atomic::OrderingTy Ordering) {
-  return __atomic_fetch_add(Address, 0U, Ordering);
+template <typename Ty>
+Ty atomicAdd(Ty *Address, Ty Val, atomic::OrderingTy Ordering) {
+  return __atomic_fetch_add(Address, Val, Ordering);
+}
+
+template <typename Ty>
+Ty atomicMul(Ty *Address, Ty V, atomic::OrderingTy Ordering) {
+  Ty TypedCurrentVal, TypedResultVal, TypedNewVal;
+  bool Success;
+  do {
+    TypedCurrentVal = atomic::load(Address, Ordering);
+    TypedNewVal = TypedCurrentVal * V;
+    Success = atomic::cas(Address, TypedCurrentVal, TypedNewVal, Ordering,
+                          atomic::relaxed);
+  } while (!Success);
+  return TypedResultVal;
+}
+
+template <typename Ty> Ty atomicLoad(Ty *Address, atomic::OrderingTy Ordering) {
+  return atomicAdd(Address, Ty(0), Ordering);
 }
 
-void atomicStore(uint32_t *Address, uint32_t Val,
-                 atomic::OrderingTy Ordering) {
+template <typename Ty>
+void atomicStore(Ty *Address, Ty Val, atomic::OrderingTy Ordering) {
   __atomic_store_n(Address, Val, Ordering);
 }
 
-uint32_t atomicAdd(uint32_t *Address, uint32_t Val,
-                   atomic::OrderingTy Ordering) {
-  return __atomic_fetch_add(Address, Val, Ordering);
+template <typename Ty>
+bool atomicCAS(Ty *Address, Ty ExpectedV, Ty DesiredV,
+               atomic::OrderingTy OrderingSucc,
+               atomic::OrderingTy OrderingFail) {
+  return __atomic_compare_exchange(Address, &ExpectedV, &DesiredV, false,
+                                   OrderingSucc, OrderingFail);
 }
-uint32_t atomicMax(uint32_t *Address, uint32_t Val,
-                   atomic::OrderingTy Ordering) {
+
+template <typename Ty>
+Ty atomicMin(Ty *Address, Ty Val, atomic::OrderingTy Ordering) {
+  return __atomic_fetch_min(Address, Val, Ordering);
+}
+
+template <typename Ty>
+Ty atomicMax(Ty *Address, Ty Val, atomic::OrderingTy Ordering) {
   return __atomic_fetch_max(Address, Val, Ordering);
 }
 
+// TODO: Implement this with __atomic_fetch_max and remove the duplication.
+template <typename Ty, typename STy, typename UTy>
+Ty atomicMinFP(Ty *Address, Ty Val, atomic::OrderingTy Ordering) {
+  if (Val >= 0)
+    return atomicMin((STy *)Address, utils::convertViaPun<STy>(Val), Ordering);
+  return atomicMax((UTy *)Address, utils::convertViaPun<UTy>(Val), Ordering);
+}
+
+template <typename Ty, typename STy, typename UTy>
+Ty atomicMaxFP(Ty *Address, Ty Val, atomic::OrderingTy Ordering) {
+  if (Val >= 0)
+    return atomicMax((STy *)Address, utils::convertViaPun<STy>(Val), Ordering);
+  return atomicMin((UTy *)Address, utils::convertViaPun<UTy>(Val), Ordering);
+}
+
+template <typename Ty>
+Ty atomicOr(Ty *Address, Ty Val, atomic::OrderingTy Ordering) {
+  return __atomic_fetch_or(Address, Val, Ordering);
+}
+
+template <typename Ty>
+Ty atomicAnd(Ty *Address, Ty Val, atomic::OrderingTy Ordering) {
+  return __atomic_fetch_and(Address, Val, Ordering);
+}
+
+template <typename Ty>
+Ty atomicXOr(Ty *Address, Ty Val, atomic::OrderingTy Ordering) {
+  return __atomic_fetch_xor(Address, Val, Ordering);
+}
+
 uint32_t atomicExchange(uint32_t *Address, uint32_t Val,
                         atomic::OrderingTy Ordering) {
   uint32_t R;
   __atomic_exchange(Address, &Val, &R,
                     Ordering);
   return R;
 }
-uint32_t atomicCAS(uint32_t *Address, uint32_t Compare, uint32_t Val,
-                   atomic::OrderingTy Ordering) {
-  (void)__atomic_compare_exchange(Address, &Compare, &Val, false, Ordering,
-                                  Ordering);
-  return Compare;
-}
-
-uint64_t atomicAdd(uint64_t *Address, uint64_t Val,
-                   atomic::OrderingTy Ordering) {
-  return __atomic_fetch_add(Address, Val, Ordering);
-}
 ///}
 
 // Forward declarations defined to be defined for AMDGCN and NVPTX.
@@ -287,7 +334,8 @@ void destroyLock(omp_lock_t *Lock) { unsetLock(Lock); }
 
 void setLock(omp_lock_t *Lock) {
   // TODO: not sure spinning is a good idea here..
-  while (atomicCAS((uint32_t *)Lock, UNSET, SET, atomic::seq_cst) != UNSET) {
+  while (atomicCAS((uint32_t *)Lock, UNSET, SET, atomic::seq_cst,
+                   atomic::seq_cst) != UNSET) {
     int32_t start = __nvvm_read_ptx_sreg_clock();
     int32_t now;
     for (;;) {
@@ -322,24 +370,84 @@ void fence::kernel(atomic::OrderingTy Ordering) { impl::fenceKernel(Ordering); }
 
 void fence::system(atomic::OrderingTy Ordering) { impl::fenceSystem(Ordering); }
 
-uint32_t atomic::load(uint32_t *Addr, atomic::OrderingTy Ordering) {
-  return impl::atomicLoad(Addr, Ordering);
-}
+#define ATOMIC_COMMON_OP(TY)                                                   \
+  TY atomic::add(TY *Addr, TY V, atomic::OrderingTy Ordering) {                \
+    return impl::atomicAdd(Addr, V, Ordering);                                 \
+  }                                                                            \
+  TY atomic::mul(TY *Addr, TY V, atomic::OrderingTy Ordering) {                \
+    return impl::atomicMul(Addr, V, Ordering);                                 \
+  }                                                                            \
+  TY atomic::load(TY *Addr, atomic::OrderingTy Ordering) {                     \
+    return impl::atomicLoad(Addr, Ordering);                                   \
+  }                                                                            \
+  bool atomic::cas(TY *Addr, TY ExpectedV, TY DesiredV,                        \
+                   atomic::OrderingTy OrderingSucc,                            \
+                   atomic::OrderingTy OrderingFail) {                          \
+    return impl::atomicCAS(Addr, ExpectedV, DesiredV, OrderingSucc,            \
+                           OrderingFail);                                      \
+  }
 
-void atomic::store(uint32_t *Addr, uint32_t V, atomic::OrderingTy Ordering) {
-  impl::atomicStore(Addr, V, Ordering);
-}
+#define ATOMIC_FP_ONLY_OP(TY, STY, UTY)                                        \
+  TY atomic::min(TY *Addr, TY V, atomic::OrderingTy Ordering) {                \
+    return impl::atomicMinFP<TY, STY, UTY>(Addr, V, Ordering);                 \
+  }                                                                            \
+  TY atomic::max(TY *Addr, TY V, atomic::OrderingTy Ordering) {                \
+    return impl::atomicMaxFP<TY, STY, UTY>(Addr, V, Ordering);                 \
+  }                                                                            \
+  void atomic::store(TY *Addr, TY V, atomic::OrderingTy Ordering) {            \
+    impl::atomicStore(reinterpret_cast<UTY *>(Addr),                           \
+                      utils::convertViaPun<UTY>(V), Ordering);                 \
+  }
 
-uint32_t atomic::inc(uint32_t *Addr, uint32_t V, atomic::OrderingTy Ordering) {
-  return impl::atomicInc(Addr, V, Ordering);
-}
+#define ATOMIC_INT_ONLY_OP(TY)                                                 \
+  TY atomic::min(TY *Addr, TY V, atomic::OrderingTy Ordering) {                \
+    return impl::atomicMin(Addr, V, Ordering);                                 \
+  }                                                                            \
+  TY atomic::max(TY *Addr, TY V, atomic::OrderingTy Ordering) {                \
+    return impl::atomicMax(Addr, V, Ordering);                                 \
+  }                                                                            \
+  TY atomic::bit_or(TY *Addr, TY V, atomic::OrderingTy Ordering) {             \
+    return impl::atomicOr(Addr, V, Ordering);                                  \
+  }                                                                            \
+  TY atomic::bit_and(TY *Addr, TY V, atomic::OrderingTy Ordering) {            \
+    return impl::atomicAnd(Addr, V, Ordering);                                 \
+  }                                                                            \
+  TY atomic::bit_xor(TY *Addr, TY V, atomic::OrderingTy Ordering) {            \
+    return impl::atomicXOr(Addr, V, Ordering);                                 \
+  }                                                                            \
+  void atomic::store(TY *Addr, TY V, atomic::OrderingTy Ordering) {            \
+    impl::atomicStore(Addr, V, Ordering);                                      \
+  }
 
-uint32_t atomic::add(uint32_t *Addr, uint32_t V, atomic::OrderingTy Ordering) {
-  return impl::atomicAdd(Addr, V, Ordering);
-}
+#define ATOMIC_FP_OP(TY, STY, UTY)                                             \
+  ATOMIC_FP_ONLY_OP(TY, STY, UTY)                                              \
+  ATOMIC_COMMON_OP(TY)
+
+#define ATOMIC_INT_OP(TY)                                                      \
+  ATOMIC_INT_ONLY_OP(TY)                                                       \
+  ATOMIC_COMMON_OP(TY)
+
+// This needs to be kept in sync with the header. Also the reason we don't use
+// templates here.
+ATOMIC_INT_OP(int8_t)
+ATOMIC_INT_OP(int16_t)
+ATOMIC_INT_OP(int32_t)
+ATOMIC_INT_OP(int64_t)
+ATOMIC_INT_OP(uint8_t)
+ATOMIC_INT_OP(uint16_t)
+ATOMIC_INT_OP(uint32_t)
+ATOMIC_INT_OP(uint64_t)
+ATOMIC_FP_OP(float, int32_t, uint32_t)
+ATOMIC_FP_OP(double, int64_t, uint64_t)
+
+#undef ATOMIC_INT_ONLY_OP
+#undef ATOMIC_FP_ONLY_OP
+#undef ATOMIC_COMMON_OP
+#undef ATOMIC_INT_OP
+#undef ATOMIC_FP_OP
 
-uint64_t atomic::add(uint64_t *Addr, uint64_t V, atomic::OrderingTy Ordering) {
-  return impl::atomicAdd(Addr, V, Ordering);
+uint32_t atomic::inc(uint32_t *Addr, uint32_t V, atomic::OrderingTy Ordering) {
+  return impl::atomicInc(Addr, V, Ordering);
 }
 
 extern "C" {