From ea616f9026dc6bd9c67ebe2d3226ac91122a7945 Mon Sep 17 00:00:00 2001 From: Jon Chesterfield Date: Wed, 20 Jan 2021 19:45:05 +0000 Subject: [PATCH] [libomptarget][devicertl][nfc] Remove some cuda intrinsics, simplify [libomptarget][devicertl][nfc] Remove some cuda intrinsics, simplify Replace __popc, __ffs with clang intrinsics. Move kmpc_impl_min to the only file that uses it and replace the template with an explicitly typed function. Reviewed By: jdoerfert Differential Revision: https://reviews.llvm.org/D95060 --- openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h | 17 ++++------------- openmp/libomptarget/deviceRTLs/common/src/reduction.cu | 8 +++++--- openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h | 9 ++------- 3 files changed, 11 insertions(+), 23 deletions(-) diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h index d25ea85..b1e9a1a 100644 --- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h +++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h @@ -65,6 +65,10 @@ enum DATA_SHARING_SIZES { DS_Max_Warp_Number = 16, }; +enum : __kmpc_impl_lanemask_t { + __kmpc_impl_all_lanes = ~(__kmpc_impl_lanemask_t)0 +}; + INLINE void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) { lo = (uint32_t)(val & UINT64_C(0x00000000FFFFFFFF)); hi = (uint32_t)((val & UINT64_C(0xFFFFFFFF00000000)) >> 32); @@ -74,28 +78,15 @@ INLINE uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi) { return (((uint64_t)hi) << 32) | (uint64_t)lo; } -enum : __kmpc_impl_lanemask_t { - __kmpc_impl_all_lanes = ~(__kmpc_impl_lanemask_t)0 -}; - DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt(); - DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt(); - DEVICE uint32_t __kmpc_impl_smid(); - DEVICE double __kmpc_impl_get_wtick(); - DEVICE double __kmpc_impl_get_wtime(); INLINE uint64_t __kmpc_impl_ffs(uint64_t x) { return __builtin_ffsl(x); } - INLINE uint64_t __kmpc_impl_popc(uint64_t x) { return 
__builtin_popcountl(x); } -template <typename T> INLINE T __kmpc_impl_min(T x, T y) { - return x < y ? x : y; -} - DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask(); DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t, int32_t Var, diff --git a/openmp/libomptarget/deviceRTLs/common/src/reduction.cu b/openmp/libomptarget/deviceRTLs/common/src/reduction.cu index 92b34d7..3a3c445 100644 --- a/openmp/libomptarget/deviceRTLs/common/src/reduction.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/reduction.cu @@ -184,6 +184,8 @@ INLINE static uint32_t roundToWarpsize(uint32_t s) { return (s & ~(unsigned)(WARPSIZE - 1)); } +INLINE static uint32_t kmpcMin(uint32_t x, uint32_t y) { return x < y ? x : y; } + DEVICE static volatile uint32_t IterCnt = 0; DEVICE static volatile uint32_t Cnt = 0; EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2( @@ -261,14 +263,14 @@ EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2( // by returning 1 in the thread holding the reduction result. // Check if this is the very last team. - unsigned NumRecs = __kmpc_impl_min(NumTeams, uint32_t(num_of_records)); + unsigned NumRecs = kmpcMin(NumTeams, uint32_t(num_of_records)); if (ChunkTeamCount == NumTeams - Bound - 1) { // // Last team processing. // if (ThreadId >= NumRecs) return 0; - NumThreads = roundToWarpsize(__kmpc_impl_min(NumThreads, NumRecs)); + NumThreads = roundToWarpsize(kmpcMin(NumThreads, NumRecs)); if (ThreadId >= NumThreads) return 0; @@ -283,7 +285,7 @@ EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2( // When we have more than [warpsize] number of threads // a block reduction is performed here. 
- uint32_t ActiveThreads = __kmpc_impl_min(NumRecs, NumThreads); + uint32_t ActiveThreads = kmpcMin(NumRecs, NumThreads); if (ActiveThreads > WARPSIZE) { uint32_t WarpsNeeded = (ActiveThreads + WARPSIZE - 1) / WARPSIZE; // Gather all the reduced values from each warp diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h index 8382cd6..ab9fd16 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h @@ -93,13 +93,8 @@ DEVICE uint32_t __kmpc_impl_smid(); DEVICE double __kmpc_impl_get_wtick(); DEVICE double __kmpc_impl_get_wtime(); -INLINE uint32_t __kmpc_impl_ffs(uint32_t x) { return __ffs(x); } - -INLINE uint32_t __kmpc_impl_popc(uint32_t x) { return __popc(x); } - -template <typename T> INLINE T __kmpc_impl_min(T x, T y) { - return min(x, y); -} +INLINE uint32_t __kmpc_impl_ffs(uint32_t x) { return __builtin_ffs(x); } +INLINE uint32_t __kmpc_impl_popc(uint32_t x) { return __builtin_popcount(x); } #ifndef CUDA_VERSION #error CUDA_VERSION macro is undefined, something wrong with cuda. -- 2.7.4