From 918a1065be642a3d5f804c95d7971c2d1b96cdf5 Mon Sep 17 00:00:00 2001 From: Jon Chesterfield Date: Thu, 5 Mar 2020 17:05:56 +0000 Subject: [PATCH] [libomptarget][nfc] Move GetWarp/LaneId functions into per arch code Summary: [libomptarget][nfc] Move GetWarp/LaneId functions into per arch code No code change for nvptx. Amdgcn currently has two implementations of GetLaneId, this patch keeps the one a colleague considered to be superior for our ISA. GetWarpId is currently the same function for amdgcn and nvptx, but I think it's cleaner to keep it grouped with all the others than to keep it in support.cu. Reviewers: jdoerfert, grokos, ABataev Reviewed By: jdoerfert Subscribers: jvesely, openmp-commits Tags: #openmp Differential Revision: https://reviews.llvm.org/D75587 --- openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h | 2 ++ .../libomptarget/deviceRTLs/amdgcn/src/target_impl.hip | 16 ++++++++-------- openmp/libomptarget/deviceRTLs/common/src/support.cu | 10 ---------- openmp/libomptarget/deviceRTLs/common/support.h | 4 ---- openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h | 2 ++ 5 files changed, 12 insertions(+), 22 deletions(-) diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h index b3c8923..c2d5e32 100644 --- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h +++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h @@ -127,6 +127,8 @@ INLINE int GetThreadIdInBlock() { return __builtin_amdgcn_workitem_id_x(); } INLINE int GetBlockIdInKernel() { return __builtin_amdgcn_workgroup_id_x(); } DEVICE int GetNumberOfBlocksInKernel(); DEVICE int GetNumberOfThreadsInBlock(); +DEVICE unsigned GetWarpId(); +DEVICE unsigned GetLaneId(); DEVICE bool __kmpc_impl_is_first_active_thread(); diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip index a32bcd8..9807483 100644 --- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip +++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip @@ -14,14 +14,10 @@ // Implementations initially derived from hcc -static DEVICE uint32_t getLaneId(void) { - return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u)); -} - // Initialized with a 64-bit mask with bits set in positions less than the // thread's lane number in the warp DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt() { - uint32_t lane = getLaneId(); + uint32_t lane = GetLaneId(); int64_t ballot = __kmpc_impl_activemask(); uint64_t mask = ((uint64_t)1 << lane) - (uint64_t)1; return mask & ballot; @@ -30,7 +26,7 @@ DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt() { // Initialized with a 64-bit mask with bits set in positions greater than the // thread's lane number in the warp DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt() { - uint32_t lane = getLaneId(); + uint32_t lane = GetLaneId(); if (lane == (WARPSIZE - 1)) return 0; uint64_t ballot = __kmpc_impl_activemask(); @@ -53,14 +49,14 @@ DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask() { DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t, int32_t var, int32_t srcLane) { int width = WARPSIZE; - int self = getLaneId(); + int self = GetLaneId(); int index = srcLane + (self & ~(width - 1)); return __builtin_amdgcn_ds_bpermute(index << 2, var); } DEVICE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t, int32_t var, uint32_t laneDelta, int32_t width) { - int self = getLaneId(); + int self = GetLaneId(); int index = self + laneDelta; index = (int)(laneDelta + (self & (width - 1))) >= width ? self : index; return __builtin_amdgcn_ds_bpermute(index << 2, var); @@ -70,3 +66,7 @@ EXTERN uint64_t __ockl_get_local_size(uint32_t); EXTERN uint64_t __ockl_get_num_groups(uint32_t); DEVICE int GetNumberOfBlocksInKernel() { return __ockl_get_num_groups(0); } DEVICE int GetNumberOfThreadsInBlock() { return __ockl_get_local_size(0); } +DEVICE unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; } +DEVICE unsigned GetLaneId() { + return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u)); +} diff --git a/openmp/libomptarget/deviceRTLs/common/src/support.cu b/openmp/libomptarget/deviceRTLs/common/src/support.cu index d7a0b23..8574751 100644 --- a/openmp/libomptarget/deviceRTLs/common/src/support.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/support.cu @@ -94,16 +94,6 @@ DEVICE bool checkRuntimeInitialized(kmp_Ident *loc) { //////////////////////////////////////////////////////////////////////////////// // -// Calls to the NVPTX layer (assuming 1D layout) -// -//////////////////////////////////////////////////////////////////////////////// - -DEVICE unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; } - -DEVICE unsigned GetLaneId() { return GetThreadIdInBlock() & (WARPSIZE - 1); } - -//////////////////////////////////////////////////////////////////////////////// -// // Calls to the Generic Scheme Implementation Layer (assuming 1D layout) // //////////////////////////////////////////////////////////////////////////////// diff --git a/openmp/libomptarget/deviceRTLs/common/support.h b/openmp/libomptarget/deviceRTLs/common/support.h index d347284..913c4c3 100644 --- a/openmp/libomptarget/deviceRTLs/common/support.h +++ b/openmp/libomptarget/deviceRTLs/common/support.h @@ -50,10 +50,6 @@ DEVICE bool checkRuntimeInitialized(kmp_Ident *loc); // get info from machine //////////////////////////////////////////////////////////////////////////////// -// get low level ids of resources -DEVICE unsigned GetWarpId(); -DEVICE unsigned GetLaneId(); - // get global ids to locate tread/team info (constant regardless of OMP) DEVICE int GetLogicalThreadIdInBlock(bool isSPMDExecutionMode); DEVICE int GetMasterThreadID(); diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h index 4741ce8..04d510b 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h @@ -189,6 +189,8 @@ INLINE int GetThreadIdInBlock() { return threadIdx.x; } INLINE int GetBlockIdInKernel() { return blockIdx.x; } INLINE int GetNumberOfBlocksInKernel() { return gridDim.x; } INLINE int GetNumberOfThreadsInBlock() { return blockDim.x; } +INLINE unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; } +INLINE unsigned GetLaneId() { return GetThreadIdInBlock() & (WARPSIZE - 1); } // Return true if this is the first active thread in the warp. INLINE bool __kmpc_impl_is_first_active_thread() { -- 2.7.4