From 5a682d9b9109494cb46e16b493cf4afc5e25e598 Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Wed, 21 Jul 2021 00:18:38 -0700 Subject: [PATCH] [OpenMP] Expose libomptarget function to get HW thread id The patch exposes the libomptarget runtime function that gets the hardware thread id through the kmpc API. This is to be used in SPMDization for checking the thread id to execute regions by a single thread in a block. Reviewed By: jdoerfert Differential Revision: https://reviews.llvm.org/D106323 --- llvm/include/llvm/Frontend/OpenMP/OMPKinds.def | 1 + .../deviceRTLs/amdgcn/src/target_impl.hip | 4 ++-- openmp/libomptarget/deviceRTLs/common/debug.h | 10 ++++++---- openmp/libomptarget/deviceRTLs/common/omptargeti.h | 21 +++++++++++---------- .../deviceRTLs/common/src/data_sharing.cu | 18 +++++++++++------- .../libomptarget/deviceRTLs/common/src/libcall.cu | 2 +- .../libomptarget/deviceRTLs/common/src/omptarget.cu | 10 +++++----- .../libomptarget/deviceRTLs/common/src/parallel.cu | 7 ++++--- .../libomptarget/deviceRTLs/common/src/reduction.cu | 18 ++++++++++-------- .../libomptarget/deviceRTLs/common/src/support.cu | 4 ++-- .../deviceRTLs/nvptx/src/target_impl.cu | 14 ++++++++++---- openmp/libomptarget/deviceRTLs/target_interface.h | 2 +- 12 files changed, 64 insertions(+), 47 deletions(-) diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def index f227fc6..93ab3f6 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def +++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def @@ -195,6 +195,7 @@ __OMP_RTL(__kmpc_cancel, false, Int32, IdentPtr, Int32, Int32) __OMP_RTL(__kmpc_cancel_barrier, false, Int32, IdentPtr, Int32) __OMP_RTL(__kmpc_flush, false, Void, IdentPtr) __OMP_RTL(__kmpc_global_thread_num, false, Int32, IdentPtr) +__OMP_RTL(__kmpc_get_hardware_thread_id_in_block, false, Int32, ) __OMP_RTL(__kmpc_fork_call, true, Void, IdentPtr, Int32, ParallelTaskPtr) __OMP_RTL(__kmpc_omp_taskwait, false, Int32, IdentPtr, Int32) __OMP_RTL(__kmpc_omp_taskyield, false, Int32, IdentPtr, Int32, /* Int */ Int32) diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip index 8398b4d..2bbeab7 100644 --- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip +++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip @@ -133,7 +133,7 @@ EXTERN int GetNumberOfThreadsInBlock() { __builtin_amdgcn_workgroup_size_x()); } -EXTERN unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; } +EXTERN unsigned GetWarpId() { return __kmpc_get_hardware_thread_id_in_block() / WARPSIZE; } EXTERN unsigned GetWarpSize() { return WARPSIZE; } EXTERN unsigned GetLaneId() { return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u)); @@ -212,7 +212,7 @@ EXTERN void __kmpc_impl_threadfence_system() { } // Calls to the AMDGCN layer (assuming 1D layout) -EXTERN int GetThreadIdInBlock() { return __builtin_amdgcn_workitem_id_x(); } +EXTERN int __kmpc_get_hardware_thread_id_in_block() { return __builtin_amdgcn_workitem_id_x(); } EXTERN int GetBlockIdInKernel() { return __builtin_amdgcn_workgroup_id_x(); } #pragma omp end declare target diff --git a/openmp/libomptarget/deviceRTLs/common/debug.h b/openmp/libomptarget/deviceRTLs/common/debug.h index 3b2895e..99c9b6c 100644 --- a/openmp/libomptarget/deviceRTLs/common/debug.h +++ b/openmp/libomptarget/deviceRTLs/common/debug.h @@ -132,8 +132,9 @@ template NOINLINE static void log(const char *fmt, Arguments... parameters) { - printf(fmt, (int)GetBlockIdInKernel(), (int)GetThreadIdInBlock(), - (int)GetWarpId(), (int)GetLaneId(), parameters...); + printf(fmt, (int)GetBlockIdInKernel(), + (int)__kmpc_get_hardware_thread_id_in_block(), (int)GetWarpId(), + (int)GetLaneId(), parameters...); } #endif @@ -143,8 +144,9 @@ template NOINLINE static void check(bool cond, const char *fmt, Arguments... parameters) { if (!cond) { - printf(fmt, (int)GetBlockIdInKernel(), (int)GetThreadIdInBlock(), - (int)GetWarpId(), (int)GetLaneId(), parameters...); + printf(fmt, (int)GetBlockIdInKernel(), + (int)__kmpc_get_hardware_thread_id_in_block(), (int)GetWarpId(), + (int)GetLaneId(), parameters...); __builtin_trap(); } } diff --git a/openmp/libomptarget/deviceRTLs/common/omptargeti.h b/openmp/libomptarget/deviceRTLs/common/omptargeti.h index 02feaf5..485e30c 100644 --- a/openmp/libomptarget/deviceRTLs/common/omptargeti.h +++ b/openmp/libomptarget/deviceRTLs/common/omptargeti.h @@ -54,7 +54,8 @@ INLINE void omptarget_nvptx_TaskDescr::InitLevelOneTaskDescr( items.flags = TaskDescr_InPar | TaskDescr_IsParConstr; // set flag to parallel items.threadId = - GetThreadIdInBlock(); // get ids from cuda (only called for 1st level) + __kmpc_get_hardware_thread_id_in_block(); // get ids from cuda (only + // called for 1st level) items.runtimeChunkSize = 1; // preferred chunking statik with chunk 1 prev = parentTaskDescr; } @@ -97,16 +98,16 @@ INLINE void omptarget_nvptx_TaskDescr::CopyFromWorkDescr( // // overwrite specific items; // - // The threadID should be GetThreadIdInBlock() % GetMasterThreadID(). - // This is so that the serial master (first lane in the master warp) - // gets a threadId of 0. - // However, we know that this function is always called in a parallel - // region where only workers are active. The serial master thread - // never enters this region. When a parallel region is executed serially, - // the threadId is set to 0 elsewhere and the kmpc_serialized_* functions - // are called, which never activate this region. + // The threadID should be __kmpc_get_hardware_thread_id_in_block() % + // GetMasterThreadID(). This is so that the serial master (first lane in the + // master warp) gets a threadId of 0. However, we know that this function is + // always called in a parallel region where only workers are active. The + // serial master thread never enters this region. When a parallel region is + // executed serially, the threadId is set to 0 elsewhere and the + // kmpc_serialized_* functions are called, which never activate this region. items.threadId = - GetThreadIdInBlock(); // get ids from cuda (only called for 1st level) + __kmpc_get_hardware_thread_id_in_block(); // get ids from cuda (only + // called for 1st level) } INLINE void omptarget_nvptx_TaskDescr::CopyConvergentParent( diff --git a/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu b/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu index 445e8c1..d8bb7ea 100644 --- a/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu @@ -48,7 +48,8 @@ static void *__kmpc_alloc_for_warp(AllocTy Alloc, unsigned Bytes, void *Ptr; __kmpc_impl_lanemask_t CurActive = __kmpc_impl_activemask(); unsigned LeaderID = __kmpc_impl_ffs(CurActive) - 1; - bool IsWarpLeader = (GetThreadIdInBlock() % WARPSIZE) == LeaderID; + bool IsWarpLeader = + (__kmpc_get_hardware_thread_id_in_block() % WARPSIZE) == LeaderID; if (IsWarpLeader) Ptr = Alloc(); // Get address from the first active lane. @@ -61,7 +62,7 @@ static void *__kmpc_alloc_for_warp(AllocTy Alloc, unsigned Bytes, EXTERN void *__kmpc_alloc_shared(size_t Bytes) { Bytes = Bytes + (Bytes % MinBytes); - int TID = GetThreadIdInBlock(); + int TID = __kmpc_get_hardware_thread_id_in_block(); if (__kmpc_is_generic_main_thread(TID)) { // Main thread alone, use shared memory if space is available. if (MainSharedStack.Usage[0] + Bytes <= MainSharedStack.MaxSize) { @@ -97,7 +98,8 @@ EXTERN void *__kmpc_alloc_shared(size_t Bytes) { EXTERN void __kmpc_free_shared(void *Ptr) { __kmpc_impl_lanemask_t CurActive = __kmpc_impl_activemask(); unsigned LeaderID = __kmpc_impl_ffs(CurActive) - 1; - bool IsWarpLeader = (GetThreadIdInBlock() % WARPSIZE) == LeaderID; + bool IsWarpLeader = + (__kmpc_get_hardware_thread_id_in_block() % WARPSIZE) == LeaderID; __kmpc_syncwarp(CurActive); if (IsWarpLeader) { if (Ptr >= &MainSharedStack.Data[0] && @@ -190,13 +192,14 @@ EXTERN void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode, return; } if (isSPMDExecutionMode) { - if (GetThreadIdInBlock() == 0) { + if (__kmpc_get_hardware_thread_id_in_block() == 0) { *frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size); } __kmpc_impl_syncthreads(); return; } - ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(), + ASSERT0(LT_FUSSY, + __kmpc_get_hardware_thread_id_in_block() == GetMasterThreadID(), "Must be called only in the target master thread."); *frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size); __kmpc_impl_threadfence(); @@ -208,13 +211,14 @@ EXTERN void __kmpc_restore_team_static_memory(int16_t isSPMDExecutionMode, return; if (isSPMDExecutionMode) { __kmpc_impl_syncthreads(); - if (GetThreadIdInBlock() == 0) { + if (__kmpc_get_hardware_thread_id_in_block() == 0) { omptarget_nvptx_simpleMemoryManager.Release(); } return; } __kmpc_impl_threadfence(); - ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(), + ASSERT0(LT_FUSSY, + __kmpc_get_hardware_thread_id_in_block() == GetMasterThreadID(), "Must be called only in the target master thread."); omptarget_nvptx_simpleMemoryManager.Release(); } diff --git a/openmp/libomptarget/deviceRTLs/common/src/libcall.cu b/openmp/libomptarget/deviceRTLs/common/src/libcall.cu index 49d6d9f..9c62b5b 100644 --- a/openmp/libomptarget/deviceRTLs/common/src/libcall.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/libcall.cu @@ -142,7 +142,7 @@ EXTERN int omp_get_active_level(void) { EXTERN int omp_get_ancestor_thread_num(int level) { if (__kmpc_is_spmd_exec_mode()) - return level == 1 ? GetThreadIdInBlock() : 0; + return level == 1 ? __kmpc_get_hardware_thread_id_in_block() : 0; int rc = -1; // If level is 0 or all parallel regions are not active - return 0. unsigned parLevel = parallelLevel[GetWarpId()]; diff --git a/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu b/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu index 2160854..3b620de 100644 --- a/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu @@ -34,7 +34,7 @@ static void __kmpc_generic_kernel_init() { if (GetLaneId() == 0) parallelLevel[GetWarpId()] = 0; - int threadIdInBlock = GetThreadIdInBlock(); + int threadIdInBlock = __kmpc_get_hardware_thread_id_in_block(); if (threadIdInBlock != GetMasterThreadID()) return; @@ -87,7 +87,7 @@ static void __kmpc_spmd_kernel_init(bool RequiresFullRuntime) { setExecutionParameters(Spmd, RequiresFullRuntime ? RuntimeInitialized : RuntimeUninitialized); - int threadId = GetThreadIdInBlock(); + int threadId = __kmpc_get_hardware_thread_id_in_block(); if (threadId == 0) { usedSlotIdx = __kmpc_impl_smid() % MAX_SM; } @@ -147,7 +147,7 @@ static void __kmpc_spmd_kernel_deinit(bool RequiresFullRuntime) { return; __kmpc_impl_syncthreads(); - int threadId = GetThreadIdInBlock(); + int threadId = __kmpc_get_hardware_thread_id_in_block(); if (threadId == 0) { // Enqueue omp state object for use by another team. int slot = usedSlotIdx; @@ -169,7 +169,7 @@ EXTERN bool __kmpc_kernel_parallel(void**WorkFn); static void __kmpc_target_region_state_machine(ident_t *Ident) { - int TId = GetThreadIdInBlock(); + int TId = __kmpc_get_hardware_thread_id_in_block(); do { void* WorkFn = 0; @@ -199,7 +199,7 @@ EXTERN int32_t __kmpc_target_init(ident_t *Ident, bool IsSPMD, bool UseGenericStateMachine, bool RequiresFullRuntime) { - int TId = GetThreadIdInBlock(); + int TId = __kmpc_get_hardware_thread_id_in_block(); if (IsSPMD) __kmpc_spmd_kernel_init(RequiresFullRuntime); else diff --git a/openmp/libomptarget/deviceRTLs/common/src/parallel.cu b/openmp/libomptarget/deviceRTLs/common/src/parallel.cu index f30a5b5..8398163 100644 --- a/openmp/libomptarget/deviceRTLs/common/src/parallel.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/parallel.cu @@ -105,7 +105,8 @@ EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn) { ASSERT(LT_FUSSY, NumThreads > 0, "bad thread request of %d threads", (int)NumThreads); - ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(), + ASSERT0(LT_FUSSY, + __kmpc_get_hardware_thread_id_in_block() == GetMasterThreadID(), "only team master can create parallel"); // Set number of threads on work descriptor. @@ -133,7 +134,7 @@ EXTERN bool __kmpc_kernel_parallel(void **WorkFn) { // Only the worker threads call this routine and the master warp // never arrives here. Therefore, use the nvptx thread id. - int threadId = GetThreadIdInBlock(); + int threadId = __kmpc_get_hardware_thread_id_in_block(); omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor(); // Set to true for workers participating in the parallel region. bool isActive = false; @@ -166,7 +167,7 @@ EXTERN void __kmpc_kernel_end_parallel() { // Only the worker threads call this routine and the master warp // never arrives here. Therefore, use the nvptx thread id. - int threadId = GetThreadIdInBlock(); + int threadId = __kmpc_get_hardware_thread_id_in_block(); omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId); omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr( threadId, currTaskDescr->GetPrevTaskDescr()); diff --git a/openmp/libomptarget/deviceRTLs/common/src/reduction.cu b/openmp/libomptarget/deviceRTLs/common/src/reduction.cu index 6c02790..3a658f5 100644 --- a/openmp/libomptarget/deviceRTLs/common/src/reduction.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/reduction.cu @@ -47,7 +47,7 @@ INLINE static void gpu_irregular_warp_reduce(void *reduce_data, INLINE static uint32_t gpu_irregular_simd_reduce(void *reduce_data, kmp_ShuffleReductFctPtr shflFct) { uint32_t size, remote_id, physical_lane_id; - physical_lane_id = GetThreadIdInBlock() % WARPSIZE; + physical_lane_id = __kmpc_get_hardware_thread_id_in_block() % WARPSIZE; __kmpc_impl_lanemask_t lanemask_lt = __kmpc_impl_lanemask_lt(); __kmpc_impl_lanemask_t Liveness = __kmpc_impl_activemask(); uint32_t logical_lane_id = __kmpc_impl_popc(Liveness & lanemask_lt) * 2; @@ -95,9 +95,10 @@ static int32_t nvptx_parallel_reduce_nowait( if ((NumThreads % WARPSIZE == 0) || (WarpId < WarpsNeeded - 1)) gpu_regular_warp_reduce(reduce_data, shflFct); else if (NumThreads > 1) // Only SPMD execution mode comes thru this case. - gpu_irregular_warp_reduce(reduce_data, shflFct, - /*LaneCount=*/NumThreads % WARPSIZE, - /*LaneId=*/GetThreadIdInBlock() % WARPSIZE); + gpu_irregular_warp_reduce( + reduce_data, shflFct, + /*LaneCount=*/NumThreads % WARPSIZE, + /*LaneId=*/__kmpc_get_hardware_thread_id_in_block() % WARPSIZE); // When we have more than [warpsize] number of threads // a block reduction is performed here. @@ -118,9 +119,10 @@ static int32_t nvptx_parallel_reduce_nowait( if (Liveness == __kmpc_impl_all_lanes) // Full warp gpu_regular_warp_reduce(reduce_data, shflFct); else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes - gpu_irregular_warp_reduce(reduce_data, shflFct, - /*LaneCount=*/__kmpc_impl_popc(Liveness), - /*LaneId=*/GetThreadIdInBlock() % WARPSIZE); + gpu_irregular_warp_reduce( + reduce_data, shflFct, + /*LaneCount=*/__kmpc_impl_popc(Liveness), + /*LaneId=*/__kmpc_get_hardware_thread_id_in_block() % WARPSIZE); else if (!isRuntimeUninitialized) // Dispersed lanes. Only threads in L2 // parallel region may enter here; return // early. @@ -185,7 +187,7 @@ EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2( // Terminate all threads in non-SPMD mode except for the master thread. if (!__kmpc_is_spmd_exec_mode() && - !__kmpc_is_generic_main_thread(GetThreadIdInBlock())) + !__kmpc_is_generic_main_thread(__kmpc_get_hardware_thread_id_in_block())) return 0; uint32_t ThreadId = GetLogicalThreadIdInBlock(); diff --git a/openmp/libomptarget/deviceRTLs/common/src/support.cu b/openmp/libomptarget/deviceRTLs/common/src/support.cu index d78f94c..3d93ea0 100644 --- a/openmp/libomptarget/deviceRTLs/common/src/support.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/support.cu @@ -70,7 +70,7 @@ int GetNumberOfWorkersInTeam() { return GetMasterThreadID(); } int GetLogicalThreadIdInBlock() { // Implemented using control flow (predication) instead of with a modulo // operation. - int tid = GetThreadIdInBlock(); + int tid = __kmpc_get_hardware_thread_id_in_block(); if (__kmpc_is_generic_main_thread(tid)) return 0; else @@ -84,7 +84,7 @@ int GetLogicalThreadIdInBlock() { //////////////////////////////////////////////////////////////////////////////// int GetOmpThreadId() { - int tid = GetThreadIdInBlock(); + int tid = __kmpc_get_hardware_thread_id_in_block(); if (__kmpc_is_generic_main_thread(tid)) return 0; // omp_thread_num diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu index 35324f0..c1b4007 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu @@ -60,7 +60,7 @@ EXTERN __kmpc_impl_lanemask_t __kmpc_impl_activemask() { return Mask; } -EXTERN void __kmpc_impl_syncthreads() { +EXTERN void __kmpc_impl_syncthreads() { int barrier = 2; asm volatile("barrier.sync %0;" : @@ -92,15 +92,21 @@ EXTERN void __kmpc_impl_threadfence_block() { __nvvm_membar_cta(); } EXTERN void __kmpc_impl_threadfence_system() { __nvvm_membar_sys(); } // Calls to the NVPTX layer (assuming 1D layout) -EXTERN int GetThreadIdInBlock() { return __nvvm_read_ptx_sreg_tid_x(); } +EXTERN int __kmpc_get_hardware_thread_id_in_block() { + return __nvvm_read_ptx_sreg_tid_x(); +} EXTERN int GetBlockIdInKernel() { return __nvvm_read_ptx_sreg_ctaid_x(); } EXTERN int GetNumberOfBlocksInKernel() { return __nvvm_read_ptx_sreg_nctaid_x(); } EXTERN int GetNumberOfThreadsInBlock() { return __nvvm_read_ptx_sreg_ntid_x(); } -EXTERN unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; } +EXTERN unsigned GetWarpId() { + return __kmpc_get_hardware_thread_id_in_block() / WARPSIZE; +} EXTERN unsigned GetWarpSize() { return WARPSIZE; } -EXTERN unsigned GetLaneId() { return GetThreadIdInBlock() & (WARPSIZE - 1); } +EXTERN unsigned GetLaneId() { + return __kmpc_get_hardware_thread_id_in_block() & (WARPSIZE - 1); +} // Atomics uint32_t __kmpc_atomic_add(uint32_t *Address, uint32_t Val) { diff --git a/openmp/libomptarget/deviceRTLs/target_interface.h b/openmp/libomptarget/deviceRTLs/target_interface.h index c5141c9..7e6ae0d 100644 --- a/openmp/libomptarget/deviceRTLs/target_interface.h +++ b/openmp/libomptarget/deviceRTLs/target_interface.h @@ -16,7 +16,7 @@ #include "target_impl.h" // Calls to the NVPTX layer (assuming 1D layout) -EXTERN int GetThreadIdInBlock(); +EXTERN int __kmpc_get_hardware_thread_id_in_block(); EXTERN int GetBlockIdInKernel(); EXTERN int GetNumberOfBlocksInKernel(); EXTERN int GetNumberOfThreadsInBlock(); -- 2.7.4