From d12ee28e2e4c3f551f1a79d27e3f10736f9b97cf Mon Sep 17 00:00:00 2001 From: Johannes Doerfert Date: Thu, 22 Jul 2021 12:18:46 -0500 Subject: [PATCH] [OpenMP] Simplify the ThreadStackTy for globalization fallback With D106496 we can make the globalization fallback stack much simpler and this version doesn't seem to experience the spurious failures and deadlocks we have seen before. Differential Revision: https://reviews.llvm.org/D106576 --- .../deviceRTLs/common/src/data_sharing.cu | 106 ++++++--------------- 1 file changed, 31 insertions(+), 75 deletions(-) diff --git a/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu b/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu index 19702c1f..491c279 100644 --- a/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu @@ -21,114 +21,70 @@ static constexpr unsigned MinBytes = 8; -template +template struct alignas(32) ThreadStackTy { - static constexpr unsigned MaxSize = NThreads * BytesPerThread; + static constexpr unsigned BytesPerThread = BPerThread; static constexpr unsigned NumThreads = NThreads; static constexpr unsigned NumWarps = (NThreads + WARPSIZE - 1) / WARPSIZE; - static constexpr unsigned MaxSizePerWarp = MaxSize / NumWarps; - unsigned char Data[MaxSize]; - char Sizes[MaxSize / MinBytes]; - char SizeUsage[NumWarps]; - char Usage[NumWarps]; + unsigned char Data[NumThreads][BytesPerThread]; + unsigned char Usage[NumThreads]; }; [[clang::loader_uninitialized]] ThreadStackTy MainSharedStack; #pragma omp allocate(MainSharedStack) allocator(omp_pteam_mem_alloc) -[[clang::loader_uninitialized]] ThreadStackTy +[[clang::loader_uninitialized]] ThreadStackTy WorkerSharedStack; #pragma omp allocate(WorkerSharedStack) allocator(omp_pteam_mem_alloc) -template -static void *__kmpc_alloc_for_warp(AllocTy Alloc, unsigned Bytes, - unsigned WarpBytes) { - void *Ptr; - __kmpc_impl_lanemask_t CurActive = __kmpc_impl_activemask(); - unsigned LeaderID = __kmpc_impl_ffs(CurActive) - 1; - bool IsWarpLeader = - (__kmpc_get_hardware_thread_id_in_block() % WARPSIZE) == LeaderID; - if (IsWarpLeader) - Ptr = Alloc(); - // Get address from the first active lane. - int *FP = (int *)&Ptr; - FP[0] = __kmpc_impl_shfl_sync(CurActive, FP[0], LeaderID); - if (sizeof(Ptr) == 8) - FP[1] = __kmpc_impl_shfl_sync(CurActive, FP[1], LeaderID); - return (void *)&((char *)(Ptr))[(GetLaneId() - LeaderID) * Bytes]; -} - EXTERN void *__kmpc_alloc_shared(size_t Bytes) { - Bytes = Bytes + (Bytes % MinBytes); + size_t AlignedBytes = Bytes + (Bytes % MinBytes); int TID = __kmpc_get_hardware_thread_id_in_block(); if (__kmpc_is_generic_main_thread(TID)) { // Main thread alone, use shared memory if space is available. - if (MainSharedStack.Usage[0] + Bytes <= MainSharedStack.MaxSize) { - void *Ptr = &MainSharedStack.Data[MainSharedStack.Usage[0]]; - MainSharedStack.Usage[0] += Bytes; - MainSharedStack.Sizes[MainSharedStack.SizeUsage[0]++] = Bytes; + if (MainSharedStack.Usage[0] + AlignedBytes <= MainSharedStack.BytesPerThread) { + void *Ptr = &MainSharedStack.Data[0][MainSharedStack.Usage[0]]; + MainSharedStack.Usage[0] += AlignedBytes; return Ptr; } - } else { - int WID = GetWarpId(); - unsigned WarpBytes = Bytes * WARPSIZE; - auto AllocSharedStack = [&]() { - unsigned WarpOffset = WID * WorkerSharedStack.MaxSizePerWarp; - void *Ptr = - &WorkerSharedStack.Data[WarpOffset + WorkerSharedStack.Usage[WID]]; - WorkerSharedStack.Usage[WID] += WarpBytes; - WorkerSharedStack.Sizes[WorkerSharedStack.SizeUsage[WID]++] = WarpBytes; + } else if (TID < WorkerSharedStack.NumThreads) { + if (WorkerSharedStack.Usage[TID] + AlignedBytes <= WorkerSharedStack.BytesPerThread) { + void *Ptr = &WorkerSharedStack.Data[TID][WorkerSharedStack.Usage[TID]]; + WorkerSharedStack.Usage[TID] += AlignedBytes; return Ptr; - }; - if (TID < WorkerSharedStack.NumThreads && - WorkerSharedStack.Usage[WID] + WarpBytes <= - WorkerSharedStack.MaxSizePerWarp) - return __kmpc_alloc_for_warp(AllocSharedStack, Bytes, WarpBytes); + } } // Fallback to malloc - unsigned WarpBytes = Bytes * WARPSIZE; - auto AllocGlobal = [&] { - return SafeMalloc(WarpBytes, "AllocGlobalFallback"); - }; - return __kmpc_alloc_for_warp(AllocGlobal, Bytes, WarpBytes); + return SafeMalloc(Bytes, "AllocGlobalFallback"); } -EXTERN void __kmpc_free_shared(void *Ptr, size_t /* Bytes */) { - __kmpc_impl_lanemask_t CurActive = __kmpc_impl_activemask(); - unsigned LeaderID = __kmpc_impl_ffs(CurActive) - 1; - bool IsWarpLeader = - (__kmpc_get_hardware_thread_id_in_block() % WARPSIZE) == LeaderID; - __kmpc_syncwarp(CurActive); - if (IsWarpLeader) { - if (Ptr >= &MainSharedStack.Data[0] && - Ptr < &MainSharedStack.Data[MainSharedStack.MaxSize]) { - unsigned Bytes = MainSharedStack.Sizes[--MainSharedStack.SizeUsage[0]]; - MainSharedStack.Usage[0] -= Bytes; +EXTERN void __kmpc_free_shared(void *Ptr, size_t Bytes) { + size_t AlignedBytes = Bytes + (Bytes % MinBytes); + int TID = __kmpc_get_hardware_thread_id_in_block(); + if (__kmpc_is_generic_main_thread(TID)) { + if (Ptr >= &MainSharedStack.Data[0][0] && + Ptr < &MainSharedStack.Data[MainSharedStack.NumThreads][0]) { + MainSharedStack.Usage[0] -= AlignedBytes; return; } - if (Ptr >= &WorkerSharedStack.Data[0] && - Ptr < &WorkerSharedStack.Data[WorkerSharedStack.MaxSize]) { - int WID = GetWarpId(); - unsigned Bytes = - WorkerSharedStack.Sizes[--WorkerSharedStack.SizeUsage[WID]]; - WorkerSharedStack.Usage[WID] -= Bytes; + } else if (TID < WorkerSharedStack.NumThreads) { + if (Ptr >= &WorkerSharedStack.Data[0][0] && + Ptr < &WorkerSharedStack.Data[WorkerSharedStack.NumThreads][0]) { + int TID = __kmpc_get_hardware_thread_id_in_block(); + WorkerSharedStack.Usage[TID] -= AlignedBytes; return; } - SafeFree(Ptr, "FreeGlobalFallback"); } + SafeFree(Ptr, "FreeGlobalFallback"); } EXTERN void __kmpc_data_sharing_init_stack() { - for (unsigned i = 0; i < MainSharedStack.NumWarps; ++i) { - MainSharedStack.SizeUsage[i] = 0; + for (unsigned i = 0; i < MainSharedStack.NumWarps; ++i) MainSharedStack.Usage[i] = 0; - } - for (unsigned i = 0; i < WorkerSharedStack.NumWarps; ++i) { - WorkerSharedStack.SizeUsage[i] = 0; + for (unsigned i = 0; i < WorkerSharedStack.NumThreads; ++i) WorkerSharedStack.Usage[i] = 0; - } } /// Allocate storage in shared memory to communicate arguments from the main -- 2.7.4