From c03fe73176067a2eb2ed5aca8ec685b6ea931b30 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Fri, 26 Apr 2019 19:30:34 +0000 Subject: [PATCH] [OPENMP][NVPTX]Correctly handle L2 parallelism in SPMD mode. Summary: The parallelLevel counter must be on per-thread basis to fully support L2+ parallelism, otherwise we may end up with undefined behavior. Introduce the parallelLevel on per-warp basis using shared memory. It allows to avoid the problems with the synchronization and allows fully support L2+ parallelism in SPMD mode with no runtime. Reviewers: gtbercea, grokos Subscribers: guansong, jdoerfert, caomhin, kkwli0, openmp-commits Tags: #openmp Differential Revision: https://reviews.llvm.org/D60918 llvm-svn: 359341 --- .../libomptarget/deviceRTLs/nvptx/src/libcall.cu | 2 +- .../libomptarget/deviceRTLs/nvptx/src/omp_data.cu | 2 +- .../deviceRTLs/nvptx/src/omptarget-nvptx.cu | 4 ++- .../deviceRTLs/nvptx/src/omptarget-nvptx.h | 3 +- .../libomptarget/deviceRTLs/nvptx/src/parallel.cu | 22 +++++++++------ openmp/libomptarget/deviceRTLs/nvptx/src/support.h | 2 ++ .../libomptarget/deviceRTLs/nvptx/src/supporti.h | 8 ++++-- .../nvptx/test/parallel/spmd_parallel_regions.cpp | 33 ++++++++++++++-------- 8 files changed, 50 insertions(+), 26 deletions(-) diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/libcall.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/libcall.cu index 9bc3f2c..c3877a8 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/src/libcall.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/libcall.cu @@ -165,7 +165,7 @@ EXTERN int omp_get_level(void) { ASSERT0(LT_FUSSY, isSPMDMode(), "Expected SPMD mode only with uninitialized runtime."); // parallelLevel starts from 0, need to add 1 for correct level. - return parallelLevel + 1; + return parallelLevel[GetWarpId()] + 1; } int level = 0; omptarget_nvptx_TaskDescr *currTaskDescr = diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu index 27666ae..1c2da46 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu @@ -31,7 +31,7 @@ __device__ omptarget_nvptx_SimpleMemoryManager __device__ __shared__ uint32_t usedMemIdx; __device__ __shared__ uint32_t usedSlotIdx; -__device__ __shared__ uint8_t parallelLevel; +__device__ __shared__ uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE]; // Pointer to this team's OpenMP state object __device__ __shared__ diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu index b69a3be..b5cfac3 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu @@ -95,8 +95,10 @@ EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime, // If OMP runtime is not required don't initialize OMP state. setExecutionParameters(Spmd, RuntimeUninitialized); if (GetThreadIdInBlock() == 0) { - parallelLevel = 0; usedSlotIdx = smid() % MAX_SM; + parallelLevel[0] = 0; + } else if (GetLaneId() == 0) { + parallelLevel[GetWarpId()] = 0; } __SYNCTHREADS(); return; diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h index d178b57..8ee69de 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h @@ -406,7 +406,8 @@ extern __device__ omptarget_nvptx_SimpleMemoryManager omptarget_nvptx_simpleMemoryManager; extern __device__ __shared__ uint32_t usedMemIdx; extern __device__ __shared__ uint32_t usedSlotIdx; -extern __device__ __shared__ uint8_t parallelLevel; +extern __device__ __shared__ uint8_t + parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE]; extern __device__ __shared__ omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext; diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu index 273e15c..1ad72d5 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu @@ -339,10 +339,12 @@ EXTERN void __kmpc_serialized_parallel(kmp_Ident *loc, uint32_t global_tid) { if (checkRuntimeUninitialized(loc)) { ASSERT0(LT_FUSSY, checkSPMDMode(loc), "Expected SPMD mode with uninitialized runtime."); - __SYNCTHREADS(); - if (GetThreadIdInBlock() == 0) - ++parallelLevel; - __SYNCTHREADS(); + unsigned tnum = __ACTIVEMASK(); + int leader = __ffs(tnum) - 1; + __SHFL_SYNC(tnum, leader, leader); + if (GetLaneId() == leader) + ++parallelLevel[GetWarpId()]; + __SHFL_SYNC(tnum, leader, leader); return; } @@ -382,10 +384,12 @@ EXTERN void __kmpc_end_serialized_parallel(kmp_Ident *loc, if (checkRuntimeUninitialized(loc)) { ASSERT0(LT_FUSSY, checkSPMDMode(loc), "Expected SPMD mode with uninitialized runtime."); - __SYNCTHREADS(); - if (GetThreadIdInBlock() == 0) - --parallelLevel; - __SYNCTHREADS(); + unsigned tnum = __ACTIVEMASK(); + int leader = __ffs(tnum) - 1; + __SHFL_SYNC(tnum, leader, leader); + if (GetLaneId() == leader) + --parallelLevel[GetWarpId()]; + __SHFL_SYNC(tnum, leader, leader); return; } @@ -407,7 +411,7 @@ EXTERN uint16_t __kmpc_parallel_level(kmp_Ident *loc, uint32_t global_tid) { if (checkRuntimeUninitialized(loc)) { ASSERT0(LT_FUSSY, checkSPMDMode(loc), "Expected SPMD mode with uninitialized runtime."); - return parallelLevel + 1; + return parallelLevel[GetWarpId()] + 1; } int threadId = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/support.h b/openmp/libomptarget/deviceRTLs/nvptx/src/support.h index 5d4b403..f84da6d 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/src/support.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/support.h @@ -40,6 +40,8 @@ INLINE int GetThreadIdInBlock(); INLINE int GetBlockIdInKernel(); INLINE int GetNumberOfBlocksInKernel(); INLINE int GetNumberOfThreadsInBlock(); +INLINE unsigned GetWarpId(); +INLINE unsigned GetLaneId(); // get global ids to locate tread/team info (constant regardless of OMP) INLINE int GetLogicalThreadIdInBlock(bool isSPMDExecutionMode); diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h b/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h index 2052c17..3f313a9 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h @@ -102,6 +102,10 @@ INLINE int GetNumberOfBlocksInKernel() { return gridDim.x; } INLINE int GetNumberOfThreadsInBlock() { return blockDim.x; } +INLINE unsigned GetWarpId() { return threadIdx.x / WARPSIZE; } + +INLINE unsigned GetLaneId() { return threadIdx.x & (WARPSIZE - 1); } + //////////////////////////////////////////////////////////////////////////////// // // Calls to the Generic Scheme Implementation Layer (assuming 1D layout) @@ -154,7 +158,7 @@ INLINE int GetOmpThreadId(int threadId, bool isSPMDExecutionMode, ASSERT0(LT_FUSSY, isSPMDExecutionMode, "Uninitialized runtime with non-SPMD mode."); // For level 2 parallelism all parallel regions are executed sequentially. - if (parallelLevel > 0) + if (parallelLevel[GetWarpId()] > 0) rc = 0; else rc = GetThreadIdInBlock(); @@ -175,7 +179,7 @@ INLINE int GetNumberOfOmpThreads(int threadId, bool isSPMDExecutionMode, ASSERT0(LT_FUSSY, isSPMDExecutionMode, "Uninitialized runtime with non-SPMD mode."); // For level 2 parallelism all parallel regions are executed sequentially. - if (parallelLevel > 0) + if (parallelLevel[GetWarpId()] > 0) rc = 1; else rc = GetNumberOfThreadsInBlock(); diff --git a/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/spmd_parallel_regions.cpp b/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/spmd_parallel_regions.cpp index 3e08d1d..517db59 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/spmd_parallel_regions.cpp +++ b/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/spmd_parallel_regions.cpp @@ -6,24 +6,31 @@ int main(void) { int isHost = -1; int ParallelLevel1 = -1, ParallelLevel2 = -1; + int Count = 0; #pragma omp target parallel for map(tofrom \ - : isHost, ParallelLevel1, ParallelLevel2) + : isHost, ParallelLevel1, ParallelLevel2), reduction(+: Count) schedule(static, 1) for (int J = 0; J < 10; ++J) { #pragma omp critical { - isHost = (isHost < 0 || isHost == omp_is_initial_device()) - ? omp_is_initial_device() - : 1; - ParallelLevel1 = - (ParallelLevel1 < 0 || ParallelLevel1 == 1) ? omp_get_level() : 2; + isHost = (isHost < 0 || isHost == 0) ? omp_is_initial_device() : isHost; + ParallelLevel1 = (ParallelLevel1 < 0 || ParallelLevel1 == 1) + ? omp_get_level() + : ParallelLevel1; } - int L2; -#pragma omp parallel for schedule(dynamic) lastprivate(L2) - for (int I = 0; I < 10; ++I) - L2 = omp_get_level(); + if (omp_get_thread_num() > 5) { + int L2; +#pragma omp parallel for schedule(dynamic) lastprivate(L2) reduction(+: Count) + for (int I = 0; I < 10; ++I) { + L2 = omp_get_level(); + Count += omp_get_level(); // (10-6)*10*2 = 80 + } #pragma omp critical - ParallelLevel2 = (ParallelLevel2 < 0 || ParallelLevel2 == 2) ? L2 : 1; + ParallelLevel2 = + (ParallelLevel2 < 0 || ParallelLevel2 == 2) ? L2 : ParallelLevel2; + } else { + Count += omp_get_level(); // 6 * 1 = 6 + } } if (isHost < 0) { @@ -35,6 +42,10 @@ int main(void) { // CHECK: Parallel level in SPMD mode: L1 is 1, L2 is 2 printf("Parallel level in SPMD mode: L1 is %d, L2 is %d\n", ParallelLevel1, ParallelLevel2); + // Final result of Count is (10-6)(num of loops)*10(num of iterations)*2(par + // level) + 6(num of iterations) * 1(par level) + // CHECK: Expected count = 86 + printf("Expected count = %d\n", Count); return isHost; } -- 2.7.4