From: Alexey Bataev Date: Fri, 3 May 2019 20:00:38 +0000 (+0000) Subject: [OPENMP][NVPTX]Improve thread limit counter, NFC. X-Git-Tag: llvmorg-10-init~6317 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=a857e310115cd21f187c028777c582e3089cf722;p=platform%2Fupstream%2Fllvm.git [OPENMP][NVPTX]Improve thread limit counter, NFC. Summary: Patch improves performance of the full runtime mode by moving thread-limit counter to the shared memory. It also allows to save global memory. Reviewers: grokos, gtbercea, kkwli0 Subscribers: guansong, jdoerfert, caomhin, openmp-commits Tags: #openmp Differential Revision: https://reviews.llvm.org/D61526 llvm-svn: 359922 --- diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/libcall.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/libcall.cu index 681b819..ae6f83f 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/src/libcall.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/libcall.cu @@ -70,10 +70,7 @@ EXTERN int omp_get_max_threads(void) { EXTERN int omp_get_thread_limit(void) { if (isSPMDMode()) return GetNumberOfThreadsInBlock(); - // per contention group.. meaning threads in current team - omptarget_nvptx_TaskDescr *currTaskDescr = - getMyTopTaskDescriptor(/*isSPMDExecutionMode=*/false); - int rc = currTaskDescr->ThreadLimit(); + int rc = threadLimit; PRINT(LD_IO, "call omp_get_thread_limit() return %d\n", rc); return rc; } diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu index 1c2da46..5d6a1a3 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu @@ -32,7 +32,7 @@ __device__ __shared__ uint32_t usedMemIdx; __device__ __shared__ uint32_t usedSlotIdx; __device__ __shared__ uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE]; - +__device__ __shared__ uint16_t threadLimit; // Pointer to this team's OpenMP state object __device__ __shared__ omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext; diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu index dc41a80..3c9141c 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu @@ -74,7 +74,7 @@ EXTERN void __kmpc_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime) { omptarget_nvptx_TaskDescr *currTaskDescr = omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId); currTaskDescr->NThreads() = GetNumberOfWorkersInTeam(); - currTaskDescr->ThreadLimit() = ThreadLimit; + threadLimit = ThreadLimit; } EXTERN void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized) { @@ -139,7 +139,6 @@ EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime, ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr"); newTaskDescr->InitLevelOneTaskDescr(ThreadLimit, currTeamDescr.LevelZeroTaskDescr()); - newTaskDescr->ThreadLimit() = ThreadLimit; // install new top descriptor omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId, newTaskDescr); diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h index 8ee69de..f55caab 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h @@ -165,7 +165,6 @@ public: INLINE int IsTaskConstruct() const { return !IsParallelConstruct(); } // methods for other fields INLINE uint16_t &NThreads() { return items.nthreads; } - INLINE uint16_t &ThreadLimit() { return items.threadlimit; } INLINE uint16_t &ThreadId() { return items.threadId; } INLINE uint16_t &ThreadsInTeam() { return items.threadsInTeam; } INLINE uint64_t &RuntimeChunkSize() { return items.runtimeChunkSize; } @@ -213,7 +212,6 @@ private: uint8_t flags; // 6 bit used (see flag above) uint8_t unused; uint16_t nthreads; // thread num for subsequent parallel regions - uint16_t threadlimit; // thread limit ICV uint16_t threadId; // thread id uint16_t threadsInTeam; // threads in current team uint64_t runtimeChunkSize; // runtime chunk size @@ -408,6 +406,7 @@ extern __device__ __shared__ uint32_t usedMemIdx; extern __device__ __shared__ uint32_t usedSlotIdx; extern __device__ __shared__ uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE]; +extern __device__ __shared__ uint16_t threadLimit; extern __device__ __shared__ omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext; diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu index 45a6758..6bb61b1 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu @@ -249,9 +249,8 @@ EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn, uint16_t &NumThreadsClause = omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId); - uint16_t NumThreads = - determineNumberOfThreads(NumThreadsClause, currTaskDescr->NThreads(), - currTaskDescr->ThreadLimit()); + uint16_t NumThreads = determineNumberOfThreads( + NumThreadsClause, currTaskDescr->NThreads(), threadLimit); if (NumThreadsClause != 0) { // Reset request to avoid propagating to successive #parallel