[OPENMP][NVPTX]Fix barriers and parallel level counters, NFC.
author Alexey Bataev <a.bataev@hotmail.com>
Wed, 22 May 2019 19:50:32 +0000 (19:50 +0000)
committer Alexey Bataev <a.bataev@hotmail.com>
Wed, 22 May 2019 19:50:32 +0000 (19:50 +0000)
Summary:
The parallel level counter should be volatile to prevent some dangerous
optimizations by ptxas. Otherwise, ptxas optimizations lead to
undefined behaviour in some cases.
Also, use __threadfence() for #pragma omp flush. In addition, when the
barrier is not needed (there is only one thread in the team), still
perform a flush operation, since the standard requires an implicit flush
when executing barriers.

Reviewers: gtbercea, kkwli0, grokos

Subscribers: guansong, jfb, jdoerfert, openmp-commits, caomhin

Tags: #openmp

Differential Revision: https://reviews.llvm.org/D62199

llvm-svn: 361421

openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu
openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
openmp/libomptarget/deviceRTLs/nvptx/src/sync.cu

index d369da1cb7e73105d4b8b623c8bf47c8cb19aebb..0cd9b57fd7cf5135dc784a52e3a81b46d0966d67 100644 (file)
@@ -31,7 +31,8 @@ __device__ omptarget_nvptx_SimpleMemoryManager
 __device__ __shared__ uint32_t usedMemIdx;
 __device__ __shared__ uint32_t usedSlotIdx;
 
-__device__ __shared__ uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE];
+__device__ __shared__ volatile uint8_t
+    parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE];
 __device__ __shared__ uint16_t threadLimit;
 __device__ __shared__ uint16_t threadsInTeam;
 __device__ __shared__ uint16_t nThreads;
index cd51538ad795ca63563018ee411e99de9ec3dd55..b85d0a750f2a9c94614be8c6771c3ea7000df1b8 100644 (file)
@@ -398,7 +398,7 @@ extern __device__ omptarget_nvptx_SimpleMemoryManager
     omptarget_nvptx_simpleMemoryManager;
 extern __device__ __shared__ uint32_t usedMemIdx;
 extern __device__ __shared__ uint32_t usedSlotIdx;
-extern __device__ __shared__ uint8_t
+extern __device__ __shared__ volatile uint8_t
     parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE];
 extern __device__ __shared__ uint16_t threadLimit;
 extern __device__ __shared__ uint16_t threadsInTeam;
index d81aa8f0f3accfa033d3f34294f4e2c09c238505..191b046c9f46f200b2939d636ba6e5bcea0f3025 100644 (file)
@@ -62,6 +62,8 @@ EXTERN void __kmpc_barrier(kmp_Ident *loc_ref, int32_t tid) {
         // Barrier #1 is for synchronization among active threads.
         named_sync(L1_BARRIER, threads);
       }
+    } else {
+      __kmpc_flush(loc_ref);
     } // numberOfActiveOMPThreads > 1
     PRINT0(LD_SYNC, "completed kmpc_barrier\n");
   }
@@ -130,7 +132,7 @@ EXTERN void __kmpc_end_single(kmp_Ident *loc, int32_t global_tid) {
 
 EXTERN void __kmpc_flush(kmp_Ident *loc) {
   PRINT0(LD_IO, "call kmpc_flush\n");
-  __threadfence_system();
+  __threadfence();
 }
 
 ////////////////////////////////////////////////////////////////////////////////