[OPENMP][NVPTX]Use __syncwarp() to reconverge the threads.

author Alexey Bataev <a.bataev@hotmail.com>

Fri, 23 Aug 2019 18:34:48 +0000 (18:34 +0000)

committer Alexey Bataev <a.bataev@hotmail.com>

Fri, 23 Aug 2019 18:34:48 +0000 (18:34 +0000)
author Alexey Bataev <a.bataev@hotmail.com>
Fri, 23 Aug 2019 18:34:48 +0000 (18:34 +0000)
committer Alexey Bataev <a.bataev@hotmail.com>
Fri, 23 Aug 2019 18:34:48 +0000 (18:34 +0000)
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h

index f28284ded6b66cc0a833d909e1a248904b511807..a5e4a71bdf3a9047f64601fd45281e5f720f204b 100644 (file)
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
@@ -55,11 +55,14 @@
  #define __SHFL_DOWN_SYNC(mask, var, delta, width)                              \
    __shfl_down_sync((mask), (var), (delta), (width))
  #define __ACTIVEMASK() __activemask()
+#define __SYNCWARP(Mask) __syncwarp(Mask)
  #else
  #define __SHFL_SYNC(mask, var, srcLane) __shfl((var), (srcLane))
  #define __SHFL_DOWN_SYNC(mask, var, delta, width)                              \
    __shfl_down((var), (delta), (width))
  #define __ACTIVEMASK() __ballot(1)
+// In Cuda < 9.0 no need to sync threads in warps.
+#define __SYNCWARP(Mask)
  #endif // CUDA_VERSION
  
  #define __SYNCTHREADS_N(n) asm volatile("bar.sync %0;" : : "r"(n) : "memory");
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h b/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h

index ceb395153f1fb71b78d39205a9febfa3c8ac5938..ceed7d3f7c81e79cb535b2f9f149c668d7b9e669 100644 (file)
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h
@@ -202,25 +202,31 @@ INLINE int IsTeamMaster(int ompThreadId) { return (ompThreadId == 0); }
  // Parallel level
  
  INLINE void IncParallelLevel(bool ActiveParallel) {
-  unsigned tnum = __ACTIVEMASK();
-  int leader = __ffs(tnum) - 1;
-  __SHFL_SYNC(tnum, leader, leader);
-  if (GetLaneId() == leader) {
+  unsigned Active = __ACTIVEMASK();
+  __SYNCWARP(Active);
+  unsigned LaneMaskLt;
+  asm("mov.u32 %0, %%lanemask_lt;" : "=r"(LaneMaskLt));
+  unsigned Rank = __popc(Active & LaneMaskLt);
+  if (Rank == 0) {
      parallelLevel[GetWarpId()] +=
          (1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0));
+    __threadfence();
    }
-  __SHFL_SYNC(tnum, leader, leader);
+  __SYNCWARP(Active);
  }
  
  INLINE void DecParallelLevel(bool ActiveParallel) {
-  unsigned tnum = __ACTIVEMASK();
-  int leader = __ffs(tnum) - 1;
-  __SHFL_SYNC(tnum, leader, leader);
-  if (GetLaneId() == leader) {
+  unsigned Active = __ACTIVEMASK();
+  __SYNCWARP(Active);
+  unsigned LaneMaskLt;
+  asm("mov.u32 %0, %%lanemask_lt;" : "=r"(LaneMaskLt));
+  unsigned Rank = __popc(Active & LaneMaskLt);
+  if (Rank == 0) {
      parallelLevel[GetWarpId()] -=
          (1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0));
+    __threadfence();
    }
-  __SHFL_SYNC(tnum, leader, leader);
+  __SYNCWARP(Active);
  }
  
  ////////////////////////////////////////////////////////////////////////////////
author	Alexey Bataev <a.bataev@hotmail.com>
	Fri, 23 Aug 2019 18:34:48 +0000 (18:34 +0000)
committer	Alexey Bataev <a.bataev@hotmail.com>
	Fri, 23 Aug 2019 18:34:48 +0000 (18:34 +0000)
openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h		patch | blob | history
openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h		patch | blob | history