[OPENMP][NVPTX] Add support for lastprivates/reductions handling in SPMD constructs...

author Alexey Bataev <a.bataev@hotmail.com>

Fri, 21 Sep 2018 14:11:41 +0000 (14:11 +0000)

committer Alexey Bataev <a.bataev@hotmail.com>

Fri, 21 Sep 2018 14:11:41 +0000 (14:11 +0000)
author Alexey Bataev <a.bataev@hotmail.com>
Fri, 21 Sep 2018 14:11:41 +0000 (14:11 +0000)
committer Alexey Bataev <a.bataev@hotmail.com>
Fri, 21 Sep 2018 14:11:41 +0000 (14:11 +0000)
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu

index 4a28a7c..2b3a90f 100644 (file)
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
@@ -378,6 +378,12 @@ EXTERN void __kmpc_data_sharing_init_stack_spmd() {
  // as long as the size requested fits the pre-allocated size.
  EXTERN void* __kmpc_data_sharing_push_stack(size_t DataSize,
      int16_t UseSharedMemory) {
+  if (isRuntimeUninitialized()) {
+    ASSERT0(LT_FUSSY, isSPMDMode(),
+            "Expected SPMD mode with uninitialized runtime.");
+    return omptarget_nvptx_SimpleThreadPrivateContext::Allocate(DataSize);
+  }
+
    // Frame pointer must be visible to all workers in the same warp.
    unsigned WID = getWarpId();
    void *&FrameP = DataSharingState.FramePtr[WID];
@@ -456,6 +462,12 @@ EXTERN void* __kmpc_data_sharing_push_stack(size_t DataSize,
  // reclaim all outstanding global memory slots since it is
  // likely we have reached the end of the kernel.
  EXTERN void __kmpc_data_sharing_pop_stack(void *FrameStart) {
+  if (isRuntimeUninitialized()) {
+    ASSERT0(LT_FUSSY, isSPMDMode(),
+            "Expected SPMD mode with uninitialized runtime.");
+    return omptarget_nvptx_SimpleThreadPrivateContext::Deallocate(FrameStart);
+  }
+
    if (IsWarpMasterActiveThread()) {
      unsigned WID = getWarpId();
  
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu

index fcecaf3..0e13ce0 100644 (file)
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu
@@ -38,6 +38,8 @@ __device__ __shared__
  __device__ __shared__ omptarget_nvptx_SimpleThreadPrivateContext
      *omptarget_nvptx_simpleThreadPrivateContext;
  
+__device__ __shared__ void *omptarget_nvptx_simpleGlobalData;
+
  ////////////////////////////////////////////////////////////////////////////////
  // The team master sets the outlined parallel function in this variable to
  // communicate with the workers.  Since it is in shared memory, there is one
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu

index fb28c4c..f23679c 100644 (file)
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
@@ -25,13 +25,23 @@ extern __device__ omptarget_nvptx_Queue<
      omptarget_nvptx_SimpleThreadPrivateContext, OMP_STATE_COUNT>
      omptarget_nvptx_device_simpleState[MAX_SM];
  
+extern __device__ __shared__ void *omptarget_nvptx_simpleGlobalData;
+
  ////////////////////////////////////////////////////////////////////////////////
  // init entry points
  ////////////////////////////////////////////////////////////////////////////////
  
+INLINE unsigned nsmid() {
+  unsigned n;
+  asm("mov.u32 %0, %%nsmid;" : "=r"(n));
+  return n;
+}
+
  INLINE unsigned smid() {
    unsigned id;
    asm("mov.u32 %0, %%smid;" : "=r"(id));
+  ASSERT0(LT_FUSSY, nsmid() <= MAX_SM,
+          "Expected number of SMs is less than reported.");
    return id;
  }
  
@@ -108,6 +118,10 @@ EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime,
        int slot = smid() % MAX_SM;
        omptarget_nvptx_simpleThreadPrivateContext =
            omptarget_nvptx_device_simpleState[slot].Dequeue();
+      // Reuse the memory allocated for the full runtime as the preallocated
+      // global memory buffer for the lightweight runtime.
+      omptarget_nvptx_simpleGlobalData =
+          omptarget_nvptx_device_State[slot].Dequeue();
      }
      __syncthreads();
      omptarget_nvptx_simpleThreadPrivateContext->Init();
@@ -177,6 +191,10 @@ EXTERN void __kmpc_spmd_kernel_deinit() {
        int slot = smid() % MAX_SM;
        omptarget_nvptx_device_simpleState[slot].Enqueue(
            omptarget_nvptx_simpleThreadPrivateContext);
+      // Enqueue global memory back.
+      omptarget_nvptx_device_State[slot].Enqueue(
+          reinterpret_cast<omptarget_nvptx_ThreadPrivateContext *>(
+              omptarget_nvptx_simpleGlobalData));
      }
      return;
    }
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h

index f6e35a4..2d7f10d 100644 (file)
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
@@ -113,6 +113,8 @@ enum DATA_SHARING_SIZES {
    DS_Worker_Warp_Slot_Size = WARPSIZE * DS_Slot_Size,
    // The maximum number of warps in use
    DS_Max_Warp_Number = 32,
+  // The size of the preallocated shared memory buffer per team
+  DS_Shared_Memory_Size = 128,
  };
  
  // Data structure to keep in shared memory that traces the current slot, stack,
@@ -386,12 +388,15 @@ struct omptarget_device_environmentTy {
  
  class omptarget_nvptx_SimpleThreadPrivateContext {
    uint16_t par_level[MAX_THREADS_PER_TEAM];
+
  public:
    INLINE void Init() {
      ASSERT0(LT_FUSSY, isSPMDMode() && isRuntimeUninitialized(),
              "Expected SPMD + uninitialized runtime modes.");
      par_level[GetThreadIdInBlock()] = 0;
    }
+  static INLINE void *Allocate(size_t DataSize);
+  static INLINE void Deallocate(void *Ptr);
    INLINE void IncParLevel() {
      ASSERT0(LT_FUSSY, isSPMDMode() && isRuntimeUninitialized(),
              "Expected SPMD + uninitialized runtime modes.");
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h

index 1cca820..3e072b6 100644 (file)
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h
@@ -202,3 +202,36 @@ INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(int threadId) {
  INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor() {
    return getMyTopTaskDescriptor(GetLogicalThreadIdInBlock());
  }
+
+////////////////////////////////////////////////////////////////////////////////
+// Lightweight runtime functions.
+////////////////////////////////////////////////////////////////////////////////
+
+// Shared memory buffer for globalization support.
+static __align__(16) __device__ __shared__ char
+    omptarget_static_buffer[DS_Shared_Memory_Size];
+static __device__ __shared__ void *omptarget_spmd_allocated;
+
+extern __device__ __shared__ void *omptarget_nvptx_simpleGlobalData;
+
+INLINE void *
+omptarget_nvptx_SimpleThreadPrivateContext::Allocate(size_t DataSize) {
+  if (DataSize <= DS_Shared_Memory_Size)
+    return ::omptarget_static_buffer;
+  if (DataSize <= sizeof(omptarget_nvptx_ThreadPrivateContext))
+    return ::omptarget_nvptx_simpleGlobalData;
+  if (threadIdx.x == 0)
+    omptarget_spmd_allocated = SafeMalloc(DataSize, "SPMD teams alloc");
+  __syncthreads();
+  return omptarget_spmd_allocated;
+}
+
+INLINE void
+omptarget_nvptx_SimpleThreadPrivateContext::Deallocate(void *Ptr) {
+  if (Ptr != ::omptarget_static_buffer &&
+      Ptr != ::omptarget_nvptx_simpleGlobalData) {
+    __syncthreads();
+    if (threadIdx.x == 0)
+      SafeFree(Ptr, "SPMD teams dealloc");
+  }
+}
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/option.h b/openmp/libomptarget/deviceRTLs/nvptx/src/option.h

index 791d6f3..37ab818 100644 (file)
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/option.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/option.h
@@ -34,7 +34,10 @@
  
  // Maximum number of omp state objects per SM allocated statically in global
  // memory.
-#if __CUDA_ARCH__ >= 600
+#if __CUDA_ARCH__ >= 700
+#define OMP_STATE_COUNT 32
+#define MAX_SM 84
+#elif __CUDA_ARCH__ >= 600
  #define OMP_STATE_COUNT 32
  #define MAX_SM 56
  #else
author	Alexey Bataev <a.bataev@hotmail.com>
	Fri, 21 Sep 2018 14:11:41 +0000 (14:11 +0000)
committer	Alexey Bataev <a.bataev@hotmail.com>
	Fri, 21 Sep 2018 14:11:41 +0000 (14:11 +0000)
openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu		patch \| blob \| history
openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu		patch \| blob \| history
openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu		patch \| blob \| history
openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h		patch \| blob \| history
openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h		patch \| blob \| history
openmp/libomptarget/deviceRTLs/nvptx/src/option.h		patch \| blob \| history