}
INLINE void* data_sharing_push_stack_common(size_t PushSize) {
- if (isRuntimeUninitialized()) {
- ASSERT0(LT_FUSSY, isSPMDMode(),
- "Expected SPMD mode with uninitialized runtime.");
- return omptarget_nvptx_SimpleThreadPrivateContext::Allocate(PushSize);
- }
+ ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime.");
// Only the active master thread of each warp manages the stack.
bool IsWarpMaster = (getThreadId() % WARPSIZE) == 0;
// reclaim all outstanding global memory slots since it is
// likely we have reached the end of the kernel.
EXTERN void __kmpc_data_sharing_pop_stack(void *FrameStart) {
- if (isRuntimeUninitialized()) {
- ASSERT0(LT_FUSSY, isSPMDMode(),
- "Expected SPMD mode with uninitialized runtime.");
- return omptarget_nvptx_SimpleThreadPrivateContext::Deallocate(FrameStart);
- }
+ ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime.");
__threadfence_block();
EXTERN void __kmpc_get_shared_variables(void ***GlobalArgs) {
*GlobalArgs = omptarget_nvptx_globalArgs.GetArgs();
}
+
+// Initialize the static memory manager, which manages statically allocated
+// global memory. This memory is allocated by the compiler and is used to
+// correctly implement globalization of variables in target, teams and
+// distribute regions.
+EXTERN void __kmpc_get_team_static_memory(const void *buf, size_t size,
+                                          int16_t is_shared,
+                                          const void **frame) {
+  if (is_shared) {
+    // The buffer itself lives in shared memory; use it directly.
+    *frame = buf;
+    return;
+  }
+  if (isSPMDMode()) {
+    if (GetThreadIdInBlock() == 0) {
+      *frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size);
+    }
+    // Make the frame written by thread 0 visible to the whole team.
+    __syncthreads();
+    return;
+  }
+  ASSERT0(LT_FUSSY, GetThreadIdInBlock() == getMasterThreadId(),
+          "Must be called only in the target master thread.");
+  *frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size);
+  // Make the acquisition visible to all threads on the device.
+  __threadfence();
+}
+
+EXTERN void __kmpc_restore_team_static_memory(int16_t is_shared) {
+  if (is_shared)
+    return;
+  if (isSPMDMode()) {
+    // Wait for all threads to be done with the frame before releasing it.
+    __syncthreads();
+    if (GetThreadIdInBlock() == 0) {
+      omptarget_nvptx_simpleMemoryManager.Release();
+    }
+    return;
+  }
+  __threadfence();
+  ASSERT0(LT_FUSSY, GetThreadIdInBlock() == getMasterThreadId(),
+          "Must be called only in the target master thread.");
+  omptarget_nvptx_simpleMemoryManager.Release();
+}
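+
+// A minimal sketch (illustration only, not emitted by this patch) of how the
+// compiler is expected to pair these entry points around a target region;
+// `static_buf` and `FRAME_SIZE` are hypothetical names:
+//
+//   __device__ char static_buf[MAX_SM * OMP_STATE_COUNT * FRAME_SIZE];
+//
+//   __global__ void target_region() {
+//     const void *frame;
+//     __kmpc_get_team_static_memory(static_buf, FRAME_SIZE, /*is_shared=*/0,
+//                                   &frame);
+//     // ... globalized variables are addressed relative to `frame` ...
+//     __kmpc_restore_team_static_memory(/*is_shared=*/0);
+//   }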
+
// SPMD execution mode interrogation function.
EXTERN int8_t __kmpc_is_spmd_exec_mode();
+
+EXTERN void __kmpc_get_team_static_memory(const void *buf, size_t size,
+                                          int16_t is_shared,
+                                          const void **frame);
+
+EXTERN void __kmpc_restore_team_static_memory(int16_t is_shared);
+
#endif
OMP_STATE_COUNT>
omptarget_nvptx_device_simpleState[MAX_SM];
+__device__ omptarget_nvptx_SimpleMemoryManager
+    omptarget_nvptx_simpleMemoryManager;
+// Index of the key this team claimed in the static memory manager.
+__device__ __shared__ uint32_t usedMemIdx;
+// Per-SM state queue slot this team dequeued its state object from.
+__device__ __shared__ uint32_t usedSlotIdx;
+
// Pointer to this team's OpenMP state object
__device__ __shared__
omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext;
__device__ __shared__ omptarget_nvptx_SimpleThreadPrivateContext
*omptarget_nvptx_simpleThreadPrivateContext;
-__device__ __shared__ void *omptarget_nvptx_simpleGlobalData;
-
////////////////////////////////////////////////////////////////////////////////
// The team master sets the outlined parallel function in this variable to
// communicate with the workers. Since it is in shared memory, there is one
omptarget_nvptx_SimpleThreadPrivateContext, OMP_STATE_COUNT>
omptarget_nvptx_device_simpleState[MAX_SM];
-extern __device__ __shared__ void *omptarget_nvptx_simpleGlobalData;
-
////////////////////////////////////////////////////////////////////////////////
// init entry points
////////////////////////////////////////////////////////////////////////////////
-INLINE unsigned nsmid() {
- unsigned n;
- asm("mov.u32 %0, %%nsmid;" : "=r"(n));
- return n;
-}
-
INLINE unsigned smid() {
unsigned id;
asm("mov.u32 %0, %%smid;" : "=r"(id));
// Get a state object from the queue.
int slot = smid() % MAX_SM;
+ usedSlotIdx = slot;
omptarget_nvptx_threadPrivateContext =
omptarget_nvptx_device_State[slot].Dequeue();
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
- omptarget_nvptx_threadPrivateContext->SetSourceQueue(slot);
-#endif
// init thread private
int threadId = GetLogicalThreadIdInBlock();
ASSERT0(LT_FUSSY, IsOMPRuntimeInitialized,
"Generic always requires initialized runtime.");
// Enqueue omp state object for use by another team.
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
- int slot = omptarget_nvptx_threadPrivateContext->GetSourceQueue();
-#else
- int slot = smid() % MAX_SM;
-#endif
+ int slot = usedSlotIdx;
omptarget_nvptx_device_State[slot].Enqueue(
omptarget_nvptx_threadPrivateContext);
// Done with work. Kill the workers.
setExecutionParameters(Spmd, RuntimeUninitialized);
if (GetThreadIdInBlock() == 0) {
int slot = smid() % MAX_SM;
+ usedSlotIdx = slot;
omptarget_nvptx_simpleThreadPrivateContext =
omptarget_nvptx_device_simpleState[slot].Dequeue();
- // Reuse the memory allocated for the full runtime as the preallocated
- // global memory buffer for the lightweight runtime.
- omptarget_nvptx_simpleGlobalData =
- omptarget_nvptx_device_State[slot].Dequeue();
}
__syncthreads();
omptarget_nvptx_simpleThreadPrivateContext->Init();
if (threadId == 0) {
// Get a state object from the queue.
int slot = smid() % MAX_SM;
+ usedSlotIdx = slot;
omptarget_nvptx_threadPrivateContext =
omptarget_nvptx_device_State[slot].Dequeue();
if (isRuntimeUninitialized()) {
if (threadId == 0) {
// Enqueue omp state object for use by another team.
- int slot = smid() % MAX_SM;
+ int slot = usedSlotIdx;
omptarget_nvptx_device_simpleState[slot].Enqueue(
omptarget_nvptx_simpleThreadPrivateContext);
- // Enqueue global memory back.
- omptarget_nvptx_device_State[slot].Enqueue(
- reinterpret_cast<omptarget_nvptx_ThreadPrivateContext *>(
- omptarget_nvptx_simpleGlobalData));
}
return;
}
if (threadId == 0) {
// Enqueue omp state object for use by another team.
- int slot = smid() % MAX_SM;
+ int slot = usedSlotIdx;
omptarget_nvptx_device_State[slot].Enqueue(
omptarget_nvptx_threadPrivateContext);
}
INLINE omptarget_nvptx_TeamDescr &TeamContext() { return teamContext; }
INLINE void InitThreadPrivateContext(int tid);
- INLINE void SetSourceQueue(uint64_t Src) { SourceQueue = Src; }
- INLINE uint64_t GetSourceQueue() { return SourceQueue; }
private:
// team context for this team
// state for dispatch with dyn/guided OR static (never use both at a time)
int64_t nextLowerBound[MAX_THREADS_PER_TEAM];
int64_t stride[MAX_THREADS_PER_TEAM];
- // Queue to which this object must be returned.
- uint64_t SourceQueue;
};
/// Device environment data
int32_t debug_level;
};
+/// Memory manager for statically allocated memory.
+class omptarget_nvptx_SimpleMemoryManager {
+private:
+  __align__(128) struct MemDataTy {
+    volatile unsigned keys[OMP_STATE_COUNT];
+  } MemData[MAX_SM];
+
+  // Masking hash; requires OMP_STATE_COUNT to be a power of two.
+  INLINE uint32_t hash(unsigned key) const {
+    return key & (OMP_STATE_COUNT - 1);
+  }
+
+public:
+  INLINE void Release();
+  INLINE const void *Acquire(const void *buf, size_t size);
+};
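+
+// Illustration only (not part of the runtime): the class above implements
+// open addressing with linear probing over a power-of-two key table. A
+// self-contained CUDA sketch of the same claiming scheme, with the
+// hypothetical `TABLE_SIZE` standing in for OMP_STATE_COUNT:
+//
+//   enum { TABLE_SIZE = 32 }; // must be a power of two for the masking hash
+//   __device__ unsigned keys[TABLE_SIZE];
+//
+//   __device__ unsigned claim_key(unsigned start) {
+//     unsigned i = start & (TABLE_SIZE - 1);
+//     while (atomicCAS(&keys[i], 0u, 1u) != 0) // key busy, probe the next one
+//       i = (i + 1) & (TABLE_SIZE - 1);
+//     return i; // caller later releases with atomicExch(&keys[i], 0)
+//   }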
+
class omptarget_nvptx_SimpleThreadPrivateContext {
uint16_t par_level[MAX_THREADS_PER_TEAM];
"Expected SPMD + uninitialized runtime modes.");
par_level[GetThreadIdInBlock()] = 0;
}
- static INLINE void *Allocate(size_t DataSize);
- static INLINE void Deallocate(void *Ptr);
INLINE void IncParLevel() {
ASSERT0(LT_FUSSY, isSPMDMode() && isRuntimeUninitialized(),
"Expected SPMD + uninitialized runtime modes.");
// global data tables
////////////////////////////////////////////////////////////////////////////////
+extern __device__ omptarget_nvptx_SimpleMemoryManager
+    omptarget_nvptx_simpleMemoryManager;
+extern __device__ __shared__ uint32_t usedMemIdx;
+extern __device__ __shared__ uint32_t usedSlotIdx;
extern __device__ __shared__
omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext;
extern __device__ __shared__ omptarget_nvptx_SimpleThreadPrivateContext
}
////////////////////////////////////////////////////////////////////////////////
-// Lightweight runtime functions.
+// Memory management runtime functions.
////////////////////////////////////////////////////////////////////////////////
-// Shared memory buffer for globalization support.
-static __align__(16) __device__ __shared__ char
- omptarget_static_buffer[DS_Shared_Memory_Size];
-static __device__ __shared__ void *omptarget_spmd_allocated;
-
-extern __device__ __shared__ void *omptarget_nvptx_simpleGlobalData;
-
-INLINE void *
-omptarget_nvptx_SimpleThreadPrivateContext::Allocate(size_t DataSize) {
- if (DataSize <= DS_Shared_Memory_Size)
- return ::omptarget_static_buffer;
- if (DataSize <= sizeof(omptarget_nvptx_ThreadPrivateContext))
- return ::omptarget_nvptx_simpleGlobalData;
- if (threadIdx.x == 0)
- omptarget_spmd_allocated = SafeMalloc(DataSize, "SPMD teams alloc");
- __syncthreads();
- return omptarget_spmd_allocated;
-}
-
-INLINE void
-omptarget_nvptx_SimpleThreadPrivateContext::Deallocate(void *Ptr) {
- if (Ptr != ::omptarget_static_buffer &&
- Ptr != ::omptarget_nvptx_simpleGlobalData) {
- __syncthreads();
- if (threadIdx.x == 0)
- SafeFree(Ptr, "SPMD teams dealloc");
+INLINE void omptarget_nvptx_SimpleMemoryManager::Release() {
+  ASSERT0(LT_FUSSY, usedSlotIdx < MAX_SM,
+          "SlotIdx is too big or uninitialized.");
+  ASSERT0(LT_FUSSY, usedMemIdx < OMP_STATE_COUNT,
+          "MemIdx is too big or uninitialized.");
+  MemDataTy &MD = MemData[usedSlotIdx];
+  // Mark the claimed key as free again.
+  atomicExch((unsigned *)&MD.keys[usedMemIdx], 0);
+}
+
+INLINE const void *omptarget_nvptx_SimpleMemoryManager::Acquire(const void *buf,
+                                                                size_t size) {
+  ASSERT0(LT_FUSSY, usedSlotIdx < MAX_SM,
+          "SlotIdx is too big or uninitialized.");
+  const unsigned sm = usedSlotIdx;
+  MemDataTy &MD = MemData[sm];
+  // Probe linearly from this block's hash until a CAS claims a free key.
+  unsigned i = hash(GetBlockIdInKernel());
+  while (atomicCAS((unsigned *)&MD.keys[i], 0, 1) != 0) {
+    i = hash(i + 1);
}
+  usedSlotIdx = sm;
+  usedMemIdx = i;
+  return static_cast<const char *>(buf) + (sm * OMP_STATE_COUNT + i) * size;
}
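+
+// Worked example (numbers hypothetical): with OMP_STATE_COUNT == 32 and
+// size == 1024, a team whose state queue slot is sm == 3 and which claims
+// key i == 5 gets buf + (3 * 32 + 5) * 1024 == buf + 103424. Every (sm, key)
+// pair thus maps to a disjoint chunk, so the compiler-allocated buffer passed
+// to __kmpc_get_team_static_memory must hold at least
+// MAX_SM * OMP_STATE_COUNT * size bytes.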