[OpenMP][libomptarget] Add runtime function for pushing coalesced global records

author Gheorghe-Teodor Bercea <gheorghe-teod.bercea@ibm.com>

Thu, 1 Nov 2018 18:08:12 +0000 (18:08 +0000)

committer Gheorghe-Teodor Bercea <gheorghe-teod.bercea@ibm.com>

Thu, 1 Nov 2018 18:08:12 +0000 (18:08 +0000)
author Gheorghe-Teodor Bercea <gheorghe-teod.bercea@ibm.com>
Thu, 1 Nov 2018 18:08:12 +0000 (18:08 +0000)
committer Gheorghe-Teodor Bercea <gheorghe-teod.bercea@ibm.com>
Thu, 1 Nov 2018 18:08:12 +0000 (18:08 +0000)
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu

index c7b9bdf9a9b2a93d739d72b63d655fdc4ee0262d..4db9f31a55d76765e07af96b19e60f97254982e7 100644 (file)
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
@@ -129,7 +129,7 @@ EXTERN void *__kmpc_data_sharing_environment_begin(
  
    __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
    void *&StackP = DataSharingState.StackPtr[WID];
-  void *&FrameP = DataSharingState.FramePtr[WID];
+  void * volatile &FrameP = DataSharingState.FramePtr[WID];
    int32_t &ActiveT = DataSharingState.ActiveThreads[WID];
  
    DSPRINT0(DSFLAG, "Save current slot/stack values.\n");
@@ -283,7 +283,7 @@ EXTERN void __kmpc_data_sharing_environment_end(
  
        __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
        void *&StackP = DataSharingState.StackPtr[WID];
-      void *&FrameP = DataSharingState.FramePtr[WID];
+      void * volatile &FrameP = DataSharingState.FramePtr[WID];
  
        SlotP = *SavedSharedSlot;
        StackP = *SavedSharedStack;
@@ -321,7 +321,7 @@ __kmpc_get_data_sharing_environment_frame(int32_t SourceThreadID,
  
    DSPRINT(DSFLAG, "Source  warp: %d\n", SourceWID);
  
-  void *P = DataSharingState.FramePtr[SourceWID];
+  void * volatile P = DataSharingState.FramePtr[SourceWID];
    DSPRINT0(DSFLAG, "Exiting __kmpc_get_data_sharing_environment_frame\n");
    return P;
  }
@@ -369,47 +369,31 @@ EXTERN void __kmpc_data_sharing_init_stack_spmd() {
    __threadfence_block();
  }
  
-// Called at the time of the kernel initialization. This is used to initilize
-// the list of references to shared variables and to pre-allocate global storage
-// for holding the globalized variables.
-//
-// By default the globalized variables are stored in global memory. If the
-// UseSharedMemory is set to true, the runtime will attempt to use shared memory
-// as long as the size requested fits the pre-allocated size.
-EXTERN void* __kmpc_data_sharing_push_stack(size_t DataSize,
-    int16_t UseSharedMemory) {
+INLINE void* data_sharing_push_stack_common(size_t PushSize) {
    if (isRuntimeUninitialized()) {
      ASSERT0(LT_FUSSY, isSPMDMode(),
              "Expected SPMD mode with uninitialized runtime.");
-    return omptarget_nvptx_SimpleThreadPrivateContext::Allocate(DataSize);
+    return omptarget_nvptx_SimpleThreadPrivateContext::Allocate(PushSize);
    }
  
+  // Only warp active master threads manage the stack.
+  bool IsWarpMaster = (getThreadId() % WARPSIZE) == 0;
+
    // Add worst-case padding to DataSize so that future stack allocations are
    // correctly aligned.
    const size_t Alignment = 8;
-  if (DataSize % Alignment != 0) {
-    DataSize += (Alignment - DataSize % Alignment);
-  }
+  PushSize = (PushSize + (Alignment - 1)) / Alignment * Alignment;
  
    // Frame pointer must be visible to all workers in the same warp.
    unsigned WID = getWarpId();
-  void *&FrameP = DataSharingState.FramePtr[WID];
+  void *volatile &FrameP = DataSharingState.FramePtr[WID];
  
-  // Only warp active master threads manage the stack.
-  if (getThreadId() % WARPSIZE == 0) {
+  if (IsWarpMaster) {
      // SlotP will point to either the shared memory slot or an existing
      // global memory slot.
      __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
      void *&StackP = DataSharingState.StackPtr[WID];
  
-    // Compute the total memory footprint of the requested data.
-    // The master thread requires a stack only for itself. A worker
-    // thread (which at this point is a warp master) will require
-    // space for the variables of each thread in the warp,
-    // i.e. one DataSize chunk per warp lane.
-    // TODO: change WARPSIZE to the number of active threads in the warp.
-    size_t PushSize = IsMasterThread() ? DataSize : WARPSIZE * DataSize;
-
      // Check if we have room for the data in the current slot.
      const uintptr_t StartAddress = (uintptr_t)StackP;
      const uintptr_t EndAddress = (uintptr_t)SlotP->DataEnd;
@@ -453,12 +437,39 @@ EXTERN void* __kmpc_data_sharing_push_stack(size_t DataSize,
        // Reset stack pointer to the requested address.
        StackP = (void *)RequestedEndAddress;
      }
+  } else {
+    while (!FrameP);
    }
  
-  __threadfence_block();
+  return FrameP;
+}
+
+EXTERN void* __kmpc_data_sharing_coalesced_push_stack(size_t DataSize,
+    int16_t UseSharedMemory) {
+  return data_sharing_push_stack_common(DataSize);
+}
+
+// Called at the time of the kernel initialization. This is used to initialize
+// the list of references to shared variables and to pre-allocate global storage
+// for holding the globalized variables.
+//
+// By default the globalized variables are stored in global memory. If the
+// UseSharedMemory is set to true, the runtime will attempt to use shared memory
+// as long as the size requested fits the pre-allocated size.
+EXTERN void* __kmpc_data_sharing_push_stack(size_t DataSize,
+    int16_t UseSharedMemory) {
+  // Compute the total memory footprint of the requested data.
+  // The master thread requires a stack only for itself. A worker
+  // thread (which at this point is a warp master) will require
+  // space for the variables of each thread in the warp,
+  // i.e. one DataSize chunk per warp lane.
+  // TODO: change WARPSIZE to the number of active threads in the warp.
+  size_t PushSize = (isRuntimeUninitialized() || IsMasterThread()) ?
+      DataSize : WARPSIZE * DataSize;
  
    // Compute the start address of the frame of each thread in the warp.
-  uintptr_t FrameStartAddress = (uintptr_t)FrameP;
+  uintptr_t FrameStartAddress =
+      (uintptr_t) data_sharing_push_stack_common(PushSize);
    FrameStartAddress += (uintptr_t) (getLaneId() * DataSize);
    return (void *)FrameStartAddress;
  }
@@ -475,6 +486,8 @@ EXTERN void __kmpc_data_sharing_pop_stack(void *FrameStart) {
      return omptarget_nvptx_SimpleThreadPrivateContext::Deallocate(FrameStart);
    }
  
+  __threadfence_block();
+
    if (getThreadId() % WARPSIZE == 0) {
      unsigned WID = getWarpId();
  
@@ -501,8 +514,6 @@ EXTERN void __kmpc_data_sharing_pop_stack(void *FrameStart) {
        SlotP->Next = 0;
      }
    }
-
-  __threadfence_block();
  }
  
  // Begin a data sharing context. Maintain a list of references to shared
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/interface.h b/openmp/libomptarget/deviceRTLs/nvptx/src/interface.h

index aca8fbe7e88406b5cc3ba3c26332a17ae33cd41d..bf36a5a3e6a5e081e16fadfcd950e140603e4ed8 100644 (file)
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/interface.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/interface.h
@@ -478,6 +478,8 @@ EXTERN void __kmpc_kernel_end_convergent_simd(void *buffer);
  
  EXTERN void __kmpc_data_sharing_init_stack();
  EXTERN void __kmpc_data_sharing_init_stack_spmd();
+EXTERN void *__kmpc_data_sharing_coalesced_push_stack(size_t size,
+    int16_t UseSharedMemory);
  EXTERN void *__kmpc_data_sharing_push_stack(size_t size, int16_t UseSharedMemory);
  EXTERN void __kmpc_data_sharing_pop_stack(void *a);
  EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs);
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu

index f23679ca7b03386ed503ef8023931d7fb5fe82e5..8b70faef04b0b32a2d1b3d4294d64049860777be 100644 (file)
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
@@ -40,8 +40,6 @@ INLINE unsigned nsmid() {
  INLINE unsigned smid() {
    unsigned id;
    asm("mov.u32 %0, %%smid;" : "=r"(id));
-  ASSERT0(LT_FUSSY, nsmid() <= MAX_SM,
-          "Expected number of SMs is less than reported.");
    return id;
  }
  
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h

index e0d4c1679cd29441c0757a91a516765698dee572..5b621ea5b79e1b58a396be98447f2db3e2bd5464 100644 (file)
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
@@ -123,7 +123,7 @@ enum DATA_SHARING_SIZES {
  struct DataSharingStateTy {
    __kmpc_data_sharing_slot *SlotPtr[DS_Max_Warp_Number];
    void *StackPtr[DS_Max_Warp_Number];
-  void *FramePtr[DS_Max_Warp_Number];
+  void * volatile FramePtr[DS_Max_Warp_Number];
    int32_t ActiveThreads[DS_Max_Warp_Number];
  };
  // Additional worker slot type which is initialized with the default worker slot
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h b/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h

index 9cdcc162dd48b053c25f57df6541f8dc97dbca17..c93657e45e1a0177f3b9faeaad33f570805f2f9f 100644 (file)
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h
@@ -188,7 +188,6 @@ INLINE void *SafeMalloc(size_t size, const char *msg) // check if success
  {
    void *ptr = malloc(size);
    PRINT(LD_MEM, "malloc data of size %zu for %s: 0x%llx\n", size, msg, P64(ptr));
-  ASSERT(LT_SAFETY, ptr, "failed to allocate %zu bytes for %s\n", size, msg);
    return ptr;
  }
author	Gheorghe-Teodor Bercea <gheorghe-teod.bercea@ibm.com>
	Thu, 1 Nov 2018 18:08:12 +0000 (18:08 +0000)
committer	Gheorghe-Teodor Bercea <gheorghe-teod.bercea@ibm.com>
	Thu, 1 Nov 2018 18:08:12 +0000 (18:08 +0000)
openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu		patch \| blob \| history
openmp/libomptarget/deviceRTLs/nvptx/src/interface.h		patch \| blob \| history
openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu		patch \| blob \| history
openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h		patch \| blob \| history
openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h		patch \| blob \| history