* ``LIBOMPTARGET_INFO=<Num>``
* ``LIBOMPTARGET_HEAP_SIZE=<Num>``
* ``LIBOMPTARGET_STACK_SIZE=<Num>``
+ * ``LIBOMPTARGET_SHARED_MEMORY_SIZE=<Num>``
LIBOMPTARGET_DEBUG
""""""""""""""""""
for some applications that allocate too much memory either through the user or
globalization.
+LIBOMPTARGET_SHARED_MEMORY_SIZE
+"""""""""""""""""""""""""""""""
+
+This environment variable sets the amount of dynamic shared memory in bytes used
+by the kernel once it is launched. A pointer to the dynamic memory buffer can
+currently only be accessed using the ``__kmpc_get_dynamic_shared`` device
+runtime call.
+
.. toctree::
:hidden:
:maxdepth: 1
/// Return the user-chosen debug level.
uint32_t getDebugLevel();
+/// Return the amount of dynamic shared memory that was allocated at launch.
+uint64_t getDynamicMemorySize();
+
bool isDebugMode(DebugLevel Level);
} // namespace config
/// allocated by __kmpc_alloc_shared by the same thread.
void __kmpc_free_shared(void *Ptr, uint64_t Bytes);
+/// Get a pointer to the memory buffer containing dynamically allocated shared
+/// memory configured at launch.
+void *__kmpc_get_dynamic_shared();
+
/// Allocate sufficient space for \p NumArgs sequential `void*` and store the
/// allocation address in \p GlobalArgs.
///
/// Alloca \p Size bytes in global memory, if possible, for \p Reason.
void *allocGlobal(uint64_t Size, const char *Reason);
+/// Return a pointer to the dynamic shared memory buffer.
+void *getDynamicBuffer();
+
/// Free \p Ptr, allocated via allocGlobal, for \p Reason.
void freeGlobal(void *Ptr, const char *Reason);
uint32_t DebugLevel;
uint32_t NumDevices;
uint32_t DeviceNum;
+ uint64_t DynamicMemSize;
};
#pragma omp declare target
return omptarget_device_environment.DeviceNum;
}
+// Return the dynamic shared-memory size (bytes) that the host plugin stored
+// in the device environment's DynamicMemSize field at launch time.
+uint64_t config::getDynamicMemorySize() {
+  return omptarget_device_environment.DynamicMemSize;
+}
+
// Return true when the configured debug level is strictly greater than
// \p Level. NOTE(review): strict '>' (not '>=') — confirm this matches the
// intended DebugLevel threshold semantics.
bool config::isDebugMode(config::DebugLevel Level) {
  return config::getDebugLevel() > Level;
}
///
///{
+/// Add worst-case padding so that future allocations are properly aligned.
+constexpr const uint32_t Alignment = 8;
+
+/// External symbol to access dynamic shared memory.
+extern unsigned char DynamicSharedBuffer[] __attribute__((aligned(Alignment)));
+#pragma omp allocate(DynamicSharedBuffer) allocator(omp_pteam_mem_alloc)
+
namespace {
/// Fallback implementations are missing to trigger a link time error.
#pragma omp end declare variant
///}
-/// Add worst-case padding so that future allocations are properly aligned.
-constexpr const uint32_t Alignment = 8;
-
/// A "smart" stack in shared memory.
///
/// The stack exposes a malloc/free interface but works like a stack internally.
} // namespace
+// Expose the launch-time dynamic shared-memory buffer (the extern
+// DynamicSharedBuffer symbol) to the rest of the device runtime.
+void *memory::getDynamicBuffer() { return DynamicSharedBuffer; }
+
// Allocate \p Bytes from the shared-memory smart stack. \p Reason is accepted
// for interface symmetry with the other allocators but is unused here.
void *memory::allocShared(uint64_t Bytes, const char *Reason) {
  return SharedMemorySmartStack.push(Bytes);
}
memory::freeShared(Ptr, Bytes, "Frontend free shared");
}
+// Device-runtime entry point: return the dynamic shared-memory buffer set up
+// at launch. noinline presumably keeps a stable, callable symbol for compiled
+// user code — confirm against the runtime's calling convention.
+__attribute__((noinline)) void *__kmpc_get_dynamic_shared() {
+  return memory::getDynamicBuffer();
+}
+
/// Allocate storage in shared memory to communicate arguments from the main
/// thread to the workers in generic mode. If we exceed
/// NUM_SHARED_VARIABLES_IN_SHARED_MEM we will malloc space for communication.
int32_t debug_level;
uint32_t num_devices;
uint32_t device_num;
+ uint64_t dynamic_shared_size;
};
namespace {
int EnvTeamThreadLimit;
// OpenMP requires flags
int64_t RequiresFlags;
+ // Amount of dynamic shared memory to use at launch.
+ uint64_t DynamicMemorySize;
static constexpr const int HardTeamLimit = 1U << 16U; // 64k
static constexpr const int HardThreadLimit = 1024;
DeviceRTLTy()
: NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1),
- EnvTeamThreadLimit(-1), RequiresFlags(OMP_REQ_UNDEFINED) {
+ EnvTeamThreadLimit(-1), RequiresFlags(OMP_REQ_UNDEFINED),
+ DynamicMemorySize(0) {
DP("Start initializing CUDA\n");
EnvNumTeams = std::stoi(EnvStr);
DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
}
+ if (const char *EnvStr = getenv("LIBOMPTARGET_SHARED_MEMORY_SIZE")) {
+ // LIBOMPTARGET_SHARED_MEMORY_SIZE has been set.
+ // Parse as unsigned 64-bit so sizes above INT_MAX are not truncated
+ // through int (std::stoi would narrow the uint64_t member).
+ DynamicMemorySize = std::stoull(EnvStr);
+ // Include the parsed value in the debug output; the previous format
+ // string had no conversion specifier, so the argument was never printed.
+ DP("Parsed LIBOMPTARGET_SHARED_MEMORY_SIZE=%llu\n",
+ static_cast<unsigned long long>(DynamicMemorySize));
+ }
StreamManager =
std::make_unique<StreamManagerTy>(NumberOfDevices, DeviceData);
// TODO: The device ID used here is not the real device ID used by OpenMP.
omptarget_device_environmentTy DeviceEnv{
0, static_cast<uint32_t>(NumberOfDevices),
- static_cast<uint32_t>(DeviceId)};
+ static_cast<uint32_t>(DeviceId), DynamicMemorySize};
#ifdef OMPTARGET_DEBUG
if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG"))
Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1,
/* gridDimZ */ 1, CudaThreadsPerBlock,
/* blockDimY */ 1, /* blockDimZ */ 1,
- /* sharedMemBytes */ 0, Stream, &Args[0], nullptr);
+ DynamicMemorySize, Stream, &Args[0], nullptr);
if (!checkResult(Err, "Error returned from cuLaunchKernel\n"))
return OFFLOAD_FAIL;
--- /dev/null
+// RUN: %libomptarget-compile-nvptx64-nvidia-cuda -fopenmp-target-new-runtime
+// RUN: env LIBOMPTARGET_SHARED_MEMORY_SIZE=4 \
+// RUN: %libomptarget-run-nvptx64-nvidia-cuda | %fcheck-nvptx64-nvidia-cuda
+// REQUIRES: nvptx64-nvidia-cuda
+
+#include <omp.h>
+#include <stdio.h>
+
+// Base (host/fallback) version: no dynamic shared memory is available.
+void *get_dynamic_shared() { return NULL; }
+// On nvptx64 devices, forward to the device runtime's accessor instead.
+#pragma omp begin declare variant match(device = {arch(nvptx64)})
+extern void *__kmpc_get_dynamic_shared();
+void *get_dynamic_shared() { return __kmpc_get_dynamic_shared(); }
+#pragma omp end declare variant
+
+// Thread 0 stores a flag into the dynamic shared buffer (4 bytes requested
+// via LIBOMPTARGET_SHARED_MEMORY_SIZE=4 in the RUN line); thread 1 reads it
+// back, verifying the buffer is shared across the team.
+int main() {
+  int x;
+#pragma omp target parallel map(from : x)
+  {
+    int *buf = get_dynamic_shared();
+// Ensure all threads have the buffer pointer before thread 0 writes.
+#pragma omp barrier
+    if (omp_get_thread_num() == 0)
+      *buf = 1;
+// Make thread 0's store visible before thread 1 loads it.
+#pragma omp barrier
+    if (omp_get_thread_num() == 1)
+      x = *buf;
+  }
+
+  // CHECK: PASS
+  if (x == 1)
+    printf("PASS\n");
+}