* ``LIBOMPTARGET_INFO=<Num>``
* ``LIBOMPTARGET_HEAP_SIZE=<Num>``
* ``LIBOMPTARGET_STACK_SIZE=<Num>``
+ * ``LIBOMPTARGET_SHARED_MEMORY_SIZE=<Num>``
LIBOMPTARGET_DEBUG
""""""""""""""""""
for some applications that allocate too much memory either through the user or
globalization.
+LIBOMPTARGET_SHARED_MEMORY_SIZE
+"""""""""""""""""""""""""""""""
+
+This environment variable sets the amount of dynamic shared memory in bytes used
+by the kernel once it is launched. A pointer to the dynamic memory buffer can
+currently only be accessed using the ``__kmpc_get_dynamic_shared`` device
+runtime call.
+
.. toctree::
:hidden:
:maxdepth: 1
/// Return the user-chosen debug level.
uint32_t getDebugLevel();
+/// Return the amount of dynamic shared memory that was allocated at launch.
+uint64_t getDynamicMemorySize();
+
bool isDebugMode(DebugLevel Level);
} // namespace config
/// allocated by __kmpc_alloc_shared by the same thread.
void __kmpc_free_shared(void *Ptr, uint64_t Bytes);
+/// Get a pointer to the memory buffer containing dynamically allocated shared
+/// memory configured at launch.
+void *__kmpc_get_dynamic_shared();
+
/// Allocate sufficient space for \p NumArgs sequential `void*` and store the
/// allocation address in \p GlobalArgs.
///
/// Alloca \p Size bytes in global memory, if possible, for \p Reason.
void *allocGlobal(uint64_t Size, const char *Reason);
+/// Return a pointer to the dynamic shared memory buffer.
+void *getDynamicBuffer();
+
/// Free \p Ptr, allocated via allocGlobal, for \p Reason.
void freeGlobal(void *Ptr, const char *Reason);
uint32_t DebugLevel;
uint32_t NumDevices;
uint32_t DeviceNum;
+ uint64_t DynamicMemSize;
};
#pragma omp declare target
return omptarget_device_environment.DeviceNum;
}
+// Return the dynamic shared-memory size (bytes) that the host plugin stored
+// in the device environment's DynamicMemSize field at launch time.
+uint64_t config::getDynamicMemorySize() {
+  return omptarget_device_environment.DynamicMemSize;
+}
+
// Return true when the configured debug level is strictly greater than
// \p Level. NOTE(review): strict '>' (not '>=') — confirm this matches the
// intended DebugLevel threshold semantics.
bool config::isDebugMode(config::DebugLevel Level) {
  return config::getDebugLevel() > Level;
}
///
///{
+/// Add worst-case padding so that future allocations are properly aligned.
+constexpr const uint32_t Alignment = 8;
+
+/// External symbol to access dynamic shared memory.
+extern unsigned char DynamicSharedBuffer[] __attribute__((aligned(Alignment)));
+#pragma omp allocate(DynamicSharedBuffer) allocator(omp_pteam_mem_alloc)
+
namespace {
/// Fallback implementations are missing to trigger a link time error.
#pragma omp end declare variant
///}
-/// Add worst-case padding so that future allocations are properly aligned.
-constexpr const uint32_t Alignment = 8;
-
/// A "smart" stack in shared memory.
///
/// The stack exposes a malloc/free interface but works like a stack internally.
} // namespace
+// Expose the launch-time dynamic shared-memory buffer (the extern
+// DynamicSharedBuffer symbol) to the rest of the device runtime.
+void *memory::getDynamicBuffer() { return DynamicSharedBuffer; }
+
// Allocate \p Bytes from the shared-memory smart stack. \p Reason is accepted
// for interface symmetry with the other allocators but is unused here.
void *memory::allocShared(uint64_t Bytes, const char *Reason) {
  return SharedMemorySmartStack.push(Bytes);
}
memory::freeShared(Ptr, Bytes, "Frontend free shared");
}
+// Device-runtime entry point: return the dynamic shared-memory buffer set up
+// at launch. noinline presumably keeps a stable, callable symbol for compiled
+// user code — confirm against the runtime's calling convention.
+__attribute__((noinline)) void *__kmpc_get_dynamic_shared() {
+  return memory::getDynamicBuffer();
+}
+
/// Allocate storage in shared memory to communicate arguments from the main
/// thread to the workers in generic mode. If we exceed
/// NUM_SHARED_VARIABLES_IN_SHARED_MEM we will malloc space for communication.
int32_t debug_level;
uint32_t num_devices;
uint32_t device_num;
+ uint64_t dynamic_shared_size;
};
namespace {
int EnvTeamThreadLimit;
// OpenMP requires flags
int64_t RequiresFlags;
+ // Amount of dynamic shared memory to use at launch.
+ uint64_t DynamicMemorySize;
static constexpr const int HardTeamLimit = 1U << 16U; // 64k
static constexpr const int HardThreadLimit = 1024;
DeviceRTLTy()
: NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1),
- EnvTeamThreadLimit(-1), RequiresFlags(OMP_REQ_UNDEFINED) {
+ EnvTeamThreadLimit(-1), RequiresFlags(OMP_REQ_UNDEFINED),
+ DynamicMemorySize(0) {
DP("Start initializing CUDA\n");
EnvNumTeams = std::stoi(EnvStr);
DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
}
+ if (const char *EnvStr = getenv("LIBOMPTARGET_SHARED_MEMORY_SIZE")) {
+ // LIBOMPTARGET_SHARED_MEMORY_SIZE has been set.
+ // Parse as unsigned 64-bit so sizes above INT_MAX are not truncated
+ // through int (std::stoi would narrow the uint64_t member).
+ DynamicMemorySize = std::stoull(EnvStr);
+ // Include the parsed value in the debug output; the previous format
+ // string had no conversion specifier, so the argument was never printed.
+ DP("Parsed LIBOMPTARGET_SHARED_MEMORY_SIZE=%llu\n",
+ static_cast<unsigned long long>(DynamicMemorySize));
+ }
StreamManager =
std::make_unique<StreamManagerTy>(NumberOfDevices, DeviceData);
// TODO: The device ID used here is not the real device ID used by OpenMP.
omptarget_device_environmentTy DeviceEnv{
0, static_cast<uint32_t>(NumberOfDevices),
- static_cast<uint32_t>(DeviceId)};
+ static_cast<uint32_t>(DeviceId), DynamicMemorySize};
#ifdef OMPTARGET_DEBUG
if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG"))
Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1,
/* gridDimZ */ 1, CudaThreadsPerBlock,
/* blockDimY */ 1, /* blockDimZ */ 1,
- /* sharedMemBytes */ 0, Stream, &Args[0], nullptr);
+ DynamicMemorySize, Stream, &Args[0], nullptr);
if (!checkResult(Err, "Error returned from cuLaunchKernel\n"))
return OFFLOAD_FAIL;
--- /dev/null
+// RUN: %libomptarget-compile-nvptx64-nvidia-cuda -fopenmp-target-new-runtime
+// RUN: env LIBOMPTARGET_SHARED_MEMORY_SIZE=4 \
+// RUN: %libomptarget-run-nvptx64-nvidia-cuda | %fcheck-nvptx64-nvidia-cuda
+// REQUIRES: nvptx64-nvidia-cuda
+
+#include <omp.h>
+#include <stdio.h>
+
+// Base (host/fallback) version: no dynamic shared memory is available.
+void *get_dynamic_shared() { return NULL; }
+// On nvptx64 devices, forward to the device runtime's accessor instead.
+#pragma omp begin declare variant match(device = {arch(nvptx64)})
+extern void *__kmpc_get_dynamic_shared();
+void *get_dynamic_shared() { return __kmpc_get_dynamic_shared(); }
+#pragma omp end declare variant
+
+// Thread 0 stores a flag into the dynamic shared buffer (4 bytes requested
+// via LIBOMPTARGET_SHARED_MEMORY_SIZE=4 in the RUN line); thread 1 reads it
+// back, verifying the buffer is shared across the team.
+int main() {
+  int x;
+#pragma omp target parallel map(from : x)
+  {
+    int *buf = get_dynamic_shared();
+// Ensure all threads have the buffer pointer before thread 0 writes.
+#pragma omp barrier
+    if (omp_get_thread_num() == 0)
+      *buf = 1;
+// Make thread 0's store visible before thread 1 loads it.
+#pragma omp barrier
+    if (omp_get_thread_num() == 1)
+      x = *buf;
+  }
+
+  // CHECK: PASS
+  if (x == 1)
+    printf("PASS\n");
+}