}
INLINE void* data_sharing_push_stack_common(size_t PushSize) {
- if (isRuntimeUninitialized()) {
- ASSERT0(LT_FUSSY, isSPMDMode(),
- "Expected SPMD mode with uninitialized runtime.");
- return omptarget_nvptx_SimpleThreadPrivateContext::Allocate(PushSize);
- }
+ ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime.");
// Only the active master thread of each warp manages the stack.
bool IsWarpMaster = (getThreadId() % WARPSIZE) == 0;
// reclaim all outstanding global memory slots since it is
// likely we have reached the end of the kernel.
EXTERN void __kmpc_data_sharing_pop_stack(void *FrameStart) {
- if (isRuntimeUninitialized()) {
- ASSERT0(LT_FUSSY, isSPMDMode(),
- "Expected SPMD mode with uninitialized runtime.");
- return omptarget_nvptx_SimpleThreadPrivateContext::Deallocate(FrameStart);
- }
+ ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime.");
__threadfence_block();
EXTERN void __kmpc_get_shared_variables(void ***GlobalArgs) {
*GlobalArgs = omptarget_nvptx_globalArgs.GetArgs();
}
+
+// Initialize the static memory manager, which manages statically allocated
+// global memory. This memory is allocated by the compiler and is used to
+// correctly implement globalization of variables in target, teams and
+// distribute regions.
+EXTERN void __kmpc_get_team_static_memory(const void *buf, size_t size,
+                                          int16_t is_shared,
+                                          const void **frame) {
+  if (is_shared) {
+    // The buffer itself lives in shared memory; use it directly.
+    *frame = buf;
+    return;
+  }
+  if (isSPMDMode()) {
+    if (GetThreadIdInBlock() == 0) {
+      *frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size);
+    }
+    // Make the frame written by thread 0 visible to the whole team.
+    __syncthreads();
+    return;
+  }
+  ASSERT0(LT_FUSSY, GetThreadIdInBlock() == getMasterThreadId(),
+          "Must be called only in the target master thread.");
+  *frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size);
+  // Make the acquisition visible to all threads on the device.
+  __threadfence();
+}
+
+EXTERN void __kmpc_restore_team_static_memory(int16_t is_shared) {
+  if (is_shared)
+    return;
+  if (isSPMDMode()) {
+    // Wait for all threads to be done with the frame before releasing it.
+    __syncthreads();
+    if (GetThreadIdInBlock() == 0) {
+      omptarget_nvptx_simpleMemoryManager.Release();
+    }
+    return;
+  }
+  __threadfence();
+  ASSERT0(LT_FUSSY, GetThreadIdInBlock() == getMasterThreadId(),
+          "Must be called only in the target master thread.");
+  omptarget_nvptx_simpleMemoryManager.Release();
+}
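+
+// A minimal sketch (illustration only, not emitted by this patch) of how the
+// compiler is expected to pair these entry points around a target region;
+// `static_buf` and `FRAME_SIZE` are hypothetical names:
+//
+//   __device__ char static_buf[MAX_SM * OMP_STATE_COUNT * FRAME_SIZE];
+//
+//   __global__ void target_region() {
+//     const void *frame;
+//     __kmpc_get_team_static_memory(static_buf, FRAME_SIZE, /*is_shared=*/0,
+//                                   &frame);
+//     // ... globalized variables are addressed relative to `frame` ...
+//     __kmpc_restore_team_static_memory(/*is_shared=*/0);
+//   }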
+
// SPMD execution mode interrogation function.
EXTERN int8_t __kmpc_is_spmd_exec_mode();
+
+EXTERN void __kmpc_get_team_static_memory(const void *buf, size_t size,
+                                          int16_t is_shared,
+                                          const void **frame);
+
+EXTERN void __kmpc_restore_team_static_memory(int16_t is_shared);
+
#endif
OMP_STATE_COUNT>
omptarget_nvptx_device_simpleState[MAX_SM];
+__device__ omptarget_nvptx_SimpleMemoryManager
+    omptarget_nvptx_simpleMemoryManager;
+// Index of the key this team claimed in the static memory manager.
+__device__ __shared__ uint32_t usedMemIdx;
+// Per-SM state queue slot this team dequeued its state object from.
+__device__ __shared__ uint32_t usedSlotIdx;
+
// Pointer to this team's OpenMP state object
__device__ __shared__
omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext;
__device__ __shared__ omptarget_nvptx_SimpleThreadPrivateContext
*omptarget_nvptx_simpleThreadPrivateContext;
-__device__ __shared__ void *omptarget_nvptx_simpleGlobalData;
-
////////////////////////////////////////////////////////////////////////////////
// The team master sets the outlined parallel function in this variable to
// communicate with the workers. Since it is in shared memory, there is one
omptarget_nvptx_SimpleThreadPrivateContext, OMP_STATE_COUNT>
omptarget_nvptx_device_simpleState[MAX_SM];
-extern __device__ __shared__ void *omptarget_nvptx_simpleGlobalData;
-
////////////////////////////////////////////////////////////////////////////////
// init entry points
////////////////////////////////////////////////////////////////////////////////
-INLINE unsigned nsmid() {
- unsigned n;
- asm("mov.u32 %0, %%nsmid;" : "=r"(n));
- return n;
-}
-
INLINE unsigned smid() {
unsigned id;
asm("mov.u32 %0, %%smid;" : "=r"(id));
// Get a state object from the queue.
int slot = smid() % MAX_SM;
+ usedSlotIdx = slot;
omptarget_nvptx_threadPrivateContext =
omptarget_nvptx_device_State[slot].Dequeue();
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
- omptarget_nvptx_threadPrivateContext->SetSourceQueue(slot);
-#endif
// init thread private
int threadId = GetLogicalThreadIdInBlock();
ASSERT0(LT_FUSSY, IsOMPRuntimeInitialized,
"Generic always requires initialized runtime.");
// Enqueue omp state object for use by another team.
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
- int slot = omptarget_nvptx_threadPrivateContext->GetSourceQueue();
-#else
- int slot = smid() % MAX_SM;
-#endif
+ int slot = usedSlotIdx;
omptarget_nvptx_device_State[slot].Enqueue(
omptarget_nvptx_threadPrivateContext);
// Done with work. Kill the workers.
setExecutionParameters(Spmd, RuntimeUninitialized);
if (GetThreadIdInBlock() == 0) {
int slot = smid() % MAX_SM;
+ usedSlotIdx = slot;
omptarget_nvptx_simpleThreadPrivateContext =
omptarget_nvptx_device_simpleState[slot].Dequeue();
- // Reuse the memory allocated for the full runtime as the preallocated
- // global memory buffer for the lightweight runtime.
- omptarget_nvptx_simpleGlobalData =
- omptarget_nvptx_device_State[slot].Dequeue();
}
__syncthreads();
omptarget_nvptx_simpleThreadPrivateContext->Init();
if (threadId == 0) {
// Get a state object from the queue.
int slot = smid() % MAX_SM;
+ usedSlotIdx = slot;
omptarget_nvptx_threadPrivateContext =
omptarget_nvptx_device_State[slot].Dequeue();
if (isRuntimeUninitialized()) {
if (threadId == 0) {
// Enqueue omp state object for use by another team.
- int slot = smid() % MAX_SM;
+ int slot = usedSlotIdx;
omptarget_nvptx_device_simpleState[slot].Enqueue(
omptarget_nvptx_simpleThreadPrivateContext);
- // Enqueue global memory back.
- omptarget_nvptx_device_State[slot].Enqueue(
- reinterpret_cast<omptarget_nvptx_ThreadPrivateContext *>(
- omptarget_nvptx_simpleGlobalData));
}
return;
}
if (threadId == 0) {
// Enqueue omp state object for use by another team.
- int slot = smid() % MAX_SM;
+ int slot = usedSlotIdx;
omptarget_nvptx_device_State[slot].Enqueue(
omptarget_nvptx_threadPrivateContext);
}
INLINE omptarget_nvptx_TeamDescr &TeamContext() { return teamContext; }
INLINE void InitThreadPrivateContext(int tid);
- INLINE void SetSourceQueue(uint64_t Src) { SourceQueue = Src; }
- INLINE uint64_t GetSourceQueue() { return SourceQueue; }
private:
// team context for this team
// state for dispatch with dyn/guided OR static (never use both at a time)
int64_t nextLowerBound[MAX_THREADS_PER_TEAM];
int64_t stride[MAX_THREADS_PER_TEAM];
- // Queue to which this object must be returned.
- uint64_t SourceQueue;
};
/// Device environment data
int32_t debug_level;
};
+/// Memory manager for statically allocated memory.
+class omptarget_nvptx_SimpleMemoryManager {
+private:
+  __align__(128) struct MemDataTy {
+    volatile unsigned keys[OMP_STATE_COUNT];
+  } MemData[MAX_SM];
+
+  // Masking hash; requires OMP_STATE_COUNT to be a power of two.
+  INLINE uint32_t hash(unsigned key) const {
+    return key & (OMP_STATE_COUNT - 1);
+  }
+
+public:
+  INLINE void Release();
+  INLINE const void *Acquire(const void *buf, size_t size);
+};
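+
+// Illustration only (not part of the runtime): the class above implements
+// open addressing with linear probing over a power-of-two key table. A
+// self-contained CUDA sketch of the same claiming scheme, with the
+// hypothetical `TABLE_SIZE` standing in for OMP_STATE_COUNT:
+//
+//   enum { TABLE_SIZE = 32 }; // must be a power of two for the masking hash
+//   __device__ unsigned keys[TABLE_SIZE];
+//
+//   __device__ unsigned claim_key(unsigned start) {
+//     unsigned i = start & (TABLE_SIZE - 1);
+//     while (atomicCAS(&keys[i], 0u, 1u) != 0) // key busy, probe the next one
+//       i = (i + 1) & (TABLE_SIZE - 1);
+//     return i; // caller later releases with atomicExch(&keys[i], 0)
+//   }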
+
class omptarget_nvptx_SimpleThreadPrivateContext {
uint16_t par_level[MAX_THREADS_PER_TEAM];
"Expected SPMD + uninitialized runtime modes.");
par_level[GetThreadIdInBlock()] = 0;
}
- static INLINE void *Allocate(size_t DataSize);
- static INLINE void Deallocate(void *Ptr);
INLINE void IncParLevel() {
ASSERT0(LT_FUSSY, isSPMDMode() && isRuntimeUninitialized(),
"Expected SPMD + uninitialized runtime modes.");
// global data tables
////////////////////////////////////////////////////////////////////////////////
+extern __device__ omptarget_nvptx_SimpleMemoryManager
+    omptarget_nvptx_simpleMemoryManager;
+extern __device__ __shared__ uint32_t usedMemIdx;
+extern __device__ __shared__ uint32_t usedSlotIdx;
extern __device__ __shared__
omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext;
extern __device__ __shared__ omptarget_nvptx_SimpleThreadPrivateContext
}
////////////////////////////////////////////////////////////////////////////////
-// Lightweight runtime functions.
+// Memory management runtime functions.
////////////////////////////////////////////////////////////////////////////////
-// Shared memory buffer for globalization support.
-static __align__(16) __device__ __shared__ char
- omptarget_static_buffer[DS_Shared_Memory_Size];
-static __device__ __shared__ void *omptarget_spmd_allocated;
-
-extern __device__ __shared__ void *omptarget_nvptx_simpleGlobalData;
-
-INLINE void *
-omptarget_nvptx_SimpleThreadPrivateContext::Allocate(size_t DataSize) {
- if (DataSize <= DS_Shared_Memory_Size)
- return ::omptarget_static_buffer;
- if (DataSize <= sizeof(omptarget_nvptx_ThreadPrivateContext))
- return ::omptarget_nvptx_simpleGlobalData;
- if (threadIdx.x == 0)
- omptarget_spmd_allocated = SafeMalloc(DataSize, "SPMD teams alloc");
- __syncthreads();
- return omptarget_spmd_allocated;
-}
-
-INLINE void
-omptarget_nvptx_SimpleThreadPrivateContext::Deallocate(void *Ptr) {
- if (Ptr != ::omptarget_static_buffer &&
- Ptr != ::omptarget_nvptx_simpleGlobalData) {
- __syncthreads();
- if (threadIdx.x == 0)
- SafeFree(Ptr, "SPMD teams dealloc");
+INLINE void omptarget_nvptx_SimpleMemoryManager::Release() {
+  ASSERT0(LT_FUSSY, usedSlotIdx < MAX_SM,
+          "SlotIdx is too big or uninitialized.");
+  ASSERT0(LT_FUSSY, usedMemIdx < OMP_STATE_COUNT,
+          "MemIdx is too big or uninitialized.");
+  MemDataTy &MD = MemData[usedSlotIdx];
+  // Mark the claimed key as free again.
+  atomicExch((unsigned *)&MD.keys[usedMemIdx], 0);
+}
+
+INLINE const void *omptarget_nvptx_SimpleMemoryManager::Acquire(const void *buf,
+                                                                size_t size) {
+  ASSERT0(LT_FUSSY, usedSlotIdx < MAX_SM,
+          "SlotIdx is too big or uninitialized.");
+  const unsigned sm = usedSlotIdx;
+  MemDataTy &MD = MemData[sm];
+  // Probe linearly from this block's hash until a CAS claims a free key.
+  unsigned i = hash(GetBlockIdInKernel());
+  while (atomicCAS((unsigned *)&MD.keys[i], 0, 1) != 0) {
+    i = hash(i + 1);
}
+  usedSlotIdx = sm;
+  usedMemIdx = i;
+  return static_cast<const char *>(buf) + (sm * OMP_STATE_COUNT + i) * size;
}
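+
+// Worked example (numbers hypothetical): with OMP_STATE_COUNT == 32 and
+// size == 1024, a team whose state queue slot is sm == 3 and which claims
+// key i == 5 gets buf + (3 * 32 + 5) * 1024 == buf + 103424. Every (sm, key)
+// pair thus maps to a disjoint chunk, so the compiler-allocated buffer passed
+// to __kmpc_get_team_static_memory must hold at least
+// MAX_SM * OMP_STATE_COUNT * size bytes.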