From: Gheorghe-Teodor Bercea Date: Wed, 21 Mar 2018 20:34:19 +0000 (+0000) Subject: [OpenMP][libomptarget] Enable globalization for workers X-Git-Tag: llvmorg-7.0.0-rc1~10003 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=c8d395a168f141ceaec5705899a9aa5d13422085;p=platform%2Fupstream%2Fllvm.git [OpenMP][libomptarget] Enable globalization for workers Summary: This patch allows worker to have a global memory stack managed by the runtime. This patch is needed for completeness and consistency with the globalization policy: if a worker-side variable escapes the current context it then needs to be globalized. Until now, only the master thread was allowed to have such a stack. These global values can now potentially be shared amongst workers if the semantics of the OpenMP program require it. Reviewers: ABataev, grokos, carlo.bertolli, caomhin Reviewed By: grokos Subscribers: guansong, openmp-commits Differential Revision: https://reviews.llvm.org/D44487 llvm-svn: 328144 --- diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu index e739ca9..aa97c00 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu @@ -19,6 +19,8 @@ __device__ static unsigned getNumThreads() { return blockDim.x; } __device__ static unsigned getThreadId() { return threadIdx.x; } // Warp ID in the CUDA block __device__ static unsigned getWarpId() { return threadIdx.x / WARPSIZE; } +// Lane ID in the CUDA warp. +__device__ static unsigned getLaneId() { return threadIdx.x % WARPSIZE; } // The CUDA thread ID of the master thread. __device__ static unsigned getMasterThreadId() { @@ -359,26 +361,36 @@ EXTERN void __kmpc_data_sharing_init_stack() { // Called by: master, TODO: call by workers EXTERN void* __kmpc_data_sharing_push_stack(size_t DataSize, int16_t UseSharedMemory) { - if (IsMasterThread()) { - unsigned WID = getWarpId(); + // Frame pointer must be visible to all workers in the same warp. + unsigned WID = getWarpId(); + void *&FrameP = DataSharingState.FramePtr[WID]; + // Only warp active master threads manage the stack. + if (IsWarpMasterActiveThread()) { // SlotP will point to either the shared memory slot or an existing // global memory slot. __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID]; __kmpc_data_sharing_slot *&TailSlotP = DataSharingState.TailPtr[WID]; void *&StackP = DataSharingState.StackPtr[WID]; - void *FrameP = 0; + + // Compute the total memory footprint of the requested data. + // The master thread requires a stack only for itself. A worker + // thread (which at this point is a warp master) will require + // space for the variables of each thread in the warp, + // i.e. one DataSize chunk per warp lane. + // TODO: change WARPSIZE to the number of active threads in the warp. + size_t PushSize = IsMasterThread() ? DataSize : WARPSIZE * DataSize; // Check if we have room for the data in the current slot. const uintptr_t StartAddress = (uintptr_t)StackP; const uintptr_t EndAddress = (uintptr_t)SlotP->DataEnd; - const uintptr_t RequestedEndAddress = StartAddress + (uintptr_t)DataSize; + const uintptr_t RequestedEndAddress = StartAddress + (uintptr_t)PushSize; // If we requested more data than there is room for in the rest // of the slot then we need to either re-use the next slot, if one exists, // or create a new slot. if (EndAddress < RequestedEndAddress) { - size_t NewSize = DataSize; + size_t NewSize = PushSize; // The new or reused slot for holding the data being pushed. __kmpc_data_sharing_slot *NewSlot = 0; @@ -411,11 +423,11 @@ EXTERN void* __kmpc_data_sharing_push_stack(size_t DataSize, } if (!NewSlot) { - // Allocate at least the default size. - // TODO: generalize this for workers which need a larger data slot - // i.e. using DS_Worker_Warp_Slot_Size. - if (DS_Slot_Size > DataSize) - NewSize = DS_Slot_Size; + // Allocate at least the default size for each type of slot. + size_t DefaultSlotSize = + IsMasterThread() ? DS_Slot_Size : DS_Worker_Warp_Slot_Size; + if (DefaultSlotSize > NewSize) + NewSize = DefaultSlotSize; NewSlot = (__kmpc_data_sharing_slot *)malloc( sizeof(__kmpc_data_sharing_slot) + NewSize); NewSlot->Next = 0; @@ -433,7 +445,7 @@ EXTERN void* __kmpc_data_sharing_push_stack(size_t DataSize, // The current slot becomes the new slot. SlotP = NewSlot; // The stack pointer always points to the next free stack frame. - StackP = &NewSlot->Data[DataSize]; + StackP = &NewSlot->Data[PushSize]; // The frame pointer always points to the beginning of the frame. FrameP = &NewSlot->Data[0]; } else { @@ -443,16 +455,14 @@ EXTERN void* __kmpc_data_sharing_push_stack(size_t DataSize, // Reset stack pointer to the requested address. StackP = (void *)RequestedEndAddress; } - - return FrameP; } - // TODO: add memory fence here when this function can be called by - // worker threads also. For now, this function is only called by the - // master thread of each team. + __threadfence_block(); - // TODO: implement sharing across workers. - return 0; + // Compute the start address of the frame of each thread in the warp. + uintptr_t FrameStartAddress = (uintptr_t)FrameP; + FrameStartAddress += (uintptr_t) (getLaneId() * DataSize); + return (void *)FrameStartAddress; } // Pop the stack and free any memory which can be reclaimed. @@ -461,12 +471,15 @@ EXTERN void* __kmpc_data_sharing_push_stack(size_t DataSize, // reclaim all outstanding global memory slots since it is // likely we have reached the end of the kernel. EXTERN void __kmpc_data_sharing_pop_stack(void *FrameStart) { - if (IsMasterThread()) { + if (IsWarpMasterActiveThread()) { unsigned WID = getWarpId(); __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID]; void *&StackP = DataSharingState.StackPtr[WID]; + // Pop current frame from slot. + StackP = FrameStart; + // If we try to pop the last frame of the current slot we need to // move to the previous slot if there is one. const uintptr_t StartAddress = (uintptr_t)FrameStart; @@ -486,27 +499,16 @@ EXTERN void __kmpc_data_sharing_pop_stack(void *FrameStart) { // de-allocate any existing global memory slots. if (!SlotP->Prev) { __kmpc_data_sharing_slot *Tail = DataSharingState.TailPtr[WID]; - - while(Tail && Tail->Prev) { + while(Tail->Prev) { Tail = Tail->Prev; free(Tail->Next); - Tail->Next=0; } + Tail->Next=0; } - } else { - // This is not the last frame popped from this slot. - // Reset StackP - StackP = FrameStart; } - - return; } - // TODO: add memory fence here when this function can be called by - // worker threads also. For now, this function is only called by the - // master thread of each team. - - // TODO: implement sharing across workers. + __threadfence_block(); } // Begin a data sharing context. Maintain a list of references to shared