From 9b06ac98d0818be3534abe6bc031bf8a40361363 Mon Sep 17 00:00:00 2001 From: JonChesterfield Date: Fri, 1 Nov 2019 02:21:12 +0000 Subject: [PATCH] [nfc][omptarget] Use builtin var abstraction. Second pass at D69476 Summary: [nfc][omptarget] Use builtin var abstraction. Second pass at D69476 Use the wrappers in support.h for cuda builtin variables at all call sites. Localises use of cuda and removes WARPSIZE==32 assumption in debug.h. Reviewers: ABataev, jdoerfert, grokos Reviewed By: jdoerfert Subscribers: openmp-commits Tags: #openmp Differential Revision: https://reviews.llvm.org/D69693 --- .../libomptarget/deviceRTLs/nvptx/src/data_sharing.cu | 19 +++++++------------ openmp/libomptarget/deviceRTLs/nvptx/src/debug.h | 11 +++++------ openmp/libomptarget/deviceRTLs/nvptx/src/libcall.cu | 2 +- openmp/libomptarget/deviceRTLs/nvptx/src/support.cu | 8 +++++--- 4 files changed, 18 insertions(+), 22 deletions(-) diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu index 78b04ec..f2892ac 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu @@ -13,11 +13,6 @@ #include "target_impl.h" #include -// Warp ID in the CUDA block -INLINE static unsigned getWarpId() { return threadIdx.x / WARPSIZE; } -// Lane ID in the CUDA warp. -INLINE static unsigned getLaneId() { return threadIdx.x % WARPSIZE; } - // Return true if this is the first active thread in the warp. INLINE static bool IsWarpMasterActiveThread() { unsigned long long Mask = __kmpc_impl_activemask(); @@ -67,7 +62,7 @@ __kmpc_initialize_data_sharing_environment(__kmpc_data_sharing_slot *rootS, DSPRINT0(DSFLAG_INIT, "Entering __kmpc_initialize_data_sharing_environment\n"); - unsigned WID = getWarpId(); + unsigned WID = GetWarpId(); DSPRINT(DSFLAG_INIT, "Warp ID: %u\n", WID); omptarget_nvptx_TeamDescr *teamDescr = @@ -111,7 +106,7 @@ EXTERN void *__kmpc_data_sharing_environment_begin( DSPRINT(DSFLAG, "Default Data Size %016llx\n", (unsigned long long)SharingDefaultDataSize); - unsigned WID = getWarpId(); + unsigned WID = GetWarpId(); __kmpc_impl_lanemask_t CurActiveThreads = __kmpc_impl_activemask(); __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID]; @@ -231,7 +226,7 @@ EXTERN void __kmpc_data_sharing_environment_end( DSPRINT0(DSFLAG, "Entering __kmpc_data_sharing_environment_end\n"); - unsigned WID = getWarpId(); + unsigned WID = GetWarpId(); if (IsEntryPoint) { if (IsWarpMasterActiveThread()) { @@ -359,7 +354,7 @@ EXTERN void __kmpc_data_sharing_init_stack_spmd() { // This function initializes the stack pointer with the pointer to the // statically allocated shared memory slots. The size of a shared memory // slot is pre-determined to be 256 bytes. - if (threadIdx.x == 0) + if (GetThreadIdInBlock() == 0) data_sharing_init_stack_common(); __threadfence_block(); @@ -377,7 +372,7 @@ INLINE static void* data_sharing_push_stack_common(size_t PushSize) { PushSize = (PushSize + (Alignment - 1)) / Alignment * Alignment; // Frame pointer must be visible to all workers in the same warp. - const unsigned WID = getWarpId(); + const unsigned WID = GetWarpId(); void *FrameP = 0; __kmpc_impl_lanemask_t CurActive = __kmpc_impl_activemask(); @@ -467,7 +462,7 @@ EXTERN void *__kmpc_data_sharing_push_stack(size_t DataSize, // Compute the start address of the frame of each thread in the warp. uintptr_t FrameStartAddress = (uintptr_t) data_sharing_push_stack_common(PushSize); - FrameStartAddress += (uintptr_t) (getLaneId() * DataSize); + FrameStartAddress += (uintptr_t) (GetLaneId() * DataSize); return (void *)FrameStartAddress; } @@ -482,7 +477,7 @@ EXTERN void __kmpc_data_sharing_pop_stack(void *FrameStart) { __threadfence_block(); if (GetThreadIdInBlock() % WARPSIZE == 0) { - unsigned WID = getWarpId(); + unsigned WID = GetWarpId(); // Current slot __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID]; diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/debug.h b/openmp/libomptarget/deviceRTLs/nvptx/src/debug.h index 1052392..1f66057 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/src/debug.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/debug.h @@ -128,12 +128,12 @@ #if OMPTARGET_NVPTX_DEBUG || OMPTARGET_NVPTX_TEST || OMPTARGET_NVPTX_WARNING #include -#include "target_impl.h" +#include "support.h" template NOINLINE static void log(const char *fmt, Arguments... parameters) { - printf(fmt, (int)blockIdx.x, (int)threadIdx.x, (int)(threadIdx.x / WARPSIZE), - (int)(threadIdx.x & 0x1F), parameters...); + printf(fmt, (int)GetBlockIdInKernel(), (int)GetThreadIdInBlock(), + (int)GetWarpId(), (int)GetLaneId, parameters...); } #endif @@ -144,9 +144,8 @@ template NOINLINE static void check(bool cond, const char *fmt, Arguments... parameters) { if (!cond) - printf(fmt, (int)blockIdx.x, (int)threadIdx.x, - (int)(threadIdx.x / WARPSIZE), (int)(threadIdx.x & 0x1F), - parameters...); + printf(fmt, (int)GetBlockIdInKernel(), (int)GetThreadIdInBlock(), + (int)GetWarpId(), (int)GetLaneId, parameters...); assert(cond); } diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/libcall.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/libcall.cu index 20a22f4..e86b1d1 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/src/libcall.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/libcall.cu @@ -364,7 +364,7 @@ EXTERN void omp_set_lock(omp_lock_t *lock) { for (;;) { now = clock(); clock_t cycles = now > start ? now - start : now + (0xffffffff - start); - if (cycles >= __OMP_SPIN * blockIdx.x) { + if (cycles >= __OMP_SPIN * GetBlockIdInKernel()) { break; } } diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/support.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/support.cu index 2b89d8d..2767597 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/src/support.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/support.cu @@ -106,9 +106,9 @@ INLINE int GetNumberOfBlocksInKernel() { return gridDim.x; } INLINE int GetNumberOfThreadsInBlock() { return blockDim.x; } -INLINE unsigned GetWarpId() { return threadIdx.x / WARPSIZE; } +INLINE unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; } -INLINE unsigned GetLaneId() { return threadIdx.x & (WARPSIZE - 1); } +INLINE unsigned GetLaneId() { return GetThreadIdInBlock() & (WARPSIZE - 1); } //////////////////////////////////////////////////////////////////////////////// // @@ -124,7 +124,9 @@ INLINE unsigned GetLaneId() { return threadIdx.x & (WARPSIZE - 1); } // If NumThreads is 1024, master id is 992. // // Called in Generic Execution Mode only. -INLINE int GetMasterThreadID() { return (blockDim.x - 1) & ~(WARPSIZE - 1); } +INLINE int GetMasterThreadID() { + return (GetNumberOfThreadsInBlock() - 1) & ~(WARPSIZE - 1); +} // The last warp is reserved for the master; other warps are workers. // Called in Generic Execution Mode only. -- 2.7.4