src/omptarget-nvptx.cu
src/parallel.cu
src/reduction.cu
- src/support.cu
src/sync.cu
src/task.cu
)
set(BUILD_SHARED_LIBS OFF)
set(CUDA_SEPARABLE_COMPILATION ON)
list(APPEND CUDA_NVCC_FLAGS -I${devicertl_base_directory})
- cuda_add_library(omptarget-nvptx STATIC unity.cu
+ cuda_add_library(omptarget-nvptx STATIC ${cuda_src_files} ${omp_data_objects}
OPTIONS ${CUDA_ARCH} ${CUDA_DEBUG})
# Install device RTL under the lib destination folder.
#include "target_impl.h"
#include <stdio.h>
+// Warp ID in the CUDA block
+INLINE static unsigned getWarpId() { return threadIdx.x / WARPSIZE; }
+// Lane ID in the CUDA warp.
+INLINE static unsigned getLaneId() { return threadIdx.x % WARPSIZE; }
+
// Return true if this is the first active thread in the warp.
INLINE static bool IsWarpMasterActiveThread() {
unsigned long long Mask = __kmpc_impl_activemask();
DSPRINT0(DSFLAG_INIT,
"Entering __kmpc_initialize_data_sharing_environment\n");
- unsigned WID = GetWarpId();
+ unsigned WID = getWarpId();
DSPRINT(DSFLAG_INIT, "Warp ID: %u\n", WID);
omptarget_nvptx_TeamDescr *teamDescr =
DSPRINT(DSFLAG, "Default Data Size %016llx\n",
(unsigned long long)SharingDefaultDataSize);
- unsigned WID = GetWarpId();
+ unsigned WID = getWarpId();
__kmpc_impl_lanemask_t CurActiveThreads = __kmpc_impl_activemask();
__kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
DSPRINT0(DSFLAG, "Entering __kmpc_data_sharing_environment_end\n");
- unsigned WID = GetWarpId();
+ unsigned WID = getWarpId();
if (IsEntryPoint) {
if (IsWarpMasterActiveThread()) {
// This function initializes the stack pointer with the pointer to the
// statically allocated shared memory slots. The size of a shared memory
// slot is pre-determined to be 256 bytes.
- if (GetThreadIdInBlock() == 0)
+ if (threadIdx.x == 0)
data_sharing_init_stack_common();
__threadfence_block();
PushSize = (PushSize + (Alignment - 1)) / Alignment * Alignment;
// Frame pointer must be visible to all workers in the same warp.
- const unsigned WID = GetWarpId();
+ const unsigned WID = getWarpId();
void *FrameP = 0;
__kmpc_impl_lanemask_t CurActive = __kmpc_impl_activemask();
// Compute the start address of the frame of each thread in the warp.
uintptr_t FrameStartAddress =
(uintptr_t) data_sharing_push_stack_common(PushSize);
- FrameStartAddress += (uintptr_t) (GetLaneId() * DataSize);
+ FrameStartAddress += (uintptr_t) (getLaneId() * DataSize);
return (void *)FrameStartAddress;
}
__threadfence_block();
if (GetThreadIdInBlock() % WARPSIZE == 0) {
- unsigned WID = GetWarpId();
+ unsigned WID = getWarpId();
// Current slot
__kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
#if OMPTARGET_NVPTX_DEBUG || OMPTARGET_NVPTX_TEST || OMPTARGET_NVPTX_WARNING
#include <stdio.h>
-#include "support.h"
+#include "target_impl.h"
template <typename... Arguments>
NOINLINE static void log(const char *fmt, Arguments... parameters) {
- printf(fmt, (int)GetBlockIdInKernel(), (int)GetThreadIdInBlock(),
- (int)GetWarpId(), (int)GetLaneId(), parameters...);
+ printf(fmt, (int)blockIdx.x, (int)threadIdx.x, (int)(threadIdx.x / WARPSIZE),
+ (int)(threadIdx.x & 0x1F), parameters...);
}
#endif
NOINLINE static void check(bool cond, const char *fmt,
Arguments... parameters) {
if (!cond)
- printf(fmt, (int)GetBlockIdInKernel(), (int)GetThreadIdInBlock(),
- (int)GetWarpId(), (int)GetLaneId(), parameters...);
+ printf(fmt, (int)blockIdx.x, (int)threadIdx.x,
+ (int)(threadIdx.x / WARPSIZE), (int)(threadIdx.x & 0x1F),
+ parameters...);
assert(cond);
}
for (;;) {
now = clock();
clock_t cycles = now > start ? now - start : now + (0xffffffff - start);
- if (cycles >= __OMP_SPIN * GetBlockIdInKernel()) {
+ if (cycles >= __OMP_SPIN * blockIdx.x) {
break;
}
}
////////////////////////////////////////////////////////////////////////////////
#include "omptarget-nvptxi.h"
+#include "supporti.h"
#endif
//
//===----------------------------------------------------------------------===//
-#ifndef OMPTARGET_SUPPORT_H
-#define OMPTARGET_SUPPORT_H
-
-#include "interface.h"
#include "target_impl.h"
-
////////////////////////////////////////////////////////////////////////////////
// Execution Parameters
////////////////////////////////////////////////////////////////////////////////
RuntimeMask = 0x02u,
};
-DEVICE void setExecutionParameters(ExecutionMode EMode, RuntimeMode RMode);
-DEVICE bool isGenericMode();
-DEVICE bool isSPMDMode();
-DEVICE bool isRuntimeUninitialized();
-DEVICE bool isRuntimeInitialized();
-
-////////////////////////////////////////////////////////////////////////////////
-// Execution Modes based on location parameter fields
-////////////////////////////////////////////////////////////////////////////////
-
-DEVICE bool checkSPMDMode(kmp_Ident *loc);
-
-DEVICE bool checkGenericMode(kmp_Ident *loc);
-
-DEVICE bool checkRuntimeUninitialized(kmp_Ident *loc);
-
-DEVICE bool checkRuntimeInitialized(kmp_Ident *loc);
+INLINE void setExecutionParameters(ExecutionMode EMode, RuntimeMode RMode);
+INLINE bool isGenericMode();
+INLINE bool isSPMDMode();
+INLINE bool isRuntimeUninitialized();
+INLINE bool isRuntimeInitialized();
////////////////////////////////////////////////////////////////////////////////
// get info from machine
////////////////////////////////////////////////////////////////////////////////
// get low level ids of resources
-DEVICE int GetThreadIdInBlock();
-DEVICE int GetBlockIdInKernel();
-DEVICE int GetNumberOfBlocksInKernel();
-DEVICE int GetNumberOfThreadsInBlock();
-DEVICE unsigned GetWarpId();
-DEVICE unsigned GetLaneId();
+INLINE int GetThreadIdInBlock();
+INLINE int GetBlockIdInKernel();
+INLINE int GetNumberOfBlocksInKernel();
+INLINE int GetNumberOfThreadsInBlock();
+INLINE unsigned GetWarpId();
+INLINE unsigned GetLaneId();
// get global ids to locate tread/team info (constant regardless of OMP)
-DEVICE int GetLogicalThreadIdInBlock(bool isSPMDExecutionMode);
-DEVICE int GetMasterThreadID();
-DEVICE int GetNumberOfWorkersInTeam();
+INLINE int GetLogicalThreadIdInBlock(bool isSPMDExecutionMode);
+INLINE int GetMasterThreadID();
+INLINE int GetNumberOfWorkersInTeam();
// get OpenMP thread and team ids
-DEVICE int GetOmpThreadId(int threadId,
+INLINE int GetOmpThreadId(int threadId,
bool isSPMDExecutionMode); // omp_thread_num
-DEVICE int GetOmpTeamId(); // omp_team_num
+INLINE int GetOmpTeamId(); // omp_team_num
// get OpenMP number of threads and team
-DEVICE int GetNumberOfOmpThreads(bool isSPMDExecutionMode); // omp_num_threads
-DEVICE int GetNumberOfOmpTeams(); // omp_num_teams
+INLINE int GetNumberOfOmpThreads(bool isSPMDExecutionMode); // omp_num_threads
+INLINE int GetNumberOfOmpTeams(); // omp_num_teams
// get OpenMP number of procs
-DEVICE int GetNumberOfProcsInTeam(bool isSPMDExecutionMode);
-DEVICE int GetNumberOfProcsInDevice(bool isSPMDExecutionMode);
+INLINE int GetNumberOfProcsInTeam(bool isSPMDExecutionMode);
+INLINE int GetNumberOfProcsInDevice(bool isSPMDExecutionMode);
// masters
-DEVICE int IsTeamMaster(int ompThreadId);
+INLINE int IsTeamMaster(int ompThreadId);
// Parallel level
-DEVICE void IncParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask);
-DEVICE void DecParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask);
+INLINE void IncParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask);
+INLINE void DecParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask);
////////////////////////////////////////////////////////////////////////////////
// Memory
////////////////////////////////////////////////////////////////////////////////
// safe alloc and free
-DEVICE void *SafeMalloc(size_t size, const char *msg); // check if success
-DEVICE void *SafeFree(void *ptr, const char *msg);
+INLINE void *SafeMalloc(size_t size, const char *msg); // check if success
+INLINE void *SafeFree(void *ptr, const char *msg);
// pad to a alignment (power of 2 only)
-DEVICE unsigned long PadBytes(unsigned long size, unsigned long alignment);
+INLINE unsigned long PadBytes(unsigned long size, unsigned long alignment);
#define ADD_BYTES(_addr, _bytes) \
((void *)((char *)((void *)(_addr)) + (_bytes)))
#define SUB_BYTES(_addr, _bytes) \
////////////////////////////////////////////////////////////////////////////////
// Teams Reduction Scratchpad Helpers
////////////////////////////////////////////////////////////////////////////////
-DEVICE unsigned int *GetTeamsReductionTimestamp();
-DEVICE char *GetTeamsReductionScratchpad();
-DEVICE void SetTeamsReductionScratchpadPtr(void *ScratchpadPtr);
-
-#endif
+INLINE unsigned int *GetTeamsReductionTimestamp();
+INLINE char *GetTeamsReductionScratchpad();
+INLINE void SetTeamsReductionScratchpadPtr(void *ScratchpadPtr);
-//===--------- support.cu - NVPTX OpenMP support functions ------- CUDA -*-===//
+//===--------- supporti.h - NVPTX OpenMP support functions ------- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
//
//===----------------------------------------------------------------------===//
-#include "support.h"
-#include "debug.h"
-#include "omptarget-nvptx.h"
-
////////////////////////////////////////////////////////////////////////////////
// Execution Parameters
////////////////////////////////////////////////////////////////////////////////
+#include "target_impl.h"
+
INLINE void setExecutionParameters(ExecutionMode EMode, RuntimeMode RMode) {
execution_param = EMode;
execution_param |= RMode;
INLINE int GetNumberOfThreadsInBlock() { return blockDim.x; }
-INLINE unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; }
+INLINE unsigned GetWarpId() { return threadIdx.x / WARPSIZE; }
-INLINE unsigned GetLaneId() { return GetThreadIdInBlock() & (WARPSIZE - 1); }
+INLINE unsigned GetLaneId() { return threadIdx.x & (WARPSIZE - 1); }
////////////////////////////////////////////////////////////////////////////////
//
// If NumThreads is 1024, master id is 992.
//
// Called in Generic Execution Mode only.
-INLINE int GetMasterThreadID() {
- return (GetNumberOfThreadsInBlock() - 1) & ~(WARPSIZE - 1);
-}
+INLINE int GetMasterThreadID() { return (blockDim.x - 1) & ~(WARPSIZE - 1); }
// The last warp is reserved for the master; other warps are workers.
// Called in Generic Execution Mode only.
#include <cuda.h>
#include "nvptx_interface.h"
-#define DEVICE __device__
-#define INLINE __forceinline__ DEVICE
-#define NOINLINE __noinline__ DEVICE
+#define INLINE __forceinline__ __device__
+#define NOINLINE __noinline__ __device__
////////////////////////////////////////////////////////////////////////////////
// Kernel options
+++ /dev/null
-//===------ unity.cu - Unity build of NVPTX deviceRTL ------------ CUDA -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Support compilers, specifically NVCC, which have not implemented link time
-// optimisation. This removes the runtime cost of moving inline functions into
-// source files in exchange for preventing efficient incremental builds.
-//
-//===----------------------------------------------------------------------===//
-
-#include "src/cancel.cu"
-#include "src/critical.cu"
-#include "src/data_sharing.cu"
-#include "src/libcall.cu"
-#include "src/loop.cu"
-#include "src/omp_data.cu"
-#include "src/omptarget-nvptx.cu"
-#include "src/parallel.cu"
-#include "src/reduction.cu"
-#include "src/support.cu"
-#include "src/sync.cu"
-#include "src/task.cu"