return()
endif()
+if (NOT LIBOMPTARGET_LLVM_INCLUDE_DIRS)
+ libomptarget_say("Not building device RTL: Missing definition for LIBOMPTARGET_LLVM_INCLUDE_DIRS")
+ return()
+endif()
+
+
# Check if we can create an LLVM bitcode implementation of the runtime library
# that could be inlined in the user application. For that we need to find
# a Clang compiler capable of compiling our CUDA files to LLVM bitcode and
set(clang_opt_flags -O1 -mllvm -openmp-opt-disable -DSHARED_SCRATCHPAD_SIZE=2048)
set(link_opt_flags -O1 -openmp-opt-disable)
+# Prepend -I to each list element
+set(LIBOMPTARGET_LLVM_INCLUDE_DIRS_DEVICERTL "${LIBOMPTARGET_LLVM_INCLUDE_DIRS}")
+list(TRANSFORM LIBOMPTARGET_LLVM_INCLUDE_DIRS_DEVICERTL PREPEND "-I")
+
# Set flags for LLVM Bitcode compilation.
set(bc_flags -S -x c++ -std=c++17
${clang_opt_flags}
-fopenmp -fopenmp-cuda-mode -Xclang -fopenmp-is-device
-Xclang -target-feature -Xclang +ptx61
-I${include_directory}
+ ${LIBOMPTARGET_LLVM_INCLUDE_DIRS_DEVICERTL}
)
if(${LIBOMPTARGET_DEVICE_DEBUG})
#pragma omp declare target
+#include "llvm/Frontend/OpenMP/OMPGridValues.h"
+
using namespace _OMP;
namespace _OMP {
///{
#pragma omp begin declare variant match(device = {arch(amdgcn)})
+constexpr const llvm::omp::GV &getGridValue() {
+ return llvm::omp::AMDGPUGridValues;
+}
+
uint32_t getGridDim(uint32_t n, uint16_t d) {
uint32_t q = n / d;
return q + (n > q * d);
return mapping::getThreadIdInBlock() / mapping::getWarpSize();
}
-uint32_t getWarpSize() { return 64; }
-
uint32_t getNumberOfWarpsInBlock() {
return mapping::getBlockSize() / mapping::getWarpSize();
}
#pragma omp begin declare variant match( \
device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})
+constexpr const llvm::omp::GV &getGridValue() {
+ return llvm::omp::NVPTXGridValues;
+}
+
LaneMaskTy activemask() {
unsigned int Mask;
asm("activemask.b32 %0;" : "=r"(Mask));
return mapping::getThreadIdInBlock() / mapping::getWarpSize();
}
-uint32_t getWarpSize() { return 32; }
-
uint32_t getNumberOfWarpsInBlock() {
return (mapping::getBlockSize() + mapping::getWarpSize() - 1) /
mapping::getWarpSize();
#pragma omp end declare variant
///}
+uint32_t getWarpSize() { return getGridValue().GV_Warp_Size; }
+
} // namespace impl
} // namespace _OMP
return()
endif()
+if (NOT LIBOMPTARGET_LLVM_INCLUDE_DIRS)
+ libomptarget_say("Not building AMDGCN device RTL: Missing definition for LIBOMPTARGET_LLVM_INCLUDE_DIRS")
+ return()
+endif()
+
+
# Copied from nvptx CMakeLists
if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "x86_64")
set(aux_triple x86_64-unknown-linux-gnu)
set(mcpus ${LIBOMPTARGET_AMDGCN_GFXLIST})
endif()
+# Prepend -I to each list element
+set(LIBOMPTARGET_LLVM_INCLUDE_DIRS_AMDGCN "${LIBOMPTARGET_LLVM_INCLUDE_DIRS}")
+list(TRANSFORM LIBOMPTARGET_LLVM_INCLUDE_DIRS_AMDGCN PREPEND "-I")
+
macro(add_cuda_bc_library)
set(cu_cmd ${CLANG_TOOL}
-xc++
${CUDA_DEBUG}
-I${CMAKE_CURRENT_SOURCE_DIR}/src
-I${devicertl_base_directory}/common/include
- -I${devicertl_base_directory})
+ -I${devicertl_base_directory}
+ ${LIBOMPTARGET_LLVM_INCLUDE_DIRS_AMDGCN})
set(bc1_files)
#define NOINLINE __attribute__((noinline))
#define ALIGN(N) __attribute__((aligned(N)))
+#include "llvm/Frontend/OpenMP/OMPGridValues.h"
+
+INLINE constexpr const llvm::omp::GV &getGridValue() {
+ return llvm::omp::AMDGPUGridValues;
+}
+
////////////////////////////////////////////////////////////////////////////////
// Kernel options
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
// The following def must match the absolute limit hardwired in the host RTL
// max number of threads per team
-#define MAX_THREADS_PER_TEAM 1024
-
-#define WARPSIZE 64
+enum { MAX_THREADS_PER_TEAM = getGridValue().GV_Max_WG_Size };
+enum { WARPSIZE = getGridValue().GV_Warp_Size };
// Maximum number of omp state objects per SM allocated statically in global
// memory.
// Data sharing related quantities, need to match what is used in the compiler.
enum DATA_SHARING_SIZES {
// The size reserved for data in a shared memory slot.
- DS_Slot_Size = 256,
+ DS_Slot_Size = getGridValue().GV_Slot_Size,
// The slot size that should be reserved for a working warp.
- DS_Worker_Warp_Slot_Size = WARPSIZE * DS_Slot_Size,
+ DS_Worker_Warp_Slot_Size = getGridValue().warpSlotSize(),
// The maximum number of warps in use
- DS_Max_Warp_Number = 16,
+ DS_Max_Warp_Number = getGridValue().maxWarpNumber(),
};
enum : __kmpc_impl_lanemask_t {
return()
endif()
+if (NOT LIBOMPTARGET_LLVM_INCLUDE_DIRS)
+ libomptarget_say("Not building NVPTX device RTL: Missing definition for LIBOMPTARGET_LLVM_INCLUDE_DIRS")
+ return()
+endif()
+
# Check if we can create an LLVM bitcode implementation of the runtime library
# that could be inlined in the user application. For that we need to find
# a Clang compiler capable of compiling our CUDA files to LLVM bitcode and
src/target_impl.cu
)
+# Prepend -I to each list element
+set(LIBOMPTARGET_LLVM_INCLUDE_DIRS_NVPTX "${LIBOMPTARGET_LLVM_INCLUDE_DIRS}")
+list(TRANSFORM LIBOMPTARGET_LLVM_INCLUDE_DIRS_NVPTX PREPEND "-I")
+
# Set flags for LLVM Bitcode compilation.
set(bc_flags -S -x c++ -O1 -std=c++14
-mllvm -openmp-opt-disable
-D__CUDACC__
-I${devicertl_base_directory}
-I${devicertl_common_directory}/include
- -I${devicertl_nvptx_directory}/src)
+ -I${devicertl_nvptx_directory}/src
+ ${LIBOMPTARGET_LLVM_INCLUDE_DIRS_NVPTX})
if(${LIBOMPTARGET_NVPTX_DEBUG})
list(APPEND bc_flags -DOMPTARGET_NVPTX_DEBUG=-1 -g)
#define NOINLINE __attribute__((noinline))
#define ALIGN(N) __attribute__((aligned(N)))
+#include "llvm/Frontend/OpenMP/OMPGridValues.h"
+
+INLINE constexpr const llvm::omp::GV &getGridValue() {
+ return llvm::omp::NVPTXGridValues;
+}
+
////////////////////////////////////////////////////////////////////////////////
// Kernel options
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
// The following def must match the absolute limit hardwired in the host RTL
// max number of threads per team
-#define MAX_THREADS_PER_TEAM 1024
-
-#define WARPSIZE 32
+enum { MAX_THREADS_PER_TEAM = getGridValue().GV_Max_WG_Size };
+enum { WARPSIZE = getGridValue().GV_Warp_Size };
// Maximum number of omp state objects per SM allocated statically in global
// memory.
// Data sharing related quantities, need to match what is used in the compiler.
enum DATA_SHARING_SIZES {
// The size reserved for data in a shared memory slot.
- DS_Slot_Size = 256,
+ DS_Slot_Size = getGridValue().GV_Slot_Size,
// The slot size that should be reserved for a working warp.
- DS_Worker_Warp_Slot_Size = WARPSIZE * DS_Slot_Size,
+ DS_Worker_Warp_Slot_Size = getGridValue().warpSlotSize(),
// The maximum number of warps in use
- DS_Max_Warp_Number = 32,
+ DS_Max_Warp_Number = getGridValue().maxWarpNumber(),
};
enum : __kmpc_impl_lanemask_t {