return()
endif()
+if (NOT LIBOMPTARGET_LLVM_INCLUDE_DIRS)
+ libomptarget_say("Not building device RTL: Missing definition for LIBOMPTARGET_LLVM_INCLUDE_DIRS")
+ return()
+endif()
+
+
# Check if we can create an LLVM bitcode implementation of the runtime library
# that could be inlined in the user application. For that we need to find
# a Clang compiler capable of compiling our CUDA files to LLVM bitcode and
set(clang_opt_flags -O1 -mllvm -openmp-opt-disable -DSHARED_SCRATCHPAD_SIZE=2048)
set(link_opt_flags -O1 -openmp-opt-disable)
+# Prepend -I to each list element
+set(LIBOMPTARGET_LLVM_INCLUDE_DIRS_DEVICERTL "${LIBOMPTARGET_LLVM_INCLUDE_DIRS}")
+list(TRANSFORM LIBOMPTARGET_LLVM_INCLUDE_DIRS_DEVICERTL PREPEND "-I")
+
# Set flags for LLVM Bitcode compilation.
set(bc_flags -S -x c++ -std=c++17
${clang_opt_flags}
-fopenmp -fopenmp-cuda-mode -Xclang -fopenmp-is-device
-Xclang -target-feature -Xclang +ptx61
-I${include_directory}
+ ${LIBOMPTARGET_LLVM_INCLUDE_DIRS_DEVICERTL}
)
if(${LIBOMPTARGET_DEVICE_DEBUG})
#pragma omp declare target
+#include "llvm/Frontend/OpenMP/OMPGridValues.h"
+
using namespace _OMP;
namespace _OMP {
///{
#pragma omp begin declare variant match(device = {arch(amdgcn)})
+constexpr const llvm::omp::GV &getGridValue() {
+ return llvm::omp::AMDGPUGridValues;
+}
+
uint32_t getGridDim(uint32_t n, uint16_t d) {
uint32_t q = n / d;
return q + (n > q * d);
return mapping::getThreadIdInBlock() / mapping::getWarpSize();
}
-uint32_t getWarpSize() { return 64; }
-
uint32_t getNumberOfWarpsInBlock() {
return mapping::getBlockSize() / mapping::getWarpSize();
}
#pragma omp begin declare variant match( \
device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})
+constexpr const llvm::omp::GV &getGridValue() {
+ return llvm::omp::NVPTXGridValues;
+}
+
LaneMaskTy activemask() {
unsigned int Mask;
asm("activemask.b32 %0;" : "=r"(Mask));
return mapping::getThreadIdInBlock() / mapping::getWarpSize();
}
-uint32_t getWarpSize() { return 32; }
-
uint32_t getNumberOfWarpsInBlock() {
return (mapping::getBlockSize() + mapping::getWarpSize() - 1) /
mapping::getWarpSize();
#pragma omp end declare variant
///}
+uint32_t getWarpSize() { return getGridValue().GV_Warp_Size; }
+
} // namespace impl
} // namespace _OMP
return()
endif()
+if (NOT LIBOMPTARGET_LLVM_INCLUDE_DIRS)
+ libomptarget_say("Not building AMDGCN device RTL: Missing definition for LIBOMPTARGET_LLVM_INCLUDE_DIRS")
+ return()
+endif()
+
+
# Copied from nvptx CMakeLists
if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "x86_64")
set(aux_triple x86_64-unknown-linux-gnu)
set(mcpus ${LIBOMPTARGET_AMDGCN_GFXLIST})
endif()
+# Prepend -I to each list element
+set(LIBOMPTARGET_LLVM_INCLUDE_DIRS_AMDGCN "${LIBOMPTARGET_LLVM_INCLUDE_DIRS}")
+list(TRANSFORM LIBOMPTARGET_LLVM_INCLUDE_DIRS_AMDGCN PREPEND "-I")
+
macro(add_cuda_bc_library)
set(cu_cmd ${CLANG_TOOL}
-xc++
${CUDA_DEBUG}
-I${CMAKE_CURRENT_SOURCE_DIR}/src
-I${devicertl_base_directory}/common/include
- -I${devicertl_base_directory})
+ -I${devicertl_base_directory}
+ ${LIBOMPTARGET_LLVM_INCLUDE_DIRS_AMDGCN})
set(bc1_files)
#define NOINLINE __attribute__((noinline))
#define ALIGN(N) __attribute__((aligned(N)))
+#include "llvm/Frontend/OpenMP/OMPGridValues.h"
+
+INLINE constexpr const llvm::omp::GV &getGridValue() {
+ return llvm::omp::AMDGPUGridValues;
+}
+
////////////////////////////////////////////////////////////////////////////////
// Kernel options
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
// The following def must match the absolute limit hardwired in the host RTL
// max number of threads per team
-#define MAX_THREADS_PER_TEAM 1024
-
-#define WARPSIZE 64
+enum { MAX_THREADS_PER_TEAM = getGridValue().GV_Max_WG_Size };
+enum { WARPSIZE = getGridValue().GV_Warp_Size };
// Maximum number of omp state objects per SM allocated statically in global
// memory.
// Data sharing related quantities, need to match what is used in the compiler.
enum DATA_SHARING_SIZES {
// The size reserved for data in a shared memory slot.
- DS_Slot_Size = 256,
+ DS_Slot_Size = getGridValue().GV_Slot_Size,
// The slot size that should be reserved for a working warp.
- DS_Worker_Warp_Slot_Size = WARPSIZE * DS_Slot_Size,
+ DS_Worker_Warp_Slot_Size = getGridValue().warpSlotSize(),
// The maximum number of warps in use
- DS_Max_Warp_Number = 16,
+ DS_Max_Warp_Number = getGridValue().maxWarpNumber(),
};
enum : __kmpc_impl_lanemask_t {
return()
endif()
+if (NOT LIBOMPTARGET_LLVM_INCLUDE_DIRS)
+ libomptarget_say("Not building NVPTX device RTL: Missing definition for LIBOMPTARGET_LLVM_INCLUDE_DIRS")
+ return()
+endif()
+
# Check if we can create an LLVM bitcode implementation of the runtime library
# that could be inlined in the user application. For that we need to find
# a Clang compiler capable of compiling our CUDA files to LLVM bitcode and
src/target_impl.cu
)
+# Prepend -I to each list element
+set(LIBOMPTARGET_LLVM_INCLUDE_DIRS_NVPTX "${LIBOMPTARGET_LLVM_INCLUDE_DIRS}")
+list(TRANSFORM LIBOMPTARGET_LLVM_INCLUDE_DIRS_NVPTX PREPEND "-I")
+
# Set flags for LLVM Bitcode compilation.
set(bc_flags -S -x c++ -O1 -std=c++14
-mllvm -openmp-opt-disable
-D__CUDACC__
-I${devicertl_base_directory}
-I${devicertl_common_directory}/include
- -I${devicertl_nvptx_directory}/src)
+ -I${devicertl_nvptx_directory}/src
+ ${LIBOMPTARGET_LLVM_INCLUDE_DIRS_NVPTX})
if(${LIBOMPTARGET_NVPTX_DEBUG})
list(APPEND bc_flags -DOMPTARGET_NVPTX_DEBUG=-1 -g)
#define NOINLINE __attribute__((noinline))
#define ALIGN(N) __attribute__((aligned(N)))
+#include "llvm/Frontend/OpenMP/OMPGridValues.h"
+
+INLINE constexpr const llvm::omp::GV &getGridValue() {
+ return llvm::omp::NVPTXGridValues;
+}
+
////////////////////////////////////////////////////////////////////////////////
// Kernel options
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
// The following def must match the absolute limit hardwired in the host RTL
// max number of threads per team
-#define MAX_THREADS_PER_TEAM 1024
-
-#define WARPSIZE 32
+enum { MAX_THREADS_PER_TEAM = getGridValue().GV_Max_WG_Size };
+enum { WARPSIZE = getGridValue().GV_Warp_Size };
// Maximum number of omp state objects per SM allocated statically in global
// memory.
// Data sharing related quantities, need to match what is used in the compiler.
enum DATA_SHARING_SIZES {
// The size reserved for data in a shared memory slot.
- DS_Slot_Size = 256,
+ DS_Slot_Size = getGridValue().GV_Slot_Size,
// The slot size that should be reserved for a working warp.
- DS_Worker_Warp_Slot_Size = WARPSIZE * DS_Slot_Size,
+ DS_Worker_Warp_Slot_Size = getGridValue().warpSlotSize(),
// The maximum number of warps in use
- DS_Max_Warp_Number = 32,
+ DS_Max_Warp_Number = getGridValue().maxWarpNumber(),
};
enum : __kmpc_impl_lanemask_t {