[OpenMP][Offloading] Use bitset to indicate execution mode instead of value

author Shilei Tian <tianshilei1992@gmail.com>

Wed, 22 Sep 2021 15:40:37 +0000 (11:40 -0400)

committer Shilei Tian <tianshilei1992@gmail.com>

Wed, 22 Sep 2021 15:40:52 +0000 (11:40 -0400)
author Shilei Tian <tianshilei1992@gmail.com>
Wed, 22 Sep 2021 15:40:37 +0000 (11:40 -0400)
committer Shilei Tian <tianshilei1992@gmail.com>
Wed, 22 Sep 2021 15:40:52 +0000 (11:40 -0400)
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp

index 9d28b42..16f1b0b 100644 (file)
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -1112,11 +1112,12 @@ void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D,
  // warps participate in parallel work.
  static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name,
                                       bool Mode) {
-  auto *GVMode =
-      new llvm::GlobalVariable(CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
-                               llvm::GlobalValue::WeakAnyLinkage,
-                               llvm::ConstantInt::get(CGM.Int8Ty, Mode ? 0 : 1),
-                               Twine(Name, "_exec_mode"));
+  auto *GVMode = new llvm::GlobalVariable(
+      CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
+      llvm::GlobalValue::WeakAnyLinkage,
+      llvm::ConstantInt::get(CGM.Int8Ty, Mode ? OMP_TGT_EXEC_MODE_SPMD
+                                              : OMP_TGT_EXEC_MODE_GENERIC),
+      Twine(Name, "_exec_mode"));
    CGM.addCompilerUsedGlobal(GVMode);
  }
  
diff --git a/clang/test/OpenMP/nvptx_target_parallel_proc_bind_codegen.cpp b/clang/test/OpenMP/nvptx_target_parallel_proc_bind_codegen.cpp

index 8eaff64..71d8d77 100644 (file)
--- a/clang/test/OpenMP/nvptx_target_parallel_proc_bind_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_target_parallel_proc_bind_codegen.cpp
@@ -16,9 +16,9 @@
  #define HEADER
  
  // Check that the execution mode of all 3 target regions on the gpu is set to SPMD Mode.
-// CHECK-DAG: {{@__omp_offloading_.+l29}}_exec_mode = weak constant i8 0
-// CHECK-DAG: {{@__omp_offloading_.+l33}}_exec_mode = weak constant i8 0
-// CHECK-DAG: {{@__omp_offloading_.+l38}}_exec_mode = weak constant i8 0
+// CHECK-DAG: {{@__omp_offloading_.+l29}}_exec_mode = weak constant i8 2
+// CHECK-DAG: {{@__omp_offloading_.+l33}}_exec_mode = weak constant i8 2
+// CHECK-DAG: {{@__omp_offloading_.+l38}}_exec_mode = weak constant i8 2
  
  template<typename tx>
  tx ftemplate(int n) {
diff --git a/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp b/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp

index 43a17c9..fdba4e9 100644 (file)
--- a/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp
@@ -12,9 +12,9 @@
  // CHECK-DAG: [[TRANSFER_STORAGE:@.+]] = weak addrspace([[SHARED_ADDRSPACE:[0-9]+]]) global [32 x i32]
  
  // Check that the execution mode of all 3 target regions is set to Spmd Mode.
-// CHECK-DAG: {{@__omp_offloading_.+l27}}_exec_mode = weak constant i8 0
-// CHECK-DAG: {{@__omp_offloading_.+l32}}_exec_mode = weak constant i8 0
-// CHECK-DAG: {{@__omp_offloading_.+l38}}_exec_mode = weak constant i8 0
+// CHECK-DAG: {{@__omp_offloading_.+l27}}_exec_mode = weak constant i8 2
+// CHECK-DAG: {{@__omp_offloading_.+l32}}_exec_mode = weak constant i8 2
+// CHECK-DAG: {{@__omp_offloading_.+l38}}_exec_mode = weak constant i8 2
  
  template<typename tx>
  tx ftemplate(int n) {
diff --git a/clang/test/OpenMP/nvptx_target_simd_codegen.cpp b/clang/test/OpenMP/nvptx_target_simd_codegen.cpp

index bf8d470..659f1e6 100644 (file)
--- a/clang/test/OpenMP/nvptx_target_simd_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_target_simd_codegen.cpp
@@ -16,10 +16,10 @@
  #define HEADER
  
  // Check that the execution mode of all 2 target regions on the gpu is set to NonSPMD Mode.
-// CHECK-DAG: {{@__omp_offloading_.+l32}}_exec_mode = weak constant i8 0
-// CHECK-DAG: {{@__omp_offloading_.+l37}}_exec_mode = weak constant i8 0
-// CHECK-DAG: {{@__omp_offloading_.+l42}}_exec_mode = weak constant i8 0
-// CHECK-DAG: {{@__omp_offloading_.+l47}}_exec_mode = weak constant i8 0
+// CHECK-DAG: {{@__omp_offloading_.+l32}}_exec_mode = weak constant i8 2
+// CHECK-DAG: {{@__omp_offloading_.+l37}}_exec_mode = weak constant i8 2
+// CHECK-DAG: {{@__omp_offloading_.+l42}}_exec_mode = weak constant i8 2
+// CHECK-DAG: {{@__omp_offloading_.+l47}}_exec_mode = weak constant i8 2
  
  #define N 1000
  
diff --git a/clang/test/OpenMP/nvptx_target_teams_distribute_simd_codegen.cpp b/clang/test/OpenMP/nvptx_target_teams_distribute_simd_codegen.cpp

index bbcc33b..f8cabbe 100644 (file)
--- a/clang/test/OpenMP/nvptx_target_teams_distribute_simd_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_target_teams_distribute_simd_codegen.cpp
@@ -16,10 +16,10 @@
  #define HEADER
  
  // Check that the execution mode of all 2 target regions on the gpu is set to NonSPMD Mode.
-// CHECK-DAG: {{@__omp_offloading_.+l37}}_exec_mode = weak constant i8 0
-// CHECK-DAG: {{@__omp_offloading_.+l43}}_exec_mode = weak constant i8 0
-// CHECK-DAG: {{@__omp_offloading_.+l48}}_exec_mode = weak constant i8 0
-// CHECK-DAG: {{@__omp_offloading_.+l53}}_exec_mode = weak constant i8 0
+// CHECK-DAG: {{@__omp_offloading_.+l37}}_exec_mode = weak constant i8 2
+// CHECK-DAG: {{@__omp_offloading_.+l43}}_exec_mode = weak constant i8 2
+// CHECK-DAG: {{@__omp_offloading_.+l48}}_exec_mode = weak constant i8 2
+// CHECK-DAG: {{@__omp_offloading_.+l53}}_exec_mode = weak constant i8 2
  
  #define N 1000
  #define M 10
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h b/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h

index d174cc8..2fec3e7 100644 (file)
--- a/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h
@@ -128,6 +128,14 @@ enum class OMPScheduleType {
    LLVM_MARK_AS_BITMASK_ENUM(/* LargestValue */ ModifierMask)
  };
  
+enum OMPTgtExecModeFlags : int8_t { // Kernel execution mode, used as bit flags.
+  OMP_TGT_EXEC_MODE_GENERIC = 1 << 0, // Generic mode (with master warp).
+  OMP_TGT_EXEC_MODE_SPMD = 1 << 1, // SPMD mode (without master warp).
+  OMP_TGT_EXEC_MODE_GENERIC_SPMD = // SPMD execution, Generic semantics.
+      OMP_TGT_EXEC_MODE_GENERIC | OMP_TGT_EXEC_MODE_SPMD,
+  LLVM_MARK_AS_BITMASK_ENUM(/* LargestValue */ OMP_TGT_EXEC_MODE_GENERIC_SPMD)
+};
+
  } // end namespace omp
  
  } // end namespace llvm
diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp

index 182650a..23d0d8a 100644 (file)
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -3284,15 +3284,18 @@ struct AAKernelInfoFunction : AAKernelInfo {
      GlobalVariable *ExecMode = Kernel->getParent()->getGlobalVariable(
          (Kernel->getName() + "_exec_mode").str());
      assert(ExecMode && "Kernel without exec mode?");
-    assert(ExecMode->getInitializer() &&
-           ExecMode->getInitializer()->isOneValue() &&
-           "Initially non-SPMD kernel has SPMD exec mode!");
+    assert(ExecMode->getInitializer() && "ExecMode doesn't have initializer!");
  
      // Set the global exec mode flag to indicate SPMD-Generic mode.
-    constexpr int SPMDGeneric = 2;
-    if (!ExecMode->getInitializer()->isZeroValue())
-      ExecMode->setInitializer(
-          ConstantInt::get(ExecMode->getInitializer()->getType(), SPMDGeneric));
+    assert(isa<ConstantInt>(ExecMode->getInitializer()) &&
+           "ExecMode is not an integer!");
+    const int8_t ExecModeVal =
+        cast<ConstantInt>(ExecMode->getInitializer())->getSExtValue();
+    assert(ExecModeVal == OMP_TGT_EXEC_MODE_GENERIC &&
+           "Initially non-SPMD kernel has SPMD exec mode!");
+    ExecMode->setInitializer(
+        ConstantInt::get(ExecMode->getInitializer()->getType(),
+                         ExecModeVal | OMP_TGT_EXEC_MODE_GENERIC_SPMD));
  
      // Next rewrite the init and deinit calls to indicate we use SPMD-mode now.
      const int InitIsSPMDArgNo = 1;
diff --git a/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll b/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll

index e4c341f..d929d0b 100644 (file)
--- a/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll
+++ b/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll
@@ -11,7 +11,7 @@ target triple = "nvptx64"
  ; CHECK: @[[KERNEL0_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1
  ; CHECK: @[[G:[a-zA-Z0-9_$"\\.-]+]] = external global i32
  ; CHECK: @[[KERNEL1_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1
-; CHECK: @[[KERNEL2_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 2
+; CHECK: @[[KERNEL2_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3
  ; CHECK: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
  ; CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, align 8
  ;.
diff --git a/llvm/test/Transforms/OpenMP/is_spmd_exec_mode_fold.ll b/llvm/test/Transforms/OpenMP/is_spmd_exec_mode_fold.ll

index 3be3560..d4069d8 100644 (file)
--- a/llvm/test/Transforms/OpenMP/is_spmd_exec_mode_fold.ll
+++ b/llvm/test/Transforms/OpenMP/is_spmd_exec_mode_fold.ll
@@ -13,7 +13,7 @@ target triple = "nvptx64"
  
  ;.
  ; CHECK: @[[IS_SPMD_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 0
-; CHECK: @[[WILL_BE_SPMD_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 2
+; CHECK: @[[WILL_BE_SPMD_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3
  ; CHECK: @[[NON_SPMD_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1
  ; CHECK: @[[WILL_NOT_BE_SPMD_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1
  ; CHECK: @[[G:[a-zA-Z0-9_$"\\.-]+]] = external global i8
diff --git a/llvm/test/Transforms/OpenMP/spmdization.ll b/llvm/test/Transforms/OpenMP/spmdization.ll

index 64ea133..3cf26fc 100644 (file)
--- a/llvm/test/Transforms/OpenMP/spmdization.ll
+++ b/llvm/test/Transforms/OpenMP/spmdization.ll
@@ -91,10 +91,10 @@
  ;.
  ; AMDGPU: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
  ; AMDGPU: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, align 8
-; AMDGPU: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_L5_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 2
-; AMDGPU: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_STACK_VAR_L20_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 2
-; AMDGPU: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_SHARED_VAR_L35_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 2
-; AMDGPU: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_SHARED_VAR_GUARDED_L50_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 2
+; AMDGPU: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_L5_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3
+; AMDGPU: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_STACK_VAR_L20_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3
+; AMDGPU: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_SHARED_VAR_L35_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3
+; AMDGPU: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_SHARED_VAR_GUARDED_L50_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3
  ; AMDGPU: @[[__OMP_OFFLOADING_14_A34CA11_DO_NOT_SPMDIZE_TARGET_L65_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1
  ; AMDGPU: @[[LLVM_COMPILER_USED:[a-zA-Z0-9_$"\\.-]+]] = appending global [5 x i8*] [i8* @__omp_offloading_14_a34ca11_sequential_loop_l5_exec_mode, i8* @__omp_offloading_14_a34ca11_sequential_loop_to_stack_var_l20_exec_mode, i8* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_l35_exec_mode, i8* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_guarded_l50_exec_mode, i8* @__omp_offloading_14_a34ca11_do_not_spmdize_target_l65_exec_mode], section "llvm.metadata"
  ; AMDGPU: @[[X:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] undef, align 32
@@ -102,10 +102,10 @@
  ;.
  ; NVPTX: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
  ; NVPTX: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, align 8
-; NVPTX: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_L5_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 2
-; NVPTX: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_STACK_VAR_L20_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 2
-; NVPTX: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_SHARED_VAR_L35_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 2
-; NVPTX: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_SHARED_VAR_GUARDED_L50_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 2
+; NVPTX: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_L5_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3
+; NVPTX: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_STACK_VAR_L20_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3
+; NVPTX: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_SHARED_VAR_L35_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3
+; NVPTX: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_SHARED_VAR_GUARDED_L50_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3
  ; NVPTX: @[[__OMP_OFFLOADING_14_A34CA11_DO_NOT_SPMDIZE_TARGET_L65_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1
  ; NVPTX: @[[LLVM_COMPILER_USED:[a-zA-Z0-9_$"\\.-]+]] = appending global [5 x i8*] [i8* @__omp_offloading_14_a34ca11_sequential_loop_l5_exec_mode, i8* @__omp_offloading_14_a34ca11_sequential_loop_to_stack_var_l20_exec_mode, i8* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_l35_exec_mode, i8* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_guarded_l50_exec_mode, i8* @__omp_offloading_14_a34ca11_do_not_spmdize_target_l65_exec_mode], section "llvm.metadata"
  ; NVPTX: @[[X:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] undef, align 32
diff --git a/llvm/test/Transforms/OpenMP/spmdization_assumes.ll b/llvm/test/Transforms/OpenMP/spmdization_assumes.ll

index 75f8178..950e5fc 100644 (file)
--- a/llvm/test/Transforms/OpenMP/spmdization_assumes.ll
+++ b/llvm/test/Transforms/OpenMP/spmdization_assumes.ll
@@ -23,7 +23,7 @@ target triple = "nvptx64"
  ;.
  ; CHECK: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
  ; CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, align 8
-; CHECK: @[[__OMP_OFFLOADING_FD02_404433C2_MAIN_L5_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 2
+; CHECK: @[[__OMP_OFFLOADING_FD02_404433C2_MAIN_L5_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3
  ; CHECK: @[[LLVM_COMPILER_USED:[a-zA-Z0-9_$"\\.-]+]] = appending global [1 x i8*] [i8* @__omp_offloading_fd02_404433c2_main_l5_exec_mode], section "llvm.metadata"
  ;.
  define weak void @__omp_offloading_fd02_404433c2_main_l5(double* nonnull align 8 dereferenceable(8) %x) local_unnamed_addr #0 {
diff --git a/llvm/test/Transforms/OpenMP/spmdization_guarding.ll b/llvm/test/Transforms/OpenMP/spmdization_guarding.ll

index f4e2f75..d77398d 100644 (file)
--- a/llvm/test/Transforms/OpenMP/spmdization_guarding.ll
+++ b/llvm/test/Transforms/OpenMP/spmdization_guarding.ll
@@ -45,7 +45,7 @@ target triple = "nvptx64"
  ;.
  ; CHECK: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
  ; CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, align 8
-; CHECK: @[[__OMP_OFFLOADING_2A_FBFA7A_SEQUENTIAL_LOOP_L6_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 2
+; CHECK: @[[__OMP_OFFLOADING_2A_FBFA7A_SEQUENTIAL_LOOP_L6_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3
  ; CHECK: @[[LLVM_COMPILER_USED:[a-zA-Z0-9_$"\\.-]+]] = appending global [1 x i8*] [i8* @__omp_offloading_2a_fbfa7a_sequential_loop_l6_exec_mode], section "llvm.metadata"
  ;.
  ; CHECK-DISABLED: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
diff --git a/openmp/libomptarget/plugins/cuda/CMakeLists.txt b/openmp/libomptarget/plugins/cuda/CMakeLists.txt

index 7f77bcc..80c3e9b 100644 (file)
--- a/openmp/libomptarget/plugins/cuda/CMakeLists.txt
+++ b/openmp/libomptarget/plugins/cuda/CMakeLists.txt
@@ -22,7 +22,10 @@ libomptarget_say("Building CUDA offloading plugin.")
  # Define the suffix for the runtime messaging dumps.
  add_definitions(-DTARGET_NAME=CUDA)
  
-include_directories(${LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIRS})
+include_directories(
+  ${LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIRS}
+  ${LIBOMPTARGET_LLVM_INCLUDE_DIRS}
+)
  
  set(LIBOMPTARGET_DLOPEN_LIBCUDA OFF)
  option(LIBOMPTARGET_FORCE_DLOPEN_LIBCUDA "Build with dlopened libcuda" ${LIBOMPTARGET_DLOPEN_LIBCUDA})
diff --git a/openmp/libomptarget/plugins/cuda/src/rtl.cpp b/openmp/libomptarget/plugins/cuda/src/rtl.cpp

index 0f61f6c..6b370f1 100644 (file)
--- a/openmp/libomptarget/plugins/cuda/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/cuda/src/rtl.cpp
@@ -28,6 +28,8 @@
  
  #include "MemoryManager.h"
  
+#include "llvm/Frontend/OpenMP/OMPConstants.h"
+
  // Utility for retrieving and printing CUDA error string.
  #ifdef OMPTARGET_DEBUG
  #define CUDA_ERR_STRING(err)                                                   \
@@ -71,28 +73,17 @@ struct FuncOrGblEntryTy {
    std::vector<__tgt_offload_entry> Entries;
  };
  
-enum ExecutionModeType {
-  SPMD,         // constructors, destructors,
-                // combined constructs (`teams distribute parallel for [simd]`)
-  GENERIC,      // everything else
-  SPMD_GENERIC, // Generic kernel with SPMD execution
-  NONE
-};
-
  /// Use a single entity to encode a kernel and a set of flags.
  struct KernelTy {
    CUfunction Func;
  
    // execution mode of kernel
-  // 0 - SPMD mode (without master warp)
-  // 1 - Generic mode (with master warp)
-  // 2 - SPMD mode execution with Generic mode semantics.
-  int8_t ExecutionMode;
+  llvm::omp::OMPTgtExecModeFlags ExecutionMode;
  
    /// Maximal number of threads per block for this kernel.
    int MaxThreadsPerBlock = 0;
  
-  KernelTy(CUfunction _Func, int8_t _ExecutionMode)
+  KernelTy(CUfunction _Func, llvm::omp::OMPTgtExecModeFlags _ExecutionMode)
        : Func(_Func), ExecutionMode(_ExecutionMode) {}
  };
  
@@ -867,7 +858,7 @@ public:
           DPxPTR(E - HostBegin), E->name, DPxPTR(Func));
  
        // default value GENERIC (in case symbol is missing from cubin file)
-      int8_t ExecModeVal = ExecutionModeType::GENERIC;
+      auto ExecModeVal = llvm::omp::OMP_TGT_EXEC_MODE_GENERIC;
        std::string ExecModeNameStr(E->name);
        ExecModeNameStr += "_exec_mode";
        const char *ExecModeName = ExecModeNameStr.c_str();
@@ -876,9 +867,9 @@ public:
        size_t CUSize;
        Err = cuModuleGetGlobal(&ExecModePtr, &CUSize, Module, ExecModeName);
        if (Err == CUDA_SUCCESS) {
-        if (CUSize != sizeof(int8_t)) {
+        if (CUSize != sizeof(llvm::omp::OMPTgtExecModeFlags)) {
            DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n",
-             ExecModeName, CUSize, sizeof(int8_t));
+             ExecModeName, CUSize, sizeof(llvm::omp::OMPTgtExecModeFlags));
            return nullptr;
          }
  
@@ -890,12 +881,6 @@ public:
            CUDA_ERR_STRING(Err);
            return nullptr;
          }
-
-        if (ExecModeVal < 0 || ExecModeVal > 2) {
-          DP("Error wrong exec_mode value specified in cubin file: %d\n",
-             ExecModeVal);
-          return nullptr;
-        }
        } else {
          DP("Loading global exec_mode '%s' - symbol missing, using default "
             "value GENERIC (1)\n",
@@ -1098,12 +1083,19 @@ public:
  
      KernelTy *KernelInfo = reinterpret_cast<KernelTy *>(TgtEntryPtr);
  
+    const bool IsSPMDGenericMode =
+        KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD;
+    const bool IsSPMDMode =
+        KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_SPMD;
+    const bool IsGenericMode =
+        KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC;
+
      int CudaThreadsPerBlock;
      if (ThreadLimit > 0) {
        DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
        CudaThreadsPerBlock = ThreadLimit;
        // Add master warp if necessary
-      if (KernelInfo->ExecutionMode == GENERIC) {
+      if (IsGenericMode) {
          DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize);
          CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;
        }
@@ -1136,13 +1128,21 @@ public:
      unsigned int CudaBlocksPerGrid;
      if (TeamNum <= 0) {
        if (LoopTripCount > 0 && EnvNumTeams < 0) {
-        if (KernelInfo->ExecutionMode == SPMD) {
+        if (IsSPMDGenericMode) {
+          // If we reach this point, then we are executing a kernel that was
+          // transformed from Generic-mode to SPMD-mode. This kernel has
+          // SPMD-mode execution, but needs its blocks to be scheduled
+          // differently because the current loop trip count only applies to the
+          // `teams distribute` region and will create far too few blocks using
+          // the regular SPMD-mode method.
+          CudaBlocksPerGrid = LoopTripCount;
+        } else if (IsSPMDMode) {
            // We have a combined construct, i.e. `target teams distribute
            // parallel for [simd]`. We launch so many teams so that each thread
            // will execute one iteration of the loop. round up to the nearest
            // integer
            CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1;
-        } else if (KernelInfo->ExecutionMode == GENERIC) {
+        } else if (IsGenericMode) {
            // If we reach this point, then we have a non-combined construct, i.e.
            // `teams distribute` with a nested `parallel for` and each team is
            // assigned one iteration of the `distribute` loop. E.g.:
@@ -1156,16 +1156,9 @@ public:
            // Threads within a team will execute the iterations of the `parallel`
            // loop.
            CudaBlocksPerGrid = LoopTripCount;
-        } else if (KernelInfo->ExecutionMode == SPMD_GENERIC) {
-          // If we reach this point, then we are executing a kernel that was
-          // transformed from Generic-mode to SPMD-mode. This kernel has
-          // SPMD-mode execution, but needs its blocks to be scheduled
-          // differently because the current loop trip count only applies to the
-          // `teams distribute` region and will create var too few blocks using
-          // the regular SPMD-mode method.
-          CudaBlocksPerGrid = LoopTripCount;
          } else {
-          REPORT("Unknown execution mode: %d\n", KernelInfo->ExecutionMode);
+          REPORT("Unknown execution mode: %d\n",
+                 static_cast<int8_t>(KernelInfo->ExecutionMode));
            return OFFLOAD_FAIL;
          }
          DP("Using %d teams due to loop trip count %" PRIu32
@@ -1185,16 +1178,12 @@ public:
      }
  
      INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
-         "Launching kernel %s with %d blocks and %d threads in %s "
-         "mode\n",
+         "Launching kernel %s with %d blocks and %d threads in %s mode\n",
           (getOffloadEntry(DeviceId, TgtEntryPtr))
               ? getOffloadEntry(DeviceId, TgtEntryPtr)->name
               : "(null)",
           CudaBlocksPerGrid, CudaThreadsPerBlock,
-         (KernelInfo->ExecutionMode != SPMD
-              ? (KernelInfo->ExecutionMode == GENERIC ? "Generic"
-                                                      : "SPMD-Generic")
-              : "SPMD"));
+         (!IsSPMDMode ? (IsGenericMode ? "Generic" : "SPMD-Generic") : "SPMD"));
  
      CUstream Stream = getStream(DeviceId, AsyncInfo);
      Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1,
author	Shilei Tian <tianshilei1992@gmail.com>
	Wed, 22 Sep 2021 15:40:37 +0000 (11:40 -0400)
committer	Shilei Tian <tianshilei1992@gmail.com>
	Wed, 22 Sep 2021 15:40:52 +0000 (11:40 -0400)
clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp		patch \| blob \| history
clang/test/OpenMP/nvptx_target_parallel_proc_bind_codegen.cpp		patch \| blob \| history
clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp		patch \| blob \| history
clang/test/OpenMP/nvptx_target_simd_codegen.cpp		patch \| blob \| history
clang/test/OpenMP/nvptx_target_teams_distribute_simd_codegen.cpp		patch \| blob \| history
llvm/include/llvm/Frontend/OpenMP/OMPConstants.h		patch \| blob \| history
llvm/lib/Transforms/IPO/OpenMPOpt.cpp		patch \| blob \| history
llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll		patch \| blob \| history
llvm/test/Transforms/OpenMP/is_spmd_exec_mode_fold.ll		patch \| blob \| history
llvm/test/Transforms/OpenMP/spmdization.ll		patch \| blob \| history
llvm/test/Transforms/OpenMP/spmdization_assumes.ll		patch \| blob \| history
llvm/test/Transforms/OpenMP/spmdization_guarding.ll		patch \| blob \| history
openmp/libomptarget/plugins/cuda/CMakeLists.txt		patch \| blob \| history
openmp/libomptarget/plugins/cuda/src/rtl.cpp		patch \| blob \| history