LANGOPT(OpenMPThreadSubscription , 1, 0, "Assume work-shared loops do not have more iterations than participating threads.")
LANGOPT(OpenMPTeamSubscription , 1, 0, "Assume distributed loops do not have more iterations than participating teams.")
LANGOPT(OpenMPNoThreadState , 1, 0, "Assume that no thread in a parallel region will modify an ICV.")
+LANGOPT(OpenMPNoNestedParallelism , 1, 0, "Assume that no thread in a parallel region will encounter a parallel region")
LANGOPT(OpenMPOffloadMandatory , 1, 0, "Assert that offloading is mandatory and do not create a host fallback.")
LANGOPT(NoGPULib , 1, 0, "Indicate a build without the standard GPU libraries.")
LANGOPT(RenderScript , 1, 0, "RenderScript")
Flags<[CC1Option, NoArgumentUnused, HelpHidden]>,
HelpText<"Assert no thread in a parallel region modifies an ICV">,
MarshallingInfoFlag<LangOpts<"OpenMPNoThreadState">>;
+def fopenmp_assume_no_nested_parallelism : Flag<["-"], "fopenmp-assume-no-nested-parallelism">, Group<f_Group>,
+ Flags<[CC1Option, NoArgumentUnused, HelpHidden]>,
+ HelpText<"Assert no nested parallel regions in the GPU">,
+ MarshallingInfoFlag<LangOpts<"OpenMPNoNestedParallelism">>;
def fopenmp_offload_mandatory : Flag<["-"], "fopenmp-offload-mandatory">, Group<f_Group>,
Flags<[CC1Option, NoArgumentUnused]>,
HelpText<"Do not create a host fallback if offloading to the device fails.">,
"__omp_rtl_assume_threads_oversubscription");
OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPNoThreadState,
"__omp_rtl_assume_no_thread_state");
+ OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPNoNestedParallelism,
+ "__omp_rtl_assume_no_nested_parallelism");
}
void CGOpenMPRuntimeGPU::emitProcBindClause(CodeGenFunction &CGF,
CmdArgs.push_back("-fopenmp-assume-threads-oversubscription");
if (Args.hasArg(options::OPT_fopenmp_assume_no_thread_state))
CmdArgs.push_back("-fopenmp-assume-no-thread-state");
+ if (Args.hasArg(options::OPT_fopenmp_assume_no_nested_parallelism))
+ CmdArgs.push_back("-fopenmp-assume-no-nested-parallelism");
if (Args.hasArg(options::OPT_fopenmp_offload_mandatory))
CmdArgs.push_back("-fopenmp-offload-mandatory");
break;
for (StringRef LibName : BCLibs)
CmdArgs.push_back(Args.MakeArgString(
- "--bitcode-library=" + Action::GetOffloadKindName(Action::OFK_OpenMP) +
- "-" + TC->getTripleString() + "-" + Arch + "=" + LibName));
+ "--bitcode-library=" +
+ Action::GetOffloadKindName(Action::OFK_OpenMP) + "-" +
+ TC->getTripleString() + "-" + Arch + "=" + LibName));
}
if (D.isUsingLTO(/* IsOffload */ true)) {
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-assume-threads-oversubscription -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-THREADS
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-assume-teams-oversubscription -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-TEAMS
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-assume-no-thread-state -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-STATE
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-assume-no-nested-parallelism -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-NESTED
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -nogpulib -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-RUNTIME
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-assume-teams-oversubscription -fopenmp-is-device -o - | FileCheck %s --check-prefix=CHECK-RUNTIME
// expected-no-diagnostics
// CHECK: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 0
// CHECK: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0
// CHECK: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0
+// CHECK: @__omp_rtl_assume_no_nested_parallelism = weak_odr hidden constant i32 0
//.
// CHECK-EQ: @__omp_rtl_debug_kind = weak_odr hidden constant i32 111
// CHECK-EQ: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 0
// CHECK-EQ: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0
// CHECK-EQ: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0
+// CHECK-EQ: @__omp_rtl_assume_no_nested_parallelism = weak_odr hidden constant i32 0
//.
// CHECK-DEFAULT: @__omp_rtl_debug_kind = weak_odr hidden constant i32 0
// CHECK-DEFAULT: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 0
// CHECK-DEFAULT: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0
// CHECK-DEFAULT: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0
+// CHECK-DEFAULT: @__omp_rtl_assume_no_nested_parallelism = weak_odr hidden constant i32 0
//.
// CHECK-THREADS: @__omp_rtl_debug_kind = weak_odr hidden constant i32 0
// CHECK-THREADS: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 0
// CHECK-THREADS: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 1
// CHECK-THREADS: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0
+// CHECK-THREADS: @__omp_rtl_assume_no_nested_parallelism = weak_odr hidden constant i32 0
//.
// CHECK-TEAMS: @__omp_rtl_debug_kind = weak_odr hidden constant i32 0
// CHECK-TEAMS: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 1
// CHECK-TEAMS: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0
// CHECK-TEAMS: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0
+// CHECK-TEAMS: @__omp_rtl_assume_no_nested_parallelism = weak_odr hidden constant i32 0
//.
// CHECK-STATE: @__omp_rtl_debug_kind = weak_odr hidden constant i32 0
// CHECK-STATE: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 0
// CHECK-STATE: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0
// CHECK-STATE: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 1
+// CHECK-STATE: @__omp_rtl_assume_no_nested_parallelism = weak_odr hidden constant i32 0
+//.
+// CHECK-NESTED: @__omp_rtl_debug_kind = weak_odr hidden constant i32 0
+// CHECK-NESTED: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 0
+// CHECK-NESTED: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0
+// CHECK-NESTED: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0
+// CHECK-NESTED: @__omp_rtl_assume_no_nested_parallelism = weak_odr hidden constant i32 1
//.
// CHECK-RUNTIME-NOT: @__omp_rtl_debug_kind = weak_odr hidden constant i32 0
// CHECK-RUNTIME-NOT: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 1
// CHECK-RUNTIME-NOT: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0
// CHECK-RUNTIME-NOT: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0
+// CHECK-RUNTIME-NOT: @__omp_rtl_assume_no_nested_parallelism = weak_odr hidden constant i32 0
//.
void foo() {
#pragma omp target
/// explicitly disabled by the user.
bool mayUseThreadStates();
+/// Indicates if this kernel may require data environments for nested
+/// parallelism, or if it was explicitly disabled by the user.
+bool mayUseNestedParallelism();
+
} // namespace config
} // namespace _OMP
// defined by CGOpenMPRuntimeGPU
extern uint32_t __omp_rtl_debug_kind;
extern uint32_t __omp_rtl_assume_no_thread_state;
+extern uint32_t __omp_rtl_assume_no_nested_parallelism;
// TODO: We want to change the name as soon as the old runtime is gone.
// This variable should be visibile to the plugin so we override the default
bool config::mayUseThreadStates() { return !__omp_rtl_assume_no_thread_state; }
+bool config::mayUseNestedParallelism() {
+ return !__omp_rtl_assume_no_nested_parallelism;
+}
+
#pragma omp end declare target
uint32_t TId = mapping::getThreadIdInBlock();
+ // Assert the parallelism level is zero if disabled by the user.
+ ASSERT((config::mayUseNestedParallelism() || icv::Level == 0) &&
+ "nested parallelism while disabled");
+
// Handle the serialized case first, same for SPMD/non-SPMD:
// 1) if-clause(0)
- // 2) nested parallel regions
- // 3) parallel in task or other thread state inducing construct
- if (OMP_UNLIKELY(!if_expr || icv::Level || state::HasThreadState)) {
+ // 2) parallel in task or other thread state inducing construct
+ // 3) nested parallel regions
+ if (OMP_UNLIKELY(!if_expr || state::HasThreadState ||
+ (config::mayUseNestedParallelism() && icv::Level))) {
state::DateEnvironmentRAII DERAII(ident);
++icv::Level;
invokeMicrotask(TId, 0, fn, args, nargs);