[OpenMP][NFC] Move the `noinline` to the parallel entry point

author Johannes Doerfert <johannes@jdoerfert.de>

Mon, 29 Mar 2021 01:13:38 +0000 (20:13 -0500)

committer Johannes Doerfert <johannes@jdoerfert.de>

Tue, 30 Mar 2021 06:12:45 +0000 (01:12 -0500)
author Johannes Doerfert <johannes@jdoerfert.de>
Mon, 29 Mar 2021 01:13:38 +0000 (20:13 -0500)
committer Johannes Doerfert <johannes@jdoerfert.de>
Tue, 30 Mar 2021 06:12:45 +0000 (01:12 -0500)
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp

index 0cef4f0..51b8670 100644 (file)
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -2093,14 +2093,6 @@ void CGOpenMPRuntimeGPU::emitNonSPMDParallelCall(
    // Force inline this outlined function at its call site.
    Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
  
-  // Ensure we do not inline the function. This is trivially true for the ones
-  // passed to __kmpc_fork_call but the ones calles in serialized regions
-  // could be inlined. This is not a perfect but it is closer to the invariant
-  // we want, namely, every data environment starts with a new function.
-  // TODO: We should pass the if condition to the runtime function and do the
-  //       handling there. Much cleaner code.
-  cast<llvm::Function>(OutlinedFn)->addFnAttr(llvm::Attribute::NoInline);
-
    Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
                                                        /*Name=*/".zero.addr");
    CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
@@ -4216,6 +4208,15 @@ llvm::Function *CGOpenMPRuntimeGPU::createParallelDataSharingWrapper(
    auto *Fn = llvm::Function::Create(
        CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
        Twine(OutlinedParallelFn->getName(), "_wrapper"), &CGM.getModule());
+
+  // Ensure we do not inline the function. This is trivially true for the ones
+  // passed to __kmpc_fork_call but the ones called in serialized regions
+  // could be inlined. This is not perfect but it is closer to the invariant
+  // we want, namely, every data environment starts with a new function.
+  // TODO: We should pass the if condition to the runtime function and do the
+  //       handling there. Much cleaner code.
+  Fn->addFnAttr(llvm::Attribute::NoInline);
+
    CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
    Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
    Fn->setDoesNotRecurse();
diff --git a/clang/test/OpenMP/nvptx_parallel_codegen.cpp b/clang/test/OpenMP/nvptx_parallel_codegen.cpp

index 593f7fa..f85d1d4 100644 (file)
--- a/clang/test/OpenMP/nvptx_parallel_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_parallel_codegen.cpp
@@ -4,7 +4,7 @@
  // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix PAR
  // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
  // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ
-// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ
+// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns -disable-O0-optnone | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ
  // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR
  // RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR
  // expected-no-diagnostics
@@ -318,7 +318,8 @@ int bar(int n){
  // CHECK: [[EXIT]]
  // CHECK: ret void
  
-// CHECK: define internal void [[PARALLEL_FN4]](
+// CHECK: noinline
+// CHECK-NEXT: define internal void [[PARALLEL_FN4]](
  // CHECK: [[A:%.+]] = alloca i[[SZ:32|64]],
  // CHECK: store i[[SZ]] 45, i[[SZ]]* %a,
  // CHECK: call void @__kmpc_barrier(%struct.ident_t* @{{.+}}, i32 %{{.+}})
@@ -326,6 +327,9 @@ int bar(int n){
  
  // CHECK: declare void @__kmpc_barrier(%struct.ident_t*, i32) #[[#CONVERGENT:]]
  
+// CHECK: Function Attrs: convergent noinline norecurse nounwind
+// CHECK-NEXT: [[PARALLEL_FN4]]_wrapper
+
  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l58}}_worker()
  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l58}}(
  // CHECK-32: [[A_ADDR:%.+]] = alloca i32,
@@ -373,7 +377,6 @@ int bar(int n){
  // CHECK:  store i32 [[NEW_CC_VAL]], i32* [[CC]],
  // CHECK:  br label
  
-
  // CHECK: declare i32 @__kmpc_warp_active_thread_mask() #[[#CONVERGENT:]]
  // CHECK: declare void @__kmpc_syncwarp(i32) #[[#CONVERGENT:]]
author	Johannes Doerfert <johannes@jdoerfert.de>
	Mon, 29 Mar 2021 01:13:38 +0000 (20:13 -0500)
committer	Johannes Doerfert <johannes@jdoerfert.de>
	Tue, 30 Mar 2021 06:12:45 +0000 (01:12 -0500)
clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp		patch \| blob \| history
clang/test/OpenMP/nvptx_parallel_codegen.cpp		patch \| blob \| history