[AMDGPU] Handle functions in llvm's global ctors and dtors list
author Reshabh Sharma <Reshabhkumar.Sharma@amd.com>
Fri, 6 Aug 2021 09:56:12 +0000 (15:26 +0530)
committer Reshabh Sharma <Reshabhkumar.Sharma@amd.com>
Fri, 6 Aug 2021 10:23:33 +0000 (15:53 +0530)
This patch introduces a new code object metadata field, ".kind"
which is used to add support for init and fini kernels.

The HSA metadata streamer uses the function attributes "device-init"
and "device-fini" to distinguish init and fini kernels from regular
kernels, and emits metadata with ".kind" set to "init" and "fini"
respectively.

To reduce the number of init and fini kernels, the ctors and dtors
listed in LLVM's llvm.global_ctors and llvm.global_dtors are called
from a single init kernel and a single fini kernel respectively.

Reviewed by: yaxunl

Differential Revision: https://reviews.llvm.org/D105682

llvm/docs/AMDGPUUsage.rst
llvm/lib/Target/AMDGPU/AMDGPU.h
llvm/lib/Target/AMDGPU/AMDGPUCtorDtorLowering.cpp [new file with mode: 0644]
llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
llvm/lib/Target/AMDGPU/CMakeLists.txt
llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ctor-dtor-list.ll [new file with mode: 0644]
llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
llvm/test/CodeGen/AMDGPU/lower-ctor-dtor.ll [new file with mode: 0644]
llvm/test/CodeGen/AMDGPU/lower-multiple-ctor-dtor.ll [new file with mode: 0644]
llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn

index a1dc554..bfbc77c 100644 (file)
@@ -3142,6 +3142,37 @@ same *vendor-name*.
                                                                   a register allocator
                                                                   created spill
                                                                   location.
+     ".kind"                             string                   The kind of the kernel
+                                                                  with the following
+                                                                  values:
+
+                                                                  "normal"
+                                                                    Regular kernels.
+
+                                                                  "init"
+                                                                    These kernels must be
+                                                                    invoked after loading
+                                                                    the containing code
+                                                                    object and must
+                                                                    complete before any
+                                                                    normal and fini
+                                                                    kernels in the same
+                                                                    code object are
+                                                                    invoked.
+
+                                                                  "fini"
+                                                                    These kernels must be
+                                                                    invoked before
+                                                                    unloading the
+                                                                    containing code object
+                                                                    and after all init and
+                                                                    normal kernels in the
+                                                                    same code object have
+                                                                    been invoked and
+                                                                    completed.
+
+                                                                  If omitted, "normal" is
+                                                                  assumed.
      =================================== ============== ========= ================================
 
 ..
index ca088e6..3d0a618 100644 (file)
@@ -114,6 +114,10 @@ ModulePass *createAMDGPUFixFunctionBitcastsPass();
 void initializeAMDGPUFixFunctionBitcastsPass(PassRegistry &);
 extern char &AMDGPUFixFunctionBitcastsID;
 
+ModulePass *createAMDGPUCtorDtorLoweringPass();
+void initializeAMDGPUCtorDtorLoweringPass(PassRegistry &);
+extern char &AMDGPUCtorDtorLoweringID;
+
 FunctionPass *createAMDGPULowerKernelArgumentsPass();
 void initializeAMDGPULowerKernelArgumentsPass(PassRegistry &);
 extern char &AMDGPULowerKernelArgumentsID;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCtorDtorLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCtorDtorLowering.cpp
new file mode 100644 (file)
index 0000000..1111b12
--- /dev/null
@@ -0,0 +1,95 @@
+//===-- AMDGPUCtorDtorLowering.cpp - Handle global ctors and dtors --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
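+/// This pass creates a unified init kernel and a unified fini kernel that call
+/// the functions in llvm.global_ctors and llvm.global_dtors, and marks them
+/// with the "device-init"/"device-fini" attributes used to emit metadata.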
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-lower-ctor-dtor"
+
+namespace {
+class AMDGPUCtorDtorLowering final : public ModulePass {
+  bool runOnModule(Module &M) override;
+
+public:
+  Function *createInitOrFiniKernelFunction(Module &M, bool IsCtor) {
+    StringRef InitOrFiniKernelName = "amdgcn.device.init";
+    if (!IsCtor)
+      InitOrFiniKernelName = "amdgcn.device.fini";
+
+    Function *InitOrFiniKernel = Function::createWithDefaultAttr(
+        FunctionType::get(Type::getVoidTy(M.getContext()), false),
+        GlobalValue::InternalLinkage, 0, InitOrFiniKernelName, &M);
+    BasicBlock *InitOrFiniKernelBB =
+        BasicBlock::Create(M.getContext(), "", InitOrFiniKernel);
+    ReturnInst::Create(M.getContext(), InitOrFiniKernelBB);
+
+    InitOrFiniKernel->setCallingConv(CallingConv::AMDGPU_KERNEL);
+    if (IsCtor)
+      InitOrFiniKernel->addFnAttr("device-init");
+    else
+      InitOrFiniKernel->addFnAttr("device-fini");
+    return InitOrFiniKernel;
+  }
+
+  bool createInitOrFiniKernel(Module &M, GlobalVariable *GV, bool IsCtor) {
+    if (!GV)
+      return false;
+    ConstantArray *GA = cast<ConstantArray>(GV->getInitializer());
+    if (GA->getNumOperands() == 0)
+      return false;
+    Function *InitOrFiniKernel = createInitOrFiniKernelFunction(M, IsCtor);
+    IRBuilder<> IRB(InitOrFiniKernel->getEntryBlock().getTerminator());
+    for (Value *V : GA->operands()) {
+      auto *CS = cast<ConstantStruct>(V);
+      if (Function *F = dyn_cast<Function>(CS->getOperand(1))) {
+        FunctionCallee Ctor =
+            M.getOrInsertFunction(F->getName(), IRB.getVoidTy());
+        IRB.CreateCall(Ctor);
+      }
+    }
+    appendToUsed(M, {InitOrFiniKernel});
+    return true;
+  }
+
+  static char ID;
+  AMDGPUCtorDtorLowering() : ModulePass(ID) {}
+};
+} // End anonymous namespace
+
+// Storage for the pass ID; AMDGPUCtorDtorLoweringID (declared in AMDGPU.h)
+// exposes it outside this file.
+char AMDGPUCtorDtorLowering::ID = 0;
+char &llvm::AMDGPUCtorDtorLoweringID = AMDGPUCtorDtorLowering::ID;
+// Register the pass under the DEBUG_TYPE name, "amdgpu-lower-ctor-dtor".
+INITIALIZE_PASS(AMDGPUCtorDtorLowering, DEBUG_TYPE,
+                "Lower ctors and dtors for AMDGPU", false, false)
+
+/// Returns a newly allocated instance of the ctor/dtor lowering pass.
+ModulePass *llvm::createAMDGPUCtorDtorLoweringPass() {
+  return new AMDGPUCtorDtorLowering();
+}
+
+/// Lower the llvm.global_ctors and llvm.global_dtors lists of \p M.
+///
+/// \returns true if an init or a fini kernel was added to the module.
+bool AMDGPUCtorDtorLowering::runOnModule(Module &M) {
+  // Evaluate both lists as separate statements so neither is short-circuited.
+  const bool CreatedInit = createInitOrFiniKernel(
+      M, M.getGlobalVariable("llvm.global_ctors"), /*IsCtor =*/true);
+  const bool CreatedFini = createInitOrFiniKernel(
+      M, M.getGlobalVariable("llvm.global_dtors"), /*IsCtor =*/false);
+  return CreatedInit || CreatedFini;
+}
index 8eeda7b..a50093f 100644 (file)
@@ -665,6 +665,10 @@ void MetadataStreamerV3::emitKernelAttrs(const Function &Func,
         Func.getFnAttribute("runtime-handle").getValueAsString().str(),
         /*Copy=*/true);
   }
+  if (Func.hasFnAttribute("device-init"))
+    Kern[".kind"] = Kern.getDocument()->getNode("init");
+  else if (Func.hasFnAttribute("device-fini"))
+    Kern[".kind"] = Kern.getDocument()->getNode("fini");
 }
 
 void MetadataStreamerV3::emitKernelArgs(const Function &Func,
index ac25e2b..045e1d5 100644 (file)
@@ -349,6 +349,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeSIOptimizeVGPRLiveRangePass(*PR);
   initializeSILoadStoreOptimizerPass(*PR);
   initializeAMDGPUFixFunctionBitcastsPass(*PR);
+  initializeAMDGPUCtorDtorLoweringPass(*PR);
   initializeAMDGPUAlwaysInlinePass(*PR);
   initializeAMDGPUAttributorPass(*PR);
   initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
@@ -1014,6 +1015,7 @@ void AMDGPUPassConfig::addIRPasses() {
   disablePass(&PatchableFunctionID);
 
   addPass(createAMDGPUPrintfRuntimeBinding());
+  addPass(createAMDGPUCtorDtorLoweringPass());
 
   // This must occur before inlining, as the inliner will not look through
   // bitcast calls.
index fb2d1cd..78f4f8f 100644 (file)
@@ -53,6 +53,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUCodeGenPrepare.cpp
   AMDGPUExportClustering.cpp
   AMDGPUFixFunctionBitcasts.cpp
+  AMDGPUCtorDtorLowering.cpp
   AMDGPUFrameLowering.cpp
   AMDGPUHSAMetadataStreamer.cpp
   AMDGPUInstCombineIntrinsic.cpp
diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ctor-dtor-list.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ctor-dtor-list.ll
new file mode 100644 (file)
index 0000000..83ddad2
--- /dev/null
@@ -0,0 +1,39 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 --amdhsa-code-object-version=3 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 --amdhsa-code-object-version=3 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=3 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 --amdhsa-code-object-version=3 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 |   FileCheck --check-prefix=PARSER %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 --amdhsa-code-object-version=3 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 |   FileCheck --check-prefix=PARSER %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=3 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 |   FileCheck --check-prefix=PARSER %s
+
+; Two constructors: the emitted metadata is expected to describe one kernel
+; named amdgcn.device.init whose kind is init.
+@llvm.global_ctors = appending addrspace(1) global [2 x { i32, void ()*, i8*  }] [{ i32, void ()*, i8*  } { i32 1, void ()* @foo, i8* null  }, { i32, void ()*, i8*  } { i32 1, void ()* @foo.5, i8* null  }]
+
+define internal void @foo() {
+      ret void
+      
+}
+
+define internal void @foo.5() {
+      ret void
+      
+}
+
+; CHECK: ---
+; CHECK: .kind: init
+; CHECK: .name: amdgcn.device.init
+
+; Two destructors: the emitted metadata is expected to describe one kernel
+; named amdgcn.device.fini whose kind is fini.
+@llvm.global_dtors = appending addrspace(1) global [2 x { i32, void ()*, i8*  }] [{ i32, void ()*, i8*  } { i32 1, void ()* @bar, i8* null  }, { i32, void ()*, i8*  } { i32 1, void ()* @bar.5, i8* null  }]
+
+define internal void @bar() {
+      ret void
+      
+}
+
+define internal void @bar.5() {
+      ret void
+      
+}
+
+; CHECK: .kind: fini
+; CHECK: .name: amdgcn.device.fini
+
+; PARSER: AMDGPU HSA Metadata Parser Test: PASS
index 698d116..73909dc 100644 (file)
@@ -31,6 +31,7 @@
 ; GCN-O0-NEXT:    AMDGPU Printf lowering
 ; GCN-O0-NEXT:      FunctionPass Manager
 ; GCN-O0-NEXT:        Dominator Tree Construction
+; GCN-O0-NEXT:    Lower ctors and dtors for AMDGPU
 ; GCN-O0-NEXT:    Fix function bitcasts for AMDGPU
 ; GCN-O0-NEXT:    FunctionPass Manager
 ; GCN-O0-NEXT:      Early propagate attributes from kernels to functions
 ; GCN-O1-NEXT:    AMDGPU Printf lowering
 ; GCN-O1-NEXT:      FunctionPass Manager
 ; GCN-O1-NEXT:        Dominator Tree Construction
+; GCN-O1-NEXT:    Lower ctors and dtors for AMDGPU
 ; GCN-O1-NEXT:    Fix function bitcasts for AMDGPU
 ; GCN-O1-NEXT:    FunctionPass Manager
 ; GCN-O1-NEXT:      Early propagate attributes from kernels to functions
 ; GCN-O1-OPTS-NEXT:    AMDGPU Printf lowering
 ; GCN-O1-OPTS-NEXT:      FunctionPass Manager
 ; GCN-O1-OPTS-NEXT:        Dominator Tree Construction
+; GCN-O1-OPTS-NEXT:    Lower ctors and dtors for AMDGPU
 ; GCN-O1-OPTS-NEXT:    Fix function bitcasts for AMDGPU
 ; GCN-O1-OPTS-NEXT:    FunctionPass Manager
 ; GCN-O1-OPTS-NEXT:      Early propagate attributes from kernels to functions
 ; GCN-O2-NEXT:    AMDGPU Printf lowering
 ; GCN-O2-NEXT:      FunctionPass Manager
 ; GCN-O2-NEXT:        Dominator Tree Construction
+; GCN-O2-NEXT:    Lower ctors and dtors for AMDGPU
 ; GCN-O2-NEXT:    Fix function bitcasts for AMDGPU
 ; GCN-O2-NEXT:    FunctionPass Manager
 ; GCN-O2-NEXT:      Early propagate attributes from kernels to functions
 ; GCN-O3-NEXT:    AMDGPU Printf lowering
 ; GCN-O3-NEXT:      FunctionPass Manager
 ; GCN-O3-NEXT:        Dominator Tree Construction
+; GCN-O3-NEXT:    Lower ctors and dtors for AMDGPU
 ; GCN-O3-NEXT:    Fix function bitcasts for AMDGPU
 ; GCN-O3-NEXT:    FunctionPass Manager
 ; GCN-O3-NEXT:      Early propagate attributes from kernels to functions
diff --git a/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor.ll b/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor.ll
new file mode 100644 (file)
index 0000000..1a24707
--- /dev/null
@@ -0,0 +1,21 @@
+; RUN: opt -S -mtriple=amdgcn--  -amdgpu-lower-ctor-dtor < %s | FileCheck %s
+
+; One constructor and one destructor: each is expected to be called from its
+; own generated kernel, tagged "device-init" and "device-fini" respectively.
+
+@llvm.global_ctors = appending addrspace(1) global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 1, void ()* @foo, i8* null }]
+@llvm.global_dtors = appending addrspace(1) global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 1, void ()* @bar, i8* null }]
+
+; CHECK-LABEL: amdgpu_kernel void @amdgcn.device.init() #0
+; CHECK-NEXT: call void @foo
+
+; CHECK-LABEL: amdgpu_kernel void @amdgcn.device.fini() #1
+; CHECK-NEXT: call void @bar
+
+define internal void @foo() {
+  ret void
+}
+
+define internal void @bar() {
+  ret void
+}
+
+; CHECK: attributes #0 = { "device-init" }
+; CHECK: attributes #1 = { "device-fini" } 
diff --git a/llvm/test/CodeGen/AMDGPU/lower-multiple-ctor-dtor.ll b/llvm/test/CodeGen/AMDGPU/lower-multiple-ctor-dtor.ll
new file mode 100644 (file)
index 0000000..e23ea23
--- /dev/null
@@ -0,0 +1,31 @@
+; RUN: opt -S -mtriple=amdgcn--  -amdgpu-lower-ctor-dtor < %s | FileCheck %s
+
+; Two constructors and two destructors: all entries of a list are expected to
+; be called, in list order, from a single generated kernel per list.
+
+@llvm.global_ctors = appending addrspace(1) global [2 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 1, void ()* @foo, i8* null }, { i32, void ()*, i8* } { i32 1, void ()* @foo.5, i8* null }]
+@llvm.global_dtors = appending addrspace(1) global [2 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 1, void ()* @bar, i8* null }, { i32, void ()*, i8* } { i32 1, void ()* @bar.5, i8* null }]
+
+; CHECK-LABEL: amdgpu_kernel void @amdgcn.device.init() #0
+; CHECK-NEXT: call void @foo
+; CHECK-NEXT: call void @foo.5
+
+; CHECK-LABEL: amdgpu_kernel void @amdgcn.device.fini() #1
+; CHECK-NEXT: call void @bar
+; CHECK-NEXT: call void @bar.5
+
+define internal void @foo() {
+  ret void
+}
+
+define internal void @bar() {
+  ret void
+}
+
+define internal void @foo.5() {
+  ret void
+}
+
+define internal void @bar.5() {
+  ret void
+}
+
+; CHECK: attributes #0 = { "device-init" }
+; CHECK: attributes #1 = { "device-fini" } 
index 9a831ba..ed88d2d 100644 (file)
@@ -133,6 +133,7 @@ static_library("LLVMAMDGPUCodeGen") {
     "AMDGPUCodeGenPrepare.cpp",
     "AMDGPUExportClustering.cpp",
     "AMDGPUFixFunctionBitcasts.cpp",
+    "AMDGPUCtorDtorLowering.cpp",
     "AMDGPUFrameLowering.cpp",
     "AMDGPUGlobalISelUtils.cpp",
     "AMDGPUHSAMetadataStreamer.cpp",