From 5cf2a37f1255700d4da9d5f45e82bdfff09aee8c Mon Sep 17 00:00:00 2001 From: "Yaxun (Sam) Liu" Date: Mon, 8 Feb 2021 12:29:29 -0500 Subject: [PATCH] [HIP] Emit kernel symbol Currently clang uses stub function to launch kernel. This is inconvenient to interop with C++ programs since the stub function has different name as kernel, which is required by ROCm debugger. This patch emits a variable symbol which has the same name as the kernel and uses it to register and launch the kernel. This allows C++ program to launch a kernel by using the original kernel name. Reviewed by: Artem Belevich Differential Revision: https://reviews.llvm.org/D86376 --- clang/lib/CodeGen/CGCUDANV.cpp | 53 +++++++++++++++-- clang/lib/CodeGen/CGCUDARuntime.h | 8 +++ clang/lib/CodeGen/CGExpr.cpp | 22 ++++++- clang/lib/CodeGen/CodeGenModule.cpp | 16 +++++- clang/test/CodeGenCUDA/Inputs/cuda.h | 12 +++- clang/test/CodeGenCUDA/cxx-call-kernel.cpp | 19 ++++++ clang/test/CodeGenCUDA/kernel-dbg-info.cu | 5 +- clang/test/CodeGenCUDA/kernel-stub-name.cu | 92 +++++++++++++++++++++++++++--- clang/test/CodeGenCUDA/unnamed-types.cu | 4 +- 9 files changed, 208 insertions(+), 23 deletions(-) create mode 100644 clang/test/CodeGenCUDA/cxx-call-kernel.cpp diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp index 57e3b15..3a311ab 100644 --- a/clang/lib/CodeGen/CGCUDANV.cpp +++ b/clang/lib/CodeGen/CGCUDANV.cpp @@ -42,12 +42,18 @@ private: llvm::LLVMContext &Context; /// Convenience reference to the current module llvm::Module &TheModule; - /// Keeps track of kernel launch stubs emitted in this module + /// Keeps track of kernel launch stubs and handles emitted in this module struct KernelInfo { - llvm::Function *Kernel; + llvm::Function *Kernel; // stub function to help launch kernel const Decl *D; }; llvm::SmallVector EmittedKernels; + // Map a device stub function to a symbol for identifying kernel in host code. + // For CUDA, the symbol for identifying the kernel is the same as the device + // stub function. For HIP, they are different. + llvm::DenseMap KernelHandles; + // Map a kernel handle to the kernel stub. + llvm::DenseMap KernelStubs; struct VarInfo { llvm::GlobalVariable *Var; const VarDecl *D; @@ -154,6 +160,12 @@ private: public: CGNVCUDARuntime(CodeGenModule &CGM); + llvm::GlobalValue *getKernelHandle(llvm::Function *F, GlobalDecl GD) override; + llvm::Function *getKernelStub(llvm::GlobalValue *Handle) override { + auto Loc = KernelStubs.find(Handle); + assert(Loc != KernelStubs.end()); + return Loc->second; + } void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override; void handleVarRegistration(const VarDecl *VD, llvm::GlobalVariable &Var) override; @@ -272,6 +284,10 @@ std::string CGNVCUDARuntime::getDeviceSideName(const NamedDecl *ND) { void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) { EmittedKernels.push_back({CGF.CurFn, CGF.CurFuncDecl}); + if (auto *GV = dyn_cast(KernelHandles[CGF.CurFn])) { + GV->setLinkage(CGF.CurFn->getLinkage()); + GV->setInitializer(CGF.CurFn); + } if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(), CudaFeature::CUDA_USES_NEW_LAUNCH) || (CGF.getLangOpts().HIP && CGF.getLangOpts().HIPUseNewLaunchAPI)) @@ -350,7 +366,8 @@ void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF, ShmemSize.getPointer(), Stream.getPointer()}); // Emit the call to cudaLaunch - llvm::Value *Kernel = CGF.Builder.CreatePointerCast(CGF.CurFn, VoidPtrTy); + llvm::Value *Kernel = + CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn], VoidPtrTy); CallArgList LaunchKernelArgs; LaunchKernelArgs.add(RValue::get(Kernel), cudaLaunchKernelFD->getParamDecl(0)->getType()); @@ -405,7 +422,8 @@ void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF, // Emit the call to cudaLaunch llvm::FunctionCallee cudaLaunchFn = getLaunchFn(); - llvm::Value *Arg = CGF.Builder.CreatePointerCast(CGF.CurFn, CharPtrTy); + llvm::Value *Arg = + CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn], CharPtrTy); CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg); CGF.EmitBranch(EndBlock); @@ -499,7 +517,7 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() { llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy); llvm::Value *Args[] = { &GpuBinaryHandlePtr, - Builder.CreateBitCast(I.Kernel, VoidPtrTy), + Builder.CreateBitCast(KernelHandles[I.Kernel], VoidPtrTy), KernelName, KernelName, llvm::ConstantInt::get(IntTy, -1), @@ -1070,3 +1088,28 @@ llvm::Function *CGNVCUDARuntime::finalizeModule() { } return makeModuleCtorFunction(); } + +llvm::GlobalValue *CGNVCUDARuntime::getKernelHandle(llvm::Function *F, + GlobalDecl GD) { + auto Loc = KernelHandles.find(F); + if (Loc != KernelHandles.end()) + return Loc->second; + + if (!CGM.getLangOpts().HIP) { + KernelHandles[F] = F; + KernelStubs[F] = F; + return F; + } + + auto *Var = new llvm::GlobalVariable( + TheModule, F->getType(), /*isConstant=*/true, F->getLinkage(), + /*Initializer=*/nullptr, + CGM.getMangledName( + GD.getWithKernelReferenceKind(KernelReferenceKind::Kernel))); + Var->setAlignment(CGM.getPointerAlign().getAsAlign()); + Var->setDSOLocal(F->isDSOLocal()); + Var->setVisibility(F->getVisibility()); + KernelHandles[F] = Var; + KernelStubs[Var] = F; + return Var; +} diff --git a/clang/lib/CodeGen/CGCUDARuntime.h b/clang/lib/CodeGen/CGCUDARuntime.h index 2f4b7ab..1c119dc 100644 --- a/clang/lib/CodeGen/CGCUDARuntime.h +++ b/clang/lib/CodeGen/CGCUDARuntime.h @@ -15,6 +15,7 @@ #ifndef LLVM_CLANG_LIB_CODEGEN_CGCUDARUNTIME_H #define LLVM_CLANG_LIB_CODEGEN_CGCUDARUNTIME_H +#include "clang/AST/GlobalDecl.h" #include "llvm/ADT/StringRef.h" #include "llvm/IR/GlobalValue.h" @@ -94,6 +95,13 @@ public: /// compilation is for host. virtual std::string getDeviceSideName(const NamedDecl *ND) = 0; + /// Get kernel handle by stub function. + virtual llvm::GlobalValue *getKernelHandle(llvm::Function *Stub, + GlobalDecl GD) = 0; + + /// Get kernel stub by kernel handle. + virtual llvm::Function *getKernelStub(llvm::GlobalValue *Handle) = 0; + /// Adjust linkage of shadow variables in host compilation. virtual void internalizeDeviceSideVar(const VarDecl *D, diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp index 05c553f..d57dd7f 100644 --- a/clang/lib/CodeGen/CGExpr.cpp +++ b/clang/lib/CodeGen/CGExpr.cpp @@ -10,6 +10,7 @@ // //===----------------------------------------------------------------------===// +#include "CGCUDARuntime.h" #include "CGCXXABI.h" #include "CGCall.h" #include "CGCleanup.h" @@ -4871,8 +4872,12 @@ static CGCallee EmitDirectCallee(CodeGenFunction &CGF, GlobalDecl GD) { return CGCallee::forBuiltin(builtinID, FD); } - llvm::Constant *calleePtr = EmitFunctionDeclPointer(CGF.CGM, GD); - return CGCallee::forDirect(calleePtr, GD); + llvm::Constant *CalleePtr = EmitFunctionDeclPointer(CGF.CGM, GD); + if (CGF.CGM.getLangOpts().CUDA && !CGF.CGM.getLangOpts().CUDAIsDevice && + FD->hasAttr()) + CalleePtr = CGF.CGM.getCUDARuntime().getKernelStub( + cast(CalleePtr->stripPointerCasts())); + return CGCallee::forDirect(CalleePtr, GD); } CGCallee CodeGenFunction::EmitCallee(const Expr *E) { @@ -5266,6 +5271,19 @@ RValue CodeGenFunction::EmitCall(QualType CalleeType, const CGCallee &OrigCallee Callee.setFunctionPointer(CalleePtr); } + // HIP function pointer contains kernel handle when it is used in triple + // chevron. The kernel stub needs to be loaded from kernel handle and used + // as callee. + if (CGM.getLangOpts().HIP && !CGM.getLangOpts().CUDAIsDevice && + isa(E) && + (!TargetDecl || !isa(TargetDecl))) { + llvm::Value *Handle = Callee.getFunctionPointer(); + Handle->dump(); + auto *Cast = + Builder.CreateBitCast(Handle, Handle->getType()->getPointerTo()); + auto *Stub = Builder.CreateLoad(Address(Cast, CGM.getPointerAlign())); + Callee.setFunctionPointer(Stub); + } llvm::CallBase *CallOrInvoke = nullptr; RValue Call = EmitCall(FnInfo, Callee, ReturnValue, Args, &CallOrInvoke, E->getExprLoc()); diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index 0d499a5..750439d 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -3571,9 +3571,19 @@ llvm::Constant *CodeGenModule::GetAddrOfFunction(GlobalDecl GD, } StringRef MangledName = getMangledName(GD); - return GetOrCreateLLVMFunction(MangledName, Ty, GD, ForVTable, DontDefer, - /*IsThunk=*/false, llvm::AttributeList(), - IsForDefinition); + auto *F = GetOrCreateLLVMFunction(MangledName, Ty, GD, ForVTable, DontDefer, + /*IsThunk=*/false, llvm::AttributeList(), + IsForDefinition); + // Returns kernel handle for HIP kernel stub function. + if (LangOpts.CUDA && !LangOpts.CUDAIsDevice && + cast(GD.getDecl())->hasAttr()) { + auto *Handle = getCUDARuntime().getKernelHandle( + cast(F->stripPointerCasts()), GD); + if (IsForDefinition) + return F; + return llvm::ConstantExpr::getBitCast(Handle, Ty->getPointerTo()); + } + return F; } static const FunctionDecl * diff --git a/clang/test/CodeGenCUDA/Inputs/cuda.h b/clang/test/CodeGenCUDA/Inputs/cuda.h index daa6328..af395b3 100644 --- a/clang/test/CodeGenCUDA/Inputs/cuda.h +++ b/clang/test/CodeGenCUDA/Inputs/cuda.h @@ -2,6 +2,7 @@ #include +#if __HIP__ || __CUDA__ #define __constant__ __attribute__((constant)) #define __device__ __attribute__((device)) #define __global__ __attribute__((global)) @@ -11,13 +12,22 @@ #define __managed__ __attribute__((managed)) #endif #define __launch_bounds__(...) __attribute__((launch_bounds(__VA_ARGS__))) +#else +#define __constant__ +#define __device__ +#define __global__ +#define __host__ +#define __shared__ +#define __managed__ +#define __launch_bounds__(...) +#endif struct dim3 { unsigned x, y, z; __host__ __device__ dim3(unsigned x, unsigned y = 1, unsigned z = 1) : x(x), y(y), z(z) {} }; -#ifdef __HIP__ +#if __HIP__ || HIP_PLATFORM typedef struct hipStream *hipStream_t; typedef enum hipError {} hipError_t; int hipConfigureCall(dim3 gridSize, dim3 blockSize, size_t sharedSize = 0, diff --git a/clang/test/CodeGenCUDA/cxx-call-kernel.cpp b/clang/test/CodeGenCUDA/cxx-call-kernel.cpp new file mode 100644 index 0000000..ae58dcd --- /dev/null +++ b/clang/test/CodeGenCUDA/cxx-call-kernel.cpp @@ -0,0 +1,19 @@ +// RUN: %clang_cc1 -x hip -emit-llvm-bc %s -o %t.hip.bc +// RUN: %clang_cc1 -mlink-bitcode-file %t.hip.bc -DHIP_PLATFORM -emit-llvm \ +// RUN: %s -o - | FileCheck %s + +#include "Inputs/cuda.h" + +// CHECK: @_Z2g1i = constant void (i32)* @_Z17__device_stub__g1i, align 8 +#if __HIP__ +__global__ void g1(int x) {} +#else +extern void g1(int x); + +// CHECK: call i32 @hipLaunchKernel{{.*}}@_Z2g1i +void test() { + hipLaunchKernel((void*)g1, 1, 1, nullptr, 0, 0); +} + +// CHECK: __hipRegisterFunction{{.*}}@_Z2g1i +#endif diff --git a/clang/test/CodeGenCUDA/kernel-dbg-info.cu b/clang/test/CodeGenCUDA/kernel-dbg-info.cu index 4aa1f35..7e8522a 100644 --- a/clang/test/CodeGenCUDA/kernel-dbg-info.cu +++ b/clang/test/CodeGenCUDA/kernel-dbg-info.cu @@ -30,6 +30,9 @@ extern "C" __global__ void ckernel(int *a) { *a = 1; } +// Kernel symbol for launching kernel. +// CHECK: @[[SYM:ckernel]] = constant void (i32*)* @__device_stub__ckernel, align 8 + // Device side kernel names // CHECK: @[[CKERN:[0-9]*]] = {{.*}} c"ckernel\00" @@ -40,7 +43,7 @@ extern "C" __global__ void ckernel(int *a) { // Make sure there is no !dbg between function attributes and '{' // CHECK: define{{.*}} void @[[CSTUB:__device_stub__ckernel]]{{.*}} #{{[0-9]+}} { // CHECK-NOT: call {{.*}}@hipLaunchByPtr{{.*}}!dbg -// CHECK: call {{.*}}@hipLaunchByPtr{{.*}}@[[CSTUB]] +// CHECK: call {{.*}}@hipLaunchByPtr{{.*}}@[[SYM]] // CHECK-NOT: ret {{.*}}!dbg // CHECK-LABEL: define {{.*}}@_Z8hostfuncPi{{.*}}!dbg diff --git a/clang/test/CodeGenCUDA/kernel-stub-name.cu b/clang/test/CodeGenCUDA/kernel-stub-name.cu index b66a7ba..0c504b6 100644 --- a/clang/test/CodeGenCUDA/kernel-stub-name.cu +++ b/clang/test/CodeGenCUDA/kernel-stub-name.cu @@ -2,10 +2,17 @@ // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \ // RUN: -fcuda-include-gpubinary %t -o - -x hip\ -// RUN: | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=CHECK +// RUN: | FileCheck %s #include "Inputs/cuda.h" +// Kernel handles + +// CHECK: @[[HCKERN:ckernel]] = constant void ()* @__device_stub__ckernel, align 8 +// CHECK: @[[HNSKERN:_ZN2ns8nskernelEv]] = constant void ()* @_ZN2ns23__device_stub__nskernelEv, align 8 +// CHECK: @[[HTKERN:_Z10kernelfuncIiEvv]] = linkonce_odr constant void ()* @_Z25__device_stub__kernelfuncIiEvv, align 8 +// CHECK: @[[HDKERN:_Z11kernel_declv]] = external constant void ()*, align 8 + extern "C" __global__ void ckernel() {} namespace ns { @@ -17,6 +24,11 @@ __global__ void kernelfunc() {} __global__ void kernel_decl(); +void (*kernel_ptr)(); +void *void_ptr; + +void launch(void *kern); + // Device side kernel names // CHECK: @[[CKERN:[0-9]*]] = {{.*}} c"ckernel\00" @@ -26,16 +38,20 @@ __global__ void kernel_decl(); // Non-template kernel stub functions // CHECK: define{{.*}}@[[CSTUB:__device_stub__ckernel]] -// CHECK: call{{.*}}@hipLaunchByPtr{{.*}}@[[CSTUB]] +// CHECK: call{{.*}}@hipLaunchByPtr{{.*}}@[[HCKERN]] // CHECK: define{{.*}}@[[NSSTUB:_ZN2ns23__device_stub__nskernelEv]] -// CHECK: call{{.*}}@hipLaunchByPtr{{.*}}@[[NSSTUB]] +// CHECK: call{{.*}}@hipLaunchByPtr{{.*}}@[[HNSKERN]] + -// CHECK-LABEL: define{{.*}}@_Z8hostfuncv() +// Check kernel stub is used for triple chevron + +// CHECK-LABEL: define{{.*}}@_Z4fun1v() // CHECK: call void @[[CSTUB]]() // CHECK: call void @[[NSSTUB]]() // CHECK: call void @[[TSTUB:_Z25__device_stub__kernelfuncIiEvv]]() // CHECK: call void @[[DSTUB:_Z26__device_stub__kernel_declv]]() -void hostfunc(void) { + +void fun1(void) { ckernel<<<1, 1>>>(); ns::nskernel<<<1, 1>>>(); kernelfunc<<<1, 1>>>(); @@ -45,11 +61,69 @@ void hostfunc(void) { // Template kernel stub functions // CHECK: define{{.*}}@[[TSTUB]] -// CHECK: call{{.*}}@hipLaunchByPtr{{.*}}@[[TSTUB]] +// CHECK: call{{.*}}@hipLaunchByPtr{{.*}}@[[HTKERN]] + +// Check declaration of stub function for external kernel. // CHECK: declare{{.*}}@[[DSTUB]] +// Check kernel handle is used for passing the kernel as a function pointer + +// CHECK-LABEL: define{{.*}}@_Z4fun2v() +// CHECK: call void @_Z6launchPv({{.*}}[[HCKERN]] +// CHECK: call void @_Z6launchPv({{.*}}[[HNSKERN]] +// CHECK: call void @_Z6launchPv({{.*}}[[HTKERN]] +// CHECK: call void @_Z6launchPv({{.*}}[[HDKERN]] +void fun2() { + launch((void *)ckernel); + launch((void *)ns::nskernel); + launch((void *)kernelfunc); + launch((void *)kernel_decl); +} + +// Check kernel handle is used for assigning a kernel to a function pointer + +// CHECK-LABEL: define{{.*}}@_Z4fun3v() +// CHECK: store void ()* bitcast (void ()** @[[HCKERN]] to void ()*), void ()** @kernel_ptr, align 8 +// CHECK: store void ()* bitcast (void ()** @[[HCKERN]] to void ()*), void ()** @kernel_ptr, align 8 +// CHECK: store i8* bitcast (void ()** @[[HCKERN]] to i8*), i8** @void_ptr, align 8 +// CHECK: store i8* bitcast (void ()** @[[HCKERN]] to i8*), i8** @void_ptr, align 8 +void fun3() { + kernel_ptr = ckernel; + kernel_ptr = &ckernel; + void_ptr = (void *)ckernel; + void_ptr = (void *)&ckernel; +} + +// Check kernel stub is loaded from kernel handle when function pointer is +// used with triple chevron + +// CHECK-LABEL: define{{.*}}@_Z4fun4v() +// CHECK: store void ()* bitcast (void ()** @[[HCKERN]] to void ()*), void ()** @kernel_ptr +// CHECK: call i32 @_Z16hipConfigureCall4dim3S_mP9hipStream +// CHECK: %[[HANDLE:.*]] = load void ()*, void ()** @kernel_ptr, align 8 +// CHECK: %[[CAST:.*]] = bitcast void ()* %[[HANDLE]] to void ()** +// CHECK: %[[STUB:.*]] = load void ()*, void ()** %[[CAST]], align 8 +// CHECK: call void %[[STUB]]() +void fun4() { + kernel_ptr = ckernel; + kernel_ptr<<<1,1>>>(); +} + +// Check kernel handle is passed to a function + +// CHECK-LABEL: define{{.*}}@_Z4fun5v() +// CHECK: store void ()* bitcast (void ()** @[[HCKERN]] to void ()*), void ()** @kernel_ptr +// CHECK: %[[HANDLE:.*]] = load void ()*, void ()** @kernel_ptr, align 8 +// CHECK: %[[CAST:.*]] = bitcast void ()* %[[HANDLE]] to i8* +// CHECK: call void @_Z6launchPv(i8* %[[CAST]]) +void fun5() { + kernel_ptr = ckernel; + launch((void *)kernel_ptr); +} + // CHECK-LABEL: define{{.*}}@__hip_register_globals -// CHECK: call{{.*}}@__hipRegisterFunction{{.*}}@[[CSTUB]]{{.*}}@[[CKERN]] -// CHECK: call{{.*}}@__hipRegisterFunction{{.*}}@[[NSSTUB]]{{.*}}@[[NSKERN]] -// CHECK: call{{.*}}@__hipRegisterFunction{{.*}}@[[TSTUB]]{{.*}}@[[TKERN]] +// CHECK: call{{.*}}@__hipRegisterFunction{{.*}}@[[HCKERN]]{{.*}}@[[CKERN]] +// CHECK: call{{.*}}@__hipRegisterFunction{{.*}}@[[HNSKERN]]{{.*}}@[[NSKERN]] +// CHECK: call{{.*}}@__hipRegisterFunction{{.*}}@[[HTKERN]]{{.*}}@[[TKERN]] +// CHECK-NOT: call{{.*}}@__hipRegisterFunction{{.*}}@[[HDKERN]]{{.*}}@[[DKERN]] diff --git a/clang/test/CodeGenCUDA/unnamed-types.cu b/clang/test/CodeGenCUDA/unnamed-types.cu index f598117..b59d5f4 100644 --- a/clang/test/CodeGenCUDA/unnamed-types.cu +++ b/clang/test/CodeGenCUDA/unnamed-types.cu @@ -54,7 +54,7 @@ void f1(float *p) { [] __device__ (float x) { return x + 5.f; }); } // HOST: @__hip_register_globals -// HOST: __hipRegisterFunction{{.*}}@_Z17__device_stub__k0IZZ2f1PfENKUlS0_E_clES0_EUlfE_EvS0_T_{{.*}}@0 -// HOST: __hipRegisterFunction{{.*}}@_Z17__device_stub__k1IZ2f1PfEUlfE_Z2f1S0_EUlffE_Z2f1S0_EUlfE0_EvS0_T_T0_T1_{{.*}}@1 +// HOST: __hipRegisterFunction{{.*}}@_Z2k0IZZ2f1PfENKUlS0_E_clES0_EUlfE_EvS0_T_{{.*}}@0 +// HOST: __hipRegisterFunction{{.*}}@_Z2k1IZ2f1PfEUlfE_Z2f1S0_EUlffE_Z2f1S0_EUlfE0_EvS0_T_T0_T1_{{.*}}@1 // MSVC: __hipRegisterFunction{{.*}}@"??$k0@V@?0???R1?0??f1@@YAXPEAM@Z@QEBA@0@Z@@@YAXPEAMV@?0???R0?0??f1@@YAX0@Z@QEBA@0@Z@@Z{{.*}}@0 // MSVC: __hipRegisterFunction{{.*}}@"??$k1@V@?0??f1@@YAXPEAM@Z@V@?0??2@YAX0@Z@V@?0??2@YAX0@Z@@@YAXPEAMV@?0??f1@@YAX0@Z@V@?0??1@YAX0@Z@V@?0??1@YAX0@Z@@Z{{.*}}@1 -- 2.7.4