From 47554a0c73ee703362d913d3a149f104819c8365 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 23 Dec 2022 15:52:41 -0500 Subject: [PATCH] AMDGPU: Use more accurate IR type for block handle The device library uses this as a struct with a pointer-sized integer and 2 ints. --- .../Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp | 16 +++++++++++++--- llvm/test/CodeGen/AMDGPU/enqueue-kernel.ll | 10 +++++----- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp index 971be57..f242cbc 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp @@ -70,6 +70,10 @@ ModulePass* llvm::createAMDGPUOpenCLEnqueuedBlockLoweringPass() { bool AMDGPUOpenCLEnqueuedBlockLowering::runOnModule(Module &M) { auto &C = M.getContext(); bool Changed = false; + + // ptr kernel_object, i32 private_segment_size, i32 group_segment_size + StructType *HandleTy = nullptr; + for (auto &F : M.functions()) { if (F.hasFnAttribute("enqueued-block")) { if (!F.hasName()) { @@ -80,11 +84,17 @@ bool AMDGPUOpenCLEnqueuedBlockLowering::runOnModule(Module &M) { } LLVM_DEBUG(dbgs() << "found enqueued kernel: " << F.getName() << '\n'); auto RuntimeHandle = (F.getName() + ".runtime_handle").str(); - auto T = ArrayType::get(Type::getInt64Ty(C), 2); + if (!HandleTy) { + Type *Int32 = Type::getInt32Ty(C); + HandleTy = StructType::create( + C, {Type::getInt8Ty(C)->getPointerTo(0), Int32, Int32}, + "block.runtime.handle.t"); + } + auto *GV = new GlobalVariable( - M, T, + M, HandleTy, /*isConstant=*/false, GlobalValue::ExternalLinkage, - /*Initializer=*/Constant::getNullValue(T), RuntimeHandle, + /*Initializer=*/Constant::getNullValue(HandleTy), RuntimeHandle, /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::GLOBAL_ADDRESS, /*isExternallyInitialized=*/false); diff --git 
a/llvm/test/CodeGen/AMDGPU/enqueue-kernel.ll b/llvm/test/CodeGen/AMDGPU/enqueue-kernel.ll index 13ed6959..c454928 100644 --- a/llvm/test/CodeGen/AMDGPU/enqueue-kernel.ll +++ b/llvm/test/CodeGen/AMDGPU/enqueue-kernel.ll @@ -109,11 +109,11 @@ define internal amdgpu_kernel void @1(<{ i32, i32, ptr addrspace(1), i8 }> %arg) attributes #0 = { "enqueued-block" } ;. ; CHECK: @[[KERNEL_ADDRESS_USER:[a-zA-Z0-9_$"\\.-]+]] = global [1 x ptr] [ptr addrspacecast (ptr addrspace(1) @block_has_used_kernel_address.runtime_handle to ptr)] -; CHECK: @[[__TEST_BLOCK_INVOKE_KERNEL_RUNTIME_HANDLE:[a-zA-Z0-9_$"\\.-]+]] = addrspace(1) global [2 x i64] zeroinitializer -; CHECK: @[[__TEST_BLOCK_INVOKE_2_KERNEL_RUNTIME_HANDLE:[a-zA-Z0-9_$"\\.-]+]] = addrspace(1) global [2 x i64] zeroinitializer -; CHECK: @[[BLOCK_HAS_USED_KERNEL_ADDRESS_RUNTIME_HANDLE:[a-zA-Z0-9_$"\\.-]+]] = addrspace(1) global [2 x i64] zeroinitializer -; CHECK: @[[__AMDGPU_ENQUEUED_KERNEL_RUNTIME_HANDLE:[a-zA-Z0-9_$"\\.-]+]] = addrspace(1) global [2 x i64] zeroinitializer -; CHECK: @[[__AMDGPU_ENQUEUED_KERNEL_1_RUNTIME_HANDLE:[a-zA-Z0-9_$"\\.-]+]] = addrspace(1) global [2 x i64] zeroinitializer +; CHECK: @[[__TEST_BLOCK_INVOKE_KERNEL_RUNTIME_HANDLE:[a-zA-Z0-9_$"\\.-]+]] = addrspace(1) global [[BLOCK_RUNTIME_HANDLE_T:%.*]] zeroinitializer +; CHECK: @[[__TEST_BLOCK_INVOKE_2_KERNEL_RUNTIME_HANDLE:[a-zA-Z0-9_$"\\.-]+]] = addrspace(1) global [[BLOCK_RUNTIME_HANDLE_T:%.*]] zeroinitializer +; CHECK: @[[BLOCK_HAS_USED_KERNEL_ADDRESS_RUNTIME_HANDLE:[a-zA-Z0-9_$"\\.-]+]] = addrspace(1) global [[BLOCK_RUNTIME_HANDLE_T:%.*]] zeroinitializer +; CHECK: @[[__AMDGPU_ENQUEUED_KERNEL_RUNTIME_HANDLE:[a-zA-Z0-9_$"\\.-]+]] = addrspace(1) global [[BLOCK_RUNTIME_HANDLE_T:%.*]] zeroinitializer +; CHECK: @[[__AMDGPU_ENQUEUED_KERNEL_1_RUNTIME_HANDLE:[a-zA-Z0-9_$"\\.-]+]] = addrspace(1) global [[BLOCK_RUNTIME_HANDLE_T:%.*]] zeroinitializer ;. 
; CHECK-LABEL: define {{[^@]+}}@non_caller ; CHECK-SAME: (ptr addrspace(1) [[A:%.*]], i8 [[B:%.*]], ptr addrspace(1) [[C:%.*]], i64 [[D:%.*]]) { -- 2.7.4