AMDGPU: Use more accurate IR type for block handle
author Matt Arsenault <Matthew.Arsenault@amd.com>
Fri, 23 Dec 2022 20:52:41 +0000 (15:52 -0500)
committer Matt Arsenault <Matthew.Arsenault@amd.com>
Sat, 7 Jan 2023 02:23:28 +0000 (21:23 -0500)
The device library treats the block runtime handle as a struct containing
a pointer-sized integer followed by two 32-bit integers.

llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp
llvm/test/CodeGen/AMDGPU/enqueue-kernel.ll

index 971be57..f242cbc 100644 (file)
@@ -70,6 +70,10 @@ ModulePass* llvm::createAMDGPUOpenCLEnqueuedBlockLoweringPass() {
 bool AMDGPUOpenCLEnqueuedBlockLowering::runOnModule(Module &M) {
   auto &C = M.getContext();
   bool Changed = false;
+
+  // ptr kernel_object, i32 private_segment_size, i32 group_segment_size
+  StructType *HandleTy = nullptr;
+
   for (auto &F : M.functions()) {
     if (F.hasFnAttribute("enqueued-block")) {
       if (!F.hasName()) {
@@ -80,11 +84,17 @@ bool AMDGPUOpenCLEnqueuedBlockLowering::runOnModule(Module &M) {
       }
       LLVM_DEBUG(dbgs() << "found enqueued kernel: " << F.getName() << '\n');
       auto RuntimeHandle = (F.getName() + ".runtime_handle").str();
-      auto T = ArrayType::get(Type::getInt64Ty(C), 2);
+      if (!HandleTy) {
+        Type *Int32 = Type::getInt32Ty(C);
+        HandleTy = StructType::create(
+            C, {Type::getInt8Ty(C)->getPointerTo(0), Int32, Int32},
+            "block.runtime.handle.t");
+      }
+
       auto *GV = new GlobalVariable(
-          M, T,
+          M, HandleTy,
           /*isConstant=*/false, GlobalValue::ExternalLinkage,
-          /*Initializer=*/Constant::getNullValue(T), RuntimeHandle,
+          /*Initializer=*/Constant::getNullValue(HandleTy), RuntimeHandle,
           /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
           AMDGPUAS::GLOBAL_ADDRESS,
           /*isExternallyInitialized=*/false);
index 13ed695..c454928 100644 (file)
@@ -109,11 +109,11 @@ define internal amdgpu_kernel void @1(<{ i32, i32, ptr addrspace(1), i8 }> %arg)
 attributes #0 = { "enqueued-block" }
 ;.
 ; CHECK: @[[KERNEL_ADDRESS_USER:[a-zA-Z0-9_$"\\.-]+]] = global [1 x ptr] [ptr addrspacecast (ptr addrspace(1) @block_has_used_kernel_address.runtime_handle to ptr)]
-; CHECK: @[[__TEST_BLOCK_INVOKE_KERNEL_RUNTIME_HANDLE:[a-zA-Z0-9_$"\\.-]+]] = addrspace(1) global [2 x i64] zeroinitializer
-; CHECK: @[[__TEST_BLOCK_INVOKE_2_KERNEL_RUNTIME_HANDLE:[a-zA-Z0-9_$"\\.-]+]] = addrspace(1) global [2 x i64] zeroinitializer
-; CHECK: @[[BLOCK_HAS_USED_KERNEL_ADDRESS_RUNTIME_HANDLE:[a-zA-Z0-9_$"\\.-]+]] = addrspace(1) global [2 x i64] zeroinitializer
-; CHECK: @[[__AMDGPU_ENQUEUED_KERNEL_RUNTIME_HANDLE:[a-zA-Z0-9_$"\\.-]+]] = addrspace(1) global [2 x i64] zeroinitializer
-; CHECK: @[[__AMDGPU_ENQUEUED_KERNEL_1_RUNTIME_HANDLE:[a-zA-Z0-9_$"\\.-]+]] = addrspace(1) global [2 x i64] zeroinitializer
+; CHECK: @[[__TEST_BLOCK_INVOKE_KERNEL_RUNTIME_HANDLE:[a-zA-Z0-9_$"\\.-]+]] = addrspace(1) global [[BLOCK_RUNTIME_HANDLE_T:%.*]] zeroinitializer
+; CHECK: @[[__TEST_BLOCK_INVOKE_2_KERNEL_RUNTIME_HANDLE:[a-zA-Z0-9_$"\\.-]+]] = addrspace(1) global [[BLOCK_RUNTIME_HANDLE_T:%.*]] zeroinitializer
+; CHECK: @[[BLOCK_HAS_USED_KERNEL_ADDRESS_RUNTIME_HANDLE:[a-zA-Z0-9_$"\\.-]+]] = addrspace(1) global [[BLOCK_RUNTIME_HANDLE_T:%.*]] zeroinitializer
+; CHECK: @[[__AMDGPU_ENQUEUED_KERNEL_RUNTIME_HANDLE:[a-zA-Z0-9_$"\\.-]+]] = addrspace(1) global [[BLOCK_RUNTIME_HANDLE_T:%.*]] zeroinitializer
+; CHECK: @[[__AMDGPU_ENQUEUED_KERNEL_1_RUNTIME_HANDLE:[a-zA-Z0-9_$"\\.-]+]] = addrspace(1) global [[BLOCK_RUNTIME_HANDLE_T:%.*]] zeroinitializer
 ;.
 ; CHECK-LABEL: define {{[^@]+}}@non_caller
 ; CHECK-SAME: (ptr addrspace(1) [[A:%.*]], i8 [[B:%.*]], ptr addrspace(1) [[C:%.*]], i64 [[D:%.*]]) {