From 99d4c722e30816aca6698e20345bffab953d9d79 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Sun, 8 Jan 2023 12:05:27 -0500 Subject: [PATCH] AMDGPU: Really invert handling of enqueued block detection Remove the broken call graph analysis in the block enqueue lowering pass. The previous iteration was reverted due to a runtime bug when the completion action was unconditionally enabled. --- .../Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp | 12 ++----- .../AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp | 38 ---------------------- llvm/test/CodeGen/AMDGPU/enqueue-kernel.ll | 29 ++++++++--------- .../AMDGPU/hsa-metadata-enqueue-kernel-v3.ll | 4 +-- .../CodeGen/AMDGPU/hsa-metadata-enqueue-kernel.ll | 2 +- .../AMDGPU/hsa-metadata-from-llvm-ir-full-v3.ll | 2 +- .../AMDGPU/hsa-metadata-from-llvm-ir-full.ll | 2 +- .../CodeGen/AMDGPU/hsa-metadata-hidden-args-v3.ll | 4 +-- .../CodeGen/AMDGPU/hsa-metadata-hidden-args-v5.ll | 3 ++ .../CodeGen/AMDGPU/hsa-metadata-hidden-args.ll | 4 +-- 10 files changed, 29 insertions(+), 71 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp index d26c700..9c60ce3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp @@ -418,9 +418,7 @@ void MetadataStreamerYamlV2::emitHiddenKernelArgs(const Function &Func, } if (HiddenArgNumBytes >= 48) { - if (!Func.hasFnAttribute("amdgpu-no-completion-action") && - // FIXME: Hack for runtime bug if we fail to optimize this out - Func.hasFnAttribute("calls-enqueue-kernel")) { + if (!Func.hasFnAttribute("amdgpu-no-completion-action")) { emitKernelArg(DL, Int8PtrTy, Align(8), ValueKind::HiddenCompletionAction); } else { emitKernelArg(DL, Int8PtrTy, Align(8), ValueKind::HiddenNone); @@ -854,9 +852,7 @@ void MetadataStreamerMsgPackV3::emitHiddenKernelArgs( } if (HiddenArgNumBytes >= 48) { - if (!Func.hasFnAttribute("amdgpu-no-completion-action") && - // FIXME: Hack for runtime bug if we fail to optimize this out - Func.hasFnAttribute("calls-enqueue-kernel")) { + if (!Func.hasFnAttribute("amdgpu-no-completion-action")) { emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_completion_action", Offset, Args); } else { @@ -1082,9 +1078,7 @@ void MetadataStreamerMsgPackV5::emitHiddenKernelArgs( Offset += 8; // Skipped. } - if (!Func.hasFnAttribute("amdgpu-no-completion-action") && - // FIXME: Hack for runtime bug - Func.hasFnAttribute("calls-enqueue-kernel")) { + if (!Func.hasFnAttribute("amdgpu-no-completion-action")) { emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_completion_action", Offset, Args); } else { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp index 98c5c96..2092707 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp @@ -72,31 +72,6 @@ ModulePass* llvm::createAMDGPUOpenCLEnqueuedBlockLoweringPass() { return new AMDGPUOpenCLEnqueuedBlockLowering(); } -/// Collect direct or indirect callers of \p F and save them -/// to \p Callers. -static void collectCallers(Function *F, DenseSet &Callers) { - for (auto *U : F->users()) { - if (auto *CI = dyn_cast(&*U)) { - auto *Caller = CI->getParent()->getParent(); - if (Callers.insert(Caller).second) - collectCallers(Caller, Callers); - } - } -} - -/// If \p U is instruction or constant, collect functions which directly or -/// indirectly use it. -static void collectFunctionUsers(User *U, DenseSet &Funcs) { - if (auto *I = dyn_cast(U)) { - auto *F = I->getParent()->getParent(); - if (Funcs.insert(F).second) - collectCallers(F, Funcs); - return; - } - for (User *U : U->users()) - collectFunctionUsers(U, Funcs); -} - bool AMDGPUOpenCLEnqueuedBlockLowering::runOnModule(Module &M) { DenseSet Callers; auto &C = M.getContext(); @@ -131,9 +106,6 @@ bool AMDGPUOpenCLEnqueuedBlockLowering::runOnModule(Module &M) { /*isExternallyInitialized=*/true); LLVM_DEBUG(dbgs() << "runtime handle created: " << *GV << '\n'); - for (User *U : F.users()) - collectFunctionUsers(U, Callers); - F.replaceAllUsesWith(ConstantExpr::getAddrSpaceCast(GV, F.getType())); F.addFnAttr("runtime-handle", RuntimeHandle); F.setLinkage(GlobalValue::ExternalLinkage); @@ -141,15 +113,5 @@ bool AMDGPUOpenCLEnqueuedBlockLowering::runOnModule(Module &M) { } } - // FIXME: This call graph analysis is broken and should be - // removed. AMDGPUAttributor infers the individual implicit argument fields - // are needed or not, but the runtime crashes in cases where we fail to - // optimize these out at -O0. - for (auto *F : Callers) { - if (F->getCallingConv() != CallingConv::AMDGPU_KERNEL) - continue; - F->addFnAttr("calls-enqueue-kernel"); - LLVM_DEBUG(dbgs() << "mark enqueue_kernel caller:" << F->getName() << '\n'); - } return Changed; } diff --git a/llvm/test/CodeGen/AMDGPU/enqueue-kernel.ll b/llvm/test/CodeGen/AMDGPU/enqueue-kernel.ll index d0fb84c..64a4f72 100644 --- a/llvm/test/CodeGen/AMDGPU/enqueue-kernel.ll +++ b/llvm/test/CodeGen/AMDGPU/enqueue-kernel.ll @@ -121,13 +121,13 @@ attributes #0 = { "enqueued-block" } ; ; ; CHECK-LABEL: define {{[^@]+}}@caller_indirect -; CHECK-SAME: (ptr addrspace(1) [[A:%.*]], i8 [[B:%.*]], ptr addrspace(1) [[C:%.*]], i64 [[D:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-SAME: (ptr addrspace(1) [[A:%.*]], i8 [[B:%.*]], ptr addrspace(1) [[C:%.*]], i64 [[D:%.*]]) { ; CHECK-NEXT: call void @caller(ptr addrspace(1) [[A]], i8 [[B]], ptr addrspace(1) [[C]], i64 [[D]]) ; CHECK-NEXT: ret void ; ; ; CHECK-LABEL: define {{[^@]+}}@caller -; CHECK-SAME: (ptr addrspace(1) [[A:%.*]], i8 [[B:%.*]], ptr addrspace(1) [[C:%.*]], i64 [[D:%.*]]) #[[ATTR0]] { +; CHECK-SAME: (ptr addrspace(1) [[A:%.*]], i8 [[B:%.*]], ptr addrspace(1) [[C:%.*]], i64 [[D:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[BLOCK:%.*]] = alloca <{ i32, i32, ptr addrspace(1), i8 }>, align 8, addrspace(5) ; CHECK-NEXT: [[INST:%.*]] = alloca [[STRUCT_NDRANGE_T:%.*]], align 4, addrspace(5) @@ -164,7 +164,7 @@ attributes #0 = { "enqueued-block" } ; ; ; CHECK-LABEL: define {{[^@]+}}@inlined_caller -; CHECK-SAME: (ptr addrspace(1) [[A:%.*]], i8 [[B:%.*]], ptr addrspace(1) [[C:%.*]], i64 [[D:%.*]]) #[[ATTR0]] { +; CHECK-SAME: (ptr addrspace(1) [[A:%.*]], i8 [[B:%.*]], ptr addrspace(1) [[C:%.*]], i64 [[D:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INST:%.*]] = load i64, ptr addrspace(1) @__test_block_invoke_kernel.runtime_handle, align 4 ; CHECK-NEXT: store i64 [[INST]], ptr addrspace(1) [[C]], align 4 @@ -172,7 +172,7 @@ attributes #0 = { "enqueued-block" } ; ; ; CHECK-LABEL: define {{[^@]+}}@__test_block_invoke_kernel -; CHECK-SAME: (<{ i32, i32, ptr addrspace(1), i8 }> [[ARG:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-SAME: (<{ i32, i32, ptr addrspace(1), i8 }> [[ARG:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[DOTFCA_3_EXTRACT:%.*]] = extractvalue <{ i32, i32, ptr addrspace(1), i8 }> [[ARG]], 2 ; CHECK-NEXT: [[DOTFCA_4_EXTRACT:%.*]] = extractvalue <{ i32, i32, ptr addrspace(1), i8 }> [[ARG]], 3 @@ -181,7 +181,7 @@ attributes #0 = { "enqueued-block" } ; ; ; CHECK-LABEL: define {{[^@]+}}@__test_block_invoke_2_kernel -; CHECK-SAME: (<{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[ARG:%.*]]) #[[ATTR2:[0-9]+]] { +; CHECK-SAME: (<{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[ARG:%.*]]) #[[ATTR1:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[DOTFCA_3_EXTRACT:%.*]] = extractvalue <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[ARG]], 2 ; CHECK-NEXT: [[DOTFCA_4_EXTRACT:%.*]] = extractvalue <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[ARG]], 3 @@ -193,7 +193,7 @@ attributes #0 = { "enqueued-block" } ; ; ; CHECK-LABEL: define {{[^@]+}}@block_has_used_kernel_address -; CHECK-SAME: (<{ i32, i32, ptr addrspace(1), i8 }> [[ARG:%.*]]) #[[ATTR3:[0-9]+]] { +; CHECK-SAME: (<{ i32, i32, ptr addrspace(1), i8 }> [[ARG:%.*]]) #[[ATTR2:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[DOTFCA_3_EXTRACT:%.*]] = extractvalue <{ i32, i32, ptr addrspace(1), i8 }> [[ARG]], 2 ; CHECK-NEXT: [[DOTFCA_4_EXTRACT:%.*]] = extractvalue <{ i32, i32, ptr addrspace(1), i8 }> [[ARG]], 3 @@ -202,25 +202,24 @@ attributes #0 = { "enqueued-block" } ; ; ; CHECK-LABEL: define {{[^@]+}}@user_of_kernel_address -; CHECK-SAME: (ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-SAME: (ptr addrspace(1) [[ARG:%.*]]) { ; CHECK-NEXT: store ptr addrspacecast (ptr addrspace(1) @block_has_used_kernel_address.runtime_handle to ptr), ptr addrspace(1) [[ARG]], align 8 ; CHECK-NEXT: ret void ; ; ; CHECK-LABEL: define {{[^@]+}}@__amdgpu_enqueued_kernel -; CHECK-SAME: (<{ i32, i32, ptr addrspace(1), i8 }> [[ARG:%.*]]) #[[ATTR4:[0-9]+]] { +; CHECK-SAME: (<{ i32, i32, ptr addrspace(1), i8 }> [[ARG:%.*]]) #[[ATTR3:[0-9]+]] { ; CHECK-NEXT: ret void ; ; ; CHECK-LABEL: define {{[^@]+}}@__amdgpu_enqueued_kernel.1 -; CHECK-SAME: (<{ i32, i32, ptr addrspace(1), i8 }> [[ARG:%.*]]) #[[ATTR5:[0-9]+]] { +; CHECK-SAME: (<{ i32, i32, ptr addrspace(1), i8 }> [[ARG:%.*]]) #[[ATTR4:[0-9]+]] { ; CHECK-NEXT: ret void ; ;. -; CHECK: attributes #[[ATTR0]] = { "calls-enqueue-kernel" } -; CHECK: attributes #[[ATTR1]] = { "enqueued-block" "runtime-handle"="__test_block_invoke_kernel.runtime_handle" } -; CHECK: attributes #[[ATTR2]] = { "enqueued-block" "runtime-handle"="__test_block_invoke_2_kernel.runtime_handle" } -; CHECK: attributes #[[ATTR3]] = { "enqueued-block" "runtime-handle"="block_has_used_kernel_address.runtime_handle" } -; CHECK: attributes #[[ATTR4]] = { "enqueued-block" "runtime-handle"="__amdgpu_enqueued_kernel.runtime_handle" } -; CHECK: attributes #[[ATTR5]] = { "enqueued-block" "runtime-handle"="__amdgpu_enqueued_kernel.1.runtime_handle" } +; CHECK: attributes #[[ATTR0]] = { "enqueued-block" "runtime-handle"="__test_block_invoke_kernel.runtime_handle" } +; CHECK: attributes #[[ATTR1]] = { "enqueued-block" "runtime-handle"="__test_block_invoke_2_kernel.runtime_handle" } +; CHECK: attributes #[[ATTR2]] = { "enqueued-block" "runtime-handle"="block_has_used_kernel_address.runtime_handle" } +; CHECK: attributes #[[ATTR3]] = { "enqueued-block" "runtime-handle"="__amdgpu_enqueued_kernel.runtime_handle" } +; CHECK: attributes #[[ATTR4]] = { "enqueued-block" "runtime-handle"="__amdgpu_enqueued_kernel.1.runtime_handle" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-enqueue-kernel-v3.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-enqueue-kernel-v3.ll index 7f7b329..37b124e 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-enqueue-kernel-v3.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-enqueue-kernel-v3.ll @@ -55,7 +55,7 @@ define amdgpu_kernel void @test_non_enqueue_kernel_caller(i8 %a) #0 ; CHECK-NEXT: .value_kind: hidden_default_queue ; CHECK-NEXT: - .offset: 48 ; CHECK-NEXT: .size: 8 -; CHECK-NEXT: .value_kind: hidden_none +; CHECK-NEXT: .value_kind: hidden_completion_action ; CHECK: .language: OpenCL C ; CHECK-NEXT: .language_version: ; CHECK-NEXT: - 2 @@ -148,7 +148,7 @@ define amdgpu_kernel void @test_no_default_queue(i8 %a) #3 attributes #0 = { optnone noinline "amdgpu-no-default-queue" "amdgpu-no-completion-action" "amdgpu-implicitarg-num-bytes"="48" } attributes #1 = { optnone noinline "amdgpu-implicitarg-num-bytes"="48" } attributes #2 = { optnone noinline "amdgpu-no-completion-action" "amdgpu-implicitarg-num-bytes"="48" } -attributes #3 = { optnone noinline "amdgpu-no-default-queue" "amdgpu-implicitarg-num-bytes"="48" "calls-enqueue-kernel" } +attributes #3 = { optnone noinline "amdgpu-no-default-queue" "amdgpu-implicitarg-num-bytes"="48" } !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdgpu_code_object_version", i32 300} diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-enqueue-kernel.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-enqueue-kernel.ll index 055bc23..01fa229 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-enqueue-kernel.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-enqueue-kernel.ll @@ -76,7 +76,7 @@ define amdgpu_kernel void @test_enqueue_kernel_caller(i8 %a) #1 } attributes #0 = { optnone noinline "amdgpu-no-default-queue" "amdgpu-no-completion-action" "amdgpu-implicitarg-num-bytes"="48" } -attributes #1 = { optnone noinline "amdgpu-implicitarg-num-bytes"="48" "calls-enqueue-kernel" } +attributes #1 = { optnone noinline "amdgpu-implicitarg-num-bytes"="48" } !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdgpu_code_object_version", i32 200} diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full-v3.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full-v3.ll index 62c4d07..69efc47 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full-v3.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full-v3.ll @@ -1743,7 +1743,7 @@ define amdgpu_kernel void @unknown_addrspace_kernarg(ptr addrspace(12345) %ptr) attributes #0 = { optnone noinline "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-implicitarg-num-bytes"="56" } attributes #1 = { optnone noinline "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-implicitarg-num-bytes"="56" "runtime-handle"="__test_block_invoke_kernel_runtime_handle" } -attributes #2 = { optnone noinline "amdgpu-implicitarg-num-bytes"="56" "calls-enqueue-kernel" } +attributes #2 = { optnone noinline "amdgpu-implicitarg-num-bytes"="56" } !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdgpu_code_object_version", i32 300} diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll index a3c5c59..726e283 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll @@ -1868,7 +1868,7 @@ define amdgpu_kernel void @unknown_addrspace_kernarg(ptr addrspace(12345) %ptr) attributes #0 = { optnone noinline "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-implicitarg-num-bytes"="56" } attributes #1 = { optnone noinline "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-implicitarg-num-bytes"="56" "runtime-handle"="__test_block_invoke_kernel_runtime_handle" } -attributes #2 = { optnone noinline "amdgpu-implicitarg-num-bytes"="56" "calls-enqueue-kernel" } +attributes #2 = { optnone noinline "amdgpu-implicitarg-num-bytes"="56" } !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdgpu_code_object_version", i32 200} diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v3.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v3.ll index 03218d2..47b8824 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v3.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v3.ll @@ -218,7 +218,7 @@ entry: ; CHECK-NEXT: .value_kind: hidden_default_queue ; CHECK-NEXT: - .offset: 64 ; CHECK-NEXT: .size: 8 -; CHECK-NEXT: .value_kind: hidden_none +; CHECK-NEXT: .value_kind: hidden_completion_action ; CHECK: .name: test48 ; CHECK: .symbol: test48.kd define amdgpu_kernel void @test48( @@ -266,7 +266,7 @@ entry: ; CHECK-NEXT: .value_kind: hidden_default_queue ; CHECK-NEXT: - .offset: 64 ; CHECK-NEXT: .size: 8 -; CHECK-NEXT: .value_kind: hidden_none +; CHECK-NEXT: .value_kind: hidden_completion_action ; CHECK-NEXT: - .offset: 72 ; CHECK-NEXT: .size: 8 ; CHECK-NEXT: .value_kind: hidden_multigrid_sync_arg diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v5.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v5.ll index 245f0f2..cb30d66 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v5.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v5.ll @@ -78,6 +78,9 @@ ; CHECK-NEXT: - .offset: 128 ; CHECK-NEXT: .size: 8 ; CHECK-NEXT: .value_kind: hidden_default_queue +; CHECK-NEXT: - .offset: 136 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_completion_action ; GFX8-NEXT: - .offset: 216 ; GFX8-NEXT: .size: 4 ; GFX8-NEXT: .value_kind: hidden_private_base diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args.ll index 510bc94..2b7e859 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args.ll @@ -229,7 +229,7 @@ entry: ; CHECK-NEXT: AddrSpaceQual: Global ; CHECK-NEXT: - Size: 8 ; CHECK-NEXT: Align: 8 -; CHECK-NEXT: ValueKind: HiddenNone +; CHECK-NEXT: ValueKind: HiddenCompletionAction ; CHECK-NEXT: AddrSpaceQual: Global ; CHECK-NEXT: CodeProps: define amdgpu_kernel void @test48( @@ -281,7 +281,7 @@ entry: ; CHECK-NEXT: AddrSpaceQual: Global ; CHECK-NEXT: - Size: 8 ; CHECK-NEXT: Align: 8 -; CHECK-NEXT: ValueKind: HiddenNone +; CHECK-NEXT: ValueKind: HiddenCompletionAction ; CHECK-NEXT: AddrSpaceQual: Global ; CHECK-NEXT: - Size: 8 ; CHECK-NEXT: Align: 8 -- 2.7.4