From c56d2afc63540f2e85fee98617bea8d63bc713d4 Mon Sep 17 00:00:00 2001 From: Aakanksha Patil Date: Thu, 7 Mar 2019 00:54:04 +0000 Subject: [PATCH] AMDGPU: Handle "uniform-work-group-size" attribute (fix for RADV) A previous patch for "uniform-work-group-size" attribute was found to break some RADV and possibly radeon SI tests and had to be retracted. This patch fixes that. Differential Revision: http://reviews.llvm.org/D58993 llvm-svn: 355574 --- .../Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp | 68 ++++++++++++++++++++-- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 4 +- .../AMDGPU/annotate-kernel-features-hsa-call.ll | 50 ++++++++-------- .../AMDGPU/uniform-work-group-attribute-missing.ll | 18 ++++++ .../uniform-work-group-nested-function-calls.ll | 24 ++++++++ ...orm-work-group-prevent-attribute-propagation.ll | 25 ++++++++ .../uniform-work-group-propagate-attribute.ll | 33 +++++++++++ .../AMDGPU/uniform-work-group-recursion-test.ll | 37 ++++++++++++ .../test/CodeGen/AMDGPU/uniform-work-group-test.ll | 35 +++++++++++ 9 files changed, 264 insertions(+), 30 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/uniform-work-group-attribute-missing.ll create mode 100644 llvm/test/CodeGen/AMDGPU/uniform-work-group-nested-function-calls.ll create mode 100644 llvm/test/CodeGen/AMDGPU/uniform-work-group-prevent-attribute-propagation.ll create mode 100644 llvm/test/CodeGen/AMDGPU/uniform-work-group-propagate-attribute.ll create mode 100644 llvm/test/CodeGen/AMDGPU/uniform-work-group-recursion-test.ll create mode 100644 llvm/test/CodeGen/AMDGPU/uniform-work-group-test.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp index 96e7603..419ebb22 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp @@ -45,8 +45,11 @@ namespace { class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass { private: const 
TargetMachine *TM = nullptr; + SmallVector<CallGraphNode*, 8> NodeList; bool addFeatureAttributes(Function &F); + bool processUniformWorkGroupAttribute(); + bool propagateUniformWorkGroupAttribute(Function &Caller, Function &Callee); public: static char ID; @@ -185,7 +188,6 @@ static bool handleAttr(Function &Parent, const Function &Callee, Parent.addFnAttr(Name); return true; } - return false; } @@ -212,6 +214,56 @@ static void copyFeaturesToFunction(Function &Parent, const Function &Callee, handleAttr(Parent, Callee, AttrName); } +bool AMDGPUAnnotateKernelFeatures::processUniformWorkGroupAttribute() { + bool Changed = false; + + for (auto *Node : reverse(NodeList)) { + Function *Caller = Node->getFunction(); + + for (auto I : *Node) { + Function *Callee = std::get<1>(I)->getFunction(); + if (Callee) + Changed = propagateUniformWorkGroupAttribute(*Caller, *Callee); + } + } + + return Changed; +} + +bool AMDGPUAnnotateKernelFeatures::propagateUniformWorkGroupAttribute( + Function &Caller, Function &Callee) { + + // Check for externally defined function + if (!Callee.hasExactDefinition()) { + Callee.addFnAttr("uniform-work-group-size", "false"); + if (!Caller.hasFnAttribute("uniform-work-group-size")) + Caller.addFnAttr("uniform-work-group-size", "false"); + + return true; + } + // Check if the Caller has the attribute + if (Caller.hasFnAttribute("uniform-work-group-size")) { + // Check if the value of the attribute is true + if (Caller.getFnAttribute("uniform-work-group-size") + .getValueAsString().equals("true")) { + // Propagate the attribute to the Callee, if it does not have it + if (!Callee.hasFnAttribute("uniform-work-group-size")) { + Callee.addFnAttr("uniform-work-group-size", "true"); + return true; + } + } else { + Callee.addFnAttr("uniform-work-group-size", "false"); + return true; + } + } else { + // If the attribute is absent, set it as false + Caller.addFnAttr("uniform-work-group-size", "false"); + Callee.addFnAttr("uniform-work-group-size", "false"); + return true; + }
+ return false; +} + bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) { const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F); bool HasFlat = ST.hasFlatAddressSpace(); @@ -292,15 +344,21 @@ bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) { } bool AMDGPUAnnotateKernelFeatures::runOnSCC(CallGraphSCC &SCC) { - Module &M = SCC.getCallGraph().getModule(); - Triple TT(M.getTargetTriple()); - bool Changed = false; + for (CallGraphNode *I : SCC) { + // Build a list of CallGraphNodes from most number of uses to least + if (I->getNumReferences()) + NodeList.push_back(I); + else { + processUniformWorkGroupAttribute(); + NodeList.clear(); + } + Function *F = I->getFunction(); + // Add feature attributes if (!F || F->isDeclaration()) continue; - Changed |= addFeatureAttributes(*F); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 60c1799..dd76390 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -686,6 +686,9 @@ void AMDGPUPassConfig::addIRPasses() { } void AMDGPUPassConfig::addCodeGenPrepare() { + if (TM->getTargetTriple().getArch() == Triple::amdgcn) + addPass(createAMDGPUAnnotateKernelFeaturesPass()); + if (TM->getTargetTriple().getArch() == Triple::amdgcn && EnableLowerKernelArguments) addPass(createAMDGPULowerKernelArgumentsPass()); @@ -773,7 +776,6 @@ bool GCNPassConfig::addPreISel() { // FIXME: We need to run a pass to propagate the attributes when calls are // supported. - addPass(createAMDGPUAnnotateKernelFeaturesPass()); // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit // regions formed by them.
diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll index e68b179..ca6739f 100644 --- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll +++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll @@ -244,52 +244,52 @@ define amdgpu_kernel void @kern_use_implicitarg_ptr() #1 { ret void } -; HSA: define void @use_implicitarg_ptr() #15 { +; HSA: define void @use_implicitarg_ptr() #16 { define void @use_implicitarg_ptr() #1 { %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() store volatile i8 addrspace(4)* %implicitarg.ptr, i8 addrspace(4)* addrspace(1)* undef ret void } -; HSA: define void @func_indirect_use_implicitarg_ptr() #15 { +; HSA: define void @func_indirect_use_implicitarg_ptr() #16 { define void @func_indirect_use_implicitarg_ptr() #1 { call void @use_implicitarg_ptr() ret void } -; HSA: declare void @external.func() #16 +; HSA: declare void @external.func() #17 declare void @external.func() #3 -; HSA: define internal void @defined.func() #16 { +; HSA: define internal void @defined.func() #17 { define internal void @defined.func() #3 { ret void } -; HSA: define void @func_call_external() #16 { +; HSA: define void @func_call_external() #17 { define void @func_call_external() #3 { call void @external.func() ret void } -; HSA: define void @func_call_defined() #16 { +; HSA: define void @func_call_defined() #17 { define void @func_call_defined() #3 { call void @defined.func() ret void } -; HSA: define void @func_call_asm() #16 { +; HSA: define void @func_call_asm() #18 { define void @func_call_asm() #3 { call void asm sideeffect "", ""() #3 ret void } -; HSA: define amdgpu_kernel void @kern_call_external() #17 { +; HSA: define amdgpu_kernel void @kern_call_external() #19 { define amdgpu_kernel void @kern_call_external() #3 { call void @external.func() ret void } -; HSA: define amdgpu_kernel void @func_kern_defined() #17 { +; HSA: 
define amdgpu_kernel void @func_kern_defined() #19 { define amdgpu_kernel void @func_kern_defined() #3 { call void @defined.func() ret void @@ -301,20 +301,22 @@ attributes #2 = { nounwind "target-cpu"="gfx900" } attributes #3 = { nounwind } ; HSA: attributes #0 = { nounwind readnone speculatable } -; HSA: attributes #1 = { nounwind "amdgpu-work-item-id-x" "target-cpu"="fiji" } -; HSA: attributes #2 = { nounwind "amdgpu-work-item-id-y" "target-cpu"="fiji" } -; HSA: attributes #3 = { nounwind "amdgpu-work-item-id-z" "target-cpu"="fiji" } -; HSA: attributes #4 = { nounwind "amdgpu-work-group-id-x" "target-cpu"="fiji" } -; HSA: attributes #5 = { nounwind "amdgpu-work-group-id-y" "target-cpu"="fiji" } -; HSA: attributes #6 = { nounwind "amdgpu-work-group-id-z" "target-cpu"="fiji" } -; HSA: attributes #7 = { nounwind "amdgpu-dispatch-ptr" "target-cpu"="fiji" } -; HSA: attributes #8 = { nounwind "amdgpu-queue-ptr" "target-cpu"="fiji" } -; HSA: attributes #9 = { nounwind "amdgpu-dispatch-id" "target-cpu"="fiji" } +; HSA: attributes #1 = { nounwind "amdgpu-work-item-id-x" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; HSA: attributes #2 = { nounwind "amdgpu-work-item-id-y" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; HSA: attributes #3 = { nounwind "amdgpu-work-item-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; HSA: attributes #4 = { nounwind "amdgpu-work-group-id-x" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; HSA: attributes #5 = { nounwind "amdgpu-work-group-id-y" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; HSA: attributes #6 = { nounwind "amdgpu-work-group-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; HSA: attributes #7 = { nounwind "amdgpu-dispatch-ptr" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; HSA: attributes #8 = { nounwind "amdgpu-queue-ptr" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; HSA: attributes #9 = { nounwind "amdgpu-dispatch-id" 
"target-cpu"="fiji" "uniform-work-group-size"="false" } ; HSA: attributes #10 = { nounwind "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "target-cpu"="fiji" } -; HSA: attributes #11 = { nounwind "target-cpu"="fiji" } -; HSA: attributes #12 = { nounwind "target-cpu"="gfx900" } -; HSA: attributes #13 = { nounwind "amdgpu-queue-ptr" "target-cpu"="gfx900" } -; HSA: attributes #14 = { nounwind "amdgpu-kernarg-segment-ptr" "target-cpu"="fiji" } +; HSA: attributes #11 = { nounwind "target-cpu"="fiji" "uniform-work-group-size"="false" } +; HSA: attributes #12 = { nounwind "target-cpu"="gfx900" "uniform-work-group-size"="false" } +; HSA: attributes #13 = { nounwind "amdgpu-queue-ptr" "target-cpu"="gfx900" "uniform-work-group-size"="false" } +; HSA: attributes #14 = { nounwind "amdgpu-kernarg-segment-ptr" "target-cpu"="fiji" "uniform-work-group-size"="false" } ; HSA: attributes #15 = { nounwind "amdgpu-implicitarg-ptr" "target-cpu"="fiji" } -; HSA: attributes #16 = { nounwind } -; HSA: attributes #17 = { nounwind "amdgpu-flat-scratch" } +; HSA: attributes #16 = { nounwind "amdgpu-implicitarg-ptr" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; HSA: attributes #17 = { nounwind "uniform-work-group-size"="false" } +; HSA: attributes #18 = { nounwind } +; HSA: attributes #19 = { nounwind "amdgpu-flat-scratch" "uniform-work-group-size"="false" } diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-attribute-missing.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-attribute-missing.ll new file mode 100644 index 0000000..51002e8 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-attribute-missing.ll @@ -0,0 +1,18 @@ +; RUN: opt -S -mtriple=amdgcn-amd- -amdgpu-annotate-kernel-features %s | FileCheck %s + +; If the kernel does not have the uniform-work-group-attribute, set both callee and caller as false + +; CHECK: define void @foo() #[[FOO:[0-9]+]] { +define void @foo() #0 { + ret void +} + +; CHECK: define amdgpu_kernel void @kernel1() #[[FOO]] { 
+define amdgpu_kernel void @kernel1() #1 { + call void @foo() + ret void +} + +attributes #0 = { "uniform-work-group-size"="true" } + +; CHECK: attributes #[[FOO]] = { "uniform-work-group-size"="false" } diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-nested-function-calls.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-nested-function-calls.ll new file mode 100644 index 0000000..8376114 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-nested-function-calls.ll @@ -0,0 +1,24 @@ +; RUN: opt -S -mtriple=amdgcn-amd- -amdgpu-annotate-kernel-features %s | FileCheck %s + +; Test to verify if the attribute gets propagated across nested function calls + +; CHECK: define void @func1() #[[FUNC:[0-9]+]] { +define void @func1() #0 { + ret void +} + +; CHECK: define void @func2() #[[FUNC]] { +define void @func2() #1 { + call void @func1() + ret void +} + +; CHECK: define amdgpu_kernel void @kernel3() #[[FUNC:[0-9]+]] { +define amdgpu_kernel void @kernel3() #2 { + call void @func2() + ret void +} + +attributes #2 = { "uniform-work-group-size"="true" } + +; CHECK: attributes #[[FUNC]] = { "uniform-work-group-size"="true" } diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-prevent-attribute-propagation.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-prevent-attribute-propagation.ll new file mode 100644 index 0000000..4a332f6 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-prevent-attribute-propagation.ll @@ -0,0 +1,25 @@ +; RUN: opt -S -mtriple=amdgcn-amd- -amdgpu-annotate-kernel-features %s | FileCheck %s + +; Two kernels with different values of the uniform-work-group-attribute call the same function + +; CHECK: define void @func() #[[FUNC:[0-9]+]] { +define void @func() #0 { + ret void +} + +; CHECK: define amdgpu_kernel void @kernel1() #[[KERNEL1:[0-9]+]] { +define amdgpu_kernel void @kernel1() #1 { + call void @func() + ret void +} + +; CHECK: define amdgpu_kernel void @kernel2() #[[FUNC]] { +define amdgpu_kernel void @kernel2() 
#2 { + call void @func() + ret void +} + +attributes #1 = { "uniform-work-group-size"="true" } + +; CHECK: attributes #[[FUNC]] = { "uniform-work-group-size"="false" } +; CHECK: attributes #[[KERNEL1]] = { "uniform-work-group-size"="true" } diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-propagate-attribute.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-propagate-attribute.ll new file mode 100644 index 0000000..15131a4 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-propagate-attribute.ll @@ -0,0 +1,33 @@ +; RUN: opt -S -mtriple=amdgcn-amd- -amdgpu-annotate-kernel-features %s | FileCheck %s + +; Propagate the uniform-work-group-attribute from the kernel to callee if it doesn't have it +; CHECK: define void @func() #[[FUNC:[0-9]+]] { +define void @func() #0 { + ret void +} + +; CHECK: define amdgpu_kernel void @kernel1() #[[KERNEL1:[0-9]+]] { +define amdgpu_kernel void @kernel1() #1 { + call void @func() + ret void +} + +; External declaration of a function +; CHECK: define weak_odr void @weak_func() #[[FUNC]] { +define weak_odr void @weak_func() #0 { + ret void +} + +; CHECK: define amdgpu_kernel void @kernel2() #[[KERNEL2:[0-9]+]] { +define amdgpu_kernel void @kernel2() #2 { + call void @weak_func() + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { "uniform-work-group-size"="false" } +attributes #2 = { "uniform-work-group-size"="true" } + +; CHECK: attributes #[[FUNC]] = { nounwind "uniform-work-group-size"="false" } +; CHECK: attributes #[[KERNEL1]] = { "uniform-work-group-size"="false" } +; CHECK: attributes #[[KERNEL2]] = { "uniform-work-group-size"="true" } diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-recursion-test.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-recursion-test.ll new file mode 100644 index 0000000..9d07a88 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-recursion-test.ll @@ -0,0 +1,37 @@ +; RUN: opt -S -mtriple=amdgcn-amd- -amdgpu-annotate-kernel-features %s | FileCheck %s 
+ +; Test to ensure recursive functions exhibit proper behaviour +; Test to generate fibonacci numbers + +; CHECK: define i32 @fib(i32 %n) #[[FIB:[0-9]+]] { +define i32 @fib(i32 %n) #0 { + %cmp1 = icmp eq i32 %n, 0 + br i1 %cmp1, label %exit, label %cont1 + +cont1: + %cmp2 = icmp eq i32 %n, 1 + br i1 %cmp2, label %exit, label %cont2 + +cont2: + %nm1 = sub i32 %n, 1 + %fibm1 = call i32 @fib(i32 %nm1) + %nm2 = sub i32 %n, 2 + %fibm2 = call i32 @fib(i32 %nm2) + %retval = add i32 %fibm1, %fibm2 + + ret i32 %retval + +exit: + ret i32 1 +} + +; CHECK: define amdgpu_kernel void @kernel(i32 addrspace(1)* %m) #[[FIB]] { +define amdgpu_kernel void @kernel(i32 addrspace(1)* %m) #1 { + %r = call i32 @fib(i32 5) + store i32 %r, i32 addrspace(1)* %m + ret void +} + +attributes #1 = { "uniform-work-group-size"="true" } + +; CHECK: attributes #[[FIB]] = { "uniform-work-group-size"="true" } diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-test.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-test.ll new file mode 100644 index 0000000..0a3cae8 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-test.ll @@ -0,0 +1,35 @@ +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-annotate-kernel-features %s | FileCheck %s + +; CHECK: define void @func1() #[[FUNC:[0-9]+]] { +define void @func1() { + ret void +} + +; CHECK: define void @func4() #[[FUNC]] { +define void @func4() { + ret void +} + +; CHECK: define void @func2() #[[FUNC]] { +define void @func2() #0 { + call void @func4() + call void @func1() + ret void +} + +; CHECK: define void @func3() #[[FUNC]] { +define void @func3() { + call void @func1() + ret void +} + +; CHECK: define amdgpu_kernel void @kernel3() #[[FUNC:[0-9]+]] { +define amdgpu_kernel void @kernel3() #0 { + call void @func2() + call void @func3() + ret void +} + +attributes #0 = { "uniform-work-group-size"="false" } + +; CHECK: attributes #[[FUNC]] = { "amdgpu-flat-scratch" "uniform-work-group-size"="false" } -- 2.7.4