#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallSet.h"
+#include "llvm/Analysis/CallGraph.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/Cloning.h"
+#include <queue>
#define DEBUG_TYPE "amdgpu-propagate-attributes"
// Clone functions as needed or just set attributes.
bool AllowClone;
+ CallGraph *ModuleCG = nullptr;
+
// Option propagation roots.
SmallSet<Function *, 32> Roots;
// Clones of functions with their attributes.
SmallVector<Clone, 32> Clones;
+ // To memoize address taken functions.
+ SmallSet<Function *, 32> AddressTakenFunctions;
+
// Find a clone with required features.
Function *findFunction(const FnProperties &PropsNeeded,
Function *OrigF);
bool process();
public:
- AMDGPUPropagateAttributes(const TargetMachine *TM, bool AllowClone) :
- TM(TM), AllowClone(AllowClone) {}
+ AMDGPUPropagateAttributes(const TargetMachine *TM, bool AllowClone)
+ : TM(TM), AllowClone(AllowClone) {}
// Use F as a root and propagate its attributes.
bool process(Function &F);
// Propagate attributes starting from kernel functions.
- bool process(Module &M);
+ bool process(Module &M, CallGraph *CG);
+
+ // Remove attributes from F.
+ // This is used in presence of address taken functions.
+ bool removeAttributes(Function *F);
+
+ // Handle call graph rooted at address taken functions.
+ // This function will erase all attributes present
+ // on all functions called from address taken functions transitively.
+ bool handleAddressTakenFunctions(CallGraph *CG);
};
// Allows to propagate attributes early, but no clonning is allowed as it must
*PassRegistry::getPassRegistry());
}
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
bool runOnModule(Module &M) override;
};
"Late propagate attributes from kernels to functions",
false, false)
+bool AMDGPUPropagateAttributes::removeAttributes(Function *F) {
+ bool Changed = false;
+ if (!F)
+ return Changed;
+ LLVM_DEBUG(dbgs() << "Removing attributes from " << F->getName() << '\n');
+ for (unsigned I = 0; I < NumAttr; ++I) {
+ if (F->hasFnAttribute(AttributeNames[I])) {
+ F->removeFnAttr(AttributeNames[I]);
+ Changed = true;
+ }
+ }
+ return Changed;
+}
+
+bool AMDGPUPropagateAttributes::handleAddressTakenFunctions(CallGraph *CG) {
+ assert(ModuleCG && "Call graph not present");
+
+ bool Changed = false;
+ SmallSet<CallGraphNode *, 32> Visited;
+
+ for (Function *F : AddressTakenFunctions) {
+ CallGraphNode *CGN = (*CG)[F];
+ if (!Visited.count(CGN)) {
+ Changed |= removeAttributes(F);
+ Visited.insert(CGN);
+ }
+
+ std::queue<CallGraphNode *> SubGraph;
+ SubGraph.push(CGN);
+ while (!SubGraph.empty()) {
+ CallGraphNode *CGN = SubGraph.front();
+ SubGraph.pop();
+ if (!Visited.count(CGN)) {
+ Changed |= removeAttributes(CGN->getFunction());
+ Visited.insert(CGN);
+ }
+ for (auto N : *CGN)
+ SubGraph.push(N.second);
+ }
+ }
+ return Changed;
+}
+
Function *
AMDGPUPropagateAttributes::findFunction(const FnProperties &PropsNeeded,
Function *OrigF) {
return nullptr;
}
-bool AMDGPUPropagateAttributes::process(Module &M) {
+bool AMDGPUPropagateAttributes::process(Module &M, CallGraph *CG) {
for (auto &F : M.functions())
if (AMDGPU::isEntryFunctionCC(F.getCallingConv()))
Roots.insert(&F);
+ ModuleCG = CG;
return process();
}
if (F.isDeclaration())
continue;
+ if (F.hasAddressTaken(nullptr, true, true, true))
+ AddressTakenFunctions.insert(&F);
+
const FnProperties CalleeProps(*TM, F);
SmallVector<std::pair<CallBase *, Function *>, 32> ToReplace;
SmallSet<CallBase *, 32> Visited;
Roots.clear();
Clones.clear();
+ // Keep the post processing related to indirect
+ // calls separate to handle them gracefully.
+ // The core traversal need not be affected by this.
+ if (AllowClone)
+ Changed |= handleAddressTakenFunctions(ModuleCG);
+
return Changed;
}
return AMDGPUPropagateAttributes(TM, false).process(F);
}
+void AMDGPUPropagateAttributesLate::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<CallGraphWrapperPass>();
+}
+
bool AMDGPUPropagateAttributesLate::runOnModule(Module &M) {
if (!TM) {
auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
TM = &TPC->getTM<TargetMachine>();
}
-
- return AMDGPUPropagateAttributes(TM, true).process(M);
+ CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph();
+ return AMDGPUPropagateAttributes(TM, true).process(M, &CG);
}
FunctionPass
}
PreservedAnalyses
-AMDGPUPropagateAttributesLatePass::run(Module &M, ModuleAnalysisManager &AM) {
- return AMDGPUPropagateAttributes(&TM, true).process(M)
- ? PreservedAnalyses::none()
- : PreservedAnalyses::all();
+AMDGPUPropagateAttributesLatePass::run(Module &M, ModuleAnalysisManager &MAM) {
+ AMDGPUPropagateAttributes APA(&TM, true);
+ CallGraph &CG = MAM.getResult<CallGraphAnalysis>(M);
+ const bool Changed = APA.process(M, &CG);
+ return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
}
; GCN-O1-NEXT: AMDGPU Printf lowering
; GCN-O1-NEXT: FunctionPass Manager
; GCN-O1-NEXT: Dominator Tree Construction
+; GCN-O1-NEXT: CallGraph Construction
; GCN-O1-NEXT: Late propagate attributes from kernels to functions
; GCN-O1-NEXT: Interprocedural Sparse Conditional Constant Propagation
; GCN-O1-NEXT: FunctionPass Manager
; GCN-O2-NEXT: AMDGPU Printf lowering
; GCN-O2-NEXT: FunctionPass Manager
; GCN-O2-NEXT: Dominator Tree Construction
+; GCN-O2-NEXT: CallGraph Construction
; GCN-O2-NEXT: Late propagate attributes from kernels to functions
; GCN-O2-NEXT: Interprocedural Sparse Conditional Constant Propagation
; GCN-O2-NEXT: FunctionPass Manager
; GCN-O3-NEXT: AMDGPU Printf lowering
; GCN-O3-NEXT: FunctionPass Manager
; GCN-O3-NEXT: Dominator Tree Construction
+; GCN-O3-NEXT: CallGraph Construction
; GCN-O3-NEXT: Late propagate attributes from kernels to functions
; GCN-O3-NEXT: FunctionPass Manager
; GCN-O3-NEXT: Dominator Tree Construction
--- /dev/null
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-propagate-attributes-late %s | FileCheck %s
+
+; CHECK-LABEL: define float @common() {
+define float @common() {
+ ret float 0.0
+}
+
+; CHECK-LABEL: define float @foo() {
+define float @foo() {
+ %direct_call = call contract float @common()
+ ret float %direct_call
+}
+
+; CHECK-LABEL: define float @bar() {
+define float @bar() {
+ ret float 0.0
+}
+
+; CHECK-LABEL: define float @baz() {
+define float @baz() {
+ ret float 0.0
+}
+
+define amdgpu_kernel void @switch_indirect_kernel(float *%result, i32 %type) #1 {
+ %fn = alloca float ()*
+ switch i32 %type, label %sw.default [
+ i32 1, label %sw.bb
+ i32 2, label %sw.bb2
+ i32 3, label %sw.bb3
+ ]
+
+sw.bb:
+ store float ()* @foo, float ()** %fn
+ br label %sw.epilog
+
+sw.bb2:
+ store float ()* @bar, float ()** %fn
+ br label %sw.epilog
+
+sw.bb3:
+ store float ()* @baz, float ()** %fn
+ br label %sw.epilog
+
+sw.default:
+ br label %sw.epilog
+
+sw.epilog:
+ %fp = load float ()*, float ()** %fn
+ %direct_call = call contract float @common()
+ %indirect_call = call contract float %fp()
+ store float %indirect_call, float* %result
+ ret void
+}
+
+attributes #1 = { "amdgpu-flat-work-group-size"="1,256" }
--- /dev/null
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-propagate-attributes-late %s | FileCheck %s
+
+; Test to check if we skip propgating attributes even if
+; a function is called directly as well as
+; indirectly. "baz" is called directly as well indirectly.
+
+; CHECK-LABEL: define float @foo()
+define float @foo() #1 {
+ ret float 0.0
+}
+
+; CHECK-LABEL: define float @bar()
+define float @bar() #1 {
+ ret float 0.0
+}
+
+; CHECK-LABEL: define float @baz()
+define float @baz() #1 {
+ ret float 0.0
+}
+
+define amdgpu_kernel void @switch_indirect_kernel(float *%result, i32 %type) #1 {
+ %fn = alloca float ()*
+ switch i32 %type, label %sw.default [
+ i32 1, label %sw.bb
+ i32 2, label %sw.bb2
+ i32 3, label %sw.bb3
+ ]
+
+sw.bb:
+ store float ()* @foo, float ()** %fn
+ br label %sw.epilog
+
+sw.bb2:
+ store float ()* @bar, float ()** %fn
+ br label %sw.epilog
+
+sw.bb3:
+ store float ()* @baz, float ()** %fn
+ br label %sw.epilog
+
+sw.default:
+ br label %sw.epilog
+
+sw.epilog:
+ %fp = load float ()*, float ()** %fn
+ %direct_call = call contract float @baz()
+ %indirect_call = call contract float %fp()
+ store float %indirect_call, float* %result
+ ret void
+}
+
+attributes #1 = { "amdgpu-flat-work-group-size"="1,256" }
; In order to expose this bug, it is necessary that `g` have one of the
; propagated attributes, so that a clone and substitution would take place if g
; were actually the function being called.
-; CHECK-DAG: define private void @g.1() #1
-; CHECK-DAG: define internal void @g() #2
+; CHECK-DAG: define private void @g.1() #0
+; CHECK-DAG: define internal void @g() #1
define private void @g() #1 {
ret void
}
attributes #0 = { noinline }
attributes #1 = { noinline "amdgpu-waves-per-eu"="1,10" }
+
+; CHECK: attributes #0 = { noinline }
+; CHECK-NEXT: attributes #1 = { noinline "target-features"="+enable-ds128,+enable-prt-strict-null,+flat-address-space,+flat-for-global,+load-store-opt,+promote-alloca,+trap-handler,+unaligned-access-mode,-wavefrontsize16,-wavefrontsize32,+wavefrontsize64" }
--- /dev/null
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-propagate-attributes-late %s | FileCheck %s
+
+; Test to check if we skip attributes on address
+; taken functions and its callees.
+
+; CHECK-LABEL: define float @bar() {
+define float @bar() {
+ ret float 0.0
+}
+
+; CHECK-LABEL: define float @baz() {
+define float @baz() {
+ ret float 0.0
+}
+
+; CHECK-LABEL: define float @foo() {
+define float @foo() {
+ %v1 = call contract float @bar()
+ %v2 = call contract float @baz()
+ %v3 = fadd float %v1, %v2
+ ret float %v3
+}
+
+; CHECK-LABEL: define amdgpu_kernel void @kernel(float* %result, i32 %type) #0 {
+define amdgpu_kernel void @kernel(float *%result, i32 %type) #1 {
+ %fn = alloca float ()*
+ store float ()* @foo, float ()** %fn
+ %fp = load float ()*, float ()** %fn
+ %indirect_call = call contract float %fp()
+ store float %indirect_call, float* %result
+ ret void
+}
+
+attributes #1 = { "amdgpu-flat-work-group-size"="1,256" }
--- /dev/null
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-propagate-attributes-late %s | FileCheck %s
+
+; Test attributes on a function which
+; is called indirectly from two kernels
+; having different launch bounds.
+
+; This function should not have any attributes on it.
+; CHECK-LABEL: define float @foo() {
+define float @foo() {
+ ret float 0.0
+}
+
+define amdgpu_kernel void @kernel1(float *%result, i32 %type) #1 {
+ %fn = alloca float ()*
+ store float ()* @foo, float ()** %fn
+ %fp = load float ()*, float ()** %fn
+ %indirect_call = call contract float %fp()
+ store float %indirect_call, float* %result
+ ret void
+}
+
+define amdgpu_kernel void @kernel2(float *%result, i32 %type) #2 {
+ %fn = alloca float ()*
+ store float ()* @foo, float ()** %fn
+ %fp = load float ()*, float ()** %fn
+ %indirect_call = call contract float %fp()
+ store float %indirect_call, float* %result
+ ret void
+}
+
+attributes #1 = { "amdgpu-flat-work-group-size"="1,256" }
+attributes #2 = { "amdgpu-flat-work-group-size"="1,512" }