#include "AMDGPURegBankSelect.h"
#include "AMDGPUTargetObjectFile.h"
#include "AMDGPUTargetTransformInfo.h"
+#include "AMDGPUUnifyDivergentExitNodes.h"
#include "GCNIterativeScheduler.h"
#include "GCNSchedStrategy.h"
#include "GCNVOPDUtils.h"
PM.addPass(AMDGPUPromoteKernelArgumentsPass());
return true;
}
+ if (PassName == "amdgpu-unify-divergent-exit-nodes") {
+ PM.addPass(AMDGPUUnifyDivergentExitNodesPass());
+ return true;
+ }
return false;
});
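The hunk above hooks the new-PM pass into the target's pass-name parsing callback, so `amdgpu-unify-divergent-exit-nodes` can appear in an `opt -passes=...` pipeline string (the updated RUN line in the test below relies on exactly that). As a rough sketch of what that wiring amounts to, the pass can also be scheduled directly on a `FunctionPassManager`; the helper name `runUnifyExitsOn` and the standalone analysis-manager setup here are illustrative assumptions, not part of this patch:

```cpp
// Illustrative sketch only: drives AMDGPUUnifyDivergentExitNodesPass through
// the new pass manager by hand. Assumes the header added by this patch is on
// the include path; the helper name is hypothetical.
#include "AMDGPUUnifyDivergentExitNodes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Passes/PassBuilder.h"

static void runUnifyExitsOn(llvm::Function &F) {
  llvm::PassBuilder PB;
  llvm::FunctionAnalysisManager FAM;
  // Registers the standard function analyses (DominatorTreeAnalysis,
  // PostDominatorTreeAnalysis, UniformityInfoAnalysis, TargetIRAnalysis)
  // that the pass's run() method queries via AM.getResult<>.
  PB.registerFunctionAnalyses(FAM);

  llvm::FunctionPassManager FPM;
  FPM.addPass(llvm::AMDGPUUnifyDivergentExitNodesPass());
  FPM.run(F, FAM);
}
```

Note that without a real AMDGPU `TargetMachine` feeding `TargetIRAnalysis`, uniformity analysis treats every branch as uniform, so this only shows the plumbing, not a meaningful transformation.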
//
//===----------------------------------------------------------------------===//
+#include "AMDGPUUnifyDivergentExitNodes.h"
#include "AMDGPU.h"
#include "SIDefines.h"
#include "llvm/ADT/ArrayRef.h"
namespace {
-class AMDGPUUnifyDivergentExitNodes : public FunctionPass {
+class AMDGPUUnifyDivergentExitNodesImpl {
private:
const TargetTransformInfo *TTI = nullptr;
public:
- static char ID; // Pass identification, replacement for typeid
-
- AMDGPUUnifyDivergentExitNodes() : FunctionPass(ID) {
- initializeAMDGPUUnifyDivergentExitNodesPass(*PassRegistry::getPassRegistry());
- }
+ AMDGPUUnifyDivergentExitNodesImpl() = delete;
+ AMDGPUUnifyDivergentExitNodesImpl(const TargetTransformInfo *TTI)
+ : TTI(TTI) {}
// We can preserve non-critical-edgeness when we unify function exit nodes
- void getAnalysisUsage(AnalysisUsage &AU) const override;
BasicBlock *unifyReturnBlockSet(Function &F, DomTreeUpdater &DTU,
ArrayRef<BasicBlock *> ReturningBlocks,
StringRef Name);
- bool runOnFunction(Function &F) override;
+  bool run(Function &F, DominatorTree *DT, const PostDominatorTree &PDT,
+ const UniformityInfo &UA);
};
+class AMDGPUUnifyDivergentExitNodes : public FunctionPass {
+public:
+ static char ID;
+ AMDGPUUnifyDivergentExitNodes() : FunctionPass(ID) {
+ initializeAMDGPUUnifyDivergentExitNodesPass(
+ *PassRegistry::getPassRegistry());
+ }
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+ bool runOnFunction(Function &F) override;
+};
} // end anonymous namespace
char AMDGPUUnifyDivergentExitNodes::ID = 0;
char &llvm::AMDGPUUnifyDivergentExitNodesID = AMDGPUUnifyDivergentExitNodes::ID;
INITIALIZE_PASS_BEGIN(AMDGPUUnifyDivergentExitNodes, DEBUG_TYPE,
- "Unify divergent function exit nodes", false, false)
+ "Unify divergent function exit nodes", false, false)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
INITIALIZE_PASS_END(AMDGPUUnifyDivergentExitNodes, DEBUG_TYPE,
"Unify divergent function exit nodes", false, false)
-void AMDGPUUnifyDivergentExitNodes::getAnalysisUsage(AnalysisUsage &AU) const{
+void AMDGPUUnifyDivergentExitNodes::getAnalysisUsage(AnalysisUsage &AU) const {
if (RequireAndPreserveDomTree)
AU.addRequired<DominatorTreeWrapperPass>();
  AU.addRequired<PostDominatorTreeWrapperPass>();
  AU.addRequired<UniformityInfoWrapperPass>();
}
-BasicBlock *AMDGPUUnifyDivergentExitNodes::unifyReturnBlockSet(
+BasicBlock *AMDGPUUnifyDivergentExitNodesImpl::unifyReturnBlockSet(
Function &F, DomTreeUpdater &DTU, ArrayRef<BasicBlock *> ReturningBlocks,
StringRef Name) {
  // Otherwise, we need to insert a new basic block into the function, add a PHI
  // node (if the function returns a value), and convert all of the return
  // instructions into unconditional branches.
return NewRetBlock;
}
-bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
- DominatorTree *DT = nullptr;
- if (RequireAndPreserveDomTree)
- DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
-
- auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
+bool AMDGPUUnifyDivergentExitNodesImpl::run(Function &F, DominatorTree *DT,
+ const PostDominatorTree &PDT,
+ const UniformityInfo &UA) {
if (PDT.root_size() == 0 ||
(PDT.root_size() == 1 &&
!isa<BranchInst>(PDT.getRoot()->getTerminator())))
return false;
- UniformityInfo &UA =
- getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
- TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
-
// Loop over all of the blocks in a function, tracking all of the blocks that
// return.
SmallVector<BasicBlock *, 4> ReturningBlocks;
unifyReturnBlockSet(F, DTU, ReturningBlocks, "UnifiedReturnBlock");
return true;
}
+
+bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
+ DominatorTree *DT = nullptr;
+ if (RequireAndPreserveDomTree)
+ DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ const auto &PDT =
+ getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
+ const auto &UA = getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
+  const auto *TransformInfo =
+      &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+  return AMDGPUUnifyDivergentExitNodesImpl(TransformInfo).run(F, DT, PDT, UA);
+}
+
+PreservedAnalyses
+AMDGPUUnifyDivergentExitNodesPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ DominatorTree *DT = nullptr;
+ if (RequireAndPreserveDomTree)
+ DT = &AM.getResult<DominatorTreeAnalysis>(F);
+
+ const auto &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
+ const auto &UA = AM.getResult<UniformityInfoAnalysis>(F);
+ const auto *TransformInfo = &AM.getResult<TargetIRAnalysis>(F);
+  return AMDGPUUnifyDivergentExitNodesImpl(TransformInfo).run(F, DT, PDT, UA)
+ ? PreservedAnalyses::none()
+ : PreservedAnalyses::all();
+}
--- /dev/null
+//===- AMDGPUUnifyDivergentExitNodes.h ------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a variant of the UnifyFunctionExitNodes pass. Rather than ensuring
+// there is at most one ret and one unreachable instruction, it ensures there is
+// at most one divergent exiting block.
+//
+// StructurizeCFG can't deal with multi-exit regions formed by branches to
+// multiple return nodes. It is not desirable to structurize regions with
+// uniform branches, so unifying those to the same return block as divergent
+// branches inhibits use of scalar branching. It still can't deal with the case
+// where one branch goes to return, and one unreachable. Replace unreachable in
+// this case with a return.
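+//
+// Illustrative example (hypothetical IR, not taken from an existing test):
+// given a divergent branch where one successor returns and the other is
+// unreachable,
+//
+//   entry:
+//     br i1 %divergent.cond, label %ret.block, label %trap.block
+//   ret.block:
+//     ret void
+//   trap.block:
+//     unreachable
+//
+// the 'unreachable' is replaced with a return, and the returning blocks are
+// then merged into a single "UnifiedReturnBlock", leaving one divergent exit
+// for StructurizeCFG to handle.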
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUUNIFYDIVERGENTEXITNODES_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUUNIFYDIVERGENTEXITNODES_H
+
+#include "AMDGPU.h"
+
+namespace llvm {
+class AMDGPUUnifyDivergentExitNodesPass
+ : public PassInfoMixin<AMDGPUUnifyDivergentExitNodesPass> {
+public:
+ PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUUNIFYDIVERGENTEXITNODES_H
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa %s -o - | FileCheck %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -p simplifycfg,amdgpu-unify-divergent-exit-nodes %s -S -o - | FileCheck %s --check-prefix=OPT
+; RUN: llc -mtriple=amdgcn-amd-amdhsa %s -o - | FileCheck %s --check-prefix=ISA
define void @nested_inf_loop(i1 %0, i1 %1) {
-; CHECK-LABEL: nested_inf_loop:
-; CHECK-NEXT: %bb.0: ; %BB
-; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_and_b32_e32 v1, 1, v1
-; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
-; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v1
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; CHECK-NEXT: s_xor_b64 s[6:7], vcc, -1
-; CHECK-NEXT: s_mov_b64 s[8:9], 0
-; CHECK-NEXT: .LBB0_1: ; %BB1
-; CHECK: s_and_b64 s[10:11], exec, s[6:7]
-; CHECK-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9]
-; CHECK-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; CHECK-NEXT: s_cbranch_execnz .LBB0_1
-; CHECK-NEXT: %bb.2: ; %BB2
-; CHECK: s_or_b64 exec, exec, s[8:9]
-; CHECK-NEXT: s_mov_b64 s[8:9], 0
-; CHECK-NEXT: .LBB0_3: ; %BB4
-; CHECK: s_and_b64 s[10:11], exec, s[4:5]
-; CHECK-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9]
-; CHECK-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; CHECK-NEXT: s_cbranch_execnz .LBB0_3
-; CHECK-NEXT: %bb.4: ; %loop.exit.guard
-; CHECK: s_or_b64 exec, exec, s[8:9]
-; CHECK-NEXT: s_mov_b64 vcc, 0
-; CHECK-NEXT: s_mov_b64 s[8:9], 0
-; CHECK-NEXT: s_branch .LBB0_1
-; CHECK-NEXT: %bb.5: ; %DummyReturnBlock
-; CHECK-NEXT: s_setpc_b64 s[30:31]
+; OPT-LABEL: @nested_inf_loop(
+; OPT-NEXT: BB:
+; OPT-NEXT: br label [[BB1:%.*]]
+; OPT: BB1:
+; OPT-NEXT: [[BRMERGE:%.*]] = select i1 [[TMP0:%.*]], i1 true, i1 [[TMP1:%.*]]
+; OPT-NEXT: br i1 [[BRMERGE]], label [[BB1]], label [[INFLOOP:%.*]]
+; OPT: infloop:
+; OPT-NEXT: br i1 true, label [[INFLOOP]], label [[DUMMYRETURNBLOCK:%.*]]
+; OPT: DummyReturnBlock:
+; OPT-NEXT: ret void
+;
+; ISA-LABEL: nested_inf_loop:
+; ISA-NEXT: %bb.0: ; %BB
+; ISA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; ISA-NEXT: v_and_b32_e32 v1, 1, v1
+; ISA-NEXT: v_and_b32_e32 v0, 1, v0
+; ISA-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v1
+; ISA-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; ISA-NEXT: s_xor_b64 s[6:7], vcc, -1
+; ISA-NEXT: s_mov_b64 s[8:9], 0
+; ISA-NEXT: .LBB0_1: ; %BB1
+; ISA: s_and_b64 s[10:11], exec, s[6:7]
+; ISA-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9]
+; ISA-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; ISA-NEXT: s_cbranch_execnz .LBB0_1
+; ISA-NEXT: %bb.2: ; %BB2
+; ISA: s_or_b64 exec, exec, s[8:9]
+; ISA-NEXT: s_mov_b64 s[8:9], 0
+; ISA-NEXT: .LBB0_3: ; %BB4
+; ISA: s_and_b64 s[10:11], exec, s[4:5]
+; ISA-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9]
+; ISA-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; ISA-NEXT: s_cbranch_execnz .LBB0_3
+; ISA-NEXT: %bb.4: ; %loop.exit.guard
+; ISA: s_or_b64 exec, exec, s[8:9]
+; ISA-NEXT: s_mov_b64 vcc, 0
+; ISA-NEXT: s_mov_b64 s[8:9], 0
+; ISA-NEXT: s_branch .LBB0_1
+; ISA-NEXT: %bb.5: ; %DummyReturnBlock
+; ISA-NEXT: s_setpc_b64 s[30:31]
BB:
br label %BB1