From 6ddf2a824da97c81d7963c082c62640e8173b5b0 Mon Sep 17 00:00:00 2001 From: Ivan Kosarev Date: Wed, 27 Apr 2022 14:07:52 +0100 Subject: [PATCH] [AMDGPU] Adjust wave priority based on VMEM instructions to avoid duty-cycling. As older waves execute long sequences of VALU instructions, this may prevent younger waves from address calculation and then issuing their VMEM loads, which in turn leads the VALU unit to idle. This patch tries to prevent this by temporarily raising the wave's priority. Reviewed By: foad Differential Revision: https://reviews.llvm.org/D124246 --- llvm/lib/Target/AMDGPU/AMDGPU.h | 3 + llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp | 166 +++++++++++++++++++++++ llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 6 + llvm/lib/Target/AMDGPU/CMakeLists.txt | 1 + llvm/test/CodeGen/AMDGPU/set-wave-priority.ll | 153 +++++++++++++++++++++ 5 files changed, 329 insertions(+) create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp create mode 100644 llvm/test/CodeGen/AMDGPU/set-wave-priority.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 8926380..33f59ad 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -331,6 +331,9 @@ extern char &GCNNSAReassignID; void initializeGCNPreRAOptimizationsPass(PassRegistry &); extern char &GCNPreRAOptimizationsID; +FunctionPass *createAMDGPUSetWavePriorityPass(); +void initializeAMDGPUSetWavePriorityPass(PassRegistry &); + namespace AMDGPU { enum TargetIndex { TI_CONSTDATA_START, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp new file mode 100644 index 0000000..34702ee --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp @@ -0,0 +1,166 @@ +//===- AMDGPUSetWavePriority.cpp - Set wave priority ----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// Pass to temporarily raise the wave priority from the start of +/// the shader function until its last VMEM instruction to allow younger +/// waves to issue their VMEM instructions as well. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "GCNSubtarget.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIInstrInfo.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/InitializePasses.h" +#include "llvm/Support/Allocator.h" + +using namespace llvm; + +#define DEBUG_TYPE "amdgpu-set-wave-priority" + +namespace { + +struct MBBInfo { + MBBInfo() = default; + bool MayReachVMEMLoad = false; +}; + +using MBBInfoSet = DenseMap<const MachineBasicBlock *, MBBInfo>; + +class AMDGPUSetWavePriority : public MachineFunctionPass { +public: + static char ID; + + AMDGPUSetWavePriority() : MachineFunctionPass(ID) {} + + StringRef getPassName() const override { return "Set wave priority"; } + + bool runOnMachineFunction(MachineFunction &MF) override; + +private: + MachineInstr *BuildSetprioMI(MachineFunction &MF, unsigned priority) const; + + const SIInstrInfo *TII; +}; + +} // End anonymous namespace. + +INITIALIZE_PASS(AMDGPUSetWavePriority, DEBUG_TYPE, "Set wave priority", false, + false) + +char AMDGPUSetWavePriority::ID = 0; + +FunctionPass *llvm::createAMDGPUSetWavePriorityPass() { + return new AMDGPUSetWavePriority(); +} + +MachineInstr *AMDGPUSetWavePriority::BuildSetprioMI(MachineFunction &MF, + unsigned priority) const { + return BuildMI(MF, DebugLoc(), TII->get(AMDGPU::S_SETPRIO)).addImm(priority); +} + +// Checks that for every predecessor Pred that can reach a VMEM load, +// none of Pred's successors can reach a VMEM load.
+static bool CanLowerPriorityDirectlyInPredecessors(const MachineBasicBlock &MBB, + MBBInfoSet &MBBInfos) { + for (const MachineBasicBlock *Pred : MBB.predecessors()) { + if (!MBBInfos[Pred].MayReachVMEMLoad) + continue; + for (const MachineBasicBlock *Succ : Pred->successors()) { + if (MBBInfos[Succ].MayReachVMEMLoad) + return false; + } + } + return true; +} + +static bool isVMEMLoad(const MachineInstr &MI) { + return SIInstrInfo::isVMEM(MI) && MI.mayLoad(); +} + +bool AMDGPUSetWavePriority::runOnMachineFunction(MachineFunction &MF) { + const unsigned HighPriority = 3; + const unsigned LowPriority = 0; + + Function &F = MF.getFunction(); + if (skipFunction(F) || !AMDGPU::isEntryFunctionCC(F.getCallingConv())) + return false; + + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + TII = ST.getInstrInfo(); + + MBBInfoSet MBBInfos; + SmallVector<const MachineBasicBlock *, 16> Worklist; + for (MachineBasicBlock &MBB : MF) { + if (any_of(MBB, isVMEMLoad)) + Worklist.push_back(&MBB); + } + + // Mark blocks from which control may reach VMEM loads. + while (!Worklist.empty()) { + const MachineBasicBlock *MBB = Worklist.pop_back_val(); + MBBInfo &Info = MBBInfos[MBB]; + if (!Info.MayReachVMEMLoad) { + Info.MayReachVMEMLoad = true; + Worklist.append(MBB->pred_begin(), MBB->pred_end()); + } + } + + MachineBasicBlock &Entry = MF.front(); + if (!MBBInfos[&Entry].MayReachVMEMLoad) + return false; + + // Raise the priority at the beginning of the shader. + MachineBasicBlock::iterator I = Entry.begin(), E = Entry.end(); + while (I != E && !SIInstrInfo::isVALU(*I) && !I->isTerminator()) + ++I; + Entry.insert(I, BuildSetprioMI(MF, HighPriority)); + + // Lower the priority on edges where control leaves blocks from which + // VMEM loads are reachable.
+ SmallSet<MachineBasicBlock *, 16> PriorityLoweringBlocks; + for (MachineBasicBlock &MBB : MF) { + if (MBBInfos[&MBB].MayReachVMEMLoad) { + if (MBB.succ_empty()) + PriorityLoweringBlocks.insert(&MBB); + continue; + } + + if (CanLowerPriorityDirectlyInPredecessors(MBB, MBBInfos)) { + for (MachineBasicBlock *Pred : MBB.predecessors()) { + if (MBBInfos[Pred].MayReachVMEMLoad) + PriorityLoweringBlocks.insert(Pred); + } + continue; + } + + // Where lowering the priority in predecessors is not possible, the + // block receiving control either was not part of a loop in the first + // place or the loop simplification/canonicalization pass should have + // already tried to split the edge and insert a preheader, and if for + // whatever reason it failed to do so, then this leaves us with the + // only option of lowering the priority within the loop. + PriorityLoweringBlocks.insert(&MBB); + } + + for (MachineBasicBlock *MBB : PriorityLoweringBlocks) { + MachineBasicBlock::iterator I = MBB->end(), B = MBB->begin(); + while (I != B) { + if (isVMEMLoad(*--I)) { + ++I; + break; + } + } + MBB->insert(I, BuildSetprioMI(MF, LowPriority)); + } + + return true; +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 8db5d63..afc8fd3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -277,6 +277,10 @@ EnableDCEInRA("amdgpu-dce-in-ra", cl::init(true), cl::Hidden, cl::desc("Enable machine DCE inside regalloc")); +static cl::opt<bool> EnableSetWavePriority("amdgpu-set-wave-priority", + cl::desc("Adjust wave priority"), + cl::init(false), cl::Hidden); + static cl::opt<bool> EnableScalarIRPasses( "amdgpu-scalar-ir-passes", cl::desc("Enable scalar IR passes"), @@ -1360,6 +1364,8 @@ void GCNPassConfig::addPreEmitPass() { addPass(&SIInsertHardClausesID); addPass(&SILateBranchLoweringPassID); + if (isPassEnabled(EnableSetWavePriority, CodeGenOpt::Less)) + addPass(createAMDGPUSetWavePriorityPass()); if
(getOptLevel() > CodeGenOpt::None) addPass(&SIPreEmitPeepholeID); // The hazard recognizer that runs as part of the post-ra scheduler does not diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index 586c707..39685c3 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -89,6 +89,7 @@ add_llvm_target(AMDGPUCodeGen AMDGPUReplaceLDSUseWithPointer.cpp AMDGPUResourceUsageAnalysis.cpp AMDGPURewriteOutArguments.cpp + AMDGPUSetWavePriority.cpp AMDGPUSubtarget.cpp AMDGPUTargetMachine.cpp AMDGPUTargetObjectFile.cpp diff --git a/llvm/test/CodeGen/AMDGPU/set-wave-priority.ll b/llvm/test/CodeGen/AMDGPU/set-wave-priority.ll new file mode 100644 index 0000000..ed7dc62 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/set-wave-priority.ll @@ -0,0 +1,153 @@ +; RUN: llc -mtriple=amdgcn -amdgpu-set-wave-priority=true -o - %s | \ +; RUN: FileCheck %s + +; CHECK-LABEL: no_setprio: +; CHECK-NOT: s_setprio +; CHECK: ; return to shader part epilog +define amdgpu_ps <2 x float> @no_setprio() { + ret <2 x float> <float 0.0, float 0.0> +} + +; CHECK-LABEL: vmem_in_exit_block: +; CHECK: s_setprio 3 +; CHECK: buffer_load_dwordx2 +; CHECK-NEXT: s_setprio 0 +; CHECK: ; return to shader part epilog +define amdgpu_ps <2 x float> @vmem_in_exit_block(<4 x i32> inreg %p) { + %v = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 0, i32 0) + ret <2 x float> %v +} + +; CHECK-LABEL: branch: +; CHECK: s_setprio 3 +; CHECK: s_cbranch_scc0 [[A:.*]] +; CHECK: {{.*}}: ; %b +; CHECK: buffer_load_dwordx2 +; CHECK-NEXT: s_setprio 0 +; CHECK: s_branch [[EXIT:.*]] +; CHECK: [[A]]: ; %a +; CHECK-NEXT: s_setprio 0 +; CHECK: s_branch [[EXIT]] +; CHECK-NEXT: [[EXIT]]: +define amdgpu_ps <2 x float> @branch(<4 x i32> inreg %p, i32 inreg %i) { + %cond = icmp eq i32 %i, 0 + br i1 %cond, label %a, label %b + +a: + ret <2 x float> <float 0.0, float 0.0> + +b: + %v = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32
0, i32 0) + ret <2 x float> %v +} + +; CHECK-LABEL: setprio_follows_setprio: +; CHECK: s_setprio 3 +; CHECK: buffer_load_dwordx2 +; CHECK: s_cbranch_scc1 [[C:.*]] +; CHECK: {{.*}}: ; %a +; CHECK: buffer_load_dwordx2 +; CHECK-NEXT: s_setprio 0 +; CHECK: s_cbranch_scc1 [[C]] +; CHECK: {{.*}}: ; %b +; CHECK-NOT: s_setprio +; CHECK: s_branch [[EXIT:.*]] +; CHECK: [[C]]: ; %c +; CHECK-NEXT: s_setprio 0 +; CHECK: s_branch [[EXIT]] +; CHECK: [[EXIT]]: +define amdgpu_ps <2 x float> @setprio_follows_setprio(<4 x i32> inreg %p, i32 inreg %i) { +entry: + %v1 = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 0, i32 0) + %cond1 = icmp ne i32 %i, 0 + br i1 %cond1, label %a, label %c + +a: + %v2 = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 1, i32 0) + %cond2 = icmp ne i32 %i, 1 + br i1 %cond2, label %b, label %c + +b: + ret <2 x float> %v2 + +c: + %v3 = phi <2 x float> [%v1, %entry], [%v2, %a] + %v4 = fadd <2 x float> %v1, %v3 + ret <2 x float> %v4 +} + +; CHECK-LABEL: loop: +; CHECK: {{.*}}: ; %entry +; CHECK: s_setprio 3 +; CHECK-NOT: s_setprio +; CHECK: [[LOOP:.*]]: ; %loop +; CHECK-NOT: s_setprio +; CHECK: buffer_load_dwordx2 +; CHECK-NOT: s_setprio +; CHECK: s_cbranch_scc1 [[LOOP]] +; CHECK-NEXT: {{.*}}: ; %exit +; CHECK-NEXT: s_setprio 0 +define amdgpu_ps <2 x float> @loop(<4 x i32> inreg %p) { +entry: + br label %loop + +loop: + %i = phi i32 [0, %entry], [%i2, %loop] + %sum = phi <2 x float> [<float 0.0, float 0.0>, %entry], [%sum2, %loop] + + %i2 = add i32 %i, 1 + + %v = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 %i, i32 0, i32 0, i32 0) + %sum2 = fadd <2 x float> %sum, %v + + %cond = icmp ult i32 %i2, 5 + br i1 %cond, label %loop, label %exit + +exit: + ret <2 x float> %sum2 +} + +; CHECK-LABEL: edge_split: +; CHECK: s_setprio 3 +; CHECK: buffer_load_dwordx2 +; CHECK-NOT: s_setprio +; CHECK: s_cbranch_scc1 [[ANOTHER_LOAD:.*]] +; CHECK: {{.*}}: ; %loop.preheader +; CHECK-NEXT:
s_setprio 0 +; CHECK: [[LOOP:.*]]: ; %loop +; CHECK-NOT: s_setprio +; CHECK: s_cbranch_scc1 [[LOOP]] +; CHECK: {{.*}}: ; %exit +; CHECK-NOT: s_setprio +; CHECK: s_branch [[RET:.*]] +; CHECK: [[ANOTHER_LOAD]]: ; %another_load +; CHECK: buffer_load_dwordx2 +; CHECK-NEXT: s_setprio 0 +; CHECK: s_branch [[RET]] +; CHECK: [[RET]]: +define amdgpu_ps <2 x float> @edge_split(<4 x i32> inreg %p, i32 inreg %x) { +entry: + %v = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 0, i32 0) + %cond = icmp ne i32 %x, 0 + br i1 %cond, label %loop, label %another_load + +loop: + %i = phi i32 [0, %entry], [%i2, %loop] + %mul = phi <2 x float> [%v, %entry], [%mul2, %loop] + + %i2 = add i32 %i, 1 + %mul2 = fmul <2 x float> %mul, %v + + %cond2 = icmp ult i32 %i2, 5 + br i1 %cond2, label %loop, label %exit + +exit: + ret <2 x float> %mul2 + +another_load: + %v2 = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 1, i32 0) + %sum = fadd <2 x float> %v, %v2 + ret <2 x float> %sum +} + +declare <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32>, i32, i32, i32, i32) nounwind -- 2.7.4