From 6ddf2a824da97c81d7963c082c62640e8173b5b0 Mon Sep 17 00:00:00 2001
From: Ivan Kosarev <ivan.kosarev@amd.com>
Date: Wed, 27 Apr 2022 14:07:52 +0100
Subject: [PATCH] [AMDGPU] Adjust wave priority based on VMEM instructions to
 avoid duty-cycling.

When older waves execute long sequences of VALU instructions, they may
prevent younger waves from calculating addresses and issuing their VMEM
loads, which in turn leaves the VALU unit idle. This patch tries to
avoid that by temporarily raising the wave's priority.

Reviewed By: foad

Differential Revision: https://reviews.llvm.org/D124246
---
 llvm/lib/Target/AMDGPU/AMDGPU.h                  |   3 +
 llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp | 166 +++++++++++++++++++++++
 llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp   |   6 +
 llvm/lib/Target/AMDGPU/CMakeLists.txt            |   1 +
 llvm/test/CodeGen/AMDGPU/set-wave-priority.ll    | 153 +++++++++++++++++++++
 5 files changed, 329 insertions(+)
 create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp
 create mode 100644 llvm/test/CodeGen/AMDGPU/set-wave-priority.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 8926380..33f59ad 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -331,6 +331,9 @@ extern char &GCNNSAReassignID;
 void initializeGCNPreRAOptimizationsPass(PassRegistry &);
 extern char &GCNPreRAOptimizationsID;
 
+FunctionPass *createAMDGPUSetWavePriorityPass();
+void initializeAMDGPUSetWavePriorityPass(PassRegistry &);
+
 namespace AMDGPU {
 enum TargetIndex {
   TI_CONSTDATA_START,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp
new file mode 100644
index 0000000..34702ee
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp
@@ -0,0 +1,166 @@
+//===- AMDGPUSetWavePriority.cpp - Set wave priority ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Pass to temporarily raise the wave priority from the start of the
+/// shader function until its last VMEM load instructions, to allow
+/// younger waves to issue their VMEM instructions as well.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIInstrInfo.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/Allocator.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-set-wave-priority"
+
+namespace {
+// Per-basic-block information gathered by runOnMachineFunction().
+struct MBBInfo {
+  MBBInfo() = default;
+  bool MayReachVMEMLoad = false; // Block has or may reach a VMEM load.
+};
+
+using MBBInfoSet = DenseMap<const MachineBasicBlock *, MBBInfo>;
+// Machine function pass that inserts S_SETPRIO instructions around VMEM loads.
+class AMDGPUSetWavePriority : public MachineFunctionPass {
+public:
+  static char ID;
+
+  AMDGPUSetWavePriority() : MachineFunctionPass(ID) {}
+
+  StringRef getPassName() const override { return "Set wave priority"; }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+  MachineInstr *BuildSetprioMI(MachineFunction &MF, unsigned Priority) const;
+
+  const SIInstrInfo *TII = nullptr; // Assigned in runOnMachineFunction().
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS(AMDGPUSetWavePriority, DEBUG_TYPE, "Set wave priority", false,
+                false)
+
+char AMDGPUSetWavePriority::ID = 0; // Definition of the static pass ID.
+
+FunctionPass *llvm::createAMDGPUSetWavePriorityPass() {
+  return new AMDGPUSetWavePriority(); // Returns a heap-allocated instance.
+}
+// Creates an S_SETPRIO with the given immediate; callers insert it themselves.
+MachineInstr *AMDGPUSetWavePriority::BuildSetprioMI(MachineFunction &MF,
+                                                    unsigned Priority) const {
+  return BuildMI(MF, DebugLoc(), TII->get(AMDGPU::S_SETPRIO)).addImm(Priority);
+}
+
+// Returns true if the priority can be lowered directly in MBB's predecessors:
+// for every predecessor Pred that may reach a VMEM load, none of Pred's
+// successors may reach a VMEM load. Lookups do not modify MBBInfos.
+static bool CanLowerPriorityDirectlyInPredecessors(
+    const MachineBasicBlock &MBB, const MBBInfoSet &MBBInfos) {
+  auto MayReachVMEMLoad = [&MBBInfos](const MachineBasicBlock *BB) {
+    return MBBInfos.lookup(BB).MayReachVMEMLoad;
+  };
+  for (const MachineBasicBlock *Pred : MBB.predecessors()) {
+    if (MayReachVMEMLoad(Pred) && any_of(Pred->successors(), MayReachVMEMLoad))
+      return false;
+  }
+  return true;
+}
+// Returns true if MI is a VMEM instruction that may load from memory.
+static bool isVMEMLoad(const MachineInstr &MI) {
+  return SIInstrInfo::isVMEM(MI) && MI.mayLoad();
+}
+
+// Raises the wave priority in the entry block of entry functions that may
+// reach VMEM loads and lowers it where control can no longer reach one.
+// Returns true if any S_SETPRIO instruction was inserted.
+bool AMDGPUSetWavePriority::runOnMachineFunction(MachineFunction &MF) {
+  const unsigned HighPriority = 3;
+  const unsigned LowPriority = 0;
+
+  Function &F = MF.getFunction();
+  if (skipFunction(F) || !AMDGPU::isEntryFunctionCC(F.getCallingConv()))
+    return false;
+
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  TII = ST.getInstrInfo();
+
+  MBBInfoSet MBBInfos;
+  SmallVector<const MachineBasicBlock *, 16> Worklist;
+  for (MachineBasicBlock &MBB : MF) {
+    if (any_of(MBB, isVMEMLoad))
+      Worklist.push_back(&MBB);
+  }
+
+  // Mark blocks from which control may reach VMEM loads.
+  while (!Worklist.empty()) {
+    const MachineBasicBlock *MBB = Worklist.pop_back_val();
+    MBBInfo &Info = MBBInfos[MBB];
+    if (!Info.MayReachVMEMLoad) {
+      Info.MayReachVMEMLoad = true;
+      Worklist.append(MBB->pred_begin(), MBB->pred_end());
+    }
+  }
+
+  MachineBasicBlock &Entry = MF.front();
+  if (!MBBInfos[&Entry].MayReachVMEMLoad)
+    return false;
+
+  // Raise the priority at the beginning of the shader, before its first VALU
+  // instruction. Also stop at the first VMEM load: the lowering S_SETPRIO may
+  // be placed right after a load in this block, and raising the priority only
+  // after that point would leave it raised for the rest of the shader.
+  MachineBasicBlock::iterator I = Entry.begin(), E = Entry.end();
+  while (I != E && !SIInstrInfo::isVALU(*I) && !isVMEMLoad(*I) &&
+         !I->isTerminator())
+    ++I;
+  Entry.insert(I, BuildSetprioMI(MF, HighPriority));
+
+  // Lower the priority on edges where control leaves blocks from which
+  // VMEM loads are reachable.
+  SmallSet<MachineBasicBlock *, 16> PriorityLoweringBlocks;
+  for (MachineBasicBlock &MBB : MF) {
+    if (MBBInfos[&MBB].MayReachVMEMLoad) {
+      if (MBB.succ_empty())
+        PriorityLoweringBlocks.insert(&MBB);
+      continue;
+    }
+
+    if (CanLowerPriorityDirectlyInPredecessors(MBB, MBBInfos)) {
+      for (MachineBasicBlock *Pred : MBB.predecessors()) {
+        if (MBBInfos[Pred].MayReachVMEMLoad)
+          PriorityLoweringBlocks.insert(Pred);
+      }
+      continue;
+    }
+
+    // Lowering in the predecessors is not possible here, e.g. because an edge
+    // into a loop could not be split, so lower the priority in MBB itself.
+    PriorityLoweringBlocks.insert(&MBB);
+  }
+
+  // Lower the priority after each block's last VMEM load or at its start.
+  for (MachineBasicBlock *MBB : PriorityLoweringBlocks) {
+    MachineBasicBlock::iterator I = MBB->end(), B = MBB->begin();
+    while (I != B && !isVMEMLoad(*std::prev(I)))
+      --I;
+    MBB->insert(I, BuildSetprioMI(MF, LowPriority));
+  }
+
+  return true;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 8db5d63..afc8fd3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -277,6 +277,10 @@ EnableDCEInRA("amdgpu-dce-in-ra",
     cl::init(true), cl::Hidden,
     cl::desc("Enable machine DCE inside regalloc"));
 
+static cl::opt<bool> EnableSetWavePriority("amdgpu-set-wave-priority",
+                                           cl::desc("Adjust wave priority"),
+                                           cl::init(false), cl::Hidden);
+
 static cl::opt<bool> EnableScalarIRPasses(
   "amdgpu-scalar-ir-passes",
   cl::desc("Enable scalar IR passes"),
@@ -1360,6 +1364,8 @@ void GCNPassConfig::addPreEmitPass() {
     addPass(&SIInsertHardClausesID);
 
   addPass(&SILateBranchLoweringPassID);
+  if (isPassEnabled(EnableSetWavePriority, CodeGenOpt::Less))
+    addPass(createAMDGPUSetWavePriorityPass());
   if (getOptLevel() > CodeGenOpt::None)
     addPass(&SIPreEmitPeepholeID);
   // The hazard recognizer that runs as part of the post-ra scheduler does not
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 586c707..39685c3 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -89,6 +89,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUReplaceLDSUseWithPointer.cpp
   AMDGPUResourceUsageAnalysis.cpp
   AMDGPURewriteOutArguments.cpp
+  AMDGPUSetWavePriority.cpp
   AMDGPUSubtarget.cpp
   AMDGPUTargetMachine.cpp
   AMDGPUTargetObjectFile.cpp
diff --git a/llvm/test/CodeGen/AMDGPU/set-wave-priority.ll b/llvm/test/CodeGen/AMDGPU/set-wave-priority.ll
new file mode 100644
index 0000000..ed7dc62
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/set-wave-priority.ll
@@ -0,0 +1,153 @@
+; RUN: llc -mtriple=amdgcn -amdgpu-set-wave-priority=true -o - %s | \
+; RUN:   FileCheck %s
+
+; CHECK-LABEL: no_setprio:
+; CHECK-NOT:       s_setprio
+; CHECK:           ; return to shader part epilog
+define amdgpu_ps <2 x float> @no_setprio() {
+  ret <2 x float> <float 0.0, float 0.0>
+}
+
+; CHECK-LABEL: vmem_in_exit_block:
+; CHECK:           s_setprio 3
+; CHECK:           buffer_load_dwordx2
+; CHECK-NEXT:      s_setprio 0
+; CHECK:           ; return to shader part epilog
+define amdgpu_ps <2 x float> @vmem_in_exit_block(<4 x i32> inreg %p) {
+  %v = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 0, i32 0)
+  ret <2 x float> %v
+}
+
+; CHECK-LABEL: branch:
+; CHECK:           s_setprio 3
+; CHECK:           s_cbranch_scc0 [[A:.*]]
+; CHECK:       {{.*}}:  ; %b
+; CHECK:           buffer_load_dwordx2
+; CHECK-NEXT:      s_setprio 0
+; CHECK:           s_branch [[EXIT:.*]]
+; CHECK:       [[A]]:  ; %a
+; CHECK-NEXT:      s_setprio 0
+; CHECK:           s_branch [[EXIT]]
+; CHECK-NEXT:  [[EXIT]]:
+define amdgpu_ps <2 x float> @branch(<4 x i32> inreg %p, i32 inreg %i) {
+  %cond = icmp eq i32 %i, 0
+  br i1 %cond, label %a, label %b
+
+a:
+  ret <2 x float> <float 0.0, float 0.0>
+
+b:
+  %v = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 0, i32 0)
+  ret <2 x float> %v
+}
+
+; CHECK-LABEL: setprio_follows_setprio:
+; CHECK:           s_setprio 3
+; CHECK:           buffer_load_dwordx2
+; CHECK:           s_cbranch_scc1 [[C:.*]]
+; CHECK:       {{.*}}:  ; %a
+; CHECK:           buffer_load_dwordx2
+; CHECK-NEXT:      s_setprio 0
+; CHECK:           s_cbranch_scc1 [[C]]
+; CHECK:       {{.*}}:  ; %b
+; CHECK-NOT:       s_setprio
+; CHECK:           s_branch [[EXIT:.*]]
+; CHECK:       [[C]]:  ; %c
+; CHECK-NEXT:      s_setprio 0
+; CHECK:           s_branch [[EXIT]]
+; CHECK:       [[EXIT]]:
+define amdgpu_ps <2 x float> @setprio_follows_setprio(<4 x i32> inreg %p, i32 inreg %i) {
+entry:
+  %v1 = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 0, i32 0)
+  %cond1 = icmp ne i32 %i, 0
+  br i1 %cond1, label %a, label %c
+
+a:
+  %v2 = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 1, i32 0)
+  %cond2 = icmp ne i32 %i, 1
+  br i1 %cond2, label %b, label %c
+
+b:
+  ret <2 x float> %v2
+
+c:
+  %v3 = phi <2 x float> [%v1, %entry], [%v2, %a]
+  %v4 = fadd <2 x float> %v1, %v3
+  ret <2 x float> %v4
+}
+
+; CHECK-LABEL: loop:
+; CHECK:       {{.*}}:  ; %entry
+; CHECK:           s_setprio 3
+; CHECK-NOT:       s_setprio
+; CHECK:       [[LOOP:.*]]:  ; %loop
+; CHECK-NOT:       s_setprio
+; CHECK:           buffer_load_dwordx2
+; CHECK-NOT:       s_setprio
+; CHECK:           s_cbranch_scc1 [[LOOP]]
+; CHECK-NEXT:  {{.*}}:  ; %exit
+; CHECK-NEXT:      s_setprio 0
+define amdgpu_ps <2 x float> @loop(<4 x i32> inreg %p) {
+entry:
+  br label %loop
+
+loop:
+  %i = phi i32 [0, %entry], [%i2, %loop]
+  %sum = phi <2 x float> [<float 0.0, float 0.0>, %entry], [%sum2, %loop]
+
+  %i2 = add i32 %i, 1
+
+  %v = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 %i, i32 0, i32 0, i32 0)
+  %sum2 = fadd <2 x float> %sum, %v
+
+  %cond = icmp ult i32 %i2, 5
+  br i1 %cond, label %loop, label %exit
+
+exit:
+  ret <2 x float> %sum2
+}
+
+; CHECK-LABEL: edge_split:
+; CHECK:           s_setprio 3
+; CHECK:           buffer_load_dwordx2
+; CHECK-NOT:       s_setprio
+; CHECK:           s_cbranch_scc1 [[ANOTHER_LOAD:.*]]
+; CHECK:       {{.*}}:  ; %loop.preheader
+; CHECK-NEXT:      s_setprio 0
+; CHECK:       [[LOOP:.*]]:  ; %loop
+; CHECK-NOT:       s_setprio
+; CHECK:           s_cbranch_scc1 [[LOOP]]
+; CHECK:       {{.*}}:  ; %exit
+; CHECK-NOT:       s_setprio
+; CHECK:           s_branch [[RET:.*]]
+; CHECK:       [[ANOTHER_LOAD]]:  ; %another_load
+; CHECK:           buffer_load_dwordx2
+; CHECK-NEXT:      s_setprio 0
+; CHECK:           s_branch [[RET]]
+; CHECK:       [[RET]]:
+define amdgpu_ps <2 x float> @edge_split(<4 x i32> inreg %p, i32 inreg %x) {
+entry:
+  %v = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 0, i32 0)
+  %cond = icmp ne i32 %x, 0
+  br i1 %cond, label %loop, label %another_load
+
+loop:
+  %i = phi i32 [0, %entry], [%i2, %loop]
+  %mul = phi <2 x float> [%v, %entry], [%mul2, %loop]
+
+  %i2 = add i32 %i, 1
+  %mul2 = fmul <2 x float> %mul, %v
+
+  %cond2 = icmp ult i32 %i2, 5
+  br i1 %cond2, label %loop, label %exit
+
+exit:
+  ret <2 x float> %mul2
+
+another_load:
+  %v2 = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 1, i32 0)
+  %sum = fadd <2 x float> %v, %v2
+  ret <2 x float> %sum
+}
+
+declare <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32>, i32, i32, i32, i32) nounwind
-- 
2.7.4