[AMDGPU] Add more expressive sched_barrier controls

author Austin Kerbow <Austin.Kerbow@amd.com>

Fri, 3 Jun 2022 18:35:47 +0000 (11:35 -0700)

committer Austin Kerbow <Austin.Kerbow@amd.com>

Wed, 15 Jun 2022 05:03:05 +0000 (22:03 -0700)
author Austin Kerbow <Austin.Kerbow@amd.com>
Fri, 3 Jun 2022 18:35:47 +0000 (11:35 -0700)
committer Austin Kerbow <Austin.Kerbow@amd.com>
Wed, 15 Jun 2022 05:03:05 +0000 (22:03 -0700)
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td

index 9496163..24a6430 100644 (file)
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -220,12 +220,21 @@ def int_amdgcn_wave_barrier : GCCBuiltin<"__builtin_amdgcn_wave_barrier">,
  
  // The 1st parameter is a mask for the types of instructions that may be allowed
  // to cross the SCHED_BARRIER during scheduling.
-//     MASK = 0: No instructions may be scheduled across SCHED_BARRIER.
-//     MASK = 1: Non-memory, non-side-effect producing instructions may be
-//               scheduled across SCHED_BARRIER, i.e. allow ALU instructions to pass.
+//     MASK = 0x0000 0000: No instructions may be scheduled across SCHED_BARRIER.
+//     MASK = 0x0000 0001: ALL, non-memory, non-side-effect producing instructions may be
+//                         scheduled across SCHED_BARRIER, i.e. allow ALU instructions to pass.
+//     MASK = 0x0000 0002: VALU instructions may be scheduled across SCHED_BARRIER.
+//     MASK = 0x0000 0004: SALU instructions may be scheduled across SCHED_BARRIER.
+//     MASK = 0x0000 0008: MFMA instructions may be scheduled across SCHED_BARRIER.
+//     MASK = 0x0000 0010: ALL VMEM instructions may be scheduled across SCHED_BARRIER.
+//     MASK = 0x0000 0020: VMEM read instructions may be scheduled across SCHED_BARRIER.
+//     MASK = 0x0000 0040: VMEM write instructions may be scheduled across SCHED_BARRIER.
+//     MASK = 0x0000 0080: ALL DS instructions may be scheduled across SCHED_BARRIER.
+//     MASK = 0x0000 0100: ALL DS read instructions may be scheduled accoss SCHED_BARRIER.
+//     MASK = 0x0000 0200: ALL DS write instructions may be scheduled across SCHED_BARRIER.
  def int_amdgcn_sched_barrier : GCCBuiltin<"__builtin_amdgcn_sched_barrier">,
-  Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem,
-                                IntrHasSideEffects, IntrConvergent, IntrWillReturn]>;
+  Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent,
+                                IntrWillReturn]>;
  
  def int_amdgcn_s_waitcnt : GCCBuiltin<"__builtin_amdgcn_s_waitcnt">,
    Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrWillReturn]>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp

new file mode 100644 (file)

index 0000000..2b4ce7a
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -0,0 +1,439 @@
+//===--- AMDGPUIGroupLP.cpp - AMDGPU IGroupLP  ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// \file This file defines a set of schedule DAG mutations that can be used to
+// override default scheduler behavior to enforce specific scheduling patterns.
+// They should be used in cases where runtime performance considerations such as
+// inter-wavefront interactions, mean that compile-time heuristics cannot
+// predict the optimal instruction ordering, or in kernels where optimum
+// instruction scheduling is important enough to warrant manual intervention.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUIGroupLP.h"
+#include "AMDGPUTargetMachine.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/ADT/BitmaskEnum.h"
+#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "machine-scheduler"
+
+namespace {
+
+static cl::opt<bool>
+    EnableIGroupLP("amdgpu-igrouplp",
+                   cl::desc("Enable construction of Instruction Groups and "
+                            "their ordering for scheduling"),
+                   cl::init(false));
+
+static cl::opt<Optional<unsigned>>
+    VMEMGroupMaxSize("amdgpu-igrouplp-vmem-group-size", cl::init(None),
+                     cl::Hidden,
+                     cl::desc("The maximum number of instructions to include "
+                              "in VMEM group."));
+
+static cl::opt<Optional<unsigned>>
+    MFMAGroupMaxSize("amdgpu-igrouplp-mfma-group-size", cl::init(None),
+                     cl::Hidden,
+                     cl::desc("The maximum number of instructions to include "
+                              "in MFMA group."));
+
+static cl::opt<Optional<unsigned>>
+    LDRGroupMaxSize("amdgpu-igrouplp-ldr-group-size", cl::init(None),
+                    cl::Hidden,
+                    cl::desc("The maximum number of instructions to include "
+                             "in lds/gds read group."));
+
+static cl::opt<Optional<unsigned>>
+    LDWGroupMaxSize("amdgpu-igrouplp-ldw-group-size", cl::init(None),
+                    cl::Hidden,
+                    cl::desc("The maximum number of instructions to include "
+                             "in lds/gds write group."));
+
+typedef function_ref<bool(const MachineInstr &, const SIInstrInfo *)>
+    CanAddMIFn;
+
+// Classify instructions into groups to enable fine tuned control over the
+// scheduler. These groups may be more specific than current SchedModel
+// instruction classes.
+class SchedGroup {
+private:
+  // Function that returns true if a non-bundle MI may be inserted into this
+  // group.
+  const CanAddMIFn canAddMI;
+
+  // Maximum number of SUnits that can be added to this group.
+  Optional<unsigned> MaxSize;
+
+  // Collection of SUnits that are classified as members of this group.
+  SmallVector<SUnit *, 32> Collection;
+
+  ScheduleDAGInstrs *DAG;
+
+  void tryAddEdge(SUnit *A, SUnit *B) {
+    if (A != B && DAG->canAddEdge(B, A)) {
+      DAG->addEdge(B, SDep(A, SDep::Artificial));
+      LLVM_DEBUG(dbgs() << "Adding edge...\n"
+                        << "from: SU(" << A->NodeNum << ") " << *A->getInstr()
+                        << "to: SU(" << B->NodeNum << ") " << *B->getInstr());
+    }
+  }
+
+public:
+  // Add DAG dependencies from all SUnits in this SchedGroup and this SU. If
+  // MakePred is true, SU will be a predecessor of the SUnits in this
+  // SchedGroup, otherwise SU will be a successor.
+  void link(SUnit &SU, bool MakePred = false) {
+    for (auto A : Collection) {
+      SUnit *B = &SU;
+      if (MakePred)
+        std::swap(A, B);
+
+      tryAddEdge(A, B);
+    }
+  }
+
+  // Add DAG dependencies from all SUnits in this SchedGroup and this SU. Use
+  // the predicate to determine whether SU should be a predecessor (P = true)
+  // or a successor (P = false) of this SchedGroup.
+  void link(SUnit &SU, function_ref<bool(const SUnit *A, const SUnit *B)> P) {
+    for (auto A : Collection) {
+      SUnit *B = &SU;
+      if (P(A, B))
+        std::swap(A, B);
+
+      tryAddEdge(A, B);
+    }
+  }
+
+  // Add DAG dependencies such that SUnits in this group shall be ordered
+  // before SUnits in OtherGroup.
+  void link(SchedGroup &OtherGroup) {
+    for (auto B : OtherGroup.Collection)
+      link(*B);
+  }
+
+  // Returns true if no more instructions may be added to this group.
+  bool isFull() { return MaxSize.hasValue() && Collection.size() >= *MaxSize; }
+
+  // Returns true if SU can be added to this SchedGroup.
+  bool canAddSU(SUnit &SU, const SIInstrInfo *TII) {
+    if (isFull())
+      return false;
+
+    MachineInstr &MI = *SU.getInstr();
+    if (MI.getOpcode() != TargetOpcode::BUNDLE)
+      return canAddMI(MI, TII);
+
+    // Special case for bundled MIs.
+    const MachineBasicBlock *MBB = MI.getParent();
+    MachineBasicBlock::instr_iterator B = MI.getIterator(), E = ++B;
+    while (E != MBB->end() && E->isBundledWithPred())
+      ++E;
+
+    // Return true if all of the bundled MIs can be added to this group.
+    return std::all_of(
+        B, E, [this, TII](MachineInstr &MI) { return canAddMI(MI, TII); });
+  }
+
+  void add(SUnit &SU) { Collection.push_back(&SU); }
+
+  SchedGroup(CanAddMIFn canAddMI, Optional<unsigned> MaxSize,
+             ScheduleDAGInstrs *DAG)
+      : canAddMI(canAddMI), MaxSize(MaxSize), DAG(DAG) {}
+};
+
+bool isMFMASGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
+  return TII->isMFMA(MI);
+}
+
+bool isVALUSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
+  return TII->isVALU(MI) && !TII->isMFMA(MI);
+}
+
+bool isSALUSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
+  return TII->isSALU(MI);
+}
+
+bool isVMEMSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
+  return TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI));
+}
+
+bool isVMEMReadSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
+  return MI.mayLoad() &&
+         (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI)));
+}
+
+bool isVMEMWriteSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
+  return MI.mayStore() &&
+         (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI)));
+}
+
+bool isDSWriteSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
+  return MI.mayStore() && TII->isDS(MI);
+}
+
+bool isDSReadSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
+  return MI.mayLoad() && TII->isDS(MI);
+}
+
+class IGroupLPDAGMutation : public ScheduleDAGMutation {
+public:
+  const SIInstrInfo *TII;
+  ScheduleDAGMI *DAG;
+
+  IGroupLPDAGMutation() = default;
+  void apply(ScheduleDAGInstrs *DAGInstrs) override;
+};
+
+// DAG mutation that coordinates with the SCHED_BARRIER instruction and
+// corresponding builtin. The mutation adds edges from specific instruction
+// classes determined by the SCHED_BARRIER mask so that they cannot be
+// scheduled around the SCHED_BARRIER.
+class SchedBarrierDAGMutation : public ScheduleDAGMutation {
+private:
+  const SIInstrInfo *TII;
+
+  ScheduleDAGMI *DAG;
+
+  // Components of the mask that determines which instructions may not be
+  // scheduled across the SCHED_BARRIER.
+  enum class SchedBarrierMasks {
+    NONE = 0u,
+    ALU = 1u << 0,
+    VALU = 1u << 1,
+    SALU = 1u << 2,
+    MFMA = 1u << 3,
+    VMEM = 1u << 4,
+    VMEM_READ = 1u << 5,
+    VMEM_WRITE = 1u << 6,
+    DS = 1u << 7,
+    DS_READ = 1u << 8,
+    DS_WRITE = 1u << 9,
+    LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ DS_WRITE)
+  };
+
+  // Cache SchedGroups of each type if we have multiple SCHED_BARRIERs in a
+  // region.
+  //
+  std::unique_ptr<SchedGroup> MFMASchedGroup = nullptr;
+  std::unique_ptr<SchedGroup> VALUSchedGroup = nullptr;
+  std::unique_ptr<SchedGroup> SALUSchedGroup = nullptr;
+  std::unique_ptr<SchedGroup> VMEMReadSchedGroup = nullptr;
+  std::unique_ptr<SchedGroup> VMEMWriteSchedGroup = nullptr;
+  std::unique_ptr<SchedGroup> DSWriteSchedGroup = nullptr;
+  std::unique_ptr<SchedGroup> DSReadSchedGroup = nullptr;
+
+  // Use a SCHED_BARRIER's mask to identify instruction SchedGroups that should
+  // not be reordered accross the SCHED_BARRIER.
+  void getSchedGroupsFromMask(int32_t Mask,
+                              SmallVectorImpl<SchedGroup *> &SchedGroups);
+
+  // Add DAG edges that enforce SCHED_BARRIER ordering.
+  void addSchedBarrierEdges(SUnit &SU);
+
+  // Classify instructions and add them to the SchedGroup.
+  void initSchedGroup(SchedGroup *SG);
+
+  // Remove all existing edges from a SCHED_BARRIER.
+  void resetSchedBarrierEdges(SUnit &SU);
+
+public:
+  void apply(ScheduleDAGInstrs *DAGInstrs) override;
+
+  SchedBarrierDAGMutation() = default;
+};
+
+void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
+  const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
+  TII = ST.getInstrInfo();
+  DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
+  const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
+  if (!TSchedModel || DAG->SUnits.empty())
+    return;
+
+  LLVM_DEBUG(dbgs() << "Applying IGroupLPDAGMutation...\n");
+
+  // The order of InstructionGroups in this vector defines the
+  // order in which edges will be added. In other words, given the
+  // present ordering, we will try to make each VMEMRead instruction
+  // a predecessor of each DSRead instruction, and so on.
+  SmallVector<SchedGroup, 4> PipelineOrderGroups = {
+      SchedGroup(&isVMEMSGMember, VMEMGroupMaxSize, DAG),
+      SchedGroup(&isDSReadSGMember, LDRGroupMaxSize, DAG),
+      SchedGroup(&isMFMASGMember, MFMAGroupMaxSize, DAG),
+      SchedGroup(&isDSWriteSGMember, LDWGroupMaxSize, DAG)};
+
+  for (SUnit &SU : DAG->SUnits) {
+    LLVM_DEBUG(dbgs() << "Checking Node"; DAG->dumpNode(SU));
+    for (auto &SG : PipelineOrderGroups)
+      if (SG.canAddSU(SU, TII))
+        SG.add(SU);
+  }
+
+  for (unsigned i = 0; i < PipelineOrderGroups.size() - 1; i++) {
+    auto &GroupA = PipelineOrderGroups[i];
+    for (unsigned j = i + 1; j < PipelineOrderGroups.size(); j++) {
+      auto &GroupB = PipelineOrderGroups[j];
+      GroupA.link(GroupB);
+    }
+  }
+}
+
+void SchedBarrierDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
+  const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
+  if (!TSchedModel || DAGInstrs->SUnits.empty())
+    return;
+
+  LLVM_DEBUG(dbgs() << "Applying SchedBarrierDAGMutation...\n");
+
+  const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
+  TII = ST.getInstrInfo();
+  DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
+  for (auto &SU : DAG->SUnits)
+    if (SU.getInstr()->getOpcode() == AMDGPU::SCHED_BARRIER)
+      addSchedBarrierEdges(SU);
+}
+
+void SchedBarrierDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) {
+  MachineInstr &MI = *SchedBarrier.getInstr();
+  assert(MI.getOpcode() == AMDGPU::SCHED_BARRIER);
+  // Remove all existing edges from the SCHED_BARRIER that were added due to the
+  // instruction having side effects.
+  resetSchedBarrierEdges(SchedBarrier);
+  SmallVector<SchedGroup *, 4> SchedGroups;
+  int32_t Mask = MI.getOperand(0).getImm();
+  getSchedGroupsFromMask(Mask, SchedGroups);
+  for (auto SG : SchedGroups)
+    SG->link(
+        SchedBarrier, (function_ref<bool(const SUnit *A, const SUnit *B)>)[](
+                          const SUnit *A, const SUnit *B) {
+          return A->NodeNum > B->NodeNum;
+        });
+}
+
+void SchedBarrierDAGMutation::getSchedGroupsFromMask(
+    int32_t Mask, SmallVectorImpl<SchedGroup *> &SchedGroups) {
+  SchedBarrierMasks SBMask = (SchedBarrierMasks)Mask;
+  // See IntrinsicsAMDGPU.td for an explanation of these masks and their
+  // mappings.
+  //
+  if ((SBMask & SchedBarrierMasks::VALU) == SchedBarrierMasks::NONE &&
+      (SBMask & SchedBarrierMasks::ALU) == SchedBarrierMasks::NONE) {
+    if (!VALUSchedGroup) {
+      VALUSchedGroup = std::make_unique<SchedGroup>(isVALUSGMember, None, DAG);
+      initSchedGroup(VALUSchedGroup.get());
+    }
+
+    SchedGroups.push_back(VALUSchedGroup.get());
+  }
+
+  if ((SBMask & SchedBarrierMasks::SALU) == SchedBarrierMasks::NONE &&
+      (SBMask & SchedBarrierMasks::ALU) == SchedBarrierMasks::NONE) {
+    if (!SALUSchedGroup) {
+      SALUSchedGroup = std::make_unique<SchedGroup>(isSALUSGMember, None, DAG);
+      initSchedGroup(SALUSchedGroup.get());
+    }
+
+    SchedGroups.push_back(SALUSchedGroup.get());
+  }
+
+  if ((SBMask & SchedBarrierMasks::MFMA) == SchedBarrierMasks::NONE &&
+      (SBMask & SchedBarrierMasks::ALU) == SchedBarrierMasks::NONE) {
+    if (!MFMASchedGroup) {
+      MFMASchedGroup = std::make_unique<SchedGroup>(isMFMASGMember, None, DAG);
+      initSchedGroup(MFMASchedGroup.get());
+    }
+
+    SchedGroups.push_back(MFMASchedGroup.get());
+  }
+
+  if ((SBMask & SchedBarrierMasks::VMEM_READ) == SchedBarrierMasks::NONE &&
+      (SBMask & SchedBarrierMasks::VMEM) == SchedBarrierMasks::NONE) {
+    if (!VMEMReadSchedGroup) {
+      VMEMReadSchedGroup =
+          std::make_unique<SchedGroup>(isVMEMReadSGMember, None, DAG);
+      initSchedGroup(VMEMReadSchedGroup.get());
+    }
+
+    SchedGroups.push_back(VMEMReadSchedGroup.get());
+  }
+
+  if ((SBMask & SchedBarrierMasks::VMEM_WRITE) == SchedBarrierMasks::NONE &&
+      (SBMask & SchedBarrierMasks::VMEM) == SchedBarrierMasks::NONE) {
+    if (!VMEMWriteSchedGroup) {
+      VMEMWriteSchedGroup =
+          std::make_unique<SchedGroup>(isVMEMWriteSGMember, None, DAG);
+      initSchedGroup(VMEMWriteSchedGroup.get());
+    }
+
+    SchedGroups.push_back(VMEMWriteSchedGroup.get());
+  }
+
+  if ((SBMask & SchedBarrierMasks::DS_READ) == SchedBarrierMasks::NONE &&
+      (SBMask & SchedBarrierMasks::DS) == SchedBarrierMasks::NONE) {
+    if (!DSReadSchedGroup) {
+      DSReadSchedGroup =
+          std::make_unique<SchedGroup>(isDSReadSGMember, None, DAG);
+      initSchedGroup(DSReadSchedGroup.get());
+    }
+
+    SchedGroups.push_back(DSReadSchedGroup.get());
+  }
+
+  if ((SBMask & SchedBarrierMasks::DS_WRITE) == SchedBarrierMasks::NONE &&
+      (SBMask & SchedBarrierMasks::DS) == SchedBarrierMasks::NONE) {
+    if (!DSWriteSchedGroup) {
+      DSWriteSchedGroup =
+          std::make_unique<SchedGroup>(isDSWriteSGMember, None, DAG);
+      initSchedGroup(DSWriteSchedGroup.get());
+    }
+
+    SchedGroups.push_back(DSWriteSchedGroup.get());
+  }
+}
+
+void SchedBarrierDAGMutation::initSchedGroup(SchedGroup *SG) {
+  assert(SG);
+  for (auto &SU : DAG->SUnits)
+    if (SG->canAddSU(SU, TII))
+      SG->add(SU);
+}
+
+void SchedBarrierDAGMutation::resetSchedBarrierEdges(SUnit &SU) {
+  assert(SU.getInstr()->getOpcode() == AMDGPU::SCHED_BARRIER);
+  for (auto &P : SU.Preds)
+    SU.removePred(P);
+
+  for (auto &S : SU.Succs) {
+    for (auto &SP : S.getSUnit()->Preds) {
+      if (SP.getSUnit() == &SU) {
+        S.getSUnit()->removePred(SP);
+      }
+    }
+  }
+}
+
+} // namespace
+
+namespace llvm {
+
+std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation() {
+  return EnableIGroupLP ? std::make_unique<IGroupLPDAGMutation>() : nullptr;
+}
+
+std::unique_ptr<ScheduleDAGMutation> createSchedBarrierDAGMutation() {
+  return std::make_unique<SchedBarrierDAGMutation>();
+}
+
+} // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMFMAIGroupLP.h b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h

similarity index 82%

rename from llvm/lib/Target/AMDGPU/AMDGPUMFMAIGroupLP.h

rename to llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h

index 7a83093..aeb1bba 100644 (file)
--- a/llvm/lib/Target/AMDGPU/AMDGPUMFMAIGroupLP.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h
@@ -14,7 +14,8 @@
  
  namespace llvm {
  
-std::unique_ptr<ScheduleDAGMutation> createMFMAIGroupLPDAGMutation();
+std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation();
+std::unique_ptr<ScheduleDAGMutation> createSchedBarrierDAGMutation();
  
  } // namespace llvm
  
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMFMAIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMFMAIGroupLP.cpp

deleted file mode 100644 (file)

index c4b1918..0000000
--- a/llvm/lib/Target/AMDGPU/AMDGPUMFMAIGroupLP.cpp
+++ /dev/null
@@ -1,219 +0,0 @@
-//===--- AMDGPUMFMAIGroupLP.cpp - AMDGPU MFMA IGroupLP  ------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// \file This file contains a DAG scheduling mutation which tries to coerce
-//       the scheduler into generating an ordering based on ordering of groups
-//       of instructions.
-//
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPUMFMAIGroupLP.h"
-#include "AMDGPUTargetMachine.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "SIInstrInfo.h"
-#include "SIMachineFunctionInfo.h"
-#include "llvm/CodeGen/MachineScheduler.h"
-#include "llvm/CodeGen/TargetOpcodes.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "amdgpu-MFMA-IGroupLP"
-
-namespace {
-
-static cl::opt<bool>
-    EnableMFMAIGroupLP("amdgpu-mfma-igrouplp",
-                       cl::desc("Enable construction of Instruction Groups and "
-                                "their ordering for scheduling"),
-                       cl::init(false));
-
-static cl::opt<int>
-    VMEMGroupMaxSize("amdgpu-mfma-igrouplp-vmem-group-size", cl::init(-1),
-                     cl::Hidden,
-                     cl::desc("The maximum number of instructions to include "
-                              "in VMEM group."));
-
-static cl::opt<int>
-    MFMAGroupMaxSize("amdgpu-mfma-igrouplp-mfma-group-size", cl::init(-1),
-                     cl::Hidden,
-                     cl::desc("The maximum number of instructions to include "
-                              "in MFMA group."));
-
-static cl::opt<int>
-    LDRGroupMaxSize("amdgpu-mfma-igrouplp-ldr-group-size", cl::init(-1),
-                    cl::Hidden,
-                    cl::desc("The maximum number of instructions to include "
-                             "in lds/gds read group."));
-
-static cl::opt<int>
-    LDWGroupMaxSize("amdgpu-mfma-igrouplp-ldw-group-size", cl::init(-1),
-                    cl::Hidden,
-                    cl::desc("The maximum number of instructions to include "
-                             "in lds/gds write group."));
-
-typedef function_ref<bool(const MachineInstr &)> IsInstructionType;
-
-struct InstructionClass {
-  SmallVector<SUnit *, 32> Collection;
-  const IsInstructionType isInstructionClass;
-  // MaxSize is initialized to -1 by default, if MaxSize is < 0, then
-  // the collection will not have a size limit
-  const int MaxSize;
-
-  InstructionClass(IsInstructionType IsInstructionClass, int maxSize)
-      : isInstructionClass(IsInstructionClass), MaxSize(maxSize){};
-
-  bool IsFull() { return !(MaxSize <= 0) && (int)Collection.size() >= MaxSize; }
-};
-
-class MFMAIGroupLPDAGMutation : public ScheduleDAGMutation {
-public:
-  const SIInstrInfo *TII;
-  ScheduleDAGMI *DAG;
-
-  MFMAIGroupLPDAGMutation() = default;
-  void apply(ScheduleDAGInstrs *DAGInstrs) override;
-};
-
-static void collectSUnits(SmallVectorImpl<InstructionClass *> &PipelineOrder,
-                          const SIInstrInfo *TII, ScheduleDAGInstrs *DAG) {
-  for (SUnit &SU : DAG->SUnits) {
-    LLVM_DEBUG(dbgs() << "Checking Node"; DAG->dumpNode(SU));
-
-    // Presently, a bundle only counts as one instruction towards
-    // the group's maximum size
-    if (SU.getInstr()->getOpcode() == TargetOpcode::BUNDLE) {
-      MachineInstr *MI = SU.getInstr();
-      MachineBasicBlock::instr_iterator BundledMI = MI->getIterator();
-      ++BundledMI;
-
-      LLVM_DEBUG(dbgs() << "Checking bundled insts\n";);
-
-      InstructionClass *MatchingStage = nullptr;
-      for (auto Stage : PipelineOrder) {
-        if (Stage->isInstructionClass(*BundledMI) && !Stage->IsFull()) {
-          MatchingStage = Stage;
-          break;
-        }
-      }
-
-      if (MatchingStage != nullptr) {
-        while (MatchingStage->isInstructionClass(*BundledMI)) {
-          if (!BundledMI->isBundledWithSucc())
-            break;
-          ++BundledMI;
-        }
-
-        if (!BundledMI->isBundledWithSucc()) {
-          LLVM_DEBUG(dbgs() << "Bundle is all of same type\n";);
-          MatchingStage->Collection.push_back(&SU);
-        }
-      }
-    }
-
-    for (InstructionClass *Stage : PipelineOrder) {
-      if (Stage->isInstructionClass(*SU.getInstr()) && !Stage->IsFull()) {
-        Stage->Collection.push_back(&SU);
-      }
-    }
-  }
-}
-
-static void
-addPipelineEdges(const llvm::ArrayRef<InstructionClass *> PipelineOrder,
-                 ScheduleDAGInstrs *DAG) {
-  for (int i = 0; i < (int)PipelineOrder.size() - 1; i++) {
-    auto StageA = PipelineOrder[i];
-    for (int j = i + 1; j < (int)PipelineOrder.size(); j++) {
-      auto StageB = PipelineOrder[j];
-      for (auto SUnitA : StageA->Collection) {
-        LLVM_DEBUG(dbgs() << "Adding edges for: "; DAG->dumpNode(*SUnitA););
-        for (auto SUnitB : StageB->Collection) {
-          if (DAG->canAddEdge(SUnitB, SUnitA)) {
-            DAG->addEdge(SUnitB, SDep(SUnitA, SDep::Artificial));
-            LLVM_DEBUG(dbgs() << "Added edge to: "; DAG->dumpNode(*SUnitB););
-          } else {
-            LLVM_DEBUG(dbgs() << "Can't add edge to: ";
-                       DAG->dumpNode(*SUnitB););
-          }
-        }
-      }
-    }
-  }
-}
-
-void MFMAIGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
-  const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
-  TII = ST.getInstrInfo();
-  if (!ST.hasMAIInsts())
-    return;
-  DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
-  const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
-  if (!TSchedModel || DAG->SUnits.empty())
-    return;
-
-  const IsInstructionType isMFMAFn = [this](const MachineInstr &MI) {
-    if (TII->isMAI(MI) && MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
-        MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64) {
-      LLVM_DEBUG(dbgs() << "Found MFMA\n";);
-      return true;
-    }
-    return false;
-  };
-  InstructionClass MFMASUnits(isMFMAFn, MFMAGroupMaxSize);
-
-  const IsInstructionType isVMEMReadFn = [this](const MachineInstr &MI) {
-    if (((TII->isFLAT(MI) && !TII->isDS(MI)) || TII->isVMEM(MI)) &&
-        MI.mayLoad()) {
-      LLVM_DEBUG(dbgs() << "Found VMEM read\n";);
-      return true;
-    }
-    return false;
-  };
-  InstructionClass VMEMReadSUnits(isVMEMReadFn, VMEMGroupMaxSize);
-
-  const IsInstructionType isDSWriteFn = [this](const MachineInstr &MI) {
-    if (TII->isDS(MI) && MI.mayStore()) {
-      LLVM_DEBUG(dbgs() << "Found DS Write\n";);
-      return true;
-    }
-    return false;
-  };
-  InstructionClass DSWriteSUnits(isDSWriteFn, LDWGroupMaxSize);
-
-  const IsInstructionType isDSReadFn = [this](const MachineInstr &MI) {
-    if (TII->isDS(MI) && MI.mayLoad()) {
-      LLVM_DEBUG(dbgs() << "Found DS Read\n";);
-      return true;
-    }
-    return false;
-  };
-  InstructionClass DSReadSUnits(isDSReadFn, LDRGroupMaxSize);
-
-  // The order of InstructionClasses in this vector defines the
-  // order in which edges will be added. In other words, given the
-  // present ordering, we will try to make each VMEMRead instruction
-  // a predecessor of each DSRead instruction, and so on.
-  SmallVector<InstructionClass *, 4> PipelineOrder = {
-      &VMEMReadSUnits, &DSReadSUnits, &MFMASUnits, &DSWriteSUnits};
-
-  collectSUnits(PipelineOrder, TII, DAG);
-
-  addPipelineEdges(PipelineOrder, DAG);
-}
-
-} // namespace
-
-namespace llvm {
-
-std::unique_ptr<ScheduleDAGMutation> createMFMAIGroupLPDAGMutation() {
-  return EnableMFMAIGroupLP ? std::make_unique<MFMAIGroupLPDAGMutation>()
-                            : nullptr;
-}
-
-} // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

index d27190c..dc77b2c 100644 (file)
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -16,7 +16,7 @@
  #include "AMDGPU.h"
  #include "AMDGPUAliasAnalysis.h"
  #include "AMDGPUExportClustering.h"
-#include "AMDGPUMFMAIGroupLP.h"
+#include "AMDGPUIGroupLP.h"
  #include "AMDGPUMacroFusion.h"
  #include "AMDGPUTargetObjectFile.h"
  #include "AMDGPUTargetTransformInfo.h"
@@ -399,7 +399,8 @@ createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
    ScheduleDAGMILive *DAG =
      new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxOccupancySchedStrategy>(C));
    DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
-  DAG->addMutation(createMFMAIGroupLPDAGMutation());
+  DAG->addMutation(createIGroupLPDAGMutation());
+  DAG->addMutation(createSchedBarrierDAGMutation());
    DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
    DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
    return DAG;
@@ -898,7 +899,8 @@ public:
      const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
      DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
      DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII));
-    DAG->addMutation(createMFMAIGroupLPDAGMutation());
+    DAG->addMutation(createIGroupLPDAGMutation());
+    DAG->addMutation(createSchedBarrierDAGMutation());
      return DAG;
    }
  
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt

index 267d3e6..50e82c1 100644 (file)
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -75,7 +75,7 @@ add_llvm_target(AMDGPUCodeGen
    AMDGPUMachineModuleInfo.cpp
    AMDGPUMacroFusion.cpp
    AMDGPUMCInstLower.cpp
-  AMDGPUMFMAIGroupLP.cpp
+  AMDGPUIGroupLP.cpp
    AMDGPUMIRFormatter.cpp
    AMDGPUOpenCLEnqueuedBlockLowering.cpp
    AMDGPUPerfHintAnalysis.cpp
diff --git a/llvm/test/CodeGen/AMDGPU/mfma-igrouplp-dag-mutation.mir b/llvm/test/CodeGen/AMDGPU/igrouplp-dag-mutation.mir

similarity index 80%

rename from llvm/test/CodeGen/AMDGPU/mfma-igrouplp-dag-mutation.mir

rename to llvm/test/CodeGen/AMDGPU/igrouplp-dag-mutation.mir

index f7139cc..4d8c011 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/mfma-igrouplp-dag-mutation.mir
+++ b/llvm/test/CodeGen/AMDGPU/igrouplp-dag-mutation.mir
@@ -1,6 +1,6 @@
  # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
  # RUN: llc -march=amdgcn -mcpu=gfx90a -start-before=machine-scheduler -stop-after=postmisched %s -o - 2>&1 | FileCheck -check-prefix=DEFAULT %s
-# RUN: llc -march=amdgcn -mcpu=gfx90a -start-before=machine-scheduler -stop-after=postmisched %s -o - -amdgpu-mfma-igrouplp=1 2>&1 | FileCheck -check-prefix=PIPELINE %s
+# RUN: llc -march=amdgcn -mcpu=gfx90a -start-before=machine-scheduler -stop-after=postmisched %s -o - -amdgpu-igrouplp=1 2>&1 | FileCheck -check-prefix=PIPELINE %s
  
  ---
  name: no_pipeline
@@ -181,3 +181,41 @@ body:             |
      $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
      DS_WRITE_B32 $vgpr9, $vgpr24, 0, 16, implicit $m0, implicit $exec
  ...
+
+---
+name: block_ends_in_bundle
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr7, $agpr0_agpr1_agpr2_agpr3
+    ; DEFAULT-LABEL: name: block_ends_in_bundle
+    ; DEFAULT: liveins: $vgpr0, $vgpr1, $vgpr7, $agpr0_agpr1_agpr2_agpr3
+    ; DEFAULT-NEXT: {{  $}}
+    ; DEFAULT-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit killed $vgpr7, implicit $exec {
+    ; DEFAULT-NEXT:   $vgpr10 = DS_READ_U16_gfx9 $vgpr7, 0, 512, implicit $exec
+    ; DEFAULT-NEXT:   $vgpr11 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec
+    ; DEFAULT-NEXT:   $vgpr12 = DS_READ_U16_gfx9 $vgpr7, 0, 1024, implicit $exec
+    ; DEFAULT-NEXT:   $vgpr15 = DS_READ_U16_gfx9 $vgpr7, 0, 4096, implicit $exec
+    ; DEFAULT-NEXT:   $vgpr16 = DS_READ_U16_gfx9 killed $vgpr7, 0, 2048, implicit $exec
+    ; DEFAULT-NEXT: }
+    ; DEFAULT-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+    ; PIPELINE-LABEL: name: block_ends_in_bundle
+    ; PIPELINE: liveins: $vgpr0, $vgpr1, $vgpr7, $agpr0_agpr1_agpr2_agpr3
+    ; PIPELINE-NEXT: {{  $}}
+    ; PIPELINE-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit killed $vgpr7, implicit $exec {
+    ; PIPELINE-NEXT:   $vgpr10 = DS_READ_U16_gfx9 $vgpr7, 0, 512, implicit $exec
+    ; PIPELINE-NEXT:   $vgpr11 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec
+    ; PIPELINE-NEXT:   $vgpr12 = DS_READ_U16_gfx9 $vgpr7, 0, 1024, implicit $exec
+    ; PIPELINE-NEXT:   $vgpr15 = DS_READ_U16_gfx9 $vgpr7, 0, 4096, implicit $exec
+    ; PIPELINE-NEXT:   $vgpr16 = DS_READ_U16_gfx9 killed $vgpr7, 0, 2048, implicit $exec
+    ; PIPELINE-NEXT: }
+    ; PIPELINE-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+      $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+      BUNDLE implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit $vgpr7, implicit $exec {
+        $vgpr10 = DS_READ_U16_gfx9 $vgpr7, 0, 512, implicit $exec
+        $vgpr11 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec
+        $vgpr12 = DS_READ_U16_gfx9 $vgpr7, 0, 1024, implicit $exec
+        $vgpr15 = DS_READ_U16_gfx9 $vgpr7, 0, 4096, implicit $exec
+        $vgpr16 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec
+    }
+...
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.barrier.ll

index f3baedc..766d504 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.barrier.ll
@@ -1,8 +1,8 @@
  ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
  ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
  
-define amdgpu_kernel void @test_wave_barrier() #0 {
-; GCN-LABEL: test_wave_barrier:
+define amdgpu_kernel void @test_sched_barrier() #0 {
+; GCN-LABEL: test_sched_barrier:
  ; GCN:       ; %bb.0: ; %entry
  ; GCN-NEXT:    ; sched_barrier mask(0x00000000)
  ; GCN-NEXT:    ; sched_barrier mask(0x00000001)
diff --git a/llvm/test/CodeGen/AMDGPU/sched-barrier-post-RA.mir b/llvm/test/CodeGen/AMDGPU/sched-barrier-post-RA.mir

new file mode 100644 (file)

index 0000000..56ab64e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/sched-barrier-post-RA.mir
@@ -0,0 +1,122 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=gfx908 -misched-cluster=false -run-pass=postmisched -verify-misched -o - %s | FileCheck %s
+
+--- |
+  define amdgpu_kernel void @no_sched_barrier(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void }
+  define amdgpu_kernel void @sched_barrier_mask_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void }
+  define amdgpu_kernel void @sched_barrier_mask_1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void }
+
+  !0 = distinct !{!0}
+  !1 = !{!1, !0}
+...
+
+---
+name: no_sched_barrier
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: no_sched_barrier
+    ; CHECK: renamable $sgpr0_sgpr1 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr0 = IMPLICIT_DEF
+    ; CHECK-NEXT: BUNDLE implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit $sgpr0_sgpr1, implicit $vgpr0, implicit $exec {
+    ; CHECK-NEXT:   renamable $vgpr1 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    ; CHECK-NEXT:   renamable $vgpr2 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: renamable $vgpr1 = nsw V_MUL_LO_U32_e64 killed $vgpr1, $vgpr1, implicit $exec
+    ; CHECK-NEXT: renamable $vgpr2 = nsw V_MUL_LO_U32_e64 killed $vgpr2, $vgpr2, implicit $exec
+    ; CHECK-NEXT: BUNDLE implicit killed $vgpr0, implicit killed $vgpr1, implicit killed $sgpr0_sgpr1, implicit $exec, implicit killed $vgpr2 {
+    ; CHECK-NEXT:   GLOBAL_STORE_DWORD_SADDR renamable $vgpr0, killed renamable $vgpr1, renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    ; CHECK-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: S_ENDPGM 0
+    renamable $sgpr0_sgpr1 = IMPLICIT_DEF
+    renamable $vgpr0 = IMPLICIT_DEF
+    BUNDLE implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit $sgpr0_sgpr1, implicit $vgpr0, implicit $exec {
+      renamable $vgpr1 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+      renamable $vgpr2 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    }
+    renamable $vgpr1 = nsw V_MUL_LO_U32_e64 killed $vgpr1, $vgpr1, implicit $exec
+    renamable $vgpr2 = nsw V_MUL_LO_U32_e64 killed $vgpr2, $vgpr2, implicit $exec
+    BUNDLE implicit killed $vgpr0, implicit killed $vgpr1, implicit killed $sgpr0_sgpr1, implicit $exec, implicit killed $vgpr2 {
+      GLOBAL_STORE_DWORD_SADDR renamable $vgpr0, killed renamable $vgpr1, renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+      GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    }
+    S_ENDPGM 0
+...
+
+# MASK = 0x0000 0000: No instructions may be scheduled across SCHED_BARRIER.
+
+---
+name: sched_barrier_mask_0
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: sched_barrier_mask_0
+    ; CHECK: renamable $sgpr0_sgpr1 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr0 = IMPLICIT_DEF
+    ; CHECK-NEXT: BUNDLE implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit $sgpr0_sgpr1, implicit $vgpr0, implicit $exec {
+    ; CHECK-NEXT:   renamable $vgpr1 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    ; CHECK-NEXT:   renamable $vgpr2 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: renamable $vgpr1 = nsw V_MUL_LO_U32_e64 killed $vgpr1, $vgpr1, implicit $exec
+    ; CHECK-NEXT: SCHED_BARRIER 0
+    ; CHECK-NEXT: renamable $vgpr2 = nsw V_MUL_LO_U32_e64 killed $vgpr2, $vgpr2, implicit $exec
+    ; CHECK-NEXT: BUNDLE implicit killed $vgpr0, implicit killed $vgpr1, implicit killed $sgpr0_sgpr1, implicit $exec, implicit killed $vgpr2 {
+    ; CHECK-NEXT:   GLOBAL_STORE_DWORD_SADDR renamable $vgpr0, killed renamable $vgpr1, renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    ; CHECK-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: S_ENDPGM 0
+    renamable $sgpr0_sgpr1 = IMPLICIT_DEF
+    renamable $vgpr0 = IMPLICIT_DEF
+    BUNDLE implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit $sgpr0_sgpr1, implicit $vgpr0, implicit $exec {
+      renamable $vgpr1 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+      renamable $vgpr2 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    }
+    renamable $vgpr1 = nsw V_MUL_LO_U32_e64 killed $vgpr1, $vgpr1, implicit $exec
+    SCHED_BARRIER 0
+    renamable $vgpr2 = nsw V_MUL_LO_U32_e64 killed $vgpr2, $vgpr2, implicit $exec
+    BUNDLE implicit killed $vgpr0, implicit killed $vgpr1, implicit killed $sgpr0_sgpr1, implicit $exec, implicit killed $vgpr2 {
+      GLOBAL_STORE_DWORD_SADDR renamable $vgpr0, killed renamable $vgpr1, renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+      GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    }
+    S_ENDPGM 0
+...
+
+# MASK = 0x0000 0001: ALL, non-memory, non-side-effect producing instructions may be
+#                     scheduled across SCHED_BARRIER, i.e. allow ALU instructions to pass.
+
+---
+name: sched_barrier_mask_1
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: sched_barrier_mask_1
+    ; CHECK: renamable $sgpr0_sgpr1 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr0 = IMPLICIT_DEF
+    ; CHECK-NEXT: BUNDLE implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit $sgpr0_sgpr1, implicit $vgpr0, implicit $exec {
+    ; CHECK-NEXT:   renamable $vgpr1 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    ; CHECK-NEXT:   renamable $vgpr2 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: SCHED_BARRIER 1
+    ; CHECK-NEXT: renamable $vgpr1 = nsw V_MUL_LO_U32_e64 killed $vgpr1, $vgpr1, implicit $exec
+    ; CHECK-NEXT: renamable $vgpr2 = nsw V_MUL_LO_U32_e64 killed $vgpr2, $vgpr2, implicit $exec
+    ; CHECK-NEXT: BUNDLE implicit killed $vgpr0, implicit killed $vgpr1, implicit killed $sgpr0_sgpr1, implicit $exec, implicit killed $vgpr2 {
+    ; CHECK-NEXT:   GLOBAL_STORE_DWORD_SADDR renamable $vgpr0, killed renamable $vgpr1, renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    ; CHECK-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: S_ENDPGM 0
+    renamable $sgpr0_sgpr1 = IMPLICIT_DEF
+    renamable $vgpr0 = IMPLICIT_DEF
+    BUNDLE implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit $sgpr0_sgpr1, implicit $vgpr0, implicit $exec {
+      renamable $vgpr1 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+      renamable $vgpr2 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    }
+    renamable $vgpr1 = nsw V_MUL_LO_U32_e64 killed $vgpr1, $vgpr1, implicit $exec
+    SCHED_BARRIER 1
+    renamable $vgpr2 = nsw V_MUL_LO_U32_e64 killed $vgpr2, $vgpr2, implicit $exec
+    BUNDLE implicit killed $vgpr0, implicit killed $vgpr1, implicit killed $sgpr0_sgpr1, implicit $exec, implicit killed $vgpr2 {
+      GLOBAL_STORE_DWORD_SADDR renamable $vgpr0, killed renamable $vgpr1, renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+      GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    }
+    S_ENDPGM 0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/sched-barrier-pre-RA.mir b/llvm/test/CodeGen/AMDGPU/sched-barrier-pre-RA.mir

new file mode 100644 (file)

index 0000000..739a622
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/sched-barrier-pre-RA.mir
@@ -0,0 +1,570 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=gfx908 -misched-cluster=false -run-pass=machine-scheduler -verify-misched -o - %s | FileCheck %s
+
+--- |
+  define amdgpu_kernel void @no_sched_barrier(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void }
+  define amdgpu_kernel void @sched_barrier_mask_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void }
+  define amdgpu_kernel void @sched_barrier_mask_1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void }
+  define amdgpu_kernel void @sched_barrier_mask_2(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void }
+  define amdgpu_kernel void @sched_barrier_mask_4(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void }
+  define amdgpu_kernel void @sched_barrier_mask_8(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void }
+  define amdgpu_kernel void @sched_barrier_mask_16(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void }
+  define amdgpu_kernel void @sched_barrier_mask_32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void }
+  define amdgpu_kernel void @sched_barrier_mask_64(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void }
+  define amdgpu_kernel void @sched_barrier_mask_128(i32 addrspace(3)* noalias %out, i32 addrspace(3)* noalias %in) { ret void }
+  define amdgpu_kernel void @sched_barrier_mask_256(i32 addrspace(3)* noalias %out, i32 addrspace(3)* noalias %in) { ret void }
+  define amdgpu_kernel void @sched_barrier_mask_512(i32 addrspace(3)* noalias %out, i32 addrspace(3)* noalias %in) { ret void }
+  define amdgpu_kernel void @sched_barrier_masks_8_12(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void }
+  define amdgpu_kernel void @sched_barrier_mask_4_bundle(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void }
+  define amdgpu_kernel void @sched_barrier_mask_0_bundle(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void }
+
+  !0 = distinct !{!0}
+  !1 = !{!1, !0}
+...
+
+---
+name: no_sched_barrier
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: no_sched_barrier
+    ; CHECK: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec
+    ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec
+    ; CHECK-NEXT: S_NOP 0
+    ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_1]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    ; CHECK-NEXT: S_ENDPGM 0
+    %0:sreg_64 = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    %4:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %3, implicit $exec
+    GLOBAL_STORE_DWORD_SADDR %1, %4, %0, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    S_NOP 0
+    %5:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    %6:vgpr_32 = nsw V_MUL_LO_U32_e64 %5, %5, implicit $exec
+    GLOBAL_STORE_DWORD_SADDR %1, %6, %0, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    S_ENDPGM 0
+...
+
+# MASK = 0x0000 0000: No instructions may be scheduled across SCHED_BARRIER.
+
+---
+name: sched_barrier_mask_0
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: sched_barrier_mask_0
+    ; CHECK: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec
+    ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    ; CHECK-NEXT: S_NOP 0
+    ; CHECK-NEXT: SCHED_BARRIER 0
+    ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec
+    ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_1]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    ; CHECK-NEXT: S_ENDPGM 0
+    %0:sreg_64 = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    %4:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %3, implicit $exec
+    GLOBAL_STORE_DWORD_SADDR %1, %4, %0, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    S_NOP 0
+    SCHED_BARRIER 0
+    %5:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    %6:vgpr_32 = nsw V_MUL_LO_U32_e64 %5, %5, implicit $exec
+    GLOBAL_STORE_DWORD_SADDR %1, %6, %0, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    S_ENDPGM 0
+...
+
+# MASK = 0x0000 0001: ALL, non-memory, non-side-effect producing instructions may be
+#                     scheduled across SCHED_BARRIER, i.e. allow ALU instructions to pass.
+
+---
+name: sched_barrier_mask_1
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: sched_barrier_mask_1
+    ; CHECK: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec
+    ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    ; CHECK-NEXT: SCHED_BARRIER 1
+    ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec
+    ; CHECK-NEXT: S_NOP 0
+    ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_1]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    ; CHECK-NEXT: S_ENDPGM 0
+    %0:sreg_64 = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    %4:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %3, implicit $exec
+    GLOBAL_STORE_DWORD_SADDR %1, %4, %0, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    S_NOP 0
+    SCHED_BARRIER 1
+    %5:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    %6:vgpr_32 = nsw V_MUL_LO_U32_e64 %5, %5, implicit $exec
+    GLOBAL_STORE_DWORD_SADDR %1, %6, %0, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    S_ENDPGM 0
+...
+
+# MASK = 0x0000 0002: VALU instructions may be scheduled across SCHED_BARRIER.
+
+---
+name: sched_barrier_mask_2
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: sched_barrier_mask_2
+    ; CHECK: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec
+    ; CHECK-NEXT: S_NOP 0
+    ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    ; CHECK-NEXT: SCHED_BARRIER 2
+    ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec
+    ; CHECK-NEXT: [[V_MUL_LO_U32_e64_2:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec
+    ; CHECK-NEXT: [[V_MUL_LO_U32_e64_3:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec
+    ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_3]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_MUL_LO_U32_e64_1]], implicit [[V_MUL_LO_U32_e64_2]]
+    %0:sreg_64 = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %2:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    %3:vgpr_32 = nsw V_MUL_LO_U32_e64 %2, %2, implicit $exec
+    GLOBAL_STORE_DWORD_SADDR %1, %3, %0, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    %4:vgpr_32 = nsw V_MUL_LO_U32_e64 %2, %1, implicit $exec
+    %5:vgpr_32 = nsw V_MUL_LO_U32_e64 %2, %1, implicit $exec
+    S_NOP 0
+    SCHED_BARRIER 2
+    %6:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    %7:vgpr_32 = nsw V_MUL_LO_U32_e64 %6, %6, implicit $exec
+    GLOBAL_STORE_DWORD_SADDR %1, %7, %0, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    S_ENDPGM 0, implicit %4, implicit %5
+...
+
+# MASK = 0x0000 0004: SALU instructions may be scheduled across SCHED_BARRIER.
+
+---
+name: sched_barrier_mask_4
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: sched_barrier_mask_4
+    ; CHECK: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    ; CHECK-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF2]], 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec
+    ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_1:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_]], 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec
+    ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_2:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_1]], 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: [[V_MUL_LO_U32_e64_2:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec
+    ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_3:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_2]], 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_4:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_3]], 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: SCHED_BARRIER 4
+    ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    ; CHECK-NEXT: [[V_MUL_LO_U32_e64_3:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec
+    ; CHECK-NEXT: S_NOP 0
+    ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_3]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_MUL_LO_U32_e64_1]], implicit [[V_MUL_LO_U32_e64_2]], implicit [[V_MFMA_F32_4X4X1F32_e64_4]]
+    %0:sreg_64 = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %2:areg_128 = IMPLICIT_DEF
+    %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    %4:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %3, implicit $exec
+    GLOBAL_STORE_DWORD_SADDR %1, %4, %0, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    %5:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %1, implicit $exec
+    %6:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %1, implicit $exec
+    S_NOP 0
+    %7:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %2, 0, 0, 0, implicit $mode, implicit $exec
+    %8:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %7, 0, 0, 0, implicit $mode, implicit $exec
+    %9:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %8, 0, 0, 0, implicit $mode, implicit $exec
+    %10:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %9, 0, 0, 0, implicit $mode, implicit $exec
+    %11:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %10, 0, 0, 0, implicit $mode, implicit $exec
+    SCHED_BARRIER 4
+    %12:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    %13:vgpr_32 = nsw V_MUL_LO_U32_e64 %12, %12, implicit $exec
+    GLOBAL_STORE_DWORD_SADDR %1, %13, %0, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    S_ENDPGM 0, implicit %5, implicit %6, implicit %11
+...
+
+# MASK = 0x0000 0008: MFMA instructions may be scheduled across SCHED_BARRIER.
+
+---
+name: sched_barrier_mask_8
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: sched_barrier_mask_8
+    ; CHECK: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    ; CHECK-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF
+    ; CHECK-NEXT: S_NOP 0
+    ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec
+    ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec
+    ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF2]], 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: [[V_MUL_LO_U32_e64_2:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec
+    ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_1:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_]], 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_2:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_1]], 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: SCHED_BARRIER 8
+    ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_3:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_2]], 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: [[V_MUL_LO_U32_e64_3:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec
+    ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_4:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_3]], 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_3]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_MUL_LO_U32_e64_1]], implicit [[V_MUL_LO_U32_e64_2]], implicit [[V_MFMA_F32_4X4X1F32_e64_4]]
+    %0:sreg_64 = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %2:areg_128 = IMPLICIT_DEF
+    %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    %4:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %3, implicit $exec
+    GLOBAL_STORE_DWORD_SADDR %1, %4, %0, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    %5:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %1, implicit $exec
+    %6:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %1, implicit $exec
+    S_NOP 0
+    %7:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %2, 0, 0, 0, implicit $mode, implicit $exec
+    %8:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %7, 0, 0, 0, implicit $mode, implicit $exec
+    %9:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %8, 0, 0, 0, implicit $mode, implicit $exec
+    %10:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %9, 0, 0, 0, implicit $mode, implicit $exec
+    %11:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %10, 0, 0, 0, implicit $mode, implicit $exec
+    SCHED_BARRIER 8
+    %12:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    %13:vgpr_32 = nsw V_MUL_LO_U32_e64 %12, %12, implicit $exec
+    GLOBAL_STORE_DWORD_SADDR %1, %13, %0, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    S_ENDPGM 0, implicit %5, implicit %6, implicit %11
+...
+
+# MASK = 0x0000 0010: ALL VMEM instructions may be scheduled across SCHED_BARRIER.
+
+---
+name: sched_barrier_mask_16
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: sched_barrier_mask_16
+    ; CHECK: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    ; CHECK-NEXT: S_NOP 0
+    ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec
+    ; CHECK-NEXT: SCHED_BARRIER 16
+    ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec
+    ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_1]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    ; CHECK-NEXT: S_ENDPGM 0
+    %0:sreg_64 = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    %4:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %3, implicit $exec
+    GLOBAL_STORE_DWORD_SADDR %1, %4, %0, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    S_NOP 0
+    SCHED_BARRIER 16
+    %5:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    %6:vgpr_32 = nsw V_MUL_LO_U32_e64 %5, %5, implicit $exec
+    GLOBAL_STORE_DWORD_SADDR %1, %6, %0, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    S_ENDPGM 0
+...
+
+# MASK = 0x0000 0020: VMEM read instructions may be scheduled across SCHED_BARRIER.
+
+---
+name: sched_barrier_mask_32
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: sched_barrier_mask_32
+    ; CHECK: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec
+    ; CHECK-NEXT: S_NOP 0
+    ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    ; CHECK-NEXT: SCHED_BARRIER 32
+    ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec
+    ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_1]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    ; CHECK-NEXT: S_ENDPGM 0
+    %0:sreg_64 = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    %4:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %3, implicit $exec
+    GLOBAL_STORE_DWORD_SADDR %1, %4, %0, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    S_NOP 0
+    SCHED_BARRIER 32
+    %5:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    %6:vgpr_32 = nsw V_MUL_LO_U32_e64 %5, %5, implicit $exec
+    GLOBAL_STORE_DWORD_SADDR %1, %6, %0, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    S_ENDPGM 0
+...
+
+# MASK = 0x0000 0040: VMEM write instructions may be scheduled across SCHED_BARRIER.
+
+---
+name: sched_barrier_mask_64
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: sched_barrier_mask_64
+    ; CHECK: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    ; CHECK-NEXT: S_NOP 0
+    ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec
+    ; CHECK-NEXT: SCHED_BARRIER 64
+    ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec
+    ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_1]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    ; CHECK-NEXT: S_ENDPGM 0
+    %0:sreg_64 = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    %4:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %3, implicit $exec
+    GLOBAL_STORE_DWORD_SADDR %1, %4, %0, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    S_NOP 0
+    SCHED_BARRIER 64
+    %5:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    %6:vgpr_32 = nsw V_MUL_LO_U32_e64 %5, %5, implicit $exec
+    GLOBAL_STORE_DWORD_SADDR %1, %6, %0, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    S_ENDPGM 0
+...
+
+# MASK = 0x0000 0080: ALL DS instructions may be scheduled across SCHED_BARRIER.
+
+---
+name: sched_barrier_mask_128
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: sched_barrier_mask_128
+    ; CHECK: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DS_READ_U16_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_U16_gfx9 [[DEF]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 3)
+    ; CHECK-NEXT: [[DS_READ_U16_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_U16_gfx9 [[DEF]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 3)
+    ; CHECK-NEXT: S_NOP 0
+    ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[DS_READ_U16_gfx9_]], [[DS_READ_U16_gfx9_]], implicit $exec
+    ; CHECK-NEXT: SCHED_BARRIER 128
+    ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[DS_READ_U16_gfx9_1]], [[DS_READ_U16_gfx9_1]], implicit $exec
+    ; CHECK-NEXT: dead %0:sreg_64 = IMPLICIT_DEF
+    ; CHECK-NEXT: DS_WRITE_B32 [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 16, implicit $m0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 3)
+    ; CHECK-NEXT: DS_WRITE_B32 [[V_MUL_LO_U32_e64_1]], [[V_MUL_LO_U32_e64_]], 0, 16, implicit $m0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 3)
+    ; CHECK-NEXT: S_ENDPGM 0
+    %0:sreg_64 = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %2:vgpr_32 = DS_READ_U16_gfx9 %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 3)
+    %3:vgpr_32 = nsw V_MUL_LO_U32_e64 %2, %2, implicit $exec
+    DS_WRITE_B32 %3, %1, 0, 16, implicit $m0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 3)
+    S_NOP 0
+    SCHED_BARRIER 128
+    %4:vgpr_32 = DS_READ_U16_gfx9 %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 3)
+    %5:vgpr_32 = nsw V_MUL_LO_U32_e64 %4, %4, implicit $exec
+    DS_WRITE_B32 %5, %3, 0, 16, implicit $m0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 3)
+    S_ENDPGM 0
+...
+
+# MASK = 0x0000 0100: ALL DS read instructions may be scheduled across SCHED_BARRIER.
+
+---
+name: sched_barrier_mask_256
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: sched_barrier_mask_256
+    ; CHECK: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DS_READ_U16_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_U16_gfx9 [[DEF]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 3)
+    ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[DS_READ_U16_gfx9_]], [[DS_READ_U16_gfx9_]], implicit $exec
+    ; CHECK-NEXT: [[DS_READ_U16_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_U16_gfx9 [[DEF]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 3)
+    ; CHECK-NEXT: S_NOP 0
+    ; CHECK-NEXT: DS_WRITE_B32 [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 16, implicit $m0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 3)
+    ; CHECK-NEXT: SCHED_BARRIER 256
+    ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[DS_READ_U16_gfx9_1]], [[DS_READ_U16_gfx9_1]], implicit $exec
+    ; CHECK-NEXT: dead %0:sreg_64 = IMPLICIT_DEF
+    ; CHECK-NEXT: DS_WRITE_B32 [[V_MUL_LO_U32_e64_1]], [[V_MUL_LO_U32_e64_]], 0, 16, implicit $m0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 3)
+    ; CHECK-NEXT: S_ENDPGM 0
+    %0:sreg_64 = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %2:vgpr_32 = DS_READ_U16_gfx9 %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 3)
+    %3:vgpr_32 = nsw V_MUL_LO_U32_e64 %2, %2, implicit $exec
+    DS_WRITE_B32 %3, %1, 0, 16, implicit $m0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 3)
+    S_NOP 0
+    SCHED_BARRIER 256
+    %4:vgpr_32 = DS_READ_U16_gfx9 %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 3)
+    %5:vgpr_32 = nsw V_MUL_LO_U32_e64 %4, %4, implicit $exec
+    DS_WRITE_B32 %5, %3, 0, 16, implicit $m0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 3)
+    S_ENDPGM 0
+...
+
+# MASK = 0x0000 0200: ALL DS write instructions may be scheduled across SCHED_BARRIER.
+
+---
+name: sched_barrier_mask_512
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: sched_barrier_mask_512
+    ; CHECK: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DS_READ_U16_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_U16_gfx9 [[DEF]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 3)
+    ; CHECK-NEXT: S_NOP 0
+    ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[DS_READ_U16_gfx9_]], [[DS_READ_U16_gfx9_]], implicit $exec
+    ; CHECK-NEXT: SCHED_BARRIER 512
+    ; CHECK-NEXT: [[DS_READ_U16_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_U16_gfx9 [[DEF]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 3)
+    ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[DS_READ_U16_gfx9_1]], [[DS_READ_U16_gfx9_1]], implicit $exec
+    ; CHECK-NEXT: dead %0:sreg_64 = IMPLICIT_DEF
+    ; CHECK-NEXT: DS_WRITE_B32 [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 16, implicit $m0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 3)
+    ; CHECK-NEXT: DS_WRITE_B32 [[V_MUL_LO_U32_e64_1]], [[V_MUL_LO_U32_e64_]], 0, 16, implicit $m0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 3)
+    ; CHECK-NEXT: S_ENDPGM 0
+    %0:sreg_64 = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %2:vgpr_32 = DS_READ_U16_gfx9 %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 3)
+    %3:vgpr_32 = nsw V_MUL_LO_U32_e64 %2, %2, implicit $exec
+    DS_WRITE_B32 %3, %1, 0, 16, implicit $m0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 3)
+    S_NOP 0
+    SCHED_BARRIER 512
+    %4:vgpr_32 = DS_READ_U16_gfx9 %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 3)
+    %5:vgpr_32 = nsw V_MUL_LO_U32_e64 %4, %4, implicit $exec
+    DS_WRITE_B32 %5, %3, 0, 16, implicit $m0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 3)
+    S_ENDPGM 0
+...
+
+# MASK = 0x0000 0008: MFMA instructions may be scheduled across SCHED_BARRIER.
+# MASK = 0x0000 000C: MFMA and SALU may be scheduled across SCHED_BARRIER.
+# Check that S_NOP can move moved before the first SCHED_BARRIER but not the second.
+
+---
+name: sched_barrier_masks_8_12
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: sched_barrier_masks_8_12
+    ; CHECK: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    ; CHECK-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF
+    ; CHECK-NEXT: S_NOP 0
+    ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec
+    ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec
+    ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF2]], 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: [[V_MUL_LO_U32_e64_2:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec
+    ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_1:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_]], 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    ; CHECK-NEXT: SCHED_BARRIER 12
+    ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_2:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_1]], 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: SCHED_BARRIER 8
+    ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_3:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_2]], 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: [[V_MUL_LO_U32_e64_3:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec
+    ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_4:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_3]], 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: S_NOP 0
+    ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_3]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_MUL_LO_U32_e64_1]], implicit [[V_MUL_LO_U32_e64_2]], implicit [[V_MFMA_F32_4X4X1F32_e64_4]]
+    %0:sreg_64 = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %2:areg_128 = IMPLICIT_DEF
+    %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    %4:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %3, implicit $exec
+    GLOBAL_STORE_DWORD_SADDR %1, %4, %0, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    %5:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %1, implicit $exec
+    %6:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %1, implicit $exec
+    SCHED_BARRIER 12
+    %7:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %2, 0, 0, 0, implicit $mode, implicit $exec
+    %8:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %7, 0, 0, 0, implicit $mode, implicit $exec
+    %9:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %8, 0, 0, 0, implicit $mode, implicit $exec
+    %10:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %9, 0, 0, 0, implicit $mode, implicit $exec
+    %11:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %10, 0, 0, 0, implicit $mode, implicit $exec
+    S_NOP 0
+    SCHED_BARRIER 8
+    S_NOP 0
+    %12:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    %13:vgpr_32 = nsw V_MUL_LO_U32_e64 %12, %12, implicit $exec
+    GLOBAL_STORE_DWORD_SADDR %1, %13, %0, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    S_ENDPGM 0, implicit %5, implicit %6, implicit %11
+...
+
+# MASK = 0x0000 0004: SALU instructions may be scheduled across SCHED_BARRIER.
+
+---
+name: sched_barrier_mask_4_bundle
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: sched_barrier_mask_4_bundle
+    ; CHECK: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec
+    ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec
+    ; CHECK-NEXT: BUNDLE [[GLOBAL_LOAD_DWORD_SADDR]], implicit [[GLOBAL_LOAD_DWORD_SADDR1]] {
+    ; CHECK-NEXT:   S_NOP 0
+    ; CHECK-NEXT:   S_NOP 0
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: SCHED_BARRIER 4
+    ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_1]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    ; CHECK-NEXT: S_ENDPGM 0
+    %0:sreg_64 = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    %5:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    %4:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %3, implicit $exec
+    %6:vgpr_32 = nsw V_MUL_LO_U32_e64 %5, %5, implicit $exec
+    SCHED_BARRIER 4
+    BUNDLE implicit %3, %5 {
+      S_NOP 0
+      S_NOP 0
+    }
+    GLOBAL_STORE_DWORD_SADDR %1, %6, %0, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    GLOBAL_STORE_DWORD_SADDR %1, %4, %0, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    S_ENDPGM 0
+...
+
+# MASK = 0x0000 0000: No instructions may be scheduled across SCHED_BARRIER.
+
+---
+name: sched_barrier_mask_0_bundle
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: sched_barrier_mask_0_bundle
+    ; CHECK: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec
+    ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec
+    ; CHECK-NEXT: SCHED_BARRIER 0
+    ; CHECK-NEXT: BUNDLE [[GLOBAL_LOAD_DWORD_SADDR1]], implicit [[GLOBAL_LOAD_DWORD_SADDR]] {
+    ; CHECK-NEXT:   S_NOP 0
+    ; CHECK-NEXT:   S_NOP 0
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_1]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    ; CHECK-NEXT: S_ENDPGM 0
+    %0:sreg_64 = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    %5:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    %4:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %3, implicit $exec
+    %6:vgpr_32 = nsw V_MUL_LO_U32_e64 %5, %5, implicit $exec
+    SCHED_BARRIER 0
+    BUNDLE implicit %3, %5 {
+      S_NOP 0
+      S_NOP 0
+    }
+    GLOBAL_STORE_DWORD_SADDR %1, %6, %0, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    GLOBAL_STORE_DWORD_SADDR %1, %4, %0, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    S_ENDPGM 0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/sched_barrier.mir b/llvm/test/CodeGen/AMDGPU/sched_barrier.mir

deleted file mode 100644 (file)

index 0853ea7..0000000
--- a/llvm/test/CodeGen/AMDGPU/sched_barrier.mir
+++ /dev/null
@@ -1,99 +0,0 @@
-# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -march=amdgcn -mcpu=gfx908 -run-pass=machine-scheduler -verify-misched -o - %s | FileCheck %s
-
---- |
-  define amdgpu_kernel void @no_sched_barrier(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void }
-  define amdgpu_kernel void @sched_barrier_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void }
-  define amdgpu_kernel void @sched_barrier_1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void }
-
-  !0 = distinct !{!0}
-  !1 = !{!1, !0}
-...
-
----
-name: no_sched_barrier
-tracksRegLiveness: true
-body: |
-  bb.0:
-    ; CHECK-LABEL: name: no_sched_barrier
-    ; CHECK: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
-    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-    ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
-    ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
-    ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec
-    ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec
-    ; CHECK-NEXT: S_NOP 0
-    ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
-    ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_1]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
-    ; CHECK-NEXT: S_ENDPGM 0
-    %0:sreg_64 = IMPLICIT_DEF
-    %1:vgpr_32 = IMPLICIT_DEF
-    %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
-    %4:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %3, implicit $exec
-    GLOBAL_STORE_DWORD_SADDR %1, %4, %0, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
-    S_NOP 0
-    %5:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
-    %6:vgpr_32 = nsw V_MUL_LO_U32_e64 %5, %5, implicit $exec
-    GLOBAL_STORE_DWORD_SADDR %1, %6, %0, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
-    S_ENDPGM 0
-...
-
----
-name: sched_barrier_0
-tracksRegLiveness: true
-body: |
-  bb.0:
-    ; CHECK-LABEL: name: sched_barrier_0
-    ; CHECK: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
-    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-    ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
-    ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec
-    ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
-    ; CHECK-NEXT: S_NOP 0
-    ; CHECK-NEXT: SCHED_BARRIER 0
-    ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
-    ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec
-    ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_1]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
-    ; CHECK-NEXT: S_ENDPGM 0
-    %0:sreg_64 = IMPLICIT_DEF
-    %1:vgpr_32 = IMPLICIT_DEF
-    %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
-    %4:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %3, implicit $exec
-    GLOBAL_STORE_DWORD_SADDR %1, %4, %0, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
-    S_NOP 0
-    SCHED_BARRIER 0
-    %5:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
-    %6:vgpr_32 = nsw V_MUL_LO_U32_e64 %5, %5, implicit $exec
-    GLOBAL_STORE_DWORD_SADDR %1, %6, %0, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
-    S_ENDPGM 0
-...
-
----
-name: sched_barrier_1
-tracksRegLiveness: true
-body: |
-  bb.0:
-    ; CHECK-LABEL: name: sched_barrier_1
-    ; CHECK: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
-    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-    ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
-    ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec
-    ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
-    ; CHECK-NEXT: SCHED_BARRIER 1
-    ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
-    ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec
-    ; CHECK-NEXT: S_NOP 0
-    ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_1]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
-    ; CHECK-NEXT: S_ENDPGM 0
-    %0:sreg_64 = IMPLICIT_DEF
-    %1:vgpr_32 = IMPLICIT_DEF
-    %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
-    %4:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %3, implicit $exec
-    GLOBAL_STORE_DWORD_SADDR %1, %4, %0, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
-    S_NOP 0
-    SCHED_BARRIER 1
-    %5:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
-    %6:vgpr_32 = nsw V_MUL_LO_U32_e64 %5, %5, implicit $exec
-    GLOBAL_STORE_DWORD_SADDR %1, %6, %0, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
-    S_ENDPGM 0
-...
author	Austin Kerbow <Austin.Kerbow@amd.com>
	Fri, 3 Jun 2022 18:35:47 +0000 (11:35 -0700)
committer	Austin Kerbow <Austin.Kerbow@amd.com>
	Wed, 15 Jun 2022 05:03:05 +0000 (22:03 -0700)
llvm/include/llvm/IR/IntrinsicsAMDGPU.td		patch \| blob \| history
llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp	[new file with mode: 0644]	patch \| blob
llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h	[moved from llvm/lib/Target/AMDGPU/AMDGPUMFMAIGroupLP.h with 82% similarity]	patch \| blob \| history
llvm/lib/Target/AMDGPU/AMDGPUMFMAIGroupLP.cpp	[deleted file]	patch \| blob \| history
llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp		patch \| blob \| history
llvm/lib/Target/AMDGPU/CMakeLists.txt		patch \| blob \| history
llvm/test/CodeGen/AMDGPU/igrouplp-dag-mutation.mir	[moved from llvm/test/CodeGen/AMDGPU/mfma-igrouplp-dag-mutation.mir with 80% similarity]	patch \| blob \| history
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.barrier.ll		patch \| blob \| history
llvm/test/CodeGen/AMDGPU/sched-barrier-post-RA.mir	[new file with mode: 0644]	patch \| blob
llvm/test/CodeGen/AMDGPU/sched-barrier-pre-RA.mir	[new file with mode: 0644]	patch \| blob
llvm/test/CodeGen/AMDGPU/sched_barrier.mir	[deleted file]	patch \| blob \| history