--- /dev/null
+//===--- AMDGPUIGroupLP.cpp - AMDGPU IGroupLP ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// \file This file defines a set of schedule DAG mutations that can be used to
+// override default scheduler behavior to enforce specific scheduling patterns.
+// They should be used in cases where runtime performance considerations such as
+// inter-wavefront interactions, mean that compile-time heuristics cannot
+// predict the optimal instruction ordering, or in kernels where optimum
+// instruction scheduling is important enough to warrant manual intervention.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUIGroupLP.h"
+#include "AMDGPUTargetMachine.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/ADT/BitmaskEnum.h"
+#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "machine-scheduler"
+
+namespace {
+
+static cl::opt<bool>
+ EnableIGroupLP("amdgpu-igrouplp",
+ cl::desc("Enable construction of Instruction Groups and "
+ "their ordering for scheduling"),
+ cl::init(false));
+
+static cl::opt<Optional<unsigned>>
+ VMEMGroupMaxSize("amdgpu-igrouplp-vmem-group-size", cl::init(None),
+ cl::Hidden,
+ cl::desc("The maximum number of instructions to include "
+ "in VMEM group."));
+
+static cl::opt<Optional<unsigned>>
+ MFMAGroupMaxSize("amdgpu-igrouplp-mfma-group-size", cl::init(None),
+ cl::Hidden,
+ cl::desc("The maximum number of instructions to include "
+ "in MFMA group."));
+
+static cl::opt<Optional<unsigned>>
+ LDRGroupMaxSize("amdgpu-igrouplp-ldr-group-size", cl::init(None),
+ cl::Hidden,
+ cl::desc("The maximum number of instructions to include "
+ "in lds/gds read group."));
+
+static cl::opt<Optional<unsigned>>
+ LDWGroupMaxSize("amdgpu-igrouplp-ldw-group-size", cl::init(None),
+ cl::Hidden,
+ cl::desc("The maximum number of instructions to include "
+ "in lds/gds write group."));
+
+typedef function_ref<bool(const MachineInstr &, const SIInstrInfo *)>
+ CanAddMIFn;
+
+// Classify instructions into groups to enable fine tuned control over the
+// scheduler. These groups may be more specific than current SchedModel
+// instruction classes.
+class SchedGroup {
+private:
+ // Function that returns true if a non-bundle MI may be inserted into this
+ // group.
+ const CanAddMIFn canAddMI;
+
+ // Maximum number of SUnits that can be added to this group.
+ Optional<unsigned> MaxSize;
+
+ // Collection of SUnits that are classified as members of this group.
+ SmallVector<SUnit *, 32> Collection;
+
+ ScheduleDAGInstrs *DAG;
+
+ void tryAddEdge(SUnit *A, SUnit *B) {
+ if (A != B && DAG->canAddEdge(B, A)) {
+ DAG->addEdge(B, SDep(A, SDep::Artificial));
+ LLVM_DEBUG(dbgs() << "Adding edge...\n"
+ << "from: SU(" << A->NodeNum << ") " << *A->getInstr()
+ << "to: SU(" << B->NodeNum << ") " << *B->getInstr());
+ }
+ }
+
+public:
+ // Add DAG dependencies from all SUnits in this SchedGroup and this SU. If
+ // MakePred is true, SU will be a predecessor of the SUnits in this
+ // SchedGroup, otherwise SU will be a successor.
+ void link(SUnit &SU, bool MakePred = false) {
+ for (auto A : Collection) {
+ SUnit *B = &SU;
+ if (MakePred)
+ std::swap(A, B);
+
+ tryAddEdge(A, B);
+ }
+ }
+
+ // Add DAG dependencies from all SUnits in this SchedGroup and this SU. Use
+ // the predicate to determine whether SU should be a predecessor (P = true)
+ // or a successor (P = false) of this SchedGroup.
+ void link(SUnit &SU, function_ref<bool(const SUnit *A, const SUnit *B)> P) {
+ for (auto A : Collection) {
+ SUnit *B = &SU;
+ if (P(A, B))
+ std::swap(A, B);
+
+ tryAddEdge(A, B);
+ }
+ }
+
+ // Add DAG dependencies such that SUnits in this group shall be ordered
+ // before SUnits in OtherGroup.
+ void link(SchedGroup &OtherGroup) {
+ for (auto B : OtherGroup.Collection)
+ link(*B);
+ }
+
+ // Returns true if no more instructions may be added to this group.
+ bool isFull() { return MaxSize.hasValue() && Collection.size() >= *MaxSize; }
+
+ // Returns true if SU can be added to this SchedGroup.
+ bool canAddSU(SUnit &SU, const SIInstrInfo *TII) {
+ if (isFull())
+ return false;
+
+ MachineInstr &MI = *SU.getInstr();
+ if (MI.getOpcode() != TargetOpcode::BUNDLE)
+ return canAddMI(MI, TII);
+
+ // Special case for bundled MIs.
+ const MachineBasicBlock *MBB = MI.getParent();
+ MachineBasicBlock::instr_iterator B = MI.getIterator(), E = ++B;
+ while (E != MBB->end() && E->isBundledWithPred())
+ ++E;
+
+ // Return true if all of the bundled MIs can be added to this group.
+ return std::all_of(
+ B, E, [this, TII](MachineInstr &MI) { return canAddMI(MI, TII); });
+ }
+
+ void add(SUnit &SU) { Collection.push_back(&SU); }
+
+ SchedGroup(CanAddMIFn canAddMI, Optional<unsigned> MaxSize,
+ ScheduleDAGInstrs *DAG)
+ : canAddMI(canAddMI), MaxSize(MaxSize), DAG(DAG) {}
+};
+
+bool isMFMASGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
+ return TII->isMFMA(MI);
+}
+
+bool isVALUSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
+ return TII->isVALU(MI) && !TII->isMFMA(MI);
+}
+
+bool isSALUSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
+ return TII->isSALU(MI);
+}
+
+bool isVMEMSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
+ return TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI));
+}
+
+bool isVMEMReadSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
+ return MI.mayLoad() &&
+ (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI)));
+}
+
+bool isVMEMWriteSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
+ return MI.mayStore() &&
+ (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI)));
+}
+
+bool isDSWriteSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
+ return MI.mayStore() && TII->isDS(MI);
+}
+
+bool isDSReadSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
+ return MI.mayLoad() && TII->isDS(MI);
+}
+
+class IGroupLPDAGMutation : public ScheduleDAGMutation {
+public:
+ const SIInstrInfo *TII;
+ ScheduleDAGMI *DAG;
+
+ IGroupLPDAGMutation() = default;
+ void apply(ScheduleDAGInstrs *DAGInstrs) override;
+};
+
+// DAG mutation that coordinates with the SCHED_BARRIER instruction and
+// corresponding builtin. The mutation adds edges from specific instruction
+// classes determined by the SCHED_BARRIER mask so that they cannot be
+// scheduled around the SCHED_BARRIER.
+class SchedBarrierDAGMutation : public ScheduleDAGMutation {
+private:
+ const SIInstrInfo *TII;
+
+ ScheduleDAGMI *DAG;
+
+ // Components of the mask that determines which instructions may not be
+ // scheduled across the SCHED_BARRIER.
+ enum class SchedBarrierMasks {
+ NONE = 0u,
+ ALU = 1u << 0,
+ VALU = 1u << 1,
+ SALU = 1u << 2,
+ MFMA = 1u << 3,
+ VMEM = 1u << 4,
+ VMEM_READ = 1u << 5,
+ VMEM_WRITE = 1u << 6,
+ DS = 1u << 7,
+ DS_READ = 1u << 8,
+ DS_WRITE = 1u << 9,
+ LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ DS_WRITE)
+ };
+
+ // Cache SchedGroups of each type if we have multiple SCHED_BARRIERs in a
+ // region.
+ //
+ std::unique_ptr<SchedGroup> MFMASchedGroup = nullptr;
+ std::unique_ptr<SchedGroup> VALUSchedGroup = nullptr;
+ std::unique_ptr<SchedGroup> SALUSchedGroup = nullptr;
+ std::unique_ptr<SchedGroup> VMEMReadSchedGroup = nullptr;
+ std::unique_ptr<SchedGroup> VMEMWriteSchedGroup = nullptr;
+ std::unique_ptr<SchedGroup> DSWriteSchedGroup = nullptr;
+ std::unique_ptr<SchedGroup> DSReadSchedGroup = nullptr;
+
+ // Use a SCHED_BARRIER's mask to identify instruction SchedGroups that should
+ // not be reordered accross the SCHED_BARRIER.
+ void getSchedGroupsFromMask(int32_t Mask,
+ SmallVectorImpl<SchedGroup *> &SchedGroups);
+
+ // Add DAG edges that enforce SCHED_BARRIER ordering.
+ void addSchedBarrierEdges(SUnit &SU);
+
+ // Classify instructions and add them to the SchedGroup.
+ void initSchedGroup(SchedGroup *SG);
+
+ // Remove all existing edges from a SCHED_BARRIER.
+ void resetSchedBarrierEdges(SUnit &SU);
+
+public:
+ void apply(ScheduleDAGInstrs *DAGInstrs) override;
+
+ SchedBarrierDAGMutation() = default;
+};
+
+void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
+ const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
+ TII = ST.getInstrInfo();
+ DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
+ const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
+ if (!TSchedModel || DAG->SUnits.empty())
+ return;
+
+ LLVM_DEBUG(dbgs() << "Applying IGroupLPDAGMutation...\n");
+
+ // The order of InstructionGroups in this vector defines the
+ // order in which edges will be added. In other words, given the
+ // present ordering, we will try to make each VMEMRead instruction
+ // a predecessor of each DSRead instruction, and so on.
+ SmallVector<SchedGroup, 4> PipelineOrderGroups = {
+ SchedGroup(&isVMEMSGMember, VMEMGroupMaxSize, DAG),
+ SchedGroup(&isDSReadSGMember, LDRGroupMaxSize, DAG),
+ SchedGroup(&isMFMASGMember, MFMAGroupMaxSize, DAG),
+ SchedGroup(&isDSWriteSGMember, LDWGroupMaxSize, DAG)};
+
+ for (SUnit &SU : DAG->SUnits) {
+ LLVM_DEBUG(dbgs() << "Checking Node"; DAG->dumpNode(SU));
+ for (auto &SG : PipelineOrderGroups)
+ if (SG.canAddSU(SU, TII))
+ SG.add(SU);
+ }
+
+ for (unsigned i = 0; i < PipelineOrderGroups.size() - 1; i++) {
+ auto &GroupA = PipelineOrderGroups[i];
+ for (unsigned j = i + 1; j < PipelineOrderGroups.size(); j++) {
+ auto &GroupB = PipelineOrderGroups[j];
+ GroupA.link(GroupB);
+ }
+ }
+}
+
+void SchedBarrierDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
+ const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
+ if (!TSchedModel || DAGInstrs->SUnits.empty())
+ return;
+
+ LLVM_DEBUG(dbgs() << "Applying SchedBarrierDAGMutation...\n");
+
+ const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
+ TII = ST.getInstrInfo();
+ DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
+ for (auto &SU : DAG->SUnits)
+ if (SU.getInstr()->getOpcode() == AMDGPU::SCHED_BARRIER)
+ addSchedBarrierEdges(SU);
+}
+
+void SchedBarrierDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) {
+ MachineInstr &MI = *SchedBarrier.getInstr();
+ assert(MI.getOpcode() == AMDGPU::SCHED_BARRIER);
+ // Remove all existing edges from the SCHED_BARRIER that were added due to the
+ // instruction having side effects.
+ resetSchedBarrierEdges(SchedBarrier);
+ SmallVector<SchedGroup *, 4> SchedGroups;
+ int32_t Mask = MI.getOperand(0).getImm();
+ getSchedGroupsFromMask(Mask, SchedGroups);
+ for (auto SG : SchedGroups)
+ SG->link(
+ SchedBarrier, (function_ref<bool(const SUnit *A, const SUnit *B)>)[](
+ const SUnit *A, const SUnit *B) {
+ return A->NodeNum > B->NodeNum;
+ });
+}
+
+void SchedBarrierDAGMutation::getSchedGroupsFromMask(
+ int32_t Mask, SmallVectorImpl<SchedGroup *> &SchedGroups) {
+ SchedBarrierMasks SBMask = (SchedBarrierMasks)Mask;
+ // See IntrinsicsAMDGPU.td for an explanation of these masks and their
+ // mappings.
+ //
+ if ((SBMask & SchedBarrierMasks::VALU) == SchedBarrierMasks::NONE &&
+ (SBMask & SchedBarrierMasks::ALU) == SchedBarrierMasks::NONE) {
+ if (!VALUSchedGroup) {
+ VALUSchedGroup = std::make_unique<SchedGroup>(isVALUSGMember, None, DAG);
+ initSchedGroup(VALUSchedGroup.get());
+ }
+
+ SchedGroups.push_back(VALUSchedGroup.get());
+ }
+
+ if ((SBMask & SchedBarrierMasks::SALU) == SchedBarrierMasks::NONE &&
+ (SBMask & SchedBarrierMasks::ALU) == SchedBarrierMasks::NONE) {
+ if (!SALUSchedGroup) {
+ SALUSchedGroup = std::make_unique<SchedGroup>(isSALUSGMember, None, DAG);
+ initSchedGroup(SALUSchedGroup.get());
+ }
+
+ SchedGroups.push_back(SALUSchedGroup.get());
+ }
+
+ if ((SBMask & SchedBarrierMasks::MFMA) == SchedBarrierMasks::NONE &&
+ (SBMask & SchedBarrierMasks::ALU) == SchedBarrierMasks::NONE) {
+ if (!MFMASchedGroup) {
+ MFMASchedGroup = std::make_unique<SchedGroup>(isMFMASGMember, None, DAG);
+ initSchedGroup(MFMASchedGroup.get());
+ }
+
+ SchedGroups.push_back(MFMASchedGroup.get());
+ }
+
+ if ((SBMask & SchedBarrierMasks::VMEM_READ) == SchedBarrierMasks::NONE &&
+ (SBMask & SchedBarrierMasks::VMEM) == SchedBarrierMasks::NONE) {
+ if (!VMEMReadSchedGroup) {
+ VMEMReadSchedGroup =
+ std::make_unique<SchedGroup>(isVMEMReadSGMember, None, DAG);
+ initSchedGroup(VMEMReadSchedGroup.get());
+ }
+
+ SchedGroups.push_back(VMEMReadSchedGroup.get());
+ }
+
+ if ((SBMask & SchedBarrierMasks::VMEM_WRITE) == SchedBarrierMasks::NONE &&
+ (SBMask & SchedBarrierMasks::VMEM) == SchedBarrierMasks::NONE) {
+ if (!VMEMWriteSchedGroup) {
+ VMEMWriteSchedGroup =
+ std::make_unique<SchedGroup>(isVMEMWriteSGMember, None, DAG);
+ initSchedGroup(VMEMWriteSchedGroup.get());
+ }
+
+ SchedGroups.push_back(VMEMWriteSchedGroup.get());
+ }
+
+ if ((SBMask & SchedBarrierMasks::DS_READ) == SchedBarrierMasks::NONE &&
+ (SBMask & SchedBarrierMasks::DS) == SchedBarrierMasks::NONE) {
+ if (!DSReadSchedGroup) {
+ DSReadSchedGroup =
+ std::make_unique<SchedGroup>(isDSReadSGMember, None, DAG);
+ initSchedGroup(DSReadSchedGroup.get());
+ }
+
+ SchedGroups.push_back(DSReadSchedGroup.get());
+ }
+
+ if ((SBMask & SchedBarrierMasks::DS_WRITE) == SchedBarrierMasks::NONE &&
+ (SBMask & SchedBarrierMasks::DS) == SchedBarrierMasks::NONE) {
+ if (!DSWriteSchedGroup) {
+ DSWriteSchedGroup =
+ std::make_unique<SchedGroup>(isDSWriteSGMember, None, DAG);
+ initSchedGroup(DSWriteSchedGroup.get());
+ }
+
+ SchedGroups.push_back(DSWriteSchedGroup.get());
+ }
+}
+
+void SchedBarrierDAGMutation::initSchedGroup(SchedGroup *SG) {
+ assert(SG);
+ for (auto &SU : DAG->SUnits)
+ if (SG->canAddSU(SU, TII))
+ SG->add(SU);
+}
+
+void SchedBarrierDAGMutation::resetSchedBarrierEdges(SUnit &SU) {
+ assert(SU.getInstr()->getOpcode() == AMDGPU::SCHED_BARRIER);
+ for (auto &P : SU.Preds)
+ SU.removePred(P);
+
+ for (auto &S : SU.Succs) {
+ for (auto &SP : S.getSUnit()->Preds) {
+ if (SP.getSUnit() == &SU) {
+ S.getSUnit()->removePred(SP);
+ }
+ }
+ }
+}
+
+} // namespace
+
+namespace llvm {
+
+std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation() {
+ return EnableIGroupLP ? std::make_unique<IGroupLPDAGMutation>() : nullptr;
+}
+
+std::unique_ptr<ScheduleDAGMutation> createSchedBarrierDAGMutation() {
+ return std::make_unique<SchedBarrierDAGMutation>();
+}
+
+} // end namespace llvm
--- /dev/null
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=gfx908 -misched-cluster=false -run-pass=postmisched -verify-misched -o - %s | FileCheck %s
+
+--- |
+ define amdgpu_kernel void @no_sched_barrier(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void }
+ define amdgpu_kernel void @sched_barrier_mask_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void }
+ define amdgpu_kernel void @sched_barrier_mask_1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void }
+
+ !0 = distinct !{!0}
+ !1 = !{!1, !0}
+...
+
+---
+name: no_sched_barrier
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: no_sched_barrier
+ ; CHECK: renamable $sgpr0_sgpr1 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr0 = IMPLICIT_DEF
+ ; CHECK-NEXT: BUNDLE implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit $sgpr0_sgpr1, implicit $vgpr0, implicit $exec {
+ ; CHECK-NEXT: renamable $vgpr1 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ ; CHECK-NEXT: renamable $vgpr2 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: renamable $vgpr1 = nsw V_MUL_LO_U32_e64 killed $vgpr1, $vgpr1, implicit $exec
+ ; CHECK-NEXT: renamable $vgpr2 = nsw V_MUL_LO_U32_e64 killed $vgpr2, $vgpr2, implicit $exec
+ ; CHECK-NEXT: BUNDLE implicit killed $vgpr0, implicit killed $vgpr1, implicit killed $sgpr0_sgpr1, implicit $exec, implicit killed $vgpr2 {
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR renamable $vgpr0, killed renamable $vgpr1, renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: S_ENDPGM 0
+ renamable $sgpr0_sgpr1 = IMPLICIT_DEF
+ renamable $vgpr0 = IMPLICIT_DEF
+ BUNDLE implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit $sgpr0_sgpr1, implicit $vgpr0, implicit $exec {
+ renamable $vgpr1 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ renamable $vgpr2 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ }
+ renamable $vgpr1 = nsw V_MUL_LO_U32_e64 killed $vgpr1, $vgpr1, implicit $exec
+ renamable $vgpr2 = nsw V_MUL_LO_U32_e64 killed $vgpr2, $vgpr2, implicit $exec
+ BUNDLE implicit killed $vgpr0, implicit killed $vgpr1, implicit killed $sgpr0_sgpr1, implicit $exec, implicit killed $vgpr2 {
+ GLOBAL_STORE_DWORD_SADDR renamable $vgpr0, killed renamable $vgpr1, renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ }
+ S_ENDPGM 0
+...
+
+# MASK = 0x0000 0000: No instructions may be scheduled across SCHED_BARRIER.
+
+---
+name: sched_barrier_mask_0
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: sched_barrier_mask_0
+ ; CHECK: renamable $sgpr0_sgpr1 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr0 = IMPLICIT_DEF
+ ; CHECK-NEXT: BUNDLE implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit $sgpr0_sgpr1, implicit $vgpr0, implicit $exec {
+ ; CHECK-NEXT: renamable $vgpr1 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ ; CHECK-NEXT: renamable $vgpr2 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: renamable $vgpr1 = nsw V_MUL_LO_U32_e64 killed $vgpr1, $vgpr1, implicit $exec
+ ; CHECK-NEXT: SCHED_BARRIER 0
+ ; CHECK-NEXT: renamable $vgpr2 = nsw V_MUL_LO_U32_e64 killed $vgpr2, $vgpr2, implicit $exec
+ ; CHECK-NEXT: BUNDLE implicit killed $vgpr0, implicit killed $vgpr1, implicit killed $sgpr0_sgpr1, implicit $exec, implicit killed $vgpr2 {
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR renamable $vgpr0, killed renamable $vgpr1, renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: S_ENDPGM 0
+ renamable $sgpr0_sgpr1 = IMPLICIT_DEF
+ renamable $vgpr0 = IMPLICIT_DEF
+ BUNDLE implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit $sgpr0_sgpr1, implicit $vgpr0, implicit $exec {
+ renamable $vgpr1 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ renamable $vgpr2 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ }
+ renamable $vgpr1 = nsw V_MUL_LO_U32_e64 killed $vgpr1, $vgpr1, implicit $exec
+ SCHED_BARRIER 0
+ renamable $vgpr2 = nsw V_MUL_LO_U32_e64 killed $vgpr2, $vgpr2, implicit $exec
+ BUNDLE implicit killed $vgpr0, implicit killed $vgpr1, implicit killed $sgpr0_sgpr1, implicit $exec, implicit killed $vgpr2 {
+ GLOBAL_STORE_DWORD_SADDR renamable $vgpr0, killed renamable $vgpr1, renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ }
+ S_ENDPGM 0
+...
+
+# MASK = 0x0000 0001: ALL, non-memory, non-side-effect producing instructions may be
+# scheduled across SCHED_BARRIER, i.e. allow ALU instructions to pass.
+
+---
+name: sched_barrier_mask_1
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: sched_barrier_mask_1
+ ; CHECK: renamable $sgpr0_sgpr1 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr0 = IMPLICIT_DEF
+ ; CHECK-NEXT: BUNDLE implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit $sgpr0_sgpr1, implicit $vgpr0, implicit $exec {
+ ; CHECK-NEXT: renamable $vgpr1 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ ; CHECK-NEXT: renamable $vgpr2 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: SCHED_BARRIER 1
+ ; CHECK-NEXT: renamable $vgpr1 = nsw V_MUL_LO_U32_e64 killed $vgpr1, $vgpr1, implicit $exec
+ ; CHECK-NEXT: renamable $vgpr2 = nsw V_MUL_LO_U32_e64 killed $vgpr2, $vgpr2, implicit $exec
+ ; CHECK-NEXT: BUNDLE implicit killed $vgpr0, implicit killed $vgpr1, implicit killed $sgpr0_sgpr1, implicit $exec, implicit killed $vgpr2 {
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR renamable $vgpr0, killed renamable $vgpr1, renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: S_ENDPGM 0
+ renamable $sgpr0_sgpr1 = IMPLICIT_DEF
+ renamable $vgpr0 = IMPLICIT_DEF
+ BUNDLE implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit $sgpr0_sgpr1, implicit $vgpr0, implicit $exec {
+ renamable $vgpr1 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ renamable $vgpr2 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ }
+ renamable $vgpr1 = nsw V_MUL_LO_U32_e64 killed $vgpr1, $vgpr1, implicit $exec
+ SCHED_BARRIER 1
+ renamable $vgpr2 = nsw V_MUL_LO_U32_e64 killed $vgpr2, $vgpr2, implicit $exec
+ BUNDLE implicit killed $vgpr0, implicit killed $vgpr1, implicit killed $sgpr0_sgpr1, implicit $exec, implicit killed $vgpr2 {
+ GLOBAL_STORE_DWORD_SADDR renamable $vgpr0, killed renamable $vgpr1, renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ }
+ S_ENDPGM 0
+...
--- /dev/null
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=gfx908 -misched-cluster=false -run-pass=machine-scheduler -verify-misched -o - %s | FileCheck %s
+
+--- |
+ define amdgpu_kernel void @no_sched_barrier(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void }
+ define amdgpu_kernel void @sched_barrier_mask_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void }
+ define amdgpu_kernel void @sched_barrier_mask_1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void }
+ define amdgpu_kernel void @sched_barrier_mask_2(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void }
+ define amdgpu_kernel void @sched_barrier_mask_4(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void }
+ define amdgpu_kernel void @sched_barrier_mask_8(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void }
+ define amdgpu_kernel void @sched_barrier_mask_16(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void }
+ define amdgpu_kernel void @sched_barrier_mask_32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void }
+ define amdgpu_kernel void @sched_barrier_mask_64(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void }
+ define amdgpu_kernel void @sched_barrier_mask_128(i32 addrspace(3)* noalias %out, i32 addrspace(3)* noalias %in) { ret void }
+ define amdgpu_kernel void @sched_barrier_mask_256(i32 addrspace(3)* noalias %out, i32 addrspace(3)* noalias %in) { ret void }
+ define amdgpu_kernel void @sched_barrier_mask_512(i32 addrspace(3)* noalias %out, i32 addrspace(3)* noalias %in) { ret void }
+ define amdgpu_kernel void @sched_barrier_masks_8_12(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void }
+ define amdgpu_kernel void @sched_barrier_mask_4_bundle(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void }
+ define amdgpu_kernel void @sched_barrier_mask_0_bundle(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void }
+
+ !0 = distinct !{!0}
+ !1 = !{!1, !0}
+...
+
+---
+name: no_sched_barrier
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: no_sched_barrier
+ ; CHECK: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec
+ ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec
+ ; CHECK-NEXT: S_NOP 0
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_1]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+ %0:sreg_64 = IMPLICIT_DEF
+ %1:vgpr_32 = IMPLICIT_DEF
+ %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ %4:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %3, implicit $exec
+ GLOBAL_STORE_DWORD_SADDR %1, %4, %0, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ S_NOP 0
+ %5:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ %6:vgpr_32 = nsw V_MUL_LO_U32_e64 %5, %5, implicit $exec
+ GLOBAL_STORE_DWORD_SADDR %1, %6, %0, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ S_ENDPGM 0
+...
+
+# MASK = 0x0000 0000: No instructions may be scheduled across SCHED_BARRIER.
+
+---
+name: sched_barrier_mask_0
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: sched_barrier_mask_0
+ ; CHECK: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ ; CHECK-NEXT: S_NOP 0
+ ; CHECK-NEXT: SCHED_BARRIER 0
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_1]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+ %0:sreg_64 = IMPLICIT_DEF
+ %1:vgpr_32 = IMPLICIT_DEF
+ %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ %4:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %3, implicit $exec
+ GLOBAL_STORE_DWORD_SADDR %1, %4, %0, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ S_NOP 0
+ SCHED_BARRIER 0
+ %5:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ %6:vgpr_32 = nsw V_MUL_LO_U32_e64 %5, %5, implicit $exec
+ GLOBAL_STORE_DWORD_SADDR %1, %6, %0, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ S_ENDPGM 0
+...
+
+# MASK = 0x0000 0001: ALL, non-memory, non-side-effect producing instructions may be
+# scheduled across SCHED_BARRIER, i.e. allow ALU instructions to pass.
+
+---
+name: sched_barrier_mask_1
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: sched_barrier_mask_1
+ ; CHECK: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ ; CHECK-NEXT: SCHED_BARRIER 1
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec
+ ; CHECK-NEXT: S_NOP 0
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_1]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+ %0:sreg_64 = IMPLICIT_DEF
+ %1:vgpr_32 = IMPLICIT_DEF
+ %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ %4:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %3, implicit $exec
+ GLOBAL_STORE_DWORD_SADDR %1, %4, %0, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ S_NOP 0
+ SCHED_BARRIER 1
+ %5:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ %6:vgpr_32 = nsw V_MUL_LO_U32_e64 %5, %5, implicit $exec
+ GLOBAL_STORE_DWORD_SADDR %1, %6, %0, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ S_ENDPGM 0
+...
+
+# MASK = 0x0000 0002: VALU instructions may be scheduled across SCHED_BARRIER.
+
+---
+name: sched_barrier_mask_2
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: sched_barrier_mask_2
+ ; CHECK: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec
+ ; CHECK-NEXT: S_NOP 0
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ ; CHECK-NEXT: SCHED_BARRIER 2
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec
+ ; CHECK-NEXT: [[V_MUL_LO_U32_e64_2:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec
+ ; CHECK-NEXT: [[V_MUL_LO_U32_e64_3:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_3]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_MUL_LO_U32_e64_1]], implicit [[V_MUL_LO_U32_e64_2]]
+ %0:sreg_64 = IMPLICIT_DEF
+ %1:vgpr_32 = IMPLICIT_DEF
+ %2:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ %3:vgpr_32 = nsw V_MUL_LO_U32_e64 %2, %2, implicit $exec
+ GLOBAL_STORE_DWORD_SADDR %1, %3, %0, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ %4:vgpr_32 = nsw V_MUL_LO_U32_e64 %2, %1, implicit $exec
+ %5:vgpr_32 = nsw V_MUL_LO_U32_e64 %2, %1, implicit $exec
+ S_NOP 0
+ SCHED_BARRIER 2
+ %6:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ %7:vgpr_32 = nsw V_MUL_LO_U32_e64 %6, %6, implicit $exec
+ GLOBAL_STORE_DWORD_SADDR %1, %7, %0, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ S_ENDPGM 0, implicit %4, implicit %5
+...
+
+# MASK = 0x0000 0004: SALU instructions may be scheduled across SCHED_BARRIER.
+
+---
+name: sched_barrier_mask_4
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: sched_barrier_mask_4
+ ; CHECK: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ ; CHECK-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF2]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec
+ ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_1:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec
+ ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_2:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_1]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_MUL_LO_U32_e64_2:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec
+ ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_3:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_2]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_4:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_3]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: SCHED_BARRIER 4
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ ; CHECK-NEXT: [[V_MUL_LO_U32_e64_3:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec
+ ; CHECK-NEXT: S_NOP 0
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_3]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_MUL_LO_U32_e64_1]], implicit [[V_MUL_LO_U32_e64_2]], implicit [[V_MFMA_F32_4X4X1F32_e64_4]]
+ %0:sreg_64 = IMPLICIT_DEF
+ %1:vgpr_32 = IMPLICIT_DEF
+ %2:areg_128 = IMPLICIT_DEF
+ %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ %4:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %3, implicit $exec
+ GLOBAL_STORE_DWORD_SADDR %1, %4, %0, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ %5:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %1, implicit $exec
+ %6:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %1, implicit $exec
+ S_NOP 0
+ %7:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %2, 0, 0, 0, implicit $mode, implicit $exec
+ %8:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %7, 0, 0, 0, implicit $mode, implicit $exec
+ %9:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %8, 0, 0, 0, implicit $mode, implicit $exec
+ %10:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %9, 0, 0, 0, implicit $mode, implicit $exec
+ %11:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %10, 0, 0, 0, implicit $mode, implicit $exec
+ SCHED_BARRIER 4
+ %12:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ %13:vgpr_32 = nsw V_MUL_LO_U32_e64 %12, %12, implicit $exec
+ GLOBAL_STORE_DWORD_SADDR %1, %13, %0, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ S_ENDPGM 0, implicit %5, implicit %6, implicit %11
+...
+
+# MASK = 0x0000 0008: MFMA instructions may be scheduled across SCHED_BARRIER.
+
+---
+name: sched_barrier_mask_8
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: sched_barrier_mask_8
+ ; CHECK: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ ; CHECK-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF
+ ; CHECK-NEXT: S_NOP 0
+ ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec
+ ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec
+ ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF2]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_MUL_LO_U32_e64_2:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec
+ ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_1:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_2:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_1]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: SCHED_BARRIER 8
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_3:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_2]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_MUL_LO_U32_e64_3:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec
+ ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_4:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_3]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_3]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_MUL_LO_U32_e64_1]], implicit [[V_MUL_LO_U32_e64_2]], implicit [[V_MFMA_F32_4X4X1F32_e64_4]]
+ %0:sreg_64 = IMPLICIT_DEF
+ %1:vgpr_32 = IMPLICIT_DEF
+ %2:areg_128 = IMPLICIT_DEF
+ %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ %4:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %3, implicit $exec
+ GLOBAL_STORE_DWORD_SADDR %1, %4, %0, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ %5:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %1, implicit $exec
+ %6:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %1, implicit $exec
+ S_NOP 0
+ %7:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %2, 0, 0, 0, implicit $mode, implicit $exec
+ %8:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %7, 0, 0, 0, implicit $mode, implicit $exec
+ %9:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %8, 0, 0, 0, implicit $mode, implicit $exec
+ %10:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %9, 0, 0, 0, implicit $mode, implicit $exec
+ %11:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %10, 0, 0, 0, implicit $mode, implicit $exec
+ SCHED_BARRIER 8
+ %12:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ %13:vgpr_32 = nsw V_MUL_LO_U32_e64 %12, %12, implicit $exec
+ GLOBAL_STORE_DWORD_SADDR %1, %13, %0, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ S_ENDPGM 0, implicit %5, implicit %6, implicit %11
+...
+
+# MASK = 0x0000 0010: ALL VMEM instructions may be scheduled across SCHED_BARRIER.
+
+---
+name: sched_barrier_mask_16
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: sched_barrier_mask_16
+ ; CHECK: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ ; CHECK-NEXT: S_NOP 0
+ ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec
+ ; CHECK-NEXT: SCHED_BARRIER 16
+ ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_1]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+ %0:sreg_64 = IMPLICIT_DEF
+ %1:vgpr_32 = IMPLICIT_DEF
+ %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ %4:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %3, implicit $exec
+ GLOBAL_STORE_DWORD_SADDR %1, %4, %0, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ S_NOP 0
+ SCHED_BARRIER 16
+ %5:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ %6:vgpr_32 = nsw V_MUL_LO_U32_e64 %5, %5, implicit $exec
+ GLOBAL_STORE_DWORD_SADDR %1, %6, %0, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ S_ENDPGM 0
+...
+
+# MASK = 0x0000 0020: VMEM read instructions may be scheduled across SCHED_BARRIER.
+
+---
+name: sched_barrier_mask_32
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: sched_barrier_mask_32
+ ; CHECK: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec
+ ; CHECK-NEXT: S_NOP 0
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ ; CHECK-NEXT: SCHED_BARRIER 32
+ ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_1]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+ %0:sreg_64 = IMPLICIT_DEF
+ %1:vgpr_32 = IMPLICIT_DEF
+ %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ %4:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %3, implicit $exec
+ GLOBAL_STORE_DWORD_SADDR %1, %4, %0, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ S_NOP 0
+ SCHED_BARRIER 32
+ %5:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ %6:vgpr_32 = nsw V_MUL_LO_U32_e64 %5, %5, implicit $exec
+ GLOBAL_STORE_DWORD_SADDR %1, %6, %0, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ S_ENDPGM 0
+...
+
+# MASK = 0x0000 0040: VMEM write instructions may be scheduled across SCHED_BARRIER.
+
+---
+name: sched_barrier_mask_64
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: sched_barrier_mask_64
+ ; CHECK: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ ; CHECK-NEXT: S_NOP 0
+ ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec
+ ; CHECK-NEXT: SCHED_BARRIER 64
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_1]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+ %0:sreg_64 = IMPLICIT_DEF
+ %1:vgpr_32 = IMPLICIT_DEF
+ %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ %4:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %3, implicit $exec
+ GLOBAL_STORE_DWORD_SADDR %1, %4, %0, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ S_NOP 0
+ SCHED_BARRIER 64
+ %5:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ %6:vgpr_32 = nsw V_MUL_LO_U32_e64 %5, %5, implicit $exec
+ GLOBAL_STORE_DWORD_SADDR %1, %6, %0, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ S_ENDPGM 0
+...
+
+# MASK = 0x0000 0080: ALL DS instructions may be scheduled across SCHED_BARRIER.
+
+---
+name: sched_barrier_mask_128
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: sched_barrier_mask_128
+ ; CHECK: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DS_READ_U16_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_U16_gfx9 [[DEF]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 3)
+ ; CHECK-NEXT: [[DS_READ_U16_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_U16_gfx9 [[DEF]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 3)
+ ; CHECK-NEXT: S_NOP 0
+ ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[DS_READ_U16_gfx9_]], [[DS_READ_U16_gfx9_]], implicit $exec
+ ; CHECK-NEXT: SCHED_BARRIER 128
+ ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[DS_READ_U16_gfx9_1]], [[DS_READ_U16_gfx9_1]], implicit $exec
+ ; CHECK-NEXT: dead %0:sreg_64 = IMPLICIT_DEF
+ ; CHECK-NEXT: DS_WRITE_B32 [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 16, implicit $m0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 3)
+ ; CHECK-NEXT: DS_WRITE_B32 [[V_MUL_LO_U32_e64_1]], [[V_MUL_LO_U32_e64_]], 0, 16, implicit $m0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 3)
+ ; CHECK-NEXT: S_ENDPGM 0
+ %0:sreg_64 = IMPLICIT_DEF
+ %1:vgpr_32 = IMPLICIT_DEF
+ %2:vgpr_32 = DS_READ_U16_gfx9 %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 3)
+ %3:vgpr_32 = nsw V_MUL_LO_U32_e64 %2, %2, implicit $exec
+ DS_WRITE_B32 %3, %1, 0, 16, implicit $m0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 3)
+ S_NOP 0
+ SCHED_BARRIER 128
+ %4:vgpr_32 = DS_READ_U16_gfx9 %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 3)
+ %5:vgpr_32 = nsw V_MUL_LO_U32_e64 %4, %4, implicit $exec
+ DS_WRITE_B32 %5, %3, 0, 16, implicit $m0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 3)
+ S_ENDPGM 0
+...
+
+# MASK = 0x0000 0100: ALL DS read instructions may be scheduled across SCHED_BARRIER.
+
+---
+name: sched_barrier_mask_256
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: sched_barrier_mask_256
+ ; CHECK: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DS_READ_U16_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_U16_gfx9 [[DEF]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 3)
+ ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[DS_READ_U16_gfx9_]], [[DS_READ_U16_gfx9_]], implicit $exec
+ ; CHECK-NEXT: [[DS_READ_U16_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_U16_gfx9 [[DEF]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 3)
+ ; CHECK-NEXT: S_NOP 0
+ ; CHECK-NEXT: DS_WRITE_B32 [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 16, implicit $m0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 3)
+ ; CHECK-NEXT: SCHED_BARRIER 256
+ ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[DS_READ_U16_gfx9_1]], [[DS_READ_U16_gfx9_1]], implicit $exec
+ ; CHECK-NEXT: dead %0:sreg_64 = IMPLICIT_DEF
+ ; CHECK-NEXT: DS_WRITE_B32 [[V_MUL_LO_U32_e64_1]], [[V_MUL_LO_U32_e64_]], 0, 16, implicit $m0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 3)
+ ; CHECK-NEXT: S_ENDPGM 0
+ %0:sreg_64 = IMPLICIT_DEF
+ %1:vgpr_32 = IMPLICIT_DEF
+ %2:vgpr_32 = DS_READ_U16_gfx9 %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 3)
+ %3:vgpr_32 = nsw V_MUL_LO_U32_e64 %2, %2, implicit $exec
+ DS_WRITE_B32 %3, %1, 0, 16, implicit $m0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 3)
+ S_NOP 0
+ SCHED_BARRIER 256
+ %4:vgpr_32 = DS_READ_U16_gfx9 %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 3)
+ %5:vgpr_32 = nsw V_MUL_LO_U32_e64 %4, %4, implicit $exec
+ DS_WRITE_B32 %5, %3, 0, 16, implicit $m0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 3)
+ S_ENDPGM 0
+...
+
+# MASK = 0x0000 0200: ALL DS write instructions may be scheduled across SCHED_BARRIER.
+
+---
+name: sched_barrier_mask_512
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: sched_barrier_mask_512
+ ; CHECK: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DS_READ_U16_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_U16_gfx9 [[DEF]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 3)
+ ; CHECK-NEXT: S_NOP 0
+ ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[DS_READ_U16_gfx9_]], [[DS_READ_U16_gfx9_]], implicit $exec
+ ; CHECK-NEXT: SCHED_BARRIER 512
+ ; CHECK-NEXT: [[DS_READ_U16_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_U16_gfx9 [[DEF]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 3)
+ ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[DS_READ_U16_gfx9_1]], [[DS_READ_U16_gfx9_1]], implicit $exec
+ ; CHECK-NEXT: dead %0:sreg_64 = IMPLICIT_DEF
+ ; CHECK-NEXT: DS_WRITE_B32 [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 16, implicit $m0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 3)
+ ; CHECK-NEXT: DS_WRITE_B32 [[V_MUL_LO_U32_e64_1]], [[V_MUL_LO_U32_e64_]], 0, 16, implicit $m0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 3)
+ ; CHECK-NEXT: S_ENDPGM 0
+ %0:sreg_64 = IMPLICIT_DEF
+ %1:vgpr_32 = IMPLICIT_DEF
+ %2:vgpr_32 = DS_READ_U16_gfx9 %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 3)
+ %3:vgpr_32 = nsw V_MUL_LO_U32_e64 %2, %2, implicit $exec
+ DS_WRITE_B32 %3, %1, 0, 16, implicit $m0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 3)
+ S_NOP 0
+ SCHED_BARRIER 512
+ %4:vgpr_32 = DS_READ_U16_gfx9 %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 3)
+ %5:vgpr_32 = nsw V_MUL_LO_U32_e64 %4, %4, implicit $exec
+ DS_WRITE_B32 %5, %3, 0, 16, implicit $m0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 3)
+ S_ENDPGM 0
+...
+
+# MASK = 0x0000 0008: MFMA instructions may be scheduled across SCHED_BARRIER.
+# MASK = 0x0000 000C: MFMA and SALU may be scheduled across SCHED_BARRIER.
+# Check that S_NOP can move moved before the first SCHED_BARRIER but not the second.
+
+---
+name: sched_barrier_masks_8_12
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: sched_barrier_masks_8_12
+ ; CHECK: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ ; CHECK-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF
+ ; CHECK-NEXT: S_NOP 0
+ ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec
+ ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec
+ ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF2]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_MUL_LO_U32_e64_2:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec
+ ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_1:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ ; CHECK-NEXT: SCHED_BARRIER 12
+ ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_2:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_1]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: SCHED_BARRIER 8
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_3:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_2]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_MUL_LO_U32_e64_3:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec
+ ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_4:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_3]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: S_NOP 0
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_3]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_MUL_LO_U32_e64_1]], implicit [[V_MUL_LO_U32_e64_2]], implicit [[V_MFMA_F32_4X4X1F32_e64_4]]
+ %0:sreg_64 = IMPLICIT_DEF
+ %1:vgpr_32 = IMPLICIT_DEF
+ %2:areg_128 = IMPLICIT_DEF
+ %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ %4:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %3, implicit $exec
+ GLOBAL_STORE_DWORD_SADDR %1, %4, %0, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ %5:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %1, implicit $exec
+ %6:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %1, implicit $exec
+ SCHED_BARRIER 12
+ %7:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %2, 0, 0, 0, implicit $mode, implicit $exec
+ %8:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %7, 0, 0, 0, implicit $mode, implicit $exec
+ %9:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %8, 0, 0, 0, implicit $mode, implicit $exec
+ %10:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %9, 0, 0, 0, implicit $mode, implicit $exec
+ %11:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %10, 0, 0, 0, implicit $mode, implicit $exec
+ S_NOP 0
+ SCHED_BARRIER 8
+ S_NOP 0
+ %12:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ %13:vgpr_32 = nsw V_MUL_LO_U32_e64 %12, %12, implicit $exec
+ GLOBAL_STORE_DWORD_SADDR %1, %13, %0, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ S_ENDPGM 0, implicit %5, implicit %6, implicit %11
+...
+
+# MASK = 0x0000 0004: SALU instructions may be scheduled across SCHED_BARRIER.
+
+---
+name: sched_barrier_mask_4_bundle
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: sched_barrier_mask_4_bundle
+ ; CHECK: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec
+ ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec
+ ; CHECK-NEXT: BUNDLE [[GLOBAL_LOAD_DWORD_SADDR]], implicit [[GLOBAL_LOAD_DWORD_SADDR1]] {
+ ; CHECK-NEXT: S_NOP 0
+ ; CHECK-NEXT: S_NOP 0
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: SCHED_BARRIER 4
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_1]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+ %0:sreg_64 = IMPLICIT_DEF
+ %1:vgpr_32 = IMPLICIT_DEF
+ %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ %5:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ %4:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %3, implicit $exec
+ %6:vgpr_32 = nsw V_MUL_LO_U32_e64 %5, %5, implicit $exec
+ SCHED_BARRIER 4
+ BUNDLE implicit %3, %5 {
+ S_NOP 0
+ S_NOP 0
+ }
+ GLOBAL_STORE_DWORD_SADDR %1, %6, %0, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ GLOBAL_STORE_DWORD_SADDR %1, %4, %0, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ S_ENDPGM 0
+...
+
+# MASK = 0x0000 0000: No instructions may be scheduled across SCHED_BARRIER.
+
+---
+name: sched_barrier_mask_0_bundle
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: sched_barrier_mask_0_bundle
+ ; CHECK: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec
+ ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec
+ ; CHECK-NEXT: SCHED_BARRIER 0
+ ; CHECK-NEXT: BUNDLE [[GLOBAL_LOAD_DWORD_SADDR1]], implicit [[GLOBAL_LOAD_DWORD_SADDR]] {
+ ; CHECK-NEXT: S_NOP 0
+ ; CHECK-NEXT: S_NOP 0
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_1]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+ %0:sreg_64 = IMPLICIT_DEF
+ %1:vgpr_32 = IMPLICIT_DEF
+ %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ %5:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ %4:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %3, implicit $exec
+ %6:vgpr_32 = nsw V_MUL_LO_U32_e64 %5, %5, implicit $exec
+ SCHED_BARRIER 0
+ BUNDLE implicit %3, %5 {
+ S_NOP 0
+ S_NOP 0
+ }
+ GLOBAL_STORE_DWORD_SADDR %1, %6, %0, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ GLOBAL_STORE_DWORD_SADDR %1, %4, %0, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ S_ENDPGM 0
+...