BUILTIN(__builtin_amdgcn_s_barrier, "v", "n")
BUILTIN(__builtin_amdgcn_wave_barrier, "v", "n")
BUILTIN(__builtin_amdgcn_sched_barrier, "vIi", "n")
+BUILTIN(__builtin_amdgcn_sched_group_barrier, "vIiIiIi", "n")
BUILTIN(__builtin_amdgcn_s_dcache_inv, "v", "n")
BUILTIN(__builtin_amdgcn_buffer_wbinvl1, "v", "n")
BUILTIN(__builtin_amdgcn_ds_gws_init, "vUiUi", "n")
__builtin_amdgcn_sched_barrier(15);
}
+// CHECK-LABEL: @test_sched_group_barrier
+// CHECK: call void @llvm.amdgcn.sched.group.barrier(i32 0, i32 1, i32 2)
+// CHECK: call void @llvm.amdgcn.sched.group.barrier(i32 1, i32 2, i32 4)
+// CHECK: call void @llvm.amdgcn.sched.group.barrier(i32 4, i32 8, i32 16)
+// CHECK: call void @llvm.amdgcn.sched.group.barrier(i32 15, i32 10000, i32 -1)
+void test_sched_group_barrier()
+{
+ __builtin_amdgcn_sched_group_barrier(0, 1, 2);
+ __builtin_amdgcn_sched_group_barrier(1, 2, 4);
+ __builtin_amdgcn_sched_group_barrier(4, 8, 16);
+ __builtin_amdgcn_sched_group_barrier(15, 10000, -1);
+}
+
// CHECK-LABEL: @test_s_sleep
// CHECK: call void @llvm.amdgcn.s.sleep(i32 1)
// CHECK: call void @llvm.amdgcn.s.sleep(i32 15)
__builtin_amdgcn_sched_barrier(x); // expected-error {{argument to '__builtin_amdgcn_sched_barrier' must be a constant integer}}
}
+void test_sched_group_barrier(int x)
+{
+ __builtin_amdgcn_sched_group_barrier(x, 0, 1); // expected-error {{argument to '__builtin_amdgcn_sched_group_barrier' must be a constant integer}}
+ __builtin_amdgcn_sched_group_barrier(0, x, 1); // expected-error {{argument to '__builtin_amdgcn_sched_group_barrier' must be a constant integer}}
+ __builtin_amdgcn_sched_group_barrier(0, 1, x); // expected-error {{argument to '__builtin_amdgcn_sched_group_barrier' must be a constant integer}}
+}
+
void test_sicmp_i32(global ulong* out, int a, int b, uint c)
{
*out = __builtin_amdgcn_sicmp(a, b, c); // expected-error {{argument to '__builtin_amdgcn_sicmp' must be a constant integer}}
Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent,
IntrWillReturn]>;
+// The first parameter is a mask that determines the types of instructions that
+// you would like to synchronize around and add to a scheduling group. The
+// values of the mask are defined above for sched_barrier. These instructions
+// will be selected from the bottom up starting from the sched_group_barrier's
+// location during instruction scheduling. The second parameter is the number of
+// matching instructions that will be associated with this sched_group_barrier.
+// The third parameter is an identifier used to determine which other
+// sched_group_barriers this barrier should synchronize with.
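+// For example, using the mask values defined above for sched_barrier,
+//   __builtin_amdgcn_sched_group_barrier(8, 5, 0)
+// creates a group of up to 5 MFMA instructions (mask 0x8) that is ordered
+// relative to other groups created with SyncID 0.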
+def int_amdgcn_sched_group_barrier : ClangBuiltin<"__builtin_amdgcn_sched_group_barrier">,
+ Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>, IntrNoMem, IntrHasSideEffects,
+ IntrConvergent, IntrWillReturn]>;
+
def int_amdgcn_s_waitcnt : ClangBuiltin<"__builtin_amdgcn_s_waitcnt">,
Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrWillReturn]>;
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/BitmaskEnum.h"
+#include "llvm/ADT/DenseMap.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetOpcodes.h"
cl::desc("The maximum number of instructions to include "
"in lds/gds write group."));
-typedef function_ref<bool(const MachineInstr &, const SIInstrInfo *)>
- CanAddMIFn;
+// Components of the mask that determines which instruction types may be
+// classified into a SchedGroup.
+enum class SchedGroupMask {
+ NONE = 0u,
+ ALU = 1u << 0,
+ VALU = 1u << 1,
+ SALU = 1u << 2,
+ MFMA = 1u << 3,
+ VMEM = 1u << 4,
+ VMEM_READ = 1u << 5,
+ VMEM_WRITE = 1u << 6,
+ DS = 1u << 7,
+ DS_READ = 1u << 8,
+ DS_WRITE = 1u << 9,
+ ALL = ALU | VALU | SALU | MFMA | VMEM | VMEM_READ | VMEM_WRITE | DS |
+ DS_READ | DS_WRITE,
+ LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
+};
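+
+// Mask bits may be combined; for example, a mask of 0x6 (VALU | SALU) matches
+// both vector and scalar ALU instructions.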
// Classify instructions into groups to enable fine tuned control over the
// scheduler. These groups may be more specific than current SchedModel
// instruction classes.
class SchedGroup {
private:
- // Function that returns true if a non-bundle MI may be inserted into this
- // group.
- const CanAddMIFn canAddMI;
+ // Mask that defines which instruction types can be classified into this
+ // SchedGroup. The instruction types correspond to the mask from SCHED_BARRIER
+ // and SCHED_GROUP_BARRIER.
+ SchedGroupMask SGMask;
// Maximum number of SUnits that can be added to this group.
Optional<unsigned> MaxSize;
+ // SchedGroups will only synchronize with other SchedGroups that have the same
+ // SyncID.
+ int SyncID = 0;
+
// Collection of SUnits that are classified as members of this group.
SmallVector<SUnit *, 32> Collection;
ScheduleDAGInstrs *DAG;
- void tryAddEdge(SUnit *A, SUnit *B) {
- if (A != B && DAG->canAddEdge(B, A)) {
- DAG->addEdge(B, SDep(A, SDep::Artificial));
- LLVM_DEBUG(dbgs() << "Adding edge...\n"
- << "from: SU(" << A->NodeNum << ") " << *A->getInstr()
- << "to: SU(" << B->NodeNum << ") " << *B->getInstr());
- }
+ const SIInstrInfo *TII;
+
+ // Try to add an edge from SU A to SU B.
+ bool tryAddEdge(SUnit *A, SUnit *B);
+
+ // Use SGMask to determine whether we can classify MI as a member of this
+ // SchedGroup object.
+ bool canAddMI(const MachineInstr &MI) const;
+
+ // Returns true if SU can be added to this SchedGroup.
+ bool canAddSU(SUnit &SU) const;
+
+ // Returns true if no more instructions may be added to this group.
+ bool isFull() const;
+
+ // Add SU to the SchedGroup.
+ void add(SUnit &SU) {
+ LLVM_DEBUG(dbgs() << "For SchedGroup with mask "
+ << format_hex((int)SGMask, 10, true) << " adding "
+ << *SU.getInstr());
+ Collection.push_back(&SU);
}
public:
// Add DAG dependencies from all SUnits in this SchedGroup and this SU. If
// MakePred is true, SU will be a predecessor of the SUnits in this
// SchedGroup, otherwise SU will be a successor.
- void link(SUnit &SU, bool MakePred = false) {
- for (auto A : Collection) {
- SUnit *B = &SU;
- if (MakePred)
- std::swap(A, B);
-
- tryAddEdge(A, B);
- }
- }
+ void link(SUnit &SU, bool MakePred = false);
// Add DAG dependencies from all SUnits in this SchedGroup and this SU. Use
// the predicate to determine whether SU should be a predecessor (P = true)
// or a successor (P = false) of this SchedGroup.
- void link(SUnit &SU, function_ref<bool(const SUnit *A, const SUnit *B)> P) {
- for (auto A : Collection) {
- SUnit *B = &SU;
- if (P(A, B))
- std::swap(A, B);
-
- tryAddEdge(A, B);
- }
- }
+ void link(SUnit &SU, function_ref<bool(const SUnit *A, const SUnit *B)> P);
// Add DAG dependencies such that SUnits in this group shall be ordered
// before SUnits in OtherGroup.
- void link(SchedGroup &OtherGroup) {
- for (auto B : OtherGroup.Collection)
- link(*B);
- }
+ void link(SchedGroup &OtherGroup);
- // Returns true if no more instructions may be added to this group.
- bool isFull() { return MaxSize && Collection.size() >= *MaxSize; }
- // Returns true if SU can be added to this SchedGroup.
- bool canAddSU(SUnit &SU, const SIInstrInfo *TII) {
- if (isFull())
- return false;
-
- MachineInstr &MI = *SU.getInstr();
- if (MI.getOpcode() != TargetOpcode::BUNDLE)
- return canAddMI(MI, TII);
+ // Identify and add all relevant SUs from the DAG to this SchedGroup.
+ void initSchedGroup();
- // Special case for bundled MIs.
- const MachineBasicBlock *MBB = MI.getParent();
- MachineBasicBlock::instr_iterator B = MI.getIterator(), E = ++B;
- while (E != MBB->end() && E->isBundledWithPred())
- ++E;
+ // Add instructions to the SchedGroup bottom up starting from RIter.
+ // ConflictedInstrs is a set of instructions that should not be added to the
+ // SchedGroup even when the other conditions for adding it are satisfied.
+ // RIter will be added to the SchedGroup as well, and dependencies will be
+ // added so that RIter will always be scheduled at the end of the group.
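+ // For example, for "SCHED_GROUP_BARRIER 8, 1, 0" the single MFMA instruction
+ // nearest above the barrier that fits the pipeline is collected, as in the
+ // MIR tests below.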
+ void initSchedGroup(std::vector<SUnit>::reverse_iterator RIter,
+ DenseSet<SUnit *> &ConflictedInstrs);
- // Return true if all of the bundled MIs can be added to this group.
- return std::all_of(
- B, E, [this, TII](MachineInstr &MI) { return canAddMI(MI, TII); });
- }
+ int getSyncID() { return SyncID; }
- void add(SUnit &SU) { Collection.push_back(&SU); }
+ SchedGroup(SchedGroupMask SGMask, Optional<unsigned> MaxSize,
+ ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
+ : SGMask(SGMask), MaxSize(MaxSize), DAG(DAG), TII(TII) {}
- SchedGroup(CanAddMIFn canAddMI, Optional<unsigned> MaxSize,
- ScheduleDAGInstrs *DAG)
- : canAddMI(canAddMI), MaxSize(MaxSize), DAG(DAG) {}
+ SchedGroup(SchedGroupMask SGMask, Optional<unsigned> MaxSize, int SyncID,
+ ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
+ : SGMask(SGMask), MaxSize(MaxSize), SyncID(SyncID), DAG(DAG), TII(TII) {}
};
-bool isMFMASGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
- return TII->isMFMA(MI);
-}
-
-bool isVALUSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
- return TII->isVALU(MI) && !TII->isMFMA(MI);
-}
-
-bool isSALUSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
- return TII->isSALU(MI);
-}
-
-bool isVMEMSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
- return TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI));
-}
-
-bool isVMEMReadSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
- return MI.mayLoad() &&
- (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI)));
-}
-
-bool isVMEMWriteSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
- return MI.mayStore() &&
- (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI)));
-}
-
-bool isDSWriteSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
- return MI.mayStore() && TII->isDS(MI);
-}
-
-bool isDSReadSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
- return MI.mayLoad() && TII->isDS(MI);
-}
-
class IGroupLPDAGMutation : public ScheduleDAGMutation {
public:
const SIInstrInfo *TII;
// DAG mutation that coordinates with the SCHED_BARRIER instruction and
// corresponding builtin. The mutation adds edges from specific instruction
// classes determined by the SCHED_BARRIER mask so that they cannot be
// scheduled around the SCHED_BARRIER.
class SchedBarrierDAGMutation : public ScheduleDAGMutation {
private:
const SIInstrInfo *TII;
ScheduleDAGMI *DAG;
- // Components of the mask that determines which instructions may not be
- // scheduled across the SCHED_BARRIER.
- enum class SchedBarrierMasks {
- NONE = 0u,
- ALU = 1u << 0,
- VALU = 1u << 1,
- SALU = 1u << 2,
- MFMA = 1u << 3,
- VMEM = 1u << 4,
- VMEM_READ = 1u << 5,
- VMEM_WRITE = 1u << 6,
- DS = 1u << 7,
- DS_READ = 1u << 8,
- DS_WRITE = 1u << 9,
- LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ DS_WRITE)
- };
-
- // Cache SchedGroups of each type if we have multiple SCHED_BARRIERs in a
- // region.
- //
- std::unique_ptr<SchedGroup> MFMASchedGroup = nullptr;
- std::unique_ptr<SchedGroup> VALUSchedGroup = nullptr;
- std::unique_ptr<SchedGroup> SALUSchedGroup = nullptr;
- std::unique_ptr<SchedGroup> VMEMReadSchedGroup = nullptr;
- std::unique_ptr<SchedGroup> VMEMWriteSchedGroup = nullptr;
- std::unique_ptr<SchedGroup> DSWriteSchedGroup = nullptr;
- std::unique_ptr<SchedGroup> DSReadSchedGroup = nullptr;
+ // Organize lists of SchedGroups by their SyncID. SchedGroups /
+ // SCHED_GROUP_BARRIERs with different SyncIDs will have no edges added
+ // between them.
+ DenseMap<int, SmallVector<SchedGroup, 4>> SyncedSchedGroupsMap;
- // Use a SCHED_BARRIER's mask to identify instruction SchedGroups that should
- // not be reordered accross the SCHED_BARRIER.
- void getSchedGroupsFromMask(int32_t Mask,
- SmallVectorImpl<SchedGroup *> &SchedGroups);
+ // Used to track instructions that are already added to a different
+ // SchedGroup with the same SyncID.
+ DenseMap<int, DenseSet<SUnit *>> SyncedInstrsMap;
// Add DAG edges that enforce SCHED_BARRIER ordering.
void addSchedBarrierEdges(SUnit &SU);
- // Classify instructions and add them to the SchedGroup.
- void initSchedGroup(SchedGroup *SG);
-
- // Remove all existing edges from a SCHED_BARRIER.
- void resetSchedBarrierEdges(SUnit &SU);
+ // Use a SCHED_BARRIER's mask to identify instruction SchedGroups that should
+ // not be reordered across the SCHED_BARRIER. This is used for the base
+ // SCHED_BARRIER, and not SCHED_GROUP_BARRIER. The difference is that
+ // SCHED_BARRIER will always block all instructions that can be classified
+ // into a particular SchedClass, whereas SCHED_GROUP_BARRIER has a fixed size
+ // and may only synchronize with some SchedGroups. Returns the inverse of
+ // Mask. SCHED_BARRIER's mask describes which instruction types should be
+ // allowed to be scheduled across it. Invert the mask to get the
+ // SchedGroupMask of instructions that should be barred.
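+ // For example, a SCHED_BARRIER mask of 0x1 (only ALU may cross) inverts to a
+ // SchedGroupMask that bars all VMEM and DS instruction types, since ALU
+ // implies VALU, SALU, and MFMA.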
+ SchedGroupMask invertSchedBarrierMask(SchedGroupMask Mask) const;
+
+ // Create SchedGroups for a SCHED_GROUP_BARRIER.
+ void initSchedGroupBarrier(std::vector<SUnit>::reverse_iterator RIter);
+
+ // Add DAG edges that try to enforce ordering defined by SCHED_GROUP_BARRIER
+ // instructions.
+ void addSchedGroupBarrierEdges();
public:
void apply(ScheduleDAGInstrs *DAGInstrs) override;
SchedBarrierDAGMutation() = default;
};
+bool SchedGroup::tryAddEdge(SUnit *A, SUnit *B) {
+ if (A != B && DAG->canAddEdge(B, A)) {
+ DAG->addEdge(B, SDep(A, SDep::Artificial));
+ LLVM_DEBUG(dbgs() << "Adding edge...\n"
+ << "from: SU(" << A->NodeNum << ") " << *A->getInstr()
+ << "to: SU(" << B->NodeNum << ") " << *B->getInstr());
+ return true;
+ }
+ return false;
+}
+
+bool SchedGroup::canAddMI(const MachineInstr &MI) const {
+ bool Result = false;
+ if (MI.isMetaInstruction() || MI.getOpcode() == AMDGPU::SCHED_BARRIER ||
+ MI.getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
+ Result = false;
+
+ else if (((SGMask & SchedGroupMask::ALU) != SchedGroupMask::NONE) &&
+ (TII->isVALU(MI) || TII->isMFMA(MI) || TII->isSALU(MI)))
+ Result = true;
+
+ else if (((SGMask & SchedGroupMask::VALU) != SchedGroupMask::NONE) &&
+ TII->isVALU(MI) && !TII->isMFMA(MI))
+ Result = true;
+
+ else if (((SGMask & SchedGroupMask::SALU) != SchedGroupMask::NONE) &&
+ TII->isSALU(MI))
+ Result = true;
+
+ else if (((SGMask & SchedGroupMask::MFMA) != SchedGroupMask::NONE) &&
+ TII->isMFMA(MI))
+ Result = true;
+
+ else if (((SGMask & SchedGroupMask::VMEM) != SchedGroupMask::NONE) &&
+ (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))
+ Result = true;
+
+ else if (((SGMask & SchedGroupMask::VMEM_READ) != SchedGroupMask::NONE) &&
+ MI.mayLoad() &&
+ (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))
+ Result = true;
+
+ else if (((SGMask & SchedGroupMask::VMEM_WRITE) != SchedGroupMask::NONE) &&
+ MI.mayStore() &&
+ (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))
+ Result = true;
+
+ else if (((SGMask & SchedGroupMask::DS) != SchedGroupMask::NONE) &&
+ TII->isDS(MI))
+ Result = true;
+
+ else if (((SGMask & SchedGroupMask::DS_READ) != SchedGroupMask::NONE) &&
+ MI.mayLoad() && TII->isDS(MI))
+ Result = true;
+
+ else if (((SGMask & SchedGroupMask::DS_WRITE) != SchedGroupMask::NONE) &&
+ MI.mayStore() && TII->isDS(MI))
+ Result = true;
+
+ LLVM_DEBUG(
+ dbgs() << "For SchedGroup with mask " << format_hex((int)SGMask, 10, true)
+ << (Result ? " could classify " : " unable to classify ") << MI);
+
+ return Result;
+}
+
+void SchedGroup::link(SUnit &SU, bool MakePred) {
+ for (auto A : Collection) {
+ SUnit *B = &SU;
+ if (MakePred)
+ std::swap(A, B);
+
+ tryAddEdge(A, B);
+ }
+}
+
+void SchedGroup::link(SUnit &SU,
+ function_ref<bool(const SUnit *A, const SUnit *B)> P) {
+ for (auto A : Collection) {
+ SUnit *B = &SU;
+ if (P(A, B))
+ std::swap(A, B);
+
+ tryAddEdge(A, B);
+ }
+}
+
+void SchedGroup::link(SchedGroup &OtherGroup) {
+ for (auto B : OtherGroup.Collection)
+ link(*B);
+}
+
+bool SchedGroup::isFull() const {
+ return MaxSize && Collection.size() >= *MaxSize;
+}
+
+bool SchedGroup::canAddSU(SUnit &SU) const {
+ MachineInstr &MI = *SU.getInstr();
+ if (MI.getOpcode() != TargetOpcode::BUNDLE)
+ return canAddMI(MI);
+
+ // Special case for bundled MIs.
+ const MachineBasicBlock *MBB = MI.getParent();
+ MachineBasicBlock::instr_iterator B = MI.getIterator(), E = ++B;
+ while (E != MBB->end() && E->isBundledWithPred())
+ ++E;
+
+ // Return true if all of the bundled MIs can be added to this group.
+ return std::all_of(B, E, [this](MachineInstr &MI) { return canAddMI(MI); });
+}
+
+void SchedGroup::initSchedGroup() {
+ for (auto &SU : DAG->SUnits) {
+ if (isFull())
+ break;
+
+ if (canAddSU(SU))
+ add(SU);
+ }
+}
+
+static bool canFitIntoPipeline(SUnit &SU, ScheduleDAGInstrs *DAG,
+ DenseSet<SUnit *> &ConflictedInstrs) {
+ return std::all_of(
+ ConflictedInstrs.begin(), ConflictedInstrs.end(),
+ [DAG, &SU](SUnit *SuccSU) { return DAG->canAddEdge(SuccSU, &SU); });
+}
+
+void SchedGroup::initSchedGroup(std::vector<SUnit>::reverse_iterator RIter,
+ DenseSet<SUnit *> &ConflictedInstrs) {
+ SUnit &InitSU = *RIter;
+ for (auto E = DAG->SUnits.rend(); RIter != E; ++RIter) {
+ auto &SU = *RIter;
+ if (isFull())
+ break;
+
+ if (canAddSU(SU) && !ConflictedInstrs.count(&SU) &&
+ canFitIntoPipeline(SU, DAG, ConflictedInstrs)) {
+ add(SU);
+ ConflictedInstrs.insert(&SU);
+ tryAddEdge(&SU, &InitSU);
+ }
+ }
+
+ add(InitSU);
+ assert(MaxSize);
+ (*MaxSize)++;
+}
+
+// Create a pipeline from the SchedGroups in PipelineOrderGroups such that we
+// try to enforce the relative ordering of instructions in each group.
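+// For example, groups [A, B, C] get edges A->B, A->C, and B->C, so that A's
+// instructions are ordered before B's and B's before C's.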
+static void makePipeline(SmallVectorImpl<SchedGroup> &PipelineOrderGroups) {
+ auto I = PipelineOrderGroups.begin();
+ auto E = PipelineOrderGroups.end();
+ for (; I != E; ++I) {
+ auto &GroupA = *I;
+ for (auto J = std::next(I); J != E; ++J) {
+ auto &GroupB = *J;
+ GroupA.link(GroupB);
+ }
+ }
+}
+
+// Same as makePipeline but with reverse ordering.
+static void
+makeReversePipeline(SmallVectorImpl<SchedGroup> &PipelineOrderGroups) {
+ auto I = PipelineOrderGroups.rbegin();
+ auto E = PipelineOrderGroups.rend();
+ for (; I != E; ++I) {
+ auto &GroupA = *I;
+ for (auto J = std::next(I); J != E; ++J) {
+ auto &GroupB = *J;
+ GroupA.link(GroupB);
+ }
+ }
+}
+
void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
TII = ST.getInstrInfo();
// present ordering, we will try to make each VMEMRead instruction
// a predecessor of each DSRead instruction, and so on.
SmallVector<SchedGroup, 4> PipelineOrderGroups = {
- SchedGroup(isVMEMSGMember, VMEMGroupMaxSize, DAG),
- SchedGroup(isDSReadSGMember, LDRGroupMaxSize, DAG),
- SchedGroup(isMFMASGMember, MFMAGroupMaxSize, DAG),
- SchedGroup(isDSWriteSGMember, LDWGroupMaxSize, DAG)};
-
- for (SUnit &SU : DAG->SUnits) {
- LLVM_DEBUG(dbgs() << "Checking Node"; DAG->dumpNode(SU));
- for (auto &SG : PipelineOrderGroups)
- if (SG.canAddSU(SU, TII))
- SG.add(SU);
- }
+ SchedGroup(SchedGroupMask::VMEM, VMEMGroupMaxSize, DAG, TII),
+ SchedGroup(SchedGroupMask::DS_READ, LDRGroupMaxSize, DAG, TII),
+ SchedGroup(SchedGroupMask::MFMA, MFMAGroupMaxSize, DAG, TII),
+ SchedGroup(SchedGroupMask::DS_WRITE, LDWGroupMaxSize, DAG, TII)};
- for (unsigned i = 0; i < PipelineOrderGroups.size() - 1; i++) {
- auto &GroupA = PipelineOrderGroups[i];
- for (unsigned j = i + 1; j < PipelineOrderGroups.size(); j++) {
- auto &GroupB = PipelineOrderGroups[j];
- GroupA.link(GroupB);
- }
- }
+ for (auto &SG : PipelineOrderGroups)
+ SG.initSchedGroup();
+
+ makePipeline(PipelineOrderGroups);
+}
+
+// Remove all existing edges from a SCHED_BARRIER or SCHED_GROUP_BARRIER.
+static void resetEdges(SUnit &SU, ScheduleDAGInstrs *DAG) {
+ assert(SU.getInstr()->getOpcode() == AMDGPU::SCHED_BARRIER ||
+ SU.getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER);
+
+ while (!SU.Preds.empty())
+ for (auto &P : SU.Preds)
+ SU.removePred(P);
+
+ while (!SU.Succs.empty())
+ for (auto &S : SU.Succs)
+ for (auto &SP : S.getSUnit()->Preds)
+ if (SP.getSUnit() == &SU)
+ S.getSUnit()->removePred(SP);
}
void SchedBarrierDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
return;
LLVM_DEBUG(dbgs() << "Applying SchedBarrierDAGMutation...\n");
-
const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
TII = ST.getInstrInfo();
DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
- for (auto &SU : DAG->SUnits)
- if (SU.getInstr()->getOpcode() == AMDGPU::SCHED_BARRIER)
- addSchedBarrierEdges(SU);
+ SyncedInstrsMap.clear();
+ SyncedSchedGroupsMap.clear();
+ for (auto R = DAG->SUnits.rbegin(), E = DAG->SUnits.rend(); R != E; ++R) {
+ if (R->getInstr()->getOpcode() == AMDGPU::SCHED_BARRIER)
+ addSchedBarrierEdges(*R);
+
+ else if (R->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
+ initSchedGroupBarrier(R);
+ }
+
+ // SCHED_GROUP_BARRIER edges can only be added after we have found and
+ // initialized all of the SCHED_GROUP_BARRIER SchedGroups.
+ addSchedGroupBarrierEdges();
}
void SchedBarrierDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) {
assert(MI.getOpcode() == AMDGPU::SCHED_BARRIER);
// Remove all existing edges from the SCHED_BARRIER that were added due to the
// instruction having side effects.
- resetSchedBarrierEdges(SchedBarrier);
- SmallVector<SchedGroup *, 4> SchedGroups;
- int32_t Mask = MI.getOperand(0).getImm();
- getSchedGroupsFromMask(Mask, SchedGroups);
- for (auto SG : SchedGroups)
- SG->link(
- SchedBarrier, (function_ref<bool(const SUnit *A, const SUnit *B)>)[](
- const SUnit *A, const SUnit *B) {
- return A->NodeNum > B->NodeNum;
- });
+ resetEdges(SchedBarrier, DAG);
+ auto InvertedMask =
+ invertSchedBarrierMask((SchedGroupMask)MI.getOperand(0).getImm());
+ SchedGroup SG(InvertedMask, None, DAG, TII);
+ SG.initSchedGroup();
+ // Preserve original instruction ordering relative to the SCHED_BARRIER.
+ SG.link(
+ SchedBarrier,
+ (function_ref<bool(const SUnit *A, const SUnit *B)>)[](
+ const SUnit *A, const SUnit *B) { return A->NodeNum > B->NodeNum; });
}
-void SchedBarrierDAGMutation::getSchedGroupsFromMask(
- int32_t Mask, SmallVectorImpl<SchedGroup *> &SchedGroups) {
- SchedBarrierMasks SBMask = (SchedBarrierMasks)Mask;
- // See IntrinsicsAMDGPU.td for an explanation of these masks and their
- // mappings.
- //
- if ((SBMask & SchedBarrierMasks::VALU) == SchedBarrierMasks::NONE &&
- (SBMask & SchedBarrierMasks::ALU) == SchedBarrierMasks::NONE) {
- if (!VALUSchedGroup) {
- VALUSchedGroup = std::make_unique<SchedGroup>(isVALUSGMember, None, DAG);
- initSchedGroup(VALUSchedGroup.get());
- }
-
- SchedGroups.push_back(VALUSchedGroup.get());
- }
-
- if ((SBMask & SchedBarrierMasks::SALU) == SchedBarrierMasks::NONE &&
- (SBMask & SchedBarrierMasks::ALU) == SchedBarrierMasks::NONE) {
- if (!SALUSchedGroup) {
- SALUSchedGroup = std::make_unique<SchedGroup>(isSALUSGMember, None, DAG);
- initSchedGroup(SALUSchedGroup.get());
- }
-
- SchedGroups.push_back(SALUSchedGroup.get());
- }
-
- if ((SBMask & SchedBarrierMasks::MFMA) == SchedBarrierMasks::NONE &&
- (SBMask & SchedBarrierMasks::ALU) == SchedBarrierMasks::NONE) {
- if (!MFMASchedGroup) {
- MFMASchedGroup = std::make_unique<SchedGroup>(isMFMASGMember, None, DAG);
- initSchedGroup(MFMASchedGroup.get());
- }
-
- SchedGroups.push_back(MFMASchedGroup.get());
- }
-
- if ((SBMask & SchedBarrierMasks::VMEM_READ) == SchedBarrierMasks::NONE &&
- (SBMask & SchedBarrierMasks::VMEM) == SchedBarrierMasks::NONE) {
- if (!VMEMReadSchedGroup) {
- VMEMReadSchedGroup =
- std::make_unique<SchedGroup>(isVMEMReadSGMember, None, DAG);
- initSchedGroup(VMEMReadSchedGroup.get());
- }
-
- SchedGroups.push_back(VMEMReadSchedGroup.get());
- }
-
- if ((SBMask & SchedBarrierMasks::VMEM_WRITE) == SchedBarrierMasks::NONE &&
- (SBMask & SchedBarrierMasks::VMEM) == SchedBarrierMasks::NONE) {
- if (!VMEMWriteSchedGroup) {
- VMEMWriteSchedGroup =
- std::make_unique<SchedGroup>(isVMEMWriteSGMember, None, DAG);
- initSchedGroup(VMEMWriteSchedGroup.get());
- }
-
- SchedGroups.push_back(VMEMWriteSchedGroup.get());
- }
-
- if ((SBMask & SchedBarrierMasks::DS_READ) == SchedBarrierMasks::NONE &&
- (SBMask & SchedBarrierMasks::DS) == SchedBarrierMasks::NONE) {
- if (!DSReadSchedGroup) {
- DSReadSchedGroup =
- std::make_unique<SchedGroup>(isDSReadSGMember, None, DAG);
- initSchedGroup(DSReadSchedGroup.get());
- }
-
- SchedGroups.push_back(DSReadSchedGroup.get());
- }
-
- if ((SBMask & SchedBarrierMasks::DS_WRITE) == SchedBarrierMasks::NONE &&
- (SBMask & SchedBarrierMasks::DS) == SchedBarrierMasks::NONE) {
- if (!DSWriteSchedGroup) {
- DSWriteSchedGroup =
- std::make_unique<SchedGroup>(isDSWriteSGMember, None, DAG);
- initSchedGroup(DSWriteSchedGroup.get());
- }
-
- SchedGroups.push_back(DSWriteSchedGroup.get());
- }
+SchedGroupMask
+SchedBarrierDAGMutation::invertSchedBarrierMask(SchedGroupMask Mask) const {
+ // Invert mask and erase bits for types of instructions that are implied to be
+ // allowed past the SCHED_BARRIER.
+ SchedGroupMask InvertedMask = ~Mask;
+
+ // ALU implies VALU, SALU, MFMA.
+ if ((InvertedMask & SchedGroupMask::ALU) == SchedGroupMask::NONE)
+ InvertedMask &=
+ ~SchedGroupMask::VALU & ~SchedGroupMask::SALU & ~SchedGroupMask::MFMA;
+ // VALU, SALU, MFMA implies ALU.
+ else if ((InvertedMask & SchedGroupMask::VALU) == SchedGroupMask::NONE ||
+ (InvertedMask & SchedGroupMask::SALU) == SchedGroupMask::NONE ||
+ (InvertedMask & SchedGroupMask::MFMA) == SchedGroupMask::NONE)
+ InvertedMask &= ~SchedGroupMask::ALU;
+
+ // VMEM implies VMEM_READ, VMEM_WRITE.
+ if ((InvertedMask & SchedGroupMask::VMEM) == SchedGroupMask::NONE)
+ InvertedMask &= ~SchedGroupMask::VMEM_READ & ~SchedGroupMask::VMEM_WRITE;
+ // VMEM_READ, VMEM_WRITE implies VMEM.
+ else if ((InvertedMask & SchedGroupMask::VMEM_READ) == SchedGroupMask::NONE ||
+ (InvertedMask & SchedGroupMask::VMEM_WRITE) == SchedGroupMask::NONE)
+ InvertedMask &= ~SchedGroupMask::VMEM;
+
+ // DS implies DS_READ, DS_WRITE.
+ if ((InvertedMask & SchedGroupMask::DS) == SchedGroupMask::NONE)
+ InvertedMask &= ~SchedGroupMask::DS_READ & ~SchedGroupMask::DS_WRITE;
+ // DS_READ, DS_WRITE implies DS.
+ else if ((InvertedMask & SchedGroupMask::DS_READ) == SchedGroupMask::NONE ||
+ (InvertedMask & SchedGroupMask::DS_WRITE) == SchedGroupMask::NONE)
+ InvertedMask &= ~SchedGroupMask::DS;
+
+ return InvertedMask;
}
-void SchedBarrierDAGMutation::initSchedGroup(SchedGroup *SG) {
- assert(SG);
- for (auto &SU : DAG->SUnits)
- if (SG->canAddSU(SU, TII))
- SG->add(SU);
+void SchedBarrierDAGMutation::initSchedGroupBarrier(
+ std::vector<SUnit>::reverse_iterator RIter) {
+ // Remove all existing edges from the SCHED_GROUP_BARRIER that were added due
+ // to the instruction having side effects.
+ resetEdges(*RIter, DAG);
+ MachineInstr &SGB = *RIter->getInstr();
+ assert(SGB.getOpcode() == AMDGPU::SCHED_GROUP_BARRIER);
+ int32_t SGMask = SGB.getOperand(0).getImm();
+ int32_t Size = SGB.getOperand(1).getImm();
+ int32_t SyncID = SGB.getOperand(2).getImm();
+ // Create a new SchedGroup and add it to a list that is mapped to the SyncID.
+ // SchedGroups only enforce ordering between SchedGroups with the same SyncID.
+ auto &SG = SyncedSchedGroupsMap[SyncID].emplace_back((SchedGroupMask)SGMask,
+ Size, SyncID, DAG, TII);
+
+ // SyncedInstrsMap is used here to avoid adding the same SUs to multiple
+ // SchedGroups that have the same SyncID. This only matters for
+ // SCHED_GROUP_BARRIER and not SCHED_BARRIER.
+ SG.initSchedGroup(RIter, SyncedInstrsMap[SG.getSyncID()]);
}
-void SchedBarrierDAGMutation::resetSchedBarrierEdges(SUnit &SU) {
- assert(SU.getInstr()->getOpcode() == AMDGPU::SCHED_BARRIER);
- for (auto &P : SU.Preds)
- SU.removePred(P);
-
- for (auto &S : SU.Succs) {
- for (auto &SP : S.getSUnit()->Preds) {
- if (SP.getSUnit() == &SU) {
- S.getSUnit()->removePred(SP);
- }
- }
- }
+void SchedBarrierDAGMutation::addSchedGroupBarrierEdges() {
+ // Since we traversed the DAG in reverse order when initializing
+ // SCHED_GROUP_BARRIERs, we need to reverse the order in the vector to
+ // maintain user intentions and program order.
+ for (auto &SchedGroups : SyncedSchedGroupsMap)
+ makeReversePipeline(SchedGroups.second);
}
} // namespace
return;
}
+ if (MI->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER) {
+ if (isVerbose()) {
+ std::string HexString;
+ raw_string_ostream HexStream(HexString);
+ HexStream << format_hex(MI->getOperand(0).getImm(), 10, true);
+ OutStreamer->emitRawComment(
+ " sched_group_barrier mask(" + HexString + ") size(" +
+ Twine(MI->getOperand(1).getImm()) + ") SyncID(" +
+ Twine(MI->getOperand(2).getImm()) + ")");
+ }
+ return;
+ }
+
if (MI->getOpcode() == AMDGPU::SI_MASKED_UNREACHABLE) {
if (isVerbose())
OutStreamer->emitRawComment(" divergent unreachable");
let isMeta = 1;
}
+def SCHED_GROUP_BARRIER : SPseudoInstSI<
+ (outs),
+ (ins i32imm:$mask, i32imm:$size, i32imm:$syncid),
+ [(int_amdgcn_sched_group_barrier (i32 timm:$mask), (i32 timm:$size), (i32 timm:$syncid))]> {
+ let SchedRW = [];
+ let hasNoSchedulingInfo = 1;
+ let hasSideEffects = 1;
+ let mayLoad = 0;
+ let mayStore = 0;
+ let isConvergent = 1;
+ let FixedSize = 1;
+ let Size = 0;
+}
+
// SI pseudo instructions. These are used by the CFG structurizer pass
// and should be lowered to ISA instructions prior to codegen.
case Intrinsic::amdgcn_s_barrier:
case Intrinsic::amdgcn_wave_barrier:
case Intrinsic::amdgcn_sched_barrier:
+ case Intrinsic::amdgcn_sched_group_barrier:
return false;
default:
break;
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -misched-cluster=0 < %s | FileCheck -check-prefix=GCN %s
+
+define amdgpu_kernel void @test_sched_group_barrier() #0 {
+; GCN-LABEL: test_sched_group_barrier:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: ; sched_group_barrier mask(0x00000000) size(1) SyncID(2)
+; GCN-NEXT: ; sched_group_barrier mask(0x00000001) size(2) SyncID(4)
+; GCN-NEXT: ; sched_group_barrier mask(0x00000004) size(8) SyncID(16)
+; GCN-NEXT: ; sched_group_barrier mask(0x0000000F) size(10000) SyncID(-1)
+; GCN-NEXT: s_endpgm
+entry:
+ call void @llvm.amdgcn.sched.group.barrier(i32 0, i32 1, i32 2) #1
+ call void @llvm.amdgcn.sched.group.barrier(i32 1, i32 2, i32 4) #1
+ call void @llvm.amdgcn.sched.group.barrier(i32 4, i32 8, i32 16) #1
+ call void @llvm.amdgcn.sched.group.barrier(i32 15, i32 10000, i32 -1) #1
+ ret void
+}
+
+define amdgpu_kernel void @test_sched_group_barrier_simple_pipeline(<32 x i32> addrspace(1)* noalias %in, <32 x i32> addrspace(1)* noalias %out) {
+; GCN-LABEL: test_sched_group_barrier_simple_pipeline:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 7, v0
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: global_load_dwordx4 v[0:3], v32, s[2:3]
+; GCN-NEXT: global_load_dwordx4 v[4:7], v32, s[2:3] offset:16
+; GCN-NEXT: global_load_dwordx4 v[8:11], v32, s[2:3] offset:32
+; GCN-NEXT: global_load_dwordx4 v[12:15], v32, s[2:3] offset:48
+; GCN-NEXT: global_load_dwordx4 v[16:19], v32, s[2:3] offset:64
+; GCN-NEXT: global_load_dwordx4 v[20:23], v32, s[2:3] offset:80
+; GCN-NEXT: global_load_dwordx4 v[24:27], v32, s[2:3] offset:96
+; GCN-NEXT: global_load_dwordx4 v[28:31], v32, s[2:3] offset:112
+; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(8) SyncID(0)
+; GCN-NEXT: s_waitcnt vmcnt(7)
+; GCN-NEXT: v_mul_lo_u32 v3, v3, v3
+; GCN-NEXT: v_mul_lo_u32 v2, v2, v2
+; GCN-NEXT: v_mul_lo_u32 v1, v1, v1
+; GCN-NEXT: v_mul_lo_u32 v0, v0, v0
+; GCN-NEXT: s_waitcnt vmcnt(6)
+; GCN-NEXT: v_mul_lo_u32 v7, v7, v7
+; GCN-NEXT: v_mul_lo_u32 v6, v6, v6
+; GCN-NEXT: v_mul_lo_u32 v5, v5, v5
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_lo_u32 v31, v31, v31
+; GCN-NEXT: v_mul_lo_u32 v30, v30, v30
+; GCN-NEXT: v_mul_lo_u32 v29, v29, v29
+; GCN-NEXT: v_mul_lo_u32 v28, v28, v28
+; GCN-NEXT: v_mul_lo_u32 v4, v4, v4
+; GCN-NEXT: v_mul_lo_u32 v11, v11, v11
+; GCN-NEXT: v_mul_lo_u32 v10, v10, v10
+; GCN-NEXT: v_mul_lo_u32 v9, v9, v9
+; GCN-NEXT: v_mul_lo_u32 v8, v8, v8
+; GCN-NEXT: v_mul_lo_u32 v15, v15, v15
+; GCN-NEXT: v_mul_lo_u32 v14, v14, v14
+; GCN-NEXT: v_mul_lo_u32 v13, v13, v13
+; GCN-NEXT: v_mul_lo_u32 v12, v12, v12
+; GCN-NEXT: v_mul_lo_u32 v19, v19, v19
+; GCN-NEXT: v_mul_lo_u32 v18, v18, v18
+; GCN-NEXT: v_mul_lo_u32 v17, v17, v17
+; GCN-NEXT: v_mul_lo_u32 v16, v16, v16
+; GCN-NEXT: v_mul_lo_u32 v23, v23, v23
+; GCN-NEXT: v_mul_lo_u32 v22, v22, v22
+; GCN-NEXT: v_mul_lo_u32 v21, v21, v21
+; GCN-NEXT: v_mul_lo_u32 v20, v20, v20
+; GCN-NEXT: v_mul_lo_u32 v27, v27, v27
+; GCN-NEXT: v_mul_lo_u32 v26, v26, v26
+; GCN-NEXT: v_mul_lo_u32 v25, v25, v25
+; GCN-NEXT: v_mul_lo_u32 v24, v24, v24
+; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(30) SyncID(0)
+; GCN-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
+; GCN-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
+; GCN-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; GCN-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
+; GCN-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
+; GCN-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
+; GCN-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
+; GCN-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
+; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(8) SyncID(0)
+; GCN-NEXT: s_endpgm
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #2
+ %gep1 = getelementptr <32 x i32>, <32 x i32> addrspace(1)* %in, i32 %tid
+ %load = load <32 x i32>, <32 x i32> addrspace(1)* %gep1
+ %mul = mul <32 x i32> %load, %load
+ %gep2 = getelementptr <32 x i32>, <32 x i32> addrspace(1)* %out, i32 %tid
+ store <32 x i32> %mul, <32 x i32> addrspace(1)* %gep2
+ ; VMEM read
+ call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 8, i32 0)
+ ; VALU
+ call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 30, i32 0)
+ ; VMEM write
+ call void @llvm.amdgcn.sched.group.barrier(i32 64, i32 8, i32 0)
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #2
+declare void @llvm.amdgcn.sched.group.barrier(i32, i32, i32) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { convergent nounwind }
+attributes #2 = { nounwind readnone speculatable }
--- /dev/null
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=gfx908 -misched-cluster=false -run-pass=machine-scheduler -verify-misched -o - %s | FileCheck %s
+
+--- |
+ define amdgpu_kernel void @no_sched_group_barrier(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void }
+ define amdgpu_kernel void @sched_group_barrier_1_VMEM_READ_1_VALU_5_MFMA_1_VMEM_READ_3_VALU_2_VMEM_WRITE(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void }
+ define amdgpu_kernel void @sched_group_barrier_2_VMEM_1000_ALU_5_MFMA_2_VMEM_WRITE(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void }
+ define amdgpu_kernel void @sched_group_barrier_MFMA_VALU_and_SALU_alternating(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void }
+
+ !0 = distinct !{!0}
+ !1 = !{!1, !0}
+...
+
+---
+name: no_sched_group_barrier
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: no_sched_group_barrier
+ ; CHECK: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ ; CHECK-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec
+ ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF2]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec
+ ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_1:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_MUL_LO_U32_e64_2:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec
+ ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_2:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_1]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_3:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_2]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_MUL_LO_U32_e64_3:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec
+ ; CHECK-NEXT: S_NOP 0
+ ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_4:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_3]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_3]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_MUL_LO_U32_e64_1]], implicit [[V_MUL_LO_U32_e64_2]], implicit [[V_MFMA_F32_4X4X1F32_e64_4]]
+ %0:sreg_64 = IMPLICIT_DEF
+ %1:vgpr_32 = IMPLICIT_DEF
+ %2:areg_128 = IMPLICIT_DEF
+ %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ %4:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %3, implicit $exec
+ GLOBAL_STORE_DWORD_SADDR %1, %4, %0, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ %5:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %1, implicit $exec
+ %6:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %1, implicit $exec
+ S_NOP 0
+ %7:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %2, 0, 0, 0, implicit $mode, implicit $exec
+ %8:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %7, 0, 0, 0, implicit $mode, implicit $exec
+ %9:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %8, 0, 0, 0, implicit $mode, implicit $exec
+ %10:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %9, 0, 0, 0, implicit $mode, implicit $exec
+ %11:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %10, 0, 0, 0, implicit $mode, implicit $exec
+ %12:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ %13:vgpr_32 = nsw V_MUL_LO_U32_e64 %12, %12, implicit $exec
+ GLOBAL_STORE_DWORD_SADDR %1, %13, %0, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ S_ENDPGM 0, implicit %5, implicit %6, implicit %11
+...
+
+---
+name: sched_group_barrier_1_VMEM_READ_1_VALU_5_MFMA_1_VMEM_READ_3_VALU_2_VMEM_WRITE
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: sched_group_barrier_1_VMEM_READ_1_VALU_5_MFMA_1_VMEM_READ_3_VALU_2_VMEM_WRITE
+ ; CHECK: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ ; CHECK-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF
+ ; CHECK-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+ ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec
+ ; CHECK-NEXT: SCHED_GROUP_BARRIER 2, 1, 0
+ ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF2]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_1:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_2:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_1]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_3:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_2]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_4:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_3]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: SCHED_GROUP_BARRIER 8, 5, 0
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ ; CHECK-NEXT: S_NOP 0
+ ; CHECK-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+ ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec
+ ; CHECK-NEXT: [[V_MUL_LO_U32_e64_2:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec
+ ; CHECK-NEXT: [[V_MUL_LO_U32_e64_3:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec
+ ; CHECK-NEXT: SCHED_GROUP_BARRIER 2, 3, 0
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_2]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ ; CHECK-NEXT: SCHED_GROUP_BARRIER 64, 2, 0
+ ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_MUL_LO_U32_e64_1]], implicit [[V_MUL_LO_U32_e64_3]], implicit [[V_MFMA_F32_4X4X1F32_e64_4]]
+ %0:sreg_64 = IMPLICIT_DEF
+ %1:vgpr_32 = IMPLICIT_DEF
+ %2:areg_128 = IMPLICIT_DEF
+ %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ %4:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %3, implicit $exec
+ GLOBAL_STORE_DWORD_SADDR %1, %4, %0, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ %5:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %1, implicit $exec
+ %6:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %1, implicit $exec
+ S_NOP 0
+ %7:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %2, 0, 0, 0, implicit $mode, implicit $exec
+ %8:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %7, 0, 0, 0, implicit $mode, implicit $exec
+ %9:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %8, 0, 0, 0, implicit $mode, implicit $exec
+ %10:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %9, 0, 0, 0, implicit $mode, implicit $exec
+ %11:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %10, 0, 0, 0, implicit $mode, implicit $exec
+ %12:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ %13:vgpr_32 = nsw V_MUL_LO_U32_e64 %12, %12, implicit $exec
+ GLOBAL_STORE_DWORD_SADDR %1, %13, %0, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ ; 1 VMEM_READ
+ SCHED_GROUP_BARRIER 32, 1, 0
+ ; 1 VALU
+ SCHED_GROUP_BARRIER 2, 1, 0
+ ; 5 MFMA
+ SCHED_GROUP_BARRIER 8, 5, 0
+ ; 1 VMEM_READ
+ SCHED_GROUP_BARRIER 32, 1, 0
+ ; 3 VALU
+ SCHED_GROUP_BARRIER 2, 3, 0
+ ; 2 VMEM_WRITE
+ SCHED_GROUP_BARRIER 64, 2, 0
+ S_ENDPGM 0, implicit %5, implicit %6, implicit %11
+...
+
+---
+name: sched_group_barrier_2_VMEM_1000_ALU_5_MFMA_2_VMEM_WRITE
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: sched_group_barrier_2_VMEM_1000_ALU_5_MFMA_2_VMEM_WRITE
+ ; CHECK: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ ; CHECK-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF
+ ; CHECK-NEXT: SCHED_GROUP_BARRIER 16, 2, 0
+ ; CHECK-NEXT: S_NOP 0
+ ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec
+ ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec
+ ; CHECK-NEXT: [[V_MUL_LO_U32_e64_2:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec
+ ; CHECK-NEXT: [[V_MUL_LO_U32_e64_3:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec
+ ; CHECK-NEXT: SCHED_GROUP_BARRIER 1, 1000, 0
+ ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF2]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_1:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_2:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_1]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_3:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_2]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_4:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_3]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: SCHED_GROUP_BARRIER 8, 5, 0
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_3]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ ; CHECK-NEXT: SCHED_GROUP_BARRIER 64, 2, 0
+ ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_MUL_LO_U32_e64_1]], implicit [[V_MUL_LO_U32_e64_2]], implicit [[V_MFMA_F32_4X4X1F32_e64_4]]
+ %0:sreg_64 = IMPLICIT_DEF
+ %1:vgpr_32 = IMPLICIT_DEF
+ %2:areg_128 = IMPLICIT_DEF
+ %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ %4:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %3, implicit $exec
+ GLOBAL_STORE_DWORD_SADDR %1, %4, %0, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ %5:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %1, implicit $exec
+ %6:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %1, implicit $exec
+ S_NOP 0
+ %7:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %2, 0, 0, 0, implicit $mode, implicit $exec
+ %8:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %7, 0, 0, 0, implicit $mode, implicit $exec
+ %9:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %8, 0, 0, 0, implicit $mode, implicit $exec
+ %10:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %9, 0, 0, 0, implicit $mode, implicit $exec
+ %11:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %10, 0, 0, 0, implicit $mode, implicit $exec
+ %12:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ %13:vgpr_32 = nsw V_MUL_LO_U32_e64 %12, %12, implicit $exec
+ GLOBAL_STORE_DWORD_SADDR %1, %13, %0, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ ; 2 VMEM
+ SCHED_GROUP_BARRIER 16, 2, 0
+ ; 1000 ALU
+ SCHED_GROUP_BARRIER 1, 1000, 0
+ ; 5 MFMA
+ SCHED_GROUP_BARRIER 8, 5, 0
+ ; 2 VMEM_WRITE
+ SCHED_GROUP_BARRIER 64, 2, 0
+ S_ENDPGM 0, implicit %5, implicit %6, implicit %11
+...
+
+---
+name: sched_group_barrier_MFMA_VALU_and_SALU_alternating
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: sched_group_barrier_MFMA_VALU_and_SALU_alternating
+ ; CHECK: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ ; CHECK-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF
+ ; CHECK-NEXT: SCHED_GROUP_BARRIER 16, 2, 0
+ ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF2]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+ ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec
+ ; CHECK-NEXT: SCHED_GROUP_BARRIER 6, 1, 0
+ ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_1:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+ ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec
+ ; CHECK-NEXT: SCHED_GROUP_BARRIER 6, 1, 0
+ ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_2:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_1]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+ ; CHECK-NEXT: [[V_MUL_LO_U32_e64_2:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec
+ ; CHECK-NEXT: SCHED_GROUP_BARRIER 6, 1, 0
+ ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_3:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_2]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+ ; CHECK-NEXT: S_NOP 0
+ ; CHECK-NEXT: SCHED_GROUP_BARRIER 6, 1, 0
+ ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_4:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_3]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+ ; CHECK-NEXT: [[V_MUL_LO_U32_e64_3:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec
+ ; CHECK-NEXT: SCHED_GROUP_BARRIER 6, 1, 0
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_3]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ ; CHECK-NEXT: SCHED_GROUP_BARRIER 64, 2, 0
+ ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_MUL_LO_U32_e64_1]], implicit [[V_MUL_LO_U32_e64_2]], implicit [[V_MFMA_F32_4X4X1F32_e64_4]]
+ %0:sreg_64 = IMPLICIT_DEF
+ %1:vgpr_32 = IMPLICIT_DEF
+ %2:areg_128 = IMPLICIT_DEF
+ %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ %4:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %3, implicit $exec
+ GLOBAL_STORE_DWORD_SADDR %1, %4, %0, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ %5:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %1, implicit $exec
+ %6:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %1, implicit $exec
+ S_NOP 0
+ %7:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %2, 0, 0, 0, implicit $mode, implicit $exec
+ %8:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %7, 0, 0, 0, implicit $mode, implicit $exec
+ %9:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %8, 0, 0, 0, implicit $mode, implicit $exec
+ %10:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %9, 0, 0, 0, implicit $mode, implicit $exec
+ %11:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %10, 0, 0, 0, implicit $mode, implicit $exec
+ %12:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+ %13:vgpr_32 = nsw V_MUL_LO_U32_e64 %12, %12, implicit $exec
+ GLOBAL_STORE_DWORD_SADDR %1, %13, %0, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ ; 2 VMEM
+ SCHED_GROUP_BARRIER 16, 2, 0
+ ; 1 MFMA
+ SCHED_GROUP_BARRIER 8, 1, 0
+ ; 1 VALU+SALU
+ SCHED_GROUP_BARRIER 6, 1, 0
+ ; 1 MFMA
+ SCHED_GROUP_BARRIER 8, 1, 0
+ ; 1 VALU+SALU
+ SCHED_GROUP_BARRIER 6, 1, 0
+ ; 1 MFMA
+ SCHED_GROUP_BARRIER 8, 1, 0
+ ; 1 VALU+SALU
+ SCHED_GROUP_BARRIER 6, 1, 0
+ ; 1 MFMA
+ SCHED_GROUP_BARRIER 8, 1, 0
+ ; 1 VALU+SALU
+ SCHED_GROUP_BARRIER 6, 1, 0
+ ; 1 MFMA
+ SCHED_GROUP_BARRIER 8, 1, 0
+ ; 1 VALU+SALU
+ SCHED_GROUP_BARRIER 6, 1, 0
+ ; 2 VMEM_WRITE
+ SCHED_GROUP_BARRIER 64, 2, 0
+ S_ENDPGM 0, implicit %5, implicit %6, implicit %11
+...