From: Jeffrey Byrnes
Date: Thu, 27 Apr 2023 21:43:32 +0000 (-0700)
Subject: [AMDGPU][IGLP] Parameterize the SchedGroup processing / linking order in Solver
X-Git-Tag: upstream/17.0.6~6759
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=1721e72d6e6d0c18ac36155b1f89fd81f45994db;p=platform%2Fupstream%2Fllvm.git

[AMDGPU][IGLP] Parameterize the SchedGroup processing / linking order in Solver

Currently the PipelineSolver processes SchedGroups in a bottom-up manner.
However, there is no compelling reason to require this. Providing the option
to toggle the processing direction affords greater experimentation capability
and makes usage a bit more intuitive. Importantly, it makes designing rules
much easier.
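The mechanical core of the change can be sketched as follows (a standalone
toy with invented names such as visit/CandSGIDs, not code from this patch):
the solver's per-SU walks become templated over the iterator type, and a
single IsBottomUp flag picks reverse or forward iterators at each call site.

    #include <iostream>
    #include <vector>

    // Visit candidates in whatever order the iterator pair encodes.
    template <typename T> void visit(T I, T E) {
      for (; I != E; ++I)
        std::cout << *I << " ";
      std::cout << "\n";
    }

    int main() {
      std::vector<int> CandSGIDs = {0, 1, 2, 3};
      bool IsBottomUp = true; // mirrors PipelineSolver::IsBottomUp
      IsBottomUp ? visit(CandSGIDs.rbegin(), CandSGIDs.rend()) // 3 2 1 0
                 : visit(CandSGIDs.begin(), CandSGIDs.end());  // 0 1 2 3
    }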
Differential Revision: https://reviews.llvm.org/D149393

Change-Id: Ic4abd3408f9faa105c0eef72eab7873d46083ee4
---

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index fc0df61..adbde8e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -254,6 +254,9 @@ class PipelineSolver {
   // How many branches we have explored
   uint64_t BranchesExplored = 0;
 
+  // The direction in which we process the candidate SchedGroups per SU
+  bool IsBottomUp = 1;
+
   // Update indices to fit next conflicting instruction
   void advancePosition();
   // Recede indices to attempt to find better fit for previous conflicting
@@ -264,19 +267,35 @@ class PipelineSolver {
   bool solveExact();
   // The polynomial time algorithm which attempts to find a good fit
   bool solveGreedy();
+  // Find the best SchedGroup for the current SU using the heuristic given all
+  // current information. One step in the greedy algorithm. Templated against
+  // the SchedGroup iterator (either reverse or forward).
+  template <typename T>
+  void greedyFind(std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I,
+                  T E);
   // Whether or not the current solution is optimal
   bool checkOptimal();
   // Populate the ready list, prioritizing fewest missed edges first
-  void populateReadyList(SUToCandSGsPair &CurrSU,
-                         SmallVectorImpl<std::pair<int, int>> &ReadyList,
-                         SmallVectorImpl<SchedGroup> &SyncPipeline);
+  // Templated against the SchedGroup iterator (either reverse or forward).
+  template <typename T>
+  void populateReadyList(SmallVectorImpl<std::pair<int, int>> &ReadyList, T I,
+                         T E);
   // Add edges corresponding to the SchedGroups as assigned by solver
   void makePipeline();
+  // Link the SchedGroups in the best found pipeline.
+  // Templated against the SchedGroup iterator (either reverse or forward).
+  template <typename T> void linkSchedGroups(T I, T E);
   // Add the edges from the SU to the other SchedGroups in pipeline, and
   // return the number of edges missed.
   int addEdges(SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID,
                std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);
-  // Remove the edges passed via AddedEdges
+  // Link the pipeline as if \p SU was in the SchedGroup with ID \p SGID. It
+  // returns the cost (in terms of missed pipeline edges), and tracks the edges
+  // added in \p AddedEdges
+  template <typename T>
+  int linkSUnit(SUnit *SU, int SGID,
+                std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I, T E);
+  // Remove the edges passed via \p AddedEdges
   void removeEdges(const std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);
   // Convert the passed in maps to arrays for bidirectional iterators
   void convertSyncMapsToArrays();
@@ -290,9 +309,9 @@ public:
 
   PipelineSolver(DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
                  DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
-                 ScheduleDAGMI *DAG)
+                 ScheduleDAGMI *DAG, bool IsBottomUp = 1)
       : DAG(DAG), SyncedInstrs(SyncedInstrs),
-        SyncedSchedGroups(SyncedSchedGroups) {
+        SyncedSchedGroups(SyncedSchedGroups), IsBottomUp(IsBottomUp) {
 
     for (auto &PipelineInstrs : SyncedInstrs) {
       if (PipelineInstrs.second.size() > 0) {
@@ -363,14 +382,27 @@ void PipelineSolver::convertSyncMapsToArrays() {
   }
 }
 
+template <typename T> void PipelineSolver::linkSchedGroups(T I, T E) {
+  for (; I != E; ++I) {
+    auto &GroupA = *I;
+    for (auto J = std::next(I); J != E; ++J) {
+      auto &GroupB = *J;
+      GroupA.link(GroupB);
+    }
+  }
+}
+
 void PipelineSolver::makePipeline() {
   // Preserve the order of barrier for subsequent SchedGroupBarrier mutations
   for (auto &SyncPipeline : BestPipeline) {
     for (auto &SG : SyncPipeline) {
+      LLVM_DEBUG(dbgs() << "Printing SchedGroups\nSchedGroup with SGID "
+                        << SG.getSGID() << " has: \n");
       SUnit *SGBarr = nullptr;
       for (auto &SU : SG.Collection) {
         if (SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
           SGBarr = SU;
+        LLVM_DEBUG(dbgs() << "SU(" << SU->NodeNum << ")\n");
       }
       // Command line requested IGroupLP doesn't have SGBarr
       if (!SGBarr)
@@ -381,43 +413,47 @@ void PipelineSolver::makePipeline() {
   }
 
   for (auto &SyncPipeline : BestPipeline) {
-    auto I = SyncPipeline.rbegin();
-    auto E = SyncPipeline.rend();
-    for (; I != E; ++I) {
-      auto &GroupA = *I;
-      for (auto J = std::next(I); J != E; ++J) {
-        auto &GroupB = *J;
-        GroupA.link(GroupB);
-      }
-    }
+    IsBottomUp ? linkSchedGroups(SyncPipeline.rbegin(), SyncPipeline.rend())
+               : linkSchedGroups(SyncPipeline.begin(), SyncPipeline.end());
   }
 }
 
-int PipelineSolver::addEdges(
-    SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID,
-    std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) {
-  int AddedCost = 0;
+template <typename T>
+int PipelineSolver::linkSUnit(
+    SUnit *SU, int SGID, std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges,
+    T I, T E) {
   bool MakePred = false;
-
-  // The groups in the pipeline are in reverse order. Thus,
-  // by traversing them from last to first, we are traversing
-  // them in the order as they were introduced in the code. After we
-  // pass the group the SU is being assigned to, it should be
-  // linked as a predecessor of the subsequent SchedGroups
-  auto GroupNo = (int)SyncPipeline.size() - 1;
-  for (; GroupNo >= 0; GroupNo--) {
-    if (SyncPipeline[GroupNo].getSGID() == SGID) {
+  int AddedCost = 0;
+  for (; I < E; ++I) {
+    if (I->getSGID() == SGID) {
       MakePred = true;
       continue;
     }
-    auto Group = &SyncPipeline[GroupNo];
-    AddedCost += Group->link(*SU, MakePred, AddedEdges);
+    auto Group = *I;
+    AddedCost += Group.link(*SU, MakePred, AddedEdges);
     assert(AddedCost >= 0);
   }
-
   return AddedCost;
 }
 
+int PipelineSolver::addEdges(
+    SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID,
+    std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) {
+
+  // For IsBottomUp, the first SchedGroup in SyncPipeline contains the
+  // instructions that are the ultimate successors in the resultant mutation.
+  // Therefore, in such a configuration, the SchedGroups occurring before the
+  // candidate SGID are successors of the candidate SchedGroup, thus the
+  // current SU should be linked as a predecessor to SUs in those SchedGroups.
+  // The opposite is true if !IsBottomUp. IsBottomUp occurs in the case of
+  // multiple SCHED_GROUP_BARRIERS, or if a user specifies IGLP_OPT SchedGroups
+  // using IsBottomUp (in reverse).
+  return IsBottomUp ? linkSUnit(SU, SGID, AddedEdges, SyncPipeline.rbegin(),
+                                SyncPipeline.rend())
+                    : linkSUnit(SU, SGID, AddedEdges, SyncPipeline.begin(),
+                                SyncPipeline.end());
+}
+
 void PipelineSolver::removeEdges(
     const std::vector<std::pair<SUnit *, SUnit *>> &EdgesToRemove) {
   // Only remove the edges that we have added when testing
@@ -490,12 +526,13 @@ bool PipelineSolver::checkOptimal() {
   return (DoneExploring || BestCost == 0);
 }
 
+template <typename T>
 void PipelineSolver::populateReadyList(
-    SUToCandSGsPair &CurrSU, SmallVectorImpl<std::pair<int, int>> &ReadyList,
-    SmallVectorImpl<SchedGroup> &SyncPipeline) {
+    SmallVectorImpl<std::pair<int, int>> &ReadyList, T I, T E) {
+  SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
+  auto SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
   assert(CurrSU.second.size() >= 1);
-  auto I = CurrSU.second.rbegin();
-  auto E = CurrSU.second.rend();
+
   for (; I != E; ++I) {
     std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
     int CandSGID = *I;
@@ -545,7 +582,10 @@ bool PipelineSolver::solveExact() {
   // SchedGroup -> Cost pairs
   SmallVector<std::pair<int, int>, 4> ReadyList;
   // Prioritize the candidate sched groups in terms of lowest cost first
-  populateReadyList(CurrSU, ReadyList, CurrPipeline[CurrSyncGroupIdx]);
+  IsBottomUp ? populateReadyList(ReadyList, CurrSU.second.rbegin(),
+                                 CurrSU.second.rend())
+             : populateReadyList(ReadyList, CurrSU.second.begin(),
+                                 CurrSU.second.end());
 
   auto I = ReadyList.begin();
   auto E = ReadyList.end();
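To make the direction handling in addEdges/linkSUnit above concrete, here is
a small standalone toy (the Group type and linkSUnitSketch name are invented,
not code from this patch) that mimics linkSUnit's walk over the pipeline:

    #include <iostream>
    #include <string>
    #include <vector>

    struct Group {
      int ID;
      std::string Name;
    };

    // Groups seen before the SU's own group get ordered before the SU;
    // once the SU's group is passed, MakePred flips and the SU is ordered
    // before the remaining groups.
    template <typename T>
    void linkSUnitSketch(const std::string &SU, int SGID, T I, T E) {
      bool MakePred = false;
      for (; I != E; ++I) {
        if (I->ID == SGID) {
          MakePred = true;
          continue;
        }
        std::cout << (MakePred ? SU + " before " + I->Name
                               : I->Name + " before " + SU)
                  << "\n";
      }
    }

    int main() {
      std::vector<Group> Pipe = {{0, "SG0"}, {1, "SG1"}, {2, "SG2"}};
      // Bottom-up (reverse walk): prints "SG2 before SU", "SU before SG0".
      linkSUnitSketch("SU", 1, Pipe.rbegin(), Pipe.rend());
      // Top-down (forward walk): prints "SG0 before SU", "SU before SG2".
      linkSUnitSketch("SU", 1, Pipe.begin(), Pipe.end());
    }

With the reverse walk, the last-created group ends up earliest and the first
SchedGroup holds the ultimate successors, matching the comment above; the
forward walk yields the opposite orientation.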
@@ -620,64 +660,71 @@ bool PipelineSolver::solveExact() {
   return FinishedExploring;
 }
 
-bool PipelineSolver::solveGreedy() {
-  BestCost = 0;
-  std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
+template <typename T>
+void PipelineSolver::greedyFind(
+    std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I, T E) {
+  SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
+  int BestNodeCost = -1;
+  int TempCost;
+  SchedGroup *BestGroup = nullptr;
+  int BestGroupID = -1;
+  auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
+  LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum
+                    << ") in Pipeline # " << CurrSyncGroupIdx << "\n");
 
-  while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size()) {
-    SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
-    int BestNodeCost = -1;
-    int TempCost;
-    SchedGroup *BestGroup = nullptr;
-    int BestGroupID = -1;
-    auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
-    LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum
-                      << ") in Pipeline # " << CurrSyncGroupIdx << "\n");
-
-    // Since we have added the potential SchedGroups from bottom up, but
-    // traversed the DAG from top down, parse over the groups from last to
-    // first. If we fail to do this for the greedy algorithm, the solution will
-    // likely not be good in more complex cases.
-    auto I = CurrSU.second.rbegin();
-    auto E = CurrSU.second.rend();
-    for (; I != E; ++I) {
-      std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
-      int CandSGID = *I;
-      SchedGroup *Match;
-      for (auto &SG : SyncPipeline) {
-        if (SG.getSGID() == CandSGID)
-          Match = &SG;
-      }
+  // Since we have added the potential SchedGroups from bottom up, but
+  // traversed the DAG from top down, parse over the groups from last to
+  // first. If we fail to do this for the greedy algorithm, the solution will
+  // likely not be good in more complex cases.
+  for (; I != E; ++I) {
+    std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
+    int CandSGID = *I;
+    SchedGroup *Match;
+    for (auto &SG : SyncPipeline) {
+      if (SG.getSGID() == CandSGID)
+        Match = &SG;
+    }
 
-      LLVM_DEBUG(dbgs() << "Trying SGID # " << CandSGID << " with Mask "
-                        << (int)Match->getMask() << "\n");
+    LLVM_DEBUG(dbgs() << "Trying SGID # " << CandSGID << " with Mask "
+                      << (int)Match->getMask() << "\n");
 
-      if (Match->isFull()) {
-        LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " is full\n");
-        continue;
-      }
-      TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
-      LLVM_DEBUG(dbgs() << "Cost of Group " << TempCost << "\n");
-      if (TempCost < BestNodeCost || BestNodeCost == -1) {
-        BestGroup = Match;
-        BestNodeCost = TempCost;
-        BestGroupID = CandSGID;
-      }
-      removeEdges(AddedEdges);
-      if (BestNodeCost == 0)
-        break;
+    if (Match->isFull()) {
+      LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " is full\n");
+      continue;
     }
+    TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
+    LLVM_DEBUG(dbgs() << "Cost of Group " << TempCost << "\n");
+    if (TempCost < BestNodeCost || BestNodeCost == -1) {
+      BestGroup = Match;
+      BestNodeCost = TempCost;
+      BestGroupID = CandSGID;
+    }
+    removeEdges(AddedEdges);
+    if (BestNodeCost == 0)
+      break;
+  }
 
-    if (BestGroupID != -1) {
-      BestGroup->add(*CurrSU.first);
-      addEdges(SyncPipeline, CurrSU.first, BestGroupID, AddedEdges);
-      LLVM_DEBUG(dbgs() << "Best Group has ID: " << BestGroupID << " and Mask"
-                        << (int)BestGroup->getMask() << "\n");
-      BestCost += TempCost;
-    } else
-      BestCost += MissPenalty;
+  if (BestGroupID != -1) {
+    BestGroup->add(*CurrSU.first);
+    addEdges(SyncPipeline, CurrSU.first, BestGroupID, AddedEdges);
+    LLVM_DEBUG(dbgs() << "Best Group has ID: " << BestGroupID << " and Mask"
+                      << (int)BestGroup->getMask() << "\n");
+    BestCost += TempCost;
+  } else
+    BestCost += MissPenalty;
 
-    CurrPipeline[CurrSyncGroupIdx] = SyncPipeline;
+  CurrPipeline[CurrSyncGroupIdx] = SyncPipeline;
+}
+
+bool PipelineSolver::solveGreedy() {
+  BestCost = 0;
+  std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
+
+  while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size()) {
+    SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
+    IsBottomUp
+        ? greedyFind(AddedEdges, CurrSU.second.rbegin(), CurrSU.second.rend())
+        : greedyFind(AddedEdges, CurrSU.second.begin(), CurrSU.second.end());
    advancePosition();
   }
   BestPipeline = CurrPipeline;
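The greedy step factored out above reduces to: probe each candidate group
with trial edges, roll the trial back, and commit only the cheapest fit,
stopping early on a perfect fit. A standalone sketch (the Candidate type and
its precomputed costs are assumptions standing in for addEdges/removeEdges
against the real DAG):

    #include <vector>

    struct Candidate {
      int SGID;
      int MissedEdges; // pretend cost of placing the SU in this group
    };

    // One greedy step, shaped like greedyFind.
    int pickBestGroup(const std::vector<Candidate> &Cands) {
      int BestNodeCost = -1;
      int BestGroupID = -1;
      for (const Candidate &C : Cands) {
        int TempCost = C.MissedEdges; // addEdges(): trial link, returns cost
        // removeEdges(): trial edges are always undone before the next probe
        if (TempCost < BestNodeCost || BestNodeCost == -1) {
          BestNodeCost = TempCost;
          BestGroupID = C.SGID;
        }
        if (BestNodeCost == 0) // perfect fit; stop searching
          break;
      }
      return BestGroupID; // -1 only if Cands is empty (the MissPenalty path)
    }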
@@ -721,9 +768,11 @@ void PipelineSolver::solve() {
   }
 
   makePipeline();
+  LLVM_DEBUG(dbgs() << "After applying mutation\n");
+  LLVM_DEBUG(DAG->dump());
 }
 
-enum IGLPStrategyID : int { MFMASmallGemmOptID = 0 };
+enum IGLPStrategyID : int { MFMASmallGemmOptID = 0, DemoOptID = 1 };
 
 // Implement a IGLP scheduling strategy.
 class IGLPStrategy {
@@ -741,6 +790,8 @@ public:
   // Returns true if this strategy should be applied to a ScheduleDAG.
   virtual bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) = 0;
 
+  bool IsBottomUp = 1;
+
   IGLPStrategy(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
       : DAG(DAG), TII(TII) {}
 
@@ -748,6 +799,7 @@ public:
 };
 
 class MFMASmallGemmOpt final : public IGLPStrategy {
+private:
 public:
   void applyIGLPStrategy(
       DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
@@ -756,7 +808,9 @@ public:
   bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) override { return true; }
 
   MFMASmallGemmOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
-      : IGLPStrategy(DAG, TII) {}
+      : IGLPStrategy(DAG, TII) {
+    IsBottomUp = 1;
+  }
 };
 
 void MFMASmallGemmOpt::applyIGLPStrategy(
@@ -781,12 +835,51 @@ void MFMASmallGemmOpt::applyIGLPStrategy(
   }
 }
 
+class DemoOpt final : public IGLPStrategy {
+private:
+public:
+  void applyIGLPStrategy(
+      DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
+      DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) override;
+
+  bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) override { return true; }
+
+  DemoOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
+      : IGLPStrategy(DAG, TII) {
+    IsBottomUp = 0;
+  }
+};
+
+void DemoOpt::applyIGLPStrategy(
+    DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
+    DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) {
+  // Count the number of MFMA instructions.
+  unsigned MFMACount = 0;
+  for (const MachineInstr &I : *DAG)
+    if (TII->isMFMAorWMMA(I))
+      ++MFMACount;
+
+  const unsigned PipelineSyncID = 0;
+  SchedGroup *SG = nullptr;
+  for (unsigned I = 0; I < MFMACount * 3; ++I) {
+    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
+    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+
+    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+        SchedGroupMask::DS, 2, PipelineSyncID, DAG, TII);
+    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+  }
+}
+
 static std::unique_ptr<IGLPStrategy>
 createIGLPStrategy(IGLPStrategyID ID, ScheduleDAGInstrs *DAG,
                    const SIInstrInfo *TII) {
   switch (ID) {
   case MFMASmallGemmOptID:
     return std::make_unique<MFMASmallGemmOpt>(DAG, TII);
+  case DemoOptID:
+    return std::make_unique<DemoOpt>(DAG, TII);
   }
 
   llvm_unreachable("Unknown IGLPStrategyID");
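For comparison with DemoOpt above, a hypothetical out-of-tree-style strategy
opting into top-down processing would differ only in its constructor. A sketch
reusing this file's types (MyTopDownOpt is an invented name; a real strategy
would also need an IGLPStrategyID entry and a createIGLPStrategy case, as the
DemoOpt hunks show):

    class MyTopDownOpt final : public IGLPStrategy {
    public:
      void applyIGLPStrategy(
          DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
          DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) override {
        // Emit SchedGroups here in the order they should appear top-down.
      }

      bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) override { return true; }

      MyTopDownOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
          : IGLPStrategy(DAG, TII) {
        IsBottomUp = 0; // the only change needed to flip the solver's direction
      }
    };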
@@ -829,6 +922,13 @@ private:
 public:
   void apply(ScheduleDAGInstrs *DAGInstrs) override;
 
+  // The order in which the PipelineSolver should process the candidate
+  // SchedGroup for a PipelineInstr. BOTTOM_UP will try to add SUs to the last
+  // created SchedGroup first, and will consider that as the ultimate
+  // predecessor group when linking. TOP_DOWN instead links and processes the
+  // first created SchedGroup first.
+  bool IsBottomUp = 1;
+
   IGroupLPDAGMutation() = default;
 };
 
@@ -908,6 +1008,7 @@ int SchedGroup::link(SUnit &SU, bool MakePred,
     if (DAG->IsReachable(B, A))
       continue;
+
     // tryAddEdge returns false if there is a dependency that makes adding
     // the A->B edge impossible, otherwise it returns true;
     bool Added = tryAddEdge(A, B);
@@ -1034,7 +1135,7 @@ void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
   }
 
   if (foundSB || foundIGLP) {
-    PipelineSolver PS(SyncedSchedGroups, SyncedInstrs, DAG);
+    PipelineSolver PS(SyncedSchedGroups, SyncedInstrs, DAG, IsBottomUp);
     // PipelineSolver performs the mutation by adding the edges it
     // determined as the best
     PS.solve();
@@ -1114,8 +1215,10 @@ void IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) {
   IGLPStrategyID StrategyID =
       (IGLPStrategyID)SU.getInstr()->getOperand(0).getImm();
   auto S = createIGLPStrategy(StrategyID, DAG, TII);
-  if (S->shouldApplyStrategy(DAG))
+  if (S->shouldApplyStrategy(DAG)) {
+    IsBottomUp = S->IsBottomUp;
     S->applyIGLPStrategy(SyncedInstrs, SyncedSchedGroups);
+  }
 }
 
 } // namespace
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
index 1100822..ff1a0c5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
@@ -147,6 +147,144 @@ entry:
   ret void
 }
 
+
+define amdgpu_kernel void @test_iglp_opt_rev_mfma_gemm(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 {
+; GCN-LABEL: test_iglp_opt_rev_mfma_gemm:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
+; GCN-NEXT:    v_mov_b32_e32 v3, 2.0
+; GCN-NEXT:    ; iglp_opt mask(0x00000001)
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_add_u32_e32 v1, s0, v0
+; GCN-NEXT:    v_add_u32_e32 v2, 0x6000, v1
+; GCN-NEXT:    ds_read_b128 a[28:31], v2 offset:57456
+; GCN-NEXT:    ds_read_b128 a[24:27], v2 offset:57440
+; GCN-NEXT:    ds_read_b128 a[20:23], v2 offset:57424
+; GCN-NEXT:    ds_read_b128 a[16:19], v2 offset:57408
+; GCN-NEXT:    ds_read_b128 a[0:3], v2 offset:57344
+; GCN-NEXT:    ds_read_b128 a[4:7], v2 offset:57360
+; GCN-NEXT:    ds_read_b128 a[8:11], v2 offset:57376
+; GCN-NEXT:    ds_read_b128 a[12:15], v2 offset:57392
+; GCN-NEXT:    v_mov_b32_e32 v2, 1.0
+; GCN-NEXT:    ds_read_b128 a[60:63], v1 offset:49264
+; GCN-NEXT:    ds_read_b128 a[56:59], v1 offset:49248
+; GCN-NEXT:    ds_read_b128 a[52:55], v1 offset:49232
+; GCN-NEXT:    ds_read_b128 a[48:51], v1 offset:49216
+; GCN-NEXT:    ds_read_b128 a[44:47], v1 offset:49200
+; GCN-NEXT:    ds_read_b128 a[40:43], v1 offset:49184
+; GCN-NEXT:    ds_read_b128 a[36:39], v1 offset:49168
+; GCN-NEXT:    ds_read_b128 a[32:35], v1 offset:49152
+; GCN-NEXT:    s_waitcnt lgkmcnt(8)
+; GCN-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
+; GCN-NEXT:    ds_read_b128 a[156:159], v1 offset:112
+; GCN-NEXT:    ds_read_b128 a[152:155], v1 offset:96
+; GCN-NEXT:    ds_read_b128 a[68:71], v1 offset:24592
+; GCN-NEXT:    ds_read_b128 a[64:67], v1 offset:24576
+; GCN-NEXT:    v_add_u32_e32 v0, s1, v0
+; GCN-NEXT:    s_waitcnt lgkmcnt(4)
+; GCN-NEXT:    v_mfma_f32_32x32x1f32 a[32:63], v2, v3, a[32:63]
+; GCN-NEXT:    ds_read_b128 a[148:151], v1 offset:80
+; GCN-NEXT:    ds_read_b128 a[144:147], v1 offset:64
+; GCN-NEXT:    ds_read_b128 a[128:131], v1
+; GCN-NEXT:    ds_read_b128 a[132:135], v1 offset:16
+; GCN-NEXT:    ds_read_b128 a[136:139], v1 offset:32
+; GCN-NEXT:    ds_read_b128 a[140:143], v1 offset:48
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mfma_f32_32x32x1f32 a[128:159], v2, v3, a[128:159]
+; GCN-NEXT:    ds_read_b128 a[124:127], v1 offset:8304
+; GCN-NEXT:    ds_read_b128 a[120:123], v1 offset:8288
+; GCN-NEXT:    ds_read_b128 a[116:119], v1 offset:8272
+; GCN-NEXT:    ds_read_b128 a[112:115], v1 offset:8256
+; GCN-NEXT:    ds_read_b128 a[108:111], v1 offset:8240
+; GCN-NEXT:    ds_read_b128 a[104:107], v1 offset:8224
+; GCN-NEXT:    ds_read_b128 a[100:103], v1 offset:8208
+; GCN-NEXT:    ds_read_b128 a[96:99], v1 offset:8192
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mfma_f32_32x32x1f32 a[96:127], v2, v3, a[96:127]
+; GCN-NEXT:    ds_read_b128 a[92:95], v1 offset:24688
+; GCN-NEXT:    ds_read_b128 a[88:91], v1 offset:24672
+; GCN-NEXT:    ds_read_b128 a[84:87], v1 offset:24656
+; GCN-NEXT:    ds_read_b128 a[80:83], v1 offset:24640
+; GCN-NEXT:    ds_read_b128 a[76:79], v1 offset:24624
+; GCN-NEXT:    ds_read_b128 a[72:75], v1 offset:24608
+; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    ds_write_b128 v0, a[156:159] offset:112
+; GCN-NEXT:    ds_write_b128 v0, a[152:155] offset:96
+; GCN-NEXT:    ds_write_b128 v0, a[148:151] offset:80
+; GCN-NEXT:    ds_write_b128 v0, a[144:147] offset:64
+; GCN-NEXT:    ds_write_b128 v0, a[140:143] offset:48
+; GCN-NEXT:    ds_write_b128 v0, a[136:139] offset:32
+; GCN-NEXT:    ds_write_b128 v0, a[132:135] offset:16
+; GCN-NEXT:    ds_write_b128 v0, a[128:131]
+; GCN-NEXT:    v_mov_b32_e32 v0, s1
+; GCN-NEXT:    s_waitcnt lgkmcnt(8)
+; GCN-NEXT:    v_mfma_f32_32x32x1f32 a[64:95], v2, v3, a[64:95]
+; GCN-NEXT:    ds_write_b128 v0, a[56:59] offset:24672
+; GCN-NEXT:    ds_write_b128 v0, a[60:63] offset:24688
+; GCN-NEXT:    ds_write_b128 v0, a[48:51] offset:24640
+; GCN-NEXT:    ds_write_b128 v0, a[120:123] offset:8288
+; GCN-NEXT:    ds_write_b128 v0, a[124:127] offset:8304
+; GCN-NEXT:    ds_write_b128 v0, a[112:115] offset:8256
+; GCN-NEXT:    ds_write_b128 v0, a[116:119] offset:8272
+; GCN-NEXT:    ds_write_b128 v0, a[104:107] offset:8224
+; GCN-NEXT:    ds_write_b128 v0, a[108:111] offset:8240
+; GCN-NEXT:    ds_write_b128 v0, a[96:99] offset:8192
+; GCN-NEXT:    ds_write_b128 v0, a[100:103] offset:8208
+; GCN-NEXT:    ds_write_b128 v0, a[52:55] offset:24656
+; GCN-NEXT:    ds_write_b128 v0, a[40:43] offset:24608
+; GCN-NEXT:    ds_write_b128 v0, a[44:47] offset:24624
+; GCN-NEXT:    ds_write_b128 v0, a[32:35] offset:24576
+; GCN-NEXT:    ds_write_b128 v0, a[36:39] offset:24592
+; GCN-NEXT:    ds_write_b128 v0, a[24:27] offset:32864
+; GCN-NEXT:    ds_write_b128 v0, a[28:31] offset:32880
+; GCN-NEXT:    ds_write_b128 v0, a[16:19] offset:32832
+; GCN-NEXT:    ds_write_b128 v0, a[88:91] offset:16480
+; GCN-NEXT:    ds_write_b128 v0, a[92:95] offset:16496
+; GCN-NEXT:    ds_write_b128 v0, a[80:83] offset:16448
+; GCN-NEXT:    ds_write_b128 v0, a[84:87] offset:16464
+; GCN-NEXT:    ds_write_b128 v0, a[72:75] offset:16416
+; GCN-NEXT:    ds_write_b128 v0, a[76:79] offset:16432
+; GCN-NEXT:    ds_write_b128 v0, a[64:67] offset:16384
+; GCN-NEXT:    ds_write_b128 v0, a[68:71] offset:16400
+; GCN-NEXT:    ds_write_b128 v0, a[20:23] offset:32848
+; GCN-NEXT:    ds_write_b128 v0, a[8:11] offset:32800
+; GCN-NEXT:    ds_write_b128 v0, a[12:15] offset:32816
+; GCN-NEXT:    ds_write_b128 v0, a[0:3] offset:32768
+; GCN-NEXT:    ds_write_b128 v0, a[4:7] offset:32784
+; GCN-NEXT:    s_endpgm
+entry:
+  call void @llvm.amdgcn.iglp.opt(i32 1)
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %load.0.addr = getelementptr <32 x float>, ptr addrspace(3) %in, i32 %idx
+  %load.0 = load <32 x float>, ptr addrspace(3) %load.0.addr
+  %load.1.addr = getelementptr <32 x float>, ptr addrspace(3) %load.0.addr, i32 64
+  %load.1 = load <32 x float>, ptr addrspace(3) %load.1.addr
+  %load.2.addr = getelementptr <32 x float>, ptr addrspace(3) %load.1.addr, i32 128
+  %load.2 = load <32 x float>, ptr addrspace(3) %load.2.addr
+  %load.3.addr = getelementptr <32 x float>, ptr addrspace(3) %load.2.addr, i32 192
+  %load.3 = load <32 x float>, ptr addrspace(3) %load.3.addr
+  %load.4.addr = getelementptr <32 x float>, ptr addrspace(3) %load.3.addr, i32 256
+  %load.4 = load <32 x float>, ptr addrspace(3) %load.4.addr
+  %mai.0 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.0, i32 0, i32 0, i32 0)
+  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.1, i32 0, i32 0, i32 0)
+  %mai.2 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.2, i32 0, i32 0, i32 0)
+  %mai.3 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.3, i32 0, i32 0, i32 0)
+  %mai.4 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.4, i32 0, i32 0, i32 0)
+  %store.0.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 %idx
+  store <32 x float> %mai.0, ptr addrspace(3) %store.0.addr
+  %store.1.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 64
+  store <32 x float> %mai.1, ptr addrspace(3) %store.1.addr
+  %store.2.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 128
+  store <32 x float> %mai.2, ptr addrspace(3) %store.2.addr
+  %store.3.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 192
+  store <32 x float> %mai.3, ptr addrspace(3) %store.3.addr
+  %store.4.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 256
+  store <32 x float> %mai.4, ptr addrspace(3) %store.4.addr
+  ret void
+}
+
+
 declare void @llvm.amdgcn.iglp.opt(i32) #1
 declare i32 @llvm.amdgcn.workitem.id.x() #1
 declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32) #1