"Experimentally, results are mixed, so this should be set on a "
"case-by-case basis."));
-static cl::opt<bool> EnableLowerBound(
- "amdgpu-igrouplp-exact-solver-lower-bound", cl::Hidden,
- cl::desc("Whether to use a lower bound when calculating the cost "
- "for a partial fit using the exact solver. The lower bound "
- "calculates the cost of assigning the remaining instructions "
- "under idealized conditions. The LB reduces the overall search "
- "space but adds time complexity per branch explored."),
- cl::init(false));
-
// Components of the mask that determines which instruction types may be
// classified into a SchedGroup.
enum class SchedGroupMask {
const SIInstrInfo *TII;
- // Try to add and edge from SU A to SU B. This returns false if there is a
- // dependency which makes adding the A->B edge impossible, otherwise it
- // returns true. The result is that it will return true even if no edge was
- // added. For example, if there is already an edge between A->B, this will
- // return true, even though DAG->addEdge does not add edge.
+ // Try to add an edge from SU A to SU B.
bool tryAddEdge(SUnit *A, SUnit *B);
// Use SGMask to determine whether we can classify MI as a member of this
// Add DAG dependencies and track which edges are added, and the count of
// missed edges
int link(SUnit &SU, bool MakePred,
- SmallVectorImpl<std::pair<SUnit *, SUnit *>> &AddedEdges);
+ std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);
// Add DAG dependencies from all SUnits in this SchedGroup and this SU.
// Use the predicate to determine whether SU should be a predecessor (P =
int BestCost = -1;
int CurrCost = 0;
- // A lower bound on the optimal cost for a complete pipeline
- int StaticLowerBound = 0;
-
// Index pointing to the conflicting instruction that is currently being
// fitted
int CurrConflInstNo = 0;
void populateReadyList(SUToCandSGsPair &CurrSU,
SmallVectorImpl<std::pair<int, int>> &ReadyList,
SmallVectorImpl<SchedGroup> &SyncPipeline);
- // Calculate best cost assignment of an unassigned SU without assigning it.
- // The sum of these costs across SUs represents a Lower Bound on the true best
- // cost for the set of unassigned SUs.
- int calculateLowerBound();
// Add edges corresponding to the SchedGroups as assigned by solver
void makePipeline();
// Add the edges from the SU to the other SchedGroups in pipeline, and
// return the number of edges missed.
int addEdges(SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID,
- SmallVectorImpl<std::pair<SUnit *, SUnit *>> &AddedEdges,
- int BestCost = -1);
+ std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);
// Remove the edges passed via AddedEdges
- void removeEdges(SmallVectorImpl<std::pair<SUnit *, SUnit *>> &AddedEdges);
+ void removeEdges(const std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);
// Convert the passed in maps to arrays for bidirectional iterators
void convertSyncMapsToArrays();
int PipelineSolver::addEdges(
SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID,
- SmallVectorImpl<std::pair<SUnit *, SUnit *>> &AddedEdges, int BestCost) {
+ std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) {
int AddedCost = 0;
bool MakePred = false;
// linked as a predecessor of the subsequent SchedGroups
auto GroupNo = (int)SyncPipeline.size() - 1;
for (; GroupNo >= 0; GroupNo--) {
- if (BestCost != -1 && AddedCost >= BestCost)
- return AddedCost;
if (SyncPipeline[GroupNo].getSGID() == SGID) {
MakePred = true;
continue;
}
void PipelineSolver::removeEdges(
- SmallVectorImpl<std::pair<SUnit *, SUnit *>> &EdgesToRemove) {
+ const std::vector<std::pair<SUnit *, SUnit *>> &EdgesToRemove) {
// Only remove the edges that we have added when testing
// the fit.
for (auto &PredSuccPair : EdgesToRemove) {
SUnit *Pred = PredSuccPair.first;
SUnit *Succ = PredSuccPair.second;
- auto Match =
- std::find_if(Succ->Preds.begin(), Succ->Preds.end(), [&Pred](SDep &P) {
- return P.getSUnit() == Pred && P.isArtificial();
- });
-
+ auto Match = llvm::find_if(
+ Succ->Preds, [&Pred](SDep &P) { return P.getSUnit() == Pred; });
if (Match != Succ->Preds.end()) {
assert(Match->isArtificial());
Succ->removePred(*Match);
if (BestCost == -1 || CurrCost < BestCost) {
BestPipeline = CurrPipeline;
BestCost = CurrCost;
- LLVM_DEBUG(dbgs() << "Found Fit with cost " << BestCost << '\n');
+ LLVM_DEBUG(dbgs() << "Found Fit with cost " << BestCost << "\n");
}
assert(BestCost >= 0);
}
if (MaxBranchesExplored > 0 && BranchesExplored >= MaxBranchesExplored)
DoneExploring = true;
- return (DoneExploring || BestCost == StaticLowerBound);
+ return (DoneExploring || BestCost == 0);
}
void PipelineSolver::populateReadyList(
assert(CurrSU.second.size() >= 1);
auto I = CurrSU.second.rbegin();
auto E = CurrSU.second.rend();
- SmallVector<std::pair<SUnit *, SUnit *>, 16> AddedEdges;
for (; I != E; ++I) {
-
+ std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
int CandSGID = *I;
SchedGroup *Match;
for (auto &SG : SyncPipeline) {
ReadyList.push_back(std::pair(*I, MissPenalty));
continue;
}
- AddedEdges.clear();
int TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
ReadyList.push_back(std::pair(*I, TempCost));
assert(ReadyList.size() == CurrSU.second.size());
}
-int PipelineSolver::calculateLowerBound() {
- if (CurrSyncGroupIdx >= (int)CurrPipeline.size())
- return 0;
- int TempConflInstNo = CurrConflInstNo;
- int TmpSyncGroupIdx = CurrSyncGroupIdx;
- int MinimumCost = 0;
- SmallVector<std::pair<SUnit *, SUnit *>, 16> AddedEdges;
-
- for (; TmpSyncGroupIdx < (int)CurrPipeline.size(); TmpSyncGroupIdx++) {
- auto SyncPipeline = CurrPipeline[TmpSyncGroupIdx];
- for (; TempConflInstNo < (int)PipelineInstrs[TmpSyncGroupIdx].size();
- TempConflInstNo++) {
- auto CurrSU = PipelineInstrs[TmpSyncGroupIdx][TempConflInstNo];
- auto I = CurrSU.second.rbegin();
- auto E = CurrSU.second.rend();
- int MinCostForSU = -1;
- for (; I != E; I++) {
- int CandSGID = *I;
- SchedGroup *Match;
- for (auto &SG : SyncPipeline) {
- if (SG.getSGID() == CandSGID)
- Match = &SG;
- }
-
- if (Match->isFull()) {
- if (MinCostForSU == -1 || MissPenalty < MinCostForSU)
- MinCostForSU = MissPenalty;
- continue;
- }
- AddedEdges.clear();
- int TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID,
- AddedEdges, MinCostForSU);
- if (MinCostForSU == -1 || TempCost < MinCostForSU)
- MinCostForSU = TempCost;
-
- removeEdges(AddedEdges);
- if (MinCostForSU == 0)
- break;
- }
- MinimumCost += MinCostForSU;
- }
- TempConflInstNo = 0;
- }
- return MinimumCost;
-}
-
bool PipelineSolver::solveExact() {
if (checkOptimal())
return true;
PipelineInstrs[CurrSyncGroupIdx].size());
SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum
- << ") in Pipeline # " << CurrSyncGroupIdx << '\n');
+ << ") in Pipeline # " << CurrSyncGroupIdx << "\n");
// SchedGroup -> Cost pairs
SmallVector<std::pair<int, int>, 4> ReadyList;
// Prioritize the candidate sched groups in terms of lowest cost first
populateReadyList(CurrSU, ReadyList, CurrPipeline[CurrSyncGroupIdx]);
- SmallVector<std::pair<SUnit *, SUnit *>, 16> AddedEdges;
auto I = ReadyList.begin();
auto E = ReadyList.end();
int CandSGID = I->first;
int AddedCost = 0;
+ std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
SchedGroup *Match;
for (auto &SG : SyncPipeline) {
LLVM_DEBUG(dbgs() << "Assigning to SchedGroup with Mask "
<< (int)Match->getMask() << "and ID " << CandSGID
- << '\n');
+ << "\n");
Match->add(*CurrSU.first);
- AddedEdges.clear();
AddedCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
- LLVM_DEBUG(dbgs() << "Cost of Assignment: " << AddedCost << '\n');
+ LLVM_DEBUG(dbgs() << "Cost of Assignment: " << AddedCost << "\n");
CurrCost += AddedCost;
advancePosition();
++BranchesExplored;
bool FinishedExploring = false;
// If the Cost after adding edges is greater than a known solution,
// backtrack
- int LBCost =
- (EnableLowerBound && BestCost != -1) ? calculateLowerBound() : 0;
- if (BestCost == -1 || CurrCost + LBCost < BestCost) {
+ if (CurrCost < BestCost || BestCost == -1) {
if (solveExact()) {
- FinishedExploring = BestCost != StaticLowerBound;
+ FinishedExploring = BestCost != 0;
if (!FinishedExploring)
return true;
}
bool FinishedExploring = false;
if (CurrCost < BestCost || BestCost == -1) {
if (solveExact()) {
- bool FinishedExploring = BestCost != StaticLowerBound;
+ bool FinishedExploring = BestCost != 0;
if (!FinishedExploring)
return true;
}
bool PipelineSolver::solveGreedy() {
BestCost = 0;
- SmallVector<std::pair<SUnit *, SUnit *>, 16> AddedEdges;
+ std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size()) {
SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
int BestGroupID = -1;
auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum
- << ") in Pipeline # " << CurrSyncGroupIdx << '\n');
+ << ") in Pipeline # " << CurrSyncGroupIdx << "\n");
// Since we have added the potential SchedGroups from bottom up, but
// traversed the DAG from top down, parse over the groups from last to
auto I = CurrSU.second.rbegin();
auto E = CurrSU.second.rend();
for (; I != E; ++I) {
- SmallVector<std::pair<SUnit *, SUnit *>, 16> AddedEdges;
+ std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
int CandSGID = *I;
SchedGroup *Match;
for (auto &SG : SyncPipeline) {
}
LLVM_DEBUG(dbgs() << "Trying SGID # " << CandSGID << " with Mask "
- << (int)Match->getMask() << '\n');
+ << (int)Match->getMask() << "\n");
if (Match->isFull()) {
LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " is full\n");
continue;
}
- TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges,
- BestNodeCost);
- LLVM_DEBUG(dbgs() << "Cost of Group " << TempCost << '\n');
+ TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
+ LLVM_DEBUG(dbgs() << "Cost of Group " << TempCost << "\n");
if (TempCost < BestNodeCost || BestNodeCost == -1) {
BestGroup = Match;
BestNodeCost = TempCost;
BestGroup->add(*CurrSU.first);
addEdges(SyncPipeline, CurrSU.first, BestGroupID, AddedEdges);
LLVM_DEBUG(dbgs() << "Best Group has ID: " << BestGroupID << " and Mask"
- << (int)BestGroup->getMask() << '\n');
+ << (int)BestGroup->getMask() << "\n");
BestCost += TempCost;
} else
BestCost += MissPenalty;
LLVM_DEBUG(dbgs() << "Starting Greedy pipeline solver\n");
solveGreedy();
reset();
- LLVM_DEBUG(dbgs() << "Greedy produced best cost of " << BestCost << '\n');
- StaticLowerBound = calculateLowerBound();
- LLVM_DEBUG(dbgs() << "Lower Bound on Pipeline Cost is " << StaticLowerBound
- << '\n');
- if (BestCost > StaticLowerBound) {
+ LLVM_DEBUG(dbgs() << "Greedy produced best cost of " << BestCost << "\n");
+ if (BestCost > 0) {
LLVM_DEBUG(dbgs() << "Starting EXACT pipeline solver\n");
solveExact();
- LLVM_DEBUG(dbgs() << "Exact produced best cost of " << BestCost << '\n');
+ LLVM_DEBUG(dbgs() << "Exact produced best cost of " << BestCost << "\n");
}
} else { // Use the Greedy Algorithm by default
LLVM_DEBUG(dbgs() << "Starting GREEDY pipeline solver\n");
}
int SchedGroup::link(SUnit &SU, bool MakePred,
- SmallVectorImpl<std::pair<SUnit *, SUnit *>> &AddedEdges) {
+ std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) {
int MissedEdges = 0;
for (auto *A : Collection) {
SUnit *B = &SU;
if (MakePred)
std::swap(A, B);
+ if (DAG->IsReachable(B, A))
+ continue;
+ // tryAddEdge returns false if there is a dependency that makes adding
+ // the A->B edge impossible; otherwise it returns true.
bool Added = tryAddEdge(A, B);
if (Added)
AddedEdges.push_back(std::pair(A, B));
+++ /dev/null
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -misched-cluster=0 -amdgpu-igrouplp-exact-solver-max-branches=250000 < %s | FileCheck -check-prefix=EXACT %s
-; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -misched-cluster=0 -amdgpu-igrouplp-exact-solver=1 -amdgpu-igrouplp-exact-solver-max-branches=200000 -amdgpu-igrouplp-exact-solver-cost-heur=1 < %s | FileCheck -check-prefix=LB %s
-
-define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VALU_WRITE(<32 x i32> addrspace(1)* noalias %in, <32 x i32> addrspace(1)* noalias %out) #0 {
-; EXACT-LABEL: test_sched_group_barrier_pipeline_alternating_READ_VALU_WRITE:
-; EXACT: ; %bb.0:
-; EXACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; EXACT-NEXT: v_lshlrev_b32_e32 v16, 7, v0
-; EXACT-NEXT: ; kill: killed $sgpr0_sgpr1
-; EXACT-NEXT: s_waitcnt lgkmcnt(0)
-; EXACT-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:32
-; EXACT-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:48
-; EXACT-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
-; EXACT-NEXT: s_waitcnt vmcnt(1)
-; EXACT-NEXT: v_mul_lo_u32 v13, v13, v13
-; EXACT-NEXT: s_waitcnt vmcnt(0)
-; EXACT-NEXT: v_mul_lo_u32 v7, v7, v7
-; EXACT-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
-; EXACT-NEXT: v_mul_lo_u32 v6, v6, v6
-; EXACT-NEXT: v_mul_lo_u32 v12, v12, v12
-; EXACT-NEXT: v_mul_lo_u32 v15, v15, v15
-; EXACT-NEXT: v_mul_lo_u32 v14, v14, v14
-; EXACT-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
-; EXACT-NEXT: s_waitcnt vmcnt(0)
-; EXACT-NEXT: v_mul_lo_u32 v3, v3, v3
-; EXACT-NEXT: v_mul_lo_u32 v2, v2, v2
-; EXACT-NEXT: v_mul_lo_u32 v1, v1, v1
-; EXACT-NEXT: v_mul_lo_u32 v0, v0, v0
-; EXACT-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3]
-; EXACT-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] offset:112
-; EXACT-NEXT: s_waitcnt vmcnt(0)
-; EXACT-NEXT: v_mul_lo_u32 v3, v3, v3
-; EXACT-NEXT: v_mul_lo_u32 v2, v2, v2
-; EXACT-NEXT: v_mul_lo_u32 v1, v1, v1
-; EXACT-NEXT: v_mul_lo_u32 v0, v0, v0
-; EXACT-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] offset:112
-; EXACT-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] offset:96
-; EXACT-NEXT: s_waitcnt vmcnt(0)
-; EXACT-NEXT: v_mul_lo_u32 v3, v3, v3
-; EXACT-NEXT: v_mul_lo_u32 v2, v2, v2
-; EXACT-NEXT: v_mul_lo_u32 v1, v1, v1
-; EXACT-NEXT: v_mul_lo_u32 v0, v0, v0
-; EXACT-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] offset:96
-; EXACT-NEXT: v_mul_lo_u32 v5, v5, v5
-; EXACT-NEXT: v_mul_lo_u32 v4, v4, v4
-; EXACT-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:48
-; EXACT-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:64
-; EXACT-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
-; EXACT-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
-; EXACT-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
-; EXACT-NEXT: s_waitcnt vmcnt(0)
-; EXACT-NEXT: v_mul_lo_u32 v7, v7, v7
-; EXACT-NEXT: v_mul_lo_u32 v6, v6, v6
-; EXACT-NEXT: v_mul_lo_u32 v5, v5, v5
-; EXACT-NEXT: v_mul_lo_u32 v4, v4, v4
-; EXACT-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:64
-; EXACT-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:32
-; EXACT-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:16
-; EXACT-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
-; EXACT-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
-; EXACT-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
-; EXACT-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
-; EXACT-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
-; EXACT-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
-; EXACT-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
-; EXACT-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
-; EXACT-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
-; EXACT-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
-; EXACT-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
-; EXACT-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
-; EXACT-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
-; EXACT-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
-; EXACT-NEXT: s_waitcnt vmcnt(0)
-; EXACT-NEXT: v_mul_lo_u32 v9, v9, v9
-; EXACT-NEXT: v_mul_lo_u32 v8, v8, v8
-; EXACT-NEXT: v_mul_lo_u32 v11, v11, v11
-; EXACT-NEXT: v_mul_lo_u32 v10, v10, v10
-; EXACT-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:16
-; EXACT-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:80
-; EXACT-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
-; EXACT-NEXT: s_waitcnt vmcnt(0)
-; EXACT-NEXT: v_mul_lo_u32 v11, v11, v11
-; EXACT-NEXT: v_mul_lo_u32 v10, v10, v10
-; EXACT-NEXT: v_mul_lo_u32 v9, v9, v9
-; EXACT-NEXT: v_mul_lo_u32 v8, v8, v8
-; EXACT-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:80
-; EXACT-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
-; EXACT-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
-; EXACT-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
-; EXACT-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
-; EXACT-NEXT: s_endpgm
-;
-; LB-LABEL: test_sched_group_barrier_pipeline_alternating_READ_VALU_WRITE:
-; LB: ; %bb.0:
-; LB-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; LB-NEXT: v_lshlrev_b32_e32 v12, 7, v0
-; LB-NEXT: s_waitcnt lgkmcnt(0)
-; LB-NEXT: global_load_dwordx4 v[8:11], v12, s[0:1] offset:64
-; LB-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
-; LB-NEXT: s_waitcnt vmcnt(0)
-; LB-NEXT: v_mul_lo_u32 v11, v11, v11
-; LB-NEXT: v_mul_lo_u32 v10, v10, v10
-; LB-NEXT: v_mul_lo_u32 v9, v9, v9
-; LB-NEXT: v_mul_lo_u32 v8, v8, v8
-; LB-NEXT: global_store_dwordx4 v12, v[8:11], s[2:3] offset:64
-; LB-NEXT: global_load_dwordx4 v[0:3], v12, s[0:1]
-; LB-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
-; LB-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
-; LB-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
-; LB-NEXT: s_waitcnt vmcnt(0)
-; LB-NEXT: v_mul_lo_u32 v3, v3, v3
-; LB-NEXT: v_mul_lo_u32 v2, v2, v2
-; LB-NEXT: global_load_dwordx4 v[8:11], v12, s[0:1] offset:32
-; LB-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
-; LB-NEXT: s_waitcnt vmcnt(0)
-; LB-NEXT: v_mul_lo_u32 v9, v9, v9
-; LB-NEXT: v_mul_lo_u32 v8, v8, v8
-; LB-NEXT: v_mul_lo_u32 v11, v11, v11
-; LB-NEXT: v_mul_lo_u32 v10, v10, v10
-; LB-NEXT: global_store_dwordx4 v12, v[8:11], s[2:3] offset:32
-; LB-NEXT: global_load_dwordx4 v[4:7], v12, s[0:1] offset:112
-; LB-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
-; LB-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
-; LB-NEXT: s_waitcnt vmcnt(0)
-; LB-NEXT: v_mul_lo_u32 v7, v7, v7
-; LB-NEXT: v_mul_lo_u32 v6, v6, v6
-; LB-NEXT: v_mul_lo_u32 v1, v1, v1
-; LB-NEXT: v_mul_lo_u32 v0, v0, v0
-; LB-NEXT: global_store_dwordx4 v12, v[0:3], s[2:3]
-; LB-NEXT: global_load_dwordx4 v[0:3], v12, s[0:1] offset:96
-; LB-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
-; LB-NEXT: s_waitcnt vmcnt(0)
-; LB-NEXT: v_mul_lo_u32 v3, v3, v3
-; LB-NEXT: v_mul_lo_u32 v2, v2, v2
-; LB-NEXT: v_mul_lo_u32 v1, v1, v1
-; LB-NEXT: v_mul_lo_u32 v0, v0, v0
-; LB-NEXT: global_store_dwordx4 v12, v[0:3], s[2:3] offset:96
-; LB-NEXT: global_load_dwordx4 v[0:3], v12, s[0:1] offset:80
-; LB-NEXT: s_waitcnt vmcnt(0)
-; LB-NEXT: v_mul_lo_u32 v3, v3, v3
-; LB-NEXT: v_mul_lo_u32 v2, v2, v2
-; LB-NEXT: v_mul_lo_u32 v1, v1, v1
-; LB-NEXT: v_mul_lo_u32 v0, v0, v0
-; LB-NEXT: global_store_dwordx4 v12, v[0:3], s[2:3] offset:80
-; LB-NEXT: v_mul_lo_u32 v5, v5, v5
-; LB-NEXT: v_mul_lo_u32 v4, v4, v4
-; LB-NEXT: global_store_dwordx4 v12, v[4:7], s[2:3] offset:112
-; LB-NEXT: global_load_dwordx4 v[4:7], v12, s[0:1] offset:48
-; LB-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
-; LB-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
-; LB-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
-; LB-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
-; LB-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
-; LB-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
-; LB-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
-; LB-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
-; LB-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
-; LB-NEXT: s_waitcnt vmcnt(0)
-; LB-NEXT: v_mul_lo_u32 v5, v5, v5
-; LB-NEXT: global_load_dwordx4 v[0:3], v12, s[0:1] offset:16
-; LB-NEXT: v_mul_lo_u32 v4, v4, v4
-; LB-NEXT: s_waitcnt vmcnt(0)
-; LB-NEXT: v_mul_lo_u32 v1, v1, v1
-; LB-NEXT: v_mul_lo_u32 v0, v0, v0
-; LB-NEXT: v_mul_lo_u32 v3, v3, v3
-; LB-NEXT: v_mul_lo_u32 v2, v2, v2
-; LB-NEXT: global_store_dwordx4 v12, v[0:3], s[2:3] offset:16
-; LB-NEXT: v_mul_lo_u32 v7, v7, v7
-; LB-NEXT: v_mul_lo_u32 v6, v6, v6
-; LB-NEXT: global_store_dwordx4 v12, v[4:7], s[2:3] offset:48
-; LB-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
-; LB-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
-; LB-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
-; LB-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
-; LB-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
-; LB-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
-; LB-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
-; LB-NEXT: s_endpgm
- %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
- %gep1 = getelementptr <32 x i32>, <32 x i32> addrspace(1)* %in, i32 %tid
- %load = load <32 x i32>, <32 x i32> addrspace(1)* %gep1
- %mul = mul <32 x i32> %load, %load
- %gep2 = getelementptr <32 x i32>, <32 x i32> addrspace(1)* %out, i32 %tid
- store <32 x i32> %mul, <32 x i32> addrspace(1)* %gep2
- ; 1 VMEM read
- call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0)
- ; 2 VALU
- call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 2, i32 0)
- ; 1 VMEM write
- call void @llvm.amdgcn.sched.group.barrier(i32 64, i32 1, i32 0)
- ; 1 VMEM read
- call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0)
- ; 2 VALU
- call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 2, i32 0)
- ; 1 VMEM write
- call void @llvm.amdgcn.sched.group.barrier(i32 64, i32 1, i32 0)
- ; 1 VMEM read
- call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0)
- ; 2 VALU
- call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 2, i32 0)
- ; 1 VMEM write
- call void @llvm.amdgcn.sched.group.barrier(i32 64, i32 1, i32 0)
- ; 1 VMEM read
- call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0)
- ; 2 VALU
- call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 2, i32 0)
- ; 1 VMEM write
- call void @llvm.amdgcn.sched.group.barrier(i32 64, i32 1, i32 0)
- ; 1 VMEM read
- call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0)
- ; 2 VALU
- call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 2, i32 0)
- ; 1 VMEM write
- call void @llvm.amdgcn.sched.group.barrier(i32 64, i32 1, i32 0)
- ; 1 VMEM read
- call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0)
- ; 2 VALU
- call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 2, i32 0)
- ; 1 VMEM write
- call void @llvm.amdgcn.sched.group.barrier(i32 64, i32 1, i32 0)
- ; 1 VMEM read
- call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0)
- ; 2 VALU
- call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 2, i32 0)
- ; 1 VMEM write
- call void @llvm.amdgcn.sched.group.barrier(i32 64, i32 1, i32 0)
- ; 1 VMEM read
- call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0)
- ; 2 VALU
- call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 2, i32 0)
- ; 1 VMEM write
- call void @llvm.amdgcn.sched.group.barrier(i32 64, i32 1, i32 0)
- ret void
-}
-
-declare i32 @llvm.amdgcn.workitem.id.x() #0
-declare void @llvm.amdgcn.sched.group.barrier(i32, i32, i32) #0
-declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32) #0
-
-attributes #0 = { nounwind "amdgpu-flat-work-group-size"="1,256" readnone speculatable}