void apply(ScheduleDAGInstrs *DAGInstrs) override;
protected:
- void clusterNeighboringMemOps(ArrayRef<SUnit *> MemOps, ScheduleDAGInstrs *DAG);
+ void clusterNeighboringMemOps(ArrayRef<MemOpInfo> MemOps,
+ ScheduleDAGInstrs *DAG);
+ void collectMemOpRecords(std::vector<SUnit> &SUnits,
+ SmallVectorImpl<MemOpInfo> &MemOpRecords);
};
class StoreClusterMutation : public BaseMemOpClusterMutation {
} // end namespace llvm
+// Sort all the loads/stores first. Then, for each load/store, check the
+// following load/store one by one until reaching the first non-dependent one,
+// and call the target hook to see if they can be clustered.
void BaseMemOpClusterMutation::clusterNeighboringMemOps(
- ArrayRef<SUnit *> MemOps, ScheduleDAGInstrs *DAG) {
- SmallVector<MemOpInfo, 32> MemOpRecords;
- for (SUnit *SU : MemOps) {
- const MachineInstr &MI = *SU->getInstr();
- SmallVector<const MachineOperand *, 4> BaseOps;
- int64_t Offset;
- bool OffsetIsScalable;
- unsigned Width;
- if (TII->getMemOperandsWithOffsetWidth(MI, BaseOps, Offset,
- OffsetIsScalable, Width, TRI)) {
- MemOpRecords.push_back(MemOpInfo(SU, BaseOps, Offset, Width));
-
- LLVM_DEBUG(dbgs() << "Num BaseOps: " << BaseOps.size() << ", Offset: "
- << Offset << ", OffsetIsScalable: " << OffsetIsScalable
- << ", Width: " << Width << "\n");
- }
-#ifndef NDEBUG
- for (auto *Op : BaseOps)
- assert(Op);
-#endif
- }
- if (MemOpRecords.size() < 2)
- return;
-
- llvm::sort(MemOpRecords);
+ ArrayRef<MemOpInfo> MemOpRecords, ScheduleDAGInstrs *DAG) {
+ // Keep track of the current cluster length and bytes for each SUnit.
+ DenseMap<unsigned, std::pair<unsigned, unsigned>> SUnit2ClusterInfo;
// At this point, `MemOpRecords` array must hold at least two mem ops. Try to
// cluster mem ops collected within `MemOpRecords` array.
- unsigned ClusterLength = 1;
- unsigned CurrentClusterBytes = MemOpRecords[0].Width;
for (unsigned Idx = 0, End = MemOpRecords.size(); Idx < (End - 1); ++Idx) {
// The decision to cluster mem ops is based on target-dependent logic.
auto MemOpa = MemOpRecords[Idx];
- auto MemOpb = MemOpRecords[Idx + 1];
- ++ClusterLength;
- CurrentClusterBytes += MemOpb.Width;
- if (!TII->shouldClusterMemOps(MemOpa.BaseOps, MemOpb.BaseOps, ClusterLength,
- CurrentClusterBytes)) {
- // Current mem ops pair could not be clustered, reset cluster length, and
- // go to next pair
- ClusterLength = 1;
- CurrentClusterBytes = MemOpb.Width;
+
+ // Look for the next load/store to cluster with.
+ unsigned NextIdx = Idx + 1;
+ for (; NextIdx < End; ++NextIdx)
+ // Skip if MemOpb has already been clustered or has a dependency with
+ // MemOpa.
+ if (!SUnit2ClusterInfo.count(MemOpRecords[NextIdx].SU->NodeNum) &&
+ !DAG->IsReachable(MemOpRecords[NextIdx].SU, MemOpa.SU) &&
+ !DAG->IsReachable(MemOpa.SU, MemOpRecords[NextIdx].SU))
+ break;
+ if (NextIdx == End)
continue;
+
+ auto MemOpb = MemOpRecords[NextIdx];
+ unsigned ClusterLength = 2;
+ unsigned CurrentClusterBytes = MemOpa.Width + MemOpb.Width;
+ if (SUnit2ClusterInfo.count(MemOpa.SU->NodeNum)) {
+ ClusterLength = SUnit2ClusterInfo[MemOpa.SU->NodeNum].first + 1;
+ CurrentClusterBytes =
+ SUnit2ClusterInfo[MemOpa.SU->NodeNum].second + MemOpb.Width;
}
+ if (!TII->shouldClusterMemOps(MemOpa.BaseOps, MemOpb.BaseOps, ClusterLength,
+ CurrentClusterBytes))
+ continue;
+
SUnit *SUa = MemOpa.SU;
SUnit *SUb = MemOpb.SU;
if (SUa->NodeNum > SUb->NodeNum)
std::swap(SUa, SUb);
// FIXME: Is this check really required?
- if (!DAG->addEdge(SUb, SDep(SUa, SDep::Cluster))) {
- ClusterLength = 1;
- CurrentClusterBytes = MemOpb.Width;
+ if (!DAG->addEdge(SUb, SDep(SUa, SDep::Cluster)))
continue;
- }
LLVM_DEBUG(dbgs() << "Cluster ld/st SU(" << SUa->NodeNum << ") - SU("
<< SUb->NodeNum << ")\n");
+ SUnit2ClusterInfo[MemOpb.SU->NodeNum] = {ClusterLength,
+ CurrentClusterBytes};
+
LLVM_DEBUG(dbgs() << " Curr cluster length: " << ClusterLength
<< ", Curr cluster bytes: " << CurrentClusterBytes
<< "\n");
}
}
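// Illustrative sketch, not part of the patch: the pairing decision in the loop
// above is delegated to the target hook TII->shouldClusterMemOps, whose
// parameters mirror the call site (two base-operand lists, the prospective
// cluster length and its byte size). A target implementation might look
// roughly like the free function below; the name, the single-register-base
// restriction and the 4-op/32-byte limits are invented for the example.
static bool exampleShouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
                                       ArrayRef<const MachineOperand *> BaseOps2,
                                       unsigned ClusterSize, unsigned NumBytes) {
  // Only handle the common single-register base-address form.
  if (BaseOps1.size() != 1 || BaseOps2.size() != 1)
    return false;
  const MachineOperand &Base1 = *BaseOps1.front();
  const MachineOperand &Base2 = *BaseOps2.front();
  // Both accesses must share the same base register to be worth pairing.
  if (!Base1.isReg() || !Base2.isReg() || Base1.getReg() != Base2.getReg())
    return false;
  // Cap how large a single cluster may grow, in ops and in bytes.
  return ClusterSize <= 4 && NumBytes <= 32;
}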
-/// Callback from DAG postProcessing to create cluster edges for loads.
-void BaseMemOpClusterMutation::apply(ScheduleDAGInstrs *DAG) {
- // Map DAG NodeNum to a set of dependent MemOps in store chain.
- DenseMap<unsigned, SmallVector<SUnit *, 4>> StoreChains;
- for (SUnit &SU : DAG->SUnits) {
+void BaseMemOpClusterMutation::collectMemOpRecords(
+ std::vector<SUnit> &SUnits, SmallVectorImpl<MemOpInfo> &MemOpRecords) {
+ for (auto &SU : SUnits) {
if ((IsLoad && !SU.getInstr()->mayLoad()) ||
(!IsLoad && !SU.getInstr()->mayStore()))
continue;
- unsigned ChainPredID = DAG->SUnits.size();
- for (const SDep &Pred : SU.Preds) {
- // We only want to cluster the mem ops that have the same ctrl(non-data)
- // pred so that they didn't have ctrl dependency for each other. But for
- // store instrs, we can still cluster them if the pred is load instr.
- if ((Pred.isCtrl() &&
- (IsLoad ||
- (Pred.getSUnit() && Pred.getSUnit()->getInstr()->mayStore()))) &&
- !Pred.isArtificial()) {
- ChainPredID = Pred.getSUnit()->NodeNum;
- break;
- }
+ const MachineInstr &MI = *SU.getInstr();
+ SmallVector<const MachineOperand *, 4> BaseOps;
+ int64_t Offset;
+ bool OffsetIsScalable;
+ unsigned Width;
+ if (TII->getMemOperandsWithOffsetWidth(MI, BaseOps, Offset,
+ OffsetIsScalable, Width, TRI)) {
+ MemOpRecords.push_back(MemOpInfo(&SU, BaseOps, Offset, Width));
+
+ LLVM_DEBUG(dbgs() << "Num BaseOps: " << BaseOps.size() << ", Offset: "
+ << Offset << ", OffsetIsScalable: " << OffsetIsScalable
+ << ", Width: " << Width << "\n");
}
- // Insert the SU to corresponding store chain.
- auto &Chain = StoreChains.FindAndConstruct(ChainPredID).second;
- Chain.push_back(&SU);
+#ifndef NDEBUG
+ for (auto *Op : BaseOps)
+ assert(Op);
+#endif
}
+}
+
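// Illustrative sketch, not part of the patch: apply() below sorts these
// records with llvm::sort, and that only helps because MemOpInfo defines an
// ordering that puts operations on the same base next to each other in
// increasing offset order. A simplified stand-in for that comparison, assuming
// a single register base operand per record (the in-tree version also handles
// frame-index bases and multiple base operands), could look like:
static bool exampleMemOpInfoLess(const MemOpInfo &LHS, const MemOpInfo &RHS) {
  // Group records by base register so same-base mem ops become neighbours.
  unsigned BaseL = LHS.BaseOps.front()->getReg().id();
  unsigned BaseR = RHS.BaseOps.front()->getReg().id();
  if (BaseL != BaseR)
    return BaseL < BaseR;
  // Within one base, order by offset so adjacent accesses sit side by side.
  if (LHS.Offset != RHS.Offset)
    return LHS.Offset < RHS.Offset;
  // Fall back to the DAG node number for a deterministic total order.
  return LHS.SU->NodeNum < RHS.SU->NodeNum;
}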
+/// Callback from DAG postProcessing to create cluster edges for loads/stores.
+void BaseMemOpClusterMutation::apply(ScheduleDAGInstrs *DAG) {
+ // Collect all the clusterable loads/stores.
+ SmallVector<MemOpInfo, 32> MemOpRecords;
+ collectMemOpRecords(DAG->SUnits, MemOpRecords);
+
+ if (MemOpRecords.size() < 2)
+ return;
+
+ // Sort the loads/stores so that clustering can stop as early as
+ // possible.
+ llvm::sort(MemOpRecords);
- // Iterate over the store chains.
- for (auto &SCD : StoreChains)
- clusterNeighboringMemOps(SCD.second, DAG);
+ // Try to cluster all the neighboring loads/stores.
+ clusterNeighboringMemOps(MemOpRecords, DAG);
}
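// Illustrative sketch, not part of the patch: a self-contained toy model of
// the bookkeeping only (it skips the dependency and target-hook checks).
// Three same-base accesses at offsets 0, 4 and 8 are scanned in sorted order;
// each new member extends the cluster its predecessor belongs to, which is
// exactly what SUnit2ClusterInfo records above. All names and numbers are
// invented for the example.
#include <cstdint>
#include <cstdio>
#include <map>
#include <utility>
#include <vector>

int main() {
  struct ToyMemOp { unsigned Node; int64_t Offset; unsigned Width; };
  // Already sorted by offset, as apply() guarantees via llvm::sort.
  std::vector<ToyMemOp> Ops = {{0, 0, 4}, {1, 4, 4}, {2, 8, 4}};
  // NodeNum -> {cluster length, cluster bytes}, mirroring SUnit2ClusterInfo.
  std::map<unsigned, std::pair<unsigned, unsigned>> ClusterInfo;
  for (unsigned I = 0; I + 1 < Ops.size(); ++I) {
    const ToyMemOp &A = Ops[I], &B = Ops[I + 1];
    unsigned Length = 2, Bytes = A.Width + B.Width;
    // If A already belongs to a cluster, B extends that cluster instead of
    // starting a new one of length 2.
    if (ClusterInfo.count(A.Node)) {
      Length = ClusterInfo[A.Node].first + 1;
      Bytes = ClusterInfo[A.Node].second + B.Width;
    }
    ClusterInfo[B.Node] = {Length, Bytes};
    std::printf("cluster SU(%u)-SU(%u): length %u, bytes %u\n", A.Node, B.Node,
                Length, Bytes);
  }
  // Prints:
  //   cluster SU(0)-SU(1): length 2, bytes 8
  //   cluster SU(1)-SU(2): length 3, bytes 12
  return 0;
}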
//===----------------------------------------------------------------------===//
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_short_d16 v2, v0, s[6:7] offset:4
; GFX9-NEXT: global_load_short_d16 v1, v0, s[0:1] offset:4
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7]
-; GFX9-NEXT: global_load_dword v4, v0, s[0:1]
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: global_load_dword v3, v0, s[0:1]
+; GFX9-NEXT: global_load_short_d16 v2, v0, s[6:7] offset:4
+; GFX9-NEXT: global_load_dword v4, v0, s[6:7]
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_pk_max_i16 v1, v2, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_max_i16 v3, v3, v4
+; GFX9-NEXT: v_pk_max_i16 v3, v4, v3
; GFX9-NEXT: global_store_short v0, v1, s[4:5] offset:4
; GFX9-NEXT: global_store_dword v0, v3, s[4:5]
; GFX9-NEXT: s_endpgm