From 408e300933f2c5e8aaffb2539e47b89a2112b81b Mon Sep 17 00:00:00 2001
From: Krzysztof Parzyszek <kparzysz@codeaurora.org>
Date: Fri, 15 Jul 2016 21:34:02 +0000
Subject: [PATCH] [Hexagon] Handle instruction latency for 0 or 2 cycles

The Hexagon schedulers need to handle instructions with a latency
of 0 or 2 more accurately. The problem, in v60, is that a dependence
between two instructions with a 2 cycle latency can use a .cur version
of the source to achieve a 0 cycle latency when the use is in the
same packet. Any other use must be at least 2 packets later, or a
stall occurs. In other words, the compiler does not want to schedule
the dependent instructions 1 cycle later.

To achieve this, the latency adjustment code allows only a single
dependence to have a zero latency. All other instructions have the
other value, which is typically 2 cycles. We use a heuristic to
determine which instruction gets the 0 latency.

The Hexagon machine scheduler was also changed to increase the cost
associated with 0 latency dependences that can be scheduled in the
same packet.

Patch by Brendon Cahoon.

llvm-svn: 275625
---
 llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp       |  32 +++++
 llvm/lib/Target/Hexagon/HexagonInstrInfo.h         |   4 +
 .../lib/Target/Hexagon/HexagonMachineScheduler.cpp |  22 +++
 llvm/lib/Target/Hexagon/HexagonSubtarget.cpp       | 159 +++++++++++++++++++++
 llvm/lib/Target/Hexagon/HexagonSubtarget.h         |  10 ++
 5 files changed, 227 insertions(+)

diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
index 72cffe2..fe9f97d 100644
--- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
@@ -2515,6 +2515,28 @@ bool HexagonInstrInfo::isTC4x(const MachineInstr *MI) const {
 }
 
 
+// Return true if MI2 should be scheduled as soon as possible after MI1.
+bool HexagonInstrInfo::isToBeScheduledASAP(const MachineInstr *MI1,
+      const MachineInstr *MI2) const {
+  if (MI1 == nullptr || MI2 == nullptr)
+    return false;
+  if (mayBeCurLoad(MI1)) {
+    // MI1 may become a .cur load: match any register operand of MI2 that
+    // names the register in MI1's operand 0.
+    const unsigned LoadReg = MI1->getOperand(0).getReg();
+    for (const MachineOperand &MO : MI2->operands())
+      if (MO.isReg() && MO.getReg() == LoadReg)
+        return true;
+  }
+  // Otherwise only a V6_vS32b_pi that may become a new-value store matches,
+  // and only when its operand 3 is the register in MI1's operand 0.
+  if (!mayBeNewStore(MI2) || MI2->getOpcode() != Hexagon::V6_vS32b_pi)
+    return false;
+  return MI1->getOperand(0).isReg() && MI2->getOperand(3).isReg() &&
+         MI1->getOperand(0).getReg() == MI2->getOperand(3).getReg();
+}
+
+
 bool HexagonInstrInfo::isV60VectorInstruction(const MachineInstr *MI) const {
   if (!MI)
     return false;
@@ -2839,6 +2861,16 @@ bool HexagonInstrInfo::isZeroExtendingLoad(const MachineInstr &MI) const {
 }
 
 
+// Return true if the MI1 -> MI2 dependence should be given extra latency.
+bool HexagonInstrInfo::addLatencyToSchedule(const MachineInstr *MI1,
+      const MachineInstr *MI2) const {
+  // Extra latency applies only between two V60 vector instructions, and only
+  // when isVecUsableNextPacket() rejects the pair.
+  return isV60VectorInstruction(MI1) && isV60VectorInstruction(MI2) &&
+         !isVecUsableNextPacket(MI1, MI2);
+}
+
+
 /// \brief Can these instructions execute at the same time in a bundle.
 bool HexagonInstrInfo::canExecuteInBundle(const MachineInstr *First,
       const MachineInstr *Second) const {
diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.h b/llvm/lib/Target/Hexagon/HexagonInstrInfo.h
index 74cbc62..66b6883 100644
--- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.h
+++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.h
@@ -314,6 +314,8 @@ public:
   bool isTC2(const MachineInstr *MI) const;
   bool isTC2Early(const MachineInstr *MI) const;
   bool isTC4x(const MachineInstr *MI) const;
+  bool isToBeScheduledASAP(const MachineInstr *MI1,
+                           const MachineInstr *MI2) const;
   bool isV60VectorInstruction(const MachineInstr *MI) const;
   bool isValidAutoIncImm(const EVT VT, const int Offset) const;
   bool isValidOffset(unsigned Opcode, int Offset, bool Extend = true) const;
@@ -323,6 +325,8 @@ public:
                              const MachineInstr *ConsMI) const;
   bool isZeroExtendingLoad(const MachineInstr &MI) const;
 
+  bool addLatencyToSchedule(const MachineInstr *MI1,
+                            const MachineInstr *MI2) const;
   bool canExecuteInBundle(const MachineInstr *First,
                           const MachineInstr *Second) const;
   bool hasEHLabel(const MachineBasicBlock *B) const;
diff --git a/llvm/lib/Target/Hexagon/HexagonMachineScheduler.cpp b/llvm/lib/Target/Hexagon/HexagonMachineScheduler.cpp
index 2ff1b53..d1f0013 100644
--- a/llvm/lib/Target/Hexagon/HexagonMachineScheduler.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonMachineScheduler.cpp
@@ -609,6 +609,28 @@ int ConvergingVLIWScheduler::SchedulingCost(ReadyQueue &Q, SUnit *SU,
   auto &QST = DAG->MF.getSubtarget<HexagonSubtarget>();
   auto &QII = *QST.getInstrInfo();
 
+  // Give preference to a zero latency instruction if the dependent
+  // instruction is in the current packet.
+  if (Q.getID() == TopQID) {
+    for (const SDep &PI : SU->Preds) {
+      if (!PI.getSUnit()->getInstr()->isPseudo() && PI.isAssignedRegDep() &&
+          PI.getLatency() == 0 &&
+          Top.ResourceModel->isInPacket(PI.getSUnit())) {
+        ResCount += PriorityTwo;
+        DEBUG(if (verbose) dbgs() << "Z|");
+      }
+    }
+  } else {
+    for (const SDep &SI : SU->Succs) {
+      if (!SI.getSUnit()->getInstr()->isPseudo() && SI.isAssignedRegDep() &&
+          SI.getLatency() == 0 &&
+          Bot.ResourceModel->isInPacket(SI.getSUnit())) {
+        ResCount += PriorityTwo;
+        DEBUG(if (verbose) dbgs() << "Z|");
+      }
+    }
+  }
+
   // Give less preference to an instruction that will cause a stall with
   // an instruction in the previous packet.
   if (QII.isV60VectorInstruction(Instr)) {
diff --git a/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp b/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp
index 0771cbebb..8d0571e 100644
--- a/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp
@@ -51,6 +51,16 @@ static cl::opt<bool> EnableHexagonHVX("enable-hexagon-hvx",
   cl::Hidden, cl::ZeroOrMore, cl::init(false),
   cl::desc("Enable Hexagon Vector eXtensions"));
 
+static cl::opt<bool> EnableTCLatencySched("enable-tc-latency-sched",
+  cl::Hidden, cl::ZeroOrMore, cl::init(false)); // TClass rule; off by default
+
+static cl::opt<bool> EnableDotCurSched("enable-cur-sched",
+  cl::Hidden, cl::ZeroOrMore, cl::init(true),
+  cl::desc("Enable the scheduler to generate .cur"));
+
+static cl::opt<bool> EnableVecFrwdSched("enable-evec-frwd-sched",
+  cl::Hidden, cl::ZeroOrMore, cl::init(true)); // Vec frwd rule; on by default
+
 static cl::opt<bool> DisableHexagonMISched("disable-hexagon-misched",
   cl::Hidden, cl::ZeroOrMore, cl::init(false),
   cl::desc("Disable Hexagon MI Scheduling"));
@@ -185,3 +195,152 @@ bool HexagonSubtarget::enableSubRegLiveness() const {
   return EnableSubregLiveness;
 }
 
+// Helper that can only raise the latency of Dep (by one cycle), never lower it.
+void HexagonSubtarget::updateLatency(MachineInstr *SrcInst,
+      MachineInstr *DstInst, SDep &Dep) const {
+  if (!hasV60TOps())
+    return;
+
+  const auto &HII = static_cast<const HexagonInstrInfo &>(*getInstrInfo());
+  // The rules are tried in order and stop at the first one that applies, so
+  // the latency grows by at most one cycle per call.
+  const bool AddCycle =
+      // Vec frwd scheduling.
+      (EnableVecFrwdSched && HII.addLatencyToSchedule(SrcInst, DstInst)) ||
+      // BSB scheduling.
+      (useBSBScheduling() &&
+       HII.isLateInstrFeedsEarlyInstr(SrcInst, DstInst)) ||
+      // TClass latency scheduling: check if SrcInst produces in 2C an
+      // operand of DstInst taken in stage 2B.
+      (EnableTCLatencySched &&
+       (HII.isTC1(SrcInst) || HII.isTC2(SrcInst)) &&
+       !HII.isTC1(DstInst) && !HII.isTC2(DstInst));
+  if (AddCycle)
+    Dep.setLatency(Dep.getLatency() + 1);
+}
+
+// Return true if Dst is the best successor of Src to schedule together
+// with it at zero latency. Only one dependence out of Src should have a
+// zero latency: if another successor held it and is no longer the best,
+// that edge is reset to latency 1 (plus any updateLatency adjustment).
+bool HexagonSubtarget::isBestZeroLatency(SUnit *Src, SUnit *Dst,
+      const HexagonInstrInfo *TII) const {
+  MachineInstr *SrcInst = Src->getInstr();
+  MachineInstr *DstInst = Dst->getInstr();
+  assert((TII->isToBeScheduledASAP(SrcInst, DstInst) ||
+          TII->canExecuteInBundle(SrcInst, DstInst)) &&
+         "Unable to schedule instructions together.");
+
+  if (SrcInst->isPHI() || DstInst->isPHI())
+    return false;
+
+  // Look for the best candidate to schedule together. If there are
+  // multiple choices, then the best candidate is the one with the
+  // greatest height, i.e., longest critical path.
+  SUnit *Best = Dst;
+  SUnit *PrevBest = nullptr;
+  for (const SDep &SI : Src->Succs) {
+    if (!SI.isAssignedRegDep())
+      continue;
+    if (SI.getLatency() == 0)
+      PrevBest = SI.getSUnit();
+    MachineInstr *Inst = SI.getSUnit()->getInstr();
+    if (!TII->isToBeScheduledASAP(SrcInst, Inst) ||
+        !TII->canExecuteInBundle(SrcInst, Inst))
+      continue;
+    if (SI.getSUnit()->getHeight() > Best->getHeight())
+      Best = SI.getSUnit();
+  }
+
+  // Reassign the latency for the previous best in both edge directions. The
+  // edge is Src -> PrevBest, so adjust it with PrevBest's instruction.
+  MachineInstr *PrevInst = PrevBest ? PrevBest->getInstr() : nullptr;
+  if (Best != PrevBest) {
+    for (SDep &SI : Src->Succs) {
+      if (SI.getSUnit() != PrevBest || !SI.isAssignedRegDep())
+        continue;
+      SI.setLatency(1);
+      if (PrevInst)
+        updateLatency(SrcInst, PrevInst, SI);
+      for (SDep &PI : PrevBest->Preds) {
+        if (PI.getSUnit() != Src || !PI.isAssignedRegDep())
+          continue;
+        PI.setLatency(1);
+        if (PrevInst)
+          updateLatency(SrcInst, PrevInst, PI);
+      }
+    }
+  }
+
+  return Best == Dst;
+}
+
+// If SrcInst is a PHI, Dep currently has zero latency, and Dst already has
+// a zero-latency predecessor edge, raise Dep to two cycles.
+void HexagonSubtarget::changePhiLatency(MachineInstr *SrcInst, SUnit *Dst,
+      SDep &Dep) const {
+  if (!(SrcInst->isPHI() && Dst->NumPreds != 0 && Dep.getLatency() == 0))
+    return;
+
+  // A single zero-latency edge among Dst's predecessors is sufficient.
+  for (const SDep &Pred : Dst->Preds)
+    if (Pred.getLatency() == 0) {
+      Dep.setLatency(2);
+      return;
+    }
+}
+
+/// \brief Perform target specific adjustments to the latency of a schedule
+/// dependency. Dep is the Src -> Dst edge and is modified in place.
+void HexagonSubtarget::adjustSchedDependency(SUnit *Src, SUnit *Dst,
+                                             SDep &Dep) const {
+  // Only MachineInstr-backed nodes are handled; test before getInstr().
+  if (!Src->isInstr() || !Dst->isInstr())
+    return;
+  MachineInstr *SrcInst = Src->getInstr();
+  MachineInstr *DstInst = Dst->getInstr();
+  const HexagonInstrInfo *QII = static_cast<const HexagonInstrInfo *>(getInstrInfo());
+
+  // Instructions with .new operands have zero latency.
+  if (QII->canExecuteInBundle(SrcInst, DstInst) &&
+      isBestZeroLatency(Src, Dst, QII)) {
+    Dep.setLatency(0);
+    return;
+  }
+
+  if (!hasV60TOps())
+    return;
+
+  // Don't adjust the latency of post-increment part of the instruction.
+  if (QII->isPostIncrement(SrcInst) && Dep.isAssignedRegDep()) {
+    if (SrcInst->mayStore())
+      return;
+    if (Dep.getReg() != SrcInst->getOperand(0).getReg())
+      return;
+  } else if (QII->isPostIncrement(DstInst) && Dep.getKind() == SDep::Anti) {
+    if (DstInst->mayStore())
+      return;
+    if (Dep.getReg() != DstInst->getOperand(0).getReg())
+      return;
+  } else if (QII->isPostIncrement(DstInst) && DstInst->mayStore() &&
+             Dep.isAssignedRegDep()) {
+    MachineOperand &Op = DstInst->getOperand(DstInst->getNumOperands() - 1);
+    if (Op.isReg() && Dep.getReg() != Op.getReg())
+      return;
+  }
+
+  // Check if we need to change any of the latency values when Phis are added.
+  if (useBSBScheduling() && SrcInst->isPHI()) {
+    changePhiLatency(SrcInst, Dst, Dep);
+    return;
+  }
+
+  // Try to schedule uses near definitions to generate .cur.
+  if (EnableDotCurSched && QII->isToBeScheduledASAP(SrcInst, DstInst) &&
+      isBestZeroLatency(Src, Dst, QII)) {
+    Dep.setLatency(0);
+    return;
+  }
+
+  updateLatency(SrcInst, DstInst, Dep);
+}
+
diff --git a/llvm/lib/Target/Hexagon/HexagonSubtarget.h b/llvm/lib/Target/Hexagon/HexagonSubtarget.h
index 1922df1..143f1d3 100644
--- a/llvm/lib/Target/Hexagon/HexagonSubtarget.h
+++ b/llvm/lib/Target/Hexagon/HexagonSubtarget.h
@@ -127,6 +127,16 @@ public:
   void getPostRAMutations(
       std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations)
       const override;
+
+  /// \brief Perform target specific adjustments to the latency of a schedule
+  /// dependency.
+  void adjustSchedDependency(SUnit *def, SUnit *use, SDep& dep) const override;
+
+private:
+  // Helper function responsible for increasing the latency only.
+  void updateLatency(MachineInstr *SrcInst, MachineInstr *DstInst, SDep &Dep) const;
+  bool isBestZeroLatency(SUnit *Src, SUnit *Dst, const HexagonInstrInfo *TII) const;
+  void changePhiLatency(MachineInstr *SrcInst, SUnit *Dst, SDep &Dep) const;
 };
 
 } // end namespace llvm
-- 
2.7.4