return isUnscaledLdSt(MI->getOpcode());
}
+// Is this a candidate for ld/st merging or pairing? For example, we don't
+// touch volatiles or load/stores that have a hint to avoid pair formation.
+bool AArch64InstrInfo::isCandidateToMergeOrPair(MachineInstr *MI) const {
+ // If this is a volatile load/store, don't mess with it.
+ if (MI->hasOrderedMemoryRef())
+ return false;
+
+ // Make sure this is a reg+imm (as opposed to an address reloc).
+ assert(MI->getOperand(1).isReg() && "Expected a reg operand.");
+ if (!MI->getOperand(2).isImm())
+ return false;
+
+ // Can't merge/pair if the instruction modifies the base register.
+ // e.g., ldr x0, [x0]
+ unsigned BaseReg = MI->getOperand(1).getReg();
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ if (MI->modifiesRegister(BaseReg, TRI))
+ return false;
+
+ // Check if this load/store has a hint to avoid pair formation.
+ // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
+ if (isLdStPairSuppressed(MI))
+ return false;
+
+ return true;
+}
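
To make the four filters concrete, here is a minimal standalone sketch; the MemAccess struct and its field names are invented stand-ins for the MachineInstr queries above (hasOrderedMemoryRef, the operand checks, modifiesRegister, and isLdStPairSuppressed), not LLVM API:

// Invented stand-ins for the state the real checks read off a MachineInstr.
struct MemAccess {
  bool Ordered;        // volatile/atomic -> hasOrderedMemoryRef()
  bool HasImmOffset;   // reg+imm form, not an address relocation
  unsigned BaseReg;    // base register (operand 1)
  unsigned DefReg;     // a register the instruction writes, 0 if none
  bool PairSuppressed; // hint set by AArch64StorePairSuppress
};

static bool isCandidate(const MemAccess &A) {
  if (A.Ordered)
    return false; // Don't touch volatiles.
  if (!A.HasImmOffset)
    return false; // Need a reg+imm address.
  if (A.DefReg == A.BaseReg)
    return false; // e.g., ldr x0, [x0] modifies its own base.
  if (A.PairSuppressed)
    return false; // Pairing was hinted to be unprofitable.
  return true;
}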
+
bool AArch64InstrInfo::getMemOpBaseRegImmOfs(
MachineInstr *LdSt, unsigned &BaseReg, int64_t &Offset,
const TargetRegisterInfo *TRI) const {
case AArch64::LDRQui:
case AArch64::LDRXui:
case AArch64::LDRWui:
+ case AArch64::LDRSWui:
+ // Unscaled instructions.
+ case AArch64::LDURSi:
+ case AArch64::LDURDi:
+ case AArch64::LDURQi:
+ case AArch64::LDURWi:
+ case AArch64::LDURXi:
+ case AArch64::LDURSWi:
unsigned Width;
return getMemOpBaseRegImmOfsWidth(LdSt, BaseReg, Offset, Width, TRI);
};
break;
case AArch64::LDRWui:
case AArch64::LDRSui:
+ case AArch64::LDRSWui:
case AArch64::STRWui:
case AArch64::STRSui:
Scale = Width = 4;
return true;
}
+// Scale the unscaled offsets. Returns false if the unscaled offset can't be
+// scaled.
+static bool scaleOffset(unsigned Opc, int64_t &Offset) {
+ unsigned OffsetStride = 1;
+ switch (Opc) {
+ default:
+ return false;
+ case AArch64::LDURQi:
+ OffsetStride = 16;
+ break;
+ case AArch64::LDURXi:
+ case AArch64::LDURDi:
+ OffsetStride = 8;
+ break;
+ case AArch64::LDURWi:
+ case AArch64::LDURSi:
+ case AArch64::LDURSWi:
+ OffsetStride = 4;
+ break;
+ }
+ // If the byte-offset isn't a multiple of the stride, we can't scale this
+ // offset.
+ if (Offset % OffsetStride != 0)
+ return false;
+
+ // Convert the byte offset used by the unscaled instructions into an
+ // "element" offset used by the scaled pair load/store instructions.
+ Offset /= OffsetStride;
+ return true;
+}
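
As a worked example of the scaling, assuming the 8-byte stride of the LDURXi/LDURDi case above (scaleOffsetX is a hypothetical standalone mirror of that switch arm, not the patch's function):

#include <cassert>
#include <cstdint>

// Hypothetical mirror of the 8-byte-stride case (LDURXi/LDURDi).
static bool scaleOffsetX(int64_t &Offset) {
  if (Offset % 8 != 0)
    return false; // Byte offset isn't a multiple of the element size.
  Offset /= 8;    // Byte offset -> element offset.
  return true;
}

int main() {
  int64_t Off = -16;          // as in: ldur x1, [x0, #-16]
  assert(scaleOffsetX(Off));
  assert(Off == -2);          // -16 bytes == element offset -2
  int64_t Bad = 12;           // not 8-byte aligned
  assert(!scaleOffsetX(Bad)); // can't be expressed as an element count
  return 0;
}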
+
+static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) {
+ if (FirstOpc == SecondOpc)
+ return true;
+ // We can also pair sign-extend and zero-extend instructions.
+ switch (FirstOpc) {
+ default:
+ // These instructions can't be paired based on their opcodes.
+ return false;
+ case AArch64::LDRWui:
+ case AArch64::LDURWi:
+ return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi;
+ case AArch64::LDRSWui:
+ case AArch64::LDURSWi:
+ return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi;
+ }
+}
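
The practical effect is that a zero-extending 32-bit load (LDRWui/LDURWi) and a sign-extending one (LDRSWui/LDURSWi) can still become a single pair. A standalone sketch with a stand-in Opc enum (illustrative only; the real function switches over AArch64's generated opcode enum):

// Stand-in opcode enum; the real code uses AArch64's opcodes.
enum Opc { LDRWui, LDURWi, LDRSWui, LDURSWi, LDRXui };

static bool canPair(Opc First, Opc Second) {
  if (First == Second)
    return true;
  switch (First) {
  case LDRWui:  // zero-extending 32-bit loads pair with
  case LDURWi:  // their sign-extending counterparts...
    return Second == LDRSWui || Second == LDURSWi;
  case LDRSWui: // ...and vice versa.
  case LDURSWi:
    return Second == LDRWui || Second == LDURWi;
  default:
    return false; // Any other mixed combination can't be paired.
  }
}

The ldp_half_sext_zext_int and ldp_half_zext_sext_int tests below exercise exactly these mixed combinations.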
+
/// Detect opportunities for ldp/stp formation.
///
/// Only called for LdSt for which getMemOpBaseRegImmOfs returns true.
// Only cluster up to a single pair.
if (NumLoads > 1)
return false;
- if (FirstLdSt->getOpcode() != SecondLdSt->getOpcode())
+
+ // Can we pair these instructions based on their opcodes?
+ unsigned FirstOpc = FirstLdSt->getOpcode();
+ unsigned SecondOpc = SecondLdSt->getOpcode();
+ if (!canPairLdStOpc(FirstOpc, SecondOpc))
+ return false;
+
+ // For example, we can't merge volatiles or load/stores that have a hint
+ // to avoid pair formation.
+ if (!isCandidateToMergeOrPair(FirstLdSt) ||
+ !isCandidateToMergeOrPair(SecondLdSt))
+ return false;
+
+ // isCandidateToMergeOrPair guarantees that operand 2 is an immediate.
+ int64_t Offset1 = FirstLdSt->getOperand(2).getImm();
+ if (isUnscaledLdSt(FirstOpc) && !scaleOffset(FirstOpc, Offset1))
return false;
- // getMemOpBaseRegImmOfs guarantees that oper 2 isImm.
- unsigned Ofs1 = FirstLdSt->getOperand(2).getImm();
- // Allow 6 bits of positive range.
- if (Ofs1 > 64)
+
+ int64_t Offset2 = SecondLdSt->getOperand(2).getImm();
+ if (isUnscaledLdSt(SecondOpc) && !scaleOffset(SecondOpc, Offset2))
return false;
+
+ // Pairwise instructions have a 7-bit signed offset field.
+ if (Offset1 > 63 || Offset1 < -64)
+ return false;
+
// The caller should already have ordered First/SecondLdSt by offset.
- unsigned Ofs2 = SecondLdSt->getOperand(2).getImm();
- return Ofs1 + 1 == Ofs2;
+ assert(Offset1 <= Offset2 && "Caller should have ordered offsets.");
+ return Offset1 + 1 == Offset2;
}
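
Tracing the numbers for the unscaled case: two LDURWi loads at byte offsets -8 and -4 scale (stride 4) to element offsets -2 and -1; only the lower offset is encoded in the pair instruction, -2 fits the signed 7-bit field, and the offsets are adjacent, so the loads cluster (the ldur_int test below). A hypothetical standalone recap of just the numeric checks:

#include <cassert>
#include <cstdint>

// Recap of the numeric checks. Only the lower offset is encoded in the
// pair instruction, so only Offset1 is range-checked.
static bool offsetsAllowPairing(int64_t Offset1, int64_t Offset2) {
  if (Offset1 > 63 || Offset1 < -64) // signed 7-bit immediate
    return false;
  return Offset1 + 1 == Offset2;     // elements must be adjacent
}

int main() {
  assert(offsetsAllowPairing(-2, -1));  // the ldur_int case (bytes -8/-4)
  assert(!offsetsAllowPairing(70, 71)); // outside the 7-bit range
  assert(!offsetsAllowPairing(0, 2));   // not adjacent
  return 0;
}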
bool AArch64InstrInfo::shouldScheduleAdjacent(MachineInstr *First,
/// Return true if this is an unscaled load/store.
bool isUnscaledLdSt(MachineInstr *MI) const;
+ /// Return true if this is a load/store that can potentially be paired/merged.
+ bool isCandidateToMergeOrPair(MachineInstr *MI) const;
+
/// Hint that pairing the given load or store is unprofitable.
void suppressLdStPair(MachineInstr *MI) const;
mergeUpdateInsn(MachineBasicBlock::iterator I,
MachineBasicBlock::iterator Update, bool IsPreIdx);
- // Is this a candidate for ld/st merging or pairing? For example, we don't
- // touch volatiles or load/stores that have a hint to avoid pair formation.
- bool isCandidateToMergeOrPair(MachineInstr *MI);
-
// Find and merge foldable ldr/str instructions.
bool tryToMergeLdStInst(MachineBasicBlock::iterator &MBBI);
return false;
}
-bool AArch64LoadStoreOpt::isCandidateToMergeOrPair(MachineInstr *MI) {
- // If this is a volatile load/store, don't mess with it.
- if (MI->hasOrderedMemoryRef())
- return false;
-
- // Make sure this is a reg+imm (as opposed to an address reloc).
- if (!getLdStOffsetOp(MI).isImm())
- return false;
-
- // Can't merge/pair if the instruction modifies the base register.
- // e.g., ldr x0, [x0]
- unsigned BaseReg = getLdStBaseOp(MI).getReg();
- if (MI->modifiesRegister(BaseReg, TRI))
- return false;
-
- // Check if this load/store has a hint to avoid pair formation.
- // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
- if (TII->isLdStPairSuppressed(MI))
- return false;
-
- return true;
-}
-
// Find narrow loads that can be converted into a single wider load with
// bitfield extract instructions. Also merge adjacent zero stores into a wider
// store.
MachineInstr *MI = MBBI;
MachineBasicBlock::iterator E = MI->getParent()->end();
- if (!isCandidateToMergeOrPair(MI))
+ if (!TII->isCandidateToMergeOrPair(MI))
return false;
// For promotable zero stores, the stored value should be WZR.
MachineInstr *MI = MBBI;
MachineBasicBlock::iterator E = MI->getParent()->end();
- if (!isCandidateToMergeOrPair(MI))
+ if (!TII->isCandidateToMergeOrPair(MI))
return false;
// Early exit if the offset is not possible to match. (6 bits of positive
--- /dev/null
+; REQUIRES: asserts
+; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a57 -verify-misched -debug-only=misched -o - 2>&1 > /dev/null | FileCheck %s
+
+; Test ldr clustering.
+; CHECK: ********** MI Scheduling **********
+; CHECK-LABEL: ldr_int:BB#0
+; CHECK: Cluster loads SU(1) - SU(2)
+; CHECK: SU(1): %vreg{{[0-9]+}}<def> = LDRWui
+; CHECK: SU(2): %vreg{{[0-9]+}}<def> = LDRWui
+define i32 @ldr_int(i32* %a) nounwind {
+ %p1 = getelementptr inbounds i32, i32* %a, i32 1
+ %tmp1 = load i32, i32* %p1, align 2
+ %p2 = getelementptr inbounds i32, i32* %a, i32 2
+ %tmp2 = load i32, i32* %p2, align 2
+ %tmp3 = add i32 %tmp1, %tmp2
+ ret i32 %tmp3
+}
+
+; Test ldpsw clustering.
+; CHECK: ********** MI Scheduling **********
+; CHECK-LABEL: ldp_sext_int:BB#0
+; CHECK: Cluster loads SU(1) - SU(2)
+; CHECK: SU(1): %vreg{{[0-9]+}}<def> = LDRSWui
+; CHECK: SU(2): %vreg{{[0-9]+}}<def> = LDRSWui
+define i64 @ldp_sext_int(i32* %p) nounwind {
+ %tmp = load i32, i32* %p, align 4
+ %add.ptr = getelementptr inbounds i32, i32* %p, i64 1
+ %tmp1 = load i32, i32* %add.ptr, align 4
+ %sexttmp = sext i32 %tmp to i64
+ %sexttmp1 = sext i32 %tmp1 to i64
+ %add = add nsw i64 %sexttmp1, %sexttmp
+ ret i64 %add
+}
+
+; Test ldur clustering.
+; CHECK: ********** MI Scheduling **********
+; CHECK-LABEL: ldur_int:BB#0
+; CHECK: Cluster loads SU(2) - SU(1)
+; CHECK: SU(1): %vreg{{[0-9]+}}<def> = LDURWi
+; CHECK: SU(2): %vreg{{[0-9]+}}<def> = LDURWi
+define i32 @ldur_int(i32* %a) nounwind {
+ %p1 = getelementptr inbounds i32, i32* %a, i32 -1
+ %tmp1 = load i32, i32* %p1, align 2
+ %p2 = getelementptr inbounds i32, i32* %a, i32 -2
+ %tmp2 = load i32, i32* %p2, align 2
+ %tmp3 = add i32 %tmp1, %tmp2
+ ret i32 %tmp3
+}
+
+; Test sext + zext clustering.
+; CHECK: ********** MI Scheduling **********
+; CHECK-LABEL: ldp_half_sext_zext_int:BB#0
+; CHECK: Cluster loads SU(3) - SU(4)
+; CHECK: SU(3): %vreg{{[0-9]+}}<def> = LDRSWui
+; CHECK: SU(4): %vreg{{[0-9]+}}:sub_32<def,read-undef> = LDRWui
+define i64 @ldp_half_sext_zext_int(i64* %q, i32* %p) nounwind {
+ %tmp0 = load i64, i64* %q, align 4
+ %tmp = load i32, i32* %p, align 4
+ %add.ptr = getelementptr inbounds i32, i32* %p, i64 1
+ %tmp1 = load i32, i32* %add.ptr, align 4
+ %sexttmp = sext i32 %tmp to i64
+ %sexttmp1 = zext i32 %tmp1 to i64
+ %add = add nsw i64 %sexttmp1, %sexttmp
+ %add1 = add nsw i64 %add, %tmp0
+ ret i64 %add1
+}
+
+; Test zext + sext clustering.
+; CHECK: ********** MI Scheduling **********
+; CHECK-LABEL: ldp_half_zext_sext_int:BB#0
+; CHECK: Cluster loads SU(3) - SU(4)
+; CHECK: SU(3): %vreg{{[0-9]+}}:sub_32<def,read-undef> = LDRWui
+; CHECK: SU(4): %vreg{{[0-9]+}}<def> = LDRSWui
+define i64 @ldp_half_zext_sext_int(i64* %q, i32* %p) nounwind {
+ %tmp0 = load i64, i64* %q, align 4
+ %tmp = load i32, i32* %p, align 4
+ %add.ptr = getelementptr inbounds i32, i32* %p, i64 1
+ %tmp1 = load i32, i32* %add.ptr, align 4
+ %sexttmp = zext i32 %tmp to i64
+ %sexttmp1 = sext i32 %tmp1 to i64
+ %add = add nsw i64 %sexttmp1, %sexttmp
+ %add1 = add nsw i64 %add, %tmp0
+ ret i64 %add1
+}
+
+; Verify we don't cluster volatile loads.
+; CHECK: ********** MI Scheduling **********
+; CHECK-LABEL: ldr_int_volatile:BB#0
+; CHECK-NOT: Cluster loads
+; CHECK: SU(1): %vreg{{[0-9]+}}<def> = LDRWui
+; CHECK: SU(2): %vreg{{[0-9]+}}<def> = LDRWui
+define i32 @ldr_int_volatile(i32* %a) nounwind {
+ %p1 = getelementptr inbounds i32, i32* %a, i32 1
+ %tmp1 = load volatile i32, i32* %p1, align 2
+ %p2 = getelementptr inbounds i32, i32* %a, i32 2
+ %tmp2 = load volatile i32, i32* %p2, align 2
+ %tmp3 = add i32 %tmp1, %tmp2
+ ret i32 %tmp3
+}