From 97ca7c2cc9083ebde681b0e11f7a8ccae1966d64 Mon Sep 17 00:00:00 2001
From: Jay Foad
Date: Wed, 11 Dec 2019 10:29:23 +0000
Subject: [PATCH] [AArch64] Enable clustering memory accesses to fixed stack objects

Summary:
r347747 added support for clustering mem ops with FI base operands,
including support for fixed stack objects in shouldClusterFI, but
apparently this was never tested.

This patch fixes shouldClusterFI to work with scaled as well as unscaled
load/store instructions: the frame objects' byte offsets are converted to
the same "element" units as the instructions' offsets (using getMemScale,
which is hoisted from AArch64LoadStoreOptimizer into AArch64InstrInfo)
before checking that the two accesses are adjacent.

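For illustration only, here is a standalone C++ sketch of the new adjacency
check (the function and parameter names below are invented for this summary;
the in-tree code lives in shouldClusterFI and queries MachineFrameInfo and
the opcodes directly):

  #include <cstdint>

  // Sketch: would two accesses to a stack object be clustered?
  // Scale1/Scale2 are the access sizes in bytes (what getMemScale returns);
  // Offset1/Offset2 are the instructions' immediate offsets, already
  // expressed in those element units by the caller.
  static bool wouldClusterFI(int64_t ObjectOffset1, int64_t Offset1,
                             int64_t Scale1, int64_t ObjectOffset2,
                             int64_t Offset2, int64_t Scale2) {
    // A frame object whose byte offset is not a multiple of the access size
    // cannot be expressed in whole elements, so do not cluster.
    if (ObjectOffset1 % Scale1 != 0 || ObjectOffset2 % Scale2 != 0)
      return false;
    // Combine the object offsets (now in elements) with the instructions'
    // element offsets.
    ObjectOffset1 = ObjectOffset1 / Scale1 + Offset1;
    ObjectOffset2 = ObjectOffset2 / Scale2 + Offset2;
    // Cluster only if the second access is the element immediately after
    // the first.
    return ObjectOffset1 + 1 == ObjectOffset2;
  }
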
It also fixes the ordering of memory ops in MemOpInfo::operator< to ensure
that memory addresses always increase, regardless of which direction the
stack grows. With that ordering, the target hook only has to check that the
second access is exactly one element after the first.

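A minimal sketch of the intended ordering, reduced to the two fields that
matter here (the real MemOpInfo in MachineScheduler.cpp also carries the
base operand and compares base registers/frame indices before falling back
to the offset):

  #include <cstdint>

  struct MemOpInfoSketch {
    int64_t Offset;   // offset of the access from its base operand
    unsigned NodeNum; // SUnit number, used only as a tie-breaker
    // Order purely by increasing offset. The old code inverted the comparison
    // when the stack grows down, so for frame-index bases the "first" op
    // could be the one at the higher address, which defeats the
    // Offset1 + 1 == Offset2 adjacency test in the target hook.
    bool operator<(const MemOpInfoSketch &RHS) const {
      if (Offset != RHS.Offset)
        return Offset < RHS.Offset;
      return NodeNum < RHS.NodeNum;
    }
  };
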
Subscribers: MatzeB, kristof.beyls, hiraditya, javed.absar, arphaman, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D71334
---
 llvm/lib/CodeGen/MachineScheduler.cpp              |   2 +-
 llvm/lib/Target/AArch64/AArch64InstrInfo.cpp       | 107 +++++++++++--------
 llvm/lib/Target/AArch64/AArch64InstrInfo.h         |   6 ++
 .../Target/AArch64/AArch64LoadStoreOptimizer.cpp   |  95 +++---------------
 llvm/test/CodeGen/AArch64/arm64-memset-inline.ll   |   6 +-
 llvm/test/CodeGen/AArch64/cluster-frame-index.mir  |  30 +++++-
 llvm/test/CodeGen/AArch64/expand-select.ll         |  38 ++++----
 .../test/CodeGen/AArch64/tailcall_misched_graph.ll |   4 +-
 8 files changed, 143 insertions(+), 145 deletions(-)

diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
index 92dbc48..a100966 100644
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -1498,7 +1498,7 @@ class BaseMemOpClusterMutation : public ScheduleDAGMutation {
                  : BaseOp->getIndex() < RHS.BaseOp->getIndex();
 
       if (Offset != RHS.Offset)
-        return StackGrowsDown ? Offset > RHS.Offset : Offset < RHS.Offset;
+        return Offset < RHS.Offset;
 
       return SU->NodeNum < RHS.SU->NodeNum;
     }
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 18c098b..45a2d77 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -2230,54 +2230,82 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale,
   return true;
 }
 
-static unsigned getOffsetStride(unsigned Opc) {
+// Scaling factor for unscaled load or store.
+int AArch64InstrInfo::getMemScale(unsigned Opc) {
   switch (Opc) {
   default:
-    return 0;
-  case AArch64::LDURQi:
-  case AArch64::STURQi:
-    return 16;
-  case AArch64::LDURXi:
-  case AArch64::LDURDi:
-  case AArch64::STURXi:
-  case AArch64::STURDi:
-    return 8;
-  case AArch64::LDURWi:
+    llvm_unreachable("Opcode has unknown scale!");
+  case AArch64::LDRBBui:
+  case AArch64::LDURBBi:
+  case AArch64::LDRSBWui:
+  case AArch64::LDURSBWi:
+  case AArch64::STRBBui:
+  case AArch64::STURBBi:
+    return 1;
+  case AArch64::LDRHHui:
+  case AArch64::LDURHHi:
+  case AArch64::LDRSHWui:
+  case AArch64::LDURSHWi:
+  case AArch64::STRHHui:
+  case AArch64::STURHHi:
+    return 2;
+  case AArch64::LDRSui:
   case AArch64::LDURSi:
+  case AArch64::LDRSWui:
   case AArch64::LDURSWi:
-  case AArch64::STURWi:
+  case AArch64::LDRWui:
+  case AArch64::LDURWi:
+  case AArch64::STRSui:
   case AArch64::STURSi:
+  case AArch64::STRWui:
+  case AArch64::STURWi:
+  case AArch64::LDPSi:
+  case AArch64::LDPSWi:
+  case AArch64::LDPWi:
+  case AArch64::STPSi:
+  case AArch64::STPWi:
     return 4;
+  case AArch64::LDRDui:
+  case AArch64::LDURDi:
+  case AArch64::LDRXui:
+  case AArch64::LDURXi:
+  case AArch64::STRDui:
+  case AArch64::STURDi:
+  case AArch64::STRXui:
+  case AArch64::STURXi:
+  case AArch64::LDPDi:
+  case AArch64::LDPXi:
+  case AArch64::STPDi:
+  case AArch64::STPXi:
+    return 8;
+  case AArch64::LDRQui:
+  case AArch64::LDURQi:
+  case AArch64::STRQui:
+  case AArch64::STURQi:
+  case AArch64::LDPQi:
+  case AArch64::STPQi:
+  case AArch64::STGOffset:
+  case AArch64::STZGOffset:
+  case AArch64::ST2GOffset:
+  case AArch64::STZ2GOffset:
+  case AArch64::STGPi:
+    return 16;
   }
 }
 
 // Scale the unscaled offsets. Returns false if the unscaled offset can't be
 // scaled.
 static bool scaleOffset(unsigned Opc, int64_t &Offset) {
-  unsigned OffsetStride = getOffsetStride(Opc);
-  if (OffsetStride == 0)
-    return false;
+  int Scale = AArch64InstrInfo::getMemScale(Opc);
+
   // If the byte-offset isn't a multiple of the stride, we can't scale this
   // offset.
-  if (Offset % OffsetStride != 0)
+  if (Offset % Scale != 0)
     return false;
 
   // Convert the byte-offset used by unscaled into an "element" offset used
   // by the scaled pair load/store instructions.
-  Offset /= OffsetStride;
-  return true;
-}
-
-// Unscale the scaled offsets. Returns false if the scaled offset can't be
-// unscaled.
-static bool unscaleOffset(unsigned Opc, int64_t &Offset) {
-  unsigned OffsetStride = getOffsetStride(Opc);
-  if (OffsetStride == 0)
-    return false;
-
-  // Convert the "element" offset used by scaled pair load/store instructions
-  // into the byte-offset used by unscaled.
-  Offset *= OffsetStride;
+  Offset /= Scale;
   return true;
 }
 
@@ -2308,15 +2336,17 @@ static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1,
   int64_t ObjectOffset1 = MFI.getObjectOffset(FI1);
   int64_t ObjectOffset2 = MFI.getObjectOffset(FI2);
   assert(ObjectOffset1 <= ObjectOffset2 && "Object offsets are not ordered.");
-  // Get the byte-offset from the object offset.
-  if (!unscaleOffset(Opcode1, Offset1) || !unscaleOffset(Opcode2, Offset2))
+  // Convert to scaled object offsets.
+  int Scale1 = AArch64InstrInfo::getMemScale(Opcode1);
+  if (ObjectOffset1 % Scale1 != 0)
     return false;
+  ObjectOffset1 /= Scale1;
+  int Scale2 = AArch64InstrInfo::getMemScale(Opcode2);
+  if (ObjectOffset2 % Scale2 != 0)
+    return false;
+  ObjectOffset2 /= Scale2;
   ObjectOffset1 += Offset1;
   ObjectOffset2 += Offset2;
-  // Get the "element" index in the object.
-  if (!scaleOffset(Opcode1, ObjectOffset1) ||
-      !scaleOffset(Opcode2, ObjectOffset2))
-    return false;
   return ObjectOffset1 + 1 == ObjectOffset2;
 }
 
@@ -2376,7 +2406,7 @@ bool AArch64InstrInfo::shouldClusterMemOps(const MachineOperand &BaseOp1,
   // The caller should already have ordered First/SecondLdSt by offset.
   // Note: except for non-equal frame index bases
   if (BaseOp1.isFI()) {
-    assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 >= Offset2) &&
+    assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 <= Offset2) &&
            "Caller should have ordered offsets.");
 
     const MachineFrameInfo &MFI =
@@ -2385,8 +2415,7 @@ bool AArch64InstrInfo::shouldClusterMemOps(const MachineOperand &BaseOp1,
                            BaseOp2.getIndex(), Offset2, SecondOpc);
   }
 
-  assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 <= Offset2) &&
-         "Caller should have ordered offsets.");
+  assert(Offset1 <= Offset2 && "Caller should have ordered offsets.");
 
   return Offset1 + 1 == Offset2;
 }
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
index c3d2783..66e517e 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -89,6 +89,12 @@ public:
   /// if there is a corresponding unscaled variant available.
   static Optional<unsigned> getUnscaledLdSt(unsigned Opc);
 
+  /// Scaling factor for (scaled or unscaled) load or store.
+  static int getMemScale(unsigned Opc);
+  static int getMemScale(const MachineInstr &MI) {
+    return getMemScale(MI.getOpcode());
+  }
+
   /// Returns the index for the immediate for a given instruction.
   static unsigned getLoadStoreImmIdx(unsigned Opc);
 
diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index 296115c..d24e6d6 100644
--- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -230,69 +230,6 @@ static bool isTagStore(const MachineInstr &MI) {
   }
 }
 
-// Scaling factor for unscaled load or store.
-static int getMemScale(const MachineInstr &MI) {
-  switch (MI.getOpcode()) {
-  default:
-    llvm_unreachable("Opcode has unknown scale!");
-  case AArch64::LDRBBui:
-  case AArch64::LDURBBi:
-  case AArch64::LDRSBWui:
-  case AArch64::LDURSBWi:
-  case AArch64::STRBBui:
-  case AArch64::STURBBi:
-    return 1;
-  case AArch64::LDRHHui:
-  case AArch64::LDURHHi:
-  case AArch64::LDRSHWui:
-  case AArch64::LDURSHWi:
-  case AArch64::STRHHui:
-  case AArch64::STURHHi:
-    return 2;
-  case AArch64::LDRSui:
-  case AArch64::LDURSi:
-  case AArch64::LDRSWui:
-  case AArch64::LDURSWi:
-  case AArch64::LDRWui:
-  case AArch64::LDURWi:
-  case AArch64::STRSui:
-  case AArch64::STURSi:
-  case AArch64::STRWui:
-  case AArch64::STURWi:
-  case AArch64::LDPSi:
-  case AArch64::LDPSWi:
-  case AArch64::LDPWi:
-  case AArch64::STPSi:
-  case AArch64::STPWi:
-    return 4;
-  case AArch64::LDRDui:
-  case AArch64::LDURDi:
-  case AArch64::LDRXui:
-  case AArch64::LDURXi:
-  case AArch64::STRDui:
-  case AArch64::STURDi:
-  case AArch64::STRXui:
-  case AArch64::STURXi:
-  case AArch64::LDPDi:
-  case AArch64::LDPXi:
-  case AArch64::STPDi:
-  case AArch64::STPXi:
-    return 8;
-  case AArch64::LDRQui:
-  case AArch64::LDURQi:
-  case AArch64::STRQui:
-  case AArch64::STURQi:
-  case AArch64::LDPQi:
-  case AArch64::STPQi:
-  case AArch64::STGOffset:
-  case AArch64::STZGOffset:
-  case AArch64::ST2GOffset:
-  case AArch64::STZ2GOffset:
-  case AArch64::STGPi:
-    return 16;
-  }
-}
-
 static unsigned getMatchingNonSExtOpcode(unsigned Opc,
                                          bool *IsValidLdStrOpc = nullptr) {
   if (IsValidLdStrOpc)
@@ -603,7 +540,7 @@ static void getPrePostIndexedMemOpInfo(const MachineInstr &MI, int &Scale,
   // ST*G and all paired ldst have the same scale in pre/post-indexed variants
   // as in the "unsigned offset" variant.
   // All other pre/post indexed ldst instructions are unscaled.
-  Scale = (IsTagStore || IsPaired) ? getMemScale(MI) : 1;
+  Scale = (IsTagStore || IsPaired) ? AArch64InstrInfo::getMemScale(MI) : 1;
 
   if (IsPaired) {
     MinOffset = -64;
@@ -635,8 +572,8 @@ static bool isLdOffsetInRangeOfSt(MachineInstr &LoadInst,
                                   MachineInstr &StoreInst,
                                   const AArch64InstrInfo *TII) {
   assert(isMatchingStore(LoadInst, StoreInst) && "Expect only matched ld/st.");
-  int LoadSize = getMemScale(LoadInst);
-  int StoreSize = getMemScale(StoreInst);
+  int LoadSize = TII->getMemScale(LoadInst);
+  int StoreSize = TII->getMemScale(StoreInst);
   int UnscaledStOffset = TII->isUnscaledLdSt(StoreInst)
                              ? getLdStOffsetOp(StoreInst).getImm()
                              : getLdStOffsetOp(StoreInst).getImm() * StoreSize;
@@ -746,7 +683,7 @@ AArch64LoadStoreOpt::mergeNarrowZeroStores(MachineBasicBlock::iterator I,
 
   unsigned Opc = I->getOpcode();
   bool IsScaled = !TII->isUnscaledLdSt(Opc);
-  int OffsetStride = IsScaled ? 1 : getMemScale(*I);
+  int OffsetStride = IsScaled ? 1 : TII->getMemScale(*I);
   bool MergeForward = Flags.getMergeForward();
 
   // Insert our new paired instruction after whichever of the paired
@@ -853,7 +790,7 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
   unsigned Opc =
       SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode());
   bool IsUnscaled = TII->isUnscaledLdSt(Opc);
-  int OffsetStride = IsUnscaled ? getMemScale(*I) : 1;
+  int OffsetStride = IsUnscaled ? TII->getMemScale(*I) : 1;
 
   bool MergeForward = Flags.getMergeForward();
 
@@ -938,11 +875,11 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
     // We're trying to pair instructions that differ in how they are scaled. If
     // I is scaled then scale the offset of Paired accordingly. Otherwise, do
    // the opposite (i.e., make Paired's offset unscaled).
-    int MemSize = getMemScale(*Paired);
+    int MemSize = TII->getMemScale(*Paired);
     if (PairedIsUnscaled) {
       // If the unscaled offset isn't a multiple of the MemSize, we can't
       // pair the operations together.
-      assert(!(PairedOffset % getMemScale(*Paired)) &&
+      assert(!(PairedOffset % TII->getMemScale(*Paired)) &&
              "Offset should be a multiple of the stride!");
       PairedOffset /= MemSize;
     } else {
@@ -967,9 +904,9 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
   int OffsetImm = getLdStOffsetOp(*RtMI).getImm();
   // Scale the immediate offset, if necessary.
   if (TII->isUnscaledLdSt(RtMI->getOpcode())) {
-    assert(!(OffsetImm % getMemScale(*RtMI)) &&
+    assert(!(OffsetImm % TII->getMemScale(*RtMI)) &&
           "Unscaled offset cannot be scaled.");
-    OffsetImm /= getMemScale(*RtMI);
+    OffsetImm /= TII->getMemScale(*RtMI);
   }
 
   // Construct the new instruction.
@@ -1069,8 +1006,8 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
   MachineBasicBlock::iterator NextI = LoadI;
   ++NextI;
 
-  int LoadSize = getMemScale(*LoadI);
-  int StoreSize = getMemScale(*StoreI);
+  int LoadSize = TII->getMemScale(*LoadI);
+  int StoreSize = TII->getMemScale(*StoreI);
   Register LdRt = getLdStRegOp(*LoadI).getReg();
   const MachineOperand &StMO = getLdStRegOp(*StoreI);
   Register StRt = getLdStRegOp(*StoreI).getReg();
@@ -1489,7 +1426,7 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
   Register Reg = getLdStRegOp(FirstMI).getReg();
   Register BaseReg = getLdStBaseOp(FirstMI).getReg();
   int Offset = getLdStOffsetOp(FirstMI).getImm();
-  int OffsetStride = IsUnscaled ? getMemScale(FirstMI) : 1;
+  int OffsetStride = IsUnscaled ? TII->getMemScale(FirstMI) : 1;
   bool IsPromotableZeroStore = isPromotableZeroStoreInst(FirstMI);
 
   Optional<bool> MaybeCanRename = None;
@@ -1534,7 +1471,7 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
         // We're trying to pair instructions that differ in how they are scaled.
         // If FirstMI is scaled then scale the offset of MI accordingly.
         // Otherwise, do the opposite (i.e., make MI's offset unscaled).
-        int MemSize = getMemScale(MI);
+        int MemSize = TII->getMemScale(MI);
         if (MIIsUnscaled) {
           // If the unscaled offset isn't a multiple of the MemSize, we can't
           // pair the operations together: bail and keep looking.
@@ -1792,7 +1729,7 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
   MachineBasicBlock::iterator MBBI = I;
 
   Register BaseReg = getLdStBaseOp(MemMI).getReg();
-  int MIUnscaledOffset = getLdStOffsetOp(MemMI).getImm() * getMemScale(MemMI);
+  int MIUnscaledOffset = getLdStOffsetOp(MemMI).getImm() * TII->getMemScale(MemMI);
 
   // Scan forward looking for post-index opportunities. Updating instructions
   // can't be formed if the memory instruction doesn't have the offset we're
@@ -1963,7 +1900,7 @@ bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) {
   // with Offset-1)
   bool IsUnscaled = TII->isUnscaledLdSt(MI);
   int Offset = getLdStOffsetOp(MI).getImm();
-  int OffsetStride = IsUnscaled ? getMemScale(MI) : 1;
+  int OffsetStride = IsUnscaled ? TII->getMemScale(MI) : 1;
   // Allow one more for offset.
   if (Offset > 0)
     Offset -= OffsetStride;
@@ -2029,7 +1966,7 @@ bool AArch64LoadStoreOpt::tryToMergeLdStUpdate
   // The immediate in the load/store is scaled by the size of the memory
   // operation. The immediate in the add we're looking for,
   // however, is not, so adjust here.
-  int UnscaledOffset = getLdStOffsetOp(MI).getImm() * getMemScale(MI);
+  int UnscaledOffset = getLdStOffsetOp(MI).getImm() * TII->getMemScale(MI);
 
   // Look forward to try to find a pre-index instruction. For example,
   // ldr x1, [x0, #64]
diff --git a/llvm/test/CodeGen/AArch64/arm64-memset-inline.ll b/llvm/test/CodeGen/AArch64/arm64-memset-inline.ll
index 460ccc2..d0b1521 100644
--- a/llvm/test/CodeGen/AArch64/arm64-memset-inline.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-memset-inline.ll
@@ -113,9 +113,9 @@ define void @bzero_20_stack() {
 
 define void @bzero_26_stack() {
 ; CHECK-LABEL: bzero_26_stack:
-; CHECK: stp xzr, xzr, [sp, #8]
-; CHECK-NEXT: str xzr, [sp]
+; CHECK: stp xzr, xzr, [sp]
 ; CHECK-NEXT: strh wzr, [sp, #24]
+; CHECK-NEXT: str xzr, [sp, #16]
 ; CHECK-NEXT: bl something
   %buf = alloca [26 x i8], align 1
   %cast = bitcast [26 x i8]* %buf to i8*
@@ -259,9 +259,9 @@ define void @memset_12_stack() {
 define void @memset_16_stack() {
 ; CHECK-LABEL: memset_16_stack:
 ; CHECK: mov x8, #-6148914691236517206
+; CHECK-NEXT: str x8, [sp, #-32]!
 ; CHECK-NEXT: mov x0, sp
 ; CHECK-NEXT: stp x8, x30, [sp, #8]
-; CHECK-NEXT: str x8, [sp]
 ; CHECK-NEXT: bl something
   %buf = alloca [16 x i8], align 1
   %cast = bitcast [16 x i8]* %buf to i8*
diff --git a/llvm/test/CodeGen/AArch64/cluster-frame-index.mir b/llvm/test/CodeGen/AArch64/cluster-frame-index.mir
index 75ccdc4..74582b3 100644
--- a/llvm/test/CodeGen/AArch64/cluster-frame-index.mir
+++ b/llvm/test/CodeGen/AArch64/cluster-frame-index.mir
@@ -1,11 +1,10 @@
 #RUN: llc -mtriple=aarch64-- -mcpu=cyclone -run-pass machine-scheduler -o - %s | FileCheck %s
-...
 ---
 name: merge_stack
 # CHECK-LABEL: name: merge_stack
 tracksRegLiveness: true
 stack:
-  - { id: 0, size: 64, alignment: 8 }
+  - { id: 0, size: 16, alignment: 8 }
 body: |
   bb.0:
     liveins: $w0, $w1
@@ -25,3 +24,30 @@ body: |
   ; CHECK-NEXT: STRXui
   ; CHECK-NEXT: STRXui
   ; CHECK-NEXT: RET
+...
+---
+name: merge_fixedstack
+# CHECK-LABEL: name: merge_fixedstack
+tracksRegLiveness: true
+fixedStack:
+  - { id: 0, size: 16, alignment: 8, offset: -16 }
+body: |
+  bb.0:
+    liveins: $w0, $w1
+
+    %0:gpr32 = COPY $w0
+    %1:gpr32 = COPY $w1
+    undef %3.sub_32:gpr64 = ORRWrs $wzr, %0, 0
+    STRXui %3, %fixed-stack.0, 0 :: (store 8)
+    undef %5.sub_32:gpr64 = ORRWrs $wzr, %1, 0
+    STRXui %5, %fixed-stack.0, 1 :: (store 8)
+    RET_ReallyLR
+
+    ; CHECK: COPY
+    ; CHECK-NEXT: COPY
+    ; CHECK-NEXT: ORRWrs
+    ; CHECK-NEXT: ORRWrs
+    ; CHECK-NEXT: STRXui
+    ; CHECK-NEXT: STRXui
+    ; CHECK-NEXT: RET
+...
diff --git a/llvm/test/CodeGen/AArch64/expand-select.ll b/llvm/test/CodeGen/AArch64/expand-select.ll
index da1e8dc..d1f49e1 100644
--- a/llvm/test/CodeGen/AArch64/expand-select.ll
+++ b/llvm/test/CodeGen/AArch64/expand-select.ll
@@ -4,20 +4,20 @@
 define void @foo(i32 %In1, <2 x i128> %In2, <2 x i128> %In3, <2 x i128> *%Out) {
 ; CHECK-LABEL: foo:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: and w9, w0, #0x1
+; CHECK-NEXT: and w8, w0, #0x1
 ; CHECK-NEXT: fmov s0, wzr
-; CHECK-NEXT: ldp x10, x8, [sp, #8]
-; CHECK-NEXT: fmov s1, w9
-; CHECK-NEXT: ldr x9, [sp]
+; CHECK-NEXT: ldp x10, x9, [sp, #8]
+; CHECK-NEXT: fmov s1, w8
+; CHECK-NEXT: ldr x8, [sp]
 ; CHECK-NEXT: cmeq v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT: fmov w11, s0
 ; CHECK-NEXT: tst w11, #0x1
 ; CHECK-NEXT: csel x11, x2, x6, ne
 ; CHECK-NEXT: csel x12, x3, x7, ne
-; CHECK-NEXT: csel x9, x4, x9, ne
+; CHECK-NEXT: csel x8, x4, x8, ne
 ; CHECK-NEXT: csel x10, x5, x10, ne
-; CHECK-NEXT: stp x9, x10, [x8, #16]
-; CHECK-NEXT: stp x11, x12, [x8]
+; CHECK-NEXT: stp x8, x10, [x9, #16]
+; CHECK-NEXT: stp x11, x12, [x9]
 ; CHECK-NEXT: ret
   %cond = and i32 %In1, 1
   %cbool = icmp eq i32 %cond, 0
@@ -31,25 +31,25 @@ define void @foo(i32 %In1, <2 x i128> %In2, <2 x i128> %In3, <2 x i128> *%Out) {
 define void @bar(i32 %In1, <2 x i96> %In2, <2 x i96> %In3, <2 x i96> *%Out) {
 ; CHECK-LABEL: bar:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: and w10, w0, #0x1
+; CHECK-NEXT: and w9, w0, #0x1
 ; CHECK-NEXT: fmov s0, wzr
-; CHECK-NEXT: fmov s1, w10
+; CHECK-NEXT: fmov s1, w9
 ; CHECK-NEXT: cmeq v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT: ldp x11, x8, [sp, #8]
-; CHECK-NEXT: ldr x9, [sp]
+; CHECK-NEXT: ldr x10, [sp]
 ; CHECK-NEXT: dup v1.4s, v0.s[0]
-; CHECK-NEXT: mov x10, v1.d[1]
-; CHECK-NEXT: lsr x10, x10, #32
-; CHECK-NEXT: tst w10, #0x1
-; CHECK-NEXT: fmov w10, s0
+; CHECK-NEXT: mov x9, v1.d[1]
+; CHECK-NEXT: lsr x9, x9, #32
+; CHECK-NEXT: tst w9, #0x1
+; CHECK-NEXT: fmov w9, s0
 ; CHECK-NEXT: csel x11, x5, x11, ne
-; CHECK-NEXT: csel x9, x4, x9, ne
-; CHECK-NEXT: tst w10, #0x1
-; CHECK-NEXT: csel x10, x3, x7, ne
+; CHECK-NEXT: csel x10, x4, x10, ne
+; CHECK-NEXT: tst w9, #0x1
+; CHECK-NEXT: csel x9, x3, x7, ne
 ; CHECK-NEXT: csel x12, x2, x6, ne
-; CHECK-NEXT: stur x9, [x8, #12]
+; CHECK-NEXT: stur x10, [x8, #12]
 ; CHECK-NEXT: str x12, [x8]
-; CHECK-NEXT: str w10, [x8, #8]
+; CHECK-NEXT: str w9, [x8, #8]
 ; CHECK-NEXT: str w11, [x8, #20]
 ; CHECK-NEXT: ret
   %cond = and i32 %In1, 1
diff --git a/llvm/test/CodeGen/AArch64/tailcall_misched_graph.ll b/llvm/test/CodeGen/AArch64/tailcall_misched_graph.ll
index 090ab37..776ec39 100644
--- a/llvm/test/CodeGen/AArch64/tailcall_misched_graph.ll
+++ b/llvm/test/CodeGen/AArch64/tailcall_misched_graph.ll
@@ -39,8 +39,8 @@ declare void @callee2(i8*, i8*, i8*, i8*, i8*,
 ; Make sure that there is an dependence edge between fi#-2 and fi#-4.
 ; Without this edge the scheduler would be free to move the store accross the load.
 
-; COMMON: SU({{.*}}): [[VRB]]:gpr64 = LDRXui %fixed-stack.2
-; COMMON-NOT: SU
+; COMMON: {{^SU(.*)}}: [[VRB]]:gpr64 = LDRXui %fixed-stack.2
+; COMMON-NOT: {{^SU(.*)}}:
 ; COMMON: Successors:
 ; COMMON: SU([[DEPSTOREB:.*]]): Ord Latency=0
 ; COMMON: SU([[DEPSTOREA:.*]]): Ord Latency=0
-- 
2.7.4