From 5ab4a4793efbe8805c3fedc8f6625ebad987ce7b Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Mon, 23 Apr 2018 19:09:34 +0000 Subject: [PATCH] Reland r329956, "AArch64: Introduce a DAG combine for folding offsets into addresses.", with a fix for the bot failure. This reland includes a check to prevent the DAG combiner from folding an offset that is smaller than the existing one. This can cause oscillations between two possible DAGs, which was the cause of the hang and later assertion failure observed on the lnt-ctmark-aarch64-O3-flto bot. http://green.lab.llvm.org/green/job/lnt-ctmark-aarch64-O3-flto/2024/ Original commit message: > This is a code size win in code that takes offseted addresses > frequently, such as C++ constructors that typically need to compute > an offseted address of a vtable. This reduces the size of Chromium > for Android's .text section by 108KB. Differential Revision: https://reviews.llvm.org/D45199 llvm-svn: 330630 --- llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp | 18 +-- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 71 ++++++++++-- llvm/test/CodeGen/AArch64/arm64-addrmode.ll | 31 +++-- llvm/test/CodeGen/AArch64/arm64-vector-ldst.ll | 129 ++++++++++++++------- llvm/test/CodeGen/AArch64/fold-global-offsets.ll | 69 +++++++++++ llvm/test/CodeGen/AArch64/global-merge-3.ll | 4 +- .../global-merge-ignore-single-use-minsize.ll | 11 +- .../AArch64/global-merge-ignore-single-use.ll | 6 +- 8 files changed, 256 insertions(+), 83 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/fold-global-offsets.ll diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index eee59f1..d44eee0 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -743,14 +743,16 @@ bool AArch64DAGToDAGISel::SelectAddrModeIndexed(SDValue N, unsigned Size, if (!GAN) return true; - const GlobalValue *GV = GAN->getGlobal(); - unsigned Alignment = GV->getAlignment(); - Type *Ty = GV->getValueType(); - if (Alignment == 0 && Ty->isSized()) - Alignment = DL.getABITypeAlignment(Ty); - - if (Alignment >= Size) - return true; + if (GAN->getOffset() % Size == 0) { + const GlobalValue *GV = GAN->getGlobal(); + unsigned Alignment = GV->getAlignment(); + Type *Ty = GV->getValueType(); + if (Alignment == 0 && Ty->isSized()) + Alignment = DL.getABITypeAlignment(Ty); + + if (Alignment >= Size) + return true; + } } if (CurDAG->isBaseWithConstantOffset(N)) { diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 19573e1..e12aeb4 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -577,6 +577,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); + setTargetDAGCombine(ISD::GlobalAddress); + MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 8; MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4; MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = 4; @@ -3677,7 +3679,8 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty, SelectionDAG &DAG, unsigned Flag) const { - return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty, 0, Flag); + return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty, + N->getOffset(), Flag); } SDValue 
AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty, @@ -3752,8 +3755,9 @@ SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op, unsigned char OpFlags = Subtarget->ClassifyGlobalReference(GV, getTargetMachine()); - assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 && - "unexpected offset in global node"); + if (OpFlags != AArch64II::MO_NO_FLAG) + assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 && + "unexpected offset in global node"); // This also catches the large code model case for Darwin. if ((OpFlags & AArch64II::MO_GOT) != 0) { @@ -4991,10 +4995,8 @@ SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op, bool AArch64TargetLowering::isOffsetFoldingLegal( const GlobalAddressSDNode *GA) const { - DEBUG(dbgs() << "Skipping offset folding global address: "); - DEBUG(GA->dump()); - DEBUG(dbgs() << "AArch64 doesn't support folding offsets into global " - "addresses\n"); + // Offsets are folded in the DAG combine rather than here so that we can + // intelligently choose an offset based on the uses. return false; } @@ -10617,6 +10619,59 @@ static SDValue performNVCASTCombine(SDNode *N) { return SDValue(); } +// If all users of the globaladdr are of the form (globaladdr + constant), find +// the smallest constant, fold it into the globaladdr's offset and rewrite the +// globaladdr as (globaladdr + constant) - constant. +static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG, + const AArch64Subtarget *Subtarget, + const TargetMachine &TM) { + auto *GN = dyn_cast<GlobalAddressSDNode>(N); + if (!GN || Subtarget->ClassifyGlobalReference(GN->getGlobal(), TM) != + AArch64II::MO_NO_FLAG) + return SDValue(); + + uint64_t MinOffset = -1ull; + for (SDNode *N : GN->uses()) { + if (N->getOpcode() != ISD::ADD) + return SDValue(); + auto *C = dyn_cast<ConstantSDNode>(N->getOperand(0)); + if (!C) + C = dyn_cast<ConstantSDNode>(N->getOperand(1)); + if (!C) + return SDValue(); + MinOffset = std::min(MinOffset, C->getZExtValue()); + } + uint64_t Offset = MinOffset + GN->getOffset(); + + // Require that the new offset is larger than the existing one. Otherwise, we + // can end up oscillating between two possible DAGs, for example, + // (add (add globaladdr + 10, -1), 1) and (add globaladdr + 9, 1). + if (Offset <= uint64_t(GN->getOffset())) + return SDValue(); + + // Check whether folding this offset is legal. It must not go out of bounds of + // the referenced object to avoid violating the code model, and must be + // smaller than 2^21 because this is the largest offset expressible in all + // object formats. + // + // This check also prevents us from folding negative offsets, which will end + // up being treated in the same way as large positive ones. They could also + // cause code model violations, and aren't really common enough to matter.
+ if (Offset >= (1 << 21)) + return SDValue(); + + const GlobalValue *GV = GN->getGlobal(); + Type *T = GV->getValueType(); + if (!T->isSized() || + Offset > GV->getParent()->getDataLayout().getTypeAllocSize(T)) + return SDValue(); + + SDLoc DL(GN); + SDValue Result = DAG.getGlobalAddress(GV, DL, MVT::i64, Offset); + return DAG.getNode(ISD::SUB, DL, MVT::i64, Result, + DAG.getConstant(MinOffset, DL, MVT::i64)); +} + SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -10704,6 +10759,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, default: break; } + case ISD::GlobalAddress: + return performGlobalAddressCombine(N, DAG, Subtarget, getTargetMachine()); } return SDValue(); } diff --git a/llvm/test/CodeGen/AArch64/arm64-addrmode.ll b/llvm/test/CodeGen/AArch64/arm64-addrmode.ll index 6da7679..16f8d01 100644 --- a/llvm/test/CodeGen/AArch64/arm64-addrmode.ll +++ b/llvm/test/CodeGen/AArch64/arm64-addrmode.ll @@ -5,32 +5,31 @@ ; base + offset (imm9) ; CHECK: @t1 -; CHECK: ldr xzr, [x{{[0-9]+}}, #8] +; CHECK: ldr xzr, [x0, #8] ; CHECK: ret -define void @t1() { - %incdec.ptr = getelementptr inbounds i64, i64* @object, i64 1 +define void @t1(i64* %object) { + %incdec.ptr = getelementptr inbounds i64, i64* %object, i64 1 %tmp = load volatile i64, i64* %incdec.ptr, align 8 ret void } ; base + offset (> imm9) ; CHECK: @t2 -; CHECK: sub [[ADDREG:x[0-9]+]], x{{[0-9]+}}, #264 +; CHECK: sub [[ADDREG:x[0-9]+]], x0, #264 ; CHECK: ldr xzr, [ -; CHECK: [[ADDREG]]] ; CHECK: ret -define void @t2() { - %incdec.ptr = getelementptr inbounds i64, i64* @object, i64 -33 +define void @t2(i64* %object) { + %incdec.ptr = getelementptr inbounds i64, i64* %object, i64 -33 %tmp = load volatile i64, i64* %incdec.ptr, align 8 ret void } ; base + unsigned offset (> imm9 and <= imm12 * size of type in bytes) ; CHECK: @t3 -; CHECK: ldr xzr, [x{{[0-9]+}}, #32760] +; CHECK: ldr xzr, [x0, #32760] ; CHECK: ret -define void @t3() { - %incdec.ptr = getelementptr inbounds i64, i64* @object, i64 4095 +define void @t3(i64* %object) { + %incdec.ptr = getelementptr inbounds i64, i64* %object, i64 4095 %tmp = load volatile i64, i64* %incdec.ptr, align 8 ret void } @@ -38,10 +37,10 @@ define void @t3() { ; base + unsigned offset (> imm12 * size of type in bytes) ; CHECK: @t4 ; CHECK: orr w[[NUM:[0-9]+]], wzr, #0x8000 -; CHECK: ldr xzr, [x{{[0-9]+}}, x[[NUM]]] +; CHECK: ldr xzr, [x0, x[[NUM]]] ; CHECK: ret -define void @t4() { - %incdec.ptr = getelementptr inbounds i64, i64* @object, i64 4096 +define void @t4(i64* %object) { + %incdec.ptr = getelementptr inbounds i64, i64* %object, i64 4096 %tmp = load volatile i64, i64* %incdec.ptr, align 8 ret void } @@ -58,12 +57,12 @@ define void @t5(i64 %a) { ; base + reg + imm ; CHECK: @t6 -; CHECK: add [[ADDREG:x[0-9]+]], x{{[0-9]+}}, x{{[0-9]+}}, lsl #3 +; CHECK: add [[ADDREG:x[0-9]+]], x1, x0, lsl #3 ; CHECK-NEXT: orr w[[NUM:[0-9]+]], wzr, #0x8000 ; CHECK: ldr xzr, [x{{[0-9]+}}, x[[NUM]]] ; CHECK: ret -define void @t6(i64 %a) { - %tmp1 = getelementptr inbounds i64, i64* @object, i64 %a +define void @t6(i64 %a, i64* %object) { + %tmp1 = getelementptr inbounds i64, i64* %object, i64 %a %incdec.ptr = getelementptr inbounds i64, i64* %tmp1, i64 4096 %tmp = load volatile i64, i64* %incdec.ptr, align 8 ret void diff --git a/llvm/test/CodeGen/AArch64/arm64-vector-ldst.ll b/llvm/test/CodeGen/AArch64/arm64-vector-ldst.ll index 938b3d1..6e530cb 100644 --- a/llvm/test/CodeGen/AArch64/arm64-vector-ldst.ll +++ 
b/llvm/test/CodeGen/AArch64/arm64-vector-ldst.ll @@ -264,149 +264,196 @@ entry: ; Add a bunch of tests for rdar://13258794: Match LDUR/STUR for D and Q ; registers for unscaled vector accesses -@str = global [63 x i8] c"Test case for rdar://13258794: LDUR/STUR for D and Q registers\00", align 1 -define <1 x i64> @fct0() nounwind readonly ssp { +define <1 x i64> @fct0(i8* %str) nounwind readonly ssp { entry: ; CHECK-LABEL: fct0: ; CHECK: ldur {{d[0-9]+}}, [{{x[0-9]+}}, #3] - %0 = load <1 x i64>, <1 x i64>* bitcast (i8* getelementptr inbounds ([63 x i8], [63 x i8]* @str, i64 0, i64 3) to <1 x i64>*), align 8 + %p = getelementptr inbounds i8, i8* %str, i64 3 + %q = bitcast i8* %p to <1 x i64>* + %0 = load <1 x i64>, <1 x i64>* %q, align 8 ret <1 x i64> %0 } -define <2 x i32> @fct1() nounwind readonly ssp { +define <2 x i32> @fct1(i8* %str) nounwind readonly ssp { entry: ; CHECK-LABEL: fct1: ; CHECK: ldur {{d[0-9]+}}, [{{x[0-9]+}}, #3] - %0 = load <2 x i32>, <2 x i32>* bitcast (i8* getelementptr inbounds ([63 x i8], [63 x i8]* @str, i64 0, i64 3) to <2 x i32>*), align 8 + %p = getelementptr inbounds i8, i8* %str, i64 3 + %q = bitcast i8* %p to <2 x i32>* + %0 = load <2 x i32>, <2 x i32>* %q, align 8 ret <2 x i32> %0 } -define <4 x i16> @fct2() nounwind readonly ssp { +define <4 x i16> @fct2(i8* %str) nounwind readonly ssp { entry: ; CHECK-LABEL: fct2: ; CHECK: ldur {{d[0-9]+}}, [{{x[0-9]+}}, #3] - %0 = load <4 x i16>, <4 x i16>* bitcast (i8* getelementptr inbounds ([63 x i8], [63 x i8]* @str, i64 0, i64 3) to <4 x i16>*), align 8 + %p = getelementptr inbounds i8, i8* %str, i64 3 + %q = bitcast i8* %p to <4 x i16>* + %0 = load <4 x i16>, <4 x i16>* %q, align 8 ret <4 x i16> %0 } -define <8 x i8> @fct3() nounwind readonly ssp { +define <8 x i8> @fct3(i8* %str) nounwind readonly ssp { entry: ; CHECK-LABEL: fct3: ; CHECK: ldur {{d[0-9]+}}, [{{x[0-9]+}}, #3] - %0 = load <8 x i8>, <8 x i8>* bitcast (i8* getelementptr inbounds ([63 x i8], [63 x i8]* @str, i64 0, i64 3) to <8 x i8>*), align 8 + %p = getelementptr inbounds i8, i8* %str, i64 3 + %q = bitcast i8* %p to <8 x i8>* + %0 = load <8 x i8>, <8 x i8>* %q, align 8 ret <8 x i8> %0 } -define <2 x i64> @fct4() nounwind readonly ssp { +define <2 x i64> @fct4(i8* %str) nounwind readonly ssp { entry: ; CHECK-LABEL: fct4: ; CHECK: ldur {{q[0-9]+}}, [{{x[0-9]+}}, #3] - %0 = load <2 x i64>, <2 x i64>* bitcast (i8* getelementptr inbounds ([63 x i8], [63 x i8]* @str, i64 0, i64 3) to <2 x i64>*), align 16 + %p = getelementptr inbounds i8, i8* %str, i64 3 + %q = bitcast i8* %p to <2 x i64>* + %0 = load <2 x i64>, <2 x i64>* %q, align 16 ret <2 x i64> %0 } -define <4 x i32> @fct5() nounwind readonly ssp { +define <4 x i32> @fct5(i8* %str) nounwind readonly ssp { entry: ; CHECK-LABEL: fct5: ; CHECK: ldur {{q[0-9]+}}, [{{x[0-9]+}}, #3] - %0 = load <4 x i32>, <4 x i32>* bitcast (i8* getelementptr inbounds ([63 x i8], [63 x i8]* @str, i64 0, i64 3) to <4 x i32>*), align 16 + %p = getelementptr inbounds i8, i8* %str, i64 3 + %q = bitcast i8* %p to <4 x i32>* + %0 = load <4 x i32>, <4 x i32>* %q, align 16 ret <4 x i32> %0 } -define <8 x i16> @fct6() nounwind readonly ssp { +define <8 x i16> @fct6(i8* %str) nounwind readonly ssp { entry: ; CHECK-LABEL: fct6: ; CHECK: ldur {{q[0-9]+}}, [{{x[0-9]+}}, #3] - %0 = load <8 x i16>, <8 x i16>* bitcast (i8* getelementptr inbounds ([63 x i8], [63 x i8]* @str, i64 0, i64 3) to <8 x i16>*), align 16 + %p = getelementptr inbounds i8, i8* %str, i64 3 + %q = bitcast i8* %p to <8 x i16>* + %0 = load <8 x i16>, <8 x i16>* %q, align 
16 ret <8 x i16> %0 } -define <16 x i8> @fct7() nounwind readonly ssp { +define <16 x i8> @fct7(i8* %str) nounwind readonly ssp { entry: ; CHECK-LABEL: fct7: ; CHECK: ldur {{q[0-9]+}}, [{{x[0-9]+}}, #3] - %0 = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([63 x i8], [63 x i8]* @str, i64 0, i64 3) to <16 x i8>*), align 16 + %p = getelementptr inbounds i8, i8* %str, i64 3 + %q = bitcast i8* %p to <16 x i8>* + %0 = load <16 x i8>, <16 x i8>* %q, align 16 ret <16 x i8> %0 } -define void @fct8() nounwind ssp { +define void @fct8(i8* %str) nounwind ssp { entry: ; CHECK-LABEL: fct8: ; CHECK: ldur [[DESTREG:d[0-9]+]], {{\[}}[[BASEREG:x[0-9]+]], #3] ; CHECK: stur [[DESTREG]], {{\[}}[[BASEREG]], #4] - %0 = load <1 x i64>, <1 x i64>* bitcast (i8* getelementptr inbounds ([63 x i8], [63 x i8]* @str, i64 0, i64 3) to <1 x i64>*), align 8 - store <1 x i64> %0, <1 x i64>* bitcast (i8* getelementptr inbounds ([63 x i8], [63 x i8]* @str, i64 0, i64 4) to <1 x i64>*), align 8 + %p = getelementptr inbounds i8, i8* %str, i64 3 + %q = bitcast i8* %p to <1 x i64>* + %0 = load <1 x i64>, <1 x i64>* %q, align 8 + %p2 = getelementptr inbounds i8, i8* %str, i64 4 + %q2 = bitcast i8* %p2 to <1 x i64>* + store <1 x i64> %0, <1 x i64>* %q2, align 8 ret void } -define void @fct9() nounwind ssp { +define void @fct9(i8* %str) nounwind ssp { entry: ; CHECK-LABEL: fct9: ; CHECK: ldur [[DESTREG:d[0-9]+]], {{\[}}[[BASEREG:x[0-9]+]], #3] ; CHECK: stur [[DESTREG]], {{\[}}[[BASEREG]], #4] - %0 = load <2 x i32>, <2 x i32>* bitcast (i8* getelementptr inbounds ([63 x i8], [63 x i8]* @str, i64 0, i64 3) to <2 x i32>*), align 8 - store <2 x i32> %0, <2 x i32>* bitcast (i8* getelementptr inbounds ([63 x i8], [63 x i8]* @str, i64 0, i64 4) to <2 x i32>*), align 8 + %p = getelementptr inbounds i8, i8* %str, i64 3 + %q = bitcast i8* %p to <2 x i32>* + %0 = load <2 x i32>, <2 x i32>* %q, align 8 + %p2 = getelementptr inbounds i8, i8* %str, i64 4 + %q2 = bitcast i8* %p2 to <2 x i32>* + store <2 x i32> %0, <2 x i32>* %q2, align 8 ret void } -define void @fct10() nounwind ssp { +define void @fct10(i8* %str) nounwind ssp { entry: ; CHECK-LABEL: fct10: ; CHECK: ldur [[DESTREG:d[0-9]+]], {{\[}}[[BASEREG:x[0-9]+]], #3] ; CHECK: stur [[DESTREG]], {{\[}}[[BASEREG]], #4] - %0 = load <4 x i16>, <4 x i16>* bitcast (i8* getelementptr inbounds ([63 x i8], [63 x i8]* @str, i64 0, i64 3) to <4 x i16>*), align 8 - store <4 x i16> %0, <4 x i16>* bitcast (i8* getelementptr inbounds ([63 x i8], [63 x i8]* @str, i64 0, i64 4) to <4 x i16>*), align 8 + %p = getelementptr inbounds i8, i8* %str, i64 3 + %q = bitcast i8* %p to <4 x i16>* + %0 = load <4 x i16>, <4 x i16>* %q, align 8 + %p2 = getelementptr inbounds i8, i8* %str, i64 4 + %q2 = bitcast i8* %p2 to <4 x i16>* + store <4 x i16> %0, <4 x i16>* %q2, align 8 ret void } -define void @fct11() nounwind ssp { +define void @fct11(i8* %str) nounwind ssp { entry: ; CHECK-LABEL: fct11: ; CHECK: ldur [[DESTREG:d[0-9]+]], {{\[}}[[BASEREG:x[0-9]+]], #3] ; CHECK: stur [[DESTREG]], {{\[}}[[BASEREG]], #4] - %0 = load <8 x i8>, <8 x i8>* bitcast (i8* getelementptr inbounds ([63 x i8], [63 x i8]* @str, i64 0, i64 3) to <8 x i8>*), align 8 - store <8 x i8> %0, <8 x i8>* bitcast (i8* getelementptr inbounds ([63 x i8], [63 x i8]* @str, i64 0, i64 4) to <8 x i8>*), align 8 + %p = getelementptr inbounds i8, i8* %str, i64 3 + %q = bitcast i8* %p to <8 x i8>* + %0 = load <8 x i8>, <8 x i8>* %q, align 8 + %p2 = getelementptr inbounds i8, i8* %str, i64 4 + %q2 = bitcast i8* %p2 to <8 x i8>* + store <8 x i8> %0, 
<8 x i8>* %q2, align 8 ret void } -define void @fct12() nounwind ssp { +define void @fct12(i8* %str) nounwind ssp { entry: ; CHECK-LABEL: fct12: ; CHECK: ldur [[DESTREG:q[0-9]+]], {{\[}}[[BASEREG:x[0-9]+]], #3] ; CHECK: stur [[DESTREG]], {{\[}}[[BASEREG]], #4] - %0 = load <2 x i64>, <2 x i64>* bitcast (i8* getelementptr inbounds ([63 x i8], [63 x i8]* @str, i64 0, i64 3) to <2 x i64>*), align 16 - store <2 x i64> %0, <2 x i64>* bitcast (i8* getelementptr inbounds ([63 x i8], [63 x i8]* @str, i64 0, i64 4) to <2 x i64>*), align 16 + %p = getelementptr inbounds i8, i8* %str, i64 3 + %q = bitcast i8* %p to <2 x i64>* + %0 = load <2 x i64>, <2 x i64>* %q, align 16 + %p2 = getelementptr inbounds i8, i8* %str, i64 4 + %q2 = bitcast i8* %p2 to <2 x i64>* + store <2 x i64> %0, <2 x i64>* %q2, align 16 ret void } -define void @fct13() nounwind ssp { +define void @fct13(i8* %str) nounwind ssp { entry: ; CHECK-LABEL: fct13: ; CHECK: ldur [[DESTREG:q[0-9]+]], {{\[}}[[BASEREG:x[0-9]+]], #3] ; CHECK: stur [[DESTREG]], {{\[}}[[BASEREG]], #4] - %0 = load <4 x i32>, <4 x i32>* bitcast (i8* getelementptr inbounds ([63 x i8], [63 x i8]* @str, i64 0, i64 3) to <4 x i32>*), align 16 - store <4 x i32> %0, <4 x i32>* bitcast (i8* getelementptr inbounds ([63 x i8], [63 x i8]* @str, i64 0, i64 4) to <4 x i32>*), align 16 + %p = getelementptr inbounds i8, i8* %str, i64 3 + %q = bitcast i8* %p to <4 x i32>* + %0 = load <4 x i32>, <4 x i32>* %q, align 16 + %p2 = getelementptr inbounds i8, i8* %str, i64 4 + %q2 = bitcast i8* %p2 to <4 x i32>* + store <4 x i32> %0, <4 x i32>* %q2, align 16 ret void } -define void @fct14() nounwind ssp { +define void @fct14(i8* %str) nounwind ssp { entry: ; CHECK-LABEL: fct14: ; CHECK: ldur [[DESTREG:q[0-9]+]], {{\[}}[[BASEREG:x[0-9]+]], #3] ; CHECK: stur [[DESTREG]], {{\[}}[[BASEREG]], #4] - %0 = load <8 x i16>, <8 x i16>* bitcast (i8* getelementptr inbounds ([63 x i8], [63 x i8]* @str, i64 0, i64 3) to <8 x i16>*), align 16 - store <8 x i16> %0, <8 x i16>* bitcast (i8* getelementptr inbounds ([63 x i8], [63 x i8]* @str, i64 0, i64 4) to <8 x i16>*), align 16 + %p = getelementptr inbounds i8, i8* %str, i64 3 + %q = bitcast i8* %p to <8 x i16>* + %0 = load <8 x i16>, <8 x i16>* %q, align 16 + %p2 = getelementptr inbounds i8, i8* %str, i64 4 + %q2 = bitcast i8* %p2 to <8 x i16>* + store <8 x i16> %0, <8 x i16>* %q2, align 16 ret void } -define void @fct15() nounwind ssp { +define void @fct15(i8* %str) nounwind ssp { entry: ; CHECK-LABEL: fct15: ; CHECK: ldur [[DESTREG:q[0-9]+]], {{\[}}[[BASEREG:x[0-9]+]], #3] ; CHECK: stur [[DESTREG]], {{\[}}[[BASEREG]], #4] - %0 = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([63 x i8], [63 x i8]* @str, i64 0, i64 3) to <16 x i8>*), align 16 - store <16 x i8> %0, <16 x i8>* bitcast (i8* getelementptr inbounds ([63 x i8], [63 x i8]* @str, i64 0, i64 4) to <16 x i8>*), align 16 + %p = getelementptr inbounds i8, i8* %str, i64 3 + %q = bitcast i8* %p to <16 x i8>* + %0 = load <16 x i8>, <16 x i8>* %q, align 16 + %p2 = getelementptr inbounds i8, i8* %str, i64 4 + %q2 = bitcast i8* %p2 to <16 x i8>* + store <16 x i8> %0, <16 x i8>* %q2, align 16 ret void } diff --git a/llvm/test/CodeGen/AArch64/fold-global-offsets.ll b/llvm/test/CodeGen/AArch64/fold-global-offsets.ll new file mode 100644 index 0000000..ffcdc2b --- /dev/null +++ b/llvm/test/CodeGen/AArch64/fold-global-offsets.ll @@ -0,0 +1,69 @@ +; RUN: llc < %s -mtriple=arm64-linux-gnu | FileCheck %s + +@x1 = external hidden global [2 x i64] +@x2 = external hidden global [16777216 x i64] 
+@x3 = external hidden global { [9 x i8*], [8 x i8*] } + +define i64 @f1() { + ; CHECK: f1: + ; CHECK: adrp x8, x1+16 + ; CHECK: ldr x0, [x8, :lo12:x1+16] + %l = load i64, i64* getelementptr ([2 x i64], [2 x i64]* @x1, i64 0, i64 2) + ret i64 %l +} + +define i64 @f2() { + ; CHECK: f2: + ; CHECK: adrp x8, x1 + ; CHECK: add x8, x8, :lo12:x1 + ; CHECK: ldr x0, [x8, #24] + %l = load i64, i64* getelementptr ([2 x i64], [2 x i64]* @x1, i64 0, i64 3) + ret i64 %l +} + +define i64 @f3() { + ; CHECK: f3: + ; CHECK: adrp x8, x1+1 + ; CHECK: add x8, x8, :lo12:x1+1 + ; CHECK: ldr x0, [x8] + %l = load i64, i64* bitcast (i8* getelementptr (i8, i8* bitcast ([2 x i64]* @x1 to i8*), i64 1) to i64*) + ret i64 %l +} + +define [2 x i64] @f4() { + ; CHECK: f4: + ; CHECK: adrp x8, x2+8 + ; CHECK: add x8, x8, :lo12:x2+8 + ; CHECK: ldp x0, x1, [x8] + %l = load [2 x i64], [2 x i64]* bitcast (i8* getelementptr (i8, i8* bitcast ([16777216 x i64]* @x2 to i8*), i64 8) to [2 x i64]*) + ret [2 x i64] %l +} + +define i64 @f5() { + ; CHECK: f5: + ; CHECK: adrp x8, x2+2097144 + ; CHECK: ldr x0, [x8, :lo12:x2+2097144] + ; CHECK: ret + %l = load i64, i64* getelementptr ([16777216 x i64], [16777216 x i64]* @x2, i64 0, i64 262143) + ret i64 %l +} + +define i64 @f6() { + ; CHECK: f6: + ; CHECK: adrp x8, x2 + ; CHECK: add x8, x8, :lo12:x2 + ; CHECK: orr w9, wzr, #0x200000 + ; CHECK: ldr x0, [x8, x9] + ; CHECK: ret + %l = load i64, i64* getelementptr ([16777216 x i64], [16777216 x i64]* @x2, i64 0, i64 262144) + ret i64 %l +} + +define i32 @f7() { +entry: + ; CHECK: f7 + ; CHECK: adrp x8, x3+108 + ; CHECK: ldr w0, [x8, :lo12:x3+108] + %l = load i32, i32* getelementptr (i32, i32* inttoptr (i64 trunc (i128 lshr (i128 bitcast (<2 x i64> to i128), i128 64) to i64) to i32*), i64 5) + ret i32 %l +} diff --git a/llvm/test/CodeGen/AArch64/global-merge-3.ll b/llvm/test/CodeGen/AArch64/global-merge-3.ll index 106d6da..4844d96 100644 --- a/llvm/test/CodeGen/AArch64/global-merge-3.ll +++ b/llvm/test/CodeGen/AArch64/global-merge-3.ll @@ -10,8 +10,8 @@ define void @f1(i32 %a1, i32 %a2, i32 %a3) { ;CHECK-APPLE-IOS: adrp x8, __MergedGlobals_x@PAGE ;CHECK-APPLE-IOS-NOT: adrp ;CHECK-APPLE-IOS: add x8, x8, __MergedGlobals_x@PAGEOFF -;CHECK-APPLE-IOS: adrp x9, __MergedGlobals_y@PAGE -;CHECK-APPLE-IOS: add x9, x9, __MergedGlobals_y@PAGEOFF +;CHECK-APPLE-IOS: adrp x9, __MergedGlobals_y@PAGE+12 +;CHECK-APPLE-IOS: str w1, [x9, __MergedGlobals_y@PAGEOFF+12] %x3 = getelementptr inbounds [1000 x i32], [1000 x i32]* @x, i32 0, i64 3 %y3 = getelementptr inbounds [1000 x i32], [1000 x i32]* @y, i32 0, i64 3 store i32 %a1, i32* %x3, align 4 diff --git a/llvm/test/CodeGen/AArch64/global-merge-ignore-single-use-minsize.ll b/llvm/test/CodeGen/AArch64/global-merge-ignore-single-use-minsize.ll index 1c1b4f6..8207f8c 100644 --- a/llvm/test/CodeGen/AArch64/global-merge-ignore-single-use-minsize.ll +++ b/llvm/test/CodeGen/AArch64/global-merge-ignore-single-use-minsize.ll @@ -44,9 +44,9 @@ define void @f2(i32 %a1, i32 %a2) nounwind { ; CHECK-LABEL: f3: define void @f3(i32 %a1, i32 %a2) minsize nounwind { -; CHECK-NEXT: adrp x8, [[SET]]@PAGE -; CHECK-NEXT: add x8, x8, [[SET]]@PAGEOFF -; CHECK-NEXT: stp w0, w1, [x8, #8] +; CHECK-NEXT: adrp x8, [[SET]]@PAGE+8 +; CHECK-NEXT: add x8, x8, [[SET]]@PAGEOFF+8 +; CHECK-NEXT: stp w0, w1, [x8] ; CHECK-NEXT: ret store i32 %a1, i32* @m3, align 4 store i32 %a2, i32* @n3, align 4 @@ -57,10 +57,9 @@ define void @f3(i32 %a1, i32 %a2) minsize nounwind { ; CHECK-LABEL: f4: define void @f4(i32 %a1, i32 %a2) nounwind { -; CHECK-NEXT: adrp 
x8, [[SET]]@PAGE -; CHECK-NEXT: add x8, x8, [[SET]]@PAGEOFF +; CHECK-NEXT: adrp x8, [[SET]]@PAGE+8 ; CHECK-NEXT: adrp x9, _n4@PAGE -; CHECK-NEXT: str w0, [x8, #8] +; CHECK-NEXT: str w0, [x8, [[SET]]@PAGEOFF+8] ; CHECK-NEXT: str w1, [x9, _n4@PAGEOFF] ; CHECK-NEXT: ret store i32 %a1, i32* @m3, align 4 diff --git a/llvm/test/CodeGen/AArch64/global-merge-ignore-single-use.ll b/llvm/test/CodeGen/AArch64/global-merge-ignore-single-use.ll index 97e283c..b3b8406 100644 --- a/llvm/test/CodeGen/AArch64/global-merge-ignore-single-use.ll +++ b/llvm/test/CodeGen/AArch64/global-merge-ignore-single-use.ll @@ -38,9 +38,9 @@ define void @f2(i32 %a1, i32 %a2, i32 %a3) #0 { ; CHECK-LABEL: f3: define void @f3(i32 %a1, i32 %a2) #0 { -; CHECK-NEXT: adrp x8, [[SET]]@PAGE -; CHECK-NEXT: add x8, x8, [[SET]]@PAGEOFF -; CHECK-NEXT: stp w0, w1, [x8, #12] +; CHECK-NEXT: adrp x8, [[SET]]@PAGE+12 +; CHECK-NEXT: add x8, x8, [[SET]]@PAGEOFF+12 +; CHECK-NEXT: stp w0, w1, [x8] ; CHECK-NEXT: ret store i32 %a1, i32* @m2, align 4 store i32 %a2, i32* @n2, align 4 -- 2.7.4
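
Illustration (appended after the patch, not part of the commit): a minimal LLVM IR sketch of the address-offset folding the new combine enables, in the spirit of the f1/f2 tests added above. The global @vt, the register numbers, and the before/after assembly are illustrative assumptions rather than output taken from the commit.

; Hypothetical module; assumed to be compiled with: llc -mtriple=arm64-linux-gnu
@vt = external hidden global [4 x i64]

define i64 @load_vt_entry_2() {
  ; Without the combine, the constant offset stays outside the global address,
  ; so codegen is expected to look roughly like:
  ;   adrp x8, vt
  ;   add  x8, x8, :lo12:vt
  ;   ldr  x0, [x8, #16]
  ; With the combine, the offset is folded into the globaladdr (as in test f1),
  ; saving the add:
  ;   adrp x8, vt+16
  ;   ldr  x0, [x8, :lo12:vt+16]
  %p = getelementptr inbounds [4 x i64], [4 x i64]* @vt, i64 0, i64 2
  %v = load i64, i64* %p
  ret i64 %v
}

When a global has several offsetted uses, only the smallest offset is folded and the node is rewritten as (globaladdr + min) - min, so each use becomes (globaladdr + min) + (offset - min); this is what lets the global-merge tests above still address their merged set through a single adrp.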