From 7a1dcc5ff1da38f950f77ee22c0e843bdb6c5eb5 Mon Sep 17 00:00:00 2001 From: Jessica Paquette Date: Thu, 18 Jul 2019 21:50:11 +0000 Subject: [PATCH] [GlobalISel][AArch64] Add support for base register + offset register loads Add support for folding G_GEPs into loads of the form ``` ldr reg, [base, off] ``` when possible. This can save an add before the load. Currently, this is only supported for loads of 64 bits into 64 bit registers. Add a new addressing mode function, `selectAddrModeRegisterOffset` which performs this folding when it is profitable. Also add a test for addressing modes for G_LOAD. Differential Revision: https://reviews.llvm.org/D64944 llvm-svn: 366503 --- .../Target/AArch64/AArch64InstructionSelector.cpp | 93 ++++++++++++++++++++++ .../AArch64/GlobalISel/load-addressing-modes.mir | 90 +++++++++++++++++++++ 2 files changed, 183 insertions(+) create mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/load-addressing-modes.mir diff --git a/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp index 4e13fb8..f8e15c8 100644 --- a/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp @@ -67,6 +67,7 @@ private: bool earlySelect(MachineInstr &I) const; bool earlySelectSHL(MachineInstr &I, MachineRegisterInfo &MRI) const; + bool earlySelectLoad(MachineInstr &I, MachineRegisterInfo &MRI) const; bool selectVaStartAAPCS(MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const; @@ -182,6 +183,7 @@ private: ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root) const { return selectAddrModeIndexed(Root, Width / 8); } + ComplexRendererFns selectAddrModeRegisterOffset(MachineOperand &Root) const; void renderTruncImm(MachineInstrBuilder &MIB, const MachineInstr &MI) const; @@ -1158,6 +1160,57 @@ bool AArch64InstructionSelector::earlySelectSHL( return constrainSelectedInstRegOperands(*NewI, TII, TRI, RBI); } +bool AArch64InstructionSelector::earlySelectLoad( + MachineInstr &I, MachineRegisterInfo &MRI) const { + // Try to fold in shifts, etc into the addressing mode of a load. + assert(I.getOpcode() == TargetOpcode::G_LOAD && "unexpected op"); + + // Don't handle atomic loads/stores yet. + auto &MemOp = **I.memoperands_begin(); + if (MemOp.getOrdering() != AtomicOrdering::NotAtomic) { + LLVM_DEBUG(dbgs() << "Atomic load/store not supported yet\n"); + return false; + } + + unsigned MemBytes = MemOp.getSize(); + + // Only support 64-bit loads for now. + if (MemBytes != 8) + return false; + + Register DstReg = I.getOperand(0).getReg(); + const LLT DstTy = MRI.getType(DstReg); + // Don't handle vectors. + if (DstTy.isVector()) + return false; + + unsigned DstSize = DstTy.getSizeInBits(); + // TODO: 32-bit destinations. + if (DstSize != 64) + return false; + + // Check if we can do any folding from GEPs etc. into the load. + auto ImmFn = selectAddrModeRegisterOffset(I.getOperand(1)); + if (!ImmFn) + return false; + + // We can fold something. Emit the load here. + MachineIRBuilder MIB(I); + + // Choose the instruction based off the size of the element being loaded, and + // whether or not we're loading into a FPR. + const RegisterBank &RB = *RBI.getRegBank(DstReg, MRI, TRI); + unsigned Opc = + RB.getID() == AArch64::GPRRegBankID ? AArch64::LDRXroX : AArch64::LDRDroX; + // Construct the load. 
+ auto LoadMI = MIB.buildInstr(Opc, {DstReg}, {}); + for (auto &RenderFn : *ImmFn) + RenderFn(LoadMI); + LoadMI.addMemOperand(*I.memoperands_begin()); + I.eraseFromParent(); + return constrainSelectedInstRegOperands(*LoadMI, TII, TRI, RBI); +} + bool AArch64InstructionSelector::earlySelect(MachineInstr &I) const { assert(I.getParent() && "Instruction should be in a basic block!"); assert(I.getParent()->getParent() && "Instruction should be in a function!"); @@ -1169,6 +1222,8 @@ bool AArch64InstructionSelector::earlySelect(MachineInstr &I) const { switch (I.getOpcode()) { case TargetOpcode::G_SHL: return earlySelectSHL(I, MRI); + case TargetOpcode::G_LOAD: + return earlySelectLoad(I, MRI); default: return false; } @@ -3891,6 +3946,44 @@ AArch64InstructionSelector::selectArithImmed(MachineOperand &Root) const { }}; } +/// This is used for computing addresses like this: +/// +/// ldr x1, [x2, x3] +/// +/// Where x2 is the base register, and x3 is an offset register. +/// +/// When possible (or profitable) to fold a G_GEP into the address calculation, +/// this will do so. Otherwise, it will return None. +InstructionSelector::ComplexRendererFns +AArch64InstructionSelector::selectAddrModeRegisterOffset( + MachineOperand &Root) const { + MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); + + // If we have a constant offset, then we probably don't want to match a + // register offset. + if (isBaseWithConstantOffset(Root, MRI)) + return None; + + // We need a GEP. + MachineInstr *Gep = MRI.getVRegDef(Root.getReg()); + if (!Gep || Gep->getOpcode() != TargetOpcode::G_GEP) + return None; + + // If this is used more than once, let's not bother folding. + // TODO: Check if they are memory ops. If they are, then we can still fold + // without having to recompute anything. + if (!MRI.hasOneUse(Gep->getOperand(0).getReg())) + return None; + + // Base is the GEP's LHS, offset is its RHS. + return {{ + [=](MachineInstrBuilder &MIB) { MIB.add(Gep->getOperand(1)); }, + [=](MachineInstrBuilder &MIB) { MIB.add(Gep->getOperand(2)); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, + }}; +} + /// Select a "register plus unscaled signed 9-bit immediate" address. This /// should only match when there is an offset that is not valid for a scaled /// immediate addressing mode. The "Size" argument is the size in bytes of the diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/load-addressing-modes.mir b/llvm/test/CodeGen/AArch64/GlobalISel/load-addressing-modes.mir new file mode 100644 index 0000000..aecb772 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/load-addressing-modes.mir @@ -0,0 +1,90 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=aarch64-unknown-unknown -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s + +--- | + define void @ldrxrox_breg_oreg(i64* %addr) { ret void } + define void @ldrdrox_breg_oreg(i64* %addr) { ret void } + define void @more_than_one_use(i64* %addr) { ret void } +... 
+ +--- +name: ldrxrox_breg_oreg +alignment: 2 +legalized: true +regBankSelected: true +tracksRegLiveness: true +machineFunctionInfo: {} +body: | + bb.0: + liveins: $x0, $x1 + + ; CHECK-LABEL: name: ldrxrox_breg_oreg + ; CHECK: liveins: $x0, $x1 + ; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0 + ; CHECK: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1 + ; CHECK: [[LDRXroX:%[0-9]+]]:gpr64 = LDRXroX [[COPY]], [[COPY1]], 0, 0 :: (load 8 from %ir.addr) + ; CHECK: $x0 = COPY [[LDRXroX]] + ; CHECK: RET_ReallyLR implicit $x0 + %0:gpr(p0) = COPY $x0 + %1:gpr(s64) = COPY $x1 + %2:gpr(p0) = G_GEP %0, %1 + %4:gpr(s64) = G_LOAD %2(p0) :: (load 8 from %ir.addr) + $x0 = COPY %4(s64) + RET_ReallyLR implicit $x0 +... + +--- +name: ldrdrox_breg_oreg +alignment: 2 +legalized: true +regBankSelected: true +tracksRegLiveness: true +machineFunctionInfo: {} +body: | + bb.0: + liveins: $d0, $x1 + ; CHECK-LABEL: name: ldrdrox_breg_oreg + ; CHECK: liveins: $d0, $x1 + ; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY $d0 + ; CHECK: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1 + ; CHECK: [[LDRDroX:%[0-9]+]]:fpr64 = LDRDroX [[COPY]], [[COPY1]], 0, 0 :: (load 8 from %ir.addr) + ; CHECK: $d0 = COPY [[LDRDroX]] + ; CHECK: RET_ReallyLR implicit $d0 + %0:gpr(p0) = COPY $d0 + %1:gpr(s64) = COPY $x1 + %2:gpr(p0) = G_GEP %0, %1 + %4:fpr(s64) = G_LOAD %2(p0) :: (load 8 from %ir.addr) + $d0 = COPY %4(s64) + RET_ReallyLR implicit $d0 +... + +--- +name: more_than_one_use +alignment: 2 +legalized: true +regBankSelected: true +tracksRegLiveness: true +machineFunctionInfo: {} +body: | + bb.0: + liveins: $x0, $x1 + ; This shouldn't be folded, since we reuse the result of the G_GEP outside + ; the G_LOAD + ; CHECK-LABEL: name: more_than_one_use + ; CHECK: liveins: $x0, $x1 + ; CHECK: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 + ; CHECK: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1 + ; CHECK: [[ADDXrr:%[0-9]+]]:gpr64common = ADDXrr [[COPY]], [[COPY1]] + ; CHECK: [[LDRXui:%[0-9]+]]:gpr64 = LDRXui [[ADDXrr]], 0 :: (load 8 from %ir.addr) + ; CHECK: [[COPY2:%[0-9]+]]:gpr64 = COPY [[ADDXrr]] + ; CHECK: [[ADDXrr1:%[0-9]+]]:gpr64 = ADDXrr [[COPY2]], [[LDRXui]] + ; CHECK: $x0 = COPY [[ADDXrr1]] + ; CHECK: RET_ReallyLR implicit $x0 + %0:gpr(p0) = COPY $x0 + %1:gpr(s64) = COPY $x1 + %2:gpr(p0) = G_GEP %0, %1 + %4:gpr(s64) = G_LOAD %2(p0) :: (load 8 from %ir.addr) + %5:gpr(s64) = G_PTRTOINT %2 + %6:gpr(s64) = G_ADD %5, %4 + $x0 = COPY %6(s64) + RET_ReallyLR implicit $x0 -- 2.7.4
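
For readers who want to see the effect at the assembly level, here is a sketch (register assignments are illustrative, not taken from an actual compilation) of the pattern exercised by the `ldrxrox_breg_oreg` test above, i.e. a G_GEP whose only user is a G_LOAD. Without the fold, the address is materialized with an add and the load uses an immediate offset of zero, which is what the `more_than_one_use` test still shows (ADDXrr followed by LDRXui). With `selectAddrModeRegisterOffset`, the base and offset registers feed the load directly:

```
// Without the fold: materialize the address, then load at offset 0.
add x8, x0, x1
ldr x0, [x8]

// With the fold: base register + offset register addressing (LDRXroX).
ldr x0, [x0, x1]
```

The two trailing `addImm(0)` operands rendered by `selectAddrModeRegisterOffset` (the `0, 0` visible in the LDRXroX/LDRDroX CHECK lines) select the unshifted form of the register-offset addressing mode. When the G_GEP has more than one user, the fold is skipped so that the address computation is not duplicated, which is exactly the case `more_than_one_use` covers.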