ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root) const {
return selectAddrModeIndexed(Root, Width / 8);
}
+
+ bool isWorthFoldingIntoExtendedReg(MachineInstr &MI,
+ const MachineRegisterInfo &MRI) const;
+ ComplexRendererFns
+ selectAddrModeShiftedExtendXReg(MachineOperand &Root,
+ unsigned SizeInBytes) const;
ComplexRendererFns selectAddrModeRegisterOffset(MachineOperand &Root) const;
+ ComplexRendererFns selectAddrModeXRO(MachineOperand &Root,
+ unsigned SizeInBytes) const;
void renderTruncImm(MachineInstrBuilder &MIB, const MachineInstr &MI) const;
if (DstSize != 64)
return false;
- // Check if we can do any folding from GEPs etc. into the load.
- auto ImmFn = selectAddrModeRegisterOffset(I.getOperand(1));
+ // Check if we can do any folding from GEPs/shifts etc. into the load.
+ auto ImmFn = selectAddrModeXRO(I.getOperand(1), MemBytes);
if (!ImmFn)
return false;
}};
}
+/// Return true if it is worth folding MI into an extended register. That is,
+/// if it's profitable to pull it into the addressing mode of a load or store
+/// as a shift.
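+/// (For example, a G_SHL that feeds the address computation of a load or
+/// store.)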
+bool AArch64InstructionSelector::isWorthFoldingIntoExtendedReg(
+ MachineInstr &MI, const MachineRegisterInfo &MRI) const {
+ // Always fold if there is one use, or if we're optimizing for size.
+ Register DefReg = MI.getOperand(0).getReg();
+ if (MRI.hasOneUse(DefReg) ||
+ MI.getParent()->getParent()->getFunction().hasMinSize())
+ return true;
+
+ // It's better to avoid folding and recomputing shifts when we don't have a
+ // fastpath.
+ if (!STI.hasLSLFast())
+ return false;
+
+ // We have a fastpath, so folding a shift in and potentially computing it
+ // many times may be beneficial. Check if this is only used in memory ops.
+ // If it is, then we should fold.
+ return all_of(MRI.use_instructions(DefReg),
+ [](MachineInstr &Use) { return Use.mayLoadOrStore(); });
+}
+
+/// This is used for computing addresses like this:
+///
+/// ldr x1, [x2, x3, lsl #3]
+///
+/// Where x2 is the base register, and x3 is an offset register. The shift-left
+/// amount is a constant fixed by the load instruction; for an 8-byte load like
+/// this one, we'll never see anything other than a 3 here, since the shift is
+/// the log base 2 of the size of the element being loaded.
+InstructionSelector::ComplexRendererFns
+AArch64InstructionSelector::selectAddrModeShiftedExtendXReg(
+ MachineOperand &Root, unsigned SizeInBytes) const {
+ if (!Root.isReg())
+ return None;
+ MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
+
+ // Make sure that the memory op is a valid size.
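+ // (There's no shift to fold for a 1-byte access, since Log2_32(1) == 0, so
+ // bail out and let the caller fall back to a plain register-register
+ // address.)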
+ int64_t LegalShiftVal = Log2_32(SizeInBytes);
+ if (LegalShiftVal == 0)
+ return None;
+
+ // We want to find something like this:
+ //
+ // val = G_CONSTANT LegalShiftVal
+ // shift = G_SHL off_reg val
+ // ptr = G_GEP base_reg shift
+ // x = G_LOAD ptr
+ //
+ // And fold it into this addressing mode:
+ //
+ // ldr x, [base_reg, off_reg, lsl #LegalShiftVal]
+
+ // Check if we can find the G_GEP.
+ MachineInstr *Gep = getOpcodeDef(TargetOpcode::G_GEP, Root.getReg(), MRI);
+ if (!Gep || !isWorthFoldingIntoExtendedReg(*Gep, MRI))
+ return None;
+
+ // Now try to match the G_SHL.
+ MachineInstr *Shl =
+ getOpcodeDef(TargetOpcode::G_SHL, Gep->getOperand(2).getReg(), MRI);
+ if (!Shl || !isWorthFoldingIntoExtendedReg(*Shl, MRI))
+ return None;
+
+ // Now, try to find the specific G_CONSTANT.
+ auto ValAndVReg =
+ getConstantVRegValWithLookThrough(Shl->getOperand(2).getReg(), MRI);
+ if (!ValAndVReg)
+ return None;
+
+ // The value must fit into 3 bits, and must be positive. Make sure that is
+ // true.
+ int64_t ImmVal = ValAndVReg->Value;
+ if ((ImmVal & 0x7) != ImmVal)
+ return None;
+
+ // We are only allowed to shift by LegalShiftVal. This shift value is built
+ // into the instruction, so we can't just use whatever we want.
+ if (ImmVal != LegalShiftVal)
+ return None;
+
+ // We can use the LHS of the GEP as the base, and the LHS of the shift as an
+ // offset. Signify that we are shifting by setting the shift flag to 1.
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.add(Gep->getOperand(1)); },
+ [=](MachineInstrBuilder &MIB) { MIB.add(Shl->getOperand(1)); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(1); },
+ }};
+}
+
/// This is used for computing addresses like this:
///
/// ldr x1, [x2, x3]
MachineOperand &Root) const {
MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
- // If we have a constant offset, then we probably don't want to match a
- // register offset.
- if (isBaseWithConstantOffset(Root, MRI))
- return None;
-
// We need a GEP.
MachineInstr *Gep = MRI.getVRegDef(Root.getReg());
if (!Gep || Gep->getOpcode() != TargetOpcode::G_GEP)
}};
}
+/// This is intended to be equivalent to SelectAddrModeXRO in
+/// AArch64ISelDAGToDAG. It's used for selecting X register offset loads.
+InstructionSelector::ComplexRendererFns
+AArch64InstructionSelector::selectAddrModeXRO(MachineOperand &Root,
+ unsigned SizeInBytes) const {
+ MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
+
+ // If we have a constant offset, then we probably don't want to match a
+ // register offset.
+ if (isBaseWithConstantOffset(Root, MRI))
+ return None;
+
+ // Try to fold shifts into the addressing mode.
+ auto AddrModeFns = selectAddrModeShiftedExtendXReg(Root, SizeInBytes);
+ if (AddrModeFns)
+ return AddrModeFns;
+
+ // If that doesn't work, see if it's possible to fold in registers from
+ // a GEP.
+ return selectAddrModeRegisterOffset(Root);
+}
+
/// Select a "register plus unscaled signed 9-bit immediate" address. This
/// should only match when there is an offset that is not valid for a scaled
/// immediate addressing mode. The "Size" argument is the size in bytes of the
define void @ldrxrox_breg_oreg(i64* %addr) { ret void }
define void @ldrdrox_breg_oreg(i64* %addr) { ret void }
define void @more_than_one_use(i64* %addr) { ret void }
+ define void @ldrxrox_shl(i64* %addr) { ret void }
+ define void @ldrdrox_shl(i64* %addr) { ret void }
+ define void @more_than_one_use_shl_1(i64* %addr) { ret void }
+ define void @more_than_one_use_shl_2(i64* %addr) { ret void }
+ define void @more_than_one_use_shl_lsl_fast(i64* %addr) #1 { ret void }
+ define void @more_than_one_use_shl_lsl_slow(i64* %addr) { ret void }
+ define void @more_than_one_use_shl_minsize(i64* %addr) #0 { ret void }
+ attributes #0 = { optsize minsize }
+ attributes #1 = { "target-features"="+lsl-fast" }
...
---
%6:gpr(s64) = G_ADD %5, %4
$x0 = COPY %6(s64)
RET_ReallyLR implicit $x0
+
+...
+---
+name: ldrxrox_shl
+alignment: 2
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo: {}
+body: |
+ bb.0:
+ liveins: $x0, $x1, $x2
+ ; CHECK-LABEL: name: ldrxrox_shl
+ ; CHECK: liveins: $x0, $x1, $x2
+ ; CHECK: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
+ ; CHECK: [[COPY1:%[0-9]+]]:gpr64sp = COPY $x1
+ ; CHECK: [[LDRXroX:%[0-9]+]]:gpr64 = LDRXroX [[COPY1]], [[COPY]], 0, 1 :: (load 8 from %ir.addr)
+ ; CHECK: $x2 = COPY [[LDRXroX]]
+ ; CHECK: RET_ReallyLR implicit $x2
+ %0:gpr(s64) = COPY $x0
+ %1:gpr(s64) = G_CONSTANT i64 3
+ %2:gpr(s64) = G_SHL %0, %1(s64)
+ %3:gpr(p0) = COPY $x1
+ %4:gpr(p0) = G_GEP %3, %2
+ %5:gpr(s64) = G_LOAD %4(p0) :: (load 8 from %ir.addr)
+ $x2 = COPY %5(s64)
+ RET_ReallyLR implicit $x2
+
+...
+---
+name: ldrdrox_shl
+alignment: 2
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo: {}
+body: |
+ bb.0:
+ liveins: $x0, $x1, $d2
+ ; CHECK-LABEL: name: ldrdrox_shl
+ ; CHECK: liveins: $x0, $x1, $d2
+ ; CHECK: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
+ ; CHECK: [[COPY1:%[0-9]+]]:gpr64sp = COPY $x1
+ ; CHECK: [[LDRDroX:%[0-9]+]]:fpr64 = LDRDroX [[COPY1]], [[COPY]], 0, 1 :: (load 8 from %ir.addr)
+ ; CHECK: $d2 = COPY [[LDRDroX]]
+ ; CHECK: RET_ReallyLR implicit $d2
+ %0:gpr(s64) = COPY $x0
+ %1:gpr(s64) = G_CONSTANT i64 3
+ %2:gpr(s64) = G_SHL %0, %1(s64)
+ %3:gpr(p0) = COPY $x1
+ %4:gpr(p0) = G_GEP %3, %2
+ %5:fpr(s64) = G_LOAD %4(p0) :: (load 8 from %ir.addr)
+ $d2 = COPY %5(s64)
+ RET_ReallyLR implicit $d2
+
+...
+---
+name: more_than_one_use_shl_1
+alignment: 2
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo: {}
+body: |
+ bb.0:
+ ; Show that we can still fall back to the register-register addressing
+ ; mode when we fail to pull in the shift.
+ liveins: $x0, $x1, $x2
+ ; CHECK-LABEL: name: more_than_one_use_shl_1
+ ; CHECK: liveins: $x0, $x1, $x2
+ ; CHECK: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
+ ; CHECK: [[UBFMXri:%[0-9]+]]:gpr64common = UBFMXri [[COPY]], 61, 60
+ ; CHECK: [[COPY1:%[0-9]+]]:gpr64sp = COPY $x1
+ ; CHECK: [[LDRXroX:%[0-9]+]]:gpr64 = LDRXroX [[COPY1]], [[UBFMXri]], 0, 0 :: (load 8 from %ir.addr)
+ ; CHECK: [[ADDXri:%[0-9]+]]:gpr64common = ADDXri [[UBFMXri]], 3, 0
+ ; CHECK: [[ADDXrr:%[0-9]+]]:gpr64 = ADDXrr [[LDRXroX]], [[ADDXri]]
+ ; CHECK: $x2 = COPY [[ADDXrr]]
+ ; CHECK: RET_ReallyLR implicit $x2
+ %0:gpr(s64) = COPY $x0
+ %1:gpr(s64) = G_CONSTANT i64 3
+ %2:gpr(s64) = G_SHL %0, %1(s64)
+ %3:gpr(p0) = COPY $x1
+ %4:gpr(p0) = G_GEP %3, %2
+ %5:gpr(s64) = G_LOAD %4(p0) :: (load 8 from %ir.addr)
+ %6:gpr(s64) = G_ADD %2, %1
+ %7:gpr(s64) = G_ADD %5, %6
+ $x2 = COPY %7(s64)
+ RET_ReallyLR implicit $x2
+
+...
+---
+name: more_than_one_use_shl_2
+alignment: 2
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo: {}
+body: |
+ bb.0:
+ ; Show that when the GEP is used outside a memory op, we don't do any
+ ; folding at all.
+ liveins: $x0, $x1, $x2
+ ; CHECK-LABEL: name: more_than_one_use_shl_2
+ ; CHECK: liveins: $x0, $x1, $x2
+ ; CHECK: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
+ ; CHECK: [[UBFMXri:%[0-9]+]]:gpr64common = UBFMXri [[COPY]], 61, 60
+ ; CHECK: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1
+ ; CHECK: [[ADDXrr:%[0-9]+]]:gpr64common = ADDXrr [[COPY1]], [[UBFMXri]]
+ ; CHECK: [[LDRXui:%[0-9]+]]:gpr64 = LDRXui [[ADDXrr]], 0 :: (load 8 from %ir.addr)
+ ; CHECK: [[ADDXri:%[0-9]+]]:gpr64common = ADDXri [[UBFMXri]], 3, 0
+ ; CHECK: [[ADDXrr1:%[0-9]+]]:gpr64 = ADDXrr [[LDRXui]], [[ADDXri]]
+ ; CHECK: [[COPY2:%[0-9]+]]:gpr64 = COPY [[ADDXrr]]
+ ; CHECK: [[ADDXrr2:%[0-9]+]]:gpr64 = ADDXrr [[COPY2]], [[ADDXrr1]]
+ ; CHECK: $x2 = COPY [[ADDXrr2]]
+ ; CHECK: RET_ReallyLR implicit $x2
+ %0:gpr(s64) = COPY $x0
+ %1:gpr(s64) = G_CONSTANT i64 3
+ %2:gpr(s64) = G_SHL %0, %1(s64)
+ %3:gpr(p0) = COPY $x1
+ %4:gpr(p0) = G_GEP %3, %2
+ %5:gpr(s64) = G_LOAD %4(p0) :: (load 8 from %ir.addr)
+ %6:gpr(s64) = G_ADD %2, %1
+ %7:gpr(s64) = G_ADD %5, %6
+ %8:gpr(s64) = G_PTRTOINT %4
+ %9:gpr(s64) = G_ADD %8, %7
+ $x2 = COPY %9(s64)
+ RET_ReallyLR implicit $x2
+
+...
+---
+name: more_than_one_use_shl_lsl_fast
+alignment: 2
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo: {}
+body: |
+ bb.0:
+ ; Show that when we have a fastpath for shift-left, we perform the folding
+ ; even when the shift has more than one use.
+ liveins: $x0, $x1, $x2
+ ; CHECK-LABEL: name: more_than_one_use_shl_lsl_fast
+ ; CHECK: liveins: $x0, $x1, $x2
+ ; CHECK: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
+ ; CHECK: [[COPY1:%[0-9]+]]:gpr64sp = COPY $x1
+ ; CHECK: [[LDRXroX:%[0-9]+]]:gpr64 = LDRXroX [[COPY1]], [[COPY]], 0, 1 :: (load 8 from %ir.addr)
+ ; CHECK: [[LDRXroX1:%[0-9]+]]:gpr64 = LDRXroX [[COPY1]], [[COPY]], 0, 1 :: (load 8 from %ir.addr)
+ ; CHECK: [[ADDXrr:%[0-9]+]]:gpr64 = ADDXrr [[LDRXroX]], [[LDRXroX1]]
+ ; CHECK: $x2 = COPY [[ADDXrr]]
+ ; CHECK: RET_ReallyLR implicit $x2
+ %0:gpr(s64) = COPY $x0
+ %1:gpr(s64) = G_CONSTANT i64 3
+ %2:gpr(s64) = G_SHL %0, %1(s64)
+ %3:gpr(p0) = COPY $x1
+ %4:gpr(p0) = G_GEP %3, %2
+ %5:gpr(s64) = G_LOAD %4(p0) :: (load 8 from %ir.addr)
+ %6:gpr(s64) = G_LOAD %4(p0) :: (load 8 from %ir.addr)
+ %7:gpr(s64) = G_ADD %5, %6
+ $x2 = COPY %7(s64)
+ RET_ReallyLR implicit $x2
+
+...
+---
+name: more_than_one_use_shl_lsl_slow
+alignment: 2
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo: {}
+body: |
+ bb.0:
+ ; Show that we don't fold into multiple memory ops when we don't have a
+ ; fastpath for shift-left.
+ liveins: $x0, $x1, $x2
+ ; CHECK-LABEL: name: more_than_one_use_shl_lsl_slow
+ ; CHECK: liveins: $x0, $x1, $x2
+ ; CHECK: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
+ ; CHECK: [[UBFMXri:%[0-9]+]]:gpr64 = UBFMXri [[COPY]], 61, 60
+ ; CHECK: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1
+ ; CHECK: [[ADDXrr:%[0-9]+]]:gpr64common = ADDXrr [[COPY1]], [[UBFMXri]]
+ ; CHECK: [[LDRXui:%[0-9]+]]:gpr64 = LDRXui [[ADDXrr]], 0 :: (load 8 from %ir.addr)
+ ; CHECK: [[LDRXui1:%[0-9]+]]:gpr64 = LDRXui [[ADDXrr]], 0 :: (load 8 from %ir.addr)
+ ; CHECK: [[ADDXrr1:%[0-9]+]]:gpr64 = ADDXrr [[LDRXui]], [[LDRXui1]]
+ ; CHECK: $x2 = COPY [[ADDXrr1]]
+ ; CHECK: RET_ReallyLR implicit $x2
+ %0:gpr(s64) = COPY $x0
+ %1:gpr(s64) = G_CONSTANT i64 3
+ %2:gpr(s64) = G_SHL %0, %1(s64)
+ %3:gpr(p0) = COPY $x1
+ %4:gpr(p0) = G_GEP %3, %2
+ %5:gpr(s64) = G_LOAD %4(p0) :: (load 8 from %ir.addr)
+ %6:gpr(s64) = G_LOAD %4(p0) :: (load 8 from %ir.addr)
+ %7:gpr(s64) = G_ADD %5, %6
+ $x2 = COPY %7(s64)
+ RET_ReallyLR implicit $x2
+
+...
+---
+name: more_than_one_use_shl_minsize
+alignment: 2
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo: {}
+body: |
+ bb.0:
+ ; Show that when we're optimizing for size, we'll do the folding regardless
+ ; of how many uses the shift has or how the address is otherwise used.
+ liveins: $x0, $x1, $x2
+ ; CHECK-LABEL: name: more_than_one_use_shl_minsize
+ ; CHECK: liveins: $x0, $x1, $x2
+ ; CHECK: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
+ ; CHECK: [[UBFMXri:%[0-9]+]]:gpr64common = UBFMXri [[COPY]], 61, 60
+ ; CHECK: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1
+ ; CHECK: [[ADDXrr:%[0-9]+]]:gpr64 = ADDXrr [[COPY1]], [[UBFMXri]]
+ ; CHECK: [[LDRXroX:%[0-9]+]]:gpr64 = LDRXroX [[COPY1]], [[COPY]], 0, 1 :: (load 8 from %ir.addr)
+ ; CHECK: [[ADDXri:%[0-9]+]]:gpr64common = ADDXri [[UBFMXri]], 3, 0
+ ; CHECK: [[ADDXrr1:%[0-9]+]]:gpr64 = ADDXrr [[LDRXroX]], [[ADDXri]]
+ ; CHECK: [[ADDXrr2:%[0-9]+]]:gpr64 = ADDXrr [[ADDXrr]], [[ADDXrr1]]
+ ; CHECK: $x2 = COPY [[ADDXrr2]]
+ ; CHECK: RET_ReallyLR implicit $x2
+ %0:gpr(s64) = COPY $x0
+ %1:gpr(s64) = G_CONSTANT i64 3
+ %2:gpr(s64) = G_SHL %0, %1(s64)
+ %3:gpr(p0) = COPY $x1
+ %4:gpr(p0) = G_GEP %3, %2
+ %5:gpr(s64) = G_LOAD %4(p0) :: (load 8 from %ir.addr)
+ %6:gpr(s64) = G_ADD %2, %1
+ %7:gpr(s64) = G_ADD %5, %6
+ %8:gpr(s64) = G_PTRTOINT %4
+ %9:gpr(s64) = G_ADD %8, %7
+ $x2 = COPY %9(s64)
+ RET_ReallyLR implicit $x2