(apply [{ applyShuffleVectorPseudo(*${root}, ${matchinfo}); }])
>;
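+// Matches lane-0 splat shuffles and replaces them with a G_DUP pseudo.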
+def dup : GICombineRule<
+ (defs root:$root, shuffle_matchdata:$matchinfo),
+ (match (wip_match_opcode G_SHUFFLE_VECTOR):$root,
+ [{ return matchDup(*${root}, MRI, ${matchinfo}); }]),
+ (apply [{ applyShuffleVectorPseudo(*${root}, ${matchinfo}); }])
+>;
+
// Combines which replace a G_SHUFFLE_VECTOR with a target-specific pseudo
// instruction.
-def shuffle_vector_pseudos : GICombineGroup<[rev, zip, uzp]>;
+def shuffle_vector_pseudos : GICombineGroup<[dup, rev, zip, uzp]>;
def AArch64PostLegalizerCombinerHelper
: GICombinerHelper<"AArch64GenPostLegalizerCombinerHelper",
let InOperandList = (ins type0:$v1, type0:$v2);
}
+// Represents a dup instruction. Produced post-legalization from
+// G_SHUFFLE_VECTORs with appropriate masks.
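+// The single source operand is the scalar to broadcast; it uses type1 so
+// its type is tracked independently of the result vector's type0.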
+def G_DUP : AArch64GenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type1:$lane);
+}
+
def : GINodeEquiv<G_REV16, AArch64rev16>;
def : GINodeEquiv<G_REV32, AArch64rev32>;
def : GINodeEquiv<G_REV64, AArch64rev64>;
def : GINodeEquiv<G_UZP2, AArch64uzp2>;
def : GINodeEquiv<G_ZIP1, AArch64zip1>;
def : GINodeEquiv<G_ZIP2, AArch64zip2>;
+def : GINodeEquiv<G_DUP, AArch64dup>;
const MachineInstr &MI, const MachineRegisterInfo &MRI,
const TargetRegisterInfo &TRI) const {
switch (MI.getOpcode()) {
+ case AArch64::G_DUP:
case TargetOpcode::G_SITOFP:
case TargetOpcode::G_UITOFP:
case TargetOpcode::G_EXTRACT_VECTOR_ELT:
// Some of the floating-point instructions have mixed GPR and FPR operands:
// fine-tune the computed mapping.
switch (Opc) {
+ case AArch64::G_DUP: {
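+    // The destination of a G_DUP is always FPR. Keep the scalar operand on
+    // GPR unless it is already on FPR or its definition only ever produces
+    // FP values.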
+ Register ScalarReg = MI.getOperand(1).getReg();
+    auto *ScalarDef = MRI.getVRegDef(ScalarReg);
+ if (getRegBank(ScalarReg, MRI, TRI) == &AArch64::FPRRegBank ||
+ onlyDefinesFP(*ScalarDef, MRI, TRI))
+ OpRegBankIdx = {PMI_FirstFPR, PMI_FirstFPR};
+ else
+ OpRegBankIdx = {PMI_FirstFPR, PMI_FirstGPR};
+ break;
+ }
case TargetOpcode::G_TRUNC: {
LLT SrcTy = MRI.getType(MI.getOperand(1).getReg());
if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 128)
unsigned OpFlags) const;
// Optimization methods.
- bool tryOptVectorShuffle(MachineInstr &I) const;
- bool tryOptVectorDup(MachineInstr &MI) const;
bool tryOptSelect(MachineInstr &MI) const;
MachineInstr *tryFoldIntegerCompare(MachineOperand &LHS, MachineOperand &RHS,
MachineOperand &Predicate,
return &*CmpMI;
}
-bool AArch64InstructionSelector::tryOptVectorDup(MachineInstr &I) const {
- // Try to match a vector splat operation into a dup instruction.
- // We're looking for this pattern:
- // %scalar:gpr(s64) = COPY $x0
- // %undef:fpr(<2 x s64>) = G_IMPLICIT_DEF
- // %cst0:gpr(s32) = G_CONSTANT i32 0
- // %zerovec:fpr(<2 x s32>) = G_BUILD_VECTOR %cst0(s32), %cst0(s32)
- // %ins:fpr(<2 x s64>) = G_INSERT_VECTOR_ELT %undef, %scalar(s64), %cst0(s32)
- // %splat:fpr(<2 x s64>) = G_SHUFFLE_VECTOR %ins(<2 x s64>), %undef,
- // %zerovec(<2 x s32>)
- //
- // ...into:
- // %splat = DUP %scalar
- // We use the regbank of the scalar to determine which kind of dup to use.
- MachineIRBuilder MIB(I);
- MachineRegisterInfo &MRI = *MIB.getMRI();
- const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
- using namespace TargetOpcode;
- using namespace MIPatternMatch;
-
- // Begin matching the insert.
- auto *InsMI =
- getOpcodeDef(G_INSERT_VECTOR_ELT, I.getOperand(1).getReg(), MRI);
- if (!InsMI)
- return false;
- // Match the undef vector operand.
- auto *UndefMI =
- getOpcodeDef(G_IMPLICIT_DEF, InsMI->getOperand(1).getReg(), MRI);
- if (!UndefMI)
- return false;
- // Match the scalar being splatted.
- Register ScalarReg = InsMI->getOperand(2).getReg();
- const RegisterBank *ScalarRB = RBI.getRegBank(ScalarReg, MRI, TRI);
- // Match the index constant 0.
- int64_t Index = 0;
- if (!mi_match(InsMI->getOperand(3).getReg(), MRI, m_ICst(Index)) || Index)
- return false;
-
- // The shuffle's second operand doesn't matter if the mask is all zero.
- ArrayRef<int> Mask = I.getOperand(3).getShuffleMask();
- if (!all_of(Mask, [](int Elem) { return Elem == 0; }))
- return false;
-
- // We're done, now find out what kind of splat we need.
- LLT VecTy = MRI.getType(I.getOperand(0).getReg());
- LLT EltTy = VecTy.getElementType();
- if (EltTy.getSizeInBits() < 32) {
- LLVM_DEBUG(dbgs() << "Could not optimize splat pattern < 32b elts yet");
- return false;
- }
- bool IsFP = ScalarRB->getID() == AArch64::FPRRegBankID;
- unsigned Opc = 0;
- if (IsFP) {
- switch (EltTy.getSizeInBits()) {
- case 32:
- if (VecTy.getNumElements() == 2) {
- Opc = AArch64::DUPv2i32lane;
- } else {
- Opc = AArch64::DUPv4i32lane;
- assert(VecTy.getNumElements() == 4);
- }
- break;
- case 64:
- assert(VecTy.getNumElements() == 2 && "Unexpected num elts");
- Opc = AArch64::DUPv2i64lane;
- break;
- }
- } else {
- switch (EltTy.getSizeInBits()) {
- case 32:
- if (VecTy.getNumElements() == 2) {
- Opc = AArch64::DUPv2i32gpr;
- } else {
- Opc = AArch64::DUPv4i32gpr;
- assert(VecTy.getNumElements() == 4);
- }
- break;
- case 64:
- assert(VecTy.getNumElements() == 2 && "Unexpected num elts");
- Opc = AArch64::DUPv2i64gpr;
- break;
- }
- }
- assert(Opc && "Did not compute an opcode for a dup");
-
- // For FP splats, we need to widen the scalar reg via undef too.
- if (IsFP) {
- MachineInstr *Widen = emitScalarToVector(
- EltTy.getSizeInBits(), &AArch64::FPR128RegClass, ScalarReg, MIB);
- if (!Widen)
- return false;
- ScalarReg = Widen->getOperand(0).getReg();
- }
- auto Dup = MIB.buildInstr(Opc, {I.getOperand(0).getReg()}, {ScalarReg});
- if (IsFP)
- Dup.addImm(0);
- constrainSelectedInstRegOperands(*Dup, TII, TRI, RBI);
- I.eraseFromParent();
- return true;
-}
-
-bool AArch64InstructionSelector::tryOptVectorShuffle(MachineInstr &I) const {
- if (TM.getOptLevel() == CodeGenOpt::None)
- return false;
- if (tryOptVectorDup(I))
- return true;
- return false;
-}
-
bool AArch64InstructionSelector::selectShuffleVector(
MachineInstr &I, MachineRegisterInfo &MRI) const {
- if (tryOptVectorShuffle(I))
- return true;
const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
Register Src1Reg = I.getOperand(1).getReg();
const LLT Src1Ty = MRI.getType(Src1Reg);
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
+#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#define DEBUG_TYPE "aarch64-postlegalizer-combiner"
using namespace llvm;
+using namespace MIPatternMatch;
/// Represents a pseudo instruction which replaces a G_SHUFFLE_VECTOR.
///
ShuffleVectorPseudo() {}
};
+/// \returns The splat index of a G_SHUFFLE_VECTOR \p MI when \p MI is a splat.
+/// If \p MI is not a splat, returns None.
+static Optional<int> getSplatIndex(MachineInstr &MI) {
+ assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR &&
+ "Only G_SHUFFLE_VECTOR can have a splat index!");
+ ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
+ auto FirstDefinedIdx = find_if(Mask, [](int Elt) { return Elt >= 0; });
+
+ // If all elements are undefined, this shuffle can be considered a splat.
+  // Return 0 so that callers have the best chance to simplify.
+ if (FirstDefinedIdx == Mask.end())
+ return 0;
+
+ // Make sure all remaining elements are either undef or the same
+ // as the first non-undef value.
+ int SplatValue = *FirstDefinedIdx;
+ if (any_of(make_range(std::next(FirstDefinedIdx), Mask.end()),
+ [&SplatValue](int Elt) { return Elt >= 0 && Elt != SplatValue; }))
+ return None;
+
+ return SplatValue;
+}
+
/// Check if a vector shuffle corresponds to a REV instruction with the
/// specified blocksize.
static bool isREVMask(ArrayRef<int> M, unsigned EltSize, unsigned NumElts,
return true;
}
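+/// \returns true if a G_SHUFFLE_VECTOR \p MI is a lane-0 splat that can be
+/// replaced with a G_DUP of the splatted scalar. On success, \p MatchInfo
+/// contains the pseudo to build.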
+static bool matchDup(MachineInstr &MI, MachineRegisterInfo &MRI,
+ ShuffleVectorPseudo &MatchInfo) {
+ assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
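+  // Only splats of lane 0 are handled for now.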
+ auto Lane = getSplatIndex(MI);
+ if (!Lane || *Lane != 0)
+ return false;
+
+ // Try to match a vector splat operation into a dup instruction.
+ // We're looking for this pattern:
+ //
+ // %scalar:gpr(s64) = COPY $x0
+ // %undef:fpr(<2 x s64>) = G_IMPLICIT_DEF
+ // %cst0:gpr(s32) = G_CONSTANT i32 0
+ // %zerovec:fpr(<2 x s32>) = G_BUILD_VECTOR %cst0(s32), %cst0(s32)
+ // %ins:fpr(<2 x s64>) = G_INSERT_VECTOR_ELT %undef, %scalar(s64), %cst0(s32)
+  // %splat:fpr(<2 x s64>) = G_SHUFFLE_VECTOR %ins(<2 x s64>), %undef,
+  //                                          %zerovec(<2 x s32>)
+ //
+ // ...into:
+ // %splat = G_DUP %scalar
+
+ // Begin matching the insert.
+ auto *InsMI = getOpcodeDef(TargetOpcode::G_INSERT_VECTOR_ELT,
+ MI.getOperand(1).getReg(), MRI);
+ if (!InsMI)
+ return false;
+
+ // Match the undef vector operand.
+ if (!getOpcodeDef(TargetOpcode::G_IMPLICIT_DEF,
+ InsMI->getOperand(1).getReg(), MRI))
+ return false;
+
+ // Match the index constant 0.
+ int64_t Index = 0;
+ if (!mi_match(InsMI->getOperand(3).getReg(), MRI, m_ICst(Index)) || Index)
+ return false;
+
+ Register Dst = MI.getOperand(0).getReg();
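+  // Splats of elements narrower than 32 bits are not supported yet.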
+ if (MRI.getType(Dst).getScalarSizeInBits() < 32) {
+    LLVM_DEBUG(dbgs() << "Could not optimize splat pattern < 32b elts yet\n");
+ return false;
+ }
+
+ MatchInfo =
+ ShuffleVectorPseudo(AArch64::G_DUP, Dst, {InsMI->getOperand(2).getReg()});
+ return true;
+}
+
/// Replace a G_SHUFFLE_VECTOR instruction with a pseudo.
/// \p Opc is the opcode to use. \p MI is the G_SHUFFLE_VECTOR.
static bool applyShuffleVectorPseudo(MachineInstr &MI,
--- /dev/null
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=aarch64 -run-pass=aarch64-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
+
+---
+name: splat_4xi32
+alignment: 4
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.1.entry:
+ liveins: $w0
+
+ ; CHECK-LABEL: name: splat_4xi32
+ ; CHECK: liveins: $w0
+ ; CHECK: [[COPY:%[0-9]+]]:gpr(s32) = COPY $w0
+ ; CHECK: [[DUP:%[0-9]+]]:fpr(<4 x s32>) = G_DUP [[COPY]](s32)
+ ; CHECK: $q0 = COPY [[DUP]](<4 x s32>)
+ ; CHECK: RET_ReallyLR implicit $q0
+ %0:gpr(s32) = COPY $w0
+ %2:fpr(<4 x s32>) = G_IMPLICIT_DEF
+ %3:gpr(s32) = G_CONSTANT i32 0
+ %1:fpr(<4 x s32>) = G_INSERT_VECTOR_ELT %2, %0(s32), %3(s32)
+ %4:fpr(<4 x s32>) = G_SHUFFLE_VECTOR %1(<4 x s32>), %2, shufflemask(0, 0, 0, 0)
+ $q0 = COPY %4(<4 x s32>)
+ RET_ReallyLR implicit $q0
+
+...
+---
+name: splat_2xi64
+alignment: 4
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.1.entry:
+ liveins: $x0
+
+ ; CHECK-LABEL: name: splat_2xi64
+ ; CHECK: liveins: $x0
+ ; CHECK: [[COPY:%[0-9]+]]:gpr(s64) = COPY $x0
+ ; CHECK: [[DUP:%[0-9]+]]:fpr(<2 x s64>) = G_DUP [[COPY]](s64)
+ ; CHECK: $q0 = COPY [[DUP]](<2 x s64>)
+ ; CHECK: RET_ReallyLR implicit $q0
+ %0:gpr(s64) = COPY $x0
+ %2:fpr(<2 x s64>) = G_IMPLICIT_DEF
+ %3:gpr(s32) = G_CONSTANT i32 0
+ %1:fpr(<2 x s64>) = G_INSERT_VECTOR_ELT %2, %0(s64), %3(s32)
+ %4:fpr(<2 x s64>) = G_SHUFFLE_VECTOR %1(<2 x s64>), %2, shufflemask(0, 0)
+ $q0 = COPY %4(<2 x s64>)
+ RET_ReallyLR implicit $q0
+
+...
+---
+name: splat_2xi32
+alignment: 4
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.1.entry:
+ liveins: $w0
+
+ ; CHECK-LABEL: name: splat_2xi32
+ ; CHECK: liveins: $w0
+ ; CHECK: [[COPY:%[0-9]+]]:gpr(s32) = COPY $w0
+ ; CHECK: [[DUP:%[0-9]+]]:fpr(<2 x s32>) = G_DUP [[COPY]](s32)
+ ; CHECK: $d0 = COPY [[DUP]](<2 x s32>)
+ ; CHECK: RET_ReallyLR implicit $d0
+ %0:gpr(s32) = COPY $w0
+ %2:fpr(<2 x s32>) = G_IMPLICIT_DEF
+ %3:gpr(s32) = G_CONSTANT i32 0
+ %1:fpr(<2 x s32>) = G_INSERT_VECTOR_ELT %2, %0(s32), %3(s32)
+ %4:fpr(<2 x s32>) = G_SHUFFLE_VECTOR %1(<2 x s32>), %2, shufflemask(0, 0)
+ $d0 = COPY %4(<2 x s32>)
+ RET_ReallyLR implicit $d0
+
+...
+---
+name: splat_4xf32
+alignment: 4
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.1.entry:
+ liveins: $s0
+
+ ; CHECK-LABEL: name: splat_4xf32
+ ; CHECK: liveins: $s0
+ ; CHECK: [[COPY:%[0-9]+]]:fpr(s32) = COPY $s0
+ ; CHECK: [[DUP:%[0-9]+]]:fpr(<4 x s32>) = G_DUP [[COPY]](s32)
+ ; CHECK: $q0 = COPY [[DUP]](<4 x s32>)
+ ; CHECK: RET_ReallyLR implicit $q0
+ %0:fpr(s32) = COPY $s0
+ %2:fpr(<4 x s32>) = G_IMPLICIT_DEF
+ %3:gpr(s32) = G_CONSTANT i32 0
+ %1:fpr(<4 x s32>) = G_INSERT_VECTOR_ELT %2, %0(s32), %3(s32)
+ %4:fpr(<4 x s32>) = G_SHUFFLE_VECTOR %1(<4 x s32>), %2, shufflemask(0, 0, 0, 0)
+ $q0 = COPY %4(<4 x s32>)
+ RET_ReallyLR implicit $q0
+
+...
+---
+name: splat_2xf64
+alignment: 4
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.1.entry:
+ liveins: $d0
+
+ ; CHECK-LABEL: name: splat_2xf64
+ ; CHECK: liveins: $d0
+ ; CHECK: [[COPY:%[0-9]+]]:fpr(s64) = COPY $d0
+ ; CHECK: [[DUP:%[0-9]+]]:fpr(<2 x s64>) = G_DUP [[COPY]](s64)
+ ; CHECK: $q0 = COPY [[DUP]](<2 x s64>)
+ ; CHECK: RET_ReallyLR implicit $q0
+ %0:fpr(s64) = COPY $d0
+ %2:fpr(<2 x s64>) = G_IMPLICIT_DEF
+ %3:gpr(s32) = G_CONSTANT i32 0
+ %1:fpr(<2 x s64>) = G_INSERT_VECTOR_ELT %2, %0(s64), %3(s32)
+ %4:fpr(<2 x s64>) = G_SHUFFLE_VECTOR %1(<2 x s64>), %2, shufflemask(0, 0)
+ $q0 = COPY %4(<2 x s64>)
+ RET_ReallyLR implicit $q0
+
+...
+---
+name: splat_2xf32
+alignment: 4
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.1.entry:
+ liveins: $s0
+
+ ; CHECK-LABEL: name: splat_2xf32
+ ; CHECK: liveins: $s0
+ ; CHECK: [[COPY:%[0-9]+]]:fpr(s32) = COPY $s0
+ ; CHECK: [[DUP:%[0-9]+]]:fpr(<2 x s32>) = G_DUP [[COPY]](s32)
+ ; CHECK: $d0 = COPY [[DUP]](<2 x s32>)
+ ; CHECK: RET_ReallyLR implicit $d0
+ %0:fpr(s32) = COPY $s0
+ %2:fpr(<2 x s32>) = G_IMPLICIT_DEF
+ %3:gpr(s32) = G_CONSTANT i32 0
+ %1:fpr(<2 x s32>) = G_INSERT_VECTOR_ELT %2, %0(s32), %3(s32)
+ %4:fpr(<2 x s32>) = G_SHUFFLE_VECTOR %1(<2 x s32>), %2, shufflemask(0, 0)
+ $d0 = COPY %4(<2 x s32>)
+ RET_ReallyLR implicit $d0
+
+...
+---
+name: splat_2xf64_copies
+alignment: 4
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.1.entry:
+ liveins: $d0
+
+ ; This test is exactly the same as splat_2xf64, except it adds two copies.
+ ; These copies shouldn't get in the way of matching the dup pattern.
+ ; CHECK-LABEL: name: splat_2xf64_copies
+ ; CHECK: liveins: $d0
+ ; CHECK: [[COPY:%[0-9]+]]:fpr(s64) = COPY $d0
+ ; CHECK: [[DUP:%[0-9]+]]:fpr(<2 x s64>) = G_DUP [[COPY]](s64)
+ ; CHECK: $q0 = COPY [[DUP]](<2 x s64>)
+ ; CHECK: RET_ReallyLR implicit $q0
+ %0:fpr(s64) = COPY $d0
+ %2:fpr(<2 x s64>) = G_IMPLICIT_DEF
+ %6:fpr(<2 x s64>) = COPY %2
+ %3:gpr(s32) = G_CONSTANT i32 0
+ %1:fpr(<2 x s64>) = G_INSERT_VECTOR_ELT %6, %0(s64), %3(s32)
+ %7:fpr(<2 x s64>) = COPY %1
+ %4:fpr(<2 x s64>) = G_SHUFFLE_VECTOR %7(<2 x s64>), %2, shufflemask(0, 0)
+ $q0 = COPY %4(<2 x s64>)
+ RET_ReallyLR implicit $q0
+
+...
+---
+name: not_all_zeros
+alignment: 4
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.1.entry:
+ liveins: $x0
+    ; Make sure that we don't do the optimization when it's not all zeros.
+ ; CHECK-LABEL: name: not_all_zeros
+ ; CHECK: liveins: $x0
+ ; CHECK: [[COPY:%[0-9]+]]:gpr(s64) = COPY $x0
+ ; CHECK: [[DEF:%[0-9]+]]:fpr(<2 x s64>) = G_IMPLICIT_DEF
+ ; CHECK: [[C:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 0
+ ; CHECK: [[IVEC:%[0-9]+]]:fpr(<2 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[COPY]](s64), [[C]](s32)
+ ; CHECK: [[SHUF:%[0-9]+]]:fpr(<2 x s64>) = G_SHUFFLE_VECTOR [[IVEC]](<2 x s64>), [[DEF]], shufflemask(0, 1)
+ ; CHECK: $q0 = COPY [[SHUF]](<2 x s64>)
+ ; CHECK: RET_ReallyLR implicit $q0
+ %0:gpr(s64) = COPY $x0
+ %2:fpr(<2 x s64>) = G_IMPLICIT_DEF
+ %3:gpr(s32) = G_CONSTANT i32 0
+ %1:fpr(<2 x s64>) = G_INSERT_VECTOR_ELT %2, %0(s64), %3(s32)
+ %4:fpr(<2 x s64>) = G_SHUFFLE_VECTOR %1(<2 x s64>), %2, shufflemask(0, 1)
+ $q0 = COPY %4(<2 x s64>)
+ RET_ReallyLR implicit $q0
+
+...
+---
+name: all_undef
+alignment: 4
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.1.entry:
+ liveins: $x0
+ ; If all the elements are undefined, we consider it a splat. In this case,
+ ; we can choose 0 as our index.
+ ;
+ ; We should get a G_DUP here.
+ ;
+ ; CHECK-LABEL: name: all_undef
+ ; CHECK: liveins: $x0
+ ; CHECK: [[COPY:%[0-9]+]]:gpr(s64) = COPY $x0
+ ; CHECK: [[DUP:%[0-9]+]]:fpr(<2 x s64>) = G_DUP [[COPY]](s64)
+ ; CHECK: $q0 = COPY [[DUP]](<2 x s64>)
+ ; CHECK: RET_ReallyLR implicit $q0
+ %0:gpr(s64) = COPY $x0
+ %2:fpr(<2 x s64>) = G_IMPLICIT_DEF
+ %3:gpr(s32) = G_CONSTANT i32 0
+ %1:fpr(<2 x s64>) = G_INSERT_VECTOR_ELT %2, %0(s64), %3(s32)
+ %4:fpr(<2 x s64>) = G_SHUFFLE_VECTOR %1(<2 x s64>), %2, shufflemask(-1, -1)
+ $q0 = COPY %4(<2 x s64>)
+ RET_ReallyLR implicit $q0
+
+...
+---
+name: one_undef
+alignment: 4
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.1.entry:
+ liveins: $s0
+ ; Make sure we can skip past undef values.
+ ;
+ ; We should get a G_DUP here.
+ ;
+ ; CHECK-LABEL: name: one_undef
+ ; CHECK: liveins: $s0
+ ; CHECK: [[COPY:%[0-9]+]]:fpr(s32) = COPY $s0
+ ; CHECK: [[DUP:%[0-9]+]]:fpr(<4 x s32>) = G_DUP [[COPY]](s32)
+ ; CHECK: $q0 = COPY [[DUP]](<4 x s32>)
+ ; CHECK: RET_ReallyLR implicit $q0
+ %0:fpr(s32) = COPY $s0
+ %2:fpr(<4 x s32>) = G_IMPLICIT_DEF
+ %3:gpr(s32) = G_CONSTANT i32 0
+ %1:fpr(<4 x s32>) = G_INSERT_VECTOR_ELT %2, %0(s32), %3(s32)
+ %4:fpr(<4 x s32>) = G_SHUFFLE_VECTOR %1(<4 x s32>), %2, shufflemask(0, -1, 0, 0)
+ $q0 = COPY %4(<4 x s32>)
+ RET_ReallyLR implicit $q0
+
+...
+---
+name: not_all_zeros_with_undefs
+alignment: 4
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.1.entry:
+ liveins: $s0
+ ; Check a non-splat mask with an undef value. We shouldn't get a G_DUP here.
+ ;
+ ; CHECK-LABEL: name: not_all_zeros_with_undefs
+ ; CHECK: liveins: $s0
+ ; CHECK: [[COPY:%[0-9]+]]:fpr(s32) = COPY $s0
+ ; CHECK: [[DEF:%[0-9]+]]:fpr(<4 x s32>) = G_IMPLICIT_DEF
+ ; CHECK: [[C:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 0
+ ; CHECK: [[IVEC:%[0-9]+]]:fpr(<4 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[COPY]](s32), [[C]](s32)
+ ; CHECK: [[SHUF:%[0-9]+]]:fpr(<4 x s32>) = G_SHUFFLE_VECTOR [[IVEC]](<4 x s32>), [[DEF]], shufflemask(undef, 0, 0, 3)
+ ; CHECK: $q0 = COPY [[SHUF]](<4 x s32>)
+ ; CHECK: RET_ReallyLR implicit $q0
+ %0:fpr(s32) = COPY $s0
+ %2:fpr(<4 x s32>) = G_IMPLICIT_DEF
+ %3:gpr(s32) = G_CONSTANT i32 0
+ %1:fpr(<4 x s32>) = G_INSERT_VECTOR_ELT %2, %0(s32), %3(s32)
+ %4:fpr(<4 x s32>) = G_SHUFFLE_VECTOR %1(<4 x s32>), %2, shufflemask(-1, 0, 0, 3)
+ $q0 = COPY %4(<4 x s32>)
+ RET_ReallyLR implicit $q0
--- /dev/null
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=aarch64 -run-pass=regbankselect -verify-machineinstrs %s -o - | FileCheck %s
+#
+# Verify register banks for G_DUP.
+#
+
+...
+---
+name: v4s32_gpr
+alignment: 4
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $w0
+
+ ; CHECK-LABEL: name: v4s32_gpr
+ ; CHECK: liveins: $w0
+ ; CHECK: [[COPY:%[0-9]+]]:gpr(s32) = COPY $w0
+ ; CHECK: [[DUP:%[0-9]+]]:fpr(<4 x s32>) = G_DUP [[COPY]](s32)
+ ; CHECK: $q0 = COPY [[DUP]](<4 x s32>)
+ ; CHECK: RET_ReallyLR implicit $q0
+ %0:_(s32) = COPY $w0
+ %4:_(<4 x s32>) = G_DUP %0(s32)
+ $q0 = COPY %4(<4 x s32>)
+ RET_ReallyLR implicit $q0
+
+...
+---
+name: v2s64_gpr
+alignment: 4
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x0
+
+    ; CHECK-LABEL: name: v2s64_gpr
+ ; CHECK: liveins: $x0
+ ; CHECK: [[COPY:%[0-9]+]]:gpr(s64) = COPY $x0
+ ; CHECK: [[DUP:%[0-9]+]]:fpr(<2 x s64>) = G_DUP [[COPY]](s64)
+ ; CHECK: $q0 = COPY [[DUP]](<2 x s64>)
+ ; CHECK: RET_ReallyLR implicit $q0
+ %0:_(s64) = COPY $x0
+ %4:_(<2 x s64>) = G_DUP %0(s64)
+ $q0 = COPY %4(<2 x s64>)
+ RET_ReallyLR implicit $q0
+
+...
+---
+name: v2s32_gpr
+alignment: 4
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $w0
+
+ ; CHECK-LABEL: name: v2s32_gpr
+ ; CHECK: liveins: $w0
+ ; CHECK: [[COPY:%[0-9]+]]:gpr(s32) = COPY $w0
+ ; CHECK: [[DUP:%[0-9]+]]:fpr(<2 x s32>) = G_DUP [[COPY]](s32)
+ ; CHECK: $d0 = COPY [[DUP]](<2 x s32>)
+ ; CHECK: RET_ReallyLR implicit $d0
+ %0:_(s32) = COPY $w0
+ %4:_(<2 x s32>) = G_DUP %0(s32)
+ $d0 = COPY %4(<2 x s32>)
+ RET_ReallyLR implicit $d0
+
+...
+---
+name: v4s32_fpr
+alignment: 4
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $s0
+
+ ; CHECK-LABEL: name: v4s32_fpr
+ ; CHECK: liveins: $s0
+ ; CHECK: [[COPY:%[0-9]+]]:fpr(s32) = COPY $s0
+ ; CHECK: [[DUP:%[0-9]+]]:fpr(<4 x s32>) = G_DUP [[COPY]](s32)
+ ; CHECK: $q0 = COPY [[DUP]](<4 x s32>)
+ ; CHECK: RET_ReallyLR implicit $q0
+ %0:_(s32) = COPY $s0
+ %4:_(<4 x s32>) = G_DUP %0(s32)
+ $q0 = COPY %4(<4 x s32>)
+ RET_ReallyLR implicit $q0
+
+...
+---
+name: v2s64_fpr
+alignment: 4
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $d0
+
+ ; CHECK-LABEL: name: v2s64_fpr
+ ; CHECK: liveins: $d0
+ ; CHECK: [[COPY:%[0-9]+]]:fpr(s64) = COPY $d0
+ ; CHECK: [[DUP:%[0-9]+]]:fpr(<2 x s64>) = G_DUP [[COPY]](s64)
+ ; CHECK: $q0 = COPY [[DUP]](<2 x s64>)
+ ; CHECK: RET_ReallyLR implicit $q0
+ %0:_(s64) = COPY $d0
+ %4:_(<2 x s64>) = G_DUP %0(s64)
+ $q0 = COPY %4(<2 x s64>)
+ RET_ReallyLR implicit $q0
+
+...
+---
+name: v2s32_fpr
+alignment: 4
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $s0
+
+ ; CHECK-LABEL: name: v2s32_fpr
+ ; CHECK: liveins: $s0
+ ; CHECK: [[COPY:%[0-9]+]]:fpr(s32) = COPY $s0
+ ; CHECK: [[DUP:%[0-9]+]]:fpr(<2 x s32>) = G_DUP [[COPY]](s32)
+ ; CHECK: $d0 = COPY [[DUP]](<2 x s32>)
+ ; CHECK: RET_ReallyLR implicit $d0
+ %0:_(s32) = COPY $s0
+ %4:_(<2 x s32>) = G_DUP %0(s32)
+ $d0 = COPY %4(<2 x s32>)
+ RET_ReallyLR implicit $d0
+
+...
+---
+name: v2s64_fpr_copy
+alignment: 4
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $d0
+
+ ; CHECK-LABEL: name: v2s64_fpr_copy
+ ; CHECK: liveins: $d0
+ ; CHECK: [[COPY:%[0-9]+]]:fpr(s64) = COPY $d0
+ ; CHECK: [[DUP:%[0-9]+]]:fpr(<2 x s64>) = G_DUP [[COPY]](s64)
+ ; CHECK: $q0 = COPY [[DUP]](<2 x s64>)
+ ; CHECK: RET_ReallyLR implicit $q0
+ %0:_(s64) = COPY $d0
+ %6:_(<2 x s64>) = G_DUP %0(s64)
+ $q0 = COPY %6(<2 x s64>)
+ RET_ReallyLR implicit $q0
+
+...
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=aarch64-- -run-pass=instruction-select -O1 -verify-machineinstrs %s -o - | FileCheck %s
+# RUN: llc -mtriple=aarch64 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s
+
+...
---
name: splat_4xi32
alignment: 4
regBankSelected: true
tracksRegLiveness: true
body: |
- bb.1.entry:
+ bb.0.entry:
liveins: $w0
; CHECK-LABEL: name: splat_4xi32
; CHECK: $q0 = COPY [[DUPv4i32gpr]]
; CHECK: RET_ReallyLR implicit $q0
%0:gpr(s32) = COPY $w0
- %2:fpr(<4 x s32>) = G_IMPLICIT_DEF
- %3:gpr(s32) = G_CONSTANT i32 0
- %1:fpr(<4 x s32>) = G_INSERT_VECTOR_ELT %2, %0(s32), %3(s32)
- %4:fpr(<4 x s32>) = G_SHUFFLE_VECTOR %1(<4 x s32>), %2, shufflemask(0, 0, 0, 0)
+ %4:fpr(<4 x s32>) = G_DUP %0(s32)
$q0 = COPY %4(<4 x s32>)
RET_ReallyLR implicit $q0
regBankSelected: true
tracksRegLiveness: true
body: |
- bb.1.entry:
+ bb.0.entry:
liveins: $x0
; CHECK-LABEL: name: splat_2xi64
; CHECK: $q0 = COPY [[DUPv2i64gpr]]
; CHECK: RET_ReallyLR implicit $q0
%0:gpr(s64) = COPY $x0
- %2:fpr(<2 x s64>) = G_IMPLICIT_DEF
- %3:gpr(s32) = G_CONSTANT i32 0
- %1:fpr(<2 x s64>) = G_INSERT_VECTOR_ELT %2, %0(s64), %3(s32)
- %4:fpr(<2 x s64>) = G_SHUFFLE_VECTOR %1(<2 x s64>), %2, shufflemask(0, 0)
+ %4:fpr(<2 x s64>) = G_DUP %0(s64)
$q0 = COPY %4(<2 x s64>)
RET_ReallyLR implicit $q0
regBankSelected: true
tracksRegLiveness: true
body: |
- bb.1.entry:
+ bb.0.entry:
liveins: $w0
; CHECK-LABEL: name: splat_2xi32
; CHECK: $d0 = COPY [[DUPv2i32gpr]]
; CHECK: RET_ReallyLR implicit $d0
%0:gpr(s32) = COPY $w0
- %2:fpr(<2 x s32>) = G_IMPLICIT_DEF
- %3:gpr(s32) = G_CONSTANT i32 0
- %1:fpr(<2 x s32>) = G_INSERT_VECTOR_ELT %2, %0(s32), %3(s32)
- %4:fpr(<2 x s32>) = G_SHUFFLE_VECTOR %1(<2 x s32>), %2, shufflemask(0, 0)
+ %4:fpr(<2 x s32>) = G_DUP %0(s32)
$d0 = COPY %4(<2 x s32>)
RET_ReallyLR implicit $d0
regBankSelected: true
tracksRegLiveness: true
body: |
- bb.1.entry:
+ bb.0.entry:
liveins: $s0
; CHECK-LABEL: name: splat_4xf32
; CHECK: $q0 = COPY [[DUPv4i32lane]]
; CHECK: RET_ReallyLR implicit $q0
%0:fpr(s32) = COPY $s0
- %2:fpr(<4 x s32>) = G_IMPLICIT_DEF
- %3:gpr(s32) = G_CONSTANT i32 0
- %1:fpr(<4 x s32>) = G_INSERT_VECTOR_ELT %2, %0(s32), %3(s32)
- %4:fpr(<4 x s32>) = G_SHUFFLE_VECTOR %1(<4 x s32>), %2, shufflemask(0, 0, 0, 0)
+ %4:fpr(<4 x s32>) = G_DUP %0(s32)
$q0 = COPY %4(<4 x s32>)
RET_ReallyLR implicit $q0
regBankSelected: true
tracksRegLiveness: true
body: |
- bb.1.entry:
+ bb.0.entry:
liveins: $d0
; CHECK-LABEL: name: splat_2xf64
; CHECK: $q0 = COPY [[DUPv2i64lane]]
; CHECK: RET_ReallyLR implicit $q0
%0:fpr(s64) = COPY $d0
- %2:fpr(<2 x s64>) = G_IMPLICIT_DEF
- %3:gpr(s32) = G_CONSTANT i32 0
- %1:fpr(<2 x s64>) = G_INSERT_VECTOR_ELT %2, %0(s64), %3(s32)
- %4:fpr(<2 x s64>) = G_SHUFFLE_VECTOR %1(<2 x s64>), %2, shufflemask(0, 0)
+ %4:fpr(<2 x s64>) = G_DUP %0(s64)
$q0 = COPY %4(<2 x s64>)
RET_ReallyLR implicit $q0
regBankSelected: true
tracksRegLiveness: true
body: |
- bb.1.entry:
+ bb.0.entry:
liveins: $s0
; CHECK-LABEL: name: splat_2xf32
; CHECK: $d0 = COPY [[DUPv2i32lane]]
; CHECK: RET_ReallyLR implicit $d0
%0:fpr(s32) = COPY $s0
- %2:fpr(<2 x s32>) = G_IMPLICIT_DEF
- %3:gpr(s32) = G_CONSTANT i32 0
- %1:fpr(<2 x s32>) = G_INSERT_VECTOR_ELT %2, %0(s32), %3(s32)
- %4:fpr(<2 x s32>) = G_SHUFFLE_VECTOR %1(<2 x s32>), %2, shufflemask(0, 0)
+ %4:fpr(<2 x s32>) = G_DUP %0(s32)
$d0 = COPY %4(<2 x s32>)
RET_ReallyLR implicit $d0
regBankSelected: true
tracksRegLiveness: true
body: |
- bb.1.entry:
+ bb.0.entry:
liveins: $d0
- ; This test is exactly the same as splat_2xf64, except it adds two copies.
- ; These copies shouldn't get in the way of matching the dup pattern.
; CHECK-LABEL: name: splat_2xf64_copies
; CHECK: liveins: $d0
; CHECK: [[COPY:%[0-9]+]]:fpr64 = COPY $d0
; CHECK: $q0 = COPY [[DUPv2i64lane]]
; CHECK: RET_ReallyLR implicit $q0
%0:fpr(s64) = COPY $d0
- %2:fpr(<2 x s64>) = G_IMPLICIT_DEF
- %6:fpr(<2 x s64>) = COPY %2
- %3:gpr(s32) = G_CONSTANT i32 0
- %1:fpr(<2 x s64>) = G_INSERT_VECTOR_ELT %6, %0(s64), %3(s32)
- %7:fpr(<2 x s64>) = COPY %1
- %4:fpr(<2 x s64>) = G_SHUFFLE_VECTOR %7(<2 x s64>), %2, shufflemask(0, 0)
- $q0 = COPY %4(<2 x s64>)
- RET_ReallyLR implicit $q0
-
-...
----
-name: not_all_zeros
-alignment: 4
-legalized: true
-regBankSelected: true
-tracksRegLiveness: true
-body: |
- bb.1.entry:
- liveins: $x0
- ; Make sure that we don't do the optimization when it's not all zeroes.
- ; CHECK-LABEL: name: not_all_zeros
- ; CHECK: liveins: $x0
- ; CHECK: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
- ; CHECK: [[DEF:%[0-9]+]]:fpr128 = IMPLICIT_DEF
- ; CHECK: [[INSvi64gpr:%[0-9]+]]:fpr128 = INSvi64gpr [[DEF]], 0, [[COPY]]
- ; CHECK: [[ADRP:%[0-9]+]]:gpr64common = ADRP target-flags(aarch64-page) %const.0
- ; CHECK: [[LDRQui:%[0-9]+]]:fpr128 = LDRQui [[ADRP]], target-flags(aarch64-pageoff, aarch64-nc) %const.0
- ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:qq = REG_SEQUENCE [[INSvi64gpr]], %subreg.qsub0, [[DEF]], %subreg.qsub1
- ; CHECK: [[TBLv16i8Two:%[0-9]+]]:fpr128 = TBLv16i8Two [[REG_SEQUENCE]], [[LDRQui]]
- ; CHECK: $q0 = COPY [[TBLv16i8Two]]
- ; CHECK: RET_ReallyLR implicit $q0
- %0:gpr(s64) = COPY $x0
- %2:fpr(<2 x s64>) = G_IMPLICIT_DEF
- %3:gpr(s32) = G_CONSTANT i32 0
- %1:fpr(<2 x s64>) = G_INSERT_VECTOR_ELT %2, %0(s64), %3(s32)
- %4:fpr(<2 x s64>) = G_SHUFFLE_VECTOR %1(<2 x s64>), %2, shufflemask(0, 1)
- $q0 = COPY %4(<2 x s64>)
+ %6:fpr(<2 x s64>) = G_DUP %0(s64)
+ $q0 = COPY %6(<2 x s64>)
RET_ReallyLR implicit $q0