Consecutive load matching (EltsFromConsecutiveLoads) currently uses VZEXT_LOAD (load a scalar into the lowest element and zero the upper elements) only for vXi64 / vXf64 vectors.
For vXi32 / vXf32 vectors it instead creates a scalar load, a SCALAR_TO_VECTOR and finally a VZEXT_MOVL (zero the upper vector elements), relying on tablegen patterns to match this sequence into the equivalent of a VZEXT_LOAD.
This patch adds the VZEXT_LOAD patterns for vXi32 / vXf32 vectors directly and updates EltsFromConsecutiveLoads to use them.
This has proven necessary to allow us to easily make VZEXT_MOVL a full member of the target shuffle set - without this change the call to combineShuffle (the main caller of EltsFromConsecutiveLoads) tended to recursively recreate VZEXT_MOVL nodes.
Differential Revision: https://reviews.llvm.org/D23673
llvm-svn: 279619
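
To make the difference concrete, here is a minimal C++ sketch (not the patch's code) contrasting the two lowerings for a v4f32 whose first element comes from a consecutive 32-bit load and whose upper elements are zero. It is written as if inside X86ISelLowering.cpp so that X86ISD and the usual SelectionDAG helpers are in scope; the helper name, its parameters and the chain handling are purely illustrative.

// Illustrative only: old vs. new lowering of a zero-extended 32-bit vector load.
// Chain fix-up for the replaced loads is omitted for brevity.
static SDValue lowerZeroUpper32BitLoad(SelectionDAG &DAG, const SDLoc &DL,
                                       LoadSDNode *LDBase, bool UseVZextLoad) {
  MVT VT = MVT::v4f32;
  if (!UseVZextLoad) {
    // Old path: scalar load -> SCALAR_TO_VECTOR -> VZEXT_MOVL, relying on the
    // existing tablegen patterns to fold the three nodes into a MOVSS load.
    SDValue Scalar = DAG.getLoad(MVT::f32, DL, LDBase->getChain(),
                                 LDBase->getBasePtr(), LDBase->getPointerInfo());
    SDValue V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scalar);
    return DAG.getNode(X86ISD::VZEXT_MOVL, DL, VT, V);
  }
  // New path: emit a single VZEXT_LOAD memory intrinsic node, matched directly
  // by the X86vzload patterns added below.
  SDVTList Tys = DAG.getVTList(VT, MVT::Other);
  SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
  return DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, MVT::f32,
                                 LDBase->getPointerInfo(), LDBase->getAlignment());
}

With the old path the three nodes only collapse back into a single load if the tablegen patterns get a chance to fire after every combine; emitting the VZEXT_LOAD node directly removes that dependency, which is what keeps combineShuffle from recursively recreating VZEXT_MOVL nodes.
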
int LoadSize =
(1 + LastLoadedElt - FirstLoadedElt) * LDBaseVT.getStoreSizeInBits();
- // VZEXT_LOAD - consecutive load/undefs followed by zeros/undefs.
- if (IsConsecutiveLoad && FirstLoadedElt == 0 && LoadSize == 64 &&
+ // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
+ if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
+ (LoadSize == 32 || LoadSize == 64) &&
((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
- MVT VecSVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
- MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / 64);
+ MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSize)
+ : MVT::getIntegerVT(LoadSize);
+ MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSize);
if (TLI.isTypeLegal(VecVT)) {
SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
}
}
- // VZEXT_MOVL - consecutive 32-bit load/undefs followed by zeros/undefs.
- if (IsConsecutiveLoad && FirstLoadedElt == 0 && LoadSize == 32 &&
- ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
- MVT VecSVT = VT.isFloatingPoint() ? MVT::f32 : MVT::i32;
- MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / 32);
- if (TLI.isTypeLegal(VecVT)) {
- SDValue V = LastLoadedElt != 0 ? CreateLoad(VecSVT, LDBase)
- : DAG.getBitcast(VecSVT, EltBase);
- V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, V);
- V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, V);
- return DAG.getBitcast(VT, V);
- }
- }
-
return SDValue();
}
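
For reference, a small standalone sketch of the widened-type computation introduced above, worked through for a few representative inputs. It only assumes LLVM's MVT/EVT helpers; the include paths are those of roughly contemporary trees and may differ in other versions.

// Illustrative only: derive the VZEXT_LOAD vector type from the original
// vector type and the number of consecutively loaded bits, as in the hunk above.
#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/ValueTypes.h"
#include <cstdio>
using namespace llvm;

static MVT getVZextLoadVT(MVT VT, unsigned LoadSize) {
  MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSize)
                                    : MVT::getIntegerVT(LoadSize);
  return MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSize);
}

int main() {
  // v4f32, one consecutive 32-bit load  -> v4f32 (selects to a MOVSS load).
  printf("%s\n", EVT(getVZextLoadVT(MVT::v4f32, 32)).getEVTString().c_str());
  // v4i32, two consecutive 32-bit loads -> v2i64 (the existing MOVQ case).
  printf("%s\n", EVT(getVZextLoadVT(MVT::v4i32, 64)).getEVTString().c_str());
  // v8f32, one consecutive 32-bit load  -> v8f32 (VMOVSS + SUBREG_TO_REG).
  printf("%s\n", EVT(getVZextLoadVT(MVT::v8f32, 32)).getEVTString().c_str());
  return 0;
}

The 32-bit results are what the new X86vzload patterns below select into MOVSS/MOVD-style loads; the 64-bit case keeps the existing behaviour.
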
(COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)>;
def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
(COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)>;
+ def : Pat<(v4f32 (X86vzload addr:$src)),
+ (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)>;
// MOVSDrm zeros the high parts of the register; represent this
// with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
(v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))),
(SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>;
+ def : Pat<(v8f32 (X86vzload addr:$src)),
+ (SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>;
def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
(v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
(SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
def : Pat<(v16f32 (X86vzmovl (insert_subvector undef,
(v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))),
(SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>;
+ def : Pat<(v16f32 (X86vzload addr:$src)),
+ (SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>;
def : Pat<(v8f64 (X86vzmovl (insert_subvector undef,
(v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
(SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
let AddedComplexity = 20 in {
def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
(VMOVDI2PDIZrm addr:$src)>;
-
def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
(VMOVDI2PDIZrm addr:$src)>;
def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
(VMOVDI2PDIZrm addr:$src)>;
+ def : Pat<(v4i32 (X86vzload addr:$src)),
+ (VMOVDI2PDIZrm addr:$src)>;
+ def : Pat<(v8i32 (X86vzload addr:$src)),
+ (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrm addr:$src), sub_xmm)>;
def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
- (VMOVZPQILo2PQIZrm addr:$src)>;
+ (VMOVZPQILo2PQIZrm addr:$src)>;
def : Pat<(v2f64 (X86vzmovl (v2f64 VR128X:$src))),
- (VMOVZPQILo2PQIZrr VR128X:$src)>;
+ (VMOVZPQILo2PQIZrr VR128X:$src)>;
def : Pat<(v2i64 (X86vzload addr:$src)),
- (VMOVZPQILo2PQIZrm addr:$src)>;
+ (VMOVZPQILo2PQIZrm addr:$src)>;
def : Pat<(v4i64 (X86vzload addr:$src)),
(SUBREG_TO_REG (i64 0), (VMOVZPQILo2PQIZrm addr:$src), sub_xmm)>;
}
def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
(v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
(SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrr GR32:$src), sub_xmm)>;
-
def : Pat<(v16i32 (X86vzmovl (insert_subvector undef,
(v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
(SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrr GR32:$src), sub_xmm)>;
// Use regular 128-bit instructions to match 512-bit scalar_to_vec+zext.
+ def : Pat<(v16i32 (X86vzload addr:$src)),
+ (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrm addr:$src), sub_xmm)>;
def : Pat<(v8i64 (X86vzload addr:$src)),
(SUBREG_TO_REG (i64 0), (VMOVZPQILo2PQIZrm addr:$src), sub_xmm)>;
}
(COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
(COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
+ def : Pat<(v4f32 (X86vzload addr:$src)),
+ (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
// MOVSDrm zeros the high parts of the register; represent this
// with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
(v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))),
(SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
+ def : Pat<(v8f32 (X86vzload addr:$src)),
+ (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
(v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
(SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
(COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
(COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
+ def : Pat<(v4f32 (X86vzload addr:$src)),
+ (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
}
// Extract and store.
(VMOVDI2PDIrm addr:$src)>;
def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
(VMOVDI2PDIrm addr:$src)>;
+ def : Pat<(v4i32 (X86vzload addr:$src)),
+ (VMOVDI2PDIrm addr:$src)>;
def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
(v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))),
(SUBREG_TO_REG (i32 0), (VMOVDI2PDIrm addr:$src), sub_xmm)>;
+ def : Pat<(v8i32 (X86vzload addr:$src)),
+ (SUBREG_TO_REG (i64 0), (VMOVDI2PDIrm addr:$src), sub_xmm)>;
}
// Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext.
def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
(MOVDI2PDIrm addr:$src)>;
def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
(MOVDI2PDIrm addr:$src)>;
+ def : Pat<(v4i32 (X86vzload addr:$src)),
+ (MOVDI2PDIrm addr:$src)>;
}
}