From: Simon Pilgrim
Date: Fri, 18 Dec 2020 15:19:43 +0000 (+0000)
Subject: [X86][AVX] Remove X86ISD::SUBV_BROADCAST (PR38969)
X-Git-Tag: llvmorg-13-init~2960
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=8767f3bb972f5122ad2e914df13470219b68bac4;p=platform%2Fupstream%2Fllvm.git

[X86][AVX] Remove X86ISD::SUBV_BROADCAST (PR38969)

Follow-up to D92645 - remove the remaining places where we create
X86ISD::SUBV_BROADCAST, and fold splatted vector loads to
X86ISD::SUBV_BROADCAST_LOAD instead.

Remove all the X86SubVBroadcast isel patterns, including all the
fallbacks for when memory folding fails.
---

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index d4aa97a..a8e18e0 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -8601,6 +8601,11 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef Elts,
     if (!Subtarget.hasAVX2() && ScalarSize < 32)
       continue;
 
+    // Don't attempt a 1:N subvector broadcast - it should be caught by
+    // combineConcatVectorOps, else will cause infinite loops.
+    if (RepeatSize > ScalarSize && SubElems == 1)
+      continue;
+
     bool Match = true;
     SmallVector RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT));
     for (unsigned i = 0; i != NumElems && Match; ++i) {
@@ -8632,9 +8637,14 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef Elts,
     if (TLI.isTypeLegal(BroadcastVT)) {
       if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
               RepeatVT, RepeatedLoads, DL, DAG, Subtarget, isAfterLegalize)) {
-        unsigned Opcode = RepeatSize > ScalarSize ? X86ISD::SUBV_BROADCAST
-                                                  : X86ISD::VBROADCAST;
-        SDValue Broadcast = DAG.getNode(Opcode, DL, BroadcastVT, RepeatLoad);
+        SDValue Broadcast = RepeatLoad;
+        if (RepeatSize > ScalarSize) {
+          while (Broadcast.getValueSizeInBits() < VT.getSizeInBits())
+            Broadcast = concatSubVectors(Broadcast, Broadcast, DAG, DL);
+        } else {
+          Broadcast =
+              DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, RepeatLoad);
+        }
         return DAG.getBitcast(VT, Broadcast);
       }
     }
@@ -30945,7 +30955,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(VBROADCAST)
   NODE_NAME_CASE(VBROADCAST_LOAD)
   NODE_NAME_CASE(VBROADCASTM)
-  NODE_NAME_CASE(SUBV_BROADCAST)
   NODE_NAME_CASE(SUBV_BROADCAST_LOAD)
   NODE_NAME_CASE(VPERMILPV)
   NODE_NAME_CASE(VPERMILPI)
@@ -38088,20 +38097,6 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
                                              TLO.DAG, DL, ExtSizeInBits));
     }
     // Subvector broadcast.
-    case X86ISD::SUBV_BROADCAST: {
-      SDLoc DL(Op);
-      SDValue Src = Op.getOperand(0);
-      if (Src.getValueSizeInBits() > ExtSizeInBits)
-        Src = extractSubVector(Src, 0, TLO.DAG, DL, ExtSizeInBits);
-      else if (Src.getValueSizeInBits() < ExtSizeInBits) {
-        MVT SrcSVT = Src.getSimpleValueType().getScalarType();
-        MVT SrcVT =
-            MVT::getVectorVT(SrcSVT, ExtSizeInBits / SrcSVT.getSizeInBits());
-        Src = TLO.DAG.getNode(X86ISD::SUBV_BROADCAST, DL, SrcVT, Src);
-      }
-      return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Src, 0,
-                                               TLO.DAG, DL, ExtSizeInBits));
-    }
     case X86ISD::SUBV_BROADCAST_LOAD: {
       auto *MemIntr = cast(Op);
       EVT MemVT = MemIntr->getMemoryVT();
@@ -48873,41 +48868,47 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
   SDValue Op0 = Ops[0];
   bool IsSplat = llvm::all_of(Ops, [&Op0](SDValue Op) { return Op == Op0; });
 
-  // Fold subvector loads into one.
-  // If needed, look through bitcasts to get to the load.
-  if (auto *FirstLd = dyn_cast(peekThroughBitcasts(Op0))) {
-    bool Fast;
-    const X86TargetLowering *TLI = Subtarget.getTargetLowering();
-    if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
-                                *FirstLd->getMemOperand(), &Fast) &&
-        Fast) {
-      if (SDValue Ld =
-              EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false))
-        return Ld;
-    }
-  }
-
   // Repeated subvectors.
-  if (IsSplat) {
+  if (IsSplat &&
+      (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) {
     // If this broadcast is inserted into both halves, use a larger broadcast.
     if (Op0.getOpcode() == X86ISD::VBROADCAST)
       return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0));
 
-    // If this broadcast_load is inserted into both halves, use a larger
-    // broadcast_load. Update other uses to use an extracted subvector.
-    if (Op0.getOpcode() == X86ISD::VBROADCAST_LOAD) {
+    // If this scalar/subvector broadcast_load is inserted into both halves, use
+    // a larger broadcast_load. Update other uses to use an extracted subvector.
+    if (Op0.getOpcode() == X86ISD::VBROADCAST_LOAD ||
+        Op0.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
       auto *MemIntr = cast(Op0);
       SDVTList Tys = DAG.getVTList(VT, MVT::Other);
       SDValue Ops[] = {MemIntr->getChain(), MemIntr->getBasePtr()};
-      SDValue BcastLd = DAG.getMemIntrinsicNode(
-          X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MemIntr->getMemoryVT(),
-          MemIntr->getMemOperand());
+      SDValue BcastLd = DAG.getMemIntrinsicNode(Op0.getOpcode(), DL, Tys, Ops,
+                                                MemIntr->getMemoryVT(),
+                                                MemIntr->getMemOperand());
       DAG.ReplaceAllUsesOfValueWith(
           Op0, extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits()));
       DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
       return BcastLd;
     }
 
+    // If this is a simple subvector load repeated across multiple lanes, then
+    // broadcast the load. Update other uses to use an extracted subvector.
+    if (auto *Ld = dyn_cast(Op0)) {
+      if (Ld->isSimple() && !Ld->isNonTemporal() &&
+          Ld->getExtensionType() == ISD::NON_EXTLOAD) {
+        SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+        SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
+        SDValue BcastLd =
+            DAG.getMemIntrinsicNode(X86ISD::SUBV_BROADCAST_LOAD, DL, Tys, Ops,
+                                    Ld->getMemoryVT(), Ld->getMemOperand());
+        DAG.ReplaceAllUsesOfValueWith(
+            Op0,
+            extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits()));
+        DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), BcastLd.getValue(1));
+        return BcastLd;
+      }
+    }
+
     // concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
     if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 &&
         (Subtarget.hasAVX2() || MayFoldLoad(Op0.getOperand(0))))
@@ -49110,6 +49111,20 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
     }
   }
 
+  // Fold subvector loads into one.
+  // If needed, look through bitcasts to get to the load.
+  if (auto *FirstLd = dyn_cast(peekThroughBitcasts(Op0))) {
+    bool Fast;
+    const X86TargetLowering *TLI = Subtarget.getTargetLowering();
+    if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
+                                *FirstLd->getMemOperand(), &Fast) &&
+        Fast) {
+      if (SDValue Ld =
+              EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false))
+        return Ld;
+    }
+  }
+
   return SDValue();
 }
 
@@ -49399,10 +49414,10 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
                      InVec.getOpcode() == X86ISD::VBROADCAST_LOAD))
     return extractSubVector(InVec, 0, DAG, SDLoc(N), SizeInBits);
 
-  // If we're extracting a broadcasted subvector, just use the source.
- if (InVec.getOpcode() == X86ISD::SUBV_BROADCAST && - InVec.getOperand(0).getValueType() == VT) - return InVec.getOperand(0); + // If we're extracting a broadcasted subvector, just use the lowest subvector. + if (IdxVal != 0 && InVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD && + cast(InVec)->getMemoryVT() == VT) + return extractSubVector(InVec, 0, DAG, SDLoc(N), SizeInBits); // Attempt to extract from the source of a shuffle vector. if ((InSizeInBits % SizeInBits) == 0 && diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index e3f9ce8..faf2cc6 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -505,8 +505,6 @@ namespace llvm { VBROADCAST, // Broadcast mask to vector. VBROADCASTM, - // Broadcast subvector to vector. - SUBV_BROADCAST, /// SSE4A Extraction and Insertion. EXTRQI, diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index c6367a0..2a303dc 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -1414,11 +1414,12 @@ defm VPBROADCASTQ : avx512_int_broadcast_rm_vl<0x59, "vpbroadcastq", avx512vl_i64_info, HasAVX512, 1>, VEX_W1X; multiclass avx512_subvec_broadcast_rm opc, string OpcodeStr, - X86VectorVTInfo _Dst, X86VectorVTInfo _Src> { + SDPatternOperator OpNode, + X86VectorVTInfo _Dst, + X86VectorVTInfo _Src> { defm rm : AVX512_maskable, + (_Dst.VT (OpNode addr:$src))>, Sched<[SchedWriteShuffle.YMM.Folded]>, AVX5128IBase, EVEX; } @@ -1427,13 +1428,14 @@ multiclass avx512_subvec_broadcast_rm opc, string OpcodeStr, // the unmasked patterns so that we only use the DQ instructions when masking // is requested. multiclass avx512_subvec_broadcast_rm_dq opc, string OpcodeStr, - X86VectorVTInfo _Dst, X86VectorVTInfo _Src> { + SDPatternOperator OpNode, + X86VectorVTInfo _Dst, + X86VectorVTInfo _Src> { let hasSideEffects = 0, mayLoad = 1 in defm rm : AVX512_maskable_split, + (_Dst.VT (OpNode addr:$src))>, Sched<[SchedWriteShuffle.YMM.Folded]>, AVX5128IBase, EVEX; } @@ -1443,16 +1445,16 @@ multiclass avx512_subvec_broadcast_rm_dq opc, string OpcodeStr, // defm VBROADCASTI32X4 : avx512_subvec_broadcast_rm<0x5a, "vbroadcasti32x4", - v16i32_info, v4i32x_info>, + X86SubVBroadcastld128, v16i32_info, v4i32x_info>, EVEX_V512, EVEX_CD8<32, CD8VT4>; defm VBROADCASTF32X4 : avx512_subvec_broadcast_rm<0x1a, "vbroadcastf32x4", - v16f32_info, v4f32x_info>, + X86SubVBroadcastld128, v16f32_info, v4f32x_info>, EVEX_V512, EVEX_CD8<32, CD8VT4>; defm VBROADCASTI64X4 : avx512_subvec_broadcast_rm<0x5b, "vbroadcasti64x4", - v8i64_info, v4i64x_info>, VEX_W, + X86SubVBroadcastld256, v8i64_info, v4i64x_info>, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VT4>; defm VBROADCASTF64X4 : avx512_subvec_broadcast_rm<0x1b, "vbroadcastf64x4", - v8f64_info, v4f64x_info>, VEX_W, + X86SubVBroadcastld256, v8f64_info, v4f64x_info>, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VT4>; let Predicates = [HasAVX512] in { @@ -1482,87 +1484,48 @@ def : Pat<(v32i16 (X86SubVBroadcastld128 addr:$src)), def : Pat<(v64i8 (X86SubVBroadcastld128 addr:$src)), (VBROADCASTI32X4rm addr:$src)>; -def : Pat<(v16f32 (X86SubVBroadcast (loadv8f32 addr:$src))), - (VBROADCASTF64X4rm addr:$src)>; -def : Pat<(v16i32 (X86SubVBroadcast (loadv8i32 addr:$src))), - (VBROADCASTI64X4rm addr:$src)>; -def : Pat<(v32i16 (X86SubVBroadcast (loadv16i16 addr:$src))), - (VBROADCASTI64X4rm addr:$src)>; -def : Pat<(v64i8 (X86SubVBroadcast (loadv32i8 addr:$src))), - (VBROADCASTI64X4rm addr:$src)>; - -// Provide fallback in case the 
load node that is used in the patterns above -// is used by additional users, which prevents the pattern selection. -def : Pat<(v8f64 (X86SubVBroadcast (v4f64 VR256X:$src))), - (VINSERTF64x4Zrr (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), - (v4f64 VR256X:$src), 1)>; -def : Pat<(v16f32 (X86SubVBroadcast (v8f32 VR256X:$src))), - (VINSERTF64x4Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), - (v8f32 VR256X:$src), 1)>; -def : Pat<(v8i64 (X86SubVBroadcast (v4i64 VR256X:$src))), - (VINSERTI64x4Zrr (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), - (v4i64 VR256X:$src), 1)>; -def : Pat<(v16i32 (X86SubVBroadcast (v8i32 VR256X:$src))), - (VINSERTI64x4Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), - (v8i32 VR256X:$src), 1)>; -def : Pat<(v32i16 (X86SubVBroadcast (v16i16 VR256X:$src))), - (VINSERTI64x4Zrr (INSERT_SUBREG (v32i16 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), - (v16i16 VR256X:$src), 1)>; -def : Pat<(v64i8 (X86SubVBroadcast (v32i8 VR256X:$src))), - (VINSERTI64x4Zrr (INSERT_SUBREG (v64i8 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), - (v32i8 VR256X:$src), 1)>; - -def : Pat<(v8f64 (X86SubVBroadcast (loadv2f64 addr:$src))), - (VBROADCASTF32X4rm addr:$src)>; -def : Pat<(v8i64 (X86SubVBroadcast (loadv2i64 addr:$src))), - (VBROADCASTI32X4rm addr:$src)>; -def : Pat<(v32i16 (X86SubVBroadcast (loadv8i16 addr:$src))), - (VBROADCASTI32X4rm addr:$src)>; -def : Pat<(v64i8 (X86SubVBroadcast (loadv16i8 addr:$src))), - (VBROADCASTI32X4rm addr:$src)>; - // Patterns for selects of bitcasted operations. def : Pat<(vselect_mask VK16WM:$mask, - (bc_v16f32 (v8f64 (X86SubVBroadcast (loadv2f64 addr:$src)))), + (bc_v16f32 (v8f64 (X86SubVBroadcastld128 addr:$src))), (v16f32 immAllZerosV)), (VBROADCASTF32X4rmkz VK16WM:$mask, addr:$src)>; def : Pat<(vselect_mask VK16WM:$mask, - (bc_v16f32 (v8f64 (X86SubVBroadcast (loadv2f64 addr:$src)))), + (bc_v16f32 (v8f64 (X86SubVBroadcastld128 addr:$src))), VR512:$src0), (VBROADCASTF32X4rmk VR512:$src0, VK16WM:$mask, addr:$src)>; def : Pat<(vselect_mask VK16WM:$mask, - (bc_v16i32 (v8i64 (X86SubVBroadcast (loadv2i64 addr:$src)))), + (bc_v16i32 (v8i64 (X86SubVBroadcastld128 addr:$src))), (v16i32 immAllZerosV)), (VBROADCASTI32X4rmkz VK16WM:$mask, addr:$src)>; def : Pat<(vselect_mask VK16WM:$mask, - (bc_v16i32 (v8i64 (X86SubVBroadcast (loadv2i64 addr:$src)))), + (bc_v16i32 (v8i64 (X86SubVBroadcastld128 addr:$src))), VR512:$src0), (VBROADCASTI32X4rmk VR512:$src0, VK16WM:$mask, addr:$src)>; def : Pat<(vselect_mask VK8WM:$mask, - (bc_v8f64 (v16f32 (X86SubVBroadcast (loadv8f32 addr:$src)))), + (bc_v8f64 (v16f32 (X86SubVBroadcastld256 addr:$src))), (v8f64 immAllZerosV)), (VBROADCASTF64X4rmkz VK8WM:$mask, addr:$src)>; def : Pat<(vselect_mask VK8WM:$mask, - (bc_v8f64 (v16f32 (X86SubVBroadcast (loadv8f32 addr:$src)))), + (bc_v8f64 (v16f32 (X86SubVBroadcastld256 addr:$src))), VR512:$src0), (VBROADCASTF64X4rmk VR512:$src0, VK8WM:$mask, addr:$src)>; def : Pat<(vselect_mask VK8WM:$mask, - (bc_v8i64 (v16i32 (X86SubVBroadcast (loadv8i32 addr:$src)))), + (bc_v8i64 (v16i32 (X86SubVBroadcastld256 addr:$src))), (v8i64 immAllZerosV)), (VBROADCASTI64X4rmkz VK8WM:$mask, addr:$src)>; def : Pat<(vselect_mask VK8WM:$mask, - (bc_v8i64 (v16i32 (X86SubVBroadcast (loadv8i32 addr:$src)))), + (bc_v8i64 (v16i32 (X86SubVBroadcastld256 addr:$src))), VR512:$src0), (VBROADCASTI64X4rmk VR512:$src0, VK8WM:$mask, addr:$src)>; } let Predicates = [HasVLX] in { defm VBROADCASTI32X4Z256 : avx512_subvec_broadcast_rm<0x5a, "vbroadcasti32x4", - v8i32x_info, v4i32x_info>, + 
X86SubVBroadcastld128, v8i32x_info, v4i32x_info>, EVEX_V256, EVEX_CD8<32, CD8VT4>; defm VBROADCASTF32X4Z256 : avx512_subvec_broadcast_rm<0x1a, "vbroadcastf32x4", - v8f32x_info, v4f32x_info>, + X86SubVBroadcastld128, v8f32x_info, v4f32x_info>, EVEX_V256, EVEX_CD8<32, CD8VT4>; def : Pat<(v4f64 (X86SubVBroadcastld128 addr:$src)), @@ -1578,129 +1541,98 @@ def : Pat<(v16i16 (X86SubVBroadcastld128 addr:$src)), def : Pat<(v32i8 (X86SubVBroadcastld128 addr:$src)), (VBROADCASTI32X4Z256rm addr:$src)>; -def : Pat<(v4f64 (X86SubVBroadcast (loadv2f64 addr:$src))), - (VBROADCASTF32X4Z256rm addr:$src)>; -def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))), - (VBROADCASTI32X4Z256rm addr:$src)>; -def : Pat<(v16i16 (X86SubVBroadcast (loadv8i16 addr:$src))), - (VBROADCASTI32X4Z256rm addr:$src)>; -def : Pat<(v32i8 (X86SubVBroadcast (loadv16i8 addr:$src))), - (VBROADCASTI32X4Z256rm addr:$src)>; - // Patterns for selects of bitcasted operations. def : Pat<(vselect_mask VK8WM:$mask, - (bc_v8f32 (v4f64 (X86SubVBroadcast (loadv2f64 addr:$src)))), + (bc_v8f32 (v4f64 (X86SubVBroadcastld128 addr:$src))), (v8f32 immAllZerosV)), (VBROADCASTF32X4Z256rmkz VK8WM:$mask, addr:$src)>; def : Pat<(vselect_mask VK8WM:$mask, - (bc_v8f32 (v4f64 (X86SubVBroadcast (loadv2f64 addr:$src)))), + (bc_v8f32 (v4f64 (X86SubVBroadcastld128 addr:$src))), VR256X:$src0), (VBROADCASTF32X4Z256rmk VR256X:$src0, VK8WM:$mask, addr:$src)>; def : Pat<(vselect_mask VK8WM:$mask, - (bc_v8i32 (v4i64 (X86SubVBroadcast (loadv2i64 addr:$src)))), + (bc_v8i32 (v4i64 (X86SubVBroadcastld128 addr:$src))), (v8i32 immAllZerosV)), (VBROADCASTI32X4Z256rmkz VK8WM:$mask, addr:$src)>; def : Pat<(vselect_mask VK8WM:$mask, - (bc_v8i32 (v4i64 (X86SubVBroadcast (loadv2i64 addr:$src)))), + (bc_v8i32 (v4i64 (X86SubVBroadcastld128 addr:$src))), VR256X:$src0), (VBROADCASTI32X4Z256rmk VR256X:$src0, VK8WM:$mask, addr:$src)>; - - -// Provide fallback in case the load node that is used in the patterns above -// is used by additional users, which prevents the pattern selection. 
-def : Pat<(v4f64 (X86SubVBroadcast (v2f64 VR128X:$src))), - (VINSERTF32x4Z256rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), - (v2f64 VR128X:$src), 1)>; -def : Pat<(v8f32 (X86SubVBroadcast (v4f32 VR128X:$src))), - (VINSERTF32x4Z256rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), - (v4f32 VR128X:$src), 1)>; -def : Pat<(v4i64 (X86SubVBroadcast (v2i64 VR128X:$src))), - (VINSERTI32x4Z256rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), - (v2i64 VR128X:$src), 1)>; -def : Pat<(v8i32 (X86SubVBroadcast (v4i32 VR128X:$src))), - (VINSERTI32x4Z256rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), - (v4i32 VR128X:$src), 1)>; -def : Pat<(v16i16 (X86SubVBroadcast (v8i16 VR128X:$src))), - (VINSERTI32x4Z256rr (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), - (v8i16 VR128X:$src), 1)>; -def : Pat<(v32i8 (X86SubVBroadcast (v16i8 VR128X:$src))), - (VINSERTI32x4Z256rr (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), - (v16i8 VR128X:$src), 1)>; } let Predicates = [HasVLX, HasDQI] in { defm VBROADCASTI64X2Z128 : avx512_subvec_broadcast_rm_dq<0x5a, "vbroadcasti64x2", - v4i64x_info, v2i64x_info>, VEX_W1X, + X86SubVBroadcastld128, v4i64x_info, v2i64x_info>, VEX_W1X, EVEX_V256, EVEX_CD8<64, CD8VT2>; defm VBROADCASTF64X2Z128 : avx512_subvec_broadcast_rm_dq<0x1a, "vbroadcastf64x2", - v4f64x_info, v2f64x_info>, VEX_W1X, + X86SubVBroadcastld128, v4f64x_info, v2f64x_info>, VEX_W1X, EVEX_V256, EVEX_CD8<64, CD8VT2>; // Patterns for selects of bitcasted operations. def : Pat<(vselect_mask VK4WM:$mask, - (bc_v4f64 (v8f32 (X86SubVBroadcast (loadv4f32 addr:$src)))), + (bc_v4f64 (v8f32 (X86SubVBroadcastld128 addr:$src))), (v4f64 immAllZerosV)), (VBROADCASTF64X2Z128rmkz VK4WM:$mask, addr:$src)>; def : Pat<(vselect_mask VK4WM:$mask, - (bc_v4f64 (v8f32 (X86SubVBroadcast (loadv4f32 addr:$src)))), + (bc_v4f64 (v8f32 (X86SubVBroadcastld128 addr:$src))), VR256X:$src0), (VBROADCASTF64X2Z128rmk VR256X:$src0, VK4WM:$mask, addr:$src)>; def : Pat<(vselect_mask VK4WM:$mask, - (bc_v4i64 (v8i32 (X86SubVBroadcast (loadv4i32 addr:$src)))), + (bc_v4i64 (v8i32 (X86SubVBroadcastld128 addr:$src))), (v4i64 immAllZerosV)), (VBROADCASTI64X2Z128rmkz VK4WM:$mask, addr:$src)>; def : Pat<(vselect_mask VK4WM:$mask, - (bc_v4i64 (v8i32 (X86SubVBroadcast (loadv4i32 addr:$src)))), + (bc_v4i64 (v8i32 (X86SubVBroadcastld128 addr:$src))), VR256X:$src0), (VBROADCASTI64X2Z128rmk VR256X:$src0, VK4WM:$mask, addr:$src)>; } let Predicates = [HasDQI] in { defm VBROADCASTI64X2 : avx512_subvec_broadcast_rm_dq<0x5a, "vbroadcasti64x2", - v8i64_info, v2i64x_info>, VEX_W, + X86SubVBroadcastld128, v8i64_info, v2i64x_info>, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VT2>; defm VBROADCASTI32X8 : avx512_subvec_broadcast_rm_dq<0x5b, "vbroadcasti32x8", - v16i32_info, v8i32x_info>, + X86SubVBroadcastld256, v16i32_info, v8i32x_info>, EVEX_V512, EVEX_CD8<32, CD8VT8>; defm VBROADCASTF64X2 : avx512_subvec_broadcast_rm_dq<0x1a, "vbroadcastf64x2", - v8f64_info, v2f64x_info>, VEX_W, + X86SubVBroadcastld128, v8f64_info, v2f64x_info>, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VT2>; defm VBROADCASTF32X8 : avx512_subvec_broadcast_rm_dq<0x1b, "vbroadcastf32x8", - v16f32_info, v8f32x_info>, + X86SubVBroadcastld256, v16f32_info, v8f32x_info>, EVEX_V512, EVEX_CD8<32, CD8VT8>; // Patterns for selects of bitcasted operations. 
def : Pat<(vselect_mask VK16WM:$mask, - (bc_v16f32 (v8f64 (X86SubVBroadcast (loadv4f64 addr:$src)))), + (bc_v16f32 (v8f64 (X86SubVBroadcastld256 addr:$src))), (v16f32 immAllZerosV)), (VBROADCASTF32X8rmkz VK16WM:$mask, addr:$src)>; def : Pat<(vselect_mask VK16WM:$mask, - (bc_v16f32 (v8f64 (X86SubVBroadcast (loadv4f64 addr:$src)))), + (bc_v16f32 (v8f64 (X86SubVBroadcastld256 addr:$src))), VR512:$src0), (VBROADCASTF32X8rmk VR512:$src0, VK16WM:$mask, addr:$src)>; def : Pat<(vselect_mask VK16WM:$mask, - (bc_v16i32 (v8i64 (X86SubVBroadcast (loadv4i64 addr:$src)))), + (bc_v16i32 (v8i64 (X86SubVBroadcastld256 addr:$src))), (v16i32 immAllZerosV)), (VBROADCASTI32X8rmkz VK16WM:$mask, addr:$src)>; def : Pat<(vselect_mask VK16WM:$mask, - (bc_v16i32 (v8i64 (X86SubVBroadcast (loadv4i64 addr:$src)))), + (bc_v16i32 (v8i64 (X86SubVBroadcastld256 addr:$src))), VR512:$src0), (VBROADCASTI32X8rmk VR512:$src0, VK16WM:$mask, addr:$src)>; def : Pat<(vselect_mask VK8WM:$mask, - (bc_v8f64 (v16f32 (X86SubVBroadcast (loadv4f32 addr:$src)))), + (bc_v8f64 (v16f32 (X86SubVBroadcastld128 addr:$src))), (v8f64 immAllZerosV)), (VBROADCASTF64X2rmkz VK8WM:$mask, addr:$src)>; def : Pat<(vselect_mask VK8WM:$mask, - (bc_v8f64 (v16f32 (X86SubVBroadcast (loadv4f32 addr:$src)))), + (bc_v8f64 (v16f32 (X86SubVBroadcastld128 addr:$src))), VR512:$src0), (VBROADCASTF64X2rmk VR512:$src0, VK8WM:$mask, addr:$src)>; def : Pat<(vselect_mask VK8WM:$mask, - (bc_v8i64 (v16i32 (X86SubVBroadcast (loadv4i32 addr:$src)))), + (bc_v8i64 (v16i32 (X86SubVBroadcastld128 addr:$src))), (v8i64 immAllZerosV)), (VBROADCASTI64X2rmkz VK8WM:$mask, addr:$src)>; def : Pat<(vselect_mask VK8WM:$mask, - (bc_v8i64 (v16i32 (X86SubVBroadcast (loadv4i32 addr:$src)))), + (bc_v8i64 (v16i32 (X86SubVBroadcastld128 addr:$src))), VR512:$src0), (VBROADCASTI64X2rmk VR512:$src0, VK8WM:$mask, addr:$src)>; } @@ -10518,39 +10450,6 @@ defm VSHUFI32X4 : avx512_shuff_packed_128<"vshufi32x4", WriteFShuffle256, defm VSHUFI64X2 : avx512_shuff_packed_128<"vshufi64x2", WriteFShuffle256, avx512vl_i64_info, avx512vl_i64_info, 0x43, "VPERM2I128">, AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W; -let Predicates = [HasAVX512] in { -// Provide fallback in case the load node that is used in the broadcast -// patterns above is used by additional users, which prevents the pattern -// selection. 
-def : Pat<(v8f64 (X86SubVBroadcast (v2f64 VR128X:$src))), - (VSHUFF64X2Zrri (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), - (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), - 0)>; -def : Pat<(v8i64 (X86SubVBroadcast (v2i64 VR128X:$src))), - (VSHUFI64X2Zrri (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), - (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), - 0)>; - -def : Pat<(v16f32 (X86SubVBroadcast (v4f32 VR128X:$src))), - (VSHUFF32X4Zrri (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), - (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), - 0)>; -def : Pat<(v16i32 (X86SubVBroadcast (v4i32 VR128X:$src))), - (VSHUFI32X4Zrri (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), - (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), - 0)>; - -def : Pat<(v32i16 (X86SubVBroadcast (v8i16 VR128X:$src))), - (VSHUFI32X4Zrri (INSERT_SUBREG (v32i16 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), - (INSERT_SUBREG (v32i16 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), - 0)>; - -def : Pat<(v64i8 (X86SubVBroadcast (v16i8 VR128X:$src))), - (VSHUFI32X4Zrri (INSERT_SUBREG (v64i8 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), - (INSERT_SUBREG (v64i8 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), - 0)>; -} - multiclass avx512_valign opc, string OpcodeStr, X86FoldableSchedWrite sched, X86VectorVTInfo _>{ // NOTE: EVEX2VEXOverride changed back to Unset for 256-bit at the diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td index 98380b4..9988395 100644 --- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -495,10 +495,6 @@ def X86Vfpclasss : SDNode<"X86ISD::VFPCLASSS", SDTypeProfile<1, 2, [SDTCisVT<0, v1i1>, SDTCisFP<1>, SDTCisVT<2, i32>]>,[]>; -def X86SubVBroadcast : SDNode<"X86ISD::SUBV_BROADCAST", - SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, - SDTCisSubVecOfVec<1, 0>]>, []>; - def X86VBroadcast : SDNode<"X86ISD::VBROADCAST", SDTVBroadcast>; def X86VBroadcastm : SDNode<"X86ISD::VBROADCASTM", SDTVBroadcastm>; diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td index d4fdac0..071c638 100644 --- a/llvm/lib/Target/X86/X86InstrSSE.td +++ b/llvm/lib/Target/X86/X86InstrSSE.td @@ -7020,16 +7020,8 @@ def : Pat<(v4f64 (X86SubVBroadcastld128 addr:$src)), (VBROADCASTF128 addr:$src)>; def : Pat<(v8f32 (X86SubVBroadcastld128 addr:$src)), (VBROADCASTF128 addr:$src)>; - -def : Pat<(v4f64 (X86SubVBroadcast (loadv2f64 addr:$src))), - (VBROADCASTF128 addr:$src)>; -def : Pat<(v8f32 (X86SubVBroadcast (loadv4f32 addr:$src))), - (VBROADCASTF128 addr:$src)>; -} - // NOTE: We're using FP instructions here, but execution domain fixing can // convert to integer when profitable. 
-let Predicates = [HasAVX, NoVLX] in { def : Pat<(v4i64 (X86SubVBroadcastld128 addr:$src)), (VBROADCASTF128 addr:$src)>; def : Pat<(v8i32 (X86SubVBroadcastld128 addr:$src)), @@ -7038,15 +7030,6 @@ def : Pat<(v16i16 (X86SubVBroadcastld128 addr:$src)), (VBROADCASTF128 addr:$src)>; def : Pat<(v32i8 (X86SubVBroadcastld128 addr:$src)), (VBROADCASTF128 addr:$src)>; - -def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))), - (VBROADCASTF128 addr:$src)>; -def : Pat<(v8i32 (X86SubVBroadcast (loadv4i32 addr:$src))), - (VBROADCASTF128 addr:$src)>; -def : Pat<(v16i16 (X86SubVBroadcast (loadv8i16 addr:$src))), - (VBROADCASTF128 addr:$src)>; -def : Pat<(v32i8 (X86SubVBroadcast (loadv16i8 addr:$src))), - (VBROADCASTF128 addr:$src)>; } //===----------------------------------------------------------------------===// @@ -7846,37 +7829,6 @@ let Predicates = [HasAVX2] in { } //===----------------------------------------------------------------------===// -// SubVector Broadcasts -// Provide fallback in case the load node that is used in the patterns above -// is used by additional users, which prevents the pattern selection. - -let Predicates = [HasAVX, NoVLX] in { -def : Pat<(v4f64 (X86SubVBroadcast (v2f64 VR128:$src))), - (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128:$src, sub_xmm), - (v2f64 VR128:$src), 1)>; -def : Pat<(v8f32 (X86SubVBroadcast (v4f32 VR128:$src))), - (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128:$src, sub_xmm), - (v4f32 VR128:$src), 1)>; -} - -// NOTE: We're using FP instructions here, but execution domain fixing can -// convert to integer when profitable. -let Predicates = [HasAVX, NoVLX] in { -def : Pat<(v4i64 (X86SubVBroadcast (v2i64 VR128:$src))), - (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm), - (v2i64 VR128:$src), 1)>; -def : Pat<(v8i32 (X86SubVBroadcast (v4i32 VR128:$src))), - (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm), - (v4i32 VR128:$src), 1)>; -def : Pat<(v16i16 (X86SubVBroadcast (v8i16 VR128:$src))), - (VINSERTF128rr (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm), - (v8i16 VR128:$src), 1)>; -def : Pat<(v32i8 (X86SubVBroadcast (v16i8 VR128:$src))), - (VINSERTF128rr (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm), - (v16i8 VR128:$src), 1)>; -} - -//===----------------------------------------------------------------------===// // Variable Bit Shifts // multiclass avx2_var_shift opc, string OpcodeStr, SDNode OpNode, diff --git a/llvm/test/CodeGen/X86/avx-vbroadcastf128.ll b/llvm/test/CodeGen/X86/avx-vbroadcastf128.ll index 9bd4179..5fce69b 100644 --- a/llvm/test/CodeGen/X86/avx-vbroadcastf128.ll +++ b/llvm/test/CodeGen/X86/avx-vbroadcastf128.ll @@ -128,16 +128,14 @@ define <4 x double> @test_broadcast_2f64_4f64_reuse(<2 x double>* %p0, <2 x doub ; X32: # %bb.0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: vmovaps (%ecx), %xmm1 -; X32-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm0 -; X32-NEXT: vmovaps %xmm1, (%eax) +; X32-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X32-NEXT: vmovaps %xmm0, (%eax) ; X32-NEXT: retl ; ; X64-LABEL: test_broadcast_2f64_4f64_reuse: ; X64: # %bb.0: -; X64-NEXT: vmovaps (%rdi), %xmm1 -; X64-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm0 -; X64-NEXT: vmovaps %xmm1, (%rsi) +; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-NEXT: vmovaps %xmm0, (%rsi) ; X64-NEXT: retq %1 = load <2 x double>, <2 x double>* %p0 %2 = shufflevector <2 x double> %1, <2 x double> undef, <4 x i32> @@ 
-150,16 +148,14 @@ define <4 x i64> @test_broadcast_2i64_4i64_reuse(<2 x i64>* %p0, <2 x i64>* %p1) ; X32: # %bb.0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: vmovaps (%ecx), %xmm1 -; X32-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm0 -; X32-NEXT: vmovaps %xmm1, (%eax) +; X32-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X32-NEXT: vmovaps %xmm0, (%eax) ; X32-NEXT: retl ; ; X64-LABEL: test_broadcast_2i64_4i64_reuse: ; X64: # %bb.0: -; X64-NEXT: vmovaps (%rdi), %xmm1 -; X64-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm0 -; X64-NEXT: vmovaps %xmm1, (%rsi) +; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-NEXT: vmovaps %xmm0, (%rsi) ; X64-NEXT: retq %1 = load <2 x i64>, <2 x i64>* %p0 %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <4 x i32> @@ -172,16 +168,14 @@ define <8 x float> @test_broadcast_4f32_8f32_reuse(<4 x float>* %p0, <4 x float> ; X32: # %bb.0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: vmovaps (%ecx), %xmm1 -; X32-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm0 -; X32-NEXT: vmovaps %xmm1, (%eax) +; X32-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X32-NEXT: vmovaps %xmm0, (%eax) ; X32-NEXT: retl ; ; X64-LABEL: test_broadcast_4f32_8f32_reuse: ; X64: # %bb.0: -; X64-NEXT: vmovaps (%rdi), %xmm1 -; X64-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm0 -; X64-NEXT: vmovaps %xmm1, (%rsi) +; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-NEXT: vmovaps %xmm0, (%rsi) ; X64-NEXT: retq %1 = load <4 x float>, <4 x float>* %p0 %2 = shufflevector <4 x float> %1, <4 x float> undef, <8 x i32> @@ -194,16 +188,14 @@ define <8 x i32> @test_broadcast_4i32_8i32_reuse(<4 x i32>* %p0, <4 x i32>* %p1) ; X32: # %bb.0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: vmovaps (%ecx), %xmm1 -; X32-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm0 -; X32-NEXT: vmovaps %xmm1, (%eax) +; X32-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X32-NEXT: vmovaps %xmm0, (%eax) ; X32-NEXT: retl ; ; X64-LABEL: test_broadcast_4i32_8i32_reuse: ; X64: # %bb.0: -; X64-NEXT: vmovaps (%rdi), %xmm1 -; X64-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm0 -; X64-NEXT: vmovaps %xmm1, (%rsi) +; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-NEXT: vmovaps %xmm0, (%rsi) ; X64-NEXT: retq %1 = load <4 x i32>, <4 x i32>* %p0 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> @@ -216,16 +208,14 @@ define <16 x i16> @test_broadcast_8i16_16i16_reuse(<8 x i16> *%p0, <8 x i16> *%p ; X32: # %bb.0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: vmovaps (%ecx), %xmm1 -; X32-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm0 -; X32-NEXT: vmovaps %xmm1, (%eax) +; X32-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X32-NEXT: vmovaps %xmm0, (%eax) ; X32-NEXT: retl ; ; X64-LABEL: test_broadcast_8i16_16i16_reuse: ; X64: # %bb.0: -; X64-NEXT: vmovaps (%rdi), %xmm1 -; X64-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm0 -; X64-NEXT: vmovaps %xmm1, (%rsi) +; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-NEXT: vmovaps %xmm0, (%rsi) ; X64-NEXT: retq %1 = load <8 x i16>, <8 x i16> *%p0 %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <16 x i32> @@ -238,16 +228,14 @@ define <32 x i8> @test_broadcast_16i8_32i8_reuse(<16 x i8> *%p0, <16 x i8> *%p1) ; X32: # %bb.0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: vmovaps (%ecx), %xmm1 -; X32-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm0 -; 
X32-NEXT: vmovaps %xmm1, (%eax) +; X32-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X32-NEXT: vmovaps %xmm0, (%eax) ; X32-NEXT: retl ; ; X64-LABEL: test_broadcast_16i8_32i8_reuse: ; X64: # %bb.0: -; X64-NEXT: vmovaps (%rdi), %xmm1 -; X64-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm0 -; X64-NEXT: vmovaps %xmm1, (%rsi) +; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-NEXT: vmovaps %xmm0, (%rsi) ; X64-NEXT: retq %1 = load <16 x i8>, <16 x i8> *%p0 %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <32 x i32> diff --git a/llvm/test/CodeGen/X86/avx2-vbroadcasti128.ll b/llvm/test/CodeGen/X86/avx2-vbroadcasti128.ll index 996e679..02e0b96 100644 --- a/llvm/test/CodeGen/X86/avx2-vbroadcasti128.ll +++ b/llvm/test/CodeGen/X86/avx2-vbroadcasti128.ll @@ -121,17 +121,15 @@ define <4 x double> @test_broadcast_2f64_4f64_reuse(<2 x double>* %p0, <2 x doub ; X32: # %bb.0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: vmovapd (%ecx), %xmm1 -; X32-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm0 -; X32-NEXT: vaddpd {{\.LCPI.*}}, %ymm0, %ymm0 +; X32-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] +; X32-NEXT: vaddpd {{\.LCPI.*}}, %ymm1, %ymm0 ; X32-NEXT: vmovapd %xmm1, (%eax) ; X32-NEXT: retl ; ; X64-LABEL: test_broadcast_2f64_4f64_reuse: ; X64: # %bb.0: -; X64-NEXT: vmovapd (%rdi), %xmm1 -; X64-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm0 -; X64-NEXT: vaddpd {{.*}}(%rip), %ymm0, %ymm0 +; X64-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] +; X64-NEXT: vaddpd {{.*}}(%rip), %ymm1, %ymm0 ; X64-NEXT: vmovapd %xmm1, (%rsi) ; X64-NEXT: retq %1 = load <2 x double>, <2 x double>* %p0 @@ -146,17 +144,15 @@ define <4 x i64> @test_broadcast_2i64_4i64_reuse(<2 x i64>* %p0, <2 x i64>* %p1) ; X32: # %bb.0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: vmovdqa (%ecx), %xmm1 -; X32-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm0 -; X32-NEXT: vpaddq {{\.LCPI.*}}, %ymm0, %ymm0 +; X32-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1] +; X32-NEXT: vpaddq {{\.LCPI.*}}, %ymm1, %ymm0 ; X32-NEXT: vmovdqa %xmm1, (%eax) ; X32-NEXT: retl ; ; X64-LABEL: test_broadcast_2i64_4i64_reuse: ; X64: # %bb.0: -; X64-NEXT: vmovdqa (%rdi), %xmm1 -; X64-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm0 -; X64-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0 +; X64-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1] +; X64-NEXT: vpaddq {{.*}}(%rip), %ymm1, %ymm0 ; X64-NEXT: vmovdqa %xmm1, (%rsi) ; X64-NEXT: retq %1 = load <2 x i64>, <2 x i64>* %p0 @@ -171,17 +167,15 @@ define <8 x float> @test_broadcast_4f32_8f32_reuse(<4 x float>* %p0, <4 x float> ; X32: # %bb.0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: vmovaps (%ecx), %xmm1 -; X32-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm0 -; X32-NEXT: vaddps {{\.LCPI.*}}, %ymm0, %ymm0 +; X32-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] +; X32-NEXT: vaddps {{\.LCPI.*}}, %ymm1, %ymm0 ; X32-NEXT: vmovaps %xmm1, (%eax) ; X32-NEXT: retl ; ; X64-LABEL: test_broadcast_4f32_8f32_reuse: ; X64: # %bb.0: -; X64-NEXT: vmovaps (%rdi), %xmm1 -; X64-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm0 -; X64-NEXT: vaddps {{.*}}(%rip), %ymm0, %ymm0 +; X64-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] +; X64-NEXT: vaddps {{.*}}(%rip), %ymm1, %ymm0 ; X64-NEXT: vmovaps %xmm1, (%rsi) ; X64-NEXT: retq %1 = load <4 x float>, <4 x float>* %p0 @@ -196,17 +190,15 @@ define <8 x i32> @test_broadcast_4i32_8i32_reuse(<4 x i32>* %p0, <4 x i32>* %p1) ; X32: # %bb.0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; 
X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: vmovdqa (%ecx), %xmm1 -; X32-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm0 -; X32-NEXT: vpaddd {{\.LCPI.*}}, %ymm0, %ymm0 +; X32-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1] +; X32-NEXT: vpaddd {{\.LCPI.*}}, %ymm1, %ymm0 ; X32-NEXT: vmovdqa %xmm1, (%eax) ; X32-NEXT: retl ; ; X64-LABEL: test_broadcast_4i32_8i32_reuse: ; X64: # %bb.0: -; X64-NEXT: vmovdqa (%rdi), %xmm1 -; X64-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm0 -; X64-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0 +; X64-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1] +; X64-NEXT: vpaddd {{.*}}(%rip), %ymm1, %ymm0 ; X64-NEXT: vmovdqa %xmm1, (%rsi) ; X64-NEXT: retq %1 = load <4 x i32>, <4 x i32>* %p0 @@ -221,17 +213,15 @@ define <16 x i16> @test_broadcast_8i16_16i16_reuse(<8 x i16> *%p0, <8 x i16> *%p ; X32: # %bb.0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: vmovdqa (%ecx), %xmm1 -; X32-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm0 -; X32-NEXT: vpaddw {{\.LCPI.*}}, %ymm0, %ymm0 +; X32-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1] +; X32-NEXT: vpaddw {{\.LCPI.*}}, %ymm1, %ymm0 ; X32-NEXT: vmovdqa %xmm1, (%eax) ; X32-NEXT: retl ; ; X64-LABEL: test_broadcast_8i16_16i16_reuse: ; X64: # %bb.0: -; X64-NEXT: vmovdqa (%rdi), %xmm1 -; X64-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm0 -; X64-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0 +; X64-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1] +; X64-NEXT: vpaddw {{.*}}(%rip), %ymm1, %ymm0 ; X64-NEXT: vmovdqa %xmm1, (%rsi) ; X64-NEXT: retq %1 = load <8 x i16>, <8 x i16> *%p0 @@ -246,17 +236,15 @@ define <32 x i8> @test_broadcast_16i8_32i8_reuse(<16 x i8> *%p0, <16 x i8> *%p1) ; X32: # %bb.0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: vmovdqa (%ecx), %xmm1 -; X32-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm0 -; X32-NEXT: vpaddb {{\.LCPI.*}}, %ymm0, %ymm0 +; X32-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1] +; X32-NEXT: vpaddb {{\.LCPI.*}}, %ymm1, %ymm0 ; X32-NEXT: vmovdqa %xmm1, (%eax) ; X32-NEXT: retl ; ; X64-LABEL: test_broadcast_16i8_32i8_reuse: ; X64: # %bb.0: -; X64-NEXT: vmovdqa (%rdi), %xmm1 -; X64-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm0 -; X64-NEXT: vpaddb {{.*}}(%rip), %ymm0, %ymm0 +; X64-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1] +; X64-NEXT: vpaddb {{.*}}(%rip), %ymm1, %ymm0 ; X64-NEXT: vmovdqa %xmm1, (%rsi) ; X64-NEXT: retq %1 = load <16 x i8>, <16 x i8> *%p0 diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll index b619fae..6bbfe5c 100644 --- a/llvm/test/CodeGen/X86/oddshuffles.ll +++ b/llvm/test/CodeGen/X86/oddshuffles.ll @@ -1112,24 +1112,24 @@ define void @interleave_24i16_in(<24 x i16>* %p, <8 x i16>* %q1, <8 x i16>* %q2, ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqu (%rsi), %xmm0 ; AVX1-NEXT: vmovdqu (%rdx), %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,2] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[3,3,3,3,4,5,6,7] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4],xmm3[5],xmm2[6,7] -; AVX1-NEXT: vmovdqu (%rcx), %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,2,2] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1,2],xmm4[3],xmm2[4,5],xmm4[6],xmm2[7] +; AVX1-NEXT: vmovdqu (%rcx), %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,2] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[3,3,3,3,4,5,6,7] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = 
xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,2,2] +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2],xmm4[3],xmm3[4,5],xmm4[6],xmm3[7] ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u] -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[2,2,3,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,2,3,3] ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3],xmm5[4],xmm4[5,6],xmm5[7] ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,u,u,4,5,6,7,u,u,8,9,10,11] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,0,0,0] ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7] ; AVX1-NEXT: vmovdqu %xmm0, (%rdi) ; AVX1-NEXT: vmovdqu %xmm4, 32(%rdi) -; AVX1-NEXT: vmovdqu %xmm2, 16(%rdi) +; AVX1-NEXT: vmovdqu %xmm3, 16(%rdi) ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: interleave_24i16_in: diff --git a/llvm/test/CodeGen/X86/oddsubvector.ll b/llvm/test/CodeGen/X86/oddsubvector.ll index 0a5cf616..0456f1d 100644 --- a/llvm/test/CodeGen/X86/oddsubvector.ll +++ b/llvm/test/CodeGen/X86/oddsubvector.ll @@ -141,7 +141,7 @@ define <16 x i32> @PR42819(<8 x i32>* %a0) { ; ; AVX512-LABEL: PR42819: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqu (%rdi), %xmm0 +; AVX512-NEXT: vmovdqu (%rdi), %ymm0 ; AVX512-NEXT: movw $-8192, %ax # imm = 0xE000 ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vpexpandd %zmm0, %zmm0 {%k1} {z} diff --git a/llvm/test/CodeGen/X86/subvector-broadcast.ll b/llvm/test/CodeGen/X86/subvector-broadcast.ll index e6de535..e2f5f36 100644 --- a/llvm/test/CodeGen/X86/subvector-broadcast.ll +++ b/llvm/test/CodeGen/X86/subvector-broadcast.ll @@ -507,21 +507,33 @@ define <64 x i8> @test_broadcast_32i8_64i8(<32 x i8> *%p) nounwind { ; define <4 x double> @test_broadcast_2f64_4f64_reuse(<2 x double>* %p0, <2 x double>* %p1) { -; X86-LABEL: test_broadcast_2f64_4f64_reuse: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: vmovaps (%ecx), %xmm0 -; X86-NEXT: vmovaps %xmm0, (%eax) -; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X86-NEXT: retl +; X86-AVX-LABEL: test_broadcast_2f64_4f64_reuse: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X86-AVX-NEXT: vmovaps %xmm0, (%eax) +; X86-AVX-NEXT: retl ; -; X64-LABEL: test_broadcast_2f64_4f64_reuse: -; X64: # %bb.0: -; X64-NEXT: vmovaps (%rdi), %xmm0 -; X64-NEXT: vmovaps %xmm0, (%rsi) -; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X64-NEXT: retq +; X86-AVX512-LABEL: test_broadcast_2f64_4f64_reuse: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX512-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X86-AVX512-NEXT: vmovdqa %xmm0, (%eax) +; X86-AVX512-NEXT: retl +; +; X64-AVX-LABEL: test_broadcast_2f64_4f64_reuse: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-AVX-NEXT: vmovaps %xmm0, (%rsi) +; X64-AVX-NEXT: retq +; +; X64-AVX512-LABEL: test_broadcast_2f64_4f64_reuse: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-AVX512-NEXT: vmovdqa %xmm0, (%rsi) +; 
X64-AVX512-NEXT: retq %1 = load <2 x double>, <2 x double>* %p0 store <2 x double> %1, <2 x double>* %p1 %2 = shufflevector <2 x double> %1, <2 x double> undef, <4 x i32> @@ -529,21 +541,33 @@ define <4 x double> @test_broadcast_2f64_4f64_reuse(<2 x double>* %p0, <2 x doub } define <4 x i64> @test_broadcast_2i64_4i64_reuse(<2 x i64>* %p0, <2 x i64>* %p1) { -; X86-LABEL: test_broadcast_2i64_4i64_reuse: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: vmovaps (%ecx), %xmm0 -; X86-NEXT: vmovaps %xmm0, (%eax) -; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X86-NEXT: retl +; X86-AVX-LABEL: test_broadcast_2i64_4i64_reuse: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X86-AVX-NEXT: vmovaps %xmm0, (%eax) +; X86-AVX-NEXT: retl ; -; X64-LABEL: test_broadcast_2i64_4i64_reuse: -; X64: # %bb.0: -; X64-NEXT: vmovaps (%rdi), %xmm0 -; X64-NEXT: vmovaps %xmm0, (%rsi) -; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X64-NEXT: retq +; X86-AVX512-LABEL: test_broadcast_2i64_4i64_reuse: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X86-AVX512-NEXT: vmovdqa %xmm0, (%eax) +; X86-AVX512-NEXT: retl +; +; X64-AVX-LABEL: test_broadcast_2i64_4i64_reuse: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-AVX-NEXT: vmovaps %xmm0, (%rsi) +; X64-AVX-NEXT: retq +; +; X64-AVX512-LABEL: test_broadcast_2i64_4i64_reuse: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-AVX512-NEXT: vmovdqa %xmm0, (%rsi) +; X64-AVX512-NEXT: retq %1 = load <2 x i64>, <2 x i64>* %p0 store <2 x i64> %1, <2 x i64>* %p1 %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <4 x i32> @@ -551,21 +575,33 @@ define <4 x i64> @test_broadcast_2i64_4i64_reuse(<2 x i64>* %p0, <2 x i64>* %p1) } define <8 x float> @test_broadcast_4f32_8f32_reuse(<4 x float>* %p0, <4 x float>* %p1) { -; X86-LABEL: test_broadcast_4f32_8f32_reuse: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: vmovaps (%ecx), %xmm0 -; X86-NEXT: vmovaps %xmm0, (%eax) -; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X86-NEXT: retl +; X86-AVX-LABEL: test_broadcast_4f32_8f32_reuse: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X86-AVX-NEXT: vmovaps %xmm0, (%eax) +; X86-AVX-NEXT: retl ; -; X64-LABEL: test_broadcast_4f32_8f32_reuse: -; X64: # %bb.0: -; X64-NEXT: vmovaps (%rdi), %xmm0 -; X64-NEXT: vmovaps %xmm0, (%rsi) -; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X64-NEXT: retq +; X86-AVX512-LABEL: test_broadcast_4f32_8f32_reuse: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX512-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X86-AVX512-NEXT: vmovdqa %xmm0, (%eax) +; X86-AVX512-NEXT: retl +; +; X64-AVX-LABEL: test_broadcast_4f32_8f32_reuse: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-AVX-NEXT: vmovaps %xmm0, (%rsi) +; X64-AVX-NEXT: retq +; +; X64-AVX512-LABEL: test_broadcast_4f32_8f32_reuse: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vbroadcastf128 {{.*#+}} 
ymm0 = mem[0,1,0,1] +; X64-AVX512-NEXT: vmovdqa %xmm0, (%rsi) +; X64-AVX512-NEXT: retq %1 = load <4 x float>, <4 x float>* %p0 store <4 x float> %1, <4 x float>* %p1 %2 = shufflevector <4 x float> %1, <4 x float> undef, <8 x i32> @@ -573,21 +609,33 @@ define <8 x float> @test_broadcast_4f32_8f32_reuse(<4 x float>* %p0, <4 x float> } define <8 x i32> @test_broadcast_4i32_8i32_reuse(<4 x i32>* %p0, <4 x i32>* %p1) { -; X86-LABEL: test_broadcast_4i32_8i32_reuse: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: vmovaps (%ecx), %xmm0 -; X86-NEXT: vmovaps %xmm0, (%eax) -; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X86-NEXT: retl +; X86-AVX-LABEL: test_broadcast_4i32_8i32_reuse: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X86-AVX-NEXT: vmovaps %xmm0, (%eax) +; X86-AVX-NEXT: retl ; -; X64-LABEL: test_broadcast_4i32_8i32_reuse: -; X64: # %bb.0: -; X64-NEXT: vmovaps (%rdi), %xmm0 -; X64-NEXT: vmovaps %xmm0, (%rsi) -; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X64-NEXT: retq +; X86-AVX512-LABEL: test_broadcast_4i32_8i32_reuse: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X86-AVX512-NEXT: vmovdqa %xmm0, (%eax) +; X86-AVX512-NEXT: retl +; +; X64-AVX-LABEL: test_broadcast_4i32_8i32_reuse: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-AVX-NEXT: vmovaps %xmm0, (%rsi) +; X64-AVX-NEXT: retq +; +; X64-AVX512-LABEL: test_broadcast_4i32_8i32_reuse: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-AVX512-NEXT: vmovdqa %xmm0, (%rsi) +; X64-AVX512-NEXT: retq %1 = load <4 x i32>, <4 x i32>* %p0 store <4 x i32> %1, <4 x i32>* %p1 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> @@ -595,21 +643,33 @@ define <8 x i32> @test_broadcast_4i32_8i32_reuse(<4 x i32>* %p0, <4 x i32>* %p1) } define <16 x i16> @test_broadcast_8i16_16i16_reuse(<8 x i16> *%p0, <8 x i16> *%p1) nounwind { -; X86-LABEL: test_broadcast_8i16_16i16_reuse: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: vmovaps (%ecx), %xmm0 -; X86-NEXT: vmovaps %xmm0, (%eax) -; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X86-NEXT: retl +; X86-AVX-LABEL: test_broadcast_8i16_16i16_reuse: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X86-AVX-NEXT: vmovaps %xmm0, (%eax) +; X86-AVX-NEXT: retl ; -; X64-LABEL: test_broadcast_8i16_16i16_reuse: -; X64: # %bb.0: -; X64-NEXT: vmovaps (%rdi), %xmm0 -; X64-NEXT: vmovaps %xmm0, (%rsi) -; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X64-NEXT: retq +; X86-AVX512-LABEL: test_broadcast_8i16_16i16_reuse: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X86-AVX512-NEXT: vmovdqa %xmm0, (%eax) +; X86-AVX512-NEXT: retl +; +; X64-AVX-LABEL: test_broadcast_8i16_16i16_reuse: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-AVX-NEXT: vmovaps %xmm0, (%rsi) +; X64-AVX-NEXT: retq +; +; X64-AVX512-LABEL: test_broadcast_8i16_16i16_reuse: 
+; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-AVX512-NEXT: vmovdqa %xmm0, (%rsi) +; X64-AVX512-NEXT: retq %1 = load <8 x i16>, <8 x i16> *%p0 store <8 x i16> %1, <8 x i16>* %p1 %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <16 x i32> @@ -617,21 +677,33 @@ define <16 x i16> @test_broadcast_8i16_16i16_reuse(<8 x i16> *%p0, <8 x i16> *%p } define <32 x i8> @test_broadcast_16i8_32i8_reuse(<16 x i8> *%p0, <16 x i8> *%p1) nounwind { -; X86-LABEL: test_broadcast_16i8_32i8_reuse: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: vmovaps (%ecx), %xmm0 -; X86-NEXT: vmovaps %xmm0, (%eax) -; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X86-NEXT: retl +; X86-AVX-LABEL: test_broadcast_16i8_32i8_reuse: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X86-AVX-NEXT: vmovaps %xmm0, (%eax) +; X86-AVX-NEXT: retl ; -; X64-LABEL: test_broadcast_16i8_32i8_reuse: -; X64: # %bb.0: -; X64-NEXT: vmovaps (%rdi), %xmm0 -; X64-NEXT: vmovaps %xmm0, (%rsi) -; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X64-NEXT: retq +; X86-AVX512-LABEL: test_broadcast_16i8_32i8_reuse: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X86-AVX512-NEXT: vmovdqa %xmm0, (%eax) +; X86-AVX512-NEXT: retl +; +; X64-AVX-LABEL: test_broadcast_16i8_32i8_reuse: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-AVX-NEXT: vmovaps %xmm0, (%rsi) +; X64-AVX-NEXT: retq +; +; X64-AVX512-LABEL: test_broadcast_16i8_32i8_reuse: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-AVX512-NEXT: vmovdqa %xmm0, (%rsi) +; X64-AVX512-NEXT: retq %1 = load <16 x i8>, <16 x i8> *%p0 store <16 x i8> %1, <16 x i8>* %p1 %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <32 x i32> diff --git a/llvm/test/CodeGen/X86/x86-upgrade-avx2-vbroadcast.ll b/llvm/test/CodeGen/X86/x86-upgrade-avx2-vbroadcast.ll index a0d0a86..3985b9cb 100644 --- a/llvm/test/CodeGen/X86/x86-upgrade-avx2-vbroadcast.ll +++ b/llvm/test/CodeGen/X86/x86-upgrade-avx2-vbroadcast.ll @@ -8,9 +8,8 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" define <4 x i64> @broadcast128(<2 x i64> %src) { ; CHECK-LABEL: broadcast128: ; CHECK: ## %bb.0: -; CHECK-NEXT: ## kill: def $xmm0 killed $xmm0 def $ymm0 ; CHECK-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; CHECK-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] ; CHECK-NEXT: retq %1 = alloca <2 x i64>, align 16 %2 = bitcast <2 x i64>* %1 to i8*
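
For context, the splatted-subvector-load pattern that now folds to X86ISD::SUBV_BROADCAST_LOAD is the one exercised by the *_reuse tests above. A minimal standalone IR reproducer, reduced from test_broadcast_2f64_4f64_reuse (the function name here is illustrative, not taken from the patch):

define <4 x double> @splat_subvector_load(<2 x double>* %p0, <2 x double>* %p1) {
  ; load a 128-bit subvector, keep a second use of it (the store),
  ; and repeat it across both 256-bit lanes of the result
  %v = load <2 x double>, <2 x double>* %p0
  store <2 x double> %v, <2 x double>* %p1
  %splat = shufflevector <2 x double> %v, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x double> %splat
}

On an AVX target this now selects a single vbroadcastf128 from memory (with the store reusing xmm0), where the old CHECK lines show a vmovaps load plus vinsertf128 whenever the loaded xmm value had additional users.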