From 288c088c177f0a9a994d37b3d7006c2c99f3df57 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Wed, 30 Nov 2016 16:33:46 +0000
Subject: [PATCH] [X86][SSE] Add support for target shuffle constant folding

Initial support for target shuffle constant folding in cases where all shuffle
inputs are constant. We may be able to relax this and merge shuffles with only
some constant inputs in the future.

I've added the helper function getTargetConstantBitsFromNode (based off a
similar function in X86ShuffleDecodeConstantPool.cpp) that could be reused for
other cases requiring constant vector extraction.

Differential Revision: https://reviews.llvm.org/D27220

llvm-svn: 288250
---
 llvm/lib/Target/X86/X86ISelLowering.cpp            | 177 ++++++++++++++++++++-
 .../CodeGen/X86/vector-shuffle-combining-avx.ll    |  18 +--
 .../CodeGen/X86/vector-shuffle-combining-avx2.ll   |  18 +--
 .../CodeGen/X86/vector-shuffle-combining-ssse3.ll  |   6 +-
 .../CodeGen/X86/vector-shuffle-combining-xop.ll    |  38 ++---
 llvm/test/CodeGen/X86/vector-shuffle-mmx.ll        |  12 +-
 llvm/test/CodeGen/X86/vselect-avx.ll               |  11 +-
 llvm/test/CodeGen/X86/widen_load-2.ll              |  22 +--
 llvm/test/CodeGen/X86/widen_shuffle-1.ll           |  10 +-
 9 files changed, 217 insertions(+), 95 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index abd2d42..3e2f5f1 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -4622,9 +4622,9 @@ static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
   return ConstsNode;
 }
 
-static SDValue getConstVector(ArrayRef<APInt> Values, SmallBitVector &Undefs,
+static SDValue getConstVector(ArrayRef<APInt> Bits, SmallBitVector &Undefs,
                               MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
-  assert(Values.size() == Undefs.size() && "Unequal constant and undef arrays");
+  assert(Bits.size() == Undefs.size() && "Unequal constant and undef arrays");
   SmallVector<SDValue, 32> Ops;
   bool Split = false;
 
@@ -4637,16 +4637,22 @@ static SDValue getConstVector(ArrayRef<APInt> Values, SmallBitVector &Undefs,
   }
 
   MVT EltVT = ConstVecVT.getVectorElementType();
-  for (unsigned i = 0, e = Values.size(); i != e; ++i) {
+  for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
     if (Undefs[i]) {
       Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
       continue;
     }
-    const APInt &V = Values[i];
+    const APInt &V = Bits[i];
     assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
     if (Split) {
       Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
       Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
+    } else if (EltVT == MVT::f32) {
+      APFloat FV(APFloat::IEEEsingle, V);
+      Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
+    } else if (EltVT == MVT::f64) {
+      APFloat FV(APFloat::IEEEdouble, V);
+      Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
     } else {
       Ops.push_back(DAG.getConstant(V, dl, EltVT));
     }
@@ -5037,6 +5043,77 @@ static const Constant *getTargetConstantFromNode(SDValue Op) {
   return dyn_cast<Constant>(CNode->getConstVal());
 }
 
+// Extract constant bits from constant pool vector.
+static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
+                                          SmallBitVector &UndefElts,
+                                          SmallVectorImpl<APInt> &EltBits) {
+  assert(UndefElts.empty() && "Expected an empty UndefElts vector");
+  assert(EltBits.empty() && "Expected an empty EltBits vector");
+
+  EVT VT = Op.getValueType();
+  unsigned SizeInBits = VT.getSizeInBits();
+  assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
+  unsigned NumElts = SizeInBits / EltSizeInBits;
+
+  auto *Cst = getTargetConstantFromNode(Op);
+  if (!Cst)
+    return false;
+
+  Type *CstTy = Cst->getType();
+  if (!CstTy->isVectorTy() || (SizeInBits != CstTy->getPrimitiveSizeInBits()))
+    return false;
+
+  // Extract all the undef/constant element data and pack into single bitsets.
+  APInt UndefBits(SizeInBits, 0);
+  APInt MaskBits(SizeInBits, 0);
+
+  unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
+  for (unsigned i = 0, e = CstTy->getVectorNumElements(); i != e; ++i) {
+    auto *COp = Cst->getAggregateElement(i);
+    if (!COp ||
+        !(isa<UndefValue>(COp) || isa<ConstantInt>(COp) ||
+          isa<ConstantFP>(COp)))
+      return false;
+
+    if (isa<UndefValue>(COp)) {
+      APInt EltUndef = APInt::getLowBitsSet(SizeInBits, CstEltSizeInBits);
+      UndefBits |= EltUndef.shl(i * CstEltSizeInBits);
+      continue;
+    }
+
+    APInt Bits;
+    if (auto *CInt = dyn_cast<ConstantInt>(COp))
+      Bits = CInt->getValue();
+    else if (auto *CFP = dyn_cast<ConstantFP>(COp))
+      Bits = CFP->getValueAPF().bitcastToAPInt();
+
+    Bits = Bits.zextOrTrunc(SizeInBits);
+    MaskBits |= Bits.shl(i * CstEltSizeInBits);
+  }
+
+  UndefElts = SmallBitVector(NumElts, false);
+  EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
+
+  // Now extract the undef/constant bit data into the target elts.
+  for (unsigned i = 0; i != NumElts; ++i) {
+    APInt UndefEltBits = UndefBits.lshr(i * EltSizeInBits);
+    UndefEltBits = UndefEltBits.zextOrTrunc(EltSizeInBits);
+
+    // Only treat the element as UNDEF if all bits are UNDEF, otherwise
+    // treat it as zero.
+    if (UndefEltBits.isAllOnesValue()) {
+      UndefElts[i] = true;
+      continue;
+    }
+
+    APInt Bits = MaskBits.lshr(i * EltSizeInBits);
+    Bits = Bits.zextOrTrunc(EltSizeInBits);
+    EltBits[i] = Bits.getZExtValue();
+  }
+
+  return true;
+}
+
 static bool getTargetShuffleMaskIndices(SDValue MaskNode,
                                         unsigned MaskEltSizeInBits,
                                         SmallVectorImpl<uint64_t> &RawMask) {
@@ -26308,6 +26385,93 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
   return false;
 }
 
+// Attempt to constant fold all of the constant source ops.
+// Returns true if the entire shuffle is folded to a constant.
+// TODO: Extend this to merge multiple constant Ops and update the mask.
+static bool combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops,
+                                        ArrayRef<int> Mask, SDValue Root,
+                                        bool HasVariableMask, SelectionDAG &DAG,
+                                        TargetLowering::DAGCombinerInfo &DCI,
+                                        const X86Subtarget &Subtarget) {
+  MVT VT = Root.getSimpleValueType();
+
+  unsigned SizeInBits = VT.getSizeInBits();
+  unsigned NumMaskElts = Mask.size();
+  unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
+  unsigned NumOps = Ops.size();
+
+  // Extract constant bits from each source op.
+  bool OneUseConstantOp = false;
+  SmallVector<SmallBitVector, 4> UndefEltsOps(NumOps);
+  SmallVector<SmallVector<APInt, 8>, 4> RawBitsOps(NumOps);
+  for (unsigned i = 0; i != NumOps; ++i) {
+    SDValue SrcOp = Ops[i];
+    OneUseConstantOp |= SrcOp.hasOneUse();
+    if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits, UndefEltsOps[i],
+                                       RawBitsOps[i]))
+      return false;
+  }
+
+  // Only fold if at least one of the constants is only used once or
+  // the combined shuffle has included a variable mask shuffle, this
+  // is to avoid constant pool bloat.
+  if (!OneUseConstantOp && !HasVariableMask)
+    return false;
+
+  // Shuffle the constant bits according to the mask.
+  SmallBitVector UndefElts(NumMaskElts, false);
+  SmallBitVector ZeroElts(NumMaskElts, false);
+  SmallBitVector ConstantElts(NumMaskElts, false);
+  SmallVector<APInt, 16> ConstantBitData(NumMaskElts,
+                                         APInt::getNullValue(MaskSizeInBits));
+  for (unsigned i = 0; i != NumMaskElts; ++i) {
+    int M = Mask[i];
+    if (M == SM_SentinelUndef) {
+      UndefElts[i] = true;
+      continue;
+    } else if (M == SM_SentinelZero) {
+      ZeroElts[i] = true;
+      continue;
+    }
+    assert(0 <= M && M < (int)(NumMaskElts * NumOps));
+
+    unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
+    unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
+
+    auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
+    if (SrcUndefElts[SrcMaskIdx]) {
+      UndefElts[i] = true;
+      continue;
+    }
+
+    auto &SrcEltBits = RawBitsOps[SrcOpIdx];
+    APInt &Bits = SrcEltBits[SrcMaskIdx];
+    if (!Bits) {
+      ZeroElts[i] = true;
+      continue;
+    }
+
+    ConstantElts[i] = true;
+    ConstantBitData[i] = Bits;
+  }
+  assert((UndefElts | ZeroElts | ConstantElts).count() == NumMaskElts);
+
+  // Create the constant data.
+  MVT MaskSVT;
+  if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
+    MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
+  else
+    MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
+
+  MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
+
+  SDLoc DL(Root);
+  SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
+  DCI.AddToWorklist(CstOp.getNode());
+  DCI.CombineTo(Root.getNode(), DAG.getBitcast(VT, CstOp));
+  return true;
+}
+
 /// \brief Fully generic combining of x86 shuffle instructions.
 ///
 /// This should be the last combine run over the x86 shuffle instructions. Once
@@ -26491,6 +26655,11 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
                                 HasVariableMask, DAG, DCI, Subtarget))
     return true;
 
+  // Attempt to constant fold all of the constant source ops.
+  if (combineX86ShufflesConstants(Ops, Mask, Root, HasVariableMask, DAG, DCI,
+                                  Subtarget))
+    return true;
+
   // We can only combine unary and binary shuffle mask cases.
if (Ops.size() > 2) return false; diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll index 2d1bf08..1e22fde 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll @@ -361,12 +361,12 @@ define <4 x float> @combine_vpermilvar_4f32_as_insertps(<4 x float> %a0) { define <2 x double> @constant_fold_vpermilvar_pd() { ; X32-LABEL: constant_fold_vpermilvar_pd: ; X32: # BB#0: -; X32-NEXT: vpermilpd {{.*#+}} xmm0 = mem[1,0] +; X32-NEXT: vmovaps {{.*#+}} xmm0 = [2.000000e+00,1.000000e+00] ; X32-NEXT: retl ; ; X64-LABEL: constant_fold_vpermilvar_pd: ; X64: # BB#0: -; X64-NEXT: vpermilpd {{.*#+}} xmm0 = mem[1,0] +; X64-NEXT: vmovaps {{.*#+}} xmm0 = [2.000000e+00,1.000000e+00] ; X64-NEXT: retq %1 = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> , <2 x i64> ) ret <2 x double> %1 @@ -375,12 +375,12 @@ define <2 x double> @constant_fold_vpermilvar_pd() { define <4 x double> @constant_fold_vpermilvar_pd_256() { ; X32-LABEL: constant_fold_vpermilvar_pd_256: ; X32: # BB#0: -; X32-NEXT: vpermilpd {{.*#+}} ymm0 = mem[1,0,2,3] +; X32-NEXT: vmovaps {{.*#+}} ymm0 = [2.000000e+00,1.000000e+00,3.000000e+00,4.000000e+00] ; X32-NEXT: retl ; ; X64-LABEL: constant_fold_vpermilvar_pd_256: ; X64: # BB#0: -; X64-NEXT: vpermilpd {{.*#+}} ymm0 = mem[1,0,2,3] +; X64-NEXT: vmovaps {{.*#+}} ymm0 = [2.000000e+00,1.000000e+00,3.000000e+00,4.000000e+00] ; X64-NEXT: retq %1 = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> , <4 x i64> ) ret <4 x double> %1 @@ -389,12 +389,12 @@ define <4 x double> @constant_fold_vpermilvar_pd_256() { define <4 x float> @constant_fold_vpermilvar_ps() { ; X32-LABEL: constant_fold_vpermilvar_ps: ; X32: # BB#0: -; X32-NEXT: vpermilps {{.*#+}} xmm0 = mem[3,0,2,1] +; X32-NEXT: vmovaps {{.*#+}} xmm0 = [4.000000e+00,1.000000e+00,3.000000e+00,2.000000e+00] ; X32-NEXT: retl ; ; X64-LABEL: constant_fold_vpermilvar_ps: ; X64: # BB#0: -; X64-NEXT: vpermilps {{.*#+}} xmm0 = mem[3,0,2,1] +; X64-NEXT: vmovaps {{.*#+}} xmm0 = [4.000000e+00,1.000000e+00,3.000000e+00,2.000000e+00] ; X64-NEXT: retq %1 = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> , <4 x i32> ) ret <4 x float> %1 @@ -403,14 +403,12 @@ define <4 x float> @constant_fold_vpermilvar_ps() { define <8 x float> @constant_fold_vpermilvar_ps_256() { ; X32-LABEL: constant_fold_vpermilvar_ps_256: ; X32: # BB#0: -; X32-NEXT: vmovaps {{.*#+}} ymm0 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00] -; X32-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,1,4,5,5,5] +; X32-NEXT: vmovaps {{.*#+}} ymm0 = [1.000000e+00,1.000000e+00,3.000000e+00,2.000000e+00,5.000000e+00,6.000000e+00,6.000000e+00,6.000000e+00] ; X32-NEXT: retl ; ; X64-LABEL: constant_fold_vpermilvar_ps_256: ; X64: # BB#0: -; X64-NEXT: vmovaps {{.*#+}} ymm0 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00] -; X64-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,1,4,5,5,5] +; X64-NEXT: vmovaps {{.*#+}} ymm0 = [1.000000e+00,1.000000e+00,3.000000e+00,2.000000e+00,5.000000e+00,6.000000e+00,6.000000e+00,6.000000e+00] ; X64-NEXT: retq %1 = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> , <8 x i32> ) ret <8 x float> %1 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll index 8c5e7b4..b6afefb 100644 --- 
a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll @@ -685,14 +685,12 @@ define <32 x i8> @combine_psrlq_pshufb(<4 x i64> %a0) { define <8 x i32> @constant_fold_permd() { ; X32-LABEL: constant_fold_permd: ; X32: # BB#0: -; X32-NEXT: vmovdqa {{.*#+}} ymm0 = [4,6,2,1,7,1,5,0] -; X32-NEXT: vpermd {{\.LCPI.*}}, %ymm0, %ymm0 +; X32-NEXT: vmovaps {{.*#+}} ymm0 = [5,7,3,2,8,2,6,1] ; X32-NEXT: retl ; ; X64-LABEL: constant_fold_permd: ; X64: # BB#0: -; X64-NEXT: vmovdqa {{.*#+}} ymm0 = [4,6,2,1,7,1,5,0] -; X64-NEXT: vpermd {{.*}}(%rip), %ymm0, %ymm0 +; X64-NEXT: vmovaps {{.*#+}} ymm0 = [5,7,3,2,8,2,6,1] ; X64-NEXT: retq %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> , <8 x i32> ) ret <8 x i32> %1 @@ -701,14 +699,12 @@ define <8 x i32> @constant_fold_permd() { define <8 x float> @constant_fold_permps() { ; X32-LABEL: constant_fold_permps: ; X32: # BB#0: -; X32-NEXT: vmovaps {{.*#+}} ymm0 = [4,6,2,1,7,1,5,0] -; X32-NEXT: vpermps {{\.LCPI.*}}, %ymm0, %ymm0 +; X32-NEXT: vmovaps {{.*#+}} ymm0 = [5.000000e+00,7.000000e+00,3.000000e+00,2.000000e+00,8.000000e+00,2.000000e+00,6.000000e+00,1.000000e+00] ; X32-NEXT: retl ; ; X64-LABEL: constant_fold_permps: ; X64: # BB#0: -; X64-NEXT: vmovaps {{.*#+}} ymm0 = [4,6,2,1,7,1,5,0] -; X64-NEXT: vpermps {{.*}}(%rip), %ymm0, %ymm0 +; X64-NEXT: vmovaps {{.*#+}} ymm0 = [5.000000e+00,7.000000e+00,3.000000e+00,2.000000e+00,8.000000e+00,2.000000e+00,6.000000e+00,1.000000e+00] ; X64-NEXT: retq %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> , <8 x i32> ) ret <8 x float> %1 @@ -717,14 +713,12 @@ define <8 x float> @constant_fold_permps() { define <32 x i8> @constant_fold_pshufb_256() { ; X32-LABEL: constant_fold_pshufb_256: ; X32: # BB#0: -; X32-NEXT: vmovdqa {{.*#+}} ymm0 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,0,255,254,253,252,251,250,249,248,247,246,245,244,243,242,241] -; X32-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1],zero,zero,zero,ymm0[u,u],zero,zero,ymm0[15],zero,zero,zero,zero,zero,ymm0[7,6,17],zero,zero,zero,ymm0[u,u],zero,zero,ymm0[31],zero,zero,zero,zero,zero,ymm0[23,22] +; X32-NEXT: vmovaps {{.*#+}} ymm0 = <14,0,0,0,u,u,0,0,0,0,0,0,0,0,8,9,255,0,0,0,u,u,0,0,241,0,0,0,0,0,249,250> ; X32-NEXT: retl ; ; X64-LABEL: constant_fold_pshufb_256: ; X64: # BB#0: -; X64-NEXT: vmovdqa {{.*#+}} ymm0 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,0,255,254,253,252,251,250,249,248,247,246,245,244,243,242,241] -; X64-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1],zero,zero,zero,ymm0[u,u],zero,zero,ymm0[15],zero,zero,zero,zero,zero,ymm0[7,6,17],zero,zero,zero,ymm0[u,u],zero,zero,ymm0[31],zero,zero,zero,zero,zero,ymm0[23,22] +; X64-NEXT: vmovaps {{.*#+}} ymm0 = <14,0,0,0,u,u,0,0,0,0,0,0,0,0,8,9,255,0,0,0,u,u,0,0,241,0,0,0,0,0,249,250> ; X64-NEXT: retq %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> , <32 x i8> ) ret <32 x i8> %1 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll index 2ae0fae..10627ae 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll @@ -511,14 +511,12 @@ define <16 x i8> @combine_unpckl_arg1_pshufb(<16 x i8> %a0, <16 x i8> %a1) { define <16 x i8> @constant_fold_pshufb() { ; SSE-LABEL: constant_fold_pshufb: ; SSE: # BB#0: -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] -; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1],zero,zero,zero,xmm0[u,u],zero,zero,xmm0[15],zero,zero,zero,zero,zero,xmm0[7,6] +; SSE-NEXT: movaps 
{{.*#+}} xmm0 = <14,0,0,0,u,u,0,0,0,0,0,0,0,0,8,9> ; SSE-NEXT: retq ; ; AVX-LABEL: constant_fold_pshufb: ; AVX: # BB#0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1],zero,zero,zero,xmm0[u,u],zero,zero,xmm0[15],zero,zero,zero,zero,zero,xmm0[7,6] +; AVX-NEXT: vmovaps {{.*#+}} xmm0 = <14,0,0,0,u,u,0,0,0,0,0,0,0,0,8,9> ; AVX-NEXT: retq %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> , <16 x i8> ) ret <16 x i8> %1 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll index c1d4446..696f87d 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll @@ -261,14 +261,12 @@ define <4 x i32> @combine_vpperm_10zz32BA(<4 x i32> %a0, <4 x i32> %a1) { define <2 x double> @constant_fold_vpermil2pd() { ; X32-LABEL: constant_fold_vpermil2pd: ; X32: # BB#0: -; X32-NEXT: vmovapd {{.*#+}} xmm0 = [-2.000000e+00,-1.000000e+00] -; X32-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1] +; X32-NEXT: vmovaps {{.*#+}} xmm0 = [-2.000000e+00,2.000000e+00] ; X32-NEXT: retl ; ; X64-LABEL: constant_fold_vpermil2pd: ; X64: # BB#0: -; X64-NEXT: vmovapd {{.*#+}} xmm0 = [-2.000000e+00,-1.000000e+00] -; X64-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1] +; X64-NEXT: vmovaps {{.*#+}} xmm0 = [-2.000000e+00,2.000000e+00] ; X64-NEXT: retq %1 = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> , <2 x double> , <2 x i64> , i8 2) ret <2 x double> %1 @@ -277,16 +275,12 @@ define <2 x double> @constant_fold_vpermil2pd() { define <4 x double> @constant_fold_vpermil2pd_256() { ; X32-LABEL: constant_fold_vpermil2pd_256: ; X32: # BB#0: -; X32-NEXT: vmovapd {{.*#+}} ymm0 = [-4.000000e+00,-3.000000e+00,-2.000000e+00,-1.000000e+00] -; X32-NEXT: vmovapd {{.*#+}} ymm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00] -; X32-NEXT: vpermil2pd {{.*#+}} ymm0 = ymm0[0],zero,ymm1[3,2] +; X32-NEXT: vmovaps {{.*#+}} ymm0 = [-4.000000e+00,0.000000e+00,4.000000e+00,3.000000e+00] ; X32-NEXT: retl ; ; X64-LABEL: constant_fold_vpermil2pd_256: ; X64: # BB#0: -; X64-NEXT: vmovapd {{.*#+}} ymm0 = [-4.000000e+00,-3.000000e+00,-2.000000e+00,-1.000000e+00] -; X64-NEXT: vmovapd {{.*#+}} ymm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00] -; X64-NEXT: vpermil2pd {{.*#+}} ymm0 = ymm0[0],zero,ymm1[3,2] +; X64-NEXT: vmovaps {{.*#+}} ymm0 = [-4.000000e+00,0.000000e+00,4.000000e+00,3.000000e+00] ; X64-NEXT: retq %1 = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> , <4 x double> , <4 x i64> , i8 2) ret <4 x double> %1 @@ -295,16 +289,12 @@ define <4 x double> @constant_fold_vpermil2pd_256() { define <4 x float> @constant_fold_vpermil2ps() { ; X32-LABEL: constant_fold_vpermil2ps: ; X32: # BB#0: -; X32-NEXT: vmovaps {{.*#+}} xmm0 = [-4.000000e+00,-3.000000e+00,-2.000000e+00,-1.000000e+00] -; X32-NEXT: vmovaps {{.*#+}} xmm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00] -; X32-NEXT: vpermil2ps {{.*#+}} xmm0 = xmm0[0],xmm1[0,2],zero +; X32-NEXT: vmovaps {{.*#+}} xmm0 = [-4.000000e+00,1.000000e+00,3.000000e+00,0.000000e+00] ; X32-NEXT: retl ; ; X64-LABEL: constant_fold_vpermil2ps: ; X64: # BB#0: -; X64-NEXT: vmovaps {{.*#+}} xmm0 = [-4.000000e+00,-3.000000e+00,-2.000000e+00,-1.000000e+00] -; X64-NEXT: vmovaps {{.*#+}} xmm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00] -; X64-NEXT: vpermil2ps {{.*#+}} xmm0 = xmm0[0],xmm1[0,2],zero +; X64-NEXT: vmovaps {{.*#+}} xmm0 = 
[-4.000000e+00,1.000000e+00,3.000000e+00,0.000000e+00] ; X64-NEXT: retq %1 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> , <4 x float> , <4 x i32> , i8 2) ret <4 x float> %1 @@ -313,16 +303,12 @@ define <4 x float> @constant_fold_vpermil2ps() { define <8 x float> @constant_fold_vpermil2ps_256() { ; X32-LABEL: constant_fold_vpermil2ps_256: ; X32: # BB#0: -; X32-NEXT: vmovaps {{.*#+}} ymm0 = [-8.000000e+00,-7.000000e+00,-6.000000e+00,-5.000000e+00,-4.000000e+00,-3.000000e+00,-2.000000e+00,-1.000000e+00] -; X32-NEXT: vmovaps {{.*#+}} ymm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00] -; X32-NEXT: vpermil2ps {{.*#+}} ymm0 = ymm0[0],ymm1[0,2],zero,ymm1[4],zero,ymm1[4,6] +; X32-NEXT: vmovaps {{.*#+}} ymm0 = [-8.000000e+00,1.000000e+00,3.000000e+00,0.000000e+00,5.000000e+00,0.000000e+00,5.000000e+00,7.000000e+00] ; X32-NEXT: retl ; ; X64-LABEL: constant_fold_vpermil2ps_256: ; X64: # BB#0: -; X64-NEXT: vmovaps {{.*#+}} ymm0 = [-8.000000e+00,-7.000000e+00,-6.000000e+00,-5.000000e+00,-4.000000e+00,-3.000000e+00,-2.000000e+00,-1.000000e+00] -; X64-NEXT: vmovaps {{.*#+}} ymm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00] -; X64-NEXT: vpermil2ps {{.*#+}} ymm0 = ymm0[0],ymm1[0,2],zero,ymm1[4],zero,ymm1[4,6] +; X64-NEXT: vmovaps {{.*#+}} ymm0 = [-8.000000e+00,1.000000e+00,3.000000e+00,0.000000e+00,5.000000e+00,0.000000e+00,5.000000e+00,7.000000e+00] ; X64-NEXT: retq %1 = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> , <8 x float> , <8 x i32> , i8 2) ret <8 x float> %1 @@ -331,16 +317,12 @@ define <8 x float> @constant_fold_vpermil2ps_256() { define <16 x i8> @constant_fold_vpperm() { ; X32-LABEL: constant_fold_vpperm: ; X32: # BB#0: -; X32-NEXT: vmovdqa {{.*#+}} xmm0 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] -; X32-NEXT: vmovdqa {{.*#+}} xmm1 = [0,255,254,253,252,251,250,249,248,247,246,245,244,243,242,241] -; X32-NEXT: vpperm {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] +; X32-NEXT: vmovaps {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; X32-NEXT: retl ; ; X64-LABEL: constant_fold_vpperm: ; X64: # BB#0: -; X64-NEXT: vmovdqa {{.*#+}} xmm0 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] -; X64-NEXT: vmovdqa {{.*#+}} xmm1 = [0,255,254,253,252,251,250,249,248,247,246,245,244,243,242,241] -; X64-NEXT: vpperm {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] +; X64-NEXT: vmovaps {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; X64-NEXT: retq %1 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> , <16 x i8> , <16 x i8> ) ret <16 x i8> %1 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-mmx.ll b/llvm/test/CodeGen/X86/vector-shuffle-mmx.ll index 37f9ea9..7b69e7d 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-mmx.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-mmx.ll @@ -42,10 +42,8 @@ define void @test1() { ; X32-NEXT: xorps %xmm0, %xmm0 ; X32-NEXT: movlps %xmm0, (%esp) ; X32-NEXT: movq (%esp), %mm0 -; X32-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,2,2,3,4,5,6,7] -; X32-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X32-NEXT: movq %xmm0, {{[0-9]+}}(%esp) +; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X32-NEXT: movsd %xmm0, {{[0-9]+}}(%esp) ; X32-NEXT: movq {{[0-9]+}}(%esp), %mm1 ; X32-NEXT: xorl %edi, %edi ; X32-NEXT: maskmovq %mm1, %mm0 @@ -58,10 +56,8 @@ define void @test1() { ; X64-NEXT: xorps %xmm0, %xmm0 ; X64-NEXT: movlps %xmm0, -{{[0-9]+}}(%rsp) ; X64-NEXT: movq 
-{{[0-9]+}}(%rsp), %mm0 -; X64-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,2,2,3,4,5,6,7] -; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X64-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp) +; X64-NEXT: movq {{.*}}(%rip), %rax +; X64-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; X64-NEXT: movq -{{[0-9]+}}(%rsp), %mm1 ; X64-NEXT: xorl %edi, %edi ; X64-NEXT: maskmovq %mm1, %mm0 diff --git a/llvm/test/CodeGen/X86/vselect-avx.ll b/llvm/test/CodeGen/X86/vselect-avx.ll index c453724b..5503cfc 100644 --- a/llvm/test/CodeGen/X86/vselect-avx.ll +++ b/llvm/test/CodeGen/X86/vselect-avx.ll @@ -18,13 +18,10 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" define void @test(<4 x i16>* %a, <4 x i16>* %b) { ; AVX-LABEL: test: ; AVX: ## BB#0: ## %body -; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [65533,124,125,14807] -; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX-NEXT: vmovq %xmm1, (%rdi) -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,65535] -; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vmovq %xmm0, (%rsi) +; AVX-NEXT: movq {{.*}}(%rip), %rax +; AVX-NEXT: movq %rax, (%rdi) +; AVX-NEXT: movq {{.*}}(%rip), %rax +; AVX-NEXT: movq %rax, (%rsi) ; AVX-NEXT: retq body: %predphi = select <4 x i1> , <4 x i16> , <4 x i16> diff --git a/llvm/test/CodeGen/X86/widen_load-2.ll b/llvm/test/CodeGen/X86/widen_load-2.ll index 0e1f370..1da9952 100644 --- a/llvm/test/CodeGen/X86/widen_load-2.ll +++ b/llvm/test/CodeGen/X86/widen_load-2.ll @@ -372,15 +372,10 @@ define void @rot(%i8vec3pack* nocapture sret %result, %i8vec3pack* %X, %i8vec3pa ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movdqa {{.*#+}} xmm0 = <0,4,8,128,u,u,u,u,u,u,u,u,u,u,u,u> -; X86-NEXT: movdqa {{.*#+}} xmm1 = <158,158,158,u> -; X86-NEXT: pshufb %xmm0, %xmm1 -; X86-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; X86-NEXT: pextrw $0, %xmm1, (%edx) +; X86-NEXT: pmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero +; X86-NEXT: pextrw $0, %xmm0, (%edx) ; X86-NEXT: movb $-98, 2(%edx) -; X86-NEXT: movdqa {{.*#+}} xmm1 = <1,1,1,u> -; X86-NEXT: pshufb %xmm0, %xmm1 -; X86-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; X86-NEXT: pmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero ; X86-NEXT: pextrw $0, %xmm0, (%ecx) ; X86-NEXT: movb $1, 2(%ecx) ; X86-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero @@ -396,15 +391,10 @@ define void @rot(%i8vec3pack* nocapture sret %result, %i8vec3pack* %X, %i8vec3pa ; ; X64-LABEL: rot: ; X64: # BB#0: # %entry -; X64-NEXT: movdqa {{.*#+}} xmm0 = <0,4,8,128,u,u,u,u,u,u,u,u,u,u,u,u> -; X64-NEXT: movdqa {{.*#+}} xmm1 = <158,158,158,u> -; X64-NEXT: pshufb %xmm0, %xmm1 -; X64-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; X64-NEXT: pextrw $0, %xmm1, (%rsi) +; X64-NEXT: pmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero +; X64-NEXT: pextrw $0, %xmm0, (%rsi) ; X64-NEXT: movb $-98, 2(%rsi) -; X64-NEXT: movdqa {{.*#+}} xmm1 = <1,1,1,u> -; X64-NEXT: pshufb %xmm0, %xmm1 -; X64-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; X64-NEXT: pmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero ; X64-NEXT: pextrw $0, %xmm0, (%rdx) ; X64-NEXT: movb $1, 2(%rdx) ; X64-NEXT: pmovzxbd 
{{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero diff --git a/llvm/test/CodeGen/X86/widen_shuffle-1.ll b/llvm/test/CodeGen/X86/widen_shuffle-1.ll index 781cad5..aeb4e21 100644 --- a/llvm/test/CodeGen/X86/widen_shuffle-1.ll +++ b/llvm/test/CodeGen/X86/widen_shuffle-1.ll @@ -111,16 +111,14 @@ define void @shuf5(<8 x i8>* %p) nounwind { ; X86-LABEL: shuf5: ; X86: # BB#0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movdqa {{.*#+}} xmm0 = [33,33,33,33,33,33,33,33] -; X86-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] -; X86-NEXT: movq %xmm0, (%eax) +; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-NEXT: movsd %xmm0, (%eax) ; X86-NEXT: retl ; ; X64-LABEL: shuf5: ; X64: # BB#0: -; X64-NEXT: movdqa {{.*#+}} xmm0 = [33,33,33,33,33,33,33,33] -; X64-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] -; X64-NEXT: movq %xmm0, (%rdi) +; X64-NEXT: movq {{.*}}(%rip), %rax +; X64-NEXT: movq %rax, (%rdi) ; X64-NEXT: retq %v = shufflevector <2 x i8> , <2 x i8> undef, <8 x i32> store <8 x i8> %v, <8 x i8>* %p, align 8 -- 2.7.4
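
The bit-repacking performed by getTargetConstantBitsFromNode above can be illustrated outside of LLVM. The following is a minimal stand-alone sketch, not LLVM code: plain uint64_t bitsets stand in for APInt/SmallBitVector, the helper name repackConstantBits is hypothetical, and the vector is limited to 64 bits. It packs 8-bit constant-pool elements into a single bitset, re-slices that bitset at the requested target element width, and marks a target element undef only when every covered source bit is undef, otherwise folding the undef source bits to zero, as the comment in the patch describes.

#include <cstdint>
#include <cstdio>
#include <vector>

// Hypothetical stand-in for getTargetConstantBitsFromNode's repacking step:
// SrcBits/SrcUndef describe 8-bit constant-pool elements; the result describes
// EltSizeInBits-wide target elements (limited here to a 64-bit vector).
static void repackConstantBits(const std::vector<uint8_t> &SrcBits,
                               const std::vector<bool> &SrcUndef,
                               unsigned EltSizeInBits,
                               std::vector<uint64_t> &EltBits,
                               std::vector<bool> &UndefElts) {
  const unsigned SizeInBits = 8 * SrcBits.size(); // total vector width
  uint64_t MaskBits = 0, UndefBits = 0;

  // Pack all the undef/constant element data into single bitsets.
  for (unsigned i = 0; i != SrcBits.size(); ++i) {
    if (SrcUndef[i])
      UndefBits |= uint64_t(0xFF) << (i * 8);
    else
      MaskBits |= uint64_t(SrcBits[i]) << (i * 8);
  }

  // Re-slice the bitsets at the target element width.
  const unsigned NumElts = SizeInBits / EltSizeInBits;
  const uint64_t EltMask =
      (EltSizeInBits == 64) ? ~0ULL : ((1ULL << EltSizeInBits) - 1);
  EltBits.assign(NumElts, 0);
  UndefElts.assign(NumElts, false);
  for (unsigned i = 0; i != NumElts; ++i) {
    uint64_t Undef = (UndefBits >> (i * EltSizeInBits)) & EltMask;
    // Only treat the target element as undef if all covered bits are undef;
    // partially undef elements fold their undef bits to zero.
    if (Undef == EltMask) {
      UndefElts[i] = true;
      continue;
    }
    EltBits[i] = (MaskBits >> (i * EltSizeInBits)) & EltMask;
  }
}

int main() {
  // 8 x i8 constant pool entry: {1, 2, undef, 4, undef, undef, undef, undef}.
  std::vector<uint8_t> Src = {1, 2, 0, 4, 0, 0, 0, 0};
  std::vector<bool> Undef = {false, false, true, false, true, true, true, true};
  std::vector<uint64_t> EltBits;
  std::vector<bool> UndefElts;
  repackConstantBits(Src, Undef, 32, EltBits, UndefElts); // view as 2 x i32
  for (unsigned i = 0; i != EltBits.size(); ++i)
    std::printf("elt%u: %s0x%08llx\n", i, UndefElts[i] ? "undef " : "",
                (unsigned long long)EltBits[i]);
  return 0;
}

Running the sketch prints a constant 0x04000201 for the first 32-bit element (the partially undef byte folds to zero) and undef for the second, mirroring how the helper feeds getConstVector.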
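
The folding loop added in combineX86ShufflesConstants can be sketched the same way. In this illustrative, non-LLVM example the SM_SentinelUndef/SM_SentinelZero values keep their meaning from X86ISelLowering.cpp, each source operand is assumed to contribute NumMaskElts lanes (as in the patch), and foldShuffleConstants is a hypothetical stand-in that classifies every output lane as undef, zero, or constant before a real combine would materialise the folded vector.

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <optional>
#include <vector>

// Same sentinel convention as the x86 shuffle combiner.
enum { SM_SentinelUndef = -1, SM_SentinelZero = -2 };

struct FoldedLane {
  bool Undef = false; // all source bits undef
  uint32_t Bits = 0;  // zero lanes simply stay 0
};

// Illustrative constant folder: Ops holds the per-operand constant lanes
// (std::nullopt == undef lane); Mask indexes into the logical concatenation of
// all operands, exactly as combineX86ShufflesConstants does.
static std::vector<FoldedLane>
foldShuffleConstants(const std::vector<std::vector<std::optional<uint32_t>>> &Ops,
                     const std::vector<int> &Mask) {
  const unsigned NumMaskElts = Mask.size();
  std::vector<FoldedLane> Result(NumMaskElts);
  for (unsigned i = 0; i != NumMaskElts; ++i) {
    int M = Mask[i];
    if (M == SM_SentinelUndef) {
      Result[i].Undef = true;
      continue;
    }
    if (M == SM_SentinelZero)
      continue; // Bits is already 0.
    assert(0 <= M && M < int(NumMaskElts * Ops.size()) && "Out of range mask");
    unsigned SrcOpIdx = unsigned(M) / NumMaskElts;   // which input
    unsigned SrcMaskIdx = unsigned(M) % NumMaskElts; // which lane of it
    const auto &Lane = Ops[SrcOpIdx][SrcMaskIdx];
    if (!Lane)
      Result[i].Undef = true; // undef source lane stays undef
    else
      Result[i].Bits = *Lane;
  }
  return Result;
}

int main() {
  // Two 4-lane constant inputs; lane 2 of the second input is undef.
  std::vector<std::vector<std::optional<uint32_t>>> Ops = {
      {10, 11, 12, 13}, {20, 21, std::nullopt, 23}};
  // A mask mixing both inputs with zero/undef sentinels.
  std::vector<int> Mask = {3, SM_SentinelZero, 4, SM_SentinelUndef};
  for (const FoldedLane &L : foldShuffleConstants(Ops, Mask)) {
    if (L.Undef)
      std::printf("undef\n");
    else
      std::printf("%u\n", L.Bits);
  }
  return 0;
}

The example prints 13, 0, 20, undef, which is the per-lane classification the new combine records in ConstantBitData, ZeroElts, and UndefElts before calling getConstVector.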
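
The new MVT::f32/MVT::f64 paths in getConstVector rebuild floating-point immediates from raw element bits, which is what lets the folded vpermilvar/vpermil2ps tests above become plain vmovaps loads of FP constants. Below is a minimal stand-alone illustration of that bit-pattern-to-value step; it is not LLVM code, and memcpy bit-casting stands in for the APFloat(IEEEsingle/IEEEdouble, APInt) constructors used in the patch.

#include <cstdint>
#include <cstdio>
#include <cstring>

// Rebuild an IEEE-754 float/double from its raw bit pattern, mirroring the
// idea behind the APFloat construction in getConstVector.
static float f32FromBits(uint32_t Bits) {
  float F;
  std::memcpy(&F, &Bits, sizeof(F));
  return F;
}

static double f64FromBits(uint64_t Bits) {
  double D;
  std::memcpy(&D, &Bits, sizeof(D));
  return D;
}

int main() {
  // 0x40000000 is 2.0f and 0x3FF0000000000000 is 1.0: the same kind of raw
  // element bits a shuffled floating-point constant-pool vector carries.
  std::printf("%f\n", f32FromBits(0x40000000u));
  std::printf("%f\n", f64FromBits(0x3FF0000000000000ull));
  return 0;
}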
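
Several of the updated tests (constant_fold_pshufb, constant_fold_pshufb_256, constant_fold_vpperm) rely on byte-shuffle semantics when both the input and the mask are constant. The sketch below is illustrative only and is not the LLVM implementation: it applies the 128-bit PSHUFB rule (an index byte with its most significant bit set yields zero, otherwise its low four bits select a source byte) to example constants, and it does not model undef mask elements, which the real combine tracks separately.

#include <array>
#include <cstdint>
#include <cstdio>

// Fold a 128-bit PSHUFB with a constant input and a constant index vector.
static std::array<uint8_t, 16> foldPshufb(const std::array<uint8_t, 16> &Src,
                                          const std::array<uint8_t, 16> &Idx) {
  std::array<uint8_t, 16> Out{};
  for (unsigned i = 0; i != 16; ++i)
    Out[i] = (Idx[i] & 0x80) ? 0 : Src[Idx[i] & 0x0F];
  return Out;
}

int main() {
  // Example source and index bytes (not taken verbatim from the tests).
  std::array<uint8_t, 16> Src;
  for (unsigned i = 0; i != 16; ++i)
    Src[i] = uint8_t(15 - i);
  std::array<uint8_t, 16> Idx = {1, 0x80, 0x80, 0x80, 0, 0, 0x80, 0x80,
                                 15, 0x80, 0x80, 0x80, 0x80, 0x80, 7, 6};
  for (uint8_t B : foldPshufb(Src, Idx))
    std::printf("%u ", unsigned(B));
  std::printf("\n");
  return 0;
}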