This patch adds an initial x86 SimplifyDemandedVectorEltsForTargetNode implementation to handle target shuffles.
Currently the patch only decodes a target shuffle, calls SimplifyDemandedVectorElts on its input operands and removes any shuffle that reduces to undef/zero/identity.
Future work will need to integrate this with combineX86ShufflesRecursively, add support for other x86 ops, etc.
NOTE: There is a minor regression that appears to affect further (extractelement?) combines, which I haven't been able to solve yet — possibly related to how nodes are added to the worklist after simplification.
Differential Revision: https://reviews.llvm.org/D52140
llvm-svn: 342564
{Op}, 0, Op, {0}, {}, /*Depth*/ 1,
/*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget))
return Res;
+
+ // Simplify source operands based on shuffle mask.
+ // TODO - merge this into combineX86ShufflesRecursively.
+ APInt KnownUndef, KnownZero;
+ APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
+ if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, KnownUndef, KnownZero, DCI))
+ return SDValue(N, 0);
}
return SDValue();
}
+// X86 override of TargetLowering::SimplifyDemandedVectorEltsForTargetNode.
+// Decodes a target shuffle, simplifies its input operands based on which
+// result elements are demanded, and replaces shuffles that reduce to
+// undef/zero/identity. Returns true if Op was replaced (via TLO.CombineTo)
+// or an operand was simplified; KnownUndef/KnownZero report result lanes
+// proven undef/zero from the shuffle mask sentinels.
+bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
+ SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero,
+ TargetLoweringOpt &TLO, unsigned Depth) const {
+ int NumElts = DemandedElts.getBitWidth();
+ unsigned Opc = Op.getOpcode();
+ EVT VT = Op.getValueType();
+
+ // Handle special case opcodes.
+ switch (Opc) {
+ case X86ISD::VBROADCAST: {
+ SDValue Src = Op.getOperand(0);
+ MVT SrcVT = Src.getSimpleValueType();
+ // Scalar-source broadcasts are left alone; only vector sources are
+ // simplified here.
+ if (!SrcVT.isVector())
+ return false;
+ APInt SrcUndef, SrcZero;
+ // A broadcast only ever reads element 0 of its source, regardless of
+ // which result elements are demanded.
+ APInt SrcElts = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0);
+ if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
+ Depth + 1))
+ return true;
+ break;
+ }
+ }
+
+ // Simplify target shuffles.
+ if (!isTargetShuffle(Opc))
+ return false;
+
+ // Get target shuffle mask.
+ SmallVector<int, 64> OpMask;
+ SmallVector<SDValue, 2> OpInputs;
+ if (!resolveTargetShuffleInputs(Op, OpInputs, OpMask, TLO.DAG))
+ return false;
+
+ // Shuffle inputs must be the same type as the result.
+ if (llvm::any_of(OpInputs,
+ [VT](SDValue V) { return VT != V.getValueType(); }))
+ return false;
+
+ // Attempt to simplify inputs.
+ int NumSrcs = OpInputs.size();
+ for (int Src = 0; Src != NumSrcs; ++Src) {
+ // Mask indices referencing input Src lie in [Src*NumElts, (Src+1)*NumElts).
+ int Lo = Src * NumElts;
+ // Collect the elements of this input that demanded mask entries read.
+ APInt SrcElts = APInt::getNullValue(NumElts);
+ for (int i = 0; i != NumElts; ++i)
+ if (DemandedElts[i]) {
+ int M = OpMask[i] - Lo;
+ if (0 <= M && M < NumElts)
+ SrcElts.setBit(M);
+ }
+
+ APInt SrcUndef, SrcZero;
+ if (SimplifyDemandedVectorElts(OpInputs[Src], SrcElts, SrcUndef, SrcZero,
+ TLO, Depth + 1))
+ return true;
+ }
+
+ // Check if shuffle mask can be simplified to undef/zero/identity.
+ // Non-demanded lanes are marked undef in our local copy of the mask so the
+ // range checks below only consider lanes that matter.
+ for (int i = 0; i != NumElts; ++i)
+ if (!DemandedElts[i])
+ OpMask[i] = SM_SentinelUndef;
+
+ if (isUndefInRange(OpMask, 0, NumElts)) {
+ KnownUndef.setAllBits();
+ return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
+ }
+ if (isUndefOrZeroInRange(OpMask, 0, NumElts)) {
+ KnownZero.setAllBits();
+ return TLO.CombineTo(
+ Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
+ }
+ // An identity mask for input Src means the shuffle can be replaced by that
+ // input directly.
+ for (int Src = 0; Src != NumSrcs; ++Src)
+ if (isSequentialOrUndefInRange(OpMask, 0, NumElts, Src * NumElts))
+ return TLO.CombineTo(Op, OpInputs[Src]);
+
+ // Extract known zero/undef elements.
+ // TODO - Propagate input undef/zero elts.
+ for (int i = 0; i != NumElts; ++i) {
+ if (OpMask[i] == SM_SentinelUndef)
+ KnownUndef.setBit(i);
+ if (OpMask[i] == SM_SentinelZero)
+ KnownZero.setBit(i);
+ }
+
+ // No replacement was made; only the Known bits were updated.
+ return false;
+}
+
/// Check if a vector extract from a target-specific shuffle of a load can be
/// folded into a single element load.
/// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
const SelectionDAG &DAG,
unsigned Depth) const override;
+ /// Attempt to simplify the demanded vector elements of an x86 target node,
+ /// recording result lanes proven undef/zero in KnownUndef/KnownZero.
+ /// Returns true if the node or one of its operands was simplified.
+ bool SimplifyDemandedVectorEltsForTargetNode(SDValue Op,
+ const APInt &DemandedElts,
+ APInt &KnownUndef,
+ APInt &KnownZero,
+ TargetLoweringOpt &TLO,
+ unsigned Depth) const override;
+
SDValue unwrapAddress(SDValue N) const override;
bool isGAPlusOffset(SDNode *N, const GlobalValue* &GA,
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vmovaps (%eax), %ymm0
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
-; CHECK-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,1,1]
-; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
+; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: psrad $3, %xmm3
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psrad $2, %xmm2
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3]
+; SSE2-NEXT: psrad $2, %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0,3]
; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
; SSE41-NEXT: paddd %xmm0, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm1
+; SSE41-NEXT: psrad $4, %xmm1
; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: psrad $3, %xmm3
-; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1,2,3],xmm3[4,5,6,7]
-; SSE41-NEXT: psrad $4, %xmm2
-; SSE41-NEXT: psrad $2, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3],xmm3[4,5],xmm1[6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3,4,5,6,7]
+; SSE41-NEXT: psrad $2, %xmm3
+; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT: psrad $3, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v4i32:
; AVX1-NEXT: vpsrad $4, %xmm1, %xmm2
; AVX1-NEXT: vpsrad $2, %xmm1, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; AVX1-NEXT: vpsrad $3, %xmm1, %xmm3
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-NEXT: vpsrad $3, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX1-NEXT: retq
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: psrad $3, %xmm4
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: psrad $2, %xmm3
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm4[0,3]
+; SSE2-NEXT: psrad $2, %xmm0
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0,3]
; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psrad $31, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: psrad $3, %xmm4
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1]
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: psrad $2, %xmm3
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm4[0,3]
+; SSE2-NEXT: psrad $2, %xmm2
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0,3]
; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; SSE2-NEXT: movaps %xmm2, %xmm1
; SSE2-NEXT: retq
; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v8i32:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: psrad $31, %xmm0
; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: psrad $31, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: psrld $28, %xmm0
-; SSE41-NEXT: movdqa %xmm3, %xmm4
+; SSE41-NEXT: psrld $28, %xmm3
+; SSE41-NEXT: movdqa %xmm0, %xmm4
; SSE41-NEXT: psrld $30, %xmm4
-; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm0[4,5,6,7]
-; SSE41-NEXT: psrld $29, %xmm3
-; SSE41-NEXT: pxor %xmm5, %xmm5
-; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
-; SSE41-NEXT: paddd %xmm2, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: psrad $3, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
-; SSE41-NEXT: psrad $4, %xmm3
-; SSE41-NEXT: psrad $2, %xmm4
; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7]
+; SSE41-NEXT: psrld $29, %xmm0
+; SSE41-NEXT: pxor %xmm3, %xmm3
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7]
+; SSE41-NEXT: paddd %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm4
+; SSE41-NEXT: psrad $4, %xmm4
+; SSE41-NEXT: movdqa %xmm0, %xmm5
+; SSE41-NEXT: psrad $2, %xmm5
+; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm4[4,5,6,7]
+; SSE41-NEXT: psrad $3, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3,4,5,6,7]
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: psrad $31, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm2
-; SSE41-NEXT: psrld $28, %xmm2
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: psrld $30, %xmm4
-; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm2[4,5,6,7]
-; SSE41-NEXT: psrld $29, %xmm3
-; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
-; SSE41-NEXT: paddd %xmm1, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: movdqa %xmm3, %xmm2
-; SSE41-NEXT: psrad $3, %xmm2
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psrad $31, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm4
+; SSE41-NEXT: psrld $28, %xmm4
+; SSE41-NEXT: movdqa %xmm2, %xmm5
+; SSE41-NEXT: psrld $30, %xmm5
+; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm4[4,5,6,7]
+; SSE41-NEXT: psrld $29, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5],xmm5[6,7]
+; SSE41-NEXT: paddd %xmm1, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm3
; SSE41-NEXT: psrad $4, %xmm3
+; SSE41-NEXT: movdqa %xmm2, %xmm4
; SSE41-NEXT: psrad $2, %xmm4
; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7]
+; SSE41-NEXT: psrad $3, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm6
; SSE2-NEXT: psrad $3, %xmm6
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1]
-; SSE2-NEXT: movdqa %xmm0, %xmm5
-; SSE2-NEXT: psrad $2, %xmm5
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm6[0,3]
+; SSE2-NEXT: psrad $2, %xmm0
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm6[0,3]
; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE2-NEXT: movdqa %xmm4, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm6
; SSE2-NEXT: psrad $3, %xmm6
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1]
-; SSE2-NEXT: movdqa %xmm1, %xmm5
-; SSE2-NEXT: psrad $2, %xmm5
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm6[0,3]
+; SSE2-NEXT: psrad $2, %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0,3]
; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3]
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: psrad $31, %xmm4
; SSE2-NEXT: movdqa %xmm4, %xmm6
; SSE2-NEXT: psrad $3, %xmm6
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1]
-; SSE2-NEXT: movdqa %xmm4, %xmm5
-; SSE2-NEXT: psrad $2, %xmm5
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
-; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm6[0,3]
+; SSE2-NEXT: psrad $2, %xmm4
+; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0,3]
; SSE2-NEXT: movss {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3]
; SSE2-NEXT: movdqa %xmm3, %xmm5
; SSE2-NEXT: psrad $31, %xmm5
; SSE2-NEXT: movdqa %xmm5, %xmm6
; SSE2-NEXT: psrad $3, %xmm6
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm2[1]
-; SSE2-NEXT: movdqa %xmm5, %xmm2
-; SSE2-NEXT: psrad $2, %xmm2
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0]
-; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm6[0,3]
+; SSE2-NEXT: psrad $2, %xmm5
+; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm6[0,3]
; SSE2-NEXT: movss {{.*#+}} xmm5 = xmm3[0],xmm5[1,2,3]
; SSE2-NEXT: movaps %xmm4, %xmm2
; SSE2-NEXT: movaps %xmm5, %xmm3
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm1, %xmm4
; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm0, %xmm6
-; SSE41-NEXT: psrad $31, %xmm6
-; SSE41-NEXT: movdqa %xmm6, %xmm0
-; SSE41-NEXT: psrld $28, %xmm0
-; SSE41-NEXT: movdqa %xmm6, %xmm7
+; SSE41-NEXT: psrad $31, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm5
+; SSE41-NEXT: psrld $28, %xmm5
+; SSE41-NEXT: movdqa %xmm0, %xmm7
; SSE41-NEXT: psrld $30, %xmm7
-; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm0[4,5,6,7]
-; SSE41-NEXT: psrld $29, %xmm6
-; SSE41-NEXT: pxor %xmm5, %xmm5
-; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm5[0,1,2,3],xmm6[4,5,6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3],xmm6[4,5],xmm7[6,7]
-; SSE41-NEXT: paddd %xmm1, %xmm6
-; SSE41-NEXT: movdqa %xmm6, %xmm7
-; SSE41-NEXT: movdqa %xmm6, %xmm0
-; SSE41-NEXT: psrad $3, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm5[4,5,6,7]
+; SSE41-NEXT: psrld $29, %xmm0
+; SSE41-NEXT: pxor %xmm6, %xmm6
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm6[0,1,2,3],xmm0[4,5,6,7]
-; SSE41-NEXT: psrad $4, %xmm6
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3],xmm0[4,5],xmm7[6,7]
+; SSE41-NEXT: paddd %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm5
+; SSE41-NEXT: psrad $4, %xmm5
+; SSE41-NEXT: movdqa %xmm0, %xmm7
; SSE41-NEXT: psrad $2, %xmm7
-; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm6[4,5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm5[4,5,6,7]
+; SSE41-NEXT: psrad $3, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3],xmm0[4,5],xmm7[6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
-; SSE41-NEXT: movdqa %xmm4, %xmm6
-; SSE41-NEXT: psrad $31, %xmm6
-; SSE41-NEXT: movdqa %xmm6, %xmm1
-; SSE41-NEXT: psrld $28, %xmm1
-; SSE41-NEXT: movdqa %xmm6, %xmm7
+; SSE41-NEXT: movdqa %xmm4, %xmm1
+; SSE41-NEXT: psrad $31, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm5
+; SSE41-NEXT: psrld $28, %xmm5
+; SSE41-NEXT: movdqa %xmm1, %xmm7
; SSE41-NEXT: psrld $30, %xmm7
-; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm1[4,5,6,7]
-; SSE41-NEXT: psrld $29, %xmm6
-; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm5[0,1,2,3],xmm6[4,5,6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3],xmm6[4,5],xmm7[6,7]
-; SSE41-NEXT: paddd %xmm4, %xmm6
-; SSE41-NEXT: movdqa %xmm6, %xmm7
-; SSE41-NEXT: movdqa %xmm6, %xmm1
-; SSE41-NEXT: psrad $3, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm5[4,5,6,7]
+; SSE41-NEXT: psrld $29, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm6[0,1,2,3],xmm1[4,5,6,7]
-; SSE41-NEXT: psrad $4, %xmm6
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3],xmm1[4,5],xmm7[6,7]
+; SSE41-NEXT: paddd %xmm4, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm5
+; SSE41-NEXT: psrad $4, %xmm5
+; SSE41-NEXT: movdqa %xmm1, %xmm7
; SSE41-NEXT: psrad $2, %xmm7
-; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm6[4,5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm5[4,5,6,7]
+; SSE41-NEXT: psrad $3, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3],xmm1[4,5],xmm7[6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3,4,5,6,7]
-; SSE41-NEXT: movdqa %xmm2, %xmm6
-; SSE41-NEXT: psrad $31, %xmm6
-; SSE41-NEXT: movdqa %xmm6, %xmm4
-; SSE41-NEXT: psrld $28, %xmm4
-; SSE41-NEXT: movdqa %xmm6, %xmm7
+; SSE41-NEXT: movdqa %xmm2, %xmm4
+; SSE41-NEXT: psrad $31, %xmm4
+; SSE41-NEXT: movdqa %xmm4, %xmm5
+; SSE41-NEXT: psrld $28, %xmm5
+; SSE41-NEXT: movdqa %xmm4, %xmm7
; SSE41-NEXT: psrld $30, %xmm7
-; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm4[4,5,6,7]
-; SSE41-NEXT: psrld $29, %xmm6
-; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm5[0,1,2,3],xmm6[4,5,6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3],xmm6[4,5],xmm7[6,7]
-; SSE41-NEXT: paddd %xmm2, %xmm6
-; SSE41-NEXT: movdqa %xmm6, %xmm7
-; SSE41-NEXT: movdqa %xmm6, %xmm4
-; SSE41-NEXT: psrad $3, %xmm4
+; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm5[4,5,6,7]
+; SSE41-NEXT: psrld $29, %xmm4
; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4,5,6,7]
-; SSE41-NEXT: psrad $4, %xmm6
+; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2,3],xmm4[4,5],xmm7[6,7]
+; SSE41-NEXT: paddd %xmm2, %xmm4
+; SSE41-NEXT: movdqa %xmm4, %xmm5
+; SSE41-NEXT: psrad $4, %xmm5
+; SSE41-NEXT: movdqa %xmm4, %xmm7
; SSE41-NEXT: psrad $2, %xmm7
-; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm6[4,5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm5[4,5,6,7]
+; SSE41-NEXT: psrad $3, %xmm4
; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2,3],xmm4[4,5],xmm7[6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm2[0,1],xmm4[2,3,4,5,6,7]
-; SSE41-NEXT: movdqa %xmm3, %xmm2
-; SSE41-NEXT: psrad $31, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm6
-; SSE41-NEXT: psrld $28, %xmm6
-; SSE41-NEXT: movdqa %xmm2, %xmm7
+; SSE41-NEXT: movdqa %xmm3, %xmm5
+; SSE41-NEXT: psrad $31, %xmm5
+; SSE41-NEXT: movdqa %xmm5, %xmm2
+; SSE41-NEXT: psrld $28, %xmm2
+; SSE41-NEXT: movdqa %xmm5, %xmm7
; SSE41-NEXT: psrld $30, %xmm7
-; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm6[4,5,6,7]
-; SSE41-NEXT: psrld $29, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4,5,6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm7[2,3],xmm2[4,5],xmm7[6,7]
-; SSE41-NEXT: paddd %xmm3, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm6
-; SSE41-NEXT: movdqa %xmm2, %xmm5
-; SSE41-NEXT: psrad $3, %xmm5
-; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm2[0,1,2,3],xmm5[4,5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm2[4,5,6,7]
+; SSE41-NEXT: psrld $29, %xmm5
+; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,3],xmm5[4,5],xmm7[6,7]
+; SSE41-NEXT: paddd %xmm3, %xmm5
+; SSE41-NEXT: movdqa %xmm5, %xmm2
; SSE41-NEXT: psrad $4, %xmm2
+; SSE41-NEXT: movdqa %xmm5, %xmm6
; SSE41-NEXT: psrad $2, %xmm6
; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm2[4,5,6,7]
+; SSE41-NEXT: psrad $3, %xmm5
; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5],xmm6[6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1],xmm5[2,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm4, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: psrad $3, %xmm3
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psrad $2, %xmm2
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm3[0,3]
+; SSE2-NEXT: psrad $2, %xmm0
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0,3]
; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: psubd %xmm0, %xmm1
;
; SSE41-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg:
; SSE41: # %bb.0:
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psrad $31, %xmm0
; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psrad $31, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm1
-; SSE41-NEXT: psrld $28, %xmm1
-; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psrld $28, %xmm2
+; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: psrld $30, %xmm3
-; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7]
-; SSE41-NEXT: psrld $29, %xmm2
-; SSE41-NEXT: pxor %xmm4, %xmm4
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
-; SSE41-NEXT: paddd %xmm0, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: movdqa %xmm2, %xmm1
-; SSE41-NEXT: psrad $3, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
-; SSE41-NEXT: psrad $4, %xmm2
-; SSE41-NEXT: psrad $2, %xmm3
; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; SSE41-NEXT: psubd %xmm1, %xmm4
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7]
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: psrld $29, %xmm0
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
+; SSE41-NEXT: paddd %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: psrad $4, %xmm3
+; SSE41-NEXT: movdqa %xmm0, %xmm4
+; SSE41-NEXT: psrad $2, %xmm4
+; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7]
+; SSE41-NEXT: psrad $3, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
+; SSE41-NEXT: psubd %xmm0, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg:
; AVX1-NEXT: vpsrad $4, %xmm1, %xmm2
; AVX1-NEXT: vpsrad $2, %xmm1, %xmm4
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
-; AVX1-NEXT: vpsrad $3, %xmm1, %xmm4
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7]
+; AVX1-NEXT: vpsrad $3, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX1-NEXT: vpsubd %xmm0, %xmm3, %xmm1
; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
; SSE-NEXT: paddd %xmm0, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm2
+; SSE-NEXT: psrad $3, %xmm2
; SSE-NEXT: movdqa %xmm1, %xmm3
-; SSE-NEXT: psrad $2, %xmm3
-; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm1[0,1,2,3],xmm3[4,5,6,7]
-; SSE-NEXT: psrad $3, %xmm1
-; SSE-NEXT: psrad $1, %xmm2
-; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
-; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
-; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm0[0,1],xmm3[2,3,4,5,6,7]
-; SSE-NEXT: pmulld {{.*}}(%rip), %xmm3
-; SSE-NEXT: psubd %xmm3, %xmm0
+; SSE-NEXT: psrad $1, %xmm3
+; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; SSE-NEXT: psrad $2, %xmm1
+; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; SSE-NEXT: pmulld {{.*}}(%rip), %xmm1
+; SSE-NEXT: psubd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: combine_vec_srem_by_pow2b:
; AVX1-NEXT: vpsrad $3, %xmm1, %xmm2
; AVX1-NEXT: vpsrad $1, %xmm1, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; AVX1-NEXT: vpsrad $2, %xmm1, %xmm3
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-NEXT: vpsrad $2, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: imull {{[0-9]+}}(%esp), %ecx
; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movzwl 4(%eax,%ecx), %edx
; X32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-NEXT: movd %edx, %xmm1
-; X32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X32-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7]
; X32-NEXT: movd %xmm0, %eax
; X32-NEXT: retl
; X64-NEXT: shlq $32, %rcx
; X64-NEXT: orq %rax, %rcx
; X64-NEXT: movq %rcx, %xmm0
-; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7]
; X64-NEXT: movd %xmm0, %eax
; X64-NEXT: retq
entry:
;
; X64-SSSE3-LABEL: t3:
; X64-SSSE3: # %bb.0: # %bb
-; X64-SSSE3-NEXT: movddup {{.*#+}} xmm0 = mem[0,0]
-; X64-SSSE3-NEXT: movlpd %xmm0, (%rax)
+; X64-SSSE3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X64-SSSE3-NEXT: movsd %xmm0, (%rax)
; X64-SSSE3-NEXT: retq
;
; X64-AVX-LABEL: t3:
; X64-AVX: # %bb.0: # %bb
-; X64-AVX-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
-; X64-AVX-NEXT: vmovlpd %xmm0, (%rax)
+; X64-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X64-AVX-NEXT: vmovsd %xmm0, (%rax)
; X64-AVX-NEXT: retq
bb:
%tmp13 = load <2 x double>, <2 x double>* %a0, align 1
;
; X64-SSSE3-LABEL: t4:
; X64-SSSE3: # %bb.0:
-; X64-SSSE3-NEXT: movq (%rdi), %rax
+; X64-SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X64-SSSE3-NEXT: movq %xmm0, %rax
; X64-SSSE3-NEXT: retq
;
; X64-AVX-LABEL: t4:
define float @extract_lane_insertps_6123(<4 x float> %a0, <4 x float> *%p1) {
; SSE-LABEL: extract_lane_insertps_6123:
; SSE: # %bb.0:
-; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movaps (%rdi), %xmm0
+; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT: retq
;
; AVX-LABEL: extract_lane_insertps_6123:
; AVX: # %bb.0:
-; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = mem[1,0]
; AVX-NEXT: retq
%a1 = load <4 x float>, <4 x float> *%p1
%res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 128)
define void @interleave_24i32_out(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2, <8 x i32>* %q3) nounwind {
; SSE2-LABEL: interleave_24i32_out:
; SSE2: # %bb.0:
+; SSE2-NEXT: movdqu 64(%rdi), %xmm10
; SSE2-NEXT: movups 80(%rdi), %xmm8
-; SSE2-NEXT: movups 64(%rdi), %xmm11
-; SSE2-NEXT: movups (%rdi), %xmm0
-; SSE2-NEXT: movups 16(%rdi), %xmm10
-; SSE2-NEXT: movups 32(%rdi), %xmm9
-; SSE2-NEXT: movdqu 48(%rdi), %xmm1
-; SSE2-NEXT: movaps %xmm0, %xmm6
-; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,3],xmm10[2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,0,1]
-; SSE2-NEXT: movaps %xmm9, %xmm12
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm10[1,1,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1]
-; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm9[0,3]
-; SSE2-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,0],xmm6[2,0]
-; SSE2-NEXT: movaps %xmm0, %xmm3
-; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm9[2,0]
-; SSE2-NEXT: movdqa %xmm1, %xmm6
-; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm11[2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm11[1,1,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
+; SSE2-NEXT: movdqu (%rdi), %xmm0
+; SSE2-NEXT: movdqu 16(%rdi), %xmm11
+; SSE2-NEXT: movups 32(%rdi), %xmm5
+; SSE2-NEXT: movdqu 48(%rdi), %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[2,3,0,1]
+; SSE2-NEXT: movaps %xmm5, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm11[1,1,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
+; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm5[0,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm11[2,3]
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm5[2,0]
; SSE2-NEXT: movaps %xmm8, %xmm5
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm8[0,3]
-; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm1[2,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,3],xmm8[2,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm11[0,0]
-; SSE2-NEXT: movaps %xmm4, %xmm1
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm11[3,3]
-; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm1[2,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm5[2,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm10[0,0]
-; SSE2-NEXT: movaps %xmm0, %xmm1
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm10[3,3]
-; SSE2-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm1[2,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm12[2,0]
-; SSE2-NEXT: movups %xmm6, 16(%rsi)
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm10[1,1,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,3,0,1]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm8[0,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm10[2,3]
+; SSE2-NEXT: movdqa %xmm9, %xmm2
+; SSE2-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,3],xmm8[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm10[0,0]
+; SSE2-NEXT: movaps %xmm2, %xmm4
+; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm10[3,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm4[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm5[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm11[0,0]
+; SSE2-NEXT: movaps %xmm0, %xmm4
+; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm11[3,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm4[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm7[2,0]
+; SSE2-NEXT: movups %xmm9, 16(%rsi)
; SSE2-NEXT: movups %xmm3, (%rsi)
-; SSE2-NEXT: movups %xmm4, 16(%rdx)
+; SSE2-NEXT: movups %xmm2, 16(%rdx)
; SSE2-NEXT: movups %xmm0, (%rdx)
-; SSE2-NEXT: movups %xmm2, 16(%rcx)
-; SSE2-NEXT: movups %xmm7, (%rcx)
+; SSE2-NEXT: movups %xmm1, 16(%rcx)
+; SSE2-NEXT: movups %xmm6, (%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: interleave_24i32_out:
; CHECK-LABEL: pr26070:
; CHECK: ## %bb.0:
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; CHECK-NEXT: retq
%c = call float @copysignf(float 1.0, float undef) readnone
ret float %c
; CHECK-NEXT: vmovaps 48(%rbp), %ymm14
; CHECK-NEXT: vmovaps 16(%rbp), %ymm15
; CHECK-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7]
-; CHECK-NEXT: vxorps %xmm6, %xmm6, %xmm6
-; CHECK-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1,2,3],ymm11[4,5],ymm6[6,7]
-; CHECK-NEXT: # kill: def $xmm9 killed $xmm9 killed $ymm9
+; CHECK-NEXT: vmovaps %xmm9, %xmm6
+; CHECK-NEXT: vmovdqa %xmm6, %xmm9
+; CHECK-NEXT: # kill: def $ymm9 killed $xmm9
; CHECK-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; CHECK-NEXT: vmovdqa %xmm9, %xmm0
-; CHECK-NEXT: # kill: def $ymm0 killed $xmm0
-; CHECK-NEXT: vpalignr {{.*#+}} ymm11 = ymm2[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
-; CHECK-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,0]
+; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: # implicit-def: $ymm0
-; CHECK-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm0
+; CHECK-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0
+; CHECK-NEXT: vpalignr {{.*#+}} ymm11 = ymm2[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
+; CHECK-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,0]
; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5],ymm11[6,7]
-; CHECK-NEXT: vmovaps %xmm2, %xmm9
+; CHECK-NEXT: vmovaps %xmm2, %xmm6
; CHECK-NEXT: # implicit-def: $ymm2
-; CHECK-NEXT: vinserti128 $1, %xmm9, %ymm2, %ymm2
-; CHECK-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7]
-; CHECK-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3]
-; CHECK-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7]
-; CHECK-NEXT: vmovaps %xmm7, %xmm9
-; CHECK-NEXT: vpslldq {{.*#+}} xmm9 = zero,zero,zero,zero,zero,zero,zero,zero,xmm9[0,1,2,3,4,5,6,7]
-; CHECK-NEXT: # implicit-def: $ymm6
-; CHECK-NEXT: vmovaps %xmm9, %xmm6
+; CHECK-NEXT: vinserti128 $1, %xmm6, %ymm2, %ymm2
; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; CHECK-NEXT: vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
-; CHECK-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,3]
-; CHECK-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm11[4,5,6,7]
+; CHECK-NEXT: vpblendd {{.*#+}} ymm11 = ymm7[0,1],ymm11[2,3],ymm7[4,5],ymm11[6,7]
+; CHECK-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,2,3]
+; CHECK-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7]
+; CHECK-NEXT: vmovaps %xmm7, %xmm6
+; CHECK-NEXT: vpslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,xmm6[0,1,2,3,4,5,6,7]
+; CHECK-NEXT: # implicit-def: $ymm11
+; CHECK-NEXT: vmovaps %xmm6, %xmm11
+; CHECK-NEXT: vpalignr {{.*#+}} ymm9 = ymm9[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
+; CHECK-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,3]
+; CHECK-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7]
; CHECK-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3],ymm7[4,5,6,7]
; CHECK-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,1,3]
; CHECK-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,1,0,1,4,5,4,5]
; CHECK-NEXT: vmovaps %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: vmovaps %ymm5, %ymm1
; CHECK-NEXT: vmovaps %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; CHECK-NEXT: vmovaps %ymm6, %ymm3
+; CHECK-NEXT: vmovaps %ymm9, %ymm3
; CHECK-NEXT: vmovaps %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; CHECK-NEXT: vmovaps %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: vmovaps %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: vmovaps %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: vmovaps %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: vmovaps %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: vmovaps %ymm14, (%rsp) # 32-byte Spill
; CHECK-NEXT: movq %rbp, %rsp
;
; AVX2-LABEL: trunc_shl_16_v8i16_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[28,29]
-; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX2-NEXT: retq
%shl = shl <8 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
%conv = trunc <8 x i32> %shl to <8 x i16>
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7]
; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2
-; CHECK-SSE41-NEXT: psrld $2, %xmm2
-; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm3
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; CHECK-SSE41-NEXT: psrld $3, %xmm1
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; CHECK-SSE41-NEXT: psrld $3, %xmm2
+; CHECK-SSE41-NEXT: psrld $2, %xmm1
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1
; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0
; CHECK-AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm2
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3],xmm2[4,5,6,7]
-; CHECK-AVX1-NEXT: vpsrld $3, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; CHECK-AVX1-NEXT: vpsrld $3, %xmm1, %xmm2
+; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; X32: # %bb.0:
; X32-NEXT: pushl %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movddup {{.*#+}} xmm0 = mem[0,0]
+; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT: movss %xmm0, (%esp)
; X32-NEXT: flds (%esp)
; X32-NEXT: popl %eax
;
; X64-LABEL: t2:
; X64: # %bb.0:
-; X64-NEXT: movddup {{.*#+}} xmm0 = mem[0,0]
+; X64-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT: retq
%X = load <4 x float>, <4 x float>* %P1
%tmp = extractelement <4 x float> %X, i32 2
; X32-LABEL: t1:
; X32: # %bb.0:
; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
; X32-NEXT: movaps %xmm0, %xmm2
; X32-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2]
; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
define <3 x double> @constrained_vector_frem_v3f64() {
; NO-FMA-LABEL: constrained_vector_frem_v3f64:
; NO-FMA: # %bb.0: # %entry
-; NO-FMA-NEXT: subq $56, %rsp
-; NO-FMA-NEXT: .cfi_def_cfa_offset 64
+; NO-FMA-NEXT: subq $24, %rsp
+; NO-FMA-NEXT: .cfi_def_cfa_offset 32
; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; NO-FMA-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; NO-FMA-NEXT: callq fmod
-; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; NO-FMA-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; NO-FMA-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; NO-FMA-NEXT: callq fmod
-; NO-FMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; NO-FMA-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill
; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; NO-FMA-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; NO-FMA-NEXT: callq fmod
; NO-FMA-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
; NO-FMA-NEXT: fldl {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; NO-FMA-NEXT: addq $56, %rsp
+; NO-FMA-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
+; NO-FMA-NEXT: # xmm0 = mem[0],zero
+; NO-FMA-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload
+; NO-FMA-NEXT: # xmm1 = mem[0],zero
+; NO-FMA-NEXT: addq $24, %rsp
; NO-FMA-NEXT: .cfi_def_cfa_offset 8
; NO-FMA-NEXT: retq
;
define <3 x double> @constrained_vector_fma_v3f64() {
; NO-FMA-LABEL: constrained_vector_fma_v3f64:
; NO-FMA: # %bb.0: # %entry
-; NO-FMA-NEXT: subq $56, %rsp
-; NO-FMA-NEXT: .cfi_def_cfa_offset 64
+; NO-FMA-NEXT: subq $24, %rsp
+; NO-FMA-NEXT: .cfi_def_cfa_offset 32
; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; NO-FMA-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; NO-FMA-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
; NO-FMA-NEXT: callq fma
-; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; NO-FMA-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; NO-FMA-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; NO-FMA-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
; NO-FMA-NEXT: callq fma
-; NO-FMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; NO-FMA-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill
; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; NO-FMA-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; NO-FMA-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
; NO-FMA-NEXT: callq fma
; NO-FMA-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
; NO-FMA-NEXT: fldl {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; NO-FMA-NEXT: addq $56, %rsp
+; NO-FMA-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
+; NO-FMA-NEXT: # xmm0 = mem[0],zero
+; NO-FMA-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload
+; NO-FMA-NEXT: # xmm1 = mem[0],zero
+; NO-FMA-NEXT: addq $24, %rsp
; NO-FMA-NEXT: .cfi_def_cfa_offset 8
; NO-FMA-NEXT: retq
;
define <3 x double> @constrained_vector_pow_v3f64() {
; NO-FMA-LABEL: constrained_vector_pow_v3f64:
; NO-FMA: # %bb.0: # %entry
-; NO-FMA-NEXT: subq $56, %rsp
-; NO-FMA-NEXT: .cfi_def_cfa_offset 64
+; NO-FMA-NEXT: subq $24, %rsp
+; NO-FMA-NEXT: .cfi_def_cfa_offset 32
; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; NO-FMA-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; NO-FMA-NEXT: callq pow
-; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; NO-FMA-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; NO-FMA-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; NO-FMA-NEXT: callq pow
-; NO-FMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; NO-FMA-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill
; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; NO-FMA-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; NO-FMA-NEXT: callq pow
; NO-FMA-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
; NO-FMA-NEXT: fldl {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; NO-FMA-NEXT: addq $56, %rsp
+; NO-FMA-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
+; NO-FMA-NEXT: # xmm0 = mem[0],zero
+; NO-FMA-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload
+; NO-FMA-NEXT: # xmm1 = mem[0],zero
+; NO-FMA-NEXT: addq $24, %rsp
; NO-FMA-NEXT: .cfi_def_cfa_offset 8
; NO-FMA-NEXT: retq
;
define <3 x double> @constrained_vector_powi_v3f64() {
; NO-FMA-LABEL: constrained_vector_powi_v3f64:
; NO-FMA: # %bb.0: # %entry
-; NO-FMA-NEXT: subq $56, %rsp
-; NO-FMA-NEXT: .cfi_def_cfa_offset 64
+; NO-FMA-NEXT: subq $24, %rsp
+; NO-FMA-NEXT: .cfi_def_cfa_offset 32
; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; NO-FMA-NEXT: movl $3, %edi
; NO-FMA-NEXT: callq __powidf2
-; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; NO-FMA-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; NO-FMA-NEXT: movl $3, %edi
; NO-FMA-NEXT: callq __powidf2
-; NO-FMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; NO-FMA-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill
; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; NO-FMA-NEXT: movl $3, %edi
; NO-FMA-NEXT: callq __powidf2
; NO-FMA-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
; NO-FMA-NEXT: fldl {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; NO-FMA-NEXT: addq $56, %rsp
+; NO-FMA-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
+; NO-FMA-NEXT: # xmm0 = mem[0],zero
+; NO-FMA-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload
+; NO-FMA-NEXT: # xmm1 = mem[0],zero
+; NO-FMA-NEXT: addq $24, %rsp
; NO-FMA-NEXT: .cfi_def_cfa_offset 8
; NO-FMA-NEXT: retq
;
define <3 x double> @constrained_vector_sin_v3f64() {
; NO-FMA-LABEL: constrained_vector_sin_v3f64:
; NO-FMA: # %bb.0: # %entry
-; NO-FMA-NEXT: subq $56, %rsp
-; NO-FMA-NEXT: .cfi_def_cfa_offset 64
+; NO-FMA-NEXT: subq $24, %rsp
+; NO-FMA-NEXT: .cfi_def_cfa_offset 32
; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; NO-FMA-NEXT: callq sin
-; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; NO-FMA-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; NO-FMA-NEXT: callq sin
-; NO-FMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; NO-FMA-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill
; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; NO-FMA-NEXT: callq sin
; NO-FMA-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
; NO-FMA-NEXT: fldl {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; NO-FMA-NEXT: addq $56, %rsp
+; NO-FMA-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
+; NO-FMA-NEXT: # xmm0 = mem[0],zero
+; NO-FMA-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload
+; NO-FMA-NEXT: # xmm1 = mem[0],zero
+; NO-FMA-NEXT: addq $24, %rsp
; NO-FMA-NEXT: .cfi_def_cfa_offset 8
; NO-FMA-NEXT: retq
;
define <3 x double> @constrained_vector_cos_v3f64() {
; NO-FMA-LABEL: constrained_vector_cos_v3f64:
; NO-FMA: # %bb.0: # %entry
-; NO-FMA-NEXT: subq $56, %rsp
-; NO-FMA-NEXT: .cfi_def_cfa_offset 64
+; NO-FMA-NEXT: subq $24, %rsp
+; NO-FMA-NEXT: .cfi_def_cfa_offset 32
; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; NO-FMA-NEXT: callq cos
-; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; NO-FMA-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; NO-FMA-NEXT: callq cos
-; NO-FMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; NO-FMA-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill
; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; NO-FMA-NEXT: callq cos
; NO-FMA-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
; NO-FMA-NEXT: fldl {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; NO-FMA-NEXT: addq $56, %rsp
+; NO-FMA-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
+; NO-FMA-NEXT: # xmm0 = mem[0],zero
+; NO-FMA-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload
+; NO-FMA-NEXT: # xmm1 = mem[0],zero
+; NO-FMA-NEXT: addq $24, %rsp
; NO-FMA-NEXT: .cfi_def_cfa_offset 8
; NO-FMA-NEXT: retq
;
define <3 x double> @constrained_vector_exp_v3f64() {
; NO-FMA-LABEL: constrained_vector_exp_v3f64:
; NO-FMA: # %bb.0: # %entry
-; NO-FMA-NEXT: subq $56, %rsp
-; NO-FMA-NEXT: .cfi_def_cfa_offset 64
+; NO-FMA-NEXT: subq $24, %rsp
+; NO-FMA-NEXT: .cfi_def_cfa_offset 32
; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; NO-FMA-NEXT: callq exp
-; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; NO-FMA-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; NO-FMA-NEXT: callq exp
-; NO-FMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; NO-FMA-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill
; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; NO-FMA-NEXT: callq exp
; NO-FMA-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
; NO-FMA-NEXT: fldl {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; NO-FMA-NEXT: addq $56, %rsp
+; NO-FMA-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
+; NO-FMA-NEXT: # xmm0 = mem[0],zero
+; NO-FMA-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload
+; NO-FMA-NEXT: # xmm1 = mem[0],zero
+; NO-FMA-NEXT: addq $24, %rsp
; NO-FMA-NEXT: .cfi_def_cfa_offset 8
; NO-FMA-NEXT: retq
;
define <3 x double> @constrained_vector_exp2_v3f64() {
; NO-FMA-LABEL: constrained_vector_exp2_v3f64:
; NO-FMA: # %bb.0: # %entry
-; NO-FMA-NEXT: subq $56, %rsp
-; NO-FMA-NEXT: .cfi_def_cfa_offset 64
+; NO-FMA-NEXT: subq $24, %rsp
+; NO-FMA-NEXT: .cfi_def_cfa_offset 32
; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; NO-FMA-NEXT: callq exp2
-; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; NO-FMA-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; NO-FMA-NEXT: callq exp2
-; NO-FMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; NO-FMA-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill
; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; NO-FMA-NEXT: callq exp2
; NO-FMA-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
; NO-FMA-NEXT: fldl {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; NO-FMA-NEXT: addq $56, %rsp
+; NO-FMA-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
+; NO-FMA-NEXT: # xmm0 = mem[0],zero
+; NO-FMA-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload
+; NO-FMA-NEXT: # xmm1 = mem[0],zero
+; NO-FMA-NEXT: addq $24, %rsp
; NO-FMA-NEXT: .cfi_def_cfa_offset 8
; NO-FMA-NEXT: retq
;
define <3 x double> @constrained_vector_log_v3f64() {
; NO-FMA-LABEL: constrained_vector_log_v3f64:
; NO-FMA: # %bb.0: # %entry
-; NO-FMA-NEXT: subq $56, %rsp
-; NO-FMA-NEXT: .cfi_def_cfa_offset 64
+; NO-FMA-NEXT: subq $24, %rsp
+; NO-FMA-NEXT: .cfi_def_cfa_offset 32
; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; NO-FMA-NEXT: callq log
-; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; NO-FMA-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; NO-FMA-NEXT: callq log
-; NO-FMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; NO-FMA-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill
; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; NO-FMA-NEXT: callq log
; NO-FMA-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
; NO-FMA-NEXT: fldl {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; NO-FMA-NEXT: addq $56, %rsp
+; NO-FMA-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
+; NO-FMA-NEXT: # xmm0 = mem[0],zero
+; NO-FMA-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload
+; NO-FMA-NEXT: # xmm1 = mem[0],zero
+; NO-FMA-NEXT: addq $24, %rsp
; NO-FMA-NEXT: .cfi_def_cfa_offset 8
; NO-FMA-NEXT: retq
;
define <3 x double> @constrained_vector_log10_v3f64() {
; NO-FMA-LABEL: constrained_vector_log10_v3f64:
; NO-FMA: # %bb.0: # %entry
-; NO-FMA-NEXT: subq $56, %rsp
-; NO-FMA-NEXT: .cfi_def_cfa_offset 64
+; NO-FMA-NEXT: subq $24, %rsp
+; NO-FMA-NEXT: .cfi_def_cfa_offset 32
; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; NO-FMA-NEXT: callq log10
-; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; NO-FMA-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; NO-FMA-NEXT: callq log10
-; NO-FMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; NO-FMA-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill
; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; NO-FMA-NEXT: callq log10
; NO-FMA-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
; NO-FMA-NEXT: fldl {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; NO-FMA-NEXT: addq $56, %rsp
+; NO-FMA-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
+; NO-FMA-NEXT: # xmm0 = mem[0],zero
+; NO-FMA-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload
+; NO-FMA-NEXT: # xmm1 = mem[0],zero
+; NO-FMA-NEXT: addq $24, %rsp
; NO-FMA-NEXT: .cfi_def_cfa_offset 8
; NO-FMA-NEXT: retq
;
define <3 x double> @constrained_vector_log2_v3f64() {
; NO-FMA-LABEL: constrained_vector_log2_v3f64:
; NO-FMA: # %bb.0: # %entry
-; NO-FMA-NEXT: subq $56, %rsp
-; NO-FMA-NEXT: .cfi_def_cfa_offset 64
+; NO-FMA-NEXT: subq $24, %rsp
+; NO-FMA-NEXT: .cfi_def_cfa_offset 32
; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; NO-FMA-NEXT: callq log2
-; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; NO-FMA-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; NO-FMA-NEXT: callq log2
-; NO-FMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; NO-FMA-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill
; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; NO-FMA-NEXT: callq log2
; NO-FMA-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
; NO-FMA-NEXT: fldl {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; NO-FMA-NEXT: addq $56, %rsp
+; NO-FMA-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
+; NO-FMA-NEXT: # xmm0 = mem[0],zero
+; NO-FMA-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload
+; NO-FMA-NEXT: # xmm1 = mem[0],zero
+; NO-FMA-NEXT: addq $24, %rsp
; NO-FMA-NEXT: .cfi_def_cfa_offset 8
; NO-FMA-NEXT: retq
;
define <3 x double> @constrained_vector_rint_v3f64() {
; NO-FMA-LABEL: constrained_vector_rint_v3f64:
; NO-FMA: # %bb.0: # %entry
-; NO-FMA-NEXT: subq $56, %rsp
-; NO-FMA-NEXT: .cfi_def_cfa_offset 64
+; NO-FMA-NEXT: subq $24, %rsp
+; NO-FMA-NEXT: .cfi_def_cfa_offset 32
; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; NO-FMA-NEXT: callq rint
-; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; NO-FMA-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; NO-FMA-NEXT: callq rint
-; NO-FMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; NO-FMA-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill
; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; NO-FMA-NEXT: callq rint
; NO-FMA-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
; NO-FMA-NEXT: fldl {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; NO-FMA-NEXT: addq $56, %rsp
+; NO-FMA-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
+; NO-FMA-NEXT: # xmm0 = mem[0],zero
+; NO-FMA-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload
+; NO-FMA-NEXT: # xmm1 = mem[0],zero
+; NO-FMA-NEXT: addq $24, %rsp
; NO-FMA-NEXT: .cfi_def_cfa_offset 8
; NO-FMA-NEXT: retq
;
define <3 x double> @constrained_vector_nearby_v3f64() {
; NO-FMA-LABEL: constrained_vector_nearby_v3f64:
; NO-FMA: # %bb.0: # %entry
-; NO-FMA-NEXT: subq $56, %rsp
-; NO-FMA-NEXT: .cfi_def_cfa_offset 64
+; NO-FMA-NEXT: subq $24, %rsp
+; NO-FMA-NEXT: .cfi_def_cfa_offset 32
; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; NO-FMA-NEXT: callq nearbyint
-; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; NO-FMA-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; NO-FMA-NEXT: callq nearbyint
-; NO-FMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; NO-FMA-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill
; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; NO-FMA-NEXT: callq nearbyint
; NO-FMA-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
; NO-FMA-NEXT: fldl {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; NO-FMA-NEXT: addq $56, %rsp
+; NO-FMA-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
+; NO-FMA-NEXT: # xmm0 = mem[0],zero
+; NO-FMA-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload
+; NO-FMA-NEXT: # xmm1 = mem[0],zero
+; NO-FMA-NEXT: addq $24, %rsp
; NO-FMA-NEXT: .cfi_def_cfa_offset 8
; NO-FMA-NEXT: retq
;
; SSE2-NEXT: xorps %xmm2, %xmm2
; SSE2-NEXT: xorps %xmm3, %xmm3
; SSE2-NEXT: movss {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: pslld %xmm3, %xmm4
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [32,32,32,32]
; X32-SSE-NEXT: xorps %xmm2, %xmm2
; X32-SSE-NEXT: xorps %xmm3, %xmm3
; X32-SSE-NEXT: movss {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; X32-SSE-NEXT: movdqa %xmm0, %xmm4
; X32-SSE-NEXT: pslld %xmm3, %xmm4
; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [32,32,32,32]
define <8 x i16> @splatvar_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; SSE2-LABEL: splatvar_rotate_v8i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,0,2,3,4,5,6,7]
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16]
+; SSE2-NEXT: psubw %xmm1, %xmm2
; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: psllw %xmm1, %xmm3
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16]
-; SSE2-NEXT: psubw %xmm2, %xmm1
-; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
-; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE2-NEXT: psrlw %xmm1, %xmm0
+; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1]
+; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: psrlw %xmm2, %xmm0
; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: retq
;
;
; X32-SSE-LABEL: splatvar_rotate_v8i16:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,0,2,3,4,5,6,7]
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16]
+; X32-SSE-NEXT: psubw %xmm1, %xmm2
; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
; X32-SSE-NEXT: movdqa %xmm0, %xmm3
; X32-SSE-NEXT: psllw %xmm1, %xmm3
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16]
-; X32-SSE-NEXT: psubw %xmm2, %xmm1
-; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
-; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X32-SSE-NEXT: psrlw %xmm1, %xmm0
+; X32-SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1]
+; X32-SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-SSE-NEXT: psrlw %xmm2, %xmm0
; X32-SSE-NEXT: por %xmm3, %xmm0
; X32-SSE-NEXT: retl
%splat = shufflevector <8 x i16> %b, <8 x i16> undef, <8 x i32> zeroinitializer
; SSE2: # %bb.0:
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
; SSE2-NEXT: xorps %xmm2, %xmm2
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[0,0]
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,0]
; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: retq
; SSE3: # %bb.0:
; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
; SSE3-NEXT: xorps %xmm2, %xmm2
-; SSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,0]
+; SSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[0,0]
; SSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,0]
; SSE3-NEXT: movaps %xmm2, %xmm0
; SSE3-NEXT: retq
; SSSE3: # %bb.0:
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
; SSSE3-NEXT: xorps %xmm2, %xmm2
-; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,0]
+; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[0,0]
; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,0]
; SSSE3-NEXT: movaps %xmm2, %xmm0
; SSSE3-NEXT: retq
; AVX2-SLOW-LABEL: PR32160:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpbroadcastd %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-FAST-LABEL: PR32160:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5]
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq