From 1584e55a2602cd9fe0db059b06a217822ffac7cd Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 4 Mar 2021 10:44:23 +0000 Subject: [PATCH] [X86] canonicalizeShuffleWithBinOps - handle general unaryshuffle(binop(x,c)) patterns not just xor(x,-1) Generalize the shuffle(not(x)) -> not(shuffle(x)) fold to handle any binop with 0/-1. Hopefully we can further generalize to help push target unary/binary shuffles through binops similar to what we do in DAGCombiner::visitVECTOR_SHUFFLE --- llvm/lib/Target/X86/X86ISelLowering.cpp | 60 +++-- llvm/test/CodeGen/X86/combine-movmsk.ll | 8 +- llvm/test/CodeGen/X86/combine-sdiv.ll | 9 +- llvm/test/CodeGen/X86/masked_compressstore.ll | 4 +- llvm/test/CodeGen/X86/masked_expandload.ll | 4 +- llvm/test/CodeGen/X86/masked_load.ll | 12 +- llvm/test/CodeGen/X86/masked_store.ll | 8 +- llvm/test/CodeGen/X86/sadd_sat_vec.ll | 328 ++++++++++++------------- llvm/test/CodeGen/X86/sar_fold64.ll | 20 +- llvm/test/CodeGen/X86/sdiv_fix.ll | 84 +++---- llvm/test/CodeGen/X86/ssub_sat_vec.ll | 250 ++++++++++--------- llvm/test/CodeGen/X86/vec_int_to_fp.ll | 66 +++-- llvm/test/CodeGen/X86/vec_saddo.ll | 6 +- llvm/test/CodeGen/X86/vector-fshr-rot-128.ll | 10 +- llvm/test/CodeGen/X86/vector-pcmp.ll | 6 +- llvm/test/CodeGen/X86/vector-shift-ashr-128.ll | 4 +- llvm/test/CodeGen/X86/vector-shift-lshr-128.ll | 2 +- 17 files changed, 449 insertions(+), 432 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 4cf21cb..51f886e 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -36553,12 +36553,15 @@ static SDValue combineX86ShufflesRecursively( // Remove unused/repeated shuffle source ops. resolveTargetShuffleInputsAndMask(Ops, Mask); - // Handle the all undef/zero cases early. + // Handle the all undef/zero/ones cases early. if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; })) return DAG.getUNDEF(Root.getValueType()); if (all_of(Mask, [](int Idx) { return Idx < 0; })) return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG, SDLoc(Root)); + if (Ops.size() == 1 && ISD::isBuildVectorAllOnes(Ops[0].getNode()) && + none_of(Mask, [](int M) { return M == SM_SentinelZero; })) + return getOnesVector(Root.getValueType(), DAG, SDLoc(Root)); assert(!Ops.empty() && "Shuffle with no inputs detected"); HasVariableMask |= IsOpVariableMask; @@ -36887,28 +36890,53 @@ static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL, return SDValue(); } -// Canonicalize UNARYSHUFFLE(XOR(X,-1)) -> XOR(UNARYSHUFFLE(X),-1) to -// help expose the 'NOT' pattern further up the DAG. -// TODO: This might be beneficial for any binop with a 'splattable' operand. +// Canonicalize SHUFFLE(BINOP(X,C)) -> BINOP(SHUFFLE(X),SHUFFLE(C)). static SDValue canonicalizeShuffleWithBinOps(SDValue N, SelectionDAG &DAG, const SDLoc &DL) { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); EVT ShuffleVT = N.getValueType(); + + auto IsMergeableWithShuffle = [](SDValue Op) { + // AllZeros/AllOnes constants are freely shuffled. + return ISD::isBuildVectorAllOnes(Op.getNode()) || + ISD::isBuildVectorAllZeros(Op.getNode()); + }; + auto IsSafeToMoveShuffle = [ShuffleVT](SDValue Op, unsigned BinOp) { + // Ensure we only shuffle whole vector src elements, unless its logical + // binops where we can more aggressively move shuffles from dst to src. 
+ return BinOp == ISD::AND || BinOp == ISD::OR || BinOp == ISD::XOR || + (Op.getScalarValueSizeInBits() <= ShuffleVT.getScalarSizeInBits()); + }; + unsigned Opc = N.getOpcode(); switch (Opc) { + case X86ISD::VBROADCAST: case X86ISD::MOVDDUP: case X86ISD::PSHUFD: { - SDValue N0 = N.getOperand(0); - if (N->isOnlyUserOf(N.getOperand(0).getNode())) { - if (SDValue Not = IsNOT(N0, DAG, /*OneUse*/ true)) { - Not = DAG.getBitcast(ShuffleVT, Not); - Not = Opc == X86ISD::MOVDDUP - ? DAG.getNode(Opc, DL, ShuffleVT, Not) - : DAG.getNode(Opc, DL, ShuffleVT, Not, N.getOperand(1)); - EVT IntVT = Not.getValueType().changeTypeToInteger(); - SDValue AllOnes = DAG.getConstant(-1, DL, IntVT); - Not = DAG.getBitcast(IntVT, Not); - Not = DAG.getNode(ISD::XOR, DL, IntVT, Not, AllOnes); - return DAG.getBitcast(ShuffleVT, Not); + if (N.getOperand(0).getValueType() == ShuffleVT && + N->isOnlyUserOf(N.getOperand(0).getNode())) { + SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0)); + unsigned SrcOpcode = N0.getOpcode(); + if (TLI.isBinOp(SrcOpcode) && IsSafeToMoveShuffle(N0, SrcOpcode)) { + SDValue Op00 = N0.getOperand(0); + SDValue Op01 = N0.getOperand(1); + if (IsMergeableWithShuffle(Op00) || IsMergeableWithShuffle(Op01)) { + SDValue LHS, RHS; + Op00 = DAG.getBitcast(ShuffleVT, Op00); + Op01 = DAG.getBitcast(ShuffleVT, Op01); + if (N.getNumOperands() == 2) { + LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, N.getOperand(1)); + RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, N.getOperand(1)); + } else { + LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00); + RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01); + } + EVT OpVT = N0.getValueType(); + return DAG.getBitcast(ShuffleVT, + DAG.getNode(SrcOpcode, DL, OpVT, + DAG.getBitcast(OpVT, LHS), + DAG.getBitcast(OpVT, RHS))); + } } } break; diff --git a/llvm/test/CodeGen/X86/combine-movmsk.ll b/llvm/test/CodeGen/X86/combine-movmsk.ll index b93b747..892475d 100644 --- a/llvm/test/CodeGen/X86/combine-movmsk.ll +++ b/llvm/test/CodeGen/X86/combine-movmsk.ll @@ -65,9 +65,7 @@ define i1 @movmskps_allof_bitcast_v2f64(<2 x double> %a0) { define i1 @pmovmskb_noneof_bitcast_v2i64(<2 x i64> %a0) { ; SSE2-LABEL: pmovmskb_noneof_bitcast_v2i64: ; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; SSE2-NEXT: movmskps %xmm0, %eax ; SSE2-NEXT: testl %eax, %eax ; SSE2-NEXT: sete %al @@ -97,9 +95,7 @@ define i1 @pmovmskb_noneof_bitcast_v2i64(<2 x i64> %a0) { define i1 @pmovmskb_allof_bitcast_v2i64(<2 x i64> %a0) { ; SSE2-LABEL: pmovmskb_allof_bitcast_v2i64: ; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; SSE2-NEXT: movmskps %xmm0, %eax ; SSE2-NEXT: cmpl $15, %eax ; SSE2-NEXT: sete %al diff --git a/llvm/test/CodeGen/X86/combine-sdiv.ll b/llvm/test/CodeGen/X86/combine-sdiv.ll index 75251ce..877dcbc 100644 --- a/llvm/test/CodeGen/X86/combine-sdiv.ll +++ b/llvm/test/CodeGen/X86/combine-sdiv.ll @@ -1998,12 +1998,13 @@ define <4 x i32> @combine_vec_sdiv_by_pow2b_PosAndNeg(<4 x i32> %x) { ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1] ; SSE2-NEXT: psrad $2, %xmm0 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0,3] -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: psubd %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm0[2,3] +; 
SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: psubd %xmm2, %xmm3 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSE2-NEXT: retq ; ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg: diff --git a/llvm/test/CodeGen/X86/masked_compressstore.ll b/llvm/test/CodeGen/X86/masked_compressstore.ll index 70011bfe..e353367 100644 --- a/llvm/test/CodeGen/X86/masked_compressstore.ll +++ b/llvm/test/CodeGen/X86/masked_compressstore.ll @@ -603,10 +603,10 @@ define void @compressstore_v16f64_v16i1(double* %base, <16 x double> %V, <16 x i define void @compressstore_v2f32_v2i32(float* %base, <2 x float> %V, <2 x i32> %trigger) { ; SSE2-LABEL: compressstore_v2f32_v2i32: ; SSE2: ## %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,1,1] -; SSE2-NEXT: movmskpd %xmm1, %eax +; SSE2-NEXT: movmskpd %xmm2, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne LBB2_1 ; SSE2-NEXT: ## %bb.2: ## %else diff --git a/llvm/test/CodeGen/X86/masked_expandload.ll b/llvm/test/CodeGen/X86/masked_expandload.ll index 53afc0a..ccae7e1 100644 --- a/llvm/test/CodeGen/X86/masked_expandload.ll +++ b/llvm/test/CodeGen/X86/masked_expandload.ll @@ -1117,10 +1117,10 @@ define <16 x double> @expandload_v16f64_v16i32(double* %base, <16 x double> %src define <2 x float> @expandload_v2f32_v2i1(float* %base, <2 x float> %src0, <2 x i32> %trigger) { ; SSE2-LABEL: expandload_v2f32_v2i1: ; SSE2: ## %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,1,1] -; SSE2-NEXT: movmskpd %xmm1, %eax +; SSE2-NEXT: movmskpd %xmm2, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne LBB4_1 ; SSE2-NEXT: ## %bb.2: ## %else diff --git a/llvm/test/CodeGen/X86/masked_load.ll b/llvm/test/CodeGen/X86/masked_load.ll index 7d1e295..d6d08ac 100644 --- a/llvm/test/CodeGen/X86/masked_load.ll +++ b/llvm/test/CodeGen/X86/masked_load.ll @@ -783,10 +783,10 @@ define <8 x double> @load_v8f64_v8i64(<8 x i64> %trigger, <8 x double>* %addr, < define <2 x float> @load_v2f32_v2i32(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %dst) { ; SSE2-LABEL: load_v2f32_v2i32: ; SSE2: ## %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,1,1] -; SSE2-NEXT: movmskpd %xmm0, %eax +; SSE2-NEXT: movmskpd %xmm2, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne LBB7_1 ; SSE2-NEXT: ## %bb.2: ## %else @@ -885,10 +885,10 @@ define <2 x float> @load_v2f32_v2i32(<2 x i32> %trigger, <2 x float>* %addr, <2 define <2 x float> @load_v2f32_v2i32_undef(<2 x i32> %trigger, <2 x float>* %addr) { ; SSE2-LABEL: load_v2f32_v2i32_undef: ; SSE2: ## %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,1,1] -; SSE2-NEXT: movmskpd %xmm0, %eax +; SSE2-NEXT: movmskpd %xmm1, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: ## implicit-def: $xmm0 ; SSE2-NEXT: jne LBB8_1 @@ -2188,10 +2188,10 @@ define <8 x i64> @load_v8i64_v8i64(<8 x i64> %trigger, <8 x i64>* %addr, <8 x i6 define <2 x i32> @load_v2i32_v2i32(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %dst) { ; SSE2-LABEL: 
load_v2i32_v2i32: ; SSE2: ## %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,1,1] -; SSE2-NEXT: movmskpd %xmm0, %eax +; SSE2-NEXT: movmskpd %xmm2, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne LBB17_1 ; SSE2-NEXT: ## %bb.2: ## %else diff --git a/llvm/test/CodeGen/X86/masked_store.ll b/llvm/test/CodeGen/X86/masked_store.ll index 36a2793..89955af 100644 --- a/llvm/test/CodeGen/X86/masked_store.ll +++ b/llvm/test/CodeGen/X86/masked_store.ll @@ -192,10 +192,10 @@ define void @store_v4f64_v4i64(<4 x i64> %trigger, <4 x double>* %addr, <4 x dou define void @store_v2f32_v2i32(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %val) { ; SSE2-LABEL: store_v2f32_v2i32: ; SSE2: ## %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,1,1] -; SSE2-NEXT: movmskpd %xmm0, %eax +; SSE2-NEXT: movmskpd %xmm2, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne LBB3_1 ; SSE2-NEXT: ## %bb.2: ## %else @@ -1126,10 +1126,10 @@ define void @store_v1i32_v1i32(<1 x i32> %trigger, <1 x i32>* %addr, <1 x i32> % define void @store_v2i32_v2i32(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %val) { ; SSE2-LABEL: store_v2i32_v2i32: ; SSE2: ## %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,1,1] -; SSE2-NEXT: movmskpd %xmm0, %eax +; SSE2-NEXT: movmskpd %xmm2, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne LBB10_1 ; SSE2-NEXT: ## %bb.2: ## %else diff --git a/llvm/test/CodeGen/X86/sadd_sat_vec.ll b/llvm/test/CodeGen/X86/sadd_sat_vec.ll index 9eacc45..94db1af 100644 --- a/llvm/test/CodeGen/X86/sadd_sat_vec.ll +++ b/llvm/test/CodeGen/X86/sadd_sat_vec.ll @@ -1190,20 +1190,20 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind { ; SSE2-NEXT: pand %xmm5, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: pxor %xmm4, %xmm4 ; SSE2-NEXT: pcmpgtd %xmm1, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] -; SSE2-NEXT: pxor %xmm3, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE2-NEXT: pxor %xmm3, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] ; SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: pcmpgtd %xmm3, %xmm2 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 ; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: v2i64: @@ -1221,20 +1221,20 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind { ; SSSE3-NEXT: pand %xmm5, %xmm2 ; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSSE3-NEXT: por %xmm2, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSSE3-NEXT: pxor %xmm2, %xmm2 ; SSSE3-NEXT: pxor %xmm4, %xmm4 ; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] -; SSSE3-NEXT: pxor %xmm3, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm3 -; SSSE3-NEXT: pandn %xmm0, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm2 +; SSSE3-NEXT: pxor %xmm3, %xmm4 +; SSSE3-NEXT: movdqa %xmm4, %xmm1 
+; SSSE3-NEXT: pandn %xmm0, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] ; SSSE3-NEXT: pandn {{.*}}(%rip), %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm2 ; SSSE3-NEXT: pand {{.*}}(%rip), %xmm2 ; SSSE3-NEXT: por %xmm2, %xmm0 -; SSSE3-NEXT: pand %xmm1, %xmm0 -; SSSE3-NEXT: por %xmm3, %xmm0 +; SSSE3-NEXT: pand %xmm4, %xmm0 +; SSSE3-NEXT: por %xmm1, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: v2i64: @@ -1324,47 +1324,47 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind { ; SSE2-NEXT: pand %xmm8, %xmm5 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] ; SSE2-NEXT: por %xmm5, %xmm6 -; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3] +; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: pxor %xmm7, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; SSE2-NEXT: pcmpgtd %xmm5, %xmm7 ; SSE2-NEXT: pxor %xmm6, %xmm7 -; SSE2-NEXT: movdqa %xmm7, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm7, %xmm5 +; SSE2-NEXT: pandn %xmm0, %xmm5 ; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [9223372036854775808,9223372036854775808] -; SSE2-NEXT: pxor %xmm6, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,1,3,3] ; SSE2-NEXT: pandn %xmm8, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-NEXT: pxor %xmm6, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm6 ; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372036854775807,9223372036854775807] ; SSE2-NEXT: pand %xmm9, %xmm6 ; SSE2-NEXT: por %xmm6, %xmm0 ; SSE2-NEXT: pand %xmm7, %xmm0 -; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pxor %xmm4, %xmm2 +; SSE2-NEXT: por %xmm5, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm5 +; SSE2-NEXT: pxor %xmm4, %xmm5 ; SSE2-NEXT: paddq %xmm3, %xmm1 ; SSE2-NEXT: pxor %xmm1, %xmm4 -; SSE2-NEXT: movdqa %xmm2, %xmm6 +; SSE2-NEXT: movdqa %xmm5, %xmm6 ; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm2, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] -; SSE2-NEXT: pand %xmm7, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm4 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: pxor %xmm4, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pcmpeqd %xmm5, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE2-NEXT: pxor %xmm5, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm3 ; SSE2-NEXT: pandn %xmm1, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] ; SSE2-NEXT: pandn %xmm8, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] -; SSE2-NEXT: pand %xmm9, %xmm4 -; SSE2-NEXT: por %xmm4, %xmm1 -; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm2 +; SSE2-NEXT: pand %xmm9, %xmm2 +; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: pand %xmm4, %xmm1 ; SSE2-NEXT: por %xmm3, %xmm1 ; SSE2-NEXT: retq ; @@ -1384,47 +1384,47 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind { ; SSSE3-NEXT: pand %xmm8, %xmm5 ; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] ; SSSE3-NEXT: por %xmm5, %xmm6 -; SSSE3-NEXT: pxor %xmm5, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3] +; 
SSSE3-NEXT: pxor %xmm2, %xmm2 ; SSSE3-NEXT: pxor %xmm7, %xmm7 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7 ; SSSE3-NEXT: pxor %xmm6, %xmm7 -; SSSE3-NEXT: movdqa %xmm7, %xmm2 -; SSSE3-NEXT: pandn %xmm0, %xmm2 +; SSSE3-NEXT: movdqa %xmm7, %xmm5 +; SSSE3-NEXT: pandn %xmm0, %xmm5 ; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [9223372036854775808,9223372036854775808] -; SSSE3-NEXT: pxor %xmm6, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,1,3,3] ; SSSE3-NEXT: pandn %xmm8, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSSE3-NEXT: pxor %xmm6, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm6 ; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [9223372036854775807,9223372036854775807] ; SSSE3-NEXT: pand %xmm9, %xmm6 ; SSSE3-NEXT: por %xmm6, %xmm0 ; SSSE3-NEXT: pand %xmm7, %xmm0 -; SSSE3-NEXT: por %xmm2, %xmm0 -; SSSE3-NEXT: movdqa %xmm1, %xmm2 -; SSSE3-NEXT: pxor %xmm4, %xmm2 +; SSSE3-NEXT: por %xmm5, %xmm0 +; SSSE3-NEXT: movdqa %xmm1, %xmm5 +; SSSE3-NEXT: pxor %xmm4, %xmm5 ; SSSE3-NEXT: paddq %xmm3, %xmm1 ; SSSE3-NEXT: pxor %xmm1, %xmm4 -; SSSE3-NEXT: movdqa %xmm2, %xmm6 +; SSSE3-NEXT: movdqa %xmm5, %xmm6 ; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 ; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] -; SSSE3-NEXT: pand %xmm7, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] -; SSSE3-NEXT: por %xmm2, %xmm4 -; SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSSE3-NEXT: pxor %xmm4, %xmm2 -; SSSE3-NEXT: movdqa %xmm2, %xmm3 +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT: pand %xmm7, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm4, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: pxor %xmm4, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4 +; SSSE3-NEXT: pxor %xmm5, %xmm4 +; SSSE3-NEXT: movdqa %xmm4, %xmm3 ; SSSE3-NEXT: pandn %xmm1, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] ; SSSE3-NEXT: pandn %xmm8, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] -; SSSE3-NEXT: pand %xmm9, %xmm4 -; SSSE3-NEXT: por %xmm4, %xmm1 -; SSSE3-NEXT: pand %xmm2, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm2 +; SSSE3-NEXT: pand %xmm9, %xmm2 +; SSSE3-NEXT: por %xmm2, %xmm1 +; SSSE3-NEXT: pand %xmm4, %xmm1 ; SSSE3-NEXT: por %xmm3, %xmm1 ; SSSE3-NEXT: retq ; @@ -1540,20 +1540,20 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind { ; SSE2-NEXT: pand %xmm12, %xmm9 ; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3] ; SSE2-NEXT: por %xmm9, %xmm10 -; SSE2-NEXT: pxor %xmm9, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3] ; SSE2-NEXT: pxor %xmm11, %xmm11 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm11 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm11[1,1,3,3] +; SSE2-NEXT: pxor %xmm13, %xmm13 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm13 ; SSE2-NEXT: pxor %xmm10, %xmm13 ; SSE2-NEXT: movdqa %xmm13, %xmm12 ; SSE2-NEXT: pandn %xmm0, %xmm12 -; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [9223372036854775808,9223372036854775808] +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372036854775808,9223372036854775808] +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm0[1,1,3,3] +; SSE2-NEXT: pandn %xmm9, %xmm0 ; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE2-NEXT: pandn %xmm10, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = 
xmm4[1,1,3,3] -; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [9223372036854775807,9223372036854775807] -; SSE2-NEXT: pand %xmm11, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm4 +; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [9223372036854775807,9223372036854775807] +; SSE2-NEXT: pand %xmm10, %xmm4 ; SSE2-NEXT: por %xmm4, %xmm0 ; SSE2-NEXT: pand %xmm13, %xmm0 ; SSE2-NEXT: por %xmm12, %xmm0 @@ -1570,20 +1570,20 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind { ; SSE2-NEXT: pand %xmm14, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm13[1,1,3,3] ; SSE2-NEXT: por %xmm4, %xmm12 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: pxor %xmm12, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm12 -; SSE2-NEXT: pandn %xmm1, %xmm12 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] ; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm5 -; SSE2-NEXT: pandn %xmm10, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-NEXT: pand %xmm11, %xmm5 -; SSE2-NEXT: por %xmm5, %xmm1 -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: por %xmm12, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm5 +; SSE2-NEXT: pxor %xmm12, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm13 +; SSE2-NEXT: pandn %xmm1, %xmm13 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm1[1,1,3,3] +; SSE2-NEXT: pandn %xmm9, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm12, %xmm4 +; SSE2-NEXT: pand %xmm10, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: pand %xmm5, %xmm1 +; SSE2-NEXT: por %xmm13, %xmm1 ; SSE2-NEXT: movdqa %xmm2, %xmm12 ; SSE2-NEXT: pxor %xmm8, %xmm12 ; SSE2-NEXT: paddq %xmm6, %xmm2 @@ -1597,19 +1597,19 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind { ; SSE2-NEXT: pand %xmm13, %xmm5 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; SSE2-NEXT: por %xmm5, %xmm4 -; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-NEXT: pxor %xmm4, %xmm5 -; SSE2-NEXT: movdqa %xmm5, %xmm4 -; SSE2-NEXT: pandn %xmm2, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] ; SSE2-NEXT: pxor %xmm6, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm6 -; SSE2-NEXT: pandn %xmm10, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSE2-NEXT: pand %xmm11, %xmm6 -; SSE2-NEXT: por %xmm6, %xmm2 -; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm6 +; SSE2-NEXT: pxor %xmm4, %xmm6 +; SSE2-NEXT: movdqa %xmm6, %xmm4 +; SSE2-NEXT: pandn %xmm2, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm2[1,1,3,3] +; SSE2-NEXT: pandn %xmm9, %xmm2 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm12, %xmm5 +; SSE2-NEXT: pand %xmm10, %xmm5 +; SSE2-NEXT: por %xmm5, %xmm2 +; SSE2-NEXT: pand %xmm6, %xmm2 ; SSE2-NEXT: por %xmm4, %xmm2 ; SSE2-NEXT: movdqa %xmm3, %xmm4 ; SSE2-NEXT: pxor %xmm8, %xmm4 @@ -1623,19 +1623,19 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind { ; SSE2-NEXT: pand %xmm6, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] ; SSE2-NEXT: por %xmm4, %xmm5 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: pxor %xmm5, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm5 -; SSE2-NEXT: pandn %xmm3, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm9 -; SSE2-NEXT: pandn %xmm10, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm9[1,1,3,3] -; SSE2-NEXT: pand %xmm11, %xmm6 -; SSE2-NEXT: por %xmm6, %xmm3 -; SSE2-NEXT: pand %xmm4, %xmm3 -; SSE2-NEXT: por %xmm5, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3] +; SSE2-NEXT: 
pxor %xmm6, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 +; SSE2-NEXT: pxor %xmm5, %xmm6 +; SSE2-NEXT: movdqa %xmm6, %xmm4 +; SSE2-NEXT: pandn %xmm3, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] +; SSE2-NEXT: pandn %xmm9, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm11 +; SSE2-NEXT: pand %xmm10, %xmm11 +; SSE2-NEXT: por %xmm11, %xmm3 +; SSE2-NEXT: pand %xmm6, %xmm3 +; SSE2-NEXT: por %xmm4, %xmm3 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: v8i64: @@ -1654,20 +1654,20 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind { ; SSSE3-NEXT: pand %xmm12, %xmm9 ; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3] ; SSSE3-NEXT: por %xmm9, %xmm10 -; SSSE3-NEXT: pxor %xmm9, %xmm9 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3] ; SSSE3-NEXT: pxor %xmm11, %xmm11 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm11 -; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm11[1,1,3,3] +; SSSE3-NEXT: pxor %xmm13, %xmm13 +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm13 ; SSSE3-NEXT: pxor %xmm10, %xmm13 ; SSSE3-NEXT: movdqa %xmm13, %xmm12 ; SSSE3-NEXT: pandn %xmm0, %xmm12 -; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [9223372036854775808,9223372036854775808] +; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [9223372036854775808,9223372036854775808] +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm0[1,1,3,3] +; SSSE3-NEXT: pandn %xmm9, %xmm0 ; SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4 -; SSSE3-NEXT: pandn %xmm10, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: movdqa {{.*#+}} xmm11 = [9223372036854775807,9223372036854775807] -; SSSE3-NEXT: pand %xmm11, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm10, %xmm4 +; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [9223372036854775807,9223372036854775807] +; SSSE3-NEXT: pand %xmm10, %xmm4 ; SSSE3-NEXT: por %xmm4, %xmm0 ; SSSE3-NEXT: pand %xmm13, %xmm0 ; SSSE3-NEXT: por %xmm12, %xmm0 @@ -1684,20 +1684,20 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind { ; SSSE3-NEXT: pand %xmm14, %xmm4 ; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm13[1,1,3,3] ; SSSE3-NEXT: por %xmm4, %xmm12 -; SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: pxor %xmm12, %xmm4 -; SSSE3-NEXT: movdqa %xmm4, %xmm12 -; SSSE3-NEXT: pandn %xmm1, %xmm12 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] ; SSSE3-NEXT: pxor %xmm5, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm5 -; SSSE3-NEXT: pandn %xmm10, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSSE3-NEXT: pand %xmm11, %xmm5 -; SSSE3-NEXT: por %xmm5, %xmm1 -; SSSE3-NEXT: pand %xmm4, %xmm1 -; SSSE3-NEXT: por %xmm12, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm5 +; SSSE3-NEXT: pxor %xmm12, %xmm5 +; SSSE3-NEXT: movdqa %xmm5, %xmm13 +; SSSE3-NEXT: pandn %xmm1, %xmm13 +; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm1[1,1,3,3] +; SSSE3-NEXT: pandn %xmm9, %xmm1 +; SSSE3-NEXT: pxor %xmm4, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm12, %xmm4 +; SSSE3-NEXT: pand %xmm10, %xmm4 +; SSSE3-NEXT: por %xmm4, %xmm1 +; SSSE3-NEXT: pand %xmm5, %xmm1 +; SSSE3-NEXT: por %xmm13, %xmm1 ; SSSE3-NEXT: movdqa %xmm2, %xmm12 ; SSSE3-NEXT: pxor %xmm8, %xmm12 ; SSSE3-NEXT: paddq %xmm6, %xmm2 @@ -1711,19 +1711,19 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind { ; SSSE3-NEXT: pand %xmm13, %xmm5 ; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; SSSE3-NEXT: por %xmm5, %xmm4 -; SSSE3-NEXT: pxor %xmm5, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm6, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSSE3-NEXT: pxor %xmm4, %xmm5 -; SSSE3-NEXT: movdqa %xmm5, %xmm4 -; SSSE3-NEXT: pandn %xmm2, %xmm4 +; SSSE3-NEXT: pshufd 
{{.*#+}} xmm5 = xmm6[1,1,3,3] ; SSSE3-NEXT: pxor %xmm6, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm6 -; SSSE3-NEXT: pandn %xmm10, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSSE3-NEXT: pand %xmm11, %xmm6 -; SSSE3-NEXT: por %xmm6, %xmm2 -; SSSE3-NEXT: pand %xmm5, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm6 +; SSSE3-NEXT: pxor %xmm4, %xmm6 +; SSSE3-NEXT: movdqa %xmm6, %xmm4 +; SSSE3-NEXT: pandn %xmm2, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm2[1,1,3,3] +; SSSE3-NEXT: pandn %xmm9, %xmm2 +; SSSE3-NEXT: pxor %xmm5, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm12, %xmm5 +; SSSE3-NEXT: pand %xmm10, %xmm5 +; SSSE3-NEXT: por %xmm5, %xmm2 +; SSSE3-NEXT: pand %xmm6, %xmm2 ; SSSE3-NEXT: por %xmm4, %xmm2 ; SSSE3-NEXT: movdqa %xmm3, %xmm4 ; SSSE3-NEXT: pxor %xmm8, %xmm4 @@ -1737,19 +1737,19 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind { ; SSSE3-NEXT: pand %xmm6, %xmm4 ; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] ; SSSE3-NEXT: por %xmm4, %xmm5 -; SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm7, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: pxor %xmm5, %xmm4 -; SSSE3-NEXT: movdqa %xmm4, %xmm5 -; SSSE3-NEXT: pandn %xmm3, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm9 -; SSSE3-NEXT: pandn %xmm10, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm9[1,1,3,3] -; SSSE3-NEXT: pand %xmm11, %xmm6 -; SSSE3-NEXT: por %xmm6, %xmm3 -; SSSE3-NEXT: pand %xmm4, %xmm3 -; SSSE3-NEXT: por %xmm5, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3] +; SSSE3-NEXT: pxor %xmm6, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 +; SSSE3-NEXT: pxor %xmm5, %xmm6 +; SSSE3-NEXT: movdqa %xmm6, %xmm4 +; SSSE3-NEXT: pandn %xmm3, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] +; SSSE3-NEXT: pandn %xmm9, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm11 +; SSSE3-NEXT: pand %xmm10, %xmm11 +; SSSE3-NEXT: por %xmm11, %xmm3 +; SSSE3-NEXT: pand %xmm6, %xmm3 +; SSSE3-NEXT: por %xmm4, %xmm3 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: v8i64: diff --git a/llvm/test/CodeGen/X86/sar_fold64.ll b/llvm/test/CodeGen/X86/sar_fold64.ll index 8b4a8f6..f597efc 100644 --- a/llvm/test/CodeGen/X86/sar_fold64.ll +++ b/llvm/test/CodeGen/X86/sar_fold64.ll @@ -102,26 +102,26 @@ define <4 x i32> @all_sign_bit_ashr_vec1(<4 x i32> %x) { ; SSE-LABEL: all_sign_bit_ashr_vec1: ; SSE: # %bb.0: ; SSE-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: psubd %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] +; SSE-NEXT: pxor %xmm0, %xmm0 +; SSE-NEXT: psubd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: all_sign_bit_ashr_vec1: ; AVX1: # %bb.0: ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: retq ; ; AVX2-LABEL: all_sign_bit_ashr_vec1: ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsubd %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0 ; AVX2-NEXT: retq %and = and <4 x i32> %x, %sub = sub <4 x i32> , %and @@ -162,26 +162,26 @@ define <4 x i32> @all_sign_bit_ashr_vec3(<4 x i32> %x) { ; SSE-LABEL: all_sign_bit_ashr_vec3: ; SSE: # %bb.0: ; SSE-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE-NEXT: paddd %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,0,0] 
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] +; SSE-NEXT: pcmpeqd %xmm0, %xmm0 +; SSE-NEXT: paddd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: all_sign_bit_ashr_vec3: ; AVX1: # %bb.0: ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: retq ; ; AVX2-LABEL: all_sign_bit_ashr_vec3: ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0 ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0 ; AVX2-NEXT: retq %and = and <4 x i32> %x, %add = add <4 x i32> %and, diff --git a/llvm/test/CodeGen/X86/sdiv_fix.ll b/llvm/test/CodeGen/X86/sdiv_fix.ll index 3079487..a547141 100644 --- a/llvm/test/CodeGen/X86/sdiv_fix.ll +++ b/llvm/test/CodeGen/X86/sdiv_fix.ll @@ -443,65 +443,65 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X64: # %bb.0: ; X64-NEXT: pxor %xmm2, %xmm2 ; X64-NEXT: pcmpgtd %xmm1, %xmm2 -; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] -; X64-NEXT: movdqa %xmm1, %xmm4 -; X64-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; X64-NEXT: movq %xmm4, %rcx -; X64-NEXT: pxor %xmm2, %xmm2 -; X64-NEXT: pcmpgtd %xmm0, %xmm2 +; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] +; X64-NEXT: movdqa %xmm1, %xmm3 +; X64-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; X64-NEXT: movq %xmm3, %rcx +; X64-NEXT: pxor %xmm5, %xmm5 +; X64-NEXT: pcmpgtd %xmm0, %xmm5 ; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] ; X64-NEXT: psllq $31, %xmm0 ; X64-NEXT: movq %xmm0, %rax ; X64-NEXT: cqto ; X64-NEXT: idivq %rcx ; X64-NEXT: movq %rax, %r8 ; X64-NEXT: movq %rdx, %r11 -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,3,2,3] -; X64-NEXT: movq %xmm2, %rcx -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; X64-NEXT: movq %xmm2, %rax +; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] +; X64-NEXT: movq %xmm3, %rcx +; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] +; X64-NEXT: movq %xmm3, %rax ; X64-NEXT: cqto ; X64-NEXT: idivq %rcx ; X64-NEXT: movq %rax, %r10 ; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: pxor %xmm2, %xmm2 -; X64-NEXT: pcmpgtd %xmm3, %xmm2 -; X64-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; X64-NEXT: movq %xmm3, %rdi -; X64-NEXT: pxor %xmm2, %xmm2 -; X64-NEXT: pcmpgtd %xmm1, %xmm2 -; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; X64-NEXT: pxor %xmm3, %xmm3 +; X64-NEXT: pcmpgtd %xmm4, %xmm3 +; X64-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; X64-NEXT: movq %xmm4, %rdi +; X64-NEXT: pxor %xmm5, %xmm5 +; X64-NEXT: pcmpgtd %xmm1, %xmm5 +; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] ; X64-NEXT: psllq $31, %xmm1 ; X64-NEXT: movq %xmm1, %rax ; X64-NEXT: cqto ; X64-NEXT: idivq %rdi ; X64-NEXT: movq %rax, %r9 ; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] -; X64-NEXT: movq %xmm2, %rsi -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; X64-NEXT: movq %xmm2, %rax +; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] +; X64-NEXT: movq %xmm4, %rsi +; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] +; X64-NEXT: movq %xmm4, %rax ; X64-NEXT: cqto ; X64-NEXT: idivq %rsi -; 
X64-NEXT: movq %r11, %xmm2 +; X64-NEXT: movq %r11, %xmm4 ; X64-NEXT: movq %rcx, %xmm5 ; X64-NEXT: pxor %xmm6, %xmm6 -; X64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0] -; X64-NEXT: pcmpeqd %xmm6, %xmm2 -; X64-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,0,3,2] -; X64-NEXT: pand %xmm2, %xmm5 -; X64-NEXT: pxor %xmm2, %xmm2 -; X64-NEXT: pcmpgtd %xmm4, %xmm2 +; X64-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0] +; X64-NEXT: pcmpeqd %xmm6, %xmm4 +; X64-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,0,3,2] +; X64-NEXT: pand %xmm4, %xmm5 +; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] ; X64-NEXT: pxor %xmm4, %xmm4 -; X64-NEXT: pcmpgtd %xmm0, %xmm4 +; X64-NEXT: pcmpgtd %xmm2, %xmm4 +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; X64-NEXT: pxor %xmm2, %xmm2 +; X64-NEXT: pcmpgtd %xmm0, %xmm2 ; X64-NEXT: movq %r8, %xmm0 -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; X64-NEXT: pxor %xmm2, %xmm4 -; X64-NEXT: movq %r10, %xmm2 -; X64-NEXT: pandn %xmm4, %xmm5 -; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; X64-NEXT: pxor %xmm4, %xmm2 +; X64-NEXT: movq %r10, %xmm4 +; X64-NEXT: pandn %xmm2, %xmm5 +; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] ; X64-NEXT: movdqa %xmm5, %xmm2 ; X64-NEXT: pandn %xmm0, %xmm2 ; X64-NEXT: pcmpeqd %xmm4, %xmm4 @@ -514,13 +514,13 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X64-NEXT: pcmpeqd %xmm6, %xmm2 ; X64-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,0,3,2] ; X64-NEXT: pand %xmm2, %xmm5 -; X64-NEXT: pxor %xmm2, %xmm2 -; X64-NEXT: pcmpgtd %xmm3, %xmm2 -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,1,1] +; X64-NEXT: pxor %xmm3, %xmm3 +; X64-NEXT: pcmpgtd %xmm2, %xmm3 +; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; X64-NEXT: pcmpgtd %xmm1, %xmm6 -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3] -; X64-NEXT: pxor %xmm2, %xmm1 -; X64-NEXT: pandn %xmm1, %xmm5 +; X64-NEXT: pxor %xmm3, %xmm6 +; X64-NEXT: pandn %xmm6, %xmm5 ; X64-NEXT: movq %r9, %xmm1 ; X64-NEXT: movq %rax, %xmm2 ; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] diff --git a/llvm/test/CodeGen/X86/ssub_sat_vec.ll b/llvm/test/CodeGen/X86/ssub_sat_vec.ll index 484a8bb..0f434e7 100644 --- a/llvm/test/CodeGen/X86/ssub_sat_vec.ll +++ b/llvm/test/CodeGen/X86/ssub_sat_vec.ll @@ -1257,17 +1257,16 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind { ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] ; SSE2-NEXT: por %xmm1, %xmm2 ; SSE2-NEXT: pxor %xmm4, %xmm2 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pandn {{.*}}(%rip), %xmm1 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: pand {{.*}}(%rip), %xmm3 -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm4 +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: v2i64: @@ -1296,17 +1295,16 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind { ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] ; SSSE3-NEXT: por %xmm1, %xmm2 ; SSSE3-NEXT: pxor %xmm4, %xmm2 -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: 
pandn {{.*}}(%rip), %xmm1 -; SSSE3-NEXT: pxor %xmm3, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSSE3-NEXT: pand {{.*}}(%rip), %xmm3 -; SSSE3-NEXT: por %xmm3, %xmm1 -; SSSE3-NEXT: pand %xmm2, %xmm1 -; SSSE3-NEXT: pandn %xmm0, %xmm2 -; SSSE3-NEXT: por %xmm2, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: movdqa %xmm2, %xmm1 +; SSSE3-NEXT: pandn %xmm0, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSSE3-NEXT: pandn {{.*}}(%rip), %xmm0 +; SSSE3-NEXT: pxor %xmm4, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4 +; SSSE3-NEXT: pand {{.*}}(%rip), %xmm4 +; SSSE3-NEXT: por %xmm4, %xmm0 +; SSSE3-NEXT: pand %xmm2, %xmm0 +; SSSE3-NEXT: por %xmm1, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: v2i64: @@ -1423,47 +1421,47 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind { ; SSE2-NEXT: movdqa %xmm7, %xmm4 ; SSE2-NEXT: pandn %xmm0, %xmm4 ; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372036854775808,9223372036854775808] -; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: pxor %xmm6, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] ; SSE2-NEXT: pandn %xmm9, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm5 ; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [9223372036854775807,9223372036854775807] -; SSE2-NEXT: pand %xmm10, %xmm2 -; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: pand %xmm10, %xmm5 +; SSE2-NEXT: por %xmm5, %xmm0 ; SSE2-NEXT: pand %xmm7, %xmm0 ; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pxor %xmm8, %xmm2 -; SSE2-NEXT: psubq %xmm3, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: pxor %xmm8, %xmm4 -; SSE2-NEXT: movdqa %xmm2, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm7 +; SSE2-NEXT: psubq %xmm3, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm5 +; SSE2-NEXT: pxor %xmm8, %xmm5 +; SSE2-NEXT: movdqa %xmm4, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm7 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm2, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm4 +; SSE2-NEXT: pcmpeqd %xmm4, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm5 ; SSE2-NEXT: pxor %xmm8, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm8, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2] +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm8, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] ; SSE2-NEXT: pand %xmm6, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: pxor %xmm4, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm3, %xmm4 +; SSE2-NEXT: pxor %xmm5, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm3 ; SSE2-NEXT: pandn %xmm1, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] ; SSE2-NEXT: pandn %xmm9, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] -; SSE2-NEXT: pand %xmm10, %xmm4 -; SSE2-NEXT: por %xmm4, %xmm1 -; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm2 +; SSE2-NEXT: pand %xmm10, %xmm2 +; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: pand %xmm4, %xmm1 
; SSE2-NEXT: por %xmm3, %xmm1 ; SSE2-NEXT: retq ; @@ -1496,47 +1494,47 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind { ; SSSE3-NEXT: movdqa %xmm7, %xmm4 ; SSSE3-NEXT: pandn %xmm0, %xmm4 ; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [9223372036854775808,9223372036854775808] -; SSSE3-NEXT: pxor %xmm5, %xmm5 -; SSSE3-NEXT: pxor %xmm6, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] ; SSSE3-NEXT: pandn %xmm9, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: pxor %xmm5, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm5 ; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [9223372036854775807,9223372036854775807] -; SSSE3-NEXT: pand %xmm10, %xmm2 -; SSSE3-NEXT: por %xmm2, %xmm0 +; SSSE3-NEXT: pand %xmm10, %xmm5 +; SSSE3-NEXT: por %xmm5, %xmm0 ; SSSE3-NEXT: pand %xmm7, %xmm0 ; SSSE3-NEXT: por %xmm4, %xmm0 -; SSSE3-NEXT: movdqa %xmm1, %xmm2 -; SSSE3-NEXT: pxor %xmm8, %xmm2 -; SSSE3-NEXT: psubq %xmm3, %xmm1 ; SSSE3-NEXT: movdqa %xmm1, %xmm4 ; SSSE3-NEXT: pxor %xmm8, %xmm4 -; SSSE3-NEXT: movdqa %xmm2, %xmm7 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm7 +; SSSE3-NEXT: psubq %xmm3, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm5 +; SSSE3-NEXT: pxor %xmm8, %xmm5 +; SSSE3-NEXT: movdqa %xmm4, %xmm7 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7 ; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3] -; SSSE3-NEXT: por %xmm2, %xmm4 +; SSSE3-NEXT: pcmpeqd %xmm4, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSSE3-NEXT: por %xmm4, %xmm5 ; SSSE3-NEXT: pxor %xmm8, %xmm3 -; SSSE3-NEXT: movdqa %xmm3, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm8, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2] +; SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm8, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] ; SSSE3-NEXT: pcmpeqd %xmm8, %xmm3 ; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] ; SSSE3-NEXT: pand %xmm6, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSSE3-NEXT: por %xmm3, %xmm2 -; SSSE3-NEXT: pxor %xmm4, %xmm2 -; SSSE3-NEXT: movdqa %xmm2, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm3, %xmm4 +; SSSE3-NEXT: pxor %xmm5, %xmm4 +; SSSE3-NEXT: movdqa %xmm4, %xmm3 ; SSSE3-NEXT: pandn %xmm1, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] ; SSSE3-NEXT: pandn %xmm9, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] -; SSSE3-NEXT: pand %xmm10, %xmm4 -; SSSE3-NEXT: por %xmm4, %xmm1 -; SSSE3-NEXT: pand %xmm2, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm2 +; SSSE3-NEXT: pand %xmm10, %xmm2 +; SSSE3-NEXT: por %xmm2, %xmm1 +; SSSE3-NEXT: pand %xmm4, %xmm1 ; SSSE3-NEXT: por %xmm3, %xmm1 ; SSSE3-NEXT: retq ; @@ -1682,21 +1680,21 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind { ; SSE2-NEXT: pcmpeqd %xmm8, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm4[1,1,3,3] ; SSE2-NEXT: pand %xmm11, %xmm12 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,3,3] -; SSE2-NEXT: por %xmm12, %xmm4 -; SSE2-NEXT: pxor %xmm10, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm12 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm9[1,1,3,3] +; SSE2-NEXT: por %xmm12, %xmm13 +; SSE2-NEXT: pxor %xmm10, %xmm13 +; SSE2-NEXT: movdqa %xmm13, %xmm12 ; SSE2-NEXT: pandn %xmm0, %xmm12 ; SSE2-NEXT: movdqa {{.*#+}} xmm9 = 
[9223372036854775808,9223372036854775808] -; SSE2-NEXT: pxor %xmm11, %xmm11 -; SSE2-NEXT: pxor %xmm10, %xmm10 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm0[1,1,3,3] ; SSE2-NEXT: pandn %xmm9, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm10[1,1,3,3] +; SSE2-NEXT: pxor %xmm11, %xmm11 +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm4 ; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [9223372036854775807,9223372036854775807] -; SSE2-NEXT: pand %xmm10, %xmm13 -; SSE2-NEXT: por %xmm13, %xmm0 -; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: pand %xmm10, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: pand %xmm13, %xmm0 ; SSE2-NEXT: por %xmm12, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm12 ; SSE2-NEXT: pxor %xmm8, %xmm12 @@ -1721,16 +1719,16 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind { ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; SSE2-NEXT: por %xmm5, %xmm4 ; SSE2-NEXT: pxor %xmm12, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm12 -; SSE2-NEXT: pandn %xmm1, %xmm12 -; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm5 +; SSE2-NEXT: movdqa %xmm4, %xmm13 +; SSE2-NEXT: pandn %xmm1, %xmm13 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm1[1,1,3,3] ; SSE2-NEXT: pandn %xmm9, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm12, %xmm5 ; SSE2-NEXT: pand %xmm10, %xmm5 ; SSE2-NEXT: por %xmm5, %xmm1 ; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: por %xmm12, %xmm1 +; SSE2-NEXT: por %xmm13, %xmm1 ; SSE2-NEXT: movdqa %xmm2, %xmm12 ; SSE2-NEXT: pxor %xmm8, %xmm12 ; SSE2-NEXT: psubq %xmm6, %xmm2 @@ -1756,10 +1754,10 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind { ; SSE2-NEXT: pxor %xmm4, %xmm5 ; SSE2-NEXT: movdqa %xmm5, %xmm4 ; SSE2-NEXT: pandn %xmm2, %xmm4 -; SSE2-NEXT: pxor %xmm6, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm2[1,1,3,3] ; SSE2-NEXT: pandn %xmm9, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-NEXT: pxor %xmm6, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm12, %xmm6 ; SSE2-NEXT: pand %xmm10, %xmm6 ; SSE2-NEXT: por %xmm6, %xmm2 ; SSE2-NEXT: pand %xmm5, %xmm2 @@ -1789,11 +1787,11 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind { ; SSE2-NEXT: pxor %xmm5, %xmm4 ; SSE2-NEXT: movdqa %xmm4, %xmm5 ; SSE2-NEXT: pandn %xmm3, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,1,3,3] ; SSE2-NEXT: pandn %xmm9, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm11[1,1,3,3] -; SSE2-NEXT: pand %xmm10, %xmm6 -; SSE2-NEXT: por %xmm6, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm11 +; SSE2-NEXT: pand %xmm10, %xmm11 +; SSE2-NEXT: por %xmm11, %xmm3 ; SSE2-NEXT: pand %xmm4, %xmm3 ; SSE2-NEXT: por %xmm5, %xmm3 ; SSE2-NEXT: retq @@ -1821,21 +1819,21 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind { ; SSSE3-NEXT: pcmpeqd %xmm8, %xmm4 ; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm4[1,1,3,3] ; SSSE3-NEXT: pand %xmm11, %xmm12 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,3,3] -; SSSE3-NEXT: por %xmm12, %xmm4 -; SSSE3-NEXT: pxor %xmm10, %xmm4 -; SSSE3-NEXT: movdqa %xmm4, %xmm12 +; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm9[1,1,3,3] +; SSSE3-NEXT: por %xmm12, %xmm13 +; SSSE3-NEXT: pxor %xmm10, %xmm13 +; SSSE3-NEXT: movdqa %xmm13, %xmm12 ; SSSE3-NEXT: pandn %xmm0, %xmm12 ; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [9223372036854775808,9223372036854775808] -; SSSE3-NEXT: pxor %xmm11, %xmm11 -; SSSE3-NEXT: pxor %xmm10, %xmm10 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = 
xmm0[1,1,3,3] ; SSSE3-NEXT: pandn %xmm9, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm10[1,1,3,3] +; SSSE3-NEXT: pxor %xmm11, %xmm11 +; SSSE3-NEXT: pxor %xmm4, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm10, %xmm4 ; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [9223372036854775807,9223372036854775807] -; SSSE3-NEXT: pand %xmm10, %xmm13 -; SSSE3-NEXT: por %xmm13, %xmm0 -; SSSE3-NEXT: pand %xmm4, %xmm0 +; SSSE3-NEXT: pand %xmm10, %xmm4 +; SSSE3-NEXT: por %xmm4, %xmm0 +; SSSE3-NEXT: pand %xmm13, %xmm0 ; SSSE3-NEXT: por %xmm12, %xmm0 ; SSSE3-NEXT: movdqa %xmm1, %xmm12 ; SSSE3-NEXT: pxor %xmm8, %xmm12 @@ -1860,16 +1858,16 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind { ; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; SSSE3-NEXT: por %xmm5, %xmm4 ; SSSE3-NEXT: pxor %xmm12, %xmm4 -; SSSE3-NEXT: movdqa %xmm4, %xmm12 -; SSSE3-NEXT: pandn %xmm1, %xmm12 -; SSSE3-NEXT: pxor %xmm5, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm5 +; SSSE3-NEXT: movdqa %xmm4, %xmm13 +; SSSE3-NEXT: pandn %xmm1, %xmm13 +; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm1[1,1,3,3] ; SSSE3-NEXT: pandn %xmm9, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSSE3-NEXT: pxor %xmm5, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm12, %xmm5 ; SSSE3-NEXT: pand %xmm10, %xmm5 ; SSSE3-NEXT: por %xmm5, %xmm1 ; SSSE3-NEXT: pand %xmm4, %xmm1 -; SSSE3-NEXT: por %xmm12, %xmm1 +; SSSE3-NEXT: por %xmm13, %xmm1 ; SSSE3-NEXT: movdqa %xmm2, %xmm12 ; SSSE3-NEXT: pxor %xmm8, %xmm12 ; SSSE3-NEXT: psubq %xmm6, %xmm2 @@ -1895,10 +1893,10 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind { ; SSSE3-NEXT: pxor %xmm4, %xmm5 ; SSSE3-NEXT: movdqa %xmm5, %xmm4 ; SSSE3-NEXT: pandn %xmm2, %xmm4 -; SSSE3-NEXT: pxor %xmm6, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm2[1,1,3,3] ; SSSE3-NEXT: pandn %xmm9, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSSE3-NEXT: pxor %xmm6, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm12, %xmm6 ; SSSE3-NEXT: pand %xmm10, %xmm6 ; SSSE3-NEXT: por %xmm6, %xmm2 ; SSSE3-NEXT: pand %xmm5, %xmm2 @@ -1928,11 +1926,11 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind { ; SSSE3-NEXT: pxor %xmm5, %xmm4 ; SSSE3-NEXT: movdqa %xmm4, %xmm5 ; SSSE3-NEXT: pandn %xmm3, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm11 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,1,3,3] ; SSSE3-NEXT: pandn %xmm9, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm11[1,1,3,3] -; SSSE3-NEXT: pand %xmm10, %xmm6 -; SSSE3-NEXT: por %xmm6, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm11 +; SSSE3-NEXT: pand %xmm10, %xmm11 +; SSSE3-NEXT: por %xmm11, %xmm3 ; SSSE3-NEXT: pand %xmm4, %xmm3 ; SSSE3-NEXT: por %xmm5, %xmm3 ; SSSE3-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp.ll b/llvm/test/CodeGen/X86/vec_int_to_fp.ll index 05c6a799..9361af1 100644 --- a/llvm/test/CodeGen/X86/vec_int_to_fp.ll +++ b/llvm/test/CodeGen/X86/vec_int_to_fp.ll @@ -1893,24 +1893,21 @@ define <4 x float> @uitofp_2i64_to_4f32(<2 x i64> %a) { ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,1] ; SSE41-NEXT: pand %xmm0, %xmm1 ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pxor %xmm3, %xmm3 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE41-NEXT: movdqa %xmm0, %xmm4 -; SSE41-NEXT: psrlq $1, %xmm4 -; SSE41-NEXT: por %xmm1, %xmm4 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 -; SSE41-NEXT: pextrq $1, %xmm2, %rax -; SSE41-NEXT: xorps %xmm0, %xmm0 -; SSE41-NEXT: cvtsi2ss %rax, %xmm0 -; SSE41-NEXT: movq %xmm2, %rax -; SSE41-NEXT: xorps %xmm1, %xmm1 -; SSE41-NEXT: cvtsi2ss %rax, %xmm1 -; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],xmm0[0],zero,zero -; SSE41-NEXT: 
movaps %xmm1, %xmm2 -; SSE41-NEXT: addps %xmm1, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,3,2,3] -; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: psrlq $1, %xmm2 +; SSE41-NEXT: por %xmm1, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm0 +; SSE41-NEXT: pextrq $1, %xmm0, %rax +; SSE41-NEXT: cvtsi2ss %rax, %xmm3 +; SSE41-NEXT: movq %xmm0, %rax +; SSE41-NEXT: xorps %xmm2, %xmm2 +; SSE41-NEXT: cvtsi2ss %rax, %xmm2 +; SSE41-NEXT: insertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],zero,zero +; SSE41-NEXT: movaps %xmm2, %xmm3 +; SSE41-NEXT: addps %xmm2, %xmm3 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movaps %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; VEX-LABEL: uitofp_2i64_to_4f32: @@ -2011,24 +2008,21 @@ define <4 x float> @uitofp_2i64_to_2f32(<2 x i64> %a) { ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,1] ; SSE41-NEXT: pand %xmm0, %xmm1 ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pxor %xmm3, %xmm3 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE41-NEXT: movdqa %xmm0, %xmm4 -; SSE41-NEXT: psrlq $1, %xmm4 -; SSE41-NEXT: por %xmm1, %xmm4 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 -; SSE41-NEXT: pextrq $1, %xmm2, %rax -; SSE41-NEXT: xorps %xmm0, %xmm0 -; SSE41-NEXT: cvtsi2ss %rax, %xmm0 -; SSE41-NEXT: movq %xmm2, %rax -; SSE41-NEXT: xorps %xmm1, %xmm1 -; SSE41-NEXT: cvtsi2ss %rax, %xmm1 -; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],xmm0[0],zero,zero -; SSE41-NEXT: movaps %xmm1, %xmm2 -; SSE41-NEXT: addps %xmm1, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,3,2,3] -; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: movq {{.*#+}} xmm0 = xmm1[0],zero +; SSE41-NEXT: psrlq $1, %xmm2 +; SSE41-NEXT: por %xmm1, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm0 +; SSE41-NEXT: pextrq $1, %xmm0, %rax +; SSE41-NEXT: xorps %xmm2, %xmm2 +; SSE41-NEXT: cvtsi2ss %rax, %xmm2 +; SSE41-NEXT: movq %xmm0, %rax +; SSE41-NEXT: cvtsi2ss %rax, %xmm3 +; SSE41-NEXT: insertps {{.*#+}} xmm3 = xmm3[0],xmm2[0],zero,zero +; SSE41-NEXT: movaps %xmm3, %xmm2 +; SSE41-NEXT: addps %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm3 +; SSE41-NEXT: movq {{.*#+}} xmm0 = xmm3[0],zero ; SSE41-NEXT: retq ; ; VEX-LABEL: uitofp_2i64_to_2f32: diff --git a/llvm/test/CodeGen/X86/vec_saddo.ll b/llvm/test/CodeGen/X86/vec_saddo.ll index c341db6..ede2019 100644 --- a/llvm/test/CodeGen/X86/vec_saddo.ll +++ b/llvm/test/CodeGen/X86/vec_saddo.ll @@ -778,11 +778,11 @@ define <2 x i32> @saddo_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64>* %p2) noun ; SSE-NEXT: pand %xmm4, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE-NEXT: pxor %xmm2, %xmm2 ; SSE-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; SSE-NEXT: pxor %xmm3, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pxor %xmm3, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] ; SSE-NEXT: movdqa %xmm0, (%rdi) ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll index ec30dba..ea795d9 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll @@ -868,9 +868,9 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind ; ; XOPAVX2-LABEL: 
splatvar_funnnel_v2i64: ; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1 ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX2-NEXT: vpsubq %xmm1, %xmm2, %xmm1 -; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1 ; XOPAVX2-NEXT: vprotq %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: retq ; @@ -1007,17 +1007,17 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %amt) nounwind ; ; XOPAVX1-LABEL: splatvar_funnnel_v4i32: ; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1 -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; XOPAVX1-NEXT: vprotd %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: splatvar_funnnel_v4i32: ; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpbroadcastd %xmm1, %xmm1 ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX2-NEXT: vpsubd %xmm1, %xmm2, %xmm1 -; XOPAVX2-NEXT: vpbroadcastd %xmm1, %xmm1 ; XOPAVX2-NEXT: vprotd %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: retq ; @@ -1169,9 +1169,9 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind ; ; XOPAVX2-LABEL: splatvar_funnnel_v8i16: ; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpbroadcastw %xmm1, %xmm1 ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX2-NEXT: vpsubw %xmm1, %xmm2, %xmm1 -; XOPAVX2-NEXT: vpbroadcastw %xmm1, %xmm1 ; XOPAVX2-NEXT: vprotw %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: retq ; @@ -1410,9 +1410,9 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind ; ; XOPAVX2-LABEL: splatvar_funnnel_v16i8: ; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1 -; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 ; XOPAVX2-NEXT: vprotb %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-pcmp.ll b/llvm/test/CodeGen/X86/vector-pcmp.ll index a6a0f01..656f678 100644 --- a/llvm/test/CodeGen/X86/vector-pcmp.ll +++ b/llvm/test/CodeGen/X86/vector-pcmp.ll @@ -61,9 +61,9 @@ define <4 x i32> @test_pcmpgtd(<4 x i32> %x) { define <2 x i64> @test_pcmpgtq(<2 x i64> %x) { ; SSE2-LABEL: test_pcmpgtq: ; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE2-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; SSE2-NEXT: retq ; ; SSE42-LABEL: test_pcmpgtq: @@ -186,11 +186,11 @@ define <8 x i32> @test_pcmpgtd_256(<8 x i32> %x) { define <4 x i64> @test_pcmpgtq_256(<4 x i64> %x) { ; SSE2-LABEL: test_pcmpgtq_256: ; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 ; SSE2-NEXT: pcmpgtd %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpgtd %xmm2, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pcmpgtd %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; SSE42-LABEL: test_pcmpgtq_256: diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll index 46d2d97..ab84a58 100644 --- a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll @@ -653,9 +653,9 @@ define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; ; XOPAVX2-LABEL: splatvar_shift_v2i64: ; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1 ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX2-NEXT: vpsubq %xmm1, %xmm2, %xmm1 -; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1 ; XOPAVX2-NEXT: vpshaq %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: retq ; @@ 
-855,9 +855,9 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; ; XOPAVX2-LABEL: splatvar_shift_v16i8: ; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1 -; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 ; XOPAVX2-NEXT: vpshab %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll index 61908e2..7cea8eb 100644 --- a/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll @@ -705,9 +705,9 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; ; XOPAVX2-LABEL: splatvar_shift_v16i8: ; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1 -; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 ; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: retq ; -- 2.7.4
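
Illustration (not part of the patch, and not LLVM code): a minimal standalone C++ sketch of why the canonicalization shuffle(binop(X, C)) -> binop(shuffle(X), shuffle(C)) is sound when C is a "freely shufflable" constant such as all-zeros or all-ones — any lane permutation of a splat constant is the same constant, so the shuffle can be hoisted above the binop without changing the result. The helper names below (shuffle, xorV) are hypothetical and exist only for this demonstration; they are not the APIs used in X86ISelLowering.cpp.

    // Standalone sketch: assumes nothing beyond the C++ standard library.
    #include <array>
    #include <cassert>
    #include <cstdint>

    using V4 = std::array<uint32_t, 4>;

    // PSHUFD-style unary shuffle: lane I of the result takes lane Mask[I] of Src.
    static V4 shuffle(const V4 &Src, const std::array<int, 4> &Mask) {
      V4 R{};
      for (int I = 0; I != 4; ++I)
        R[I] = Src[Mask[I]];
      return R;
    }

    // Lane-wise XOR, standing in for any binop with a splattable operand.
    static V4 xorV(const V4 &A, const V4 &B) {
      V4 R{};
      for (int I = 0; I != 4; ++I)
        R[I] = A[I] ^ B[I];
      return R;
    }

    int main() {
      V4 X = {1, 2, 3, 4};
      V4 AllOnes = {~0u, ~0u, ~0u, ~0u};      // splat constant: shuffling it is free
      std::array<int, 4> Mask = {1, 1, 3, 3}; // e.g. pshufd $0xF5 style mask

      // Before the fold: shuffle(xor(X, -1))   After: xor(shuffle(X), -1)
      V4 Before = shuffle(xorV(X, AllOnes), Mask);
      V4 After = xorV(shuffle(X, Mask), AllOnes);
      assert(Before == After); // both orderings produce identical lanes
      return 0;
    }

The same reasoning is what the generalized IsMergeableWithShuffle check relies on: once the constant operand is known to be all-zeros or all-ones, moving the unary shuffle onto the non-constant operand exposes patterns (such as NOT) further up the DAG, which is why many of the test diffs above simply show the pshufd/vpbroadcast moving ahead of the pxor/pcmpgtd/psubd it used to follow.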