From 4431a29c60e7c56fe17a1053f7ae55994af6fdba Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Sun, 5 Apr 2020 22:13:53 +0100
Subject: [PATCH] [X86][SSE] Combine unary shuffle(HORIZOP,HORIZOP) -> HORIZOP

We had previously limited the shuffle(HORIZOP,HORIZOP) combine to binary
shuffles, but we can often merge unary shuffles just as well, folding
UNDEF/ZERO values into the 64-bit half-lanes.

For (P)HADD/HSUB this is limited to fast-horizontal cases, but the
PACKSS/PACKUS combines apply in all cases.
---
 llvm/lib/Target/X86/X86ISelLowering.cpp       |  43 +++--
 llvm/test/CodeGen/X86/haddsub-shuf.ll         | 264 ++++++++++++++++++--------
 llvm/test/CodeGen/X86/haddsub-undef.ll        |   5 +-
 llvm/test/CodeGen/X86/phaddsub-undef.ll       |   5 +-
 llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll |  12 +-
 llvm/test/CodeGen/X86/vec_saddo.ll            |  24 +--
 llvm/test/CodeGen/X86/vec_smulo.ll            |  44 ++---
 llvm/test/CodeGen/X86/vec_ssubo.ll            |  24 +--
 llvm/test/CodeGen/X86/vec_uaddo.ll            |  24 +--
 llvm/test/CodeGen/X86/vec_umulo.ll            |  20 +-
 llvm/test/CodeGen/X86/vec_usubo.ll            |  24 +--
 11 files changed, 296 insertions(+), 193 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index be788a9..c4a9b85 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -35347,14 +35347,14 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
   SmallVector TargetMask;
   SmallVector TargetOps;
   if (isTargetShuffle(Opcode))
-    getTargetShuffleMask(N.getNode(), VT, false, TargetOps, TargetMask, IsUnary);
+    getTargetShuffleMask(N.getNode(), VT, true, TargetOps, TargetMask, IsUnary);
 
   // Combine binary shuffle of 2 similar 'Horizontal' instructions into a
   // single instruction. Attempt to match a v2X64 repeating shuffle pattern that
   // represents the LHS/RHS inputs for the lower/upper halves.
   SmallVector TargetMask128;
-  if (!TargetMask.empty() && TargetOps.size() == 2 &&
-      is128BitLaneRepeatedShuffleMask(VT, TargetMask, TargetMask128)) {
+  if (!TargetMask.empty() && 0 < TargetOps.size() && TargetOps.size() <= 2 &&
+      isRepeatedTargetShuffleMask(128, VT, TargetMask, TargetMask128)) {
     SmallVector WidenedMask128 = TargetMask128;
     while (WidenedMask128.size() > 2) {
       SmallVector WidenedMask;
@@ -35362,23 +35362,36 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
         break;
       WidenedMask128 = std::move(WidenedMask);
     }
-    if (WidenedMask128.size() == 2 && isInRange(WidenedMask128, 0, 4)) {
-      SDValue BC0 = peekThroughBitcasts(TargetOps[0]);
-      SDValue BC1 = peekThroughBitcasts(TargetOps[1]);
+    if (WidenedMask128.size() == 2) {
+      assert(isUndefOrZeroOrInRange(WidenedMask128, 0, 4) && "Illegal shuffle");
+      SDValue BC0 = peekThroughBitcasts(TargetOps.front());
+      SDValue BC1 = peekThroughBitcasts(TargetOps.back());
       EVT VT0 = BC0.getValueType();
       EVT VT1 = BC1.getValueType();
       unsigned Opcode0 = BC0.getOpcode();
       unsigned Opcode1 = BC1.getOpcode();
+      bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
+                      Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB);
       if (Opcode0 == Opcode1 && VT0 == VT1 &&
-          (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
-           Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB ||
-           Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS)) {
-        SDValue Lo = isInRange(WidenedMask128[0], 0, 2) ? BC0 : BC1;
-        SDValue Hi = isInRange(WidenedMask128[1], 0, 2) ?
BC0 : BC1; - Lo = Lo.getOperand(WidenedMask128[0] & 1); - Hi = Hi.getOperand(WidenedMask128[1] & 1); - SDValue Horiz = DAG.getNode(Opcode0, DL, VT0, Lo, Hi); - return DAG.getBitcast(VT, Horiz); + (isHoriz || Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS)) { + bool SingleOp = (TargetOps.size() == 1); + if (!isHoriz || shouldUseHorizontalOp(SingleOp, DAG, Subtarget)) { + SDValue Lo = isInRange(WidenedMask128[0], 0, 2) ? BC0 : BC1; + SDValue Hi = isInRange(WidenedMask128[1], 0, 2) ? BC0 : BC1; + Lo = Lo.getOperand(WidenedMask128[0] & 1); + Hi = Hi.getOperand(WidenedMask128[1] & 1); + if (SingleOp) { + MVT SrcVT = BC0.getOperand(0).getSimpleValueType(); + SDValue Undef = DAG.getUNDEF(SrcVT); + SDValue Zero = getZeroVector(SrcVT, Subtarget, DAG, DL); + Lo = (WidenedMask128[0] == SM_SentinelZero ? Zero : Lo); + Hi = (WidenedMask128[1] == SM_SentinelZero ? Zero : Hi); + Lo = (WidenedMask128[0] == SM_SentinelUndef ? Undef : Lo); + Hi = (WidenedMask128[1] == SM_SentinelUndef ? Undef : Hi); + } + SDValue Horiz = DAG.getNode(Opcode0, DL, VT0, Lo, Hi); + return DAG.getBitcast(VT, Horiz); + } } } } diff --git a/llvm/test/CodeGen/X86/haddsub-shuf.ll b/llvm/test/CodeGen/X86/haddsub-shuf.ll index d354811..7cf3e2d 100644 --- a/llvm/test/CodeGen/X86/haddsub-shuf.ll +++ b/llvm/test/CodeGen/X86/haddsub-shuf.ll @@ -27,21 +27,37 @@ define <4 x float> @hadd_v4f32(<4 x float> %a) { } define <8 x float> @hadd_v8f32a(<8 x float> %a) { -; SSSE3-LABEL: hadd_v8f32a: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movaps %xmm0, %xmm2 -; SSSE3-NEXT: haddps %xmm1, %xmm2 -; SSSE3-NEXT: movddup {{.*#+}} xmm0 = xmm2[0,0] -; SSSE3-NEXT: movaps %xmm2, %xmm1 -; SSSE3-NEXT: retq +; SSSE3_SLOW-LABEL: hadd_v8f32a: +; SSSE3_SLOW: # %bb.0: +; SSSE3_SLOW-NEXT: movaps %xmm0, %xmm2 +; SSSE3_SLOW-NEXT: haddps %xmm1, %xmm2 +; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm2[0,0] +; SSSE3_SLOW-NEXT: movaps %xmm2, %xmm1 +; SSSE3_SLOW-NEXT: retq ; -; AVX1-LABEL: hadd_v8f32a: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vhaddps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; SSSE3_FAST-LABEL: hadd_v8f32a: +; SSSE3_FAST: # %bb.0: +; SSSE3_FAST-NEXT: movaps %xmm0, %xmm2 +; SSSE3_FAST-NEXT: haddps %xmm1, %xmm2 +; SSSE3_FAST-NEXT: haddps %xmm0, %xmm0 +; SSSE3_FAST-NEXT: movaps %xmm2, %xmm1 +; SSSE3_FAST-NEXT: retq +; +; AVX1_SLOW-LABEL: hadd_v8f32a: +; AVX1_SLOW: # %bb.0: +; AVX1_SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1_SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0 +; AVX1_SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0] +; AVX1_SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1_SLOW-NEXT: retq +; +; AVX1_FAST-LABEL: hadd_v8f32a: +; AVX1_FAST: # %bb.0: +; AVX1_FAST-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1_FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm1 +; AVX1_FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX1_FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1_FAST-NEXT: retq ; ; AVX2-LABEL: hadd_v8f32a: ; AVX2: # %bb.0: @@ -92,21 +108,37 @@ define <4 x float> @hsub_v4f32(<4 x float> %a) { } define <8 x float> @hsub_v8f32a(<8 x float> %a) { -; SSSE3-LABEL: hsub_v8f32a: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movaps %xmm0, %xmm2 -; SSSE3-NEXT: hsubps %xmm1, %xmm2 -; SSSE3-NEXT: movddup {{.*#+}} xmm0 = xmm2[0,0] -; SSSE3-NEXT: movaps %xmm2, %xmm1 -; SSSE3-NEXT: retq +; SSSE3_SLOW-LABEL: hsub_v8f32a: +; SSSE3_SLOW: # %bb.0: +; SSSE3_SLOW-NEXT: movaps %xmm0, %xmm2 +; SSSE3_SLOW-NEXT: hsubps %xmm1, %xmm2 +; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 
= xmm2[0,0] +; SSSE3_SLOW-NEXT: movaps %xmm2, %xmm1 +; SSSE3_SLOW-NEXT: retq ; -; AVX1-LABEL: hsub_v8f32a: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vhsubps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; SSSE3_FAST-LABEL: hsub_v8f32a: +; SSSE3_FAST: # %bb.0: +; SSSE3_FAST-NEXT: movaps %xmm0, %xmm2 +; SSSE3_FAST-NEXT: hsubps %xmm1, %xmm2 +; SSSE3_FAST-NEXT: hsubps %xmm0, %xmm0 +; SSSE3_FAST-NEXT: movaps %xmm2, %xmm1 +; SSSE3_FAST-NEXT: retq +; +; AVX1_SLOW-LABEL: hsub_v8f32a: +; AVX1_SLOW: # %bb.0: +; AVX1_SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1_SLOW-NEXT: vhsubps %xmm1, %xmm0, %xmm0 +; AVX1_SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0] +; AVX1_SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1_SLOW-NEXT: retq +; +; AVX1_FAST-LABEL: hsub_v8f32a: +; AVX1_FAST: # %bb.0: +; AVX1_FAST-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1_FAST-NEXT: vhsubps %xmm1, %xmm0, %xmm1 +; AVX1_FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0 +; AVX1_FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1_FAST-NEXT: retq ; ; AVX2-LABEL: hsub_v8f32a: ; AVX2: # %bb.0: @@ -477,21 +509,37 @@ define <4 x i32> @hadd_v4i32(<4 x i32> %a) { } define <8 x i32> @hadd_v8i32a(<8 x i32> %a) { -; SSSE3-LABEL: hadd_v8i32a: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: phaddd %xmm1, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1] -; SSSE3-NEXT: movdqa %xmm2, %xmm1 -; SSSE3-NEXT: retq +; SSSE3_SLOW-LABEL: hadd_v8i32a: +; SSSE3_SLOW: # %bb.0: +; SSSE3_SLOW-NEXT: movdqa %xmm0, %xmm2 +; SSSE3_SLOW-NEXT: phaddd %xmm1, %xmm2 +; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1] +; SSSE3_SLOW-NEXT: movdqa %xmm2, %xmm1 +; SSSE3_SLOW-NEXT: retq ; -; AVX1-LABEL: hadd_v8i32a: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vphaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; SSSE3_FAST-LABEL: hadd_v8i32a: +; SSSE3_FAST: # %bb.0: +; SSSE3_FAST-NEXT: movdqa %xmm0, %xmm2 +; SSSE3_FAST-NEXT: phaddd %xmm1, %xmm2 +; SSSE3_FAST-NEXT: phaddd %xmm0, %xmm0 +; SSSE3_FAST-NEXT: movdqa %xmm2, %xmm1 +; SSSE3_FAST-NEXT: retq +; +; AVX1_SLOW-LABEL: hadd_v8i32a: +; AVX1_SLOW: # %bb.0: +; AVX1_SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1_SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0 +; AVX1_SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] +; AVX1_SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1_SLOW-NEXT: retq +; +; AVX1_FAST-LABEL: hadd_v8i32a: +; AVX1_FAST: # %bb.0: +; AVX1_FAST-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1_FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm1 +; AVX1_FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1_FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1_FAST-NEXT: retq ; ; AVX2-LABEL: hadd_v8i32a: ; AVX2: # %bb.0: @@ -551,21 +599,37 @@ define <4 x i32> @hsub_v4i32(<4 x i32> %a) { } define <8 x i32> @hsub_v8i32a(<8 x i32> %a) { -; SSSE3-LABEL: hsub_v8i32a: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: phsubd %xmm1, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1] -; SSSE3-NEXT: movdqa %xmm2, %xmm1 -; SSSE3-NEXT: retq +; SSSE3_SLOW-LABEL: hsub_v8i32a: +; SSSE3_SLOW: # %bb.0: +; SSSE3_SLOW-NEXT: movdqa %xmm0, %xmm2 +; SSSE3_SLOW-NEXT: phsubd %xmm1, %xmm2 +; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1] +; SSSE3_SLOW-NEXT: movdqa %xmm2, %xmm1 +; SSSE3_SLOW-NEXT: retq ; -; AVX1-LABEL: hsub_v8i32a: -; AVX1: # %bb.0: -; 
AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vphsubd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; SSSE3_FAST-LABEL: hsub_v8i32a: +; SSSE3_FAST: # %bb.0: +; SSSE3_FAST-NEXT: movdqa %xmm0, %xmm2 +; SSSE3_FAST-NEXT: phsubd %xmm1, %xmm2 +; SSSE3_FAST-NEXT: phsubd %xmm0, %xmm0 +; SSSE3_FAST-NEXT: movdqa %xmm2, %xmm1 +; SSSE3_FAST-NEXT: retq +; +; AVX1_SLOW-LABEL: hsub_v8i32a: +; AVX1_SLOW: # %bb.0: +; AVX1_SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1_SLOW-NEXT: vphsubd %xmm1, %xmm0, %xmm0 +; AVX1_SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] +; AVX1_SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1_SLOW-NEXT: retq +; +; AVX1_FAST-LABEL: hsub_v8i32a: +; AVX1_FAST: # %bb.0: +; AVX1_FAST-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1_FAST-NEXT: vphsubd %xmm1, %xmm0, %xmm1 +; AVX1_FAST-NEXT: vphsubd %xmm0, %xmm0, %xmm0 +; AVX1_FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1_FAST-NEXT: retq ; ; AVX2-LABEL: hsub_v8i32a: ; AVX2: # %bb.0: @@ -625,21 +689,37 @@ define <8 x i16> @hadd_v8i16(<8 x i16> %a) { } define <16 x i16> @hadd_v16i16a(<16 x i16> %a) { -; SSSE3-LABEL: hadd_v16i16a: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: phaddw %xmm1, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1] -; SSSE3-NEXT: movdqa %xmm2, %xmm1 -; SSSE3-NEXT: retq +; SSSE3_SLOW-LABEL: hadd_v16i16a: +; SSSE3_SLOW: # %bb.0: +; SSSE3_SLOW-NEXT: movdqa %xmm0, %xmm2 +; SSSE3_SLOW-NEXT: phaddw %xmm1, %xmm2 +; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1] +; SSSE3_SLOW-NEXT: movdqa %xmm2, %xmm1 +; SSSE3_SLOW-NEXT: retq ; -; AVX1-LABEL: hadd_v16i16a: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vphaddw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; SSSE3_FAST-LABEL: hadd_v16i16a: +; SSSE3_FAST: # %bb.0: +; SSSE3_FAST-NEXT: movdqa %xmm0, %xmm2 +; SSSE3_FAST-NEXT: phaddw %xmm1, %xmm2 +; SSSE3_FAST-NEXT: phaddw %xmm0, %xmm0 +; SSSE3_FAST-NEXT: movdqa %xmm2, %xmm1 +; SSSE3_FAST-NEXT: retq +; +; AVX1_SLOW-LABEL: hadd_v16i16a: +; AVX1_SLOW: # %bb.0: +; AVX1_SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1_SLOW-NEXT: vphaddw %xmm1, %xmm0, %xmm0 +; AVX1_SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] +; AVX1_SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1_SLOW-NEXT: retq +; +; AVX1_FAST-LABEL: hadd_v16i16a: +; AVX1_FAST: # %bb.0: +; AVX1_FAST-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1_FAST-NEXT: vphaddw %xmm1, %xmm0, %xmm1 +; AVX1_FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX1_FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1_FAST-NEXT: retq ; ; AVX2-LABEL: hadd_v16i16a: ; AVX2: # %bb.0: @@ -699,21 +779,37 @@ define <8 x i16> @hsub_v8i16(<8 x i16> %a) { } define <16 x i16> @hsub_v16i16a(<16 x i16> %a) { -; SSSE3-LABEL: hsub_v16i16a: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: phsubw %xmm1, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1] -; SSSE3-NEXT: movdqa %xmm2, %xmm1 -; SSSE3-NEXT: retq +; SSSE3_SLOW-LABEL: hsub_v16i16a: +; SSSE3_SLOW: # %bb.0: +; SSSE3_SLOW-NEXT: movdqa %xmm0, %xmm2 +; SSSE3_SLOW-NEXT: phsubw %xmm1, %xmm2 +; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1] +; SSSE3_SLOW-NEXT: movdqa %xmm2, %xmm1 +; SSSE3_SLOW-NEXT: retq ; -; AVX1-LABEL: hsub_v16i16a: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vphsubw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd 
{{.*#+}} xmm1 = xmm0[0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; SSSE3_FAST-LABEL: hsub_v16i16a: +; SSSE3_FAST: # %bb.0: +; SSSE3_FAST-NEXT: movdqa %xmm0, %xmm2 +; SSSE3_FAST-NEXT: phsubw %xmm1, %xmm2 +; SSSE3_FAST-NEXT: phsubw %xmm0, %xmm0 +; SSSE3_FAST-NEXT: movdqa %xmm2, %xmm1 +; SSSE3_FAST-NEXT: retq +; +; AVX1_SLOW-LABEL: hsub_v16i16a: +; AVX1_SLOW: # %bb.0: +; AVX1_SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1_SLOW-NEXT: vphsubw %xmm1, %xmm0, %xmm0 +; AVX1_SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] +; AVX1_SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1_SLOW-NEXT: retq +; +; AVX1_FAST-LABEL: hsub_v16i16a: +; AVX1_FAST: # %bb.0: +; AVX1_FAST-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1_FAST-NEXT: vphsubw %xmm1, %xmm0, %xmm1 +; AVX1_FAST-NEXT: vphsubw %xmm0, %xmm0, %xmm0 +; AVX1_FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1_FAST-NEXT: retq ; ; AVX2-LABEL: hsub_v16i16a: ; AVX2: # %bb.0: diff --git a/llvm/test/CodeGen/X86/haddsub-undef.ll b/llvm/test/CodeGen/X86/haddsub-undef.ll index 83e7eb4..d2eaa72 100644 --- a/llvm/test/CodeGen/X86/haddsub-undef.ll +++ b/llvm/test/CodeGen/X86/haddsub-undef.ll @@ -284,9 +284,9 @@ define <8 x float> @test11_undef(<8 x float> %a, <8 x float> %b) { ; ; SSE-FAST-LABEL: test11_undef: ; SSE-FAST: # %bb.0: +; SSE-FAST-NEXT: movaps %xmm3, %xmm1 ; SSE-FAST-NEXT: haddps %xmm0, %xmm0 -; SSE-FAST-NEXT: haddps %xmm3, %xmm3 -; SSE-FAST-NEXT: movddup {{.*#+}} xmm1 = xmm3[0,0] +; SSE-FAST-NEXT: haddps %xmm3, %xmm1 ; SSE-FAST-NEXT: retq ; ; AVX-LABEL: test11_undef: @@ -490,7 +490,6 @@ define <2 x double> @add_pd_010(<2 x double> %x) { ; AVX-FAST-LABEL: add_pd_010: ; AVX-FAST: # %bb.0: ; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 -; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX-FAST-NEXT: retq %l = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> %add = fadd <2 x double> %l, %x diff --git a/llvm/test/CodeGen/X86/phaddsub-undef.ll b/llvm/test/CodeGen/X86/phaddsub-undef.ll index b0be5c7..24d2aca 100644 --- a/llvm/test/CodeGen/X86/phaddsub-undef.ll +++ b/llvm/test/CodeGen/X86/phaddsub-undef.ll @@ -50,9 +50,9 @@ define <8 x i32> @test15_undef(<8 x i32> %a, <8 x i32> %b) { ; ; SSE-FAST-LABEL: test15_undef: ; SSE-FAST: # %bb.0: +; SSE-FAST-NEXT: movdqa %xmm3, %xmm1 ; SSE-FAST-NEXT: phaddd %xmm0, %xmm0 -; SSE-FAST-NEXT: phaddd %xmm3, %xmm3 -; SSE-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,1,0,1] +; SSE-FAST-NEXT: phaddd %xmm3, %xmm1 ; SSE-FAST-NEXT: retq ; ; AVX1-SLOW-LABEL: test15_undef: @@ -75,7 +75,6 @@ define <8 x i32> @test15_undef(<8 x i32> %a, <8 x i32> %b) { ; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-FAST-NEXT: vphaddd %xmm1, %xmm1, %xmm1 -; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-FAST-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll index 72465f8..361069e 100644 --- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll +++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll @@ -747,8 +747,7 @@ define <2 x i64> @trunc_v4i64_to_v4i16_return_v2i64(<4 x i64> %vec) nounwind { ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, 
%xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -759,8 +758,7 @@ define <2 x i64> @trunc_v4i64_to_v4i16_return_v2i64(<4 x i64> %vec) nounwind { ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX2-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -943,8 +941,7 @@ define <8 x i16> @trunc_v4i64_to_v4i16_return_v8i16(<4 x i64> %vec) nounwind { ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -955,8 +952,7 @@ define <8 x i16> @trunc_v4i64_to_v4i16_return_v8i16(<4 x i64> %vec) nounwind { ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX2-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vec_saddo.ll b/llvm/test/CodeGen/X86/vec_saddo.ll index 7854398..46347df 100644 --- a/llvm/test/CodeGen/X86/vec_saddo.ll +++ b/llvm/test/CodeGen/X86/vec_saddo.ll @@ -512,15 +512,15 @@ define <16 x i32> @saddo_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %p2) ; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm2 ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm0, %xmm5, %xmm0 -; AVX1-NEXT: vpackssdw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpmovsxbd %xmm1, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,2,3] -; AVX1-NEXT: vpmovsxbd %xmm4, %xmm4 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] -; AVX1-NEXT: vpmovsxbd %xmm4, %xmm4 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,0,1] +; AVX1-NEXT: vpackssdw %xmm4, %xmm0, %xmm4 +; AVX1-NEXT: vpacksswb %xmm1, %xmm4, %xmm0 +; AVX1-NEXT: vpmovsxbd %xmm0, %xmm5 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0 +; AVX1-NEXT: vpacksswb %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpmovsxbd %xmm1, %xmm4 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 ; AVX1-NEXT: vmovdqa %xmm8, 48(%rdi) @@ -544,9 +544,9 @@ define <16 x i32> @saddo_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %p2) ; AVX2-NEXT: vpxor %ymm0, %ymm4, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4 ; AVX2-NEXT: vpackssdw %xmm4, %xmm0, %xmm0 -; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpmovsxbd %xmm1, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0 +; AVX2-NEXT: vpacksswb %xmm0, %xmm1, %xmm1 ; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1 ; AVX2-NEXT: vmovdqa %ymm3, 32(%rdi) ; AVX2-NEXT: vmovdqa %ymm2, (%rdi) diff --git a/llvm/test/CodeGen/X86/vec_smulo.ll b/llvm/test/CodeGen/X86/vec_smulo.ll index b2639ca..dc72917 100644 --- 
a/llvm/test/CodeGen/X86/vec_smulo.ll +++ b/llvm/test/CodeGen/X86/vec_smulo.ll @@ -1156,8 +1156,8 @@ define <16 x i32> @smulo_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %p2) ; AVX1-NEXT: vpmulld %xmm4, %xmm6, %xmm8 ; AVX1-NEXT: vpsrad $31, %xmm8, %xmm6 ; AVX1-NEXT: vpcmpeqd %xmm6, %xmm5, %xmm6 -; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 -; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm6 +; AVX1-NEXT: vpcmpeqd %xmm9, %xmm9, %xmm9 +; AVX1-NEXT: vpxor %xmm6, %xmm9, %xmm6 ; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] ; AVX1-NEXT: vpmuldq %xmm7, %xmm4, %xmm4 @@ -1167,20 +1167,20 @@ define <16 x i32> @smulo_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %p2) ; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm3 ; AVX1-NEXT: vpsrad $31, %xmm3, %xmm1 ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm4, %xmm1 -; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm1 -; AVX1-NEXT: vpackssdw %xmm6, %xmm1, %xmm9 +; AVX1-NEXT: vpxor %xmm1, %xmm9, %xmm1 +; AVX1-NEXT: vpackssdw %xmm6, %xmm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 ; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[1,1,3,3] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[1,1,3,3] -; AVX1-NEXT: vpmuldq %xmm6, %xmm1, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; AVX1-NEXT: vpmuldq %xmm6, %xmm5, %xmm5 ; AVX1-NEXT: vpmuldq %xmm4, %xmm7, %xmm6 ; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3],xmm6[4,5],xmm1[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5],xmm5[6,7] ; AVX1-NEXT: vpmulld %xmm4, %xmm7, %xmm4 ; AVX1-NEXT: vpsrad $31, %xmm4, %xmm6 -; AVX1-NEXT: vpcmpeqd %xmm6, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm6, %xmm5, %xmm5 +; AVX1-NEXT: vpxor %xmm5, %xmm9, %xmm5 ; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[1,1,3,3] ; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] ; AVX1-NEXT: vpmuldq %xmm6, %xmm7, %xmm6 @@ -1190,16 +1190,16 @@ define <16 x i32> @smulo_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %p2) ; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm2 ; AVX1-NEXT: vpsrad $31, %xmm2, %xmm0 ; AVX1-NEXT: vpcmpeqd %xmm0, %xmm6, %xmm0 -; AVX1-NEXT: vpxor %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpacksswb %xmm9, %xmm0, %xmm1 -; AVX1-NEXT: vpmovsxbd %xmm1, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[1,1,2,3] -; AVX1-NEXT: vpmovsxbd %xmm5, %xmm5 -; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,0,1] -; AVX1-NEXT: vpmovsxbd %xmm5, %xmm5 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,0,1] +; AVX1-NEXT: vpxor %xmm0, %xmm9, %xmm0 +; AVX1-NEXT: vpackssdw %xmm5, %xmm0, %xmm5 +; AVX1-NEXT: vpacksswb %xmm1, %xmm5, %xmm0 +; AVX1-NEXT: vpmovsxbd %xmm0, %xmm6 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm0 +; AVX1-NEXT: vpacksswb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpmovsxbd %xmm1, %xmm5 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1 ; AVX1-NEXT: vmovdqa %xmm8, 48(%rdi) @@ -1235,9 +1235,9 @@ define <16 x i32> @smulo_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %p2) ; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4 ; AVX2-NEXT: vpackssdw %xmm4, %xmm0, %xmm0 -; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpmovsxbd %xmm1, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = 
xmm1[2,3,0,1] +; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0 +; AVX2-NEXT: vpacksswb %xmm0, %xmm1, %xmm1 ; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1 ; AVX2-NEXT: vmovdqa %ymm3, 32(%rdi) ; AVX2-NEXT: vmovdqa %ymm2, (%rdi) diff --git a/llvm/test/CodeGen/X86/vec_ssubo.ll b/llvm/test/CodeGen/X86/vec_ssubo.ll index 48834a7..1c99eff 100644 --- a/llvm/test/CodeGen/X86/vec_ssubo.ll +++ b/llvm/test/CodeGen/X86/vec_ssubo.ll @@ -517,15 +517,15 @@ define <16 x i32> @ssubo_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %p2) ; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm2 ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm0, %xmm5, %xmm0 -; AVX1-NEXT: vpackssdw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpmovsxbd %xmm1, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,2,3] -; AVX1-NEXT: vpmovsxbd %xmm4, %xmm4 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] -; AVX1-NEXT: vpmovsxbd %xmm4, %xmm4 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,0,1] +; AVX1-NEXT: vpackssdw %xmm4, %xmm0, %xmm4 +; AVX1-NEXT: vpacksswb %xmm1, %xmm4, %xmm0 +; AVX1-NEXT: vpmovsxbd %xmm0, %xmm5 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0 +; AVX1-NEXT: vpacksswb %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpmovsxbd %xmm1, %xmm4 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 ; AVX1-NEXT: vmovdqa %xmm8, 48(%rdi) @@ -549,9 +549,9 @@ define <16 x i32> @ssubo_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %p2) ; AVX2-NEXT: vpxor %ymm0, %ymm4, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4 ; AVX2-NEXT: vpackssdw %xmm4, %xmm0, %xmm0 -; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpmovsxbd %xmm1, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0 +; AVX2-NEXT: vpacksswb %xmm0, %xmm1, %xmm1 ; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1 ; AVX2-NEXT: vmovdqa %ymm3, 32(%rdi) ; AVX2-NEXT: vmovdqa %ymm2, (%rdi) diff --git a/llvm/test/CodeGen/X86/vec_uaddo.ll b/llvm/test/CodeGen/X86/vec_uaddo.ll index 3714b7e..7539667 100644 --- a/llvm/test/CodeGen/X86/vec_uaddo.ll +++ b/llvm/test/CodeGen/X86/vec_uaddo.ll @@ -601,15 +601,15 @@ define <16 x i32> @uaddo_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %p2) ; AVX1-NEXT: vpmaxud %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vpcmpeqd %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vpxor %xmm6, %xmm0, %xmm0 -; AVX1-NEXT: vpackssdw %xmm7, %xmm0, %xmm0 -; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpmovsxbd %xmm1, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[1,1,2,3] -; AVX1-NEXT: vpmovsxbd %xmm6, %xmm6 -; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[2,3,0,1] -; AVX1-NEXT: vpmovsxbd %xmm6, %xmm6 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,0,1] +; AVX1-NEXT: vpackssdw %xmm7, %xmm0, %xmm6 +; AVX1-NEXT: vpacksswb %xmm1, %xmm6, %xmm0 +; AVX1-NEXT: vpmovsxbd %xmm0, %xmm7 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm0 +; AVX1-NEXT: vpacksswb %xmm6, %xmm1, %xmm1 +; AVX1-NEXT: vpmovsxbd %xmm1, %xmm6 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm1 ; AVX1-NEXT: vmovdqa %xmm4, 48(%rdi) @@ -633,9 +633,9 @@ 
define <16 x i32> @uaddo_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %p2) ; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4 ; AVX2-NEXT: vpackssdw %xmm4, %xmm0, %xmm0 -; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpmovsxbd %xmm1, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0 +; AVX2-NEXT: vpacksswb %xmm0, %xmm1, %xmm1 ; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1 ; AVX2-NEXT: vmovdqa %ymm3, 32(%rdi) ; AVX2-NEXT: vmovdqa %ymm2, (%rdi) diff --git a/llvm/test/CodeGen/X86/vec_umulo.ll b/llvm/test/CodeGen/X86/vec_umulo.ll index 0ad11b5..10ffdca 100644 --- a/llvm/test/CodeGen/X86/vec_umulo.ll +++ b/llvm/test/CodeGen/X86/vec_umulo.ll @@ -1006,20 +1006,20 @@ define <16 x i32> @umulo_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %p2) ; AVX1-NEXT: vpcmpeqd %xmm5, %xmm8, %xmm5 ; AVX1-NEXT: vpxor %xmm5, %xmm9, %xmm5 ; AVX1-NEXT: vpackssdw %xmm13, %xmm5, %xmm5 -; AVX1-NEXT: vpacksswb %xmm11, %xmm5, %xmm5 +; AVX1-NEXT: vpacksswb %xmm11, %xmm5, %xmm7 ; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm2 ; AVX1-NEXT: vpmulld %xmm6, %xmm4, %xmm4 ; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm3 ; AVX1-NEXT: vpmulld %xmm10, %xmm12, %xmm6 -; AVX1-NEXT: vpmovsxbd %xmm5, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[1,1,2,3] +; AVX1-NEXT: vpmovsxbd %xmm7, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[1,1,2,3] ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[2,3,0,1] +; AVX1-NEXT: vpacksswb %xmm5, %xmm11, %xmm1 +; AVX1-NEXT: vpmovsxbd %xmm1, %xmm5 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,3,0,1] -; AVX1-NEXT: vpmovsxbd %xmm5, %xmm5 -; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1 ; AVX1-NEXT: vmovdqa %xmm6, 48(%rdi) ; AVX1-NEXT: vmovdqa %xmm3, 32(%rdi) ; AVX1-NEXT: vmovdqa %xmm4, 16(%rdi) @@ -1050,11 +1050,11 @@ define <16 x i32> @umulo_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %p2) ; AVX2-NEXT: vpxor %ymm6, %ymm5, %ymm5 ; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX2-NEXT: vpackssdw %xmm6, %xmm5, %xmm5 -; AVX2-NEXT: vpacksswb %xmm4, %xmm5, %xmm4 +; AVX2-NEXT: vpacksswb %xmm0, %xmm5, %xmm5 ; AVX2-NEXT: vpmulld %ymm2, %ymm0, %ymm2 ; AVX2-NEXT: vpmulld %ymm3, %ymm1, %ymm3 -; AVX2-NEXT: vpmovsxbd %xmm4, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[2,3,0,1] +; AVX2-NEXT: vpmovsxbd %xmm5, %ymm0 +; AVX2-NEXT: vpacksswb %xmm0, %xmm4, %xmm1 ; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1 ; AVX2-NEXT: vmovdqa %ymm3, 32(%rdi) ; AVX2-NEXT: vmovdqa %ymm2, (%rdi) diff --git a/llvm/test/CodeGen/X86/vec_usubo.ll b/llvm/test/CodeGen/X86/vec_usubo.ll index f4c67ec..18623f4 100644 --- a/llvm/test/CodeGen/X86/vec_usubo.ll +++ b/llvm/test/CodeGen/X86/vec_usubo.ll @@ -644,15 +644,15 @@ define <16 x i32> @usubo_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %p2) ; AVX1-NEXT: vpminud %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vpcmpeqd %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vpxor %xmm6, %xmm0, %xmm0 -; AVX1-NEXT: vpackssdw %xmm7, %xmm0, %xmm0 -; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpmovsxbd %xmm1, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[1,1,2,3] -; AVX1-NEXT: vpmovsxbd %xmm6, %xmm6 -; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[2,3,0,1] -; AVX1-NEXT: vpmovsxbd %xmm6, %xmm6 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,0,1] +; 
AVX1-NEXT: vpackssdw %xmm7, %xmm0, %xmm6 +; AVX1-NEXT: vpacksswb %xmm1, %xmm6, %xmm0 +; AVX1-NEXT: vpmovsxbd %xmm0, %xmm7 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm0 +; AVX1-NEXT: vpacksswb %xmm6, %xmm1, %xmm1 +; AVX1-NEXT: vpmovsxbd %xmm1, %xmm6 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm1 ; AVX1-NEXT: vmovdqa %xmm4, 48(%rdi) @@ -676,9 +676,9 @@ define <16 x i32> @usubo_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %p2) ; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4 ; AVX2-NEXT: vpackssdw %xmm4, %xmm0, %xmm0 -; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpmovsxbd %xmm1, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0 +; AVX2-NEXT: vpacksswb %xmm0, %xmm1, %xmm1 ; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1 ; AVX2-NEXT: vmovdqa %ymm3, 32(%rdi) ; AVX2-NEXT: vmovdqa %ymm2, (%rdi) -- 2.7.4
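
For context, a minimal LLVM IR sketch of the kind of unary pattern this fold targets (illustrative only; the function and value names below are not taken from the patch): a horizontal add whose result feeds a single shuffle that only moves 64-bit halves, with one half left undef, so the shuffle can be folded into the horizontal op itself.

; %lhs/%rhs select the even/odd elements of %a, so %hop matches an HADDPS
; whose upper half is undef.
define <4 x float> @hadd_unary_shuf(<4 x float> %a) {
  %lhs = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %rhs = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %hop = fadd <4 x float> %lhs, %rhs
  ; Unary shuffle of the horizontal-op result: only 64-bit halves move and the
  ; low half is undef, the case the combine can now merge back into the HADD.
  %res = shufflevector <4 x float> %hop, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 1>
  ret <4 x float> %res
}

The PACKSS/PACKUS handling is analogous, without the fast-horizontal restriction.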