From 18c19441d10512e2b37ce829743443eed009147f Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Tue, 10 Mar 2020 10:44:12 +0000
Subject: [PATCH] [X86][AVX] combineX86ShuffleChain - combine binary shuffles
 to X86ISD::VPERM2X128

For pre-AVX512 targets, combine binary shuffles to X86ISD::VPERM2X128 if
possible. This mainly helps optimize the blend(extract_subvector(x,1),y)
pattern.

At some point soon we're going to have to make a decision about when to
combine AVX512 shuffles more aggressively - we bail out if there is any
change in element size (to protect predicate mask merging) which means we
miss out on a lot of optimizations.
---
 llvm/lib/Target/X86/X86ISelLowering.cpp         | 52 ++++++++++++++++++-------
 llvm/test/CodeGen/X86/avg.ll                    | 15 ++++---
 llvm/test/CodeGen/X86/masked_store_trunc.ll     |  9 ++---
 llvm/test/CodeGen/X86/pr34592.ll                | 49 +++++++++++------------
 llvm/test/CodeGen/X86/vector-reduce-and-bool.ll |  9 ++---
 llvm/test/CodeGen/X86/vector-reduce-or-bool.ll  |  9 ++---
 llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll |  9 ++---
 llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll | 12 +++---
 llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll | 24 ++++++------
 llvm/test/CodeGen/X86/vector-trunc.ll           |  9 ++---
 10 files changed, 107 insertions(+), 90 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index ab677a0..e4533e0 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -33868,24 +33868,46 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
   // TODO - handle 128/256-bit lane shuffles of 512-bit vectors.
 
   // Handle 128-bit lane shuffles of 256-bit vectors.
-  // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
-  // we need to use the zeroing feature.
-  // TODO - this should support binary shuffles.
-  if (UnaryShuffle && RootVT.is256BitVector() && NumBaseMaskElts == 2 &&
-      !(Subtarget.hasAVX2() && BaseMask[0] >= -1 && BaseMask[1] >= -1) &&
-      !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) {
+  if (RootVT.is256BitVector() && NumBaseMaskElts == 2) {
     if (Depth == 0 && Root.getOpcode() == X86ISD::VPERM2X128)
       return SDValue(); // Nothing to do!
     MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
-    unsigned PermMask = 0;
-    PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
-    PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
-
-    Res = DAG.getBitcast(ShuffleVT, V1);
-    Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,
-                      DAG.getUNDEF(ShuffleVT),
-                      DAG.getTargetConstant(PermMask, DL, MVT::i8));
-    return DAG.getBitcast(RootVT, Res);
+
+    // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
+    // we need to use the zeroing feature.
+    if (UnaryShuffle &&
+        !(Subtarget.hasAVX2() && isUndefOrInRange(BaseMask, 0, 2)) &&
+        !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) {
+      unsigned PermMask = 0;
+      PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
+      PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
+
+      Res = DAG.getBitcast(ShuffleVT, V1);
+      Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,
+                        DAG.getUNDEF(ShuffleVT),
+                        DAG.getTargetConstant(PermMask, DL, MVT::i8));
+      return DAG.getBitcast(RootVT, Res);
+    }
+
+    // TODO - handle AVX512VL cases with X86ISD::SHUF128.
+    if (!UnaryShuffle && !IsEVEXShuffle) {
+      assert(llvm::all_of(BaseMask, [](int M) { return 0 <= M && M < 4; }) &&
+             "Unexpected shuffle sentinel value");
+      // Prefer blends to X86ISD::VPERM2X128.
+      if (!((BaseMask[0] == 0 && BaseMask[1] == 3) ||
+            (BaseMask[0] == 2 && BaseMask[1] == 1))) {
+        unsigned PermMask = 0;
+        PermMask |= ((BaseMask[0] & 3) << 0);
+        PermMask |= ((BaseMask[1] & 3) << 4);
+
+        Res = DAG.getNode(
+            X86ISD::VPERM2X128, DL, ShuffleVT,
+            DAG.getBitcast(ShuffleVT, isInRange(BaseMask[0], 0, 2) ? V1 : V2),
+            DAG.getBitcast(ShuffleVT, isInRange(BaseMask[1], 0, 2) ? V1 : V2),
+            DAG.getTargetConstant(PermMask, DL, MVT::i8));
+        return DAG.getBitcast(RootVT, Res);
+      }
+    }
   }
 
   // For masks that have been widened to 128-bit elements or more,
diff --git a/llvm/test/CodeGen/X86/avg.ll b/llvm/test/CodeGen/X86/avg.ll
index d2e8f6d..0640141 100644
--- a/llvm/test/CodeGen/X86/avg.ll
+++ b/llvm/test/CodeGen/X86/avg.ll
@@ -377,21 +377,21 @@ define void @avg_v48i8(<48 x i8>* %a, <48 x i8>* %b) nounwind {
 ;
 ; AVX2-LABEL: avg_v48i8:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastq 8(%rdi), %xmm0
+; AVX2-NEXT: vpbroadcastq 24(%rdi), %xmm0
 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpbroadcastq 24(%rdi), %xmm2
+; AVX2-NEXT: vpbroadcastq 8(%rdi), %xmm2
 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero
 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
 ; AVX2-NEXT: vpbroadcastq 40(%rdi), %xmm4
 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero
 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpbroadcastq 8(%rsi), %xmm6
+; AVX2-NEXT: vpbroadcastq 24(%rsi), %xmm6
 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero
 ; AVX2-NEXT: vpaddd %ymm6, %ymm0, %ymm0
 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
 ; AVX2-NEXT: vpaddd %ymm6, %ymm1, %ymm1
-; AVX2-NEXT: vpbroadcastq 24(%rsi), %xmm6
+; AVX2-NEXT: vpbroadcastq 8(%rsi), %xmm6
 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero
 ; AVX2-NEXT: vpaddd %ymm6, %ymm2, %ymm2
 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
@@ -423,10 +423,9 @@ define void @avg_v48i8(<48 x i8>* %a, <48 x i8>* %b) nounwind {
 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
 ; AVX2-NEXT: vpackusdw %ymm6, %ymm2, %ymm2
 ; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm2
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm3
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-NEXT: vpackuswb %ymm0, %ymm3, %ymm0
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm2[2,3],ymm0[2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
+; AVX2-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm5[2,3],ymm4[2,3]
 ; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm3
 ; AVX2-NEXT: vpackusdw %ymm2, %ymm3, %ymm2
diff --git a/llvm/test/CodeGen/X86/masked_store_trunc.ll b/llvm/test/CodeGen/X86/masked_store_trunc.ll
index 5b2bf3b..5c8c40e 100644
--- a/llvm/test/CodeGen/X86/masked_store_trunc.ll
+++ b/llvm/test/CodeGen/X86/masked_store_trunc.ll
@@ -5146,12 +5146,11 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, <32 x i8>* %p, <32 x i8> %ma
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
 ; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm4
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-NEXT: vpackuswb %ymm0, %ymm4, %ymm0
+; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm0[2,3],ymm1[2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpackuswb %ymm4, %ymm0, %ymm0
 ; AVX2-NEXT: vpcmpeqb %ymm3, %ymm2, %ymm1
 ; AVX2-NEXT: vpmovmskb %ymm1, %eax
 ; AVX2-NEXT: notl %eax
diff --git a/llvm/test/CodeGen/X86/pr34592.ll b/llvm/test/CodeGen/X86/pr34592.ll
index 9249450..0f73036 100644
--- a/llvm/test/CodeGen/X86/pr34592.ll
+++ b/llvm/test/CodeGen/X86/pr34592.ll
@@ -10,7 +10,7 @@ define <16 x i64> @pluto(<16 x i64> %arg, <16 x i64> %arg1, <16 x i64> %arg2, <1
 ; CHECK-NEXT: movq %rsp, %rbp
 ; CHECK-NEXT: .cfi_def_cfa_register %rbp
 ; CHECK-NEXT: andq $-32, %rsp
-; CHECK-NEXT: subq $128, %rsp
+; CHECK-NEXT: subq $192, %rsp
 ; CHECK-NEXT: vmovaps 240(%rbp), %ymm8
 ; CHECK-NEXT: vmovaps 208(%rbp), %ymm9
 ; CHECK-NEXT: vmovaps 176(%rbp), %ymm10
@@ -20,36 +20,37 @@
 ; CHECK-NEXT: vmovaps 48(%rbp), %ymm14
 ; CHECK-NEXT: vmovaps 16(%rbp), %ymm15
 ; CHECK-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7]
-; CHECK-NEXT: vmovaps %xmm9, %xmm6
+; CHECK-NEXT: vmovaps %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; CHECK-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; CHECK-NEXT: # implicit-def: $ymm0
-; CHECK-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0
-; CHECK-NEXT: vpalignr {{.*#+}} ymm11 = ymm2[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
-; CHECK-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,0]
-; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5],ymm11[6,7]
-; CHECK-NEXT: # kill: def $xmm2 killed $xmm2 killed $ymm2
-; CHECK-NEXT: # implicit-def: $ymm11
-; CHECK-NEXT: vinserti128 $1, %xmm2, %ymm11, %ymm11
-; CHECK-NEXT: vextracti128 $1, %ymm7, %xmm2
-; CHECK-NEXT: vmovq {{.*#+}} xmm2 = xmm2[0],zero
-; CHECK-NEXT: # implicit-def: $ymm6
-; CHECK-NEXT: vmovaps %xmm2, %xmm6
-; CHECK-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm11[4,5,6,7]
-; CHECK-NEXT: vmovaps %xmm7, %xmm6
-; CHECK-NEXT: vpslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,xmm6[0,1,2,3,4,5,6,7]
-; CHECK-NEXT: # implicit-def: $ymm11
-; CHECK-NEXT: vmovaps %xmm6, %xmm11
-; CHECK-NEXT: vpalignr {{.*#+}} ymm9 = ymm9[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
+; CHECK-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm0
+; CHECK-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
+; CHECK-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,0]
+; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7]
+; CHECK-NEXT: vmovaps %xmm7, %xmm9
+; CHECK-NEXT: vpslldq {{.*#+}} xmm9 = zero,zero,zero,zero,zero,zero,zero,zero,xmm9[0,1,2,3,4,5,6,7]
+; CHECK-NEXT: # implicit-def: $ymm2
+; CHECK-NEXT: vmovaps %xmm9, %xmm2
+; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; CHECK-NEXT: vpalignr {{.*#+}} ymm9 = ymm11[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
 ; CHECK-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,3]
-; CHECK-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7]
-; CHECK-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3],ymm7[4,5,6,7]
-; CHECK-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,1,3]
+; CHECK-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7]
+; CHECK-NEXT: vpblendd {{.*#+}} ymm8 = ymm7[0,1],ymm8[2,3],ymm7[4,5,6,7]
+; CHECK-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,1,3]
 ; CHECK-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,1,0,1,4,5,4,5]
-; CHECK-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7]
+; CHECK-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7]
+; CHECK-NEXT: vextracti128 $1, %ymm7, %xmm7
+; CHECK-NEXT: vmovq {{.*#+}} xmm7 = xmm7[0],zero
+; CHECK-NEXT: # implicit-def: $ymm8
+; CHECK-NEXT: vmovaps %xmm7, %xmm8
+; CHECK-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm8[0,1],ymm6[0,1]
 ; CHECK-NEXT: vmovaps %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; CHECK-NEXT: vmovaps %ymm5, %ymm1
+; CHECK-NEXT: vmovaps %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: vmovaps %ymm6, %ymm2
+; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
 ; CHECK-NEXT: vmovaps %ymm3, (%rsp) # 32-byte Spill
-; CHECK-NEXT: vmovaps %ymm9, %ymm3
+; CHECK-NEXT: vmovaps %ymm5, %ymm3
 ; CHECK-NEXT: movq %rbp, %rsp
 ; CHECK-NEXT: popq %rbp
 ; CHECK-NEXT: .cfi_def_cfa %rsp, 8
diff --git a/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll
index 37bd440..a18e47c 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll
@@ -745,12 +745,11 @@ define i1 @trunc_v32i16_v32i1(<32 x i16>) {
 ; AVX2-LABEL: trunc_v32i16_v32i1:
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-NEXT: vpackuswb %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT: vpsllw $7, %ymm0, %ymm0
 ; AVX2-NEXT: vpmovmskb %ymm0, %eax
 ; AVX2-NEXT: cmpl $-1, %eax
diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll
index 07079ec..a9d4f50 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll
@@ -739,12 +739,11 @@ define i1 @trunc_v32i16_v32i1(<32 x i16>) {
 ; AVX2-LABEL: trunc_v32i16_v32i1:
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-NEXT: vpackuswb %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT: vpsllw $7, %ymm0, %ymm0
 ; AVX2-NEXT: vpmovmskb %ymm0, %eax
 ; AVX2-NEXT: testl %eax, %eax
diff --git a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
index 1dc8397..fb2b995 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
@@ -831,12 +831,11 @@ define i1 @trunc_v32i16_v32i1(<32 x i16>) {
 ; AVX2-LABEL: trunc_v32i16_v32i1:
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-NEXT: vpackuswb %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT: vpsllw $7, %ymm0, %ymm0
 ; AVX2-NEXT: vpmovmskb %ymm0, %eax
 ; AVX2-NEXT: movl %eax, %ecx
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
index 4cdddb4..c8136f4 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -439,9 +439,9 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_0
 ;
 ; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[0,1],ymm0[0,1]
 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
 ; AVX2-NEXT: retq
 ;
@@ -462,9 +462,9 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_0
 ;
 ; XOPAVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00:
 ; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
-; XOPAVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; XOPAVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[0,1],ymm0[0,1]
 ; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
 ; XOPAVX2-NEXT: retq
 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
index 1e1bf04..1d7711c 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -915,9 +915,9 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_18_00_00_
 ;
 ; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_18_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[0,1],ymm0[0,1]
 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; AVX2-NEXT: retq
 ;
@@ -945,9 +945,9 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_18_00_00_
 ;
 ; XOPAVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_18_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
 ; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
-; XOPAVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; XOPAVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[0,1],ymm0[0,1]
 ; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; XOPAVX2-NEXT: retq
 %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32>
@@ -967,9 +967,9 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_19_00_00_00_
 ;
 ; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_19_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[0,1],ymm0[0,1]
 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; AVX2-NEXT: retq
 ;
@@ -997,9 +997,9 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_19_00_00_00_
 ;
 ; XOPAVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_19_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
 ; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
-; XOPAVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; XOPAVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[0,1],ymm0[0,1]
 ; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; XOPAVX2-NEXT: retq
 %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32>
diff --git a/llvm/test/CodeGen/X86/vector-trunc.ll b/llvm/test/CodeGen/X86/vector-trunc.ll
index f3bbc15..9a1199f 100644
--- a/llvm/test/CodeGen/X86/vector-trunc.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc.ll
@@ -1305,12 +1305,11 @@ define void @trunc32i16_32i8(<32 x i16> %a) {
 ; AVX2-LABEL: trunc32i16_32i8:
 ; AVX2: # %bb.0: # %entry
 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-NEXT: vpackuswb %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT: vmovdqu %ymm0, (%rax)
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
-- 
2.7.4
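
Background note (not part of the patch): the PermMask values built in the new code
follow the documented VPERM2F128/VPERM2I128 immediate encoding - each 4-bit nibble
selects one 128-bit lane (bits [1:0]: 0 = src1 low, 1 = src1 high, 2 = src2 low,
3 = src2 high), and bit 3 of the nibble zeroes that lane instead. The standalone
C++ sketch below is illustrative only; all types and names are invented and nothing
in it is LLVM API. It models that encoding: an immediate of 0x31 yields
{src1 high, src2 high}, i.e. the "ymm2 = ymm0[2,3],ymm1[2,3]" pattern seen in the
updated CHECK lines.

  #include <cstdint>
  #include <cstdio>

  struct Lane128 { std::uint64_t Elt[2]; }; // one 128-bit lane, viewed as 2 x i64
  struct Vec256  { Lane128 Lo, Hi; };       // a 256-bit ymm register

  // Imm bits [1:0] pick the low result lane, bits [5:4] the high result lane:
  //   0 = Src1.Lo, 1 = Src1.Hi, 2 = Src2.Lo, 3 = Src2.Hi.
  // Bit 3 (resp. bit 7) zeroes that result lane instead (the "zeroing feature").
  static Vec256 vperm2x128(const Vec256 &Src1, const Vec256 &Src2, unsigned Imm) {
    auto Pick = [&](unsigned Nibble) -> Lane128 {
      if (Nibble & 0x8)
        return {{0, 0}};
      const Lane128 Lanes[4] = {Src1.Lo, Src1.Hi, Src2.Lo, Src2.Hi};
      return Lanes[Nibble & 0x3];
    };
    return {Pick(Imm & 0xF), Pick((Imm >> 4) & 0xF)};
  }

  int main() {
    Vec256 X = {{{0, 1}}, {{2, 3}}}; // ymm0 holding <4 x i64> 0, 1, 2, 3
    Vec256 Y = {{{4, 5}}, {{6, 7}}}; // ymm1 holding <4 x i64> 4, 5, 6, 7
    // Imm 0x31: low result lane <- Src1.Hi, high result lane <- Src2.Hi.
    Vec256 R = vperm2x128(X, Y, 0x31);
    std::printf("%llu %llu %llu %llu\n",
                (unsigned long long)R.Lo.Elt[0], (unsigned long long)R.Lo.Elt[1],
                (unsigned long long)R.Hi.Elt[0], (unsigned long long)R.Hi.Elt[1]);
    return 0; // prints "2 3 6 7"
  }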