From f819e4c7d0f6efef3cc1042cc45582320bf6c0a2 Mon Sep 17 00:00:00 2001
From: Roman Lebedev
Date: Wed, 4 Aug 2021 16:46:29 +0300
Subject: [PATCH] [X86] combineX86ShuffleChain(): canonicalize mask elts picking from splats

Given a shuffle mask, if it is picking from an input that is splat given the
current granularity of the shuffle, then adjust the mask to pick from the same
lane of the input as the mask element is in. This may result in a shuffle being
simplified into a blend.

I believe this is correct given that the splat detection matches the one just
above the new code.

My basic thought is that we might be able to get fewer regressions by handling
multiple insertions of the same value into a vector if we form broadcasts+blend
here, as opposed to D105390, but I have not really thought this through and
have not tried implementing it yet.

Reviewed By: RKSimon

Differential Revision: https://reviews.llvm.org/D107009
---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 43 +++++++++++++-
 llvm/test/CodeGen/X86/avx.ll | 16 ++---
 .../CodeGen/X86/avx512-shuffles/partial_permute.ll | 4 +-
 llvm/test/CodeGen/X86/pr15296.ll | 27 ++-------
 llvm/test/CodeGen/X86/sse41.ll | 68 +++++++++++-----------
 llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll | 4 +-
 6 files changed, 92 insertions(+), 70 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 144c81b..b435f13 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -35797,6 +35797,19 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
                      (RootVT.isFloatingPoint() && Depth >= 1) ||
                      (RootVT.is256BitVector() && !Subtarget.hasAVX2());
 
+  // How many elements does each of the inputs have, given the current
+  // granularity of the root shuffle? Note that while currently the sizes of
+  // the inputs must match the size of the shuffle root, that restriction
+  // will be lifted in the future.
+  SmallVector<unsigned, 2> InputNumElts;
+  llvm::transform(std::initializer_list<MVT>({VT1, VT2}),
+                  std::back_inserter(InputNumElts),
+                  [BaseMaskEltSizeInBits](MVT VT) {
+                    assert(VT.getSizeInBits() % BaseMaskEltSizeInBits == 0 &&
+                           "Input is not a multiple of output element width?");
+                    return VT.getSizeInBits() / BaseMaskEltSizeInBits;
+                  });
+
   // Don't combine if we are a AVX512/EVEX target and the mask element size
   // is different from the root element size - this would prevent writemasks
   // from being reused.
@@ -35811,12 +35824,38 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
   // If we are shuffling a broadcast (and not introducing zeros) then
   // we can just use the broadcast directly. This works for smaller broadcast
   // elements as well as they already repeat across each mask element
-  if (UnaryShuffle && isTargetShuffleSplat(V1) && !isAnyZero(BaseMask) &&
-      (BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
+  SmallVector<bool, 2> InputIsSplat;
+  llvm::transform(
+      std::initializer_list<SDValue>({V1, V2}),
+      std::back_inserter(InputIsSplat), [BaseMaskEltSizeInBits](SDValue V) {
+        return isTargetShuffleSplat(V) &&
+               (BaseMaskEltSizeInBits % V.getScalarValueSizeInBits()) == 0;
+      });
+  if (UnaryShuffle && InputIsSplat[0] && !isAnyZero(BaseMask) &&
       V1.getValueSizeInBits() >= RootSizeInBits) {
     return CanonicalizeShuffleInput(RootVT, V1);
   }
 
+  // Adjust mask elements that pick from a splat input to be identity mask elts,
+  // i.e. to pick from the same lane of the input as the mask element is in.
+  // This may allow the shuffle to be simplified into a blend.
+  SmallVector<int, 64> NewMask;
+  if (InputIsSplat[0] || InputIsSplat[1]) {
+    NewMask.assign(BaseMask.begin(), BaseMask.end());
+    for (unsigned i = 0; i != NumBaseMaskElts; ++i) {
+      int &M = NewMask[i];
+      assert(isUndefOrZeroOrInRange(M, 0, 2 * NumBaseMaskElts) &&
+             "OOB mask element?");
+      if (M < 0)
+        continue; // Keep the undef/zero mask elements as-is.
+      int InputIdx = (unsigned)M < NumBaseMaskElts ? 0 : 1;
+      // Is the used input wide enough to contain that lane, and is it a splat?
+      if (InputIsSplat[InputIdx] && i < InputNumElts[InputIdx])
+        M = i + InputIdx * NumBaseMaskElts; // Pick from the same lane of input.
+    }
+    BaseMask = std::move(NewMask);
+  }
+
   // See if the shuffle is a hidden identity shuffle - repeated args in HOPs
   // etc. can be simplified.
   if (VT1 == VT2 && VT1.getSizeInBits() == RootSizeInBits) {
diff --git a/llvm/test/CodeGen/X86/avx.ll b/llvm/test/CodeGen/X86/avx.ll
index a176edb..b542f17 100644
--- a/llvm/test/CodeGen/X86/avx.ll
+++ b/llvm/test/CodeGen/X86/avx.ll
@@ -153,11 +153,11 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT: vbroadcastss (%ecx,%eax,4), %xmm4
-; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
-; X32-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
+; X32-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3]
+; X32-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
 ; X32-NEXT: vaddps %xmm1, %xmm0, %xmm0
-; X32-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm4[0]
-; X32-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0,1,2],xmm4[0]
+; X32-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm4[3]
+; X32-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1,2],xmm4[3]
 ; X32-NEXT: vaddps %xmm2, %xmm1, %xmm1
 ; X32-NEXT: vaddps %xmm1, %xmm0, %xmm0
 ; X32-NEXT: retl
@@ -165,11 +165,11 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl
 ; X64-LABEL: insertps_from_broadcast_multiple_use:
 ; X64: ## %bb.0:
 ; X64-NEXT: vbroadcastss (%rdi,%rsi,4), %xmm4
-; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
-; X64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
+; X64-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3]
+; X64-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
 ; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0
-; X64-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm4[0]
-; X64-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0,1,2],xmm4[0]
+; X64-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm4[3]
+; X64-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1,2],xmm4[3]
 ; X64-NEXT: vaddps %xmm2, %xmm1, %xmm1
 ; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0
 ; X64-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
index 7168209..a763f92 100644
--- a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
+++ b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
@@ -4315,7 +4315,7 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask1(<8 x double
 ; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask1:
 ; CHECK-FAST: # %bb.0:
 ; CHECK-FAST-NEXT: vmovapd (%rdi), %ymm2
-; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm3 = [3,4,2,6]
+; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm3 = [3,5,2,7]
 ; CHECK-FAST-NEXT: vpermi2pd 32(%rdi){1to4}, %ymm2, %ymm3
 ; CHECK-FAST-NEXT: vxorpd %xmm2, %xmm2, %xmm2
 ; CHECK-FAST-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
@@ -4340,7 +4340,7 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1(<8 x doub
 ; CHECK-FAST-LABEL: 
test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1: ; CHECK-FAST: # %bb.0: ; CHECK-FAST-NEXT: vmovapd (%rdi), %ymm2 -; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm1 = [3,4,2,6] +; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm1 = [3,5,2,7] ; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3 ; CHECK-FAST-NEXT: vcmpeqpd %ymm3, %ymm0, %k1 ; CHECK-FAST-NEXT: vpermi2pd 32(%rdi){1to4}, %ymm2, %ymm1 {%k1} {z} diff --git a/llvm/test/CodeGen/X86/pr15296.ll b/llvm/test/CodeGen/X86/pr15296.ll index 71034f6..f957557 100644 --- a/llvm/test/CodeGen/X86/pr15296.ll +++ b/llvm/test/CodeGen/X86/pr15296.ll @@ -26,28 +26,11 @@ allocas: define <8 x i32> @shiftInput___canonical(<8 x i32> %input, i32 %shiftval, <8 x i32> %__mask) nounwind { ; CHECK-LABEL: shiftInput___canonical: ; CHECK: # %bb.0: # %allocas -; CHECK-NEXT: vbroadcastss {{[0-9]+}}(%esp), %xmm1 -; CHECK-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3 -; CHECK-NEXT: vpsrld %xmm2, %xmm3, %xmm4 -; CHECK-NEXT: vpsrlq $32, %xmm1, %xmm5 -; CHECK-NEXT: vpsrld %xmm5, %xmm3, %xmm6 -; CHECK-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4,5,6,7] -; CHECK-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; CHECK-NEXT: vpblendw {{.*#+}} xmm6 = xmm1[0,1],xmm6[2,3,4,5,6,7] -; CHECK-NEXT: vpsrld %xmm6, %xmm3, %xmm7 -; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; CHECK-NEXT: vpsrld %xmm1, %xmm3, %xmm3 -; CHECK-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm7[4,5,6,7] -; CHECK-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] -; CHECK-NEXT: vpsrld %xmm2, %xmm0, %xmm2 -; CHECK-NEXT: vpsrld %xmm5, %xmm0, %xmm4 -; CHECK-NEXT: vpsrld %xmm6, %xmm0, %xmm5 -; CHECK-NEXT: vpsrld %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm2[4,5,6,7] -; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4,5,6,7] -; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1 +; CHECK-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; CHECK-NEXT: vpsrld %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vpsrld %xmm2, %xmm0, %xmm0 +; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; CHECK-NEXT: retl allocas: %smear.0 = insertelement <8 x i32> undef, i32 %shiftval, i32 0 diff --git a/llvm/test/CodeGen/X86/sse41.ll b/llvm/test/CodeGen/X86/sse41.ll index 17aae33..1a1b976 100644 --- a/llvm/test/CodeGen/X86/sse41.ll +++ b/llvm/test/CodeGen/X86/sse41.ll @@ -1661,15 +1661,15 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08] ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04] ; X86-AVX1-NEXT: vbroadcastss (%ecx,%eax,4), %xmm4 ## encoding: [0xc4,0xe2,0x79,0x18,0x24,0x81] -; X86-AVX1-NEXT: vinsertps $48, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc4,0x30] -; X86-AVX1-NEXT: ## xmm0 = xmm0[0,1,2],xmm4[0] -; X86-AVX1-NEXT: vinsertps $48, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0xcc,0x30] -; X86-AVX1-NEXT: ## xmm1 = xmm1[0,1,2],xmm4[0] +; X86-AVX1-NEXT: vblendps $8, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc4,0x08] +; X86-AVX1-NEXT: ## xmm0 = xmm0[0,1,2],xmm4[3] +; X86-AVX1-NEXT: vblendps $8, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x0c,0xcc,0x08] +; X86-AVX1-NEXT: ## xmm1 = xmm1[0,1,2],xmm4[3] ; X86-AVX1-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1] 
-; X86-AVX1-NEXT: vinsertps $48, %xmm4, %xmm2, %xmm1 ## encoding: [0xc4,0xe3,0x69,0x21,0xcc,0x30] -; X86-AVX1-NEXT: ## xmm1 = xmm2[0,1,2],xmm4[0] -; X86-AVX1-NEXT: vinsertps $48, %xmm4, %xmm3, %xmm2 ## encoding: [0xc4,0xe3,0x61,0x21,0xd4,0x30] -; X86-AVX1-NEXT: ## xmm2 = xmm3[0,1,2],xmm4[0] +; X86-AVX1-NEXT: vblendps $8, %xmm4, %xmm2, %xmm1 ## encoding: [0xc4,0xe3,0x69,0x0c,0xcc,0x08] +; X86-AVX1-NEXT: ## xmm1 = xmm2[0,1,2],xmm4[3] +; X86-AVX1-NEXT: vblendps $8, %xmm4, %xmm3, %xmm2 ## encoding: [0xc4,0xe3,0x61,0x0c,0xd4,0x08] +; X86-AVX1-NEXT: ## xmm2 = xmm3[0,1,2],xmm4[3] ; X86-AVX1-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x58,0xca] ; X86-AVX1-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1] ; X86-AVX1-NEXT: retl ## encoding: [0xc3] @@ -1679,16 +1679,16 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08] ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04] ; X86-AVX512-NEXT: vbroadcastss (%ecx,%eax,4), %xmm4 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x24,0x81] -; X86-AVX512-NEXT: vinsertps $48, %xmm4, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc4,0x30] -; X86-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm4[0] -; X86-AVX512-NEXT: vinsertps $48, %xmm4, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x71,0x21,0xcc,0x30] -; X86-AVX512-NEXT: ## xmm1 = xmm1[0,1,2],xmm4[0] +; X86-AVX512-NEXT: vblendps $8, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc4,0x08] +; X86-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm4[3] +; X86-AVX512-NEXT: vblendps $8, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x0c,0xcc,0x08] +; X86-AVX512-NEXT: ## xmm1 = xmm1[0,1,2],xmm4[3] +; X86-AVX512-NEXT: vblendps $8, %xmm4, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x0c,0xd4,0x08] +; X86-AVX512-NEXT: ## xmm2 = xmm2[0,1,2],xmm4[3] +; X86-AVX512-NEXT: vblendps $8, %xmm4, %xmm3, %xmm3 ## encoding: [0xc4,0xe3,0x61,0x0c,0xdc,0x08] +; X86-AVX512-NEXT: ## xmm3 = xmm3[0,1,2],xmm4[3] ; X86-AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1] -; X86-AVX512-NEXT: vinsertps $48, %xmm4, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x21,0xcc,0x30] -; X86-AVX512-NEXT: ## xmm1 = xmm2[0,1,2],xmm4[0] -; X86-AVX512-NEXT: vinsertps $48, %xmm4, %xmm3, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x61,0x21,0xd4,0x30] -; X86-AVX512-NEXT: ## xmm2 = xmm3[0,1,2],xmm4[0] -; X86-AVX512-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca] +; X86-AVX512-NEXT: vaddps %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xcb] ; X86-AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1] ; X86-AVX512-NEXT: retl ## encoding: [0xc3] ; @@ -1712,15 +1712,15 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl ; X64-AVX1-LABEL: insertps_from_broadcast_multiple_use: ; X64-AVX1: ## %bb.0: ; X64-AVX1-NEXT: vbroadcastss (%rdi,%rsi,4), %xmm4 ## encoding: [0xc4,0xe2,0x79,0x18,0x24,0xb7] -; X64-AVX1-NEXT: vinsertps $48, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc4,0x30] -; X64-AVX1-NEXT: ## xmm0 = xmm0[0,1,2],xmm4[0] -; X64-AVX1-NEXT: vinsertps $48, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0xcc,0x30] -; X64-AVX1-NEXT: ## xmm1 = xmm1[0,1,2],xmm4[0] +; X64-AVX1-NEXT: vblendps $8, %xmm4, %xmm0, %xmm0 ## 
encoding: [0xc4,0xe3,0x79,0x0c,0xc4,0x08] +; X64-AVX1-NEXT: ## xmm0 = xmm0[0,1,2],xmm4[3] +; X64-AVX1-NEXT: vblendps $8, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x0c,0xcc,0x08] +; X64-AVX1-NEXT: ## xmm1 = xmm1[0,1,2],xmm4[3] ; X64-AVX1-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1] -; X64-AVX1-NEXT: vinsertps $48, %xmm4, %xmm2, %xmm1 ## encoding: [0xc4,0xe3,0x69,0x21,0xcc,0x30] -; X64-AVX1-NEXT: ## xmm1 = xmm2[0,1,2],xmm4[0] -; X64-AVX1-NEXT: vinsertps $48, %xmm4, %xmm3, %xmm2 ## encoding: [0xc4,0xe3,0x61,0x21,0xd4,0x30] -; X64-AVX1-NEXT: ## xmm2 = xmm3[0,1,2],xmm4[0] +; X64-AVX1-NEXT: vblendps $8, %xmm4, %xmm2, %xmm1 ## encoding: [0xc4,0xe3,0x69,0x0c,0xcc,0x08] +; X64-AVX1-NEXT: ## xmm1 = xmm2[0,1,2],xmm4[3] +; X64-AVX1-NEXT: vblendps $8, %xmm4, %xmm3, %xmm2 ## encoding: [0xc4,0xe3,0x61,0x0c,0xd4,0x08] +; X64-AVX1-NEXT: ## xmm2 = xmm3[0,1,2],xmm4[3] ; X64-AVX1-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x58,0xca] ; X64-AVX1-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1] ; X64-AVX1-NEXT: retq ## encoding: [0xc3] @@ -1728,16 +1728,16 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl ; X64-AVX512-LABEL: insertps_from_broadcast_multiple_use: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vbroadcastss (%rdi,%rsi,4), %xmm4 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x24,0xb7] -; X64-AVX512-NEXT: vinsertps $48, %xmm4, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc4,0x30] -; X64-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm4[0] -; X64-AVX512-NEXT: vinsertps $48, %xmm4, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x71,0x21,0xcc,0x30] -; X64-AVX512-NEXT: ## xmm1 = xmm1[0,1,2],xmm4[0] +; X64-AVX512-NEXT: vblendps $8, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc4,0x08] +; X64-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm4[3] +; X64-AVX512-NEXT: vblendps $8, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x0c,0xcc,0x08] +; X64-AVX512-NEXT: ## xmm1 = xmm1[0,1,2],xmm4[3] +; X64-AVX512-NEXT: vblendps $8, %xmm4, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x0c,0xd4,0x08] +; X64-AVX512-NEXT: ## xmm2 = xmm2[0,1,2],xmm4[3] +; X64-AVX512-NEXT: vblendps $8, %xmm4, %xmm3, %xmm3 ## encoding: [0xc4,0xe3,0x61,0x0c,0xdc,0x08] +; X64-AVX512-NEXT: ## xmm3 = xmm3[0,1,2],xmm4[3] ; X64-AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1] -; X64-AVX512-NEXT: vinsertps $48, %xmm4, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x21,0xcc,0x30] -; X64-AVX512-NEXT: ## xmm1 = xmm2[0,1,2],xmm4[0] -; X64-AVX512-NEXT: vinsertps $48, %xmm4, %xmm3, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x61,0x21,0xd4,0x30] -; X64-AVX512-NEXT: ## xmm2 = xmm3[0,1,2],xmm4[0] -; X64-AVX512-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca] +; X64-AVX512-NEXT: vaddps %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xcb] ; X64-AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1] ; X64-AVX512-NEXT: retq ## encoding: [0xc3] %1 = getelementptr inbounds float, float* %fb, i64 %index diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll index d280580..96bcaa1 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll @@ -4591,14 +4591,14 @@ define <32 x i8> 
@shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_ ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-NEXT: retq ; ; AVX512VLBW-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu: ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpbroadcastb %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,u,u,u,u,u,u,u,u] -; AVX512VLBW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512VLBW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX512VLBW-NEXT: retq ; ; AVX512VLVBMI-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu: -- 2.7.4
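For illustration, here is a small standalone sketch (not the LLVM code itself, and not part of this patch) of the mask canonicalization the commit message describes: mask elements that read from a splat input are rewritten to read from the lane they occupy, which can turn an insert-style shuffle into a blend. The helper name canonicalizeSplatMask and the driver in main() are hypothetical; the splat/width checks mirror the InputIsSplat/InputNumElts logic added above.

// Standalone illustration (hypothetical helper, not part of the patch).
#include <cassert>
#include <cstdio>
#include <vector>

// Mask convention (as in target shuffle combining): elements in
// [0, NumElts) pick from input 0, elements in [NumElts, 2*NumElts) pick
// from input 1, and negative elements mean undef/zero.
static void canonicalizeSplatMask(std::vector<int> &Mask,
                                  const bool InputIsSplat[2],
                                  const unsigned InputNumElts[2]) {
  const unsigned NumElts = Mask.size();
  for (unsigned I = 0; I != NumElts; ++I) {
    int &M = Mask[I];
    if (M < 0)
      continue; // Keep undef/zero elements as-is.
    assert((unsigned)M < 2 * NumElts && "Out-of-bounds mask element");
    const unsigned InputIdx = (unsigned)M < NumElts ? 0 : 1;
    // If that input is a splat and is wide enough to contain lane I, pick
    // from the same lane of the input as the mask element is in.
    if (InputIsSplat[InputIdx] && I < InputNumElts[InputIdx])
      M = I + InputIdx * NumElts;
  }
}

int main() {
  // <4 x float> example matching the avx.ll/sse41.ll tests: input 1 is a
  // broadcast, so the insertps-style mask {0,1,2,4} becomes the blend mask
  // {0,1,2,7}, i.e. xmm0[0,1,2],xmm4[3].
  std::vector<int> Mask = {0, 1, 2, 4};
  const bool InputIsSplat[2] = {false, true};
  const unsigned InputNumElts[2] = {4, 4};
  canonicalizeSplatMask(Mask, InputIsSplat, InputNumElts);
  for (int M : Mask)
    std::printf("%d ", M); // Prints: 0 1 2 7
  std::printf("\n");
  return 0;
}

This is the same effect visible in the updated tests above, where vinsertps ... xmm4[0] becomes vblendps ... xmm4[3].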