From f819e4c7d0f6efef3cc1042cc45582320bf6c0a2 Mon Sep 17 00:00:00 2001
From: Roman Lebedev
Date: Wed, 4 Aug 2021 16:46:29 +0300
Subject: [PATCH] [X86] combineX86ShuffleChain(): canonicalize mask elts picking from splats

Given a shuffle mask, if it is picking from an input that is splat given the
current granularity of the shuffle, then adjust the mask to pick from the same
lane of the input as the mask element is in. This may result in a shuffle being
simplified into a blend.

I believe this is correct given that the splat detection matches the one just
above the new code.

My basic thought is that we might be able to get fewer regressions by handling
multiple insertions of the same value into a vector if we form broadcasts+blend
here, as opposed to D105390, but I have not really thought this through and
have not tried implementing it yet.

Reviewed By: RKSimon

Differential Revision: https://reviews.llvm.org/D107009
---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 43 +++++++++++++-
 llvm/test/CodeGen/X86/avx.ll | 16 ++---
 .../CodeGen/X86/avx512-shuffles/partial_permute.ll | 4 +-
 llvm/test/CodeGen/X86/pr15296.ll | 27 ++-------
 llvm/test/CodeGen/X86/sse41.ll | 68 +++++++++++-----------
 llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll | 4 +-
 6 files changed, 92 insertions(+), 70 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 144c81b..b435f13 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -35797,6 +35797,19 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
                      (RootVT.isFloatingPoint() && Depth >= 1) ||
                      (RootVT.is256BitVector() && !Subtarget.hasAVX2());
 
+  // How many elements does each of the inputs have, given the current
+  // granularity of the root shuffle? Note that while currently the sizes of
+  // the inputs must match the size of the shuffle root, that restriction
+  // will be lifted in the future.
+  SmallVector<unsigned, 2> InputNumElts;
+  llvm::transform(std::initializer_list<MVT>({VT1, VT2}),
+                  std::back_inserter(InputNumElts),
+                  [BaseMaskEltSizeInBits](MVT VT) {
+                    assert(VT.getSizeInBits() % BaseMaskEltSizeInBits == 0 &&
+                           "Input is not a multiple of output element width?");
+                    return VT.getSizeInBits() / BaseMaskEltSizeInBits;
+                  });
+
   // Don't combine if we are a AVX512/EVEX target and the mask element size
   // is different from the root element size - this would prevent writemasks
   // from being reused.
@@ -35811,12 +35824,38 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
   // If we are shuffling a broadcast (and not introducing zeros) then
   // we can just use the broadcast directly. This works for smaller broadcast
   // elements as well as they already repeat across each mask element
-  if (UnaryShuffle && isTargetShuffleSplat(V1) && !isAnyZero(BaseMask) &&
-      (BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
+  SmallVector<bool, 2> InputIsSplat;
+  llvm::transform(
+      std::initializer_list<SDValue>({V1, V2}),
+      std::back_inserter(InputIsSplat), [BaseMaskEltSizeInBits](SDValue V) {
+        return isTargetShuffleSplat(V) &&
+               (BaseMaskEltSizeInBits % V.getScalarValueSizeInBits()) == 0;
+      });
+  if (UnaryShuffle && InputIsSplat[0] && !isAnyZero(BaseMask) &&
       V1.getValueSizeInBits() >= RootSizeInBits) {
     return CanonicalizeShuffleInput(RootVT, V1);
   }
 
+  // Adjust mask elements that pick from a splat input to be identity mask elts,
+  // i.e. to pick from the same lane of the input as the mask element is in.
+  // This may allow the shuffle to be simplified into a blend.
+  SmallVector<int, 64> NewMask;
+  if (InputIsSplat[0] || InputIsSplat[1]) {
+    NewMask.assign(BaseMask.begin(), BaseMask.end());
+    for (unsigned i = 0; i != NumBaseMaskElts; ++i) {
+      int &M = NewMask[i];
+      assert(isUndefOrZeroOrInRange(M, 0, 2 * NumBaseMaskElts) &&
+             "OOB mask element?");
+      if (M < 0)
+        continue; // Keep the undef/zero mask elements as-is.
+      int InputIdx = (unsigned)M < NumBaseMaskElts ? 0 : 1;
+      // Is the used input wide enough to contain that lane, and is it a splat?
+      if (InputIsSplat[InputIdx] && i < InputNumElts[InputIdx])
+        M = i + InputIdx * NumBaseMaskElts; // Pick from the same lane of input.
+    }
+    BaseMask = std::move(NewMask);
+  }
+
   // See if the shuffle is a hidden identity shuffle - repeated args in HOPs
   // etc. can be simplified.
   if (VT1 == VT2 && VT1.getSizeInBits() == RootSizeInBits) {
diff --git a/llvm/test/CodeGen/X86/avx.ll b/llvm/test/CodeGen/X86/avx.ll
index a176edb..b542f17 100644
--- a/llvm/test/CodeGen/X86/avx.ll
+++ b/llvm/test/CodeGen/X86/avx.ll
@@ -153,11 +153,11 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT: vbroadcastss (%ecx,%eax,4), %xmm4
-; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
-; X32-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
+; X32-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3]
+; X32-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
 ; X32-NEXT: vaddps %xmm1, %xmm0, %xmm0
-; X32-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm4[0]
-; X32-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0,1,2],xmm4[0]
+; X32-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm4[3]
+; X32-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1,2],xmm4[3]
 ; X32-NEXT: vaddps %xmm2, %xmm1, %xmm1
 ; X32-NEXT: vaddps %xmm1, %xmm0, %xmm0
 ; X32-NEXT: retl
@@ -165,11 +165,11 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl
 ; X64-LABEL: insertps_from_broadcast_multiple_use:
 ; X64: ## %bb.0:
 ; X64-NEXT: vbroadcastss (%rdi,%rsi,4), %xmm4
-; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
-; X64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
+; X64-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3]
+; X64-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
 ; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0
-; X64-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm4[0]
-; X64-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0,1,2],xmm4[0]
+; X64-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm4[3]
+; X64-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1,2],xmm4[3]
 ; X64-NEXT: vaddps %xmm2, %xmm1, %xmm1
 ; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0
 ; X64-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
index 7168209..a763f92 100644
--- a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
+++ b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
@@ -4315,7 +4315,7 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask1(<8 x double
 ; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask1:
 ; CHECK-FAST: # %bb.0:
 ; CHECK-FAST-NEXT: vmovapd (%rdi), %ymm2
-; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm3 = [3,4,2,6]
+; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm3 = [3,5,2,7]
 ; CHECK-FAST-NEXT: vpermi2pd 32(%rdi){1to4}, %ymm2, %ymm3
 ; CHECK-FAST-NEXT: vxorpd %xmm2, %xmm2, %xmm2
 ; CHECK-FAST-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
@@ -4340,7 +4340,7 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1(<8 x doub
 ; CHECK-FAST-LABEL: 
test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1: ; CHECK-FAST: # %bb.0: ; CHECK-FAST-NEXT: vmovapd (%rdi), %ymm2 -; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm1 = [3,4,2,6] +; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm1 = [3,5,2,7] ; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3 ; CHECK-FAST-NEXT: vcmpeqpd %ymm3, %ymm0, %k1 ; CHECK-FAST-NEXT: vpermi2pd 32(%rdi){1to4}, %ymm2, %ymm1 {%k1} {z} diff --git a/llvm/test/CodeGen/X86/pr15296.ll b/llvm/test/CodeGen/X86/pr15296.ll index 71034f6..f957557 100644 --- a/llvm/test/CodeGen/X86/pr15296.ll +++ b/llvm/test/CodeGen/X86/pr15296.ll @@ -26,28 +26,11 @@ allocas: define <8 x i32> @shiftInput___canonical(<8 x i32> %input, i32 %shiftval, <8 x i32> %__mask) nounwind { ; CHECK-LABEL: shiftInput___canonical: ; CHECK: # %bb.0: # %allocas -; CHECK-NEXT: vbroadcastss {{[0-9]+}}(%esp), %xmm1 -; CHECK-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3 -; CHECK-NEXT: vpsrld %xmm2, %xmm3, %xmm4 -; CHECK-NEXT: vpsrlq $32, %xmm1, %xmm5 -; CHECK-NEXT: vpsrld %xmm5, %xmm3, %xmm6 -; CHECK-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4,5,6,7] -; CHECK-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; CHECK-NEXT: vpblendw {{.*#+}} xmm6 = xmm1[0,1],xmm6[2,3,4,5,6,7] -; CHECK-NEXT: vpsrld %xmm6, %xmm3, %xmm7 -; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; CHECK-NEXT: vpsrld %xmm1, %xmm3, %xmm3 -; CHECK-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm7[4,5,6,7] -; CHECK-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] -; CHECK-NEXT: vpsrld %xmm2, %xmm0, %xmm2 -; CHECK-NEXT: vpsrld %xmm5, %xmm0, %xmm4 -; CHECK-NEXT: vpsrld %xmm6, %xmm0, %xmm5 -; CHECK-NEXT: vpsrld %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm2[4,5,6,7] -; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4,5,6,7] -; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1 +; CHECK-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; CHECK-NEXT: vpsrld %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vpsrld %xmm2, %xmm0, %xmm0 +; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; CHECK-NEXT: retl allocas: %smear.0 = insertelement <8 x i32> undef, i32 %shiftval, i32 0 diff --git a/llvm/test/CodeGen/X86/sse41.ll b/llvm/test/CodeGen/X86/sse41.ll index 17aae33..1a1b976 100644 --- a/llvm/test/CodeGen/X86/sse41.ll +++ b/llvm/test/CodeGen/X86/sse41.ll @@ -1661,15 +1661,15 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08] ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04] ; X86-AVX1-NEXT: vbroadcastss (%ecx,%eax,4), %xmm4 ## encoding: [0xc4,0xe2,0x79,0x18,0x24,0x81] -; X86-AVX1-NEXT: vinsertps $48, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc4,0x30] -; X86-AVX1-NEXT: ## xmm0 = xmm0[0,1,2],xmm4[0] -; X86-AVX1-NEXT: vinsertps $48, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0xcc,0x30] -; X86-AVX1-NEXT: ## xmm1 = xmm1[0,1,2],xmm4[0] +; X86-AVX1-NEXT: vblendps $8, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc4,0x08] +; X86-AVX1-NEXT: ## xmm0 = xmm0[0,1,2],xmm4[3] +; X86-AVX1-NEXT: vblendps $8, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x0c,0xcc,0x08] +; X86-AVX1-NEXT: ## xmm1 = xmm1[0,1,2],xmm4[3] ; X86-AVX1-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1] 
-; X86-AVX1-NEXT: vinsertps $48, %xmm4, %xmm2, %xmm1 ## encoding: [0xc4,0xe3,0x69,0x21,0xcc,0x30] -; X86-AVX1-NEXT: ## xmm1 = xmm2[0,1,2],xmm4[0] -; X86-AVX1-NEXT: vinsertps $48, %xmm4, %xmm3, %xmm2 ## encoding: [0xc4,0xe3,0x61,0x21,0xd4,0x30] -; X86-AVX1-NEXT: ## xmm2 = xmm3[0,1,2],xmm4[0] +; X86-AVX1-NEXT: vblendps $8, %xmm4, %xmm2, %xmm1 ## encoding: [0xc4,0xe3,0x69,0x0c,0xcc,0x08] +; X86-AVX1-NEXT: ## xmm1 = xmm2[0,1,2],xmm4[3] +; X86-AVX1-NEXT: vblendps $8, %xmm4, %xmm3, %xmm2 ## encoding: [0xc4,0xe3,0x61,0x0c,0xd4,0x08] +; X86-AVX1-NEXT: ## xmm2 = xmm3[0,1,2],xmm4[3] ; X86-AVX1-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x58,0xca] ; X86-AVX1-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1] ; X86-AVX1-NEXT: retl ## encoding: [0xc3] @@ -1679,16 +1679,16 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08] ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04] ; X86-AVX512-NEXT: vbroadcastss (%ecx,%eax,4), %xmm4 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x24,0x81] -; X86-AVX512-NEXT: vinsertps $48, %xmm4, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc4,0x30] -; X86-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm4[0] -; X86-AVX512-NEXT: vinsertps $48, %xmm4, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x71,0x21,0xcc,0x30] -; X86-AVX512-NEXT: ## xmm1 = xmm1[0,1,2],xmm4[0] +; X86-AVX512-NEXT: vblendps $8, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc4,0x08] +; X86-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm4[3] +; X86-AVX512-NEXT: vblendps $8, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x0c,0xcc,0x08] +; X86-AVX512-NEXT: ## xmm1 = xmm1[0,1,2],xmm4[3] +; X86-AVX512-NEXT: vblendps $8, %xmm4, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x0c,0xd4,0x08] +; X86-AVX512-NEXT: ## xmm2 = xmm2[0,1,2],xmm4[3] +; X86-AVX512-NEXT: vblendps $8, %xmm4, %xmm3, %xmm3 ## encoding: [0xc4,0xe3,0x61,0x0c,0xdc,0x08] +; X86-AVX512-NEXT: ## xmm3 = xmm3[0,1,2],xmm4[3] ; X86-AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1] -; X86-AVX512-NEXT: vinsertps $48, %xmm4, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x21,0xcc,0x30] -; X86-AVX512-NEXT: ## xmm1 = xmm2[0,1,2],xmm4[0] -; X86-AVX512-NEXT: vinsertps $48, %xmm4, %xmm3, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x61,0x21,0xd4,0x30] -; X86-AVX512-NEXT: ## xmm2 = xmm3[0,1,2],xmm4[0] -; X86-AVX512-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca] +; X86-AVX512-NEXT: vaddps %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xcb] ; X86-AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1] ; X86-AVX512-NEXT: retl ## encoding: [0xc3] ; @@ -1712,15 +1712,15 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl ; X64-AVX1-LABEL: insertps_from_broadcast_multiple_use: ; X64-AVX1: ## %bb.0: ; X64-AVX1-NEXT: vbroadcastss (%rdi,%rsi,4), %xmm4 ## encoding: [0xc4,0xe2,0x79,0x18,0x24,0xb7] -; X64-AVX1-NEXT: vinsertps $48, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc4,0x30] -; X64-AVX1-NEXT: ## xmm0 = xmm0[0,1,2],xmm4[0] -; X64-AVX1-NEXT: vinsertps $48, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0xcc,0x30] -; X64-AVX1-NEXT: ## xmm1 = xmm1[0,1,2],xmm4[0] +; X64-AVX1-NEXT: vblendps $8, %xmm4, %xmm0, %xmm0 ## 
encoding: [0xc4,0xe3,0x79,0x0c,0xc4,0x08] +; X64-AVX1-NEXT: ## xmm0 = xmm0[0,1,2],xmm4[3] +; X64-AVX1-NEXT: vblendps $8, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x0c,0xcc,0x08] +; X64-AVX1-NEXT: ## xmm1 = xmm1[0,1,2],xmm4[3] ; X64-AVX1-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1] -; X64-AVX1-NEXT: vinsertps $48, %xmm4, %xmm2, %xmm1 ## encoding: [0xc4,0xe3,0x69,0x21,0xcc,0x30] -; X64-AVX1-NEXT: ## xmm1 = xmm2[0,1,2],xmm4[0] -; X64-AVX1-NEXT: vinsertps $48, %xmm4, %xmm3, %xmm2 ## encoding: [0xc4,0xe3,0x61,0x21,0xd4,0x30] -; X64-AVX1-NEXT: ## xmm2 = xmm3[0,1,2],xmm4[0] +; X64-AVX1-NEXT: vblendps $8, %xmm4, %xmm2, %xmm1 ## encoding: [0xc4,0xe3,0x69,0x0c,0xcc,0x08] +; X64-AVX1-NEXT: ## xmm1 = xmm2[0,1,2],xmm4[3] +; X64-AVX1-NEXT: vblendps $8, %xmm4, %xmm3, %xmm2 ## encoding: [0xc4,0xe3,0x61,0x0c,0xd4,0x08] +; X64-AVX1-NEXT: ## xmm2 = xmm3[0,1,2],xmm4[3] ; X64-AVX1-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x58,0xca] ; X64-AVX1-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1] ; X64-AVX1-NEXT: retq ## encoding: [0xc3] @@ -1728,16 +1728,16 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl ; X64-AVX512-LABEL: insertps_from_broadcast_multiple_use: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vbroadcastss (%rdi,%rsi,4), %xmm4 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x24,0xb7] -; X64-AVX512-NEXT: vinsertps $48, %xmm4, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc4,0x30] -; X64-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm4[0] -; X64-AVX512-NEXT: vinsertps $48, %xmm4, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x71,0x21,0xcc,0x30] -; X64-AVX512-NEXT: ## xmm1 = xmm1[0,1,2],xmm4[0] +; X64-AVX512-NEXT: vblendps $8, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc4,0x08] +; X64-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm4[3] +; X64-AVX512-NEXT: vblendps $8, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x0c,0xcc,0x08] +; X64-AVX512-NEXT: ## xmm1 = xmm1[0,1,2],xmm4[3] +; X64-AVX512-NEXT: vblendps $8, %xmm4, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x0c,0xd4,0x08] +; X64-AVX512-NEXT: ## xmm2 = xmm2[0,1,2],xmm4[3] +; X64-AVX512-NEXT: vblendps $8, %xmm4, %xmm3, %xmm3 ## encoding: [0xc4,0xe3,0x61,0x0c,0xdc,0x08] +; X64-AVX512-NEXT: ## xmm3 = xmm3[0,1,2],xmm4[3] ; X64-AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1] -; X64-AVX512-NEXT: vinsertps $48, %xmm4, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x21,0xcc,0x30] -; X64-AVX512-NEXT: ## xmm1 = xmm2[0,1,2],xmm4[0] -; X64-AVX512-NEXT: vinsertps $48, %xmm4, %xmm3, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x61,0x21,0xd4,0x30] -; X64-AVX512-NEXT: ## xmm2 = xmm3[0,1,2],xmm4[0] -; X64-AVX512-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca] +; X64-AVX512-NEXT: vaddps %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xcb] ; X64-AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1] ; X64-AVX512-NEXT: retq ## encoding: [0xc3] %1 = getelementptr inbounds float, float* %fb, i64 %index diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll index d280580..96bcaa1 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll @@ -4591,14 +4591,14 @@ define <32 x i8> 
@shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_ ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-NEXT: retq ; ; AVX512VLBW-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu: ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpbroadcastb %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,u,u,u,u,u,u,u,u] -; AVX512VLBW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512VLBW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX512VLBW-NEXT: retq ; ; AVX512VLVBMI-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu: -- 2.7.4
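For illustration, here is a small standalone sketch (not the LLVM code itself, and not part of this patch) of the mask canonicalization the commit message describes: mask elements that read from a splat input are rewritten to read from the lane they occupy, which can turn an insert-style shuffle into a blend. The helper name canonicalizeSplatMask and the driver in main() are hypothetical; the splat/width checks mirror the InputIsSplat/InputNumElts logic added above.

// Standalone illustration (hypothetical helper, not part of the patch).
#include <cassert>
#include <cstdio>
#include <vector>

// Mask convention (as in target shuffle combining): elements in
// [0, NumElts) pick from input 0, elements in [NumElts, 2*NumElts) pick
// from input 1, and negative elements mean undef/zero.
static void canonicalizeSplatMask(std::vector<int> &Mask,
                                  const bool InputIsSplat[2],
                                  const unsigned InputNumElts[2]) {
  const unsigned NumElts = Mask.size();
  for (unsigned I = 0; I != NumElts; ++I) {
    int &M = Mask[I];
    if (M < 0)
      continue; // Keep undef/zero elements as-is.
    assert((unsigned)M < 2 * NumElts && "Out-of-bounds mask element");
    const unsigned InputIdx = (unsigned)M < NumElts ? 0 : 1;
    // If that input is a splat and is wide enough to contain lane I, pick
    // from the same lane of the input as the mask element is in.
    if (InputIsSplat[InputIdx] && I < InputNumElts[InputIdx])
      M = I + InputIdx * NumElts;
  }
}

int main() {
  // <4 x float> example matching the avx.ll/sse41.ll tests: input 1 is a
  // broadcast, so the insertps-style mask {0,1,2,4} becomes the blend mask
  // {0,1,2,7}, i.e. xmm0[0,1,2],xmm4[3].
  std::vector<int> Mask = {0, 1, 2, 4};
  const bool InputIsSplat[2] = {false, true};
  const unsigned InputNumElts[2] = {4, 4};
  canonicalizeSplatMask(Mask, InputIsSplat, InputNumElts);
  for (int M : Mask)
    std::printf("%d ", M); // Prints: 0 1 2 7
  std::printf("\n");
  return 0;
}

This is the same effect visible in the updated tests above, where vinsertps ... xmm4[0] becomes vblendps ... xmm4[3].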