This reverts commits
f819e4c7d0f6efef3cc1042cc45582320bf6c0a2 and
35c0848b570214ed2b2d96cca4dd62bb7ae725cd. The reverted change triggers an
infinite loop during compilation. Reproducer:
$ cat t.ll
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
define void @MaxPoolGradGrad_1.65() local_unnamed_addr #0 {
entry:
%wide.vec78 = load <64 x i32>, <64 x i32>* null, align 16
%strided.vec83 = shufflevector <64 x i32> %wide.vec78, <64 x i32> poison, <8 x i32> <i32 4, i32 12, i32 20, i32 28, i32 36, i32 44, i32 52, i32 60>
%0 = lshr <8 x i32> %strided.vec83, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
%1 = add <8 x i32> zeroinitializer, %0
%2 = shufflevector <8 x i32> %1, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%3 = shufflevector <16 x i32> %2, <16 x i32> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
%interleaved.vec = shufflevector <32 x i32> undef, <32 x i32> %3, <64 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56, i32 1, i32 9, i32 17, i32 25, i32 33, i32 41, i32 49, i32 57, i32 2, i32 10, i32 18, i32 26, i32 34, i32 42, i32 50, i32 58, i32 3, i32 11, i32 19, i32 27, i32 35, i32 43, i32 51, i32 59, i32 4, i32 12, i32 20, i32 28, i32 36, i32 44, i32 52, i32 60, i32 5, i32 13, i32 21, i32 29, i32 37, i32 45, i32 53, i32 61, i32 6, i32 14, i32 22, i32 30, i32 38, i32 46, i32 54, i32 62, i32 7, i32 15, i32 23, i32 31, i32 39, i32 47, i32 55, i32 63>
store <64 x i32> %interleaved.vec, <64 x i32>* undef, align 16
unreachable
}
$ llc < t.ll -mcpu=skylake
<hang>
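For context, the reverted change (visible in the X86 shuffle-combining hunks
below) rewrote shuffle mask elements that pick from a splat input into
identity mask elements, i.e. picking from the same lane the mask element is
in, so the shuffle may simplify into a blend. A minimal standalone C++ sketch
of that remapping, assuming both inputs are as wide as the mask (names are
illustrative, not LLVM's):

// Standalone sketch of the mask canonicalization performed by the reverted
// change. Mask elements in [0, NumElts) pick from input 0 and elements in
// [NumElts, 2*NumElts) pick from input 1; negative values are undef/zero
// sentinels. If the selected input is a splat, every lane holds the same
// value, so the element can be rewritten to pick from its own lane.
#include <vector>

void canonicalizeSplatMaskElts(std::vector<int> &Mask, bool Input0IsSplat,
                               bool Input1IsSplat) {
  const int NumElts = static_cast<int>(Mask.size());
  for (int I = 0; I != NumElts; ++I) {
    int &M = Mask[I];
    if (M < 0)
      continue; // Keep undef/zero sentinels as-is.
    const int InputIdx = M < NumElts ? 0 : 1;
    const bool IsSplat = (InputIdx == 0) ? Input0IsSplat : Input1IsSplat;
    if (IsSplat)
      M = I + InputIdx * NumElts; // Pick from the same lane of that input.
  }
}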
(RootVT.isFloatingPoint() && Depth >= 1) ||
(RootVT.is256BitVector() && !Subtarget.hasAVX2());
- // How many elements does each of the inputs have, given the current
- // granularity of the root shuffle? Note that while currently the sizes of an
- // inputs must match the size of the shuffle root,
- // that restriction will be lifted in the future.
- SmallVector<unsigned, 2> InputNumElts;
- llvm::transform(std::initializer_list<MVT>({VT1, VT2}),
- std::back_inserter(InputNumElts),
- [BaseMaskEltSizeInBits](MVT VT) {
- assert(VT.getSizeInBits() % BaseMaskEltSizeInBits == 0 &&
- "Input is not a multiple of output element width?");
- return VT.getSizeInBits() / BaseMaskEltSizeInBits;
- });
-
// Don't combine if we are a AVX512/EVEX target and the mask element size
// is different from the root element size - this would prevent writemasks
// from being reused.
// If we are shuffling a broadcast (and not introducing zeros) then
// we can just use the broadcast directly. This works for smaller broadcast
// elements as well as they already repeat across each mask element
- SmallVector<bool, 2> InputIsSplat;
- llvm::transform(
- std::initializer_list<SDValue>({V1, V2}),
- std::back_inserter(InputIsSplat), [BaseMaskEltSizeInBits](SDValue V) {
- return isTargetShuffleSplat(V) &&
- (BaseMaskEltSizeInBits % V.getScalarValueSizeInBits()) == 0;
- });
- if (UnaryShuffle && InputIsSplat[0] && !isAnyZero(BaseMask) &&
+ if (UnaryShuffle && isTargetShuffleSplat(V1) && !isAnyZero(BaseMask) &&
+ (BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
V1.getValueSizeInBits() >= RootSizeInBits) {
return CanonicalizeShuffleInput(RootVT, V1);
}
- SmallVector<int, 64> Mask(BaseMask.begin(), BaseMask.end());
-
- // Adjust mask elements that pick from a splat input to be identity mask elts,
- // i.e. to pick from the same lane of the input as the mask element is in.
- // This may allow to simplify the shuffle into a blend.
- if (InputIsSplat[0] || InputIsSplat[1]) {
- for (unsigned i = 0; i != NumBaseMaskElts; ++i) {
- int &M = Mask[i];
- assert(isUndefOrZeroOrInRange(M, 0, 2 * NumBaseMaskElts) &&
- "OOB mask element?");
- if (M < 0)
- continue; // Keep the undef/zero mask elements as-is.
- int InputIdx = (unsigned)M < NumBaseMaskElts ? 0 : 1;
- // Is the used input wide-enough to contain that lane, and is it a splat?
- if (InputIsSplat[InputIdx] && i < InputNumElts[InputIdx])
- M = i + InputIdx * NumBaseMaskElts; // Pick from the same lane of input.
- }
- }
-
// See if the shuffle is a hidden identity shuffle - repeated args in HOPs
// etc. can be simplified.
if (VT1 == VT2 && VT1.getSizeInBits() == RootSizeInBits) {
SmallVector<int> ScaledMask, IdentityMask;
unsigned NumElts = VT1.getVectorNumElements();
- if (Mask.size() <= NumElts &&
- scaleShuffleElements(Mask, NumElts, ScaledMask)) {
+ if (BaseMask.size() <= NumElts &&
+ scaleShuffleElements(BaseMask, NumElts, ScaledMask)) {
for (unsigned i = 0; i != NumElts; ++i)
IdentityMask.push_back(i);
if (isTargetShuffleEquivalent(RootVT, ScaledMask, IdentityMask, V1, V2))
// If the upper subvectors are zeroable, then an extract+insert is more
// optimal than using X86ISD::SHUF128. The insertion is free, even if it has
// to zero the upper subvectors.
- if (isUndefOrZeroInRange(Mask, 1, NumBaseMaskElts - 1)) {
+ if (isUndefOrZeroInRange(BaseMask, 1, NumBaseMaskElts - 1)) {
if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
return SDValue(); // Nothing to do!
- assert(isInRange(Mask[0], 0, NumBaseMaskElts) &&
+ assert(isInRange(BaseMask[0], 0, NumBaseMaskElts) &&
"Unexpected lane shuffle");
Res = CanonicalizeShuffleInput(RootVT, V1);
- unsigned SubIdx = Mask[0] * (NumRootElts / NumBaseMaskElts);
- bool UseZero = isAnyZero(Mask);
+ unsigned SubIdx = BaseMask[0] * (NumRootElts / NumBaseMaskElts);
+ bool UseZero = isAnyZero(BaseMask);
Res = extractSubVector(Res, SubIdx, DAG, DL, BaseMaskEltSizeInBits);
return widenSubVector(Res, UseZero, Subtarget, DAG, DL, RootSizeInBits);
}
// Narrow shuffle mask to v4x128.
SmallVector<int, 4> ScaledMask;
assert((BaseMaskEltSizeInBits % 128) == 0 && "Illegal mask size");
- narrowShuffleMaskElts(BaseMaskEltSizeInBits / 128, Mask, ScaledMask);
+ narrowShuffleMaskElts(BaseMaskEltSizeInBits / 128, BaseMask, ScaledMask);
// Try to lower to vshuf64x2/vshuf32x4.
auto MatchSHUF128 = [&](MVT ShuffleVT, const SDLoc &DL,
// If the upper half is zeroable, then an extract+insert is more optimal
// than using X86ISD::VPERM2X128. The insertion is free, even if it has to
// zero the upper half.
- if (isUndefOrZero(Mask[1])) {
+ if (isUndefOrZero(BaseMask[1])) {
if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
return SDValue(); // Nothing to do!
- assert(isInRange(Mask[0], 0, 2) && "Unexpected lane shuffle");
+ assert(isInRange(BaseMask[0], 0, 2) && "Unexpected lane shuffle");
Res = CanonicalizeShuffleInput(RootVT, V1);
- Res = extract128BitVector(Res, Mask[0] * (NumRootElts / 2), DAG, DL);
- return widenSubVector(Res, Mask[1] == SM_SentinelZero, Subtarget, DAG, DL,
- 256);
+ Res = extract128BitVector(Res, BaseMask[0] * (NumRootElts / 2), DAG, DL);
+ return widenSubVector(Res, BaseMask[1] == SM_SentinelZero, Subtarget, DAG,
+ DL, 256);
}
// If we're splatting the low subvector, an insert-subvector 'concat'
// pattern is quicker than VPERM2X128.
// TODO: Add AVX2 support instead of VPERMQ/VPERMPD.
- if (Mask[0] == 0 && Mask[1] == 0 && !Subtarget.hasAVX2()) {
+ if (BaseMask[0] == 0 && BaseMask[1] == 0 && !Subtarget.hasAVX2()) {
if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
return SDValue(); // Nothing to do!
Res = CanonicalizeShuffleInput(RootVT, V1);
// we need to use the zeroing feature.
// Prefer blends for sequential shuffles unless we are optimizing for size.
if (UnaryShuffle &&
- !(Subtarget.hasAVX2() && isUndefOrInRange(Mask, 0, 2)) &&
- (OptForSize || !isSequentialOrUndefOrZeroInRange(Mask, 0, 2, 0))) {
+ !(Subtarget.hasAVX2() && isUndefOrInRange(BaseMask, 0, 2)) &&
+ (OptForSize || !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0))) {
unsigned PermMask = 0;
- PermMask |= ((Mask[0] < 0 ? 0x8 : (Mask[0] & 1)) << 0);
- PermMask |= ((Mask[1] < 0 ? 0x8 : (Mask[1] & 1)) << 4);
+ PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
+ PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
return DAG.getNode(
X86ISD::VPERM2X128, DL, RootVT, CanonicalizeShuffleInput(RootVT, V1),
DAG.getUNDEF(RootVT), DAG.getTargetConstant(PermMask, DL, MVT::i8));
// TODO - handle AVX512VL cases with X86ISD::SHUF128.
if (!UnaryShuffle && !IsMaskedShuffle) {
- assert(llvm::all_of(Mask, [](int M) { return 0 <= M && M < 4; }) &&
+ assert(llvm::all_of(BaseMask, [](int M) { return 0 <= M && M < 4; }) &&
"Unexpected shuffle sentinel value");
// Prefer blends to X86ISD::VPERM2X128.
- if (!((Mask[0] == 0 && Mask[1] == 3) || (Mask[0] == 2 && Mask[1] == 1))) {
+ if (!((BaseMask[0] == 0 && BaseMask[1] == 3) ||
+ (BaseMask[0] == 2 && BaseMask[1] == 1))) {
unsigned PermMask = 0;
- PermMask |= ((Mask[0] & 3) << 0);
- PermMask |= ((Mask[1] & 3) << 4);
- SDValue LHS = isInRange(Mask[0], 0, 2) ? V1 : V2;
- SDValue RHS = isInRange(Mask[1], 0, 2) ? V1 : V2;
+ PermMask |= ((BaseMask[0] & 3) << 0);
+ PermMask |= ((BaseMask[1] & 3) << 4);
+ SDValue LHS = isInRange(BaseMask[0], 0, 2) ? V1 : V2;
+ SDValue RHS = isInRange(BaseMask[1], 0, 2) ? V1 : V2;
return DAG.getNode(X86ISD::VPERM2X128, DL, RootVT,
CanonicalizeShuffleInput(RootVT, LHS),
CanonicalizeShuffleInput(RootVT, RHS),
// For masks that have been widened to 128-bit elements or more,
// narrow back down to 64-bit elements.
+ SmallVector<int, 64> Mask;
if (BaseMaskEltSizeInBits > 64) {
assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
int MaskScale = BaseMaskEltSizeInBits / 64;
- SmallVector<int, 64> ScaledMask;
- narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
- Mask = std::move(ScaledMask);
+ narrowShuffleMaskElts(MaskScale, BaseMask, Mask);
+ } else {
+ Mask.assign(BaseMask.begin(), BaseMask.end());
}
// For masked shuffles, we're trying to match the root width for better
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: vbroadcastss (%ecx,%eax,4), %xmm4
-; X32-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3]
-; X32-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
+; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
+; X32-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
; X32-NEXT: vaddps %xmm1, %xmm0, %xmm0
-; X32-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm4[3]
-; X32-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1,2],xmm4[3]
+; X32-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm4[0]
+; X32-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0,1,2],xmm4[0]
; X32-NEXT: vaddps %xmm2, %xmm1, %xmm1
; X32-NEXT: vaddps %xmm1, %xmm0, %xmm0
; X32-NEXT: retl
; X64-LABEL: insertps_from_broadcast_multiple_use:
; X64: ## %bb.0:
; X64-NEXT: vbroadcastss (%rdi,%rsi,4), %xmm4
-; X64-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3]
-; X64-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
+; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
+; X64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0
-; X64-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm4[3]
-; X64-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1,2],xmm4[3]
+; X64-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm4[0]
+; X64-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0,1,2],xmm4[0]
; X64-NEXT: vaddps %xmm2, %xmm1, %xmm1
; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask1:
; CHECK-FAST: # %bb.0:
; CHECK-FAST-NEXT: vmovapd (%rdi), %ymm2
-; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm3 = [3,5,2,7]
+; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm3 = [3,4,2,6]
; CHECK-FAST-NEXT: vpermi2pd 32(%rdi){1to4}, %ymm2, %ymm3
; CHECK-FAST-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; CHECK-FAST-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1:
; CHECK-FAST: # %bb.0:
; CHECK-FAST-NEXT: vmovapd (%rdi), %ymm2
-; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm1 = [3,5,2,7]
+; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm1 = [3,4,2,6]
; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3
; CHECK-FAST-NEXT: vcmpeqpd %ymm3, %ymm0, %k1
; CHECK-FAST-NEXT: vpermi2pd 32(%rdi){1to4}, %ymm2, %ymm1 {%k1} {z}
define <8 x i32> @shiftInput___canonical(<8 x i32> %input, i32 %shiftval, <8 x i32> %__mask) nounwind {
; CHECK-LABEL: shiftInput___canonical:
; CHECK: # %bb.0: # %allocas
-; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
-; CHECK-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; CHECK-NEXT: vpsrld %xmm2, %xmm1, %xmm1
-; CHECK-NEXT: vpsrld %xmm2, %xmm0, %xmm0
-; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; CHECK-NEXT: vbroadcastss {{[0-9]+}}(%esp), %xmm1
+; CHECK-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3
+; CHECK-NEXT: vpsrld %xmm2, %xmm3, %xmm4
+; CHECK-NEXT: vpsrlq $32, %xmm1, %xmm5
+; CHECK-NEXT: vpsrld %xmm5, %xmm3, %xmm6
+; CHECK-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4,5,6,7]
+; CHECK-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; CHECK-NEXT: vpblendw {{.*#+}} xmm6 = xmm1[0,1],xmm6[2,3,4,5,6,7]
+; CHECK-NEXT: vpsrld %xmm6, %xmm3, %xmm7
+; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; CHECK-NEXT: vpsrld %xmm1, %xmm3, %xmm3
+; CHECK-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm7[4,5,6,7]
+; CHECK-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
+; CHECK-NEXT: vpsrld %xmm2, %xmm0, %xmm2
+; CHECK-NEXT: vpsrld %xmm5, %xmm0, %xmm4
+; CHECK-NEXT: vpsrld %xmm6, %xmm0, %xmm5
+; CHECK-NEXT: vpsrld %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm2[4,5,6,7]
+; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4,5,6,7]
+; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; CHECK-NEXT: retl
allocas:
%smear.0 = insertelement <8 x i32> undef, i32 %shiftval, i32 0
; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
; X86-AVX1-NEXT: vbroadcastss (%ecx,%eax,4), %xmm4 ## encoding: [0xc4,0xe2,0x79,0x18,0x24,0x81]
-; X86-AVX1-NEXT: vblendps $8, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc4,0x08]
-; X86-AVX1-NEXT: ## xmm0 = xmm0[0,1,2],xmm4[3]
-; X86-AVX1-NEXT: vblendps $8, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x0c,0xcc,0x08]
-; X86-AVX1-NEXT: ## xmm1 = xmm1[0,1,2],xmm4[3]
+; X86-AVX1-NEXT: vinsertps $48, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc4,0x30]
+; X86-AVX1-NEXT: ## xmm0 = xmm0[0,1,2],xmm4[0]
+; X86-AVX1-NEXT: vinsertps $48, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0xcc,0x30]
+; X86-AVX1-NEXT: ## xmm1 = xmm1[0,1,2],xmm4[0]
; X86-AVX1-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1]
-; X86-AVX1-NEXT: vblendps $8, %xmm4, %xmm2, %xmm1 ## encoding: [0xc4,0xe3,0x69,0x0c,0xcc,0x08]
-; X86-AVX1-NEXT: ## xmm1 = xmm2[0,1,2],xmm4[3]
-; X86-AVX1-NEXT: vblendps $8, %xmm4, %xmm3, %xmm2 ## encoding: [0xc4,0xe3,0x61,0x0c,0xd4,0x08]
-; X86-AVX1-NEXT: ## xmm2 = xmm3[0,1,2],xmm4[3]
+; X86-AVX1-NEXT: vinsertps $48, %xmm4, %xmm2, %xmm1 ## encoding: [0xc4,0xe3,0x69,0x21,0xcc,0x30]
+; X86-AVX1-NEXT: ## xmm1 = xmm2[0,1,2],xmm4[0]
+; X86-AVX1-NEXT: vinsertps $48, %xmm4, %xmm3, %xmm2 ## encoding: [0xc4,0xe3,0x61,0x21,0xd4,0x30]
+; X86-AVX1-NEXT: ## xmm2 = xmm3[0,1,2],xmm4[0]
; X86-AVX1-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x58,0xca]
; X86-AVX1-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1]
; X86-AVX1-NEXT: retl ## encoding: [0xc3]
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
; X86-AVX512-NEXT: vbroadcastss (%ecx,%eax,4), %xmm4 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x24,0x81]
-; X86-AVX512-NEXT: vblendps $8, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc4,0x08]
-; X86-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm4[3]
-; X86-AVX512-NEXT: vblendps $8, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x0c,0xcc,0x08]
-; X86-AVX512-NEXT: ## xmm1 = xmm1[0,1,2],xmm4[3]
-; X86-AVX512-NEXT: vblendps $8, %xmm4, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x0c,0xd4,0x08]
-; X86-AVX512-NEXT: ## xmm2 = xmm2[0,1,2],xmm4[3]
-; X86-AVX512-NEXT: vblendps $8, %xmm4, %xmm3, %xmm3 ## encoding: [0xc4,0xe3,0x61,0x0c,0xdc,0x08]
-; X86-AVX512-NEXT: ## xmm3 = xmm3[0,1,2],xmm4[3]
+; X86-AVX512-NEXT: vinsertps $48, %xmm4, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc4,0x30]
+; X86-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm4[0]
+; X86-AVX512-NEXT: vinsertps $48, %xmm4, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x71,0x21,0xcc,0x30]
+; X86-AVX512-NEXT: ## xmm1 = xmm1[0,1,2],xmm4[0]
; X86-AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
-; X86-AVX512-NEXT: vaddps %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xcb]
+; X86-AVX512-NEXT: vinsertps $48, %xmm4, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x21,0xcc,0x30]
+; X86-AVX512-NEXT: ## xmm1 = xmm2[0,1,2],xmm4[0]
+; X86-AVX512-NEXT: vinsertps $48, %xmm4, %xmm3, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x61,0x21,0xd4,0x30]
+; X86-AVX512-NEXT: ## xmm2 = xmm3[0,1,2],xmm4[0]
+; X86-AVX512-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca]
; X86-AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
; X86-AVX512-NEXT: retl ## encoding: [0xc3]
;
; X64-AVX1-LABEL: insertps_from_broadcast_multiple_use:
; X64-AVX1: ## %bb.0:
; X64-AVX1-NEXT: vbroadcastss (%rdi,%rsi,4), %xmm4 ## encoding: [0xc4,0xe2,0x79,0x18,0x24,0xb7]
-; X64-AVX1-NEXT: vblendps $8, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc4,0x08]
-; X64-AVX1-NEXT: ## xmm0 = xmm0[0,1,2],xmm4[3]
-; X64-AVX1-NEXT: vblendps $8, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x0c,0xcc,0x08]
-; X64-AVX1-NEXT: ## xmm1 = xmm1[0,1,2],xmm4[3]
+; X64-AVX1-NEXT: vinsertps $48, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc4,0x30]
+; X64-AVX1-NEXT: ## xmm0 = xmm0[0,1,2],xmm4[0]
+; X64-AVX1-NEXT: vinsertps $48, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0xcc,0x30]
+; X64-AVX1-NEXT: ## xmm1 = xmm1[0,1,2],xmm4[0]
; X64-AVX1-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1]
-; X64-AVX1-NEXT: vblendps $8, %xmm4, %xmm2, %xmm1 ## encoding: [0xc4,0xe3,0x69,0x0c,0xcc,0x08]
-; X64-AVX1-NEXT: ## xmm1 = xmm2[0,1,2],xmm4[3]
-; X64-AVX1-NEXT: vblendps $8, %xmm4, %xmm3, %xmm2 ## encoding: [0xc4,0xe3,0x61,0x0c,0xd4,0x08]
-; X64-AVX1-NEXT: ## xmm2 = xmm3[0,1,2],xmm4[3]
+; X64-AVX1-NEXT: vinsertps $48, %xmm4, %xmm2, %xmm1 ## encoding: [0xc4,0xe3,0x69,0x21,0xcc,0x30]
+; X64-AVX1-NEXT: ## xmm1 = xmm2[0,1,2],xmm4[0]
+; X64-AVX1-NEXT: vinsertps $48, %xmm4, %xmm3, %xmm2 ## encoding: [0xc4,0xe3,0x61,0x21,0xd4,0x30]
+; X64-AVX1-NEXT: ## xmm2 = xmm3[0,1,2],xmm4[0]
; X64-AVX1-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x58,0xca]
; X64-AVX1-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1]
; X64-AVX1-NEXT: retq ## encoding: [0xc3]
; X64-AVX512-LABEL: insertps_from_broadcast_multiple_use:
; X64-AVX512: ## %bb.0:
; X64-AVX512-NEXT: vbroadcastss (%rdi,%rsi,4), %xmm4 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x24,0xb7]
-; X64-AVX512-NEXT: vblendps $8, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc4,0x08]
-; X64-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm4[3]
-; X64-AVX512-NEXT: vblendps $8, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x0c,0xcc,0x08]
-; X64-AVX512-NEXT: ## xmm1 = xmm1[0,1,2],xmm4[3]
-; X64-AVX512-NEXT: vblendps $8, %xmm4, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x0c,0xd4,0x08]
-; X64-AVX512-NEXT: ## xmm2 = xmm2[0,1,2],xmm4[3]
-; X64-AVX512-NEXT: vblendps $8, %xmm4, %xmm3, %xmm3 ## encoding: [0xc4,0xe3,0x61,0x0c,0xdc,0x08]
-; X64-AVX512-NEXT: ## xmm3 = xmm3[0,1,2],xmm4[3]
+; X64-AVX512-NEXT: vinsertps $48, %xmm4, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc4,0x30]
+; X64-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm4[0]
+; X64-AVX512-NEXT: vinsertps $48, %xmm4, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x71,0x21,0xcc,0x30]
+; X64-AVX512-NEXT: ## xmm1 = xmm1[0,1,2],xmm4[0]
; X64-AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
-; X64-AVX512-NEXT: vaddps %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xcb]
+; X64-AVX512-NEXT: vinsertps $48, %xmm4, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x21,0xcc,0x30]
+; X64-AVX512-NEXT: ## xmm1 = xmm2[0,1,2],xmm4[0]
+; X64-AVX512-NEXT: vinsertps $48, %xmm4, %xmm3, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x61,0x21,0xd4,0x30]
+; X64-AVX512-NEXT: ## xmm2 = xmm3[0,1,2],xmm4[0]
+; X64-AVX512-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca]
; X64-AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
; X64-AVX512-NEXT: retq ## encoding: [0xc3]
%1 = getelementptr inbounds float, float* %fb, i64 %index
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: retq
;
; AVX512VLBW-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastb %xmm1, %xmm1
; AVX512VLBW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,u,u,u,u,u,u,u,u]
-; AVX512VLBW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512VLBW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu: