Extends lowerShuffleAsLanePermuteAndPermute to search for opportunities to use vpermq (64-bit cross-lane shuffle) and vpermd (32-bit cross-lane shuffle) to get elements into the correct lane, in addition to the 128-bit full-lane permutes it previously searched for.
This is especially helpful in cross-lane byte shuffles, where the alternative tends to be "vpshufb both lanes separately and blend them with a vpblendvb", which is very expensive, especially on Haswell where vpblendvb uses the same execution port as all the shuffles.
Addresses PR47262
Patch By: @TellowKrinkle (TellowKrinkle)
Differential Revision: https://reviews.llvm.org/D86429
int NumElts = VT.getVectorNumElements();
int NumLanes = VT.getSizeInBits() / 128;
int NumEltsPerLane = NumElts / NumLanes;
+ bool CanUseSublanes = Subtarget.hasAVX2() && V2.isUndef();
+
+ /// Attempts to find a sublane permute with the given size
+ /// that gets all elements into their target lanes.
+ ///
+ /// If successful, fills CrossLaneMask and InLaneMask and returns true.
+ /// If unsuccessful, returns false and may overwrite InLaneMask.
+ auto getSublanePermute = [&](int NumSublanes) -> SDValue {
+ int NumSublanesPerLane = NumSublanes / NumLanes;
+ int NumEltsPerSublane = NumElts / NumSublanes;
+
+ SmallVector<int, 16> CrossLaneMask;
+ SmallVector<int, 16> InLaneMask(NumElts, SM_SentinelUndef);
+ // CrossLaneMask but one entry == one sublane.
+ SmallVector<int, 16> CrossLaneMaskLarge(NumSublanes, SM_SentinelUndef);
- SmallVector<int, 4> SrcLaneMask(NumLanes, SM_SentinelUndef);
- SmallVector<int, 16> PermMask(NumElts, SM_SentinelUndef);
-
- for (int i = 0; i != NumElts; ++i) {
- int M = Mask[i];
- if (M < 0)
- continue;
+ for (int i = 0; i != NumElts; ++i) {
+ int M = Mask[i];
+ if (M < 0)
+ continue;
- // Ensure that each lane comes from a single source lane.
- int SrcLane = M / NumEltsPerLane;
- int DstLane = i / NumEltsPerLane;
- if (!isUndefOrEqual(SrcLaneMask[DstLane], SrcLane))
- return SDValue();
- SrcLaneMask[DstLane] = SrcLane;
+ int SrcSublane = M / NumEltsPerSublane;
+ int DstLane = i / NumEltsPerLane;
- PermMask[i] = (DstLane * NumEltsPerLane) + (M % NumEltsPerLane);
- }
+ // We only need to get the elements into the right lane, not sublane.
+ // So search all sublanes that make up the destination lane.
+ bool Found = false;
+ int DstSubStart = DstLane * NumSublanesPerLane;
+ int DstSubEnd = DstSubStart + NumSublanesPerLane;
+ for (int DstSublane = DstSubStart; DstSublane < DstSubEnd; ++DstSublane) {
+ if (!isUndefOrEqual(CrossLaneMaskLarge[DstSublane], SrcSublane))
+ continue;
- // Make sure we set all elements of the lane mask, to avoid undef propagation.
- SmallVector<int, 16> LaneMask(NumElts, SM_SentinelUndef);
- for (int DstLane = 0; DstLane != NumLanes; ++DstLane) {
- int SrcLane = SrcLaneMask[DstLane];
- if (0 <= SrcLane)
- for (int j = 0; j != NumEltsPerLane; ++j) {
- LaneMask[(DstLane * NumEltsPerLane) + j] =
- (SrcLane * NumEltsPerLane) + j;
+ Found = true;
+ CrossLaneMaskLarge[DstSublane] = SrcSublane;
+ int DstSublaneOffset = DstSublane * NumEltsPerSublane;
+ InLaneMask[i] = DstSublaneOffset + M % NumEltsPerSublane;
+ break;
}
- }
+ if (!Found)
+ return SDValue();
+ }
- // If we're only shuffling a single lowest lane and the rest are identity
- // then don't bother.
- // TODO - isShuffleMaskInputInPlace could be extended to something like this.
- int NumIdentityLanes = 0;
- bool OnlyShuffleLowestLane = true;
- for (int i = 0; i != NumLanes; ++i) {
- if (isSequentialOrUndefInRange(PermMask, i * NumEltsPerLane, NumEltsPerLane,
- i * NumEltsPerLane))
- NumIdentityLanes++;
- else if (SrcLaneMask[i] != 0 && SrcLaneMask[i] != NumLanes)
- OnlyShuffleLowestLane = false;
- }
- if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
+ // Fill CrossLaneMask using CrossLaneMaskLarge.
+ narrowShuffleMaskElts(NumEltsPerSublane, CrossLaneMaskLarge, CrossLaneMask);
+
+ if (!CanUseSublanes) {
+ // If we're only shuffling a single lowest lane and the rest are identity
+ // then don't bother.
+ // TODO - isShuffleMaskInputInPlace could be extended to something like
+ // this.
+ int NumIdentityLanes = 0;
+ bool OnlyShuffleLowestLane = true;
+ for (int i = 0; i != NumLanes; ++i) {
+ int LaneOffset = i * NumEltsPerLane;
+ if (isSequentialOrUndefInRange(InLaneMask, LaneOffset, NumEltsPerLane,
+ i * NumEltsPerLane))
+ NumIdentityLanes++;
+ else if (CrossLaneMask[LaneOffset] != 0)
+ OnlyShuffleLowestLane = false;
+ }
+ if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
+ return SDValue();
+ }
+
+ SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask);
+ return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT),
+ InLaneMask);
+ };
+
+ // First attempt a solution with full lanes.
+ if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes))
+ return V;
+
+ // The rest of the solutions use sublanes.
+ if (!CanUseSublanes)
+ return SDValue();
+
+ // Then attempt a solution with 64-bit sublanes (vpermq).
+ if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes * 2))
+ return V;
+
+ // If that doesn't work and we have fast variable shuffle,
+ // attempt 32-bit sublanes (vpermd).
+ if (!Subtarget.hasFastVariableShuffle())
return SDValue();
- SDValue LanePermute = DAG.getVectorShuffle(VT, DL, V1, V2, LaneMask);
- return DAG.getVectorShuffle(VT, DL, LanePermute, DAG.getUNDEF(VT), PermMask);
+ return getSublanePermute(/*NumSublanes=*/NumLanes * 4);
}
/// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one
; AVX1-NEXT: vmovdqu %xmm2, 16(%rdi)
; AVX1-NEXT: retq
;
-; AVX2-LABEL: interleave_24i16_in:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqu (%rsi), %xmm0
-; AVX2-NEXT: vmovdqu (%rdx), %xmm1
-; AVX2-NEXT: vmovdqu (%rcx), %xmm2
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3
-; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[0,1,u,u,6,7,2,3,u,u,8,9,4,5,u,u,16,17,u,u,22,23,18,19,u,u,24,25,20,21,u,u]
-; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,0,1,u,u,u,u,2,3,u,u,u,u,4,5,u,u,22,23,u,u,u,u,24,25,u,u,u,u,26,27]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14],ymm3[15]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = <u,0,0,u,1,1,u,2>
-; AVX2-NEXT: vpermd %ymm2, %ymm4, %ymm4
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255]
-; AVX2-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
-; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
-; AVX2-NEXT: vmovdqu %xmm0, 32(%rdi)
-; AVX2-NEXT: vmovdqu %ymm3, (%rdi)
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: interleave_24i16_in:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vmovdqu (%rsi), %xmm0
+; AVX2-SLOW-NEXT: vmovdqu (%rdx), %xmm1
+; AVX2-SLOW-NEXT: vmovdqu (%rcx), %xmm2
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[0,1,u,u,6,7,2,3,u,u,8,9,4,5,u,u,16,17,u,u,22,23,18,19,u,u,24,25,20,21,u,u]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,0,1,u,u,u,u,2,3,u,u,u,u,4,5,u,u,22,23,u,u,u,u,24,25,u,u,u,u,26,27]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14],ymm3[15]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <u,0,0,u,1,1,u,2>
+; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm4, %ymm4
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255]
+; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
+; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
+; AVX2-SLOW-NEXT: vmovdqu %xmm0, 32(%rdi)
+; AVX2-SLOW-NEXT: vmovdqu %ymm3, (%rdi)
+; AVX2-SLOW-NEXT: vzeroupper
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: interleave_24i16_in:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovdqu (%rsi), %xmm0
+; AVX2-FAST-NEXT: vmovdqu (%rdx), %xmm1
+; AVX2-FAST-NEXT: vmovdqu (%rcx), %xmm2
+; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <u,0,0,u,1,1,u,2>
+; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm4
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,4,1,5,1,5,2,6]
+; AVX2-FAST-NEXT: vpermd %ymm3, %ymm5, %ymm3
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,4,5,u,u,2,3,6,7,u,u,8,9,12,13,u,u,18,19,22,23,u,u,24,25,28,29,u,u,26,27]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255]
+; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
+; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u]
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
+; AVX2-FAST-NEXT: vmovdqu %xmm0, 32(%rdi)
+; AVX2-FAST-NEXT: vmovdqu %ymm3, (%rdi)
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
;
; XOP-LABEL: interleave_24i16_in:
; XOP: # %bb.0:
;
; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT: vpslldq {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[16,17]
-; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,3]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,8,9,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00:
;
; XOPAVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; XOPAVX2-NEXT: vpslldq {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[16,17]
-; XOPAVX2-NEXT: vpbroadcastw %xmm0, %ymm0
-; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15]
+; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,3]
+; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,8,9,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
; XOPAVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <16 x i16> %shuffle
;
; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00:
; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[0,1],ymm0[0,1]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,3]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,10,11,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00:
;
; XOPAVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
-; XOPAVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[0,1],ymm0[0,1]
-; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,3]
+; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,10,11,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
; XOPAVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 9, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <16 x i16> %shuffle
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,5,u,u,0,u,u,u>
-; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
-; AVX2-FAST-NEXT: retq
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,3]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,12,13,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00:
; AVX512VL: # %bb.0:
;
; XOPAVX2-LABEL: shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; XOPAVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
-; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,3]
+; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,12,13,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
; XOPAVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 10, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <16 x i16> %shuffle
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,5,u,u,0,u,u,u>
-; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
-; AVX2-FAST-NEXT: retq
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,3]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,14,15,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VL: # %bb.0:
;
; XOPAVX2-LABEL: shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; XOPAVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
-; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,3]
+; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,14,15,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
; XOPAVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 11, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <16 x i16> %shuffle
;
; AVX2-LABEL: shuffle_v16i16_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,3]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
; AVX2-NEXT: retq
;
;
; XOPAVX2-LABEL: shuffle_v16i16_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
+; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,3]
; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
; XOPAVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 12, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
;
; AVX2-LABEL: shuffle_v16i16_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,3]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
; AVX2-NEXT: retq
;
;
; XOPAVX2-LABEL: shuffle_v16i16_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
+; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,3]
; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
; XOPAVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 13, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
;
; AVX2-LABEL: shuffle_v16i16_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,3]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
; AVX2-NEXT: retq
;
;
; XOPAVX2-LABEL: shuffle_v16i16_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
+; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,3]
; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
; XOPAVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 14, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
;
; AVX2-LABEL: shuffle_v16i16_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,0,0,3]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,8,9,8,9,8,9,8,9,8,9,8,9,8,9,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
;
; XOPAVX2-LABEL: shuffle_v16i16_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
-; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,0,0,3]
+; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,8,9,8,9,8,9,8,9,8,9,8,9,8,9,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
; XOPAVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <16 x i16> %shuffle
;
; AVX2-LABEL: shuffle_v16i16_00_01_00_01_02_03_02_11_08_09_08_09_10_11_10_11:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,2,2,2]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,0,1,2,3,4,5,6,7,4,5,14,15,16,17,18,19,16,17,18,19,20,21,22,23,20,21,22,23]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_01_00_01_02_03_02_11_08_09_08_09_10_11_10_11:
;
; XOPAVX2-LABEL: shuffle_v16i16_00_01_00_01_02_03_02_11_08_09_08_09_10_11_10_11:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,2,2,2]
-; XOPAVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
-; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15]
+; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,0,1,2,3,4,5,6,7,4,5,14,15,16,17,18,19,16,17,18,19,20,21,22,23,20,21,22,23]
; XOPAVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 2, i32 3, i32 2, i32 11, i32 8, i32 9, i32 8, i32 9, i32 10, i32 11, i32 10, i32 11>
ret <16 x i16> %shuffle
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_08:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3]
-; AVX2-SLOW-NEXT: vpslldq {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[16,17]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15]
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_08:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,u,u,16,17,16,17,16,17,16,17,16,17,16,17,16,17,u,u]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX2-FAST-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15]
-; AVX2-FAST-NEXT: retq
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_08:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,8,9,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_08:
; AVX512VL: # %bb.0:
;
; XOPAVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_08:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3]
-; XOPAVX2-NEXT: vpslldq {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[16,17]
-; XOPAVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
-; XOPAVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
-; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15]
+; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,8,9,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
; XOPAVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
ret <16 x i16> %shuffle
;
; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_12_08_08_08_08_12_12_12_12:
; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,0,1,0,1,0,1,8,9,8,9,8,9,u,u,16,17,16,17,16,17,16,17,24,25,24,25,24,25,u,u]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX2-FAST-NEXT: vpsllq $48, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,6,u,4,6,u,u>
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,4,5,4,5,4,5,8,9,16,17,16,17,16,17,16,17,20,21,20,21,20,21,20,21]
; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_12_08_08_08_08_12_12_12_12:
;
; AVX2-LABEL: shuffle_v16i16_uu_00_uu_01_uu_02_uu_11_uu_08_uu_09_uu_10_uu_11:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7,8,9,10],ymm1[11],ymm0[12,13,14,15]
-; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,0,1,u,u,2,3,u,u,4,5,u,u,14,15,u,u,16,17,u,u,18,19,u,u,20,21,u,u,22,23]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_uu_00_uu_01_uu_02_uu_11_uu_08_uu_09_uu_10_uu_11:
;
; XOPAVX2-LABEL: shuffle_v16i16_uu_00_uu_01_uu_02_uu_11_uu_08_uu_09_uu_10_uu_11:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3]
-; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7,8,9,10],ymm1[11],ymm0[12,13,14,15]
-; XOPAVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,0,1,u,u,2,3,u,u,4,5,u,u,14,15,u,u,16,17,u,u,18,19,u,u,20,21,u,u,22,23]
; XOPAVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 0, i32 undef, i32 1, i32 undef, i32 2, i32 undef, i32 11, i32 undef, i32 8, i32 undef, i32 9, i32 undef, i32 10, i32 undef, i32 11>
ret <16 x i16> %shuffle
;
; AVX2-LABEL: shuffle_v16i16_uu_04_uu_05_uu_06_uu_15_uu_12_uu_13_uu_14_uu_15:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,3,3]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,0,1,u,u,2,3,u,u,4,5,u,u,14,15,u,u,16,17,u,u,18,19,u,u,20,21,u,u,22,23]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_uu_04_uu_05_uu_06_uu_15_uu_12_uu_13_uu_14_uu_15:
;
; XOPAVX2-LABEL: shuffle_v16i16_uu_04_uu_05_uu_06_uu_15_uu_12_uu_13_uu_14_uu_15:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
-; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; XOPAVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7]
+; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,3,3]
+; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,0,1,u,u,2,3,u,u,4,5,u,u,14,15,u,u,16,17,u,u,18,19,u,u,20,21,u,u,22,23]
; XOPAVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 4, i32 undef, i32 5, i32 undef, i32 6, i32 undef, i32 15, i32 undef, i32 12, i32 undef, i32 13, i32 undef, i32 14, i32 undef, i32 15>
ret <16 x i16> %shuffle
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_04_04_04_04_00_00_00_08_12_12_12_12_08_08_08_08:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,u,u,24,25,24,25,24,25,24,25,16,17,16,17,16,17,u,u]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX2-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15]
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_04_04_04_04_00_00_00_08_12_12_12_12_08_08_08_08:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,u,u,24,25,24,25,24,25,24,25,16,17,16,17,16,17,u,u]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX2-SLOW-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_04_04_04_04_00_00_00_08_12_12_12_12_08_08_08_08:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <2,0,4,u,6,4,u,u>
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,4,5,4,5,4,5,8,9,16,17,16,17,16,17,16,17,20,21,20,21,20,21,20,21]
+; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_04_04_04_04_00_00_00_08_12_12_12_12_08_08_08_08:
; AVX512VL: # %bb.0:
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_01_00_05_04_05_04_01_08_09_08_13_12_13_12_09_08:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[2,3,0,1,10,11,8,9,10,11,8,9,2,3,u,u,18,19,16,17,26,27,24,25,26,27,24,25,18,19,u,u]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX2-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15]
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_01_00_05_04_05_04_01_08_09_08_13_12_13_12_09_08:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[2,3,0,1,10,11,8,9,10,11,8,9,2,3,u,u,18,19,16,17,26,27,24,25,26,27,24,25,18,19,u,u]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX2-SLOW-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_01_00_05_04_05_04_01_08_09_08_13_12_13_12_09_08:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,u,4,6,u,u>
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5,6,7,4,5,2,3,8,9,18,19,16,17,22,23,20,21,22,23,20,21,18,19,16,17]
+; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_01_00_05_04_05_04_01_08_09_08_13_12_13_12_09_08:
; AVX512VL: # %bb.0:
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_05_04_01_00_05_04_01_08_13_12_09_08_13_12_09_08:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[10,11,8,9,2,3,0,1,10,11,8,9,2,3,u,u,26,27,24,25,18,19,16,17,26,27,24,25,18,19,u,u]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX2-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15]
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_05_04_01_00_05_04_01_08_13_12_09_08_13_12_09_08:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[10,11,8,9,2,3,0,1,10,11,8,9,2,3,u,u,26,27,24,25,18,19,16,17,26,27,24,25,18,19,u,u]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX2-SLOW-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_05_04_01_00_05_04_01_08_13_12_09_08_13_12_09_08:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <2,0,4,u,6,4,u,u>
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5,2,3,0,1,6,7,8,9,18,19,16,17,22,23,20,21,18,19,16,17,22,23,20,21]
+; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_05_04_01_00_05_04_01_08_13_12_09_08_13_12_09_08:
; AVX512VL: # %bb.0:
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_05_04_01_00_01_00_05_12_13_12_09_08_09_08_13_12:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[10,11,8,9,2,3,0,1,2,3,0,1,10,11,u,u,26,27,24,25,18,19,16,17,18,19,16,17,26,27,u,u]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX2-NEXT: vpsllq $48, %ymm0, %ymm0
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15]
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_05_04_01_00_01_00_05_12_13_12_09_08_09_08_13_12:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[10,11,8,9,2,3,0,1,2,3,0,1,10,11,u,u,26,27,24,25,18,19,16,17,18,19,16,17,26,27,u,u]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX2-SLOW-NEXT: vpsllq $48, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_05_04_01_00_01_00_05_12_13_12_09_08_09_08_13_12:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <2,0,6,u,6,4,u,u>
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5,6,7,4,5,2,3,8,9,18,19,16,17,22,23,20,21,22,23,20,21,18,19,16,17]
+; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_05_04_01_00_01_00_05_12_13_12_09_08_09_08_13_12:
; AVX512VL: # %bb.0:
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_00_04_04_00_00_04_04_08_08_12_12_08_08_12_12_08:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9,8,9,0,1,0,1,8,9,8,9,u,u,16,17,24,25,24,25,16,17,16,17,24,25,24,25,u,u]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX2-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15]
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_00_04_04_00_00_04_04_08_08_12_12_08_08_12_12_08:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9,8,9,0,1,0,1,8,9,8,9,u,u,16,17,24,25,24,25,16,17,16,17,24,25,24,25,u,u]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX2-SLOW-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_00_04_04_00_00_04_04_08_08_12_12_08_08_12_12_08:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,u,4,6,u,u>
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,4,5,0,1,0,1,4,5,4,5,8,9,16,17,20,21,20,21,16,17,16,17,20,21,20,21,16,17]
+; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_04_04_00_00_04_04_08_08_12_12_08_08_12_12_08:
; AVX512VL: # %bb.0:
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_04_00_00_04_04_00_00_12_12_08_08_12_12_08_08_12:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[8,9,0,1,0,1,8,9,8,9,0,1,0,1,u,u,24,25,16,17,16,17,24,25,24,25,16,17,16,17,u,u]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX2-NEXT: vpsllq $48, %ymm0, %ymm0
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15]
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_04_00_00_04_04_00_00_12_12_08_08_12_12_08_08_12:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[8,9,0,1,0,1,8,9,8,9,0,1,0,1,u,u,24,25,16,17,16,17,24,25,24,25,16,17,16,17,u,u]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX2-SLOW-NEXT: vpsllq $48, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_04_00_00_04_04_00_00_12_12_08_08_12_12_08_08_12:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <2,0,6,u,6,4,u,u>
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,4,5,0,1,0,1,4,5,4,5,8,9,16,17,20,21,20,21,16,17,16,17,20,21,20,21,16,17]
+; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_04_00_00_04_04_00_00_12_12_08_08_12_12_08_08_12:
; AVX512VL: # %bb.0:
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_11:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,2,3,4,5,6,7]
-; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7,28,29,28,29,30,31,26,27,18,19,28,29,24,25,22,23]
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_11:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7,28,29,28,29,30,31,26,27,18,19,28,29,24,25,22,23]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_11:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [3,2,0,5,7,6,4,5]
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,2,3,6,7,10,11,0,1,4,5,14,15,16,17,16,17,18,19,22,23,26,27,16,17,20,21,30,31]
+; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_11:
; AVX512VL: # %bb.0:
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_00_00_04_04_04_04_04_12_08_08_12_12_12_12_12_12:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,0,1,8,9,8,9,8,9,8,9,8,9,u,u,16,17,16,17,24,25,24,25,24,25,24,25,24,25,u,u]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX2-NEXT: vpsllq $48, %ymm0, %ymm0
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15]
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_04_04_04_04_04_12_08_08_12_12_12_12_12_12:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,0,1,8,9,8,9,8,9,8,9,8,9,u,u,16,17,16,17,24,25,24,25,24,25,24,25,24,25,u,u]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX2-SLOW-NEXT: vpsllq $48, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_00_00_04_04_04_04_04_12_08_08_12_12_12_12_12_12:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,6,u,4,6,u,u>
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5,4,5,4,5,4,5,8,9,16,17,16,17,20,21,20,21,20,21,20,21,20,21,20,21]
+; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_00_04_04_04_04_04_12_08_08_12_12_12_12_12_12:
; AVX512VL: # %bb.0:
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_04_04_00_00_04_04_04_12_12_12_08_08_12_12_12_12:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[8,9,8,9,0,1,0,1,8,9,8,9,8,9,u,u,24,25,24,25,16,17,16,17,24,25,24,25,24,25,u,u]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX2-NEXT: vpsllq $48, %ymm0, %ymm0
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15]
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_04_04_00_00_04_04_04_12_12_12_08_08_12_12_12_12:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[8,9,8,9,0,1,0,1,8,9,8,9,8,9,u,u,24,25,24,25,16,17,16,17,24,25,24,25,24,25,u,u]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX2-SLOW-NEXT: vpsllq $48, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_04_04_00_00_04_04_04_12_12_12_08_08_12_12_12_12:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <2,0,6,u,6,4,u,u>
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5,0,1,0,1,0,1,8,9,16,17,16,17,20,21,20,21,16,17,16,17,16,17,16,17]
+; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_04_04_00_00_04_04_04_12_12_12_08_08_12_12_12_12:
; AVX512VL: # %bb.0:
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_00_04_04_00_04_04_04_12_08_12_12_08_12_12_12_12:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,u,u,16,17,24,25,24,25,16,17,24,25,24,25,24,25,u,u]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX2-NEXT: vpsllq $48, %ymm0, %ymm0
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15]
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_00_04_04_00_04_04_04_12_08_12_12_08_12_12_12_12:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,u,u,16,17,24,25,24,25,16,17,24,25,24,25,24,25,u,u]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX2-SLOW-NEXT: vpsllq $48, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_00_04_04_00_04_04_04_12_08_12_12_08_12_12_12_12:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,6,u,4,6,u,u>
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,4,5,0,1,4,5,4,5,4,5,8,9,16,17,20,21,20,21,16,17,20,21,20,21,20,21,20,21]
+; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_04_04_00_04_04_04_12_08_12_12_08_12_12_12_12:
; AVX512VL: # %bb.0:
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_00_04_04_00_00_00_00_08_08_12_12_08_08_08_08_08:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9,8,9,0,1,0,1,0,1,0,1,u,u,16,17,24,25,24,25,16,17,16,17,16,17,16,17,u,u]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX2-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15]
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_00_04_04_00_00_00_00_08_08_12_12_08_08_08_08_08:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9,8,9,0,1,0,1,0,1,0,1,u,u,16,17,24,25,24,25,16,17,16,17,16,17,16,17,u,u]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX2-SLOW-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_00_04_04_00_00_00_00_08_08_12_12_08_08_08_08_08:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,u,4,6,u,u>
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,4,5,0,1,0,1,0,1,0,1,8,9,16,17,20,21,20,21,16,17,16,17,16,17,16,17,16,17]
+; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_04_04_00_00_00_00_08_08_12_12_08_08_08_08_08:
; AVX512VL: # %bb.0:
;
; AVX2-FAST-LABEL: shuffle_v16i16_00_04_04_00_04_05_06_15_08_12_12_08_12_13_14_15:
; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3]
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9,8,9,0,1,8,9,10,11,12,13,u,u,16,17,24,25,24,25,16,17,24,25,26,27,28,29,u,u]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,3,7,4,6,7,u>
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,4,5,0,1,4,5,6,7,8,9,14,15,16,17,20,21,20,21,16,17,20,21,22,23,24,25,26,27]
; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_04_04_00_04_05_06_15_08_12_12_08_12_13_14_15:
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_00_uu_04_04_04_04_04_12_08_uu_12_12_12_12_12_12:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,2,3,8,9,8,9,8,9,8,9,8,9,u,u,16,17,18,19,24,25,24,25,24,25,24,25,24,25,u,u]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX2-NEXT: vpsllq $48, %ymm0, %ymm0
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15]
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_00_uu_04_04_04_04_04_12_08_uu_12_12_12_12_12_12:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,2,3,8,9,8,9,8,9,8,9,8,9,u,u,16,17,18,19,24,25,24,25,24,25,24,25,24,25,u,u]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX2-SLOW-NEXT: vpsllq $48, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_00_uu_04_04_04_04_04_12_08_uu_12_12_12_12_12_12:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,6,u,4,6,u,u>
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,u,u,4,5,4,5,4,5,4,5,4,5,8,9,16,17,u,u,20,21,20,21,20,21,20,21,20,21,20,21]
+; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_uu_04_04_04_04_04_12_08_uu_12_12_12_12_12_12:
; AVX512VL: # %bb.0:
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_04_04_uu_00_04_04_04_12_12_12_uu_08_12_12_12_12:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[8,9,8,9,8,9,0,1,8,9,8,9,8,9,u,u,24,25,24,25,24,25,16,17,24,25,24,25,24,25,u,u]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX2-NEXT: vpsllq $48, %ymm0, %ymm0
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15]
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_04_04_uu_00_04_04_04_12_12_12_uu_08_12_12_12_12:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[8,9,8,9,8,9,0,1,8,9,8,9,8,9,u,u,24,25,24,25,24,25,16,17,24,25,24,25,24,25,u,u]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX2-SLOW-NEXT: vpsllq $48, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_04_04_uu_00_04_04_04_12_12_12_uu_08_12_12_12_12:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <2,0,6,u,6,4,u,u>
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,u,u,4,5,0,1,0,1,0,1,8,9,16,17,16,17,u,u,20,21,16,17,16,17,16,17,16,17]
+; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_04_04_uu_00_04_04_04_12_12_12_uu_08_12_12_12_12:
; AVX512VL: # %bb.0:
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_uu_04_04_00_04_04_04_12_uu_12_12_08_12_12_12_12:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,u,u,16,17,24,25,24,25,16,17,24,25,24,25,24,25,u,u]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX2-NEXT: vpsllq $48, %ymm0, %ymm0
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15]
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_uu_04_04_00_04_04_04_12_uu_12_12_08_12_12_12_12:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,u,u,16,17,24,25,24,25,16,17,24,25,24,25,24,25,u,u]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX2-SLOW-NEXT: vpsllq $48, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_uu_04_04_00_04_04_04_12_uu_12_12_08_12_12_12_12:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <2,0,6,u,6,4,u,u>
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,0,1,0,1,4,5,0,1,0,1,0,1,8,9,u,u,16,17,16,17,20,21,16,17,16,17,16,17,16,17]
+; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_uu_04_04_00_04_04_04_12_uu_12_12_08_12_12_12_12:
; AVX512VL: # %bb.0:
;
; AVX2-LABEL: shuffle_v16i16_uu_uu_uu_uu_04_05_06_11_uu_uu_uu_uu_12_13_14_11:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,2,2,2]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,2,3,2]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,8,9,10,11,0,1,2,3,4,5,14,15,16,17,18,19,24,25,26,27,16,17,18,19,20,21,30,31]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_uu_uu_uu_uu_04_05_06_11_uu_uu_uu_uu_12_13_14_11:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = <u,u,u,u,4,5,6,11,u,u,u,u,12,13,14,11>
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,8,9,4,5,6,11,12,13,8,9,12,13,14,11]
; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: retq
;
;
; XOPAVX2-LABEL: shuffle_v16i16_uu_uu_uu_uu_04_05_06_11_uu_uu_uu_uu_12_13_14_11:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,2,2,2]
-; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15]
+; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,2,3,2]
+; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,8,9,10,11,0,1,2,3,4,5,14,15,16,17,18,19,24,25,26,27,16,17,18,19,20,21,30,31]
; XOPAVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 4, i32 5, i32 6, i32 11, i32 undef, i32 undef, i32 undef, i32 undef, i32 12, i32 13, i32 14, i32 11>
ret <16 x i16> %shuffle
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_13:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,6,3,4,5,6,7]
-; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11,22,23,30,31,18,19,16,17,20,21,30,31,22,23,26,27]
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_13:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,6,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11,22,23,30,31,18,19,16,17,20,21,30,31,22,23,26,27]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_13:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,0,6,5,7,4,6]
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,6,7,10,11,8,9,0,1,6,7,2,3,14,15,18,19,22,23,26,27,24,25,16,17,22,23,18,19,30,31]
+; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_13:
; AVX512VL: # %bb.0:
;
; AVX2-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_27_08_24_09_25_10_26_11_27:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,2,3]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6,7,8,9,10],ymm2[11],ymm1[12,13,14,15]
-; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
+; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,0,1,u,u,2,3,u,u,4,5,u,u,14,15,u,u,16,17,u,u,18,19,u,u,20,21,u,u,22,23]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_27_08_24_09_25_10_26_11_27:
;
; XOPAVX2-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_27_08_24_09_25_10_26_11_27:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,2,3]
-; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6,7,8,9,10],ymm2[11],ymm1[12,13,14,15]
-; XOPAVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
+; XOPAVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,0,1,u,u,2,3,u,u,4,5,u,u,14,15,u,u,16,17,u,u,18,19,u,u,20,21,u,u,22,23]
+; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
; XOPAVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 27, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
ret <16 x i16> %shuffle
; AVX2-LABEL: shuffle_v16i16_04_20_05_21_06_22_07_31_12_28_13_29_14_30_15_31:
; AVX2: # %bb.0:
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
-; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm1[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,3,3]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,0,1,u,u,2,3,u,u,4,5,u,u,14,15,u,u,16,17,u,u,18,19,u,u,20,21,u,u,22,23]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_04_20_05_21_06_22_07_31_12_28_13_29_14_30_15_31:
; XOPAVX2-LABEL: shuffle_v16i16_04_20_05_21_06_22_07_31_12_28_13_29_14_30_15_31:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
-; XOPAVX2-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm1[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
-; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
-; XOPAVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
-; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
+; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,3,3]
+; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,0,1,u,u,2,3,u,u,4,5,u,u,14,15,u,u,16,17,u,u,18,19,u,u,20,21,u,u,22,23]
+; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
; XOPAVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 31, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
ret <16 x i16> %shuffle
;
; AVX2-FAST-LABEL: shuffle_v16i16_00_16_01_17_06_22_07_31_08_24_09_25_14_30_15_31:
; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,2,3]
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,0,1,4,5,2,3,12,13,12,13,u,u,u,u,16,17,16,17,20,21,18,19,28,29,28,29,u,u,u,u]
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6],ymm2[7]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,3,7,u,4,7,u,u>
+; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,0,1,u,u,2,3,u,u,4,5,u,u,10,11,u,u,16,17,u,u,18,19,u,u,20,21,u,u,22,23]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,u,u,2,3,u,u,12,13,u,u,14,15,u,u,16,17,u,u,18,19,u,u,28,29,u,u,30,31,u,u]
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
; AVX2-FAST-NEXT: retq
;
; AVX2-FAST-LABEL: shuffle_v16i16_00_20_01_21_06_16_07_25_08_28_09_29_14_24_15_25:
; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,2,3]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7,8],ymm2[9],ymm1[10,11,12,13,14,15]
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,8,9,u,u,10,11,u,u,0,1,u,u,2,3,u,u,24,25,u,u,26,27,u,u,16,17,u,u,18,19]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,0,4,u,6,4,u,u>
+; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,0,1,u,u,2,3,u,u,4,5,u,u,10,11,u,u,16,17,u,u,18,19,u,u,20,21,u,u,22,23]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,u,u,2,3,u,u,12,13,u,u,14,15,u,u,16,17,u,u,18,19,u,u,28,29,u,u,30,31,u,u]
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
; AVX2-FAST-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v16i16_01_00_17_16_03_02_19_26_09_08_25_24_11_10_27_26:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,2,3]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6,7,8,9],ymm2[10],ymm1[11,12,13,14,15]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,2,3,0,1,u,u,u,u,6,7,12,13,u,u,u,u,18,19,16,17,u,u,u,u,22,23,20,21]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15]
-; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,5,7]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: shuffle_v16i16_01_00_17_16_03_02_19_26_09_08_25_24_11_10_27_26:
; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,2,3]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6,7,8,9],ymm2[10],ymm1[11,12,13,14,15]
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,2,3,0,1,u,u,u,u,6,7,4,5,u,u,u,u,18,19,16,17,u,u,u,u,22,23,20,21]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,0,1,u,u,u,u,6,7,4,5,u,u,u,u,18,19,16,17,u,u,u,u,22,23,20,21,u,u,u,u]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,2,3,0,1,u,u,u,u,6,7,12,13,u,u,u,u,18,19,16,17,u,u,u,u,22,23,20,21]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; AVX2-FAST-NEXT: retq
;
;
; XOPAVX2-LABEL: shuffle_v16i16_01_00_17_16_03_02_19_26_09_08_25_24_11_10_27_26:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,2,3]
-; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6,7,8,9],ymm2[10],ymm1[11,12,13,14,15]
-; XOPAVX2-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15]
+; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,2,3,0,1,u,u,u,u,6,7,12,13,u,u,u,u,18,19,16,17,u,u,u,u,22,23,20,21]
; XOPAVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15]
-; XOPAVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; XOPAVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,5,7]
+; XOPAVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; XOPAVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 1, i32 0, i32 17, i32 16, i32 3, i32 2, i32 19, i32 26, i32 9, i32 8, i32 25, i32 24, i32 11, i32 10, i32 27, i32 26>
ret <16 x i16> %shuffle
;
; AVX2-LABEL: shuffle_v16i16_16_00_17_01_18_02_19_11_24_08_25_09_26_10_27_11:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,2,3]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6,7,8,9,10],ymm2[11],ymm0[12,13,14,15]
-; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
+; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,0,1,u,u,2,3,u,u,4,5,u,u,14,15,u,u,16,17,u,u,18,19,u,u,20,21,u,u,22,23]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_16_00_17_01_18_02_19_11_24_08_25_09_26_10_27_11:
;
; XOPAVX2-LABEL: shuffle_v16i16_16_00_17_01_18_02_19_11_24_08_25_09_26_10_27_11:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,2,3]
-; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6,7,8,9,10],ymm2[11],ymm0[12,13,14,15]
-; XOPAVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
+; XOPAVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,0,1,u,u,2,3,u,u,4,5,u,u,14,15,u,u,16,17,u,u,18,19,u,u,20,21,u,u,22,23]
+; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
; XOPAVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 16, i32 0, i32 17, i32 1, i32 18, i32 2, i32 19, i32 11, i32 24, i32 8, i32 25, i32 9, i32 26, i32 10, i32 27, i32 11>
ret <16 x i16> %shuffle
; AVX2-LABEL: shuffle_v16i16_20_04_21_05_22_06_23_15_28_12_29_13_30_14_31_15:
; AVX2: # %bb.0:
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
-; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,3,3]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,0,1,u,u,2,3,u,u,4,5,u,u,14,15,u,u,16,17,u,u,18,19,u,u,20,21,u,u,22,23]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
; AVX2-NEXT: retq
;
; XOPAVX2-LABEL: shuffle_v16i16_20_04_21_05_22_06_23_15_28_12_29_13_30_14_31_15:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
-; XOPAVX2-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
-; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; XOPAVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7]
+; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,3,3]
+; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,0,1,u,u,2,3,u,u,4,5,u,u,14,15,u,u,16,17,u,u,18,19,u,u,20,21,u,u,22,23]
; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
; XOPAVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 20, i32 4, i32 21, i32 5, i32 22, i32 6, i32 23, i32 15, i32 28, i32 12, i32 29, i32 13, i32 30, i32 14, i32 31, i32 15>
;
; AVX2-FAST-LABEL: PR24935:
; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm1[8,9],zero,zero,zero,zero,ymm1[14,15,12,13,0,1,24,25,24,25],zero,zero,ymm1[24,25,16,17,30,31,28,29,16,17]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,4,5],zero,zero,ymm1[10,11,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-FAST-NEXT: vpor %ymm2, %ymm1, %ymm1
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[u,u,2,3,2,3,u,u,10,11,u,u,u,u,u,u,u,u,18,19,18,19,u,u,26,27,u,u,u,u,u,u]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,6,7,u,u,18,19,u,u,u,u,u,u,u,u,24,25,16,17,u,u]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2],ymm0[3],ymm2[4],ymm0[5,6,7,8],ymm2[9,10],ymm0[11],ymm2[12],ymm0[13,14,15]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,255,255,255,255,255,255,0,0,255,255,0,0,0,0,255,255,255,255,0,0,0,0,0,0,255,255]
-; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,5,u,u,0,4,6,2>
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[6,7],zero,zero,ymm0[18,19,22,23],zero,zero,zero,zero,ymm0[26,27,28,29,16,17],zero,zero
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <5,6,3,0,0,6,4,u>
+; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[2,3,0,1],zero,zero,ymm1[6,7,0,1,10,11],zero,zero,ymm1[12,13],zero,zero,zero,zero,ymm1[16,17,20,21],zero,zero,zero,zero,zero,zero,ymm1[24,25]
+; AVX2-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: PR24935:
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastb %xmm0, %ymm1
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX2-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
-; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,3]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: retq
;
; AVX512VLBW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX512VLBW-NEXT: vpslldq {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[16]
-; AVX512VLBW-NEXT: vpbroadcastb %xmm0, %ymm0
-; AVX512VLBW-NEXT: movl $-2147450880, %eax # imm = 0x80008000
-; AVX512VLBW-NEXT: kmovd %eax, %k1
-; AVX512VLBW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1}
+; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,3]
+; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
;
; XOPAVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpbroadcastb %xmm0, %ymm1
-; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; XOPAVX2-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16]
-; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
-; XOPAVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,3]
+; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; XOPAVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_17_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,255,u,u,u,u,u,u,u,u,u,u,u,u,u,u,255,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,3]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: retq
;
; AVX512VLBW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_17_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX512VLBW-NEXT: movl $1, %eax
-; AVX512VLBW-NEXT: kmovd %eax, %k1
-; AVX512VLBW-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1}
-; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,3]
+; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_17_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
;
; XOPAVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_17_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,255,u,u,u,u,u,u,u,u,u,u,u,u,u,u,255,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; XOPAVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,3]
+; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; XOPAVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 17, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_18_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[0,1],ymm0[0,1]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,3]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,0,10,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: retq
;
; AVX512VLBW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_18_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm1 = <0,9,u,u,u,u,u,u,0,u,u,u,u,u,u,u>
-; AVX512VLBW-NEXT: vpermw %ymm0, %ymm1, %ymm0
-; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,3]
+; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,0,10,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_18_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
;
; XOPAVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_18_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
-; XOPAVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[0,1],ymm0[0,1]
-; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,3]
+; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,0,10,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; XOPAVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 18, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_19_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[0,1],ymm0[0,1]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,3]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,11,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: retq
;
; AVX512VLBW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_19_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm1 = <0,9,u,u,u,u,u,u,0,u,u,u,u,u,u,u>
-; AVX512VLBW-NEXT: vpermw %ymm0, %ymm1, %ymm0
-; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,3]
+; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,11,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_19_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
;
; XOPAVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_19_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
-; XOPAVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[0,1],ymm0[0,1]
-; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,3]
+; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,11,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; XOPAVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 19, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_20_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_20_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,5,u,u,0,u,u,u>
-; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX2-FAST-NEXT: retq
-;
-; AVX512VLBW-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_20_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX512VLBW-SLOW: # %bb.0:
-; AVX512VLBW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX512VLBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
-; AVX512VLBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VLBW-SLOW-NEXT: retq
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_20_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,3]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,12,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT: retq
;
-; AVX512VLBW-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_20_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX512VLBW-FAST: # %bb.0:
-; AVX512VLBW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,5,u,u,0,u,u,u>
-; AVX512VLBW-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX512VLBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VLBW-FAST-NEXT: retq
+; AVX512VLBW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_20_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,3]
+; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,12,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_20_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VLVBMI: # %bb.0:
;
; XOPAVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_20_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; XOPAVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
-; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,3]
+; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,12,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; XOPAVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 20, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_21_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_21_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,5,u,u,0,u,u,u>
-; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX2-FAST-NEXT: retq
-;
-; AVX512VLBW-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_21_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX512VLBW-SLOW: # %bb.0:
-; AVX512VLBW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX512VLBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
-; AVX512VLBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VLBW-SLOW-NEXT: retq
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_21_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,3]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,13,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT: retq
;
-; AVX512VLBW-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_21_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX512VLBW-FAST: # %bb.0:
-; AVX512VLBW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,5,u,u,0,u,u,u>
-; AVX512VLBW-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX512VLBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VLBW-FAST-NEXT: retq
+; AVX512VLBW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_21_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,3]
+; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,13,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_21_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VLVBMI: # %bb.0:
;
; XOPAVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_21_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; XOPAVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
-; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,3]
+; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,13,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; XOPAVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 21, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_22_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_22_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,5,u,u,0,u,u,u>
-; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX2-FAST-NEXT: retq
-;
-; AVX512VLBW-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_22_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX512VLBW-SLOW: # %bb.0:
-; AVX512VLBW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX512VLBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
-; AVX512VLBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VLBW-SLOW-NEXT: retq
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_22_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,3]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,14,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT: retq
;
-; AVX512VLBW-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_22_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX512VLBW-FAST: # %bb.0:
-; AVX512VLBW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,5,u,u,0,u,u,u>
-; AVX512VLBW-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX512VLBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VLBW-FAST-NEXT: retq
+; AVX512VLBW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_22_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,3]
+; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,14,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_22_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VLVBMI: # %bb.0:
;
; XOPAVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_22_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; XOPAVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
-; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,3]
+; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,14,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; XOPAVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 22, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_23_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_23_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,5,u,u,0,u,u,u>
-; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX2-FAST-NEXT: retq
-;
-; AVX512VLBW-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_23_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX512VLBW-SLOW: # %bb.0:
-; AVX512VLBW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX512VLBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
-; AVX512VLBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VLBW-SLOW-NEXT: retq
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_23_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,3]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,15,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT: retq
;
-; AVX512VLBW-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_23_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX512VLBW-FAST: # %bb.0:
-; AVX512VLBW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,5,u,u,0,u,u,u>
-; AVX512VLBW-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX512VLBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VLBW-FAST-NEXT: retq
+; AVX512VLBW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_23_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,3]
+; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,15,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_23_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VLVBMI: # %bb.0:
;
; XOPAVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_23_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; XOPAVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
-; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,3]
+; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,15,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; XOPAVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 23, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,3]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: retq
;
; AVX512VLBW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
+; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,3]
; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512VLBW-NEXT: retq
;
;
; XOPAVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
+; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,3]
; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; XOPAVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 24, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,3]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: retq
;
; AVX512VLBW-LABEL: shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
+; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,3]
; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512VLBW-NEXT: retq
;
;
; XOPAVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
+; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,3]
; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; XOPAVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 25, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,3]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: retq
;
; AVX512VLBW-LABEL: shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
+; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,3]
; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512VLBW-NEXT: retq
;
;
; XOPAVX2-LABEL: shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
+; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,3]
; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; XOPAVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 26, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,3]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,11,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: retq
;
; AVX512VLBW-LABEL: shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
+; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,3]
; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,11,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512VLBW-NEXT: retq
;
;
; XOPAVX2-LABEL: shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
+; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,3]
; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,11,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; XOPAVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 27, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,3]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: retq
;
; AVX512VLBW-LABEL: shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
+; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,3]
; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512VLBW-NEXT: retq
;
;
; XOPAVX2-LABEL: shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
+; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,3]
; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; XOPAVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 28, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
;
; AVX2-LABEL: shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,3]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: retq
;
; AVX512VLBW-LABEL: shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
+; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,3]
; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512VLBW-NEXT: retq
;
;
; XOPAVX2-LABEL: shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
+; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,3]
; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; XOPAVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 29, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
;
; AVX2-LABEL: shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,3]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: retq
;
; AVX512VLBW-LABEL: shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
+; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,3]
; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512VLBW-NEXT: retq
;
;
; XOPAVX2-LABEL: shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
+; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,3]
; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; XOPAVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 30, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
;
; AVX2-LABEL: shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,0,0,0]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
-; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,0,0,3]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: retq
;
; AVX512VLBW-LABEL: shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm1 = [15,0,0,0]
-; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
-; AVX512VLBW-NEXT: vpshufb %ymm1, %ymm0, %ymm0
+; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,0,0,3]
+; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI-LABEL: shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
;
; XOPAVX2-LABEL: shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,0,0,0]
-; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
-; XOPAVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm0
+; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,0,0,3]
+; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; XOPAVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 31, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm1[10,13],zero,zero,ymm1[3,3],zero,ymm1[8],zero,zero,zero,ymm1[12,1],zero,zero,zero,zero,zero,ymm1[20],zero,ymm1[17,22],zero,zero,ymm1[16],zero,ymm1[27],zero,zero,zero,zero,zero
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,ymm1[u,u],zero,zero,ymm1[12],zero,ymm1[u,u,u],zero,zero,ymm1[u,0,3,u,u],zero,ymm1[u],zero,zero,ymm1[21,16],zero,ymm1[26],zero,ymm1[u,20,18,20,23]
-; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[u,u,12,13,u,u,u,u,u,u,u,u,u,12,u,u,20,19,u,19,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,1,6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,23,u,u,u,u]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4,5],ymm0[6],ymm2[7]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,255,255,0,0,0,255,255,0,255,255,0,0,255,0,255,255,255,255,255,255,255,0,255,255,255,255]
-; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm1[10,13],zero,zero,ymm1[3,3],zero,ymm1[8],zero,zero,zero,ymm1[12,1],zero,zero,zero,zero,zero,ymm1[20],zero,ymm1[17,22],zero,zero,ymm1[16],zero,ymm1[27],zero,zero,zero,zero,zero
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,ymm1[u,u],zero,zero,ymm1[12],zero,ymm1[u,u,u],zero,zero,ymm1[u,0,3,u,u],zero,ymm1[u],zero,zero,ymm1[21,16],zero,ymm1[26],zero,ymm1[u,20,18,20,23]
+; AVX2-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm1
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[u,u,12,13,u,u,u,u,u,u,u,u,u,12,u,u,20,19,u,19,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,1,6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,23,u,u,u,u]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4,5],ymm0[6],ymm2[7]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,255,255,0,0,0,255,255,0,255,255,0,0,255,0,255,255,255,255,255,255,255,0,255,255,255,255]
+; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: retq
;
-; AVX512VLBW-LABEL: shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39:
-; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[u,u,12,13,u,u,u,u,u,u,u,u,u,12,u,u,20,19,u,19,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,1,6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,23,u,u,u,u]
-; AVX512VLBW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm0[2],ymm2[3,4,5],ymm0[6],ymm2[7]
-; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[10,13,u,u,3,3],zero,ymm1[8,u,u,u,12,1,u],zero,zero,ymm1[u,u,20,u,17,22],zero,zero,ymm1[16],zero,ymm1[27,u],zero,zero,zero,zero
-; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
-; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,ymm1[12],zero,zero,zero,zero,zero,zero,zero,ymm1[0,3],zero,zero,zero,zero,zero,zero,ymm1[21,16],zero,ymm1[26],zero,zero,ymm1[20,18,20,23]
-; AVX512VLBW-NEXT: vpor %ymm0, %ymm1, %ymm0
-; AVX512VLBW-NEXT: movl $134948620, %eax # imm = 0x80B270C
-; AVX512VLBW-NEXT: kmovd %eax, %k1
-; AVX512VLBW-NEXT: vmovdqu8 %ymm2, %ymm0 {%k1}
-; AVX512VLBW-NEXT: retq
+; AVX2-FAST-LABEL: shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm1[10,13],zero,zero,ymm1[3,3],zero,ymm1[8],zero,zero,zero,ymm1[12,1],zero,zero,zero,zero,zero,ymm1[20],zero,ymm1[17,22],zero,zero,ymm1[16],zero,ymm1[27],zero,zero,zero,zero,zero
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,ymm1[u,u],zero,zero,ymm1[12],zero,ymm1[u,u,u],zero,zero,ymm1[u,0,3,u,u],zero,ymm1[u],zero,zero,ymm1[21,16],zero,ymm1[26],zero,ymm1[u,20,18,20,23]
+; AVX2-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <3,4,5,7,5,4,1,u>
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,0,1,u,u,u,u,5,10,13,u,u,0,u,u,16,23,u,23,u,u,u,u,u,u,u,27,u,u,u,u]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,255,255,0,0,0,255,255,0,255,255,0,0,255,0,255,255,255,255,255,255,255,0,255,255,255,255]
+; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VLBW-SLOW-LABEL: shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39:
+; AVX512VLBW-SLOW: # %bb.0:
+; AVX512VLBW-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[u,u,12,13,u,u,u,u,u,u,u,u,u,12,u,u,20,19,u,19,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512VLBW-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512VLBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,1,6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,23,u,u,u,u]
+; AVX512VLBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm0[2],ymm2[3,4,5],ymm0[6],ymm2[7]
+; AVX512VLBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[10,13,u,u,3,3],zero,ymm1[8,u,u,u,12,1,u],zero,zero,ymm1[u,u,20,u,17,22],zero,zero,ymm1[16],zero,ymm1[27,u],zero,zero,zero,zero
+; AVX512VLBW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
+; AVX512VLBW-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,ymm1[12],zero,zero,zero,zero,zero,zero,zero,ymm1[0,3],zero,zero,zero,zero,zero,zero,ymm1[21,16],zero,ymm1[26],zero,zero,ymm1[20,18,20,23]
+; AVX512VLBW-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0
+; AVX512VLBW-SLOW-NEXT: movl $134948620, %eax # imm = 0x80B270C
+; AVX512VLBW-SLOW-NEXT: kmovd %eax, %k1
+; AVX512VLBW-SLOW-NEXT: vmovdqu8 %ymm2, %ymm0 {%k1}
+; AVX512VLBW-SLOW-NEXT: retq
+;
+; AVX512VLBW-FAST-LABEL: shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39:
+; AVX512VLBW-FAST: # %bb.0:
+; AVX512VLBW-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <3,4,5,7,5,4,1,u>
+; AVX512VLBW-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm2
+; AVX512VLBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[10,13,u,u,3,3],zero,ymm1[8,u,u,u,12,1,u],zero,zero,ymm1[u,u,20,u,17,22],zero,zero,ymm1[16],zero,ymm1[27,u],zero,zero,zero,zero
+; AVX512VLBW-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
+; AVX512VLBW-FAST-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,ymm1[12],zero,zero,zero,zero,zero,zero,zero,ymm1[0,3],zero,zero,zero,zero,zero,zero,ymm1[21,16],zero,ymm1[26],zero,zero,ymm1[20,18,20,23]
+; AVX512VLBW-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0
+; AVX512VLBW-FAST-NEXT: movl $134948620, %eax # imm = 0x80B270C
+; AVX512VLBW-FAST-NEXT: kmovd %eax, %k1
+; AVX512VLBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[u,u,0,1,u,u,u,u,5,10,13,u,u,0,u,u,16,23,u,23,u,u,u,u,u,u,u,27,u,u,u,u]
+; AVX512VLBW-FAST-NEXT: retq
+
;
; AVX512VLVBMI-LABEL: shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39:
; AVX512VLVBMI: # %bb.0:
;
; AVX2-LABEL: shuffle_v32i8_00_01_16_17_02_03_18_19_04_05_20_21_06_07_22_23_08_09_24_25_10_11_26_27_12_13_28_29_14_15_30_31:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1],zero,zero,ymm0[2,3],zero,zero,ymm0[4,5],zero,zero,ymm0[6,7],zero,zero,zero,zero,ymm0[24,25],zero,zero,ymm0[26,27],zero,zero,ymm0[28,29],zero,zero,ymm0[30,31]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[0,1],zero,zero,ymm0[2,3],zero,zero,ymm0[4,5],zero,zero,ymm0[6,7,24,25],zero,zero,ymm0[26,27],zero,zero,ymm0[28,29],zero,zero,ymm0[30,31],zero,zero
-; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15,16,17,24,25,18,19,26,27,20,21,28,29,22,23,30,31]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_00_01_16_17_02_03_18_19_04_05_20_21_06_07_22_23_08_09_24_25_10_11_26_27_12_13_28_29_14_15_30_31:
;
; XOPAVX2-LABEL: shuffle_v32i8_00_01_16_17_02_03_18_19_04_05_20_21_06_07_22_23_08_09_24_25_10_11_26_27_12_13_28_29_14_15_30_31:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1],zero,zero,ymm0[2,3],zero,zero,ymm0[4,5],zero,zero,ymm0[6,7],zero,zero,zero,zero,ymm0[24,25],zero,zero,ymm0[26,27],zero,zero,ymm0[28,29],zero,zero,ymm0[30,31]
-; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[0,1],zero,zero,ymm0[2,3],zero,zero,ymm0[4,5],zero,zero,ymm0[6,7,24,25],zero,zero,ymm0[26,27],zero,zero,ymm0[28,29],zero,zero,ymm0[30,31],zero,zero
-; XOPAVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
+; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15,16,17,24,25,18,19,26,27,20,21,28,29,22,23,30,31]
; XOPAVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 1, i32 16, i32 17, i32 2, i32 3, i32 18, i32 19, i32 4, i32 5, i32 20, i32 21, i32 6, i32 7, i32 22, i32 23, i32 8, i32 9, i32 24, i32 25, i32 10, i32 11, i32 26, i32 27, i32 12, i32 13, i32 28, i32 29, i32 14, i32 15, i32 30, i32 31>
ret <32 x i8> %shuffle
;
; AVX2-LABEL: PR47262:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4],zero,zero,ymm0[1,5],zero,zero,ymm0[2,6],zero,zero,ymm0[3,7],zero,zero,zero,zero,ymm0[24,28],zero,zero,ymm0[25,29],zero,zero,ymm0[26,30],zero,zero,ymm0[27,31]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[0,4],zero,zero,ymm0[1,5],zero,zero,ymm0[2,6],zero,zero,ymm0[3,7,24,28],zero,zero,ymm0[25,29],zero,zero,ymm0[26,30],zero,zero,ymm0[27,31],zero,zero
-; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15,16,20,24,28,17,21,25,29,18,22,26,30,19,23,27,31]
; AVX2-NEXT: retq
;
; AVX512VLBW-LABEL: PR47262:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4,u,u,1,5,u,u,2,6,u,u,3,7,u,u,u,u,24,28,u,u,25,29,u,u,26,30,u,u,27,31]
-; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,0,4,u,u,1,5,u,u,2,6,u,u,3,7,24,28,u,u,25,29,u,u,26,30,u,u,27,31,u,u]
-; AVX512VLBW-NEXT: movw $21930, %ax # imm = 0x55AA
-; AVX512VLBW-NEXT: kmovd %eax, %k1
-; AVX512VLBW-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1}
-; AVX512VLBW-NEXT: vmovdqa %ymm1, %ymm0
+; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15,16,20,24,28,17,21,25,29,18,22,26,30,19,23,27,31]
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI-LABEL: PR47262:
;
; XOPAVX2-LABEL: PR47262:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4],zero,zero,ymm0[1,5],zero,zero,ymm0[2,6],zero,zero,ymm0[3,7],zero,zero,zero,zero,ymm0[24,28],zero,zero,ymm0[25,29],zero,zero,ymm0[26,30],zero,zero,ymm0[27,31]
-; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[0,4],zero,zero,ymm0[1,5],zero,zero,ymm0[2,6],zero,zero,ymm0[3,7,24,28],zero,zero,ymm0[25,29],zero,zero,ymm0[26,30],zero,zero,ymm0[27,31],zero,zero
-; XOPAVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
+; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15,16,20,24,28,17,21,25,29,18,22,26,30,19,23,27,31]
; XOPAVX2-NEXT: retq
%t1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
%t2 = bitcast <4 x i64> %t1 to <32 x i8>
; KNL-LABEL: shuffle_v32i16_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_38:
; KNL: ## %bb.0:
; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; KNL-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
-; KNL-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6],ymm3[7],ymm2[8,9,10,11],ymm3[12,13],ymm2[14],ymm3[15]
-; KNL-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[u,u,14,15,u,u,12,13,u,u,10,11,u,u,8,9,u,u,22,23,u,u,20,21,u,u,18,19,u,u,u,u]
-; KNL-NEXT: vextracti128 $1, %ymm0, %xmm4
-; KNL-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7],ymm0[8,9,10,11,12],ymm4[13,14,15]
-; KNL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15,u,u,12,13,u,u,10,11,u,u,8,9,u,u,22,23,u,u,20,21,u,u,18,19,u,u,16,17,u,u]
+; KNL-NEXT: vpermq {{.*#+}} ymm2 = ymm2[3,1,2,3]
+; KNL-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[u,u,6,7,u,u,12,13,u,u,2,3,u,u,0,1,u,u,22,23,u,u,20,21,u,u,18,19,u,u,u,u]
+; KNL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,1,2,3]
+; KNL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,u,u,4,5,u,u,2,3,u,u,8,9,u,u,22,23,u,u,20,21,u,u,18,19,u,u,16,17,u,u]
; KNL-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7],ymm0[8],ymm3[9],ymm0[10],ymm3[11],ymm0[12],ymm3[13],ymm0[14],ymm3[15]
; KNL-NEXT: vextracti32x4 $3, %zmm1, %xmm1
; KNL-NEXT: vpbroadcastw %xmm1, %ymm1
; KNL-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7],ymm3[8,9,10,11,12,13,14],ymm1[15]
; KNL-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
-; KNL-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,14,15,u,u,12,13,u,u,10,11,u,u,8,9,u,u,22,23,u,u,20,21,u,u,18,19,u,u,16,17]
+; KNL-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,6,7,u,u,12,13,u,u,2,3,u,u,0,1,u,u,22,23,u,u,20,21,u,u,18,19,u,u,16,17]
; KNL-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
; KNL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; KNL-NEXT: retq
define <32 x i8> @PR27320(<8 x i32> %a0) {
; CHECK-LABEL: PR27320:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1]
-; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,1,2,3,4,4,5,6,7,7,8,9,10,10,11,28,29,29,30,31,16,16,17,18,19,19,20,21,22,22,23]
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,2]
+; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,1,2,3,4,4,5,6,7,7,8,9,10,10,11,20,21,21,22,23,24,24,25,26,27,27,28,29,30,30,31]
; CHECK-NEXT: ret{{[l|q]}}
%1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 undef, i32 3, i32 4, i32 5, i32 undef>
%2 = bitcast <8 x i32> %1 to <32 x i8>
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
-; AVX2-SLOW-LABEL: PR45604:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm0
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,0,0,0,0,0,0,0,0]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm1[0,2,1,3,4,5,6,7,8,10,9,11,12,13,14,15]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <u,u,u,u,255,255,0,0,u,u,u,u,255,255,0,0,u,u,u,u,0,0,255,255,u,u,u,u,0,0,255,255>
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [0,0,0,0,0,0,0,0,11,11,11,11,11,11,11,11]
-; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm2, %ymm4, %ymm2
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm0[0,2,1,3,4,5,6,7,8,10,9,11,12,13,14,15]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,1,1,3,4,5,5,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm0[2,3,0,1]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm6[2,0,3,1,4,5,6,7,10,8,11,9,12,13,14,15]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,1,1,3,4,5,5,7]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,0,0,u,u,u,u,255,255,0,0,u,u,u,u,0,0,255,255,u,u,u,u,0,0,255,255,u,u,u,u>
-; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm5, %ymm7, %ymm5
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,6,5,7,8,9,10,11,12,14,13,15]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm4, %ymm1
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,5,7,8,9,10,11,12,14,13,15]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm6[0,1,2,3,6,4,7,5,8,9,10,11,14,12,15,13]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7]
-; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm0, %ymm3, %ymm0
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[2],ymm2[3],ymm5[4],ymm2[5],ymm5[6],ymm2[7]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
-; AVX2-SLOW-NEXT: vmovdqu %ymm0, 32(%rdi)
-; AVX2-SLOW-NEXT: vmovdqu %ymm2, (%rdi)
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: PR45604:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <u,u,u,u,255,255,0,0,u,u,u,u,255,255,0,0,u,u,u,u,0,0,255,255,u,u,u,u,0,0,255,255>
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,11,11,11,11,11,11,11,11]
-; AVX2-FAST-NEXT: vpblendvb %ymm0, {{.*}}(%rip), %ymm1, %ymm0
-; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm1
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm1[0,1],zero,zero,ymm1[u,u,u,u,2,3],zero,zero,ymm1[u,u,u,u],zero,zero,ymm1[20,21,u,u,u,u],zero,zero,ymm1[22,23,u,u,u,u]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1]
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm3[0,1,u,u,u,u],zero,zero,ymm3[2,3,u,u,u,u,20,21],zero,zero,ymm3[u,u,u,u,22,23],zero,zero,ymm3[u,u,u,u]
-; AVX2-FAST-NEXT: vpor %ymm4, %ymm2, %ymm2
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4],ymm0[5],ymm2[6],ymm0[7]
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[8,9],zero,zero,ymm1[u,u,u,u,10,11],zero,zero,ymm1[u,u,u,u],zero,zero,ymm1[28,29,u,u,u,u],zero,zero,ymm1[30,31,u,u,u,u]
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[8,9,u,u,u,u],zero,zero,ymm3[10,11,u,u,u,u,28,29],zero,zero,ymm3[u,u,u,u,30,31],zero,zero,ymm3[u,u,u,u]
-; AVX2-FAST-NEXT: vpor %ymm3, %ymm1, %ymm1
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
-; AVX2-FAST-NEXT: vmovdqu %ymm0, 32(%rdi)
-; AVX2-FAST-NEXT: vmovdqu %ymm2, (%rdi)
-; AVX2-FAST-NEXT: vzeroupper
-; AVX2-FAST-NEXT: retq
+; AVX2-LABEL: PR45604:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqa (%rsi), %xmm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,0,2]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,4,5,12,13,u,u,u,u,6,7,14,15,u,u,u,u>
+; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <u,u,u,u,11,0,0,0,u,u,u,u,11,0,0,0,u,u,u,u,11,0,0,0,u,u,u,u,11,0,0,0>
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2],ymm3[3],ymm1[4],ymm3[5],ymm1[6],ymm3[7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,1,3]
+; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7]
+; AVX2-NEXT: vmovdqu %ymm0, 32(%rdi)
+; AVX2-NEXT: vmovdqu %ymm1, (%rdi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
%v1 = load <8 x i16>, <8 x i16>* %src, align 16
%v2 = shufflevector <8 x i16> %v1, <8 x i16> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%v3 = shufflevector <16 x i16> %v2, <16 x i16> <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>