SDValue DAGTypeLegalizer::PromoteIntRes_EXTRACT_SUBVECTOR(SDNode *N) {
+ SDValue InOp0 = N->getOperand(0);
+ EVT InVT = InOp0.getValueType();
EVT OutVT = N->getValueType(0);
EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT);
SDLoc dl(N);
SDValue BaseIdx = N->getOperand(1);
- SDValue InOp0 = N->getOperand(0);
- if (getTypeAction(InOp0.getValueType()) == TargetLowering::TypePromoteInteger)
- InOp0 = GetPromotedInteger(N->getOperand(0));
-
- EVT InVT = InOp0.getValueType();
-
SmallVector<SDValue, 8> Ops;
Ops.reserve(OutNumElems);
for (unsigned i = 0; i != OutNumElems; ++i) {
SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
InVT.getVectorElementType(), N->getOperand(0), Index);
- SDValue Op = DAG.getAnyExtOrTrunc(Ext, dl, NOutVTElem);
+ SDValue Op = DAG.getNode(ISD::ANY_EXTEND, dl, NOutVTElem, Ext);
// Insert the converted element to the new vector.
Ops.push_back(Op);
}
setTargetDAGCombine(ISD::ANY_EXTEND);
setTargetDAGCombine(ISD::SIGN_EXTEND);
setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
- setTargetDAGCombine(ISD::ANY_EXTEND_VECTOR_INREG);
setTargetDAGCombine(ISD::SINT_TO_FP);
setTargetDAGCombine(ISD::UINT_TO_FP);
setTargetDAGCombine(ISD::SETCC);
V2 = getZeroVector(VT, Subtarget, DAG, DL);
switch (VT.SimpleTy) {
+ case MVT::v2f64:
+ case MVT::v4f32:
+ case MVT::v4f64:
+ case MVT::v8f32:
+ return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
+ DAG.getConstant(BlendMask, DL, MVT::i8));
case MVT::v4i64:
case MVT::v8i32:
assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
LLVM_FALLTHROUGH;
- case MVT::v4f64:
- case MVT::v8f32:
- assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
- LLVM_FALLTHROUGH;
- case MVT::v2f64:
case MVT::v2i64:
- case MVT::v4f32:
case MVT::v4i32:
- case MVT::v8i16:
- assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
- return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
- DAG.getConstant(BlendMask, DL, MVT::i8));
+ // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
+ // that instruction.
+ if (Subtarget.hasAVX2()) {
+ // Scale the blend by the number of 32-bit dwords per element.
+ int Scale = VT.getScalarSizeInBits() / 32;
+ BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
+ MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
+ V1 = DAG.getBitcast(BlendVT, V1);
+ V2 = DAG.getBitcast(BlendVT, V2);
+ return DAG.getBitcast(
+ VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
+ DAG.getConstant(BlendMask, DL, MVT::i8)));
+ }
+ LLVM_FALLTHROUGH;
+ case MVT::v8i16: {
+ // For integer shuffles we need to expand the mask and cast the inputs to
+ // v8i16s prior to blending.
+ int Scale = 8 / VT.getVectorNumElements();
+ BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
+ V1 = DAG.getBitcast(MVT::v8i16, V1);
+ V2 = DAG.getBitcast(MVT::v8i16, V2);
+ return DAG.getBitcast(VT,
+ DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
+ DAG.getConstant(BlendMask, DL, MVT::i8)));
+ }
case MVT::v16i16: {
- assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
+ assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
SmallVector<int, 8> RepeatedMask;
if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
// We can lower these with PBLENDW which is mirrored across 128-bit lanes.
}
LLVM_FALLTHROUGH;
}
- case MVT::v32i8:
- assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
- LLVM_FALLTHROUGH;
- case MVT::v16i8: {
- assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
+ case MVT::v16i8:
+ case MVT::v32i8: {
+ assert((VT.is128BitVector() || Subtarget.hasAVX2()) &&
+ "256-bit byte-blends require AVX2 support!");
// Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
return true;
}
} else {
+ // Determine a type compatible with X86ISD::BLENDI.
+ ShuffleVT = MaskVT;
+ if (Subtarget.hasAVX2()) {
+ if (ShuffleVT == MVT::v4i64)
+ ShuffleVT = MVT::v8i32;
+ else if (ShuffleVT == MVT::v2i64)
+ ShuffleVT = MVT::v4i32;
+ } else {
+ if (ShuffleVT == MVT::v2i64 || ShuffleVT == MVT::v4i32)
+ ShuffleVT = MVT::v8i16;
+ else if (ShuffleVT == MVT::v4i64)
+ ShuffleVT = MVT::v4f64;
+ else if (ShuffleVT == MVT::v8i32)
+ ShuffleVT = MVT::v8f32;
+ }
+
+ if (!ShuffleVT.isFloatingPoint()) {
+ int Scale = EltSizeInBits / ShuffleVT.getScalarSizeInBits();
+ BlendMask =
+ scaleVectorShuffleBlendMask(BlendMask, NumMaskElts, Scale);
+ ShuffleVT = MVT::getIntegerVT(EltSizeInBits / Scale);
+ ShuffleVT = MVT::getVectorVT(ShuffleVT, NumMaskElts * Scale);
+ }
+
V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
PermuteImm = (unsigned)BlendMask;
Shuffle = X86ISD::BLENDI;
- ShuffleVT = MaskVT;
return true;
}
}
return SDValue();
}
- case X86ISD::BLENDI: {
- SDValue N0 = N.getOperand(0);
- SDValue N1 = N.getOperand(1);
-
- // blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.
- // TODO: Handle MVT::v16i16 repeated blend mask.
- if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
- N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) {
- MVT SrcVT = N0.getOperand(0).getSimpleValueType();
- if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0 &&
- SrcVT.getScalarSizeInBits() >= 32) {
- unsigned Mask = N.getConstantOperandVal(2);
- unsigned Size = VT.getVectorNumElements();
- unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
- unsigned ScaleMask = scaleVectorShuffleBlendMask(Mask, Size, Scale);
- return DAG.getBitcast(
- VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
- N1.getOperand(0),
- DAG.getConstant(ScaleMask, DL, MVT::i8)));
- }
- }
- return SDValue();
- }
case X86ISD::PSHUFD:
case X86ISD::PSHUFLW:
case X86ISD::PSHUFHW:
return SDValue();
}
-static SDValue combineExtInVec(SDNode *N, SelectionDAG &DAG) {
- // Disabling for widening legalization for now. We can enable if we find a
- // case that needs it. Otherwise it can be deleted when we switch to
- // widening legalization.
- if (ExperimentalVectorWideningLegalization)
- return SDValue();
-
- EVT VT = N->getValueType(0);
- SDValue In = N->getOperand(0);
-
- // Combine (ext_invec (ext_invec X)) -> (ext_invec X)
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- if (In.getOpcode() == N->getOpcode() &&
- TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getOperand(0).getValueType()))
- return DAG.getNode(N->getOpcode(), SDLoc(N), VT, In.getOperand(0));
-
- return SDValue();
-}
-
SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
- case ISD::ANY_EXTEND_VECTOR_INREG: return combineExtInVec(N, DAG);
case ISD::SETCC: return combineSetCC(N, DAG, Subtarget);
case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
VEX_4V, VEX_L, VEX_WIG;
}
-// Emulate vXi32/vXi64 blends with vXf32/vXf64.
-// ExecutionDomainFixPass will cleanup domains later on.
-let Predicates = [HasAVX] in {
-def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), imm:$src3),
- (VBLENDPDYrri VR256:$src1, VR256:$src2, imm:$src3)>;
-def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), imm:$src3),
- (VBLENDPDYrmi VR256:$src1, addr:$src2, imm:$src3)>;
-def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, imm:$src3),
- (VBLENDPDYrmi VR256:$src1, addr:$src2, (BlendCommuteImm4 imm:$src3))>;
-
-def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), imm:$src3),
- (VBLENDPDrri VR128:$src1, VR128:$src2, imm:$src3)>;
-def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), imm:$src3),
- (VBLENDPDrmi VR128:$src1, addr:$src2, imm:$src3)>;
-def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, imm:$src3),
- (VBLENDPDrmi VR128:$src1, addr:$src2, (BlendCommuteImm2 imm:$src3))>;
-}
-
-let Predicates = [HasAVX1Only] in {
-def : Pat<(X86Blendi (v8i32 VR256:$src1), (v8i32 VR256:$src2), imm:$src3),
- (VBLENDPSYrri VR256:$src1, VR256:$src2, imm:$src3)>;
-def : Pat<(X86Blendi VR256:$src1, (loadv8i32 addr:$src2), imm:$src3),
- (VBLENDPSYrmi VR256:$src1, addr:$src2, imm:$src3)>;
-def : Pat<(X86Blendi (loadv8i32 addr:$src2), VR256:$src1, imm:$src3),
- (VBLENDPSYrmi VR256:$src1, addr:$src2, (BlendCommuteImm8 imm:$src3))>;
-
-def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), imm:$src3),
- (VBLENDPSrri VR128:$src1, VR128:$src2, imm:$src3)>;
-def : Pat<(X86Blendi VR128:$src1, (loadv4i32 addr:$src2), imm:$src3),
- (VBLENDPSrmi VR128:$src1, addr:$src2, imm:$src3)>;
-def : Pat<(X86Blendi (loadv4i32 addr:$src2), VR128:$src1, imm:$src3),
- (VBLENDPSrmi VR128:$src1, addr:$src2, (BlendCommuteImm4 imm:$src3))>;
-}
-
defm BLENDPS : SS41I_blend_rmi<0x0C, "blendps", X86Blendi, v4f32,
VR128, memop, f128mem, 1, SSEPackedSingle,
SchedWriteFBlend.XMM, BlendCommuteImm4>;
VR128, memop, i128mem, 1, SSEPackedInt,
SchedWriteBlend.XMM, BlendCommuteImm8>;
-let Predicates = [UseSSE41] in {
-def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), imm:$src3),
- (BLENDPDrri VR128:$src1, VR128:$src2, imm:$src3)>;
-def : Pat<(X86Blendi VR128:$src1, (memopv2i64 addr:$src2), imm:$src3),
- (BLENDPDrmi VR128:$src1, addr:$src2, imm:$src3)>;
-def : Pat<(X86Blendi (memopv2i64 addr:$src2), VR128:$src1, imm:$src3),
- (BLENDPDrmi VR128:$src1, addr:$src2, (BlendCommuteImm2 imm:$src3))>;
-
-def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), imm:$src3),
- (BLENDPSrri VR128:$src1, VR128:$src2, imm:$src3)>;
-def : Pat<(X86Blendi VR128:$src1, (memopv4i32 addr:$src2), imm:$src3),
- (BLENDPSrmi VR128:$src1, addr:$src2, imm:$src3)>;
-def : Pat<(X86Blendi (memopv4i32 addr:$src2), VR128:$src1, imm:$src3),
- (BLENDPSrmi VR128:$src1, addr:$src2, (BlendCommuteImm4 imm:$src3))>;
-}
-
// For insertion into the zero index (low half) of a 256-bit vector, it is
// more efficient to generate a blend with immediate instead of an insert*128.
let Predicates = [HasAVX] in {
(VBLENDPSYrri VR256:$src1,
(INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
VR128:$src2, sub_xmm), 0xf)>;
-
-def : Pat<(insert_subvector (loadv4f64 addr:$src2), (v2f64 VR128:$src1), (iPTR 0)),
- (VBLENDPDYrmi (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
- VR128:$src1, sub_xmm), addr:$src2, 0xc)>;
-def : Pat<(insert_subvector (loadv8f32 addr:$src2), (v4f32 VR128:$src1), (iPTR 0)),
- (VBLENDPSYrmi (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
- VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
}
/// SS41I_quaternary_vx - AVX SSE 4.1 with 4 operators
(VPBLENDDYrri VR256:$src1,
(INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
VR128:$src2, sub_xmm), 0xf)>;
-
-def : Pat<(insert_subvector (loadv8i32 addr:$src2), (v4i32 VR128:$src1), (iPTR 0)),
- (VPBLENDDYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
- VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
-def : Pat<(insert_subvector (loadv4i64 addr:$src2), (v2i64 VR128:$src1), (iPTR 0)),
- (VPBLENDDYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
- VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
-def : Pat<(insert_subvector (loadv16i16 addr:$src2), (v8i16 VR128:$src1), (iPTR 0)),
- (VPBLENDDYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
- VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
-def : Pat<(insert_subvector (loadv32i8 addr:$src2), (v16i8 VR128:$src1), (iPTR 0)),
- (VPBLENDDYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
- VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
}
let Predicates = [HasAVX1Only] in {
(VBLENDPSYrri VR256:$src1,
(INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
VR128:$src2, sub_xmm), 0xf)>;
-
-def : Pat<(insert_subvector (loadv8i32 addr:$src2), (v4i32 VR128:$src1), (iPTR 0)),
- (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
- VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
-def : Pat<(insert_subvector (loadv4i64 addr:$src2), (v2i64 VR128:$src1), (iPTR 0)),
- (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
- VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
-def : Pat<(insert_subvector (loadv16i16 addr:$src2), (v8i16 VR128:$src1), (iPTR 0)),
- (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
- VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
-def : Pat<(insert_subvector (loadv32i8 addr:$src2), (v16i8 VR128:$src1), (iPTR 0)),
- (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
- VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
}
//===----------------------------------------------------------------------===//
define void @foo8(float* nocapture %RET) nounwind {
; CHECK-LABEL: foo8:
; CHECK: ## %bb.0: ## %allocas
-; CHECK-NEXT: movaps {{.*#+}} xmm0 = [1.0E+2,2.0E+0,1.0E+2,4.0E+0]
-; CHECK-NEXT: movaps {{.*#+}} xmm1 = [1.0E+2,6.0E+0,1.0E+2,8.0E+0]
+; CHECK-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; CHECK-NEXT: pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; CHECK-NEXT: cvtdq2ps %xmm0, %xmm0
+; CHECK-NEXT: cvtdq2ps %xmm1, %xmm1
; CHECK-NEXT: movups %xmm1, 16(%rdi)
; CHECK-NEXT: movups %xmm0, (%rdi)
; CHECK-NEXT: retq
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -disable-peephole -mtriple=i686-unknown-unknown -mattr=avx | FileCheck %s --check-prefix=X86
-; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=avx | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefix=X64
; Insertion/shuffles of all-zero/all-bits/constants into v8i32->v8f32 sitofp conversion.
define <2 x i64> @test_8xi64_to_2xi64_perm_mem_mask0(<8 x i64>* %vp) {
; CHECK-LABEL: test_8xi64_to_2xi64_perm_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps 32(%rdi), %xmm0
-; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
+; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: vmovaps 32(%rdi), %xmm1
+; CHECK-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; CHECK-NEXT: retq
%vec = load <8 x i64>, <8 x i64>* %vp
%res = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 4, i32 1>
define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mem_mask0(<8 x i64>* %vp, <2 x i64> %vec2, <2 x i64> %mask) {
; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa 32(%rdi), %xmm2
-; CHECK-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
+; CHECK-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; CHECK-NEXT: vmovdqa 32(%rdi), %xmm3
; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
-; CHECK-NEXT: vmovdqa64 %xmm2, %xmm0 {%k1}
+; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 {%k1} = xmm3[0],xmm2[0]
; CHECK-NEXT: retq
%vec = load <8 x i64>, <8 x i64>* %vp
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 4, i32 1>
define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mem_mask0(<8 x i64>* %vp, <2 x i64> %mask) {
; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa 32(%rdi), %xmm1
-; CHECK-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
+; CHECK-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT: vmovdqa 32(%rdi), %xmm2
; CHECK-NEXT: vptestnmq %xmm0, %xmm0, %k1
-; CHECK-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 {%k1} {z} = xmm2[0],xmm1[0]
; CHECK-NEXT: retq
%vec = load <8 x i64>, <8 x i64>* %vp
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 4, i32 1>
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm2
; AVX1-NEXT: vpsrlq $62, %xmm2, %xmm2
; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm2
-; AVX1-NEXT: vpsrlq $2, %xmm2, %xmm2
+; AVX1-NEXT: vpsrlq $2, %xmm2, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,2305843009213693952]
; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsubq %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm5
; AVX1-NEXT: vpsrlq $62, %xmm5, %xmm5
; AVX1-NEXT: vpaddq %xmm5, %xmm0, %xmm5
-; AVX1-NEXT: vpsrlq $2, %xmm5, %xmm5
+; AVX1-NEXT: vpsrlq $2, %xmm5, %xmm6
+; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4,5,6,7]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372036854775808,2305843009213693952]
; AVX1-NEXT: vpxor %xmm6, %xmm5, %xmm5
; AVX1-NEXT: vpsubq %xmm6, %xmm5, %xmm5
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vpsrlq $62, %xmm2, %xmm2
; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm2
-; AVX1-NEXT: vpsrlq $2, %xmm2, %xmm2
+; AVX1-NEXT: vpsrlq $2, %xmm2, %xmm4
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7]
; AVX1-NEXT: vpxor %xmm6, %xmm2, %xmm2
; AVX1-NEXT: vpsubq %xmm6, %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown -mattr=+sse4.1 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.1 | FileCheck %s
define <8 x i16> @commute_fold_pblendw(<8 x i16> %a, <8 x i16>* %b) {
; CHECK-LABEL: commute_fold_pblendw:
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -disable-peephole -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X32SSE --check-prefix=X32SSE2
-; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64SSE --check-prefix=X64SSE2
-; RUN: llc < %s -disable-peephole -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=X32SSE --check-prefix=X32SSE4
-; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=X64SSE --check-prefix=X64SSE4
-; RUN: llc < %s -disable-peephole -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X32AVX --check-prefix=X32AVX1
-; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X64AVX --check-prefix=X64AVX1
-; RUN: llc < %s -disable-peephole -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X32AVX --check-prefix=X32AVX2
-; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X64AVX --check-prefix=X64AVX2
-; RUN: llc < %s -disable-peephole -mtriple=i686-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=X32AVX --check-prefix=X32AVX512F
-; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=X64AVX --check-prefix=X64AVX512F
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X32SSE --check-prefix=X32SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64SSE --check-prefix=X64SSE2
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=X32SSE --check-prefix=X32SSE4
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=X64SSE --check-prefix=X64SSE4
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X32AVX --check-prefix=X32AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X64AVX --check-prefix=X64AVX1
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X32AVX --check-prefix=X32AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X64AVX --check-prefix=X64AVX2
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=X32AVX --check-prefix=X32AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=X64AVX --check-prefix=X64AVX512F
define <16 x i8> @elt0_v16i8(i8 %x) {
; X32SSE2-LABEL: elt0_v16i8:
; AVX2-LABEL: insert_v16i16_x12345x789ABCDEx:
; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
-; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2,3,4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10,11,12,13],ymm1[14],ymm0[15]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm1[6],ymm2[7,8,9,10,11,12,13],ymm1[14],ymm2[15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: retq
; AVX512-LABEL: insert_v16i16_x12345x789ABCDEx:
; AVX512: # %bb.0:
; AVX512-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
-; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2,3,4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10,11,12,13],ymm1[14],ymm0[15]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm1[6],ymm2[7,8,9,10,11,12,13],ymm1[14],ymm2[15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15]
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVX512-NEXT: retq
; X32: # %bb.0:
; X32-NEXT: pushl %eax
; X32-NEXT: vpsrlq $32, %xmm0, %xmm0
-; X32-NEXT: vmovdqa {{.*#+}} xmm1 = [2147483648,0,1,0]
+; X32-NEXT: vmovdqa {{.*#+}} xmm1 = [0,32768,0,0,1,0,0,0]
; X32-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X32-NEXT: vpsubq %xmm1, %xmm0, %xmm0
; X32-NEXT: vcvtdq2ps %xmm0, %xmm0
; X32: # %bb.0:
; X32-NEXT: pushl %eax
; X32-NEXT: vpsrlq $61, %xmm0, %xmm0
-; X32-NEXT: vmovdqa {{.*#+}} xmm1 = [4,0,8,0]
+; X32-NEXT: vmovdqa {{.*#+}} xmm1 = [4,0,0,0,8,0,0,0]
; X32-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X32-NEXT: vpsubq %xmm1, %xmm0, %xmm0
; X32-NEXT: vpsllq $20, %xmm0, %xmm0
; X32: # %bb.0:
; X32-NEXT: pushl %eax
; X32-NEXT: vpsrlq $61, %xmm0, %xmm0
-; X32-NEXT: vmovdqa {{.*#+}} xmm1 = [4,0,8,0]
+; X32-NEXT: vmovdqa {{.*#+}} xmm1 = [4,0,0,0,8,0,0,0]
; X32-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X32-NEXT: vpsubq %xmm1, %xmm0, %xmm0
; X32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT: vpsrlq $60, %xmm0, %xmm2
; X32-NEXT: vpsrlq $61, %xmm0, %xmm0
; X32-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
-; X32-NEXT: vmovdqa {{.*#+}} xmm2 = [4,0,8,0]
+; X32-NEXT: vmovdqa {{.*#+}} xmm2 = [4,0,0,0,8,0,0,0]
; X32-NEXT: vpxor %xmm2, %xmm0, %xmm0
; X32-NEXT: vpsubq %xmm2, %xmm0, %xmm0
; X32-NEXT: vpmovsxdq %xmm1, %xmm1
; X32-NEXT: vpmovsxdq 8(%ebp), %xmm4
; X32-NEXT: vextractf128 $1, %ymm2, %xmm5
; X32-NEXT: vpsrlq $33, %xmm5, %xmm5
-; X32-NEXT: vmovdqa {{.*#+}} xmm6 = [1073741824,0,1,0]
+; X32-NEXT: vmovdqa {{.*#+}} xmm6 = [0,16384,0,0,1,0,0,0]
; X32-NEXT: vpxor %xmm6, %xmm5, %xmm5
; X32-NEXT: vpsubq %xmm6, %xmm5, %xmm5
; X32-NEXT: vpsrlq $33, %xmm2, %xmm2
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=sse2 | FileCheck %s --check-prefixes=SSE,SSE2
-; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42
-; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX1
-; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx2 | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2
-; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
-; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512bw,avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512VLBW
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=sse2 | FileCheck %s --check-prefixes=SSE,SSE2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX1
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx2 | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512bw,avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512VLBW
define <1 x double> @load_v1f64_v1i64(<1 x i64> %trigger, <1 x double>* %addr, <1 x double> %dst) {
; SSE-LABEL: load_v1f64_v1i64:
; SSE42-LABEL: load_v2f32_v2i32:
; SSE42: ## %bb.0:
; SSE42-NEXT: pxor %xmm2, %xmm2
-; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; SSE42-NEXT: pcmpeqq %xmm2, %xmm0
-; SSE42-NEXT: pextrb $0, %xmm0, %eax
+; SSE42-NEXT: movdqa %xmm0, %xmm3
+; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
+; SSE42-NEXT: pcmpeqq %xmm2, %xmm3
+; SSE42-NEXT: pextrb $0, %xmm3, %eax
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: je LBB10_2
; SSE42-NEXT: ## %bb.1: ## %cond.load
-; SSE42-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3,4,5,6,7]
+; SSE42-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3,4,5,6,7]
; SSE42-NEXT: LBB10_2: ## %else
+; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE42-NEXT: pcmpeqq %xmm2, %xmm0
; SSE42-NEXT: pextrb $8, %xmm0, %eax
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: je LBB10_4
; SSE42-LABEL: load_v2i32_v2i32:
; SSE42: ## %bb.0:
; SSE42-NEXT: pxor %xmm2, %xmm2
-; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; SSE42-NEXT: pcmpeqq %xmm2, %xmm0
-; SSE42-NEXT: pextrb $0, %xmm0, %eax
+; SSE42-NEXT: movdqa %xmm0, %xmm3
+; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
+; SSE42-NEXT: pcmpeqq %xmm2, %xmm3
+; SSE42-NEXT: pextrb $0, %xmm3, %eax
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: je LBB11_2
; SSE42-NEXT: ## %bb.1: ## %cond.load
; SSE42-NEXT: movl (%rdi), %eax
; SSE42-NEXT: pinsrq $0, %rax, %xmm1
; SSE42-NEXT: LBB11_2: ## %else
+; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE42-NEXT: pcmpeqq %xmm2, %xmm0
; SSE42-NEXT: pextrb $8, %xmm0, %eax
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: je LBB11_4
; SSE42-LABEL: load_undef_v2f32_v2i32:
; SSE42: ## %bb.0:
; SSE42-NEXT: movdqa %xmm0, %xmm1
-; SSE42-NEXT: pxor %xmm0, %xmm0
-; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
-; SSE42-NEXT: pcmpeqq %xmm0, %xmm1
-; SSE42-NEXT: pextrb $0, %xmm1, %eax
+; SSE42-NEXT: pxor %xmm2, %xmm2
+; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE42-NEXT: pcmpeqq %xmm2, %xmm0
+; SSE42-NEXT: pextrb $0, %xmm0, %eax
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: ## implicit-def: $xmm0
; SSE42-NEXT: je LBB12_2
; SSE42-NEXT: ## %bb.1: ## %cond.load
; SSE42-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE42-NEXT: LBB12_2: ## %else
+; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; SSE42-NEXT: pcmpeqq %xmm2, %xmm1
; SSE42-NEXT: pextrb $8, %xmm1, %eax
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: je LBB12_4
; SSE4-LABEL: store_v2f32_v2i32:
; SSE4: ## %bb.0:
; SSE4-NEXT: pxor %xmm2, %xmm2
-; SSE4-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; SSE4-NEXT: pcmpeqq %xmm2, %xmm0
-; SSE4-NEXT: pextrb $0, %xmm0, %eax
+; SSE4-NEXT: movdqa %xmm0, %xmm3
+; SSE4-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
+; SSE4-NEXT: pcmpeqq %xmm2, %xmm3
+; SSE4-NEXT: pextrb $0, %xmm3, %eax
; SSE4-NEXT: testb $1, %al
; SSE4-NEXT: je LBB3_2
; SSE4-NEXT: ## %bb.1: ## %cond.store
; SSE4-NEXT: movss %xmm1, (%rdi)
; SSE4-NEXT: LBB3_2: ## %else
+; SSE4-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE4-NEXT: pcmpeqq %xmm2, %xmm0
; SSE4-NEXT: pextrb $8, %xmm0, %eax
; SSE4-NEXT: testb $1, %al
; SSE4-NEXT: je LBB3_4
; SSE4-LABEL: store_v2i32_v2i32:
; SSE4: ## %bb.0:
; SSE4-NEXT: pxor %xmm2, %xmm2
-; SSE4-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; SSE4-NEXT: pcmpeqq %xmm2, %xmm0
-; SSE4-NEXT: pextrb $0, %xmm0, %eax
+; SSE4-NEXT: movdqa %xmm0, %xmm3
+; SSE4-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
+; SSE4-NEXT: pcmpeqq %xmm2, %xmm3
+; SSE4-NEXT: pextrb $0, %xmm3, %eax
; SSE4-NEXT: testb $1, %al
; SSE4-NEXT: je LBB4_2
; SSE4-NEXT: ## %bb.1: ## %cond.store
; SSE4-NEXT: movss %xmm1, (%rdi)
; SSE4-NEXT: LBB4_2: ## %else
+; SSE4-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE4-NEXT: pcmpeqq %xmm2, %xmm0
; SSE4-NEXT: pextrb $8, %xmm0, %eax
; SSE4-NEXT: testb $1, %al
; SSE4-NEXT: je LBB4_4
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
-; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512F
;
; Just one 32-bit run to make sure we do reasonable things.
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X32-AVX
; SSE42-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,5]
; SSE42-NEXT: movdqa %xmm0, %xmm4
; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3],xmm1[4],xmm4[5,6],xmm1[7]
-; SSE42-NEXT: pshufb {{.*#+}} xmm4 = xmm4[0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15]
+; SSE42-NEXT: pshufb {{.*#+}} xmm4 = xmm4[0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u]
; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5],xmm3[6,7]
; SSE42-NEXT: movdqa %xmm2, %xmm3
; SSE42-NEXT: pshufb {{.*#+}} xmm3 = xmm3[0,1,6,7,4,5,6,7,0,1,0,1,6,7,12,13]
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,2,1]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,5]
; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u]
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,5],xmm3[6,7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[0,1,6,7,4,5,6,7,0,1,0,1,6,7,12,13]
; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7]
; AVX1: # %bb.0:
; AVX1-NEXT: vmovupd (%rsi), %ymm0
; AVX1-NEXT: vmovupd (%rcx), %ymm1
-; AVX1-NEXT: vmovups (%rdx), %xmm2
-; AVX1-NEXT: vmovups 16(%rdx), %xmm3
-; AVX1-NEXT: vmovups (%rsi), %xmm4
-; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm4[2,0],xmm2[2,0]
-; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm2[1,1],xmm5[0,2]
-; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,0],xmm4[0,0]
-; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0],xmm4[2,1]
+; AVX1-NEXT: vmovups 16(%rcx), %xmm2
+; AVX1-NEXT: vmovups (%rdx), %xmm3
+; AVX1-NEXT: vmovups 16(%rdx), %xmm4
+; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm4[3,0],xmm2[3,0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm2[2,1],xmm5[0,2]
+; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0],xmm4[1,0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0],xmm4[2,2]
; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2
-; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = xmm1[0,0]
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4
-; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7]
-; AVX1-NEXT: vmovups 16(%rcx), %xmm4
-; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm3[3,0],xmm4[3,0]
-; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm4[2,1],xmm5[0,2]
-; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,0],xmm3[1,0]
-; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm4[2,0],xmm3[2,2]
-; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3
; AVX1-NEXT: vpermilpd {{.*#+}} ymm4 = ymm0[1,1,3,3]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3]
+; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7]
+; AVX1-NEXT: vmovups (%rsi), %xmm4
+; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm4[2,0],xmm3[2,0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm3[1,1],xmm5[0,2]
+; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,0],xmm4[0,0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,0],xmm4[2,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3
+; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = xmm1[0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4
; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,2]
; AVX1-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[1,1,2,2]
; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = mem[0,0,3,3,4,4,7,7]
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; AVX1-NEXT: vmovups %ymm0, 32(%rdi)
-; AVX1-NEXT: vmovups %ymm3, 64(%rdi)
-; AVX1-NEXT: vmovups %ymm2, (%rdi)
+; AVX1-NEXT: vmovups %ymm3, (%rdi)
+; AVX1-NEXT: vmovups %ymm2, 64(%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; XOP: # %bb.0:
; XOP-NEXT: vmovupd (%rsi), %ymm0
; XOP-NEXT: vmovups (%rcx), %ymm1
-; XOP-NEXT: vmovups (%rdx), %xmm2
-; XOP-NEXT: vmovups 16(%rdx), %xmm3
-; XOP-NEXT: vmovups (%rsi), %xmm4
-; XOP-NEXT: vshufps {{.*#+}} xmm5 = xmm4[2,0],xmm2[2,0]
-; XOP-NEXT: vshufps {{.*#+}} xmm5 = xmm2[1,1],xmm5[0,2]
-; XOP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,0],xmm4[0,0]
-; XOP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0],xmm4[2,1]
+; XOP-NEXT: vmovups 16(%rcx), %xmm2
+; XOP-NEXT: vmovups (%rdx), %xmm3
+; XOP-NEXT: vmovups 16(%rdx), %xmm4
+; XOP-NEXT: vshufps {{.*#+}} xmm5 = xmm4[3,0],xmm2[3,0]
+; XOP-NEXT: vshufps {{.*#+}} xmm5 = xmm2[2,1],xmm5[0,2]
+; XOP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0],xmm4[1,0]
+; XOP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0],xmm4[2,2]
; XOP-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2
-; XOP-NEXT: vmovddup {{.*#+}} xmm4 = xmm1[0,0]
-; XOP-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4
-; XOP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7]
-; XOP-NEXT: vmovups 16(%rcx), %xmm4
-; XOP-NEXT: vshufps {{.*#+}} xmm5 = xmm3[3,0],xmm4[3,0]
-; XOP-NEXT: vshufps {{.*#+}} xmm5 = xmm4[2,1],xmm5[0,2]
-; XOP-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,0],xmm3[1,0]
-; XOP-NEXT: vshufps {{.*#+}} xmm3 = xmm4[2,0],xmm3[2,2]
-; XOP-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3
; XOP-NEXT: vpermilpd {{.*#+}} ymm4 = ymm0[1,1,3,3]
; XOP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3]
+; XOP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7]
+; XOP-NEXT: vmovups (%rsi), %xmm4
+; XOP-NEXT: vshufps {{.*#+}} xmm5 = xmm4[2,0],xmm3[2,0]
+; XOP-NEXT: vshufps {{.*#+}} xmm5 = xmm3[1,1],xmm5[0,2]
+; XOP-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,0],xmm4[0,0]
+; XOP-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,0],xmm4[2,1]
+; XOP-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3
+; XOP-NEXT: vmovddup {{.*#+}} xmm4 = xmm1[0,0]
+; XOP-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4
; XOP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
; XOP-NEXT: vpermil2ps {{.*#+}} ymm0 = ymm1[2],ymm0[3],ymm1[2,3],ymm0[4],ymm1[5,4],ymm0[5]
; XOP-NEXT: vpermilps {{.*#+}} ymm1 = mem[0,0,3,3,4,4,7,7]
; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; XOP-NEXT: vmovups %ymm0, 32(%rdi)
-; XOP-NEXT: vmovups %ymm3, 64(%rdi)
-; XOP-NEXT: vmovups %ymm2, (%rdi)
+; XOP-NEXT: vmovups %ymm3, (%rdi)
+; XOP-NEXT: vmovups %ymm2, 64(%rdi)
; XOP-NEXT: vzeroupper
; XOP-NEXT: retq
%s1 = load <8 x i32>, <8 x i32>* %q1, align 4
;
; X86-AVX1-LABEL: trunc_ashr_v4i64_demandedelts:
; X86-AVX1: # %bb.0:
-; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; X86-AVX1-NEXT: vpsllq $63, %xmm1, %xmm2
-; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
-; X86-AVX1-NEXT: vpsllq $63, %xmm0, %xmm2
+; X86-AVX1-NEXT: vpsllq $63, %xmm0, %xmm1
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; X86-AVX1-NEXT: vpsllq $63, %xmm2, %xmm3
+; X86-AVX1-NEXT: vpsrlq $63, %xmm3, %xmm3
+; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,0,0,0,0,0,0,32768]
+; X86-AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
+; X86-AVX1-NEXT: vpsubq %xmm3, %xmm2, %xmm2
; X86-AVX1-NEXT: vpsrlq $63, %xmm1, %xmm1
-; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,0,0,2147483648]
-; X86-AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1
-; X86-AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm1
-; X86-AVX1-NEXT: vpsrlq $63, %xmm2, %xmm2
-; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
+; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; X86-AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0
; X86-AVX1-NEXT: vpsubq %xmm3, %xmm0, %xmm0
-; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; X86-AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
;
; X64-AVX1-LABEL: trunc_ashr_v4i64_demandedelts:
; X64-AVX1: # %bb.0:
-; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; X64-AVX1-NEXT: vpsllq $63, %xmm1, %xmm2
-; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
-; X64-AVX1-NEXT: vpsllq $63, %xmm0, %xmm2
-; X64-AVX1-NEXT: vpsrlq $63, %xmm1, %xmm1
+; X64-AVX1-NEXT: vpsllq $63, %xmm0, %xmm1
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; X64-AVX1-NEXT: vpsllq $63, %xmm2, %xmm3
+; X64-AVX1-NEXT: vpsrlq $63, %xmm3, %xmm3
+; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,9223372036854775808]
-; X64-AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1
-; X64-AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm1
-; X64-AVX1-NEXT: vpsrlq $63, %xmm2, %xmm2
-; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
+; X64-AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
+; X64-AVX1-NEXT: vpsubq %xmm3, %xmm2, %xmm2
+; X64-AVX1-NEXT: vpsrlq $63, %xmm1, %xmm1
+; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; X64-AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0
; X64-AVX1-NEXT: vpsubq %xmm3, %xmm0, %xmm0
-; X64-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; X64-AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vmovaps 80(%rbp), %ymm13
; CHECK-NEXT: vmovaps 48(%rbp), %ymm14
; CHECK-NEXT: vmovaps 16(%rbp), %ymm15
-; CHECK-NEXT: vblendpd {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3]
+; CHECK-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7]
; CHECK-NEXT: vmovaps %xmm9, %xmm6
+; CHECK-NEXT: vmovdqa %xmm6, %xmm9
+; CHECK-NEXT: # kill: def $ymm9 killed $xmm9
; CHECK-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: # implicit-def: $ymm0
; CHECK-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0
; CHECK-NEXT: vpalignr {{.*#+}} ymm11 = ymm2[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
; CHECK-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,0]
-; CHECK-NEXT: vblendpd {{.*#+}} ymm0 = ymm11[0,1],ymm0[2],ymm11[3]
+; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5],ymm11[6,7]
; CHECK-NEXT: vmovaps %xmm2, %xmm6
; CHECK-NEXT: # implicit-def: $ymm2
; CHECK-NEXT: vinserti128 $1, %xmm6, %ymm2, %ymm2
; CHECK-NEXT: vmovq {{.*#+}} xmm6 = xmm6[0],zero
; CHECK-NEXT: # implicit-def: $ymm11
; CHECK-NEXT: vmovaps %xmm6, %xmm11
-; CHECK-NEXT: vblendpd {{.*#+}} ymm2 = ymm11[0,1],ymm2[2,3]
+; CHECK-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7]
; CHECK-NEXT: vmovaps %xmm7, %xmm6
; CHECK-NEXT: vpslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,xmm6[0,1,2,3,4,5,6,7]
; CHECK-NEXT: # implicit-def: $ymm11
; CHECK-NEXT: vmovaps %xmm6, %xmm11
; CHECK-NEXT: vpalignr {{.*#+}} ymm9 = ymm9[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
; CHECK-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,3]
-; CHECK-NEXT: vblendpd {{.*#+}} ymm9 = ymm11[0,1],ymm9[2,3]
-; CHECK-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3]
+; CHECK-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7]
+; CHECK-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3],ymm7[4,5,6,7]
; CHECK-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,1,3]
; CHECK-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,1,0,1,4,5,4,5]
-; CHECK-NEXT: vblendpd {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3]
+; CHECK-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7]
; CHECK-NEXT: vmovaps %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: vmovaps %ymm5, %ymm1
; CHECK-NEXT: vmovaps %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; X64-SKX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X64-SKX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; X64-SKX-NEXT: vpsrld $7, %xmm0, %xmm0
-; X64-SKX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; X64-SKX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; X64-SKX-NEXT: vpmovqw %xmm0, -{{[0-9]+}}(%rsp)
; X64-SKX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-SKX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+++ /dev/null
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -o - -mcpu=btver2 -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
-
-define <8 x i32> @_Z6test70v(<4 x i32>* %id14793) {
-; CHECK-LABEL: _Z6test70v:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vmovaps (%rdi), %xmm0
-; CHECK-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],mem[1],xmm0[2,3]
-; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,1,0]
-; CHECK-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,1,0]
-; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
-entry:
- %id14793.0.id14793.0. = load <4 x i32>, <4 x i32>* %id14793, align 16
- %shuffle = shufflevector <4 x i32> %id14793.0.id14793.0., <4 x i32> <i32 undef, i32 -1052558824, i32 undef, i32 undef>, <8 x i32> <i32 0, i32 2, i32 5, i32 0, i32 1, i32 3, i32 1, i32 0>
- ret <8 x i32> %shuffle
-}
; AVX256VLBW: # %bb.0:
; AVX256VLBW-NEXT: vptestnmb %ymm0, %ymm0, %k0
; AVX256VLBW-NEXT: vpmovm2b %k0, %ymm0
-; AVX256VLBW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3]
-; AVX256VLBW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,2,1,5,5,6,5]
-; AVX256VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX256VLBW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX256VLBW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX256VLBW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,1,2,1,5,5,6,5]
+; AVX256VLBW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX256VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,6,u,12,3,7,7,0,3,6,1,13,3,u,7,0,19,22,u,28,19,23,23,16,19,22,17,29,19,u,23,16]
; AVX256VLBW-NEXT: movl $537141252, %eax # imm = 0x20042004
; AVX256VLBW-NEXT: kmovd %eax, %k1
-; AVX256VLBW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1}
+; AVX256VLBW-NEXT: vmovdqu8 %ymm2, %ymm0 {%k1}
; AVX256VLBW-NEXT: vpmovb2m %ymm0, %k0
; AVX256VLBW-NEXT: vpmovm2b %k0, %ymm0
; AVX256VLBW-NEXT: retq
; X64-AVX512-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
; X64-AVX512-NEXT: andq %rax, %rcx
; X64-AVX512-NEXT: vmovq %rcx, %xmm1
-; X64-AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; X64-AVX512-NEXT: retq
%bc = bitcast <4 x i32> %i to i128
%insert = and i128 %bc, -4294967296
; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm2, %xmm2
; SSE-NEXT: psubd %xmm0, %xmm2
-; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[2,3]
-; SSE-NEXT: movaps %xmm2, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE-NEXT: psrld $16, %xmm0
+; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: PR39893:
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpsubd %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2],zero,xmm0[3],zero,xmm0[2],zero,xmm0[3],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpsubd %xmm0, %xmm2, %xmm0
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2],zero,xmm0[3],zero,xmm0[2],zero,xmm0[3],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2-NEXT: retq
;
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512-NEXT: vpsubd %xmm0, %xmm2, %xmm0
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2],zero,xmm0[3],zero,xmm0[2],zero,xmm0[3],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero
+; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512-NEXT: retq
%sub = sub <2 x i32> <i32 0, i32 undef>, %x
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3]
; SSE41-NEXT: psrad $31, %xmm3
; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
-; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm3, %xmm2
-; SSE41-NEXT: pxor %xmm0, %xmm2
-; SSE41-NEXT: pxor %xmm1, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm2, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm2, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: pxor %xmm1, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm4
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3
; SSE41-NEXT: movd %xmm3, %eax
; SSE41-NEXT: retq
; SSE41-NEXT: pxor %xmm0, %xmm2
; SSE41-NEXT: pxor %xmm1, %xmm0
; SSE41-NEXT: movdqa %xmm2, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm2, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1
; SSE41-NEXT: movd %xmm1, %eax
; SSE41-NEXT: # kill: def $ax killed $ax killed $eax
; SSE41-NEXT: pxor %xmm0, %xmm2
; SSE41-NEXT: pxor %xmm1, %xmm0
; SSE41-NEXT: movdqa %xmm2, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm2, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1
; SSE41-NEXT: pextrb $0, %xmm1, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3]
; SSE41-NEXT: psrad $31, %xmm3
; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
-; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: pxor %xmm0, %xmm2
-; SSE41-NEXT: pxor %xmm3, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm2, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm2, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: pxor %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm4
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3
; SSE41-NEXT: movd %xmm3, %eax
; SSE41-NEXT: retq
; SSE41-NEXT: psrad $16, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
-; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm3, %xmm2
-; SSE41-NEXT: pxor %xmm0, %xmm2
-; SSE41-NEXT: pxor %xmm1, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm2, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm2, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: pxor %xmm1, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm4
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1
; SSE41-NEXT: movd %xmm1, %eax
; SSE41-NEXT: # kill: def $ax killed $ax killed $eax
; SSE41-NEXT: psrad $24, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
-; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm3, %xmm2
-; SSE41-NEXT: pxor %xmm0, %xmm2
-; SSE41-NEXT: pxor %xmm1, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm2, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm2, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: pxor %xmm1, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm4
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1
; SSE41-NEXT: pextrb $0, %xmm1, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
; X32-AVX1-NEXT: vpsrlq $62, %xmm1, %xmm2
; X32-AVX1-NEXT: vpsrlq $31, %xmm1, %xmm1
; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
-; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,0]
+; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,1,0,2,0,0,0]
; X32-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
; X32-AVX1-NEXT: vpsubq %xmm2, %xmm1, %xmm1
; X32-AVX1-NEXT: vpsrlq $7, %xmm0, %xmm2
; X32-AVX1-NEXT: vpsrlq $1, %xmm0, %xmm0
; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
-; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1073741824,0,16777216]
+; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,16384,0,0,0,256]
; X32-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
; X32-AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0
; X32-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2: # %bb.0:
; AVX2-NEXT: vpmulhw {{.*}}(%rip), %ymm0, %ymm1
; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vpsraw $1, %ymm0, %ymm0
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4,5,6,7,8],ymm0[9],ymm2[10,11,12,13,14,15]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; X32-AVX2: # %bb.0:
; X32-AVX2-NEXT: vpmulhw {{\.LCPI.*}}, %ymm0, %ymm1
; X32-AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
+; X32-AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; X32-AVX2-NEXT: vpsraw $1, %ymm0, %ymm0
; X32-AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4,5,6,7,8],ymm0[9],ymm2[10,11,12,13,14,15]
; X32-AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
;
; AVX512VL-SLOW-LABEL: shuffle_v8i16_0213cedf:
; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,5,7]
; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
-; AVX512VL-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,5,7]
+; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX512VL-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-SLOW-NEXT: retq
;
; AVX512VL-FAST-LABEL: shuffle_v8i16_0213cedf:
;
; SSE41-LABEL: shuffle_v8i16_XX4X8acX:
; SSE41: # %bb.0:
-; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,4,5,6,7,0,1,4,5,8,9,4,5]
+; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,0,1,4,5,8,9,4,5]
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: shuffle_v8i16_XX4X8acX:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,4,5,6,7,0,1,4,5,8,9,4,5]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,0,1,4,5,8,9,4,5]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: retq
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT: vpslldq {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[16,17]
-; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15]
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,3,4,5,6,7,8,8,10,11,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX2-SLOW-NEXT: vpslldq {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[16,17]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX2-FAST-NEXT: vpslldq {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[16,17]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15]
+; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00:
; AVX512VL: # %bb.0:
; AVX1-LABEL: shuffle_v16i16_uu_uu_04_uu_16_18_20_uu_uu_uu_12_uu_24_26_28_uu:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,4,5,6,7,0,1,4,5,8,9,4,5]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,u,u,u,u,0,1,4,5,8,9,4,5>
; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3]
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastb %xmm0, %ymm1
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX2-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpslldq {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[16]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
-; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512VLBW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX512VLBW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLBW-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpslldq {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[16]
-; AVX512VLBW-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX512VLBW-NEXT: movl $-2147450880, %eax # imm = 0x80008000
; AVX512VLBW-NEXT: kmovd %eax, %k1
; AVX512VLBW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1}
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: retq
;
-; AVX512VLBW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
-; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VLBW-NEXT: retq
+; AVX512VLBW-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLBW-SLOW: # %bb.0:
+; AVX512VLBW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX512VLBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; AVX512VLBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-SLOW-NEXT: retq
+;
+; AVX512VLBW-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLBW-FAST: # %bb.0:
+; AVX512VLBW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
+; AVX512VLBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-FAST-NEXT: retq
;
; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VLVBMI: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: retq
;
-; AVX512VLBW-LABEL: shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
-; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VLBW-NEXT: retq
+; AVX512VLBW-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLBW-SLOW: # %bb.0:
+; AVX512VLBW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX512VLBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; AVX512VLBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-SLOW-NEXT: retq
+;
+; AVX512VLBW-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLBW-FAST: # %bb.0:
+; AVX512VLBW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
+; AVX512VLBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-FAST-NEXT: retq
;
; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VLVBMI: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: retq
;
-; AVX512VLBW-LABEL: shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
-; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VLBW-NEXT: retq
+; AVX512VLBW-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLBW-SLOW: # %bb.0:
+; AVX512VLBW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX512VLBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; AVX512VLBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-SLOW-NEXT: retq
+;
+; AVX512VLBW-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLBW-FAST: # %bb.0:
+; AVX512VLBW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
+; AVX512VLBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-FAST-NEXT: retq
;
; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VLVBMI: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,11,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: retq
;
-; AVX512VLBW-LABEL: shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
-; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,11,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VLBW-NEXT: retq
+; AVX512VLBW-SLOW-LABEL: shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLBW-SLOW: # %bb.0:
+; AVX512VLBW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX512VLBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; AVX512VLBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,11,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-SLOW-NEXT: retq
+;
+; AVX512VLBW-FAST-LABEL: shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLBW-FAST: # %bb.0:
+; AVX512VLBW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
+; AVX512VLBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,11,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-FAST-NEXT: retq
;
; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VLVBMI: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: retq
;
-; AVX512VLBW-LABEL: shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
-; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VLBW-NEXT: retq
+; AVX512VLBW-SLOW-LABEL: shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLBW-SLOW: # %bb.0:
+; AVX512VLBW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX512VLBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; AVX512VLBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-SLOW-NEXT: retq
+;
+; AVX512VLBW-FAST-LABEL: shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLBW-FAST: # %bb.0:
+; AVX512VLBW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
+; AVX512VLBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-FAST-NEXT: retq
;
; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VLVBMI: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: retq
;
-; AVX512VLBW-LABEL: shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
-; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VLBW-NEXT: retq
+; AVX512VLBW-SLOW-LABEL: shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLBW-SLOW: # %bb.0:
+; AVX512VLBW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX512VLBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; AVX512VLBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-SLOW-NEXT: retq
+;
+; AVX512VLBW-FAST-LABEL: shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLBW-FAST: # %bb.0:
+; AVX512VLBW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
+; AVX512VLBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-FAST-NEXT: retq
;
; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VLVBMI: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: retq
;
-; AVX512VLBW-LABEL: shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
-; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VLBW-NEXT: retq
+; AVX512VLBW-SLOW-LABEL: shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLBW-SLOW: # %bb.0:
+; AVX512VLBW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX512VLBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; AVX512VLBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-SLOW-NEXT: retq
+;
+; AVX512VLBW-FAST-LABEL: shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLBW-FAST: # %bb.0:
+; AVX512VLBW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
+; AVX512VLBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-FAST-NEXT: retq
;
; AVX512VLVBMI-LABEL: shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VLVBMI: # %bb.0:
; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
-; AVX512VLBW-LABEL: shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
-; AVX512VLBW-NEXT: movl $15, %eax
-; AVX512VLBW-NEXT: vmovd %eax, %xmm1
-; AVX512VLBW-NEXT: vpshufb %ymm1, %ymm0, %ymm0
-; AVX512VLBW-NEXT: retq
+; AVX512VLBW-SLOW-LABEL: shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLBW-SLOW: # %bb.0:
+; AVX512VLBW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX512VLBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; AVX512VLBW-SLOW-NEXT: movl $15, %eax
+; AVX512VLBW-SLOW-NEXT: vmovd %eax, %xmm1
+; AVX512VLBW-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm0
+; AVX512VLBW-SLOW-NEXT: retq
+;
+; AVX512VLBW-FAST-LABEL: shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLBW-FAST: # %bb.0:
+; AVX512VLBW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
+; AVX512VLBW-FAST-NEXT: movl $15, %eax
+; AVX512VLBW-FAST-NEXT: vmovd %eax, %xmm1
+; AVX512VLBW-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0
+; AVX512VLBW-FAST-NEXT: retq
;
; AVX512VLVBMI-LABEL: shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VLVBMI: # %bb.0: