* Functions with the probe-stack attribute set to "inline-asm" are now protected
  against stack clash without the need for a third-party probing function and
  with limited impact on performance.
+* The -x86-enable-old-knl-abi command line switch has been removed. v32i16/v64i8
+  vectors are now always passed in a ZMM register when avx512f is enabled and
+  avx512bw is disabled (see the example after this list).
+* Vectors larger than 512 bits with i16 or i8 elements are now passed in
+  multiple ZMM registers when avx512f is enabled. Previously this required
+  avx512bw; otherwise they were split into multiple YMM registers. This means
+  vXi16/vXi8 vectors are consistently treated the same as
+  vXi32/vXi64/vXf64/vXf32 vectors of the same total width.
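
For illustration, a minimal sketch of the new convention (not part of the
patch; the flags and the described register assignment are assumptions based
on the notes above)::

  // clang++ -O2 -mavx512f -mno-avx512bw
  typedef char v64i8 __attribute__((vector_size(64)));

  // 'a', 'b', and the result each now occupy a single ZMM register on
  // avx512f-only targets; previously each was split across two YMM
  // registers. The addition itself is still lowered as two 256-bit ops.
  v64i8 add_bytes(v64i8 a, v64i8 b) { return a + b; }
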
Changes to the AMDGPU Target
-----------------------------
}
switch (N->getOpcode()) {
+ case X86ISD::VBROADCAST: {
+ MVT VT = N->getSimpleValueType(0);
+ // Emulate v32i16/v64i8 broadcast without BWI.
+ if (!Subtarget->hasBWI() && (VT == MVT::v32i16 || VT == MVT::v64i8)) {
+ MVT NarrowVT = VT == MVT::v32i16 ? MVT::v16i16 : MVT::v32i8;
+ SDLoc dl(N);
+ SDValue NarrowBCast =
+ CurDAG->getNode(X86ISD::VBROADCAST, dl, NarrowVT, N->getOperand(0));
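+        // Widen to 512 bits by inserting the 256-bit broadcast into both
+        // halves of an undef vector.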
+ SDValue Res =
+ CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, CurDAG->getUNDEF(VT),
+ NarrowBCast, CurDAG->getIntPtrConstant(0, dl));
+ unsigned Index = VT == MVT::v32i16 ? 16 : 32;
+ Res = CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, NarrowBCast,
+ CurDAG->getIntPtrConstant(Index, dl));
+
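+        // Step the position iterator off N so that replacing its uses and
+        // deleting it below doesn't invalidate the iterator.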
+ --I;
+ CurDAG->ReplaceAllUsesWith(N, Res.getNode());
+ ++I;
+ CurDAG->DeleteNode(N);
+ }
+
+ break;
+ }
+ case X86ISD::VBROADCAST_LOAD: {
+ MVT VT = N->getSimpleValueType(0);
+ // Emulate v32i16/v64i8 broadcast without BWI.
+ if (!Subtarget->hasBWI() && (VT == MVT::v32i16 || VT == MVT::v64i8)) {
+ MVT NarrowVT = VT == MVT::v32i16 ? MVT::v16i16 : MVT::v32i8;
+ auto *MemNode = cast<MemSDNode>(N);
+ SDLoc dl(N);
+ SDVTList VTs = CurDAG->getVTList(NarrowVT, MVT::Other);
+ SDValue Ops[] = {MemNode->getChain(), MemNode->getBasePtr()};
+ SDValue NarrowBCast = CurDAG->getMemIntrinsicNode(
+ X86ISD::VBROADCAST_LOAD, dl, VTs, Ops, MemNode->getMemoryVT(),
+ MemNode->getMemOperand());
+ SDValue Res =
+ CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, CurDAG->getUNDEF(VT),
+ NarrowBCast, CurDAG->getIntPtrConstant(0, dl));
+ unsigned Index = VT == MVT::v32i16 ? 16 : 32;
+ Res = CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, NarrowBCast,
+ CurDAG->getIntPtrConstant(Index, dl));
+
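+        // Replace both results of N: the broadcast value and the load chain
+        // produced by the narrow broadcast-load.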
+ --I;
+ SDValue To[] = {Res, NarrowBCast.getValue(1)};
+ CurDAG->ReplaceAllUsesWith(N, To);
+ ++I;
+ CurDAG->DeleteNode(N);
+ }
+
+ break;
+ }
case ISD::VSELECT: {
      // Replace VSELECT with non-mask conditions with BLENDV.
if (N->getOperand(0).getValueType().getVectorElementType() == MVT::i1)
" of the loop header PC will be 0)."),
cl::Hidden);
-// Added in 10.0.
-static cl::opt<bool> EnableOldKNLABI(
- "x86-enable-old-knl-abi", cl::init(false),
- cl::desc("Enables passing v32i16 and v64i8 in 2 YMM registers instead of "
- "one ZMM register on AVX512F, but not AVX512BW targets."),
- cl::Hidden);
-
static cl::opt<bool> MulConstantOptimization(
"mul-constant-optimization", cl::init(true),
cl::desc("Replace 'mul x, Const' with more effective instructions like "
// elements. 512-bits can be disabled based on prefer-vector-width and
// required-vector-width function attributes.
if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
+ bool HasBWI = Subtarget.hasBWI();
+
addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
+ addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
+ addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
+ setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
- // Need to custom widen this if we don't have AVX512BW.
- setOperationAction(ISD::ANY_EXTEND, MVT::v8i8, Custom);
- setOperationAction(ISD::ZERO_EXTEND, MVT::v8i8, Custom);
- setOperationAction(ISD::SIGN_EXTEND, MVT::v8i8, Custom);
-
for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
setOperationAction(ISD::FFLOOR, VT, Legal);
setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
setOperationAction(ISD::FROUND, VT, Custom);
-
- setOperationAction(ISD::SELECT, VT, Custom);
}
- // Without BWI we need to use custom lowering to handle MVT::v64i8 input.
- for (auto VT : {MVT::v16i32, MVT::v8i64, MVT::v64i8}) {
+ for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
}
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom);
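+    // Legal with BWI; otherwise custom-lowered by splitting into two 256-bit
+    // operations.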
+ setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::ADD, MVT::v64i8, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::SUB, MVT::v64i8, HasBWI ? Legal : Custom);
+
+ setOperationAction(ISD::MUL, MVT::v8i64, Custom);
+ setOperationAction(ISD::MUL, MVT::v16i32, Legal);
+ setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::MUL, MVT::v64i8, Custom);
+
+ setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
+ setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
+ setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
+ setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
- setOperationAction(ISD::MUL, MVT::v8i64, Custom);
- setOperationAction(ISD::MUL, MVT::v16i32, Legal);
+ setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);
- setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
- setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
+ for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
+ setOperationAction(ISD::SRL, VT, Custom);
+ setOperationAction(ISD::SHL, VT, Custom);
+ setOperationAction(ISD::SRA, VT, Custom);
+ setOperationAction(ISD::SETCC, VT, Custom);
+ // The condition codes aren't legal in SSE/AVX and under AVX512 we use
+ // setcc all the way to isel and prefer SETGT in some isel patterns.
+ setCondCodeAction(ISD::SETLT, VT, Custom);
+ setCondCodeAction(ISD::SETLE, VT, Custom);
+ }
for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
setOperationAction(ISD::SMAX, VT, Legal);
setOperationAction(ISD::UMAX, VT, Legal);
setOperationAction(ISD::SMIN, VT, Legal);
setOperationAction(ISD::UMIN, VT, Legal);
setOperationAction(ISD::ABS, VT, Legal);
- setOperationAction(ISD::SRL, VT, Custom);
- setOperationAction(ISD::SHL, VT, Custom);
- setOperationAction(ISD::SRA, VT, Custom);
setOperationAction(ISD::CTPOP, VT, Custom);
setOperationAction(ISD::ROTL, VT, Custom);
setOperationAction(ISD::ROTR, VT, Custom);
- setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
- setOperationAction(ISD::SELECT, VT, Custom);
+ }
- // The condition codes aren't legal in SSE/AVX and under AVX512 we use
- // setcc all the way to isel and prefer SETGT in some isel patterns.
- setCondCodeAction(ISD::SETLT, VT, Custom);
- setCondCodeAction(ISD::SETLE, VT, Custom);
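+    // Most of these are legal with BWI; without it the custom lowering splits
+    // the operation into 256-bit halves.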
+ for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
+ setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::CTPOP, VT, Subtarget.hasBITALG() ? Legal : Custom);
+ setOperationAction(ISD::CTLZ, VT, Custom);
+ setOperationAction(ISD::SMAX, VT, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::UMAX, VT, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::SMIN, VT, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::UMIN, VT, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::UADDSAT, VT, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::SADDSAT, VT, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::USUBSAT, VT, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::SSUBSAT, VT, HasBWI ? Legal : Custom);
}
if (Subtarget.hasDQI()) {
MVT::v8f32, MVT::v4f64 })
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
+ for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
+ MVT::v16f32, MVT::v8f64 }) {
+ setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
+ setOperationAction(ISD::SELECT, VT, Custom);
+ setOperationAction(ISD::VSELECT, VT, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
+ }
+
for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
- setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
- setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
- setOperationAction(ISD::VSELECT, VT, Custom);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
- setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
setOperationAction(ISD::MLOAD, VT, Legal);
setOperationAction(ISD::MSTORE, VT, Legal);
setOperationAction(ISD::MGATHER, VT, Custom);
setOperationAction(ISD::MSCATTER, VT, Custom);
}
if (!Subtarget.hasBWI()) {
- // Need to custom split v32i16/v64i8 bitcasts.
- setOperationAction(ISD::BITCAST, MVT::v32i16, Custom);
- setOperationAction(ISD::BITCAST, MVT::v64i8, Custom);
-
- // Better to split these into two 256-bit ops.
- setOperationAction(ISD::BITREVERSE, MVT::v8i64, Custom);
- setOperationAction(ISD::BITREVERSE, MVT::v16i32, Custom);
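+      // Without BWI, split 512-bit vXi8/vXi16 stores into two 256-bit stores.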
+ setOperationAction(ISD::STORE, MVT::v32i16, Custom);
+ setOperationAction(ISD::STORE, MVT::v64i8, Custom);
}
if (Subtarget.hasVBMI2()) {
// disabled based on prefer-vector-width and required-vector-width function
// attributes.
if (!Subtarget.useSoftFloat() && Subtarget.useBWIRegs()) {
- addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
- addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
-
// Extends from v64i1 masks to 512-bit vectors.
setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
setOperationAction(ISD::ANY_EXTEND, MVT::v64i8, Custom);
- setOperationAction(ISD::MUL, MVT::v32i16, Legal);
- setOperationAction(ISD::MUL, MVT::v64i8, Custom);
- setOperationAction(ISD::MULHS, MVT::v32i16, Legal);
- setOperationAction(ISD::MULHU, MVT::v32i16, Legal);
- setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
- setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i16, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Legal);
- setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Legal);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom);
- setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i16, Custom);
- setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v64i8, Custom);
- setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
- setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
- setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
- setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom);
- setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8, Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom);
- setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom);
- setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);
-
- setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v32i16, Custom);
- setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v32i16, Custom);
-
setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
- setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
- setOperationAction(ISD::VSELECT, VT, Custom);
- setOperationAction(ISD::ABS, VT, Legal);
- setOperationAction(ISD::SRL, VT, Custom);
- setOperationAction(ISD::SHL, VT, Custom);
- setOperationAction(ISD::SRA, VT, Custom);
setOperationAction(ISD::MLOAD, VT, Legal);
setOperationAction(ISD::MSTORE, VT, Legal);
- setOperationAction(ISD::CTPOP, VT, Custom);
- setOperationAction(ISD::CTLZ, VT, Custom);
- setOperationAction(ISD::SMAX, VT, Legal);
- setOperationAction(ISD::UMAX, VT, Legal);
- setOperationAction(ISD::SMIN, VT, Legal);
- setOperationAction(ISD::UMIN, VT, Legal);
- setOperationAction(ISD::SETCC, VT, Custom);
- setOperationAction(ISD::UADDSAT, VT, Legal);
- setOperationAction(ISD::SADDSAT, VT, Legal);
- setOperationAction(ISD::USUBSAT, VT, Legal);
- setOperationAction(ISD::SSUBSAT, VT, Legal);
- setOperationAction(ISD::SELECT, VT, Custom);
-
- // The condition codes aren't legal in SSE/AVX and under AVX512 we use
- // setcc all the way to isel and prefer SETGT in some isel patterns.
- setCondCodeAction(ISD::SETLT, VT, Custom);
- setCondCodeAction(ISD::SETLE, VT, Custom);
}
for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
}
- if (Subtarget.hasBITALG()) {
- for (auto VT : { MVT::v64i8, MVT::v32i16 })
- setOperationAction(ISD::CTPOP, VT, Legal);
- }
-
if (Subtarget.hasVBMI2()) {
setOperationAction(ISD::FSHL, MVT::v32i16, Custom);
setOperationAction(ISD::FSHR, MVT::v32i16, Custom);
TargetLoweringBase::LegalizeTypeAction
X86TargetLowering::getPreferredVectorAction(MVT VT) const {
- if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
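+  // Without BWI only 16-bit mask instructions are available, so split the
+  // wider mask types.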
+ if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.hasAVX512() &&
+ !Subtarget.hasBWI())
return TypeSplitVector;
if (VT.getVectorNumElements() != 1 &&
return RegisterVT;
}
- // FIXME: Should we just make these types legal and custom split operations?
- if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !EnableOldKNLABI &&
- Subtarget.useAVX512Regs() && !Subtarget.hasBWI())
- return MVT::v16i32;
-
return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
}
return NumRegisters;
}
- // FIXME: Should we just make these types legal and custom split operations?
- if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !EnableOldKNLABI &&
- Subtarget.useAVX512Regs() && !Subtarget.hasBWI())
- return 1;
-
return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
}
Subtarget, DAG))
return Broadcast;
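+  // Without BWI there are no 512-bit shuffle instructions for i8/i16
+  // elements, so split these into two 256-bit shuffles.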
+ if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
+ return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
+
// Dispatch to each element type for lowering. If we don't have support for
// specific element type shuffles at 512 bits, immediately split them and
// lower them. Each lowering routine of a given type is allowed to assume that
unsigned EltSize = VT.getScalarSizeInBits();
unsigned NumElts = VT.getVectorNumElements();
+ // Expand v32i16/v64i8 without BWI.
+ if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
+ return SDValue();
+
// If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
// into an i1 condition so that we can use the mask-based 512-bit blend
// instructions.
unsigned ExtendInVecOpc = getOpcode_EXTEND_VECTOR_INREG(Opc);
- // Custom legalize v8i8->v8i64 on CPUs without avx512bw.
- if (InVT == MVT::v8i8) {
- if (VT != MVT::v8i64)
- return SDValue();
-
- In = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op),
- MVT::v16i8, In, DAG.getUNDEF(MVT::v8i8));
- return DAG.getNode(ExtendInVecOpc, dl, VT, In);
+ if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
+ assert(InVT == MVT::v32i8 && "Unexpected VT!");
+ return splitVectorIntUnary(Op, DAG);
}
if (Subtarget.hasInt256())
// vpmovqb/w/d, vpmovdb/w, vpmovwb
if (Subtarget.hasAVX512()) {
+ if (InVT == MVT::v32i16 && !Subtarget.hasBWI()) {
+ assert(VT == MVT::v32i8 && "Unexpected VT!");
+ return splitVectorIntUnary(Op, DAG);
+ }
+
    // word to byte only under BWI. Otherwise we have to promote to v16i32
// and then truncate that. But we should only do that if we haven't been
// asked to avoid 512-bit vectors. The actual promotion to v16i32 will be
if (VT.is256BitVector() && !Subtarget.hasInt256())
return splitIntVSETCC(Op, DAG);
+ if (VT == MVT::v32i16 || VT == MVT::v64i8)
+ return splitIntVSETCC(Op, DAG);
+
// If this is a SETNE against the signed minimum value, change it to SETGT.
// If this is a SETNE against the signed maximum value, change it to SETLT.
// which will be swapped to SETGT.
InVT.getVectorElementType() == MVT::i32) &&
"Unexpected element type");
- // Custom legalize v8i8->v8i64 on CPUs without avx512bw.
- if (InVT == MVT::v8i8) {
- if (VT != MVT::v8i64)
- return SDValue();
-
- In = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op),
- MVT::v16i8, In, DAG.getUNDEF(MVT::v8i8));
- return DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, VT, In);
+ if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
+ assert(InVT == MVT::v32i8 && "Unexpected VT!");
+ return splitVectorIntUnary(Op, DAG);
}
if (Subtarget.hasInt256())
// and each half can execute independently. Some cores would split the op into
// halves anyway, so the concat (vinsertf128) is purely an extra op.
MVT StoreVT = StoredVal.getSimpleValueType();
- if (StoreVT.is256BitVector()) {
+ if (StoreVT.is256BitVector() ||
+ ((StoreVT == MVT::v32i16 || StoreVT == MVT::v64i8) &&
+ !Subtarget.hasBWI())) {
SmallVector<SDValue, 4> CatOps;
if (StoredVal.hasOneUse() && collectConcatOps(StoredVal.getNode(), CatOps))
return splitVectorStore(St, DAG);
return DAG.getNode(ISD::XOR, SDLoc(Op), VT,
Op.getOperand(0), Op.getOperand(1));
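+  // v32i16/v64i8 ADD/SUB are Custom only when BWI is unavailable; split them
+  // into two 256-bit operations.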
+ if (VT == MVT::v32i16 || VT == MVT::v64i8)
+ return splitVectorIntBinary(Op, DAG);
+
assert(Op.getSimpleValueType().is256BitVector() &&
Op.getSimpleValueType().isInteger() &&
"Only handle AVX 256-bit vector integer operation");
return SDValue();
}
+ if (VT == MVT::v32i16 || VT == MVT::v64i8)
+ return splitVectorIntBinary(Op, DAG);
+
assert(Op.getSimpleValueType().is256BitVector() &&
Op.getSimpleValueType().isInteger() &&
"Only handle AVX 256-bit vector integer operation");
return splitVectorIntUnary(Op, DAG);
}
+ if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
+ return splitVectorIntUnary(Op, DAG);
+
// Default to expand.
return SDValue();
}
if (VT.getScalarType() != MVT::i64 && VT.is256BitVector())
return splitVectorIntBinary(Op, DAG);
+ if (VT == MVT::v32i16 || VT == MVT::v64i8)
+ return splitVectorIntBinary(Op, DAG);
+
SDLoc DL(Op);
unsigned Opcode = Op.getOpcode();
SDValue N0 = Op.getOperand(0);
if (VT.is256BitVector() && !Subtarget.hasInt256())
return splitVectorIntBinary(Op, DAG);
+ if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
+ return splitVectorIntBinary(Op, DAG);
+
SDValue A = Op.getOperand(0);
SDValue B = Op.getOperand(1);
if (VT.is256BitVector() && !Subtarget.hasInt256())
return splitVectorIntBinary(Op, DAG);
+ if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
+ return splitVectorIntBinary(Op, DAG);
+
if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) {
assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
(VT == MVT::v8i32 && Subtarget.hasInt256()) ||
return ArithmeticShiftRight64(ShiftAmt);
if (VT == MVT::v16i8 || (Subtarget.hasInt256() && VT == MVT::v32i8) ||
- VT == MVT::v64i8) {
+ (Subtarget.hasBWI() && VT == MVT::v64i8)) {
unsigned NumElts = VT.getVectorNumElements();
MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
// Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we
// extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI.
if (ConstantAmt && (Opc == ISD::SRA || Opc == ISD::SRL) &&
- (VT == MVT::v16i8 || VT == MVT::v64i8 ||
- (VT == MVT::v32i8 && Subtarget.hasInt256())) &&
+ (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
+ (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
!Subtarget.hasXOP()) {
int NumElts = VT.getVectorNumElements();
SDValue Cst8 = DAG.getTargetConstant(8, dl, MVT::i8);
if (VT.is256BitVector())
return splitVectorIntBinary(Op, DAG);
+ if (VT == MVT::v32i16 || VT == MVT::v64i8)
+ return splitVectorIntBinary(Op, DAG);
+
return SDValue();
}
return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
}
- // Custom splitting for BWI types when AVX512F is available but BWI isn't.
- if ((SrcVT == MVT::v32i16 || SrcVT == MVT::v64i8) && DstVT.isVector() &&
- DAG.getTargetLoweringInfo().isTypeLegal(DstVT)) {
- SDLoc dl(Op);
- SDValue Lo, Hi;
- std::tie(Lo, Hi) = DAG.SplitVector(Op.getOperand(0), dl);
- MVT CastVT = DstVT.getHalfNumVectorElementsVT();
- Lo = DAG.getBitcast(CastVT, Lo);
- Hi = DAG.getBitcast(CastVT, Hi);
- return DAG.getNode(ISD::CONCAT_VECTORS, dl, DstVT, Lo, Hi);
- }
-
// Use MOVMSK for vector to scalar conversion to prevent scalarization.
if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) {
assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
SDValue In = Op.getOperand(0);
SDLoc DL(Op);
- // Split v8i64/v16i32 without BWI so that we can still use the PSHUFB
- // lowering.
- if (VT == MVT::v8i64 || VT == MVT::v16i32) {
- assert(!Subtarget.hasBWI() && "BWI should Expand BITREVERSE");
+ // Split v64i8 without BWI so that we can still use the PSHUFB lowering.
+ if (VT == MVT::v64i8 && !Subtarget.hasBWI())
return splitVectorIntUnary(Op, DAG);
- }
unsigned NumElts = VT.getVectorNumElements();
assert(VT.getScalarType() == MVT::i8 &&
return;
}
- // Custom splitting for BWI types when AVX512F is available but BWI isn't.
- if ((DstVT == MVT::v32i16 || DstVT == MVT::v64i8) &&
- SrcVT.isVector() && isTypeLegal(SrcVT)) {
- SDValue Lo, Hi;
- std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
- MVT CastVT = (DstVT == MVT::v32i16) ? MVT::v16i16 : MVT::v32i8;
- Lo = DAG.getBitcast(CastVT, Lo);
- Hi = DAG.getBitcast(CastVT, Hi);
- SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, DstVT, Lo, Hi);
- Results.push_back(Res);
- return;
- }
-
if (DstVT.isVector() && SrcVT == MVT::x86mmx) {
assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector &&
"Unexpected type action!");
if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
return SDValue();
- // Make sure the vXi16 type is legal. This covers the AVX512 without BWI case.
- // Also allow v2i32 if it will be widened.
+ // Make sure the type is legal or will be widened to a legal type.
+ if (VT != MVT::v2i32 && !DAG.getTargetLoweringInfo().isTypeLegal(VT))
+ return SDValue();
+
MVT WVT = MVT::getVectorVT(MVT::i16, 2 * VT.getVectorNumElements());
- if (VT != MVT::v2i32 && !DAG.getTargetLoweringInfo().isTypeLegal(WVT))
+
+ // Without BWI, we would need to split v32i16.
+ if (WVT == MVT::v32i16 && !Subtarget.hasBWI())
return SDValue();
SDValue N0 = N->getOperand(0);
{ ISD::SRA, MVT::v2i64, 1 },
{ ISD::SRA, MVT::v4i64, 1 },
{ ISD::SRA, MVT::v8i64, 1 },
+
+ { ISD::SHL, MVT::v64i8, 4 }, // psllw + pand.
+ { ISD::SRL, MVT::v64i8, 4 }, // psrlw + pand.
+ { ISD::SRA, MVT::v64i8, 8 }, // psrlw, pand, pxor, psubb.
};
if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
{ ISD::SREM, MVT::v16i32, 17 }, // vpmuldq+mul+sub sequence
{ ISD::UDIV, MVT::v16i32, 15 }, // vpmuludq sequence
{ ISD::UREM, MVT::v16i32, 17 }, // vpmuludq+mul+sub sequence
+ { ISD::SDIV, MVT::v64i8, 28 }, // 4*ext+4*pmulhw sequence
+ { ISD::SREM, MVT::v64i8, 32 }, // 4*ext+4*pmulhw+mul+sub sequence
+ { ISD::UDIV, MVT::v64i8, 28 }, // 4*ext+4*pmulhw sequence
+ { ISD::UREM, MVT::v64i8, 32 }, // 4*ext+4*pmulhw+mul+sub sequence
+ { ISD::SDIV, MVT::v32i16, 12 }, // 2*vpmulhw sequence
+ { ISD::SREM, MVT::v32i16, 16 }, // 2*vpmulhw+mul+sub sequence
+ { ISD::UDIV, MVT::v32i16, 12 }, // 2*vpmulhuw sequence
+ { ISD::UREM, MVT::v32i16, 16 }, // 2*vpmulhuw+mul+sub sequence
};
if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
return LT.first * Entry->Cost;
}
+ static const CostTblEntry AVX512BWShiftCostTable[] = {
+ { ISD::SHL, MVT::v8i16, 1 }, // vpsllvw
+ { ISD::SRL, MVT::v8i16, 1 }, // vpsrlvw
+ { ISD::SRA, MVT::v8i16, 1 }, // vpsravw
+
+ { ISD::SHL, MVT::v16i16, 1 }, // vpsllvw
+ { ISD::SRL, MVT::v16i16, 1 }, // vpsrlvw
+ { ISD::SRA, MVT::v16i16, 1 }, // vpsravw
+
+ { ISD::SHL, MVT::v32i16, 1 }, // vpsllvw
+ { ISD::SRL, MVT::v32i16, 1 }, // vpsrlvw
+ { ISD::SRA, MVT::v32i16, 1 }, // vpsravw
+ };
+
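+  // vpsllvw/vpsrlvw/vpsravw require BWI, so only consult this table when it
+  // is available.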
+ if (ST->hasBWI())
+    if (const auto *Entry =
+            CostTableLookup(AVX512BWShiftCostTable, ISD, LT.second))
+ return LT.first * Entry->Cost;
+
static const CostTblEntry AVX2UniformCostTable[] = {
// Uniform splats are cheaper for the following instructions.
{ ISD::SHL, MVT::v16i16, 1 }, // psllw.
{ ISD::SRL, MVT::v16i16, 1 }, // psrlw.
{ ISD::SRA, MVT::v16i16, 1 }, // psraw.
+ { ISD::SHL, MVT::v32i16, 2 }, // 2*psllw.
+ { ISD::SRL, MVT::v32i16, 2 }, // 2*psrlw.
+ { ISD::SRA, MVT::v32i16, 2 }, // 2*psraw.
};
if (ST->hasAVX2() &&
return LT.first * Entry->Cost;
static const CostTblEntry AVX512BWCostTable[] = {
- { ISD::SHL, MVT::v8i16, 1 }, // vpsllvw
- { ISD::SRL, MVT::v8i16, 1 }, // vpsrlvw
- { ISD::SRA, MVT::v8i16, 1 }, // vpsravw
-
- { ISD::SHL, MVT::v16i16, 1 }, // vpsllvw
- { ISD::SRL, MVT::v16i16, 1 }, // vpsrlvw
- { ISD::SRA, MVT::v16i16, 1 }, // vpsravw
-
- { ISD::SHL, MVT::v32i16, 1 }, // vpsllvw
- { ISD::SRL, MVT::v32i16, 1 }, // vpsrlvw
- { ISD::SRA, MVT::v32i16, 1 }, // vpsravw
-
{ ISD::SHL, MVT::v64i8, 11 }, // vpblendvb sequence.
{ ISD::SRL, MVT::v64i8, 11 }, // vpblendvb sequence.
{ ISD::SRA, MVT::v64i8, 24 }, // vpblendvb sequence.
{ ISD::SRA, MVT::v4i64, 1 },
{ ISD::SRA, MVT::v8i64, 1 },
+ { ISD::MUL, MVT::v64i8, 26 }, // extend/pmullw/trunc sequence.
{ ISD::MUL, MVT::v32i8, 13 }, // extend/pmullw/trunc sequence.
{ ISD::MUL, MVT::v16i8, 5 }, // extend/pmullw/trunc sequence.
{ ISD::MUL, MVT::v16i32, 1 }, // pmulld (Skylake from agner.org)
{ ISD::SRL, MVT::v4i64, 1 },
};
+ if (ST->hasAVX512()) {
+ if (ISD == ISD::SHL && LT.second == MVT::v32i16 &&
+ (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
+ Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
+ // On AVX512, a packed v32i16 shift left by a constant build_vector
+ // is lowered into a vector multiply (vpmullw).
+ return getArithmeticInstrCost(Instruction::Mul, Ty, Op1Info, Op2Info,
+ TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None);
+ }
+
// Look for AVX2 lowering tricks.
if (ST->hasAVX2()) {
if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
static const CostTblEntry AVX2CostTable[] = {
{ ISD::SHL, MVT::v32i8, 11 }, // vpblendvb sequence.
+ { ISD::SHL, MVT::v64i8, 22 }, // 2*vpblendvb sequence.
    { ISD::SHL, MVT::v16i16, 10 }, // extend/vpsllvd/pack sequence.
+    { ISD::SHL, MVT::v32i16, 20 }, // 2*extend/vpsllvd/pack sequence.
{ ISD::SRL, MVT::v32i8, 11 }, // vpblendvb sequence.
+ { ISD::SRL, MVT::v64i8, 22 }, // 2*vpblendvb sequence.
{ ISD::SRL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence.
+ { ISD::SRL, MVT::v32i16, 20 }, // 2*extend/vpsrlvd/pack sequence.
{ ISD::SRA, MVT::v32i8, 24 }, // vpblendvb sequence.
+ { ISD::SRA, MVT::v64i8, 48 }, // 2*vpblendvb sequence.
{ ISD::SRA, MVT::v16i16, 10 }, // extend/vpsravd/pack sequence.
+ { ISD::SRA, MVT::v32i16, 20 }, // 2*extend/vpsravd/pack sequence.
{ ISD::SRA, MVT::v2i64, 4 }, // srl/xor/sub sequence.
{ ISD::SRA, MVT::v4i64, 4 }, // srl/xor/sub sequence.
{TTI::SK_Broadcast, MVT::v16f32, 1}, // vbroadcastps
{TTI::SK_Broadcast, MVT::v8i64, 1}, // vpbroadcastq
{TTI::SK_Broadcast, MVT::v16i32, 1}, // vpbroadcastd
+ {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
+ {TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb
{TTI::SK_Reverse, MVT::v8f64, 1}, // vpermpd
{TTI::SK_Reverse, MVT::v16f32, 1}, // vpermps
{TTI::SK_PermuteTwoSrc, MVT::v2f64, 1}, // vpermt2pd
{TTI::SK_PermuteTwoSrc, MVT::v4f32, 1}, // vpermt2ps
{TTI::SK_PermuteTwoSrc, MVT::v2i64, 1}, // vpermt2q
- {TTI::SK_PermuteTwoSrc, MVT::v4i32, 1} // vpermt2d
+ {TTI::SK_PermuteTwoSrc, MVT::v4i32, 1}, // vpermt2d
+
+ // FIXME: This just applies the type legalization cost rules above
+ // assuming these completely split.
+ {TTI::SK_PermuteSingleSrc, MVT::v32i16, 14},
+ {TTI::SK_PermuteSingleSrc, MVT::v64i8, 14},
+ {TTI::SK_PermuteTwoSrc, MVT::v32i16, 42},
+ {TTI::SK_PermuteTwoSrc, MVT::v64i8, 42},
};
if (ST->hasAVX512())
{ ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 1 },
{ ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 7 },// 2*vpmovqd+concat+vpmovdb
+ { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, 9 }, // FIXME
+
// v16i1 -> v16i32 - load + broadcast
{ ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 },
{ ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, 1 },
{ ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, 1 },
+ { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right
+ { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right
+
{ ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 },
{ ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 },
{ ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i8, 2 },
{ ISD::SELECT, MVT::v16i32, 1 },
{ ISD::SELECT, MVT::v8f64, 1 },
{ ISD::SELECT, MVT::v16f32, 1 },
+
+ { ISD::SETCC, MVT::v32i16, 2 }, // FIXME: should probably be 4
+ { ISD::SETCC, MVT::v64i8, 2 }, // FIXME: should probably be 4
+
+ { ISD::SELECT, MVT::v32i16, 2 }, // FIXME: should be 3
+ { ISD::SELECT, MVT::v64i8, 2 }, // FIXME: should be 3
};
static const CostTblEntry AVX2CostTbl[] = {
static const CostTblEntry AVX512CostTbl[] = {
{ ISD::BITREVERSE, MVT::v8i64, 36 },
{ ISD::BITREVERSE, MVT::v16i32, 24 },
+ { ISD::BITREVERSE, MVT::v32i16, 10 },
+ { ISD::BITREVERSE, MVT::v64i8, 10 },
{ ISD::CTLZ, MVT::v8i64, 29 },
{ ISD::CTLZ, MVT::v16i32, 35 },
+ { ISD::CTLZ, MVT::v32i16, 28 },
+ { ISD::CTLZ, MVT::v64i8, 18 },
{ ISD::CTPOP, MVT::v8i64, 16 },
{ ISD::CTPOP, MVT::v16i32, 24 },
+ { ISD::CTPOP, MVT::v32i16, 18 },
+ { ISD::CTPOP, MVT::v64i8, 12 },
{ ISD::CTTZ, MVT::v8i64, 20 },
{ ISD::CTTZ, MVT::v16i32, 28 },
+ { ISD::CTTZ, MVT::v32i16, 24 },
+ { ISD::CTTZ, MVT::v64i8, 18 },
{ ISD::USUBSAT, MVT::v16i32, 2 }, // pmaxud + psubd
{ ISD::USUBSAT, MVT::v2i64, 2 }, // pmaxuq + psubq
{ ISD::USUBSAT, MVT::v4i64, 2 }, // pmaxuq + psubq
{ ISD::UADDSAT, MVT::v2i64, 3 }, // not + pminuq + paddq
{ ISD::UADDSAT, MVT::v4i64, 3 }, // not + pminuq + paddq
{ ISD::UADDSAT, MVT::v8i64, 3 }, // not + pminuq + paddq
+ { ISD::SADDSAT, MVT::v32i16, 2 }, // FIXME: include split
+ { ISD::SADDSAT, MVT::v64i8, 2 }, // FIXME: include split
+ { ISD::SSUBSAT, MVT::v32i16, 2 }, // FIXME: include split
+ { ISD::SSUBSAT, MVT::v64i8, 2 }, // FIXME: include split
+ { ISD::UADDSAT, MVT::v32i16, 2 }, // FIXME: include split
+ { ISD::UADDSAT, MVT::v64i8, 2 }, // FIXME: include split
+ { ISD::USUBSAT, MVT::v32i16, 2 }, // FIXME: include split
+ { ISD::USUBSAT, MVT::v64i8, 2 }, // FIXME: include split
{ ISD::FMAXNUM, MVT::f32, 2 },
{ ISD::FMAXNUM, MVT::v4f32, 2 },
{ ISD::FMAXNUM, MVT::v8f32, 2 },
; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I16 = call i16 @llvm.smul.fix.i16(i16 undef, i16 undef, i32 3)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I16 = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> undef, <8 x i16> undef, i32 3)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call <16 x i16> @llvm.smul.fix.v16i16(<16 x i16> undef, <16 x i16> undef, i32 3)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call <32 x i16> @llvm.smul.fix.v32i16(<32 x i16> undef, <32 x i16> undef, i32 3)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V32I16 = call <32 x i16> @llvm.smul.fix.v32i16(<32 x i16> undef, <32 x i16> undef, i32 3)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = call i8 @llvm.smul.fix.i8(i8 undef, i8 undef, i32 3)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I8 = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> undef, <16 x i8> undef, i32 3)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V32I8 = call <32 x i8> @llvm.smul.fix.v32i8(<32 x i8> undef, <32 x i8> undef, i32 3)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V64I8 = call <64 x i8> @llvm.smul.fix.v64i8(<64 x i8> undef, <64 x i8> undef, i32 3)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V64I8 = call <64 x i8> @llvm.smul.fix.v64i8(<64 x i8> undef, <64 x i8> undef, i32 3)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512BW-LABEL: 'smul'
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I16 = call i16 @llvm.smul.fix.i16(i16 undef, i16 undef, i32 3)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I16 = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> undef, <8 x i16> undef, i32 3)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call <16 x i16> @llvm.smul.fix.v16i16(<16 x i16> undef, <16 x i16> undef, i32 3)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call <32 x i16> @llvm.smul.fix.v32i16(<32 x i16> undef, <32 x i16> undef, i32 3)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V32I16 = call <32 x i16> @llvm.smul.fix.v32i16(<32 x i16> undef, <32 x i16> undef, i32 3)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = call i8 @llvm.smul.fix.i8(i8 undef, i8 undef, i32 3)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I8 = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> undef, <16 x i8> undef, i32 3)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V32I8 = call <32 x i8> @llvm.smul.fix.v32i8(<32 x i8> undef, <32 x i8> undef, i32 3)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V64I8 = call <64 x i8> @llvm.smul.fix.v64i8(<64 x i8> undef, <64 x i8> undef, i32 3)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V64I8 = call <64 x i8> @llvm.smul.fix.v64i8(<64 x i8> undef, <64 x i8> undef, i32 3)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; SLM-LABEL: 'smul'
; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I16 = call i16 @llvm.umul.fix.i16(i16 undef, i16 undef, i32 3)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I16 = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> undef, <8 x i16> undef, i32 3)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call <16 x i16> @llvm.umul.fix.v16i16(<16 x i16> undef, <16 x i16> undef, i32 3)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call <32 x i16> @llvm.umul.fix.v32i16(<32 x i16> undef, <32 x i16> undef, i32 3)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V32I16 = call <32 x i16> @llvm.umul.fix.v32i16(<32 x i16> undef, <32 x i16> undef, i32 3)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = call i8 @llvm.umul.fix.i8(i8 undef, i8 undef, i32 3)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I8 = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> undef, <16 x i8> undef, i32 3)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V32I8 = call <32 x i8> @llvm.umul.fix.v32i8(<32 x i8> undef, <32 x i8> undef, i32 3)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V64I8 = call <64 x i8> @llvm.umul.fix.v64i8(<64 x i8> undef, <64 x i8> undef, i32 3)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V64I8 = call <64 x i8> @llvm.umul.fix.v64i8(<64 x i8> undef, <64 x i8> undef, i32 3)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512BW-LABEL: 'umul'
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I16 = call i16 @llvm.umul.fix.i16(i16 undef, i16 undef, i32 3)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I16 = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> undef, <8 x i16> undef, i32 3)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call <16 x i16> @llvm.umul.fix.v16i16(<16 x i16> undef, <16 x i16> undef, i32 3)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call <32 x i16> @llvm.umul.fix.v32i16(<32 x i16> undef, <32 x i16> undef, i32 3)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V32I16 = call <32 x i16> @llvm.umul.fix.v32i16(<32 x i16> undef, <32 x i16> undef, i32 3)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = call i8 @llvm.umul.fix.i8(i8 undef, i8 undef, i32 3)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I8 = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> undef, <16 x i8> undef, i32 3)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V32I8 = call <32 x i8> @llvm.umul.fix.v32i8(<32 x i8> undef, <32 x i8> undef, i32 3)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V64I8 = call <64 x i8> @llvm.umul.fix.v64i8(<64 x i8> undef, <64 x i8> undef, i32 3)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V64I8 = call <64 x i8> @llvm.umul.fix.v64i8(<64 x i8> undef, <64 x i8> undef, i32 3)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; SLM-LABEL: 'umul'
; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = or i16 undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = or <8 x i16> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = or <16 x i16> undef, undef
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = or <32 x i16> undef, undef
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = or <32 x i16> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = or i8 undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = or <16 x i8> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = or <32 x i8> undef, undef
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = or <64 x i8> undef, undef
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = or <64 x i8> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I1 = or i1 undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I1 = or <2 x i1> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I1 = or <4 x i1> undef, undef
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = or i16 undef, undef
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = or <8 x i16> undef, undef
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = or <16 x i16> undef, undef
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = or <32 x i16> undef, undef
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = or <32 x i16> undef, undef
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = or i8 undef, undef
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = or <16 x i8> undef, undef
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = or <32 x i8> undef, undef
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = or <64 x i8> undef, undef
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = or <64 x i8> undef, undef
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I1 = or i1 undef, undef
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I1 = or <2 x i1> undef, undef
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I1 = or <4 x i1> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = xor i16 undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = xor <8 x i16> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = xor <16 x i16> undef, undef
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = xor <32 x i16> undef, undef
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = xor <32 x i16> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = xor i8 undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = xor <16 x i8> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = xor <32 x i8> undef, undef
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = xor <64 x i8> undef, undef
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = xor <64 x i8> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I1 = xor i1 undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I1 = xor <2 x i1> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I1 = xor <4 x i1> undef, undef
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = xor i16 undef, undef
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = xor <8 x i16> undef, undef
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = xor <16 x i16> undef, undef
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = xor <32 x i16> undef, undef
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = xor <32 x i16> undef, undef
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = xor i8 undef, undef
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = xor <16 x i8> undef, undef
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = xor <32 x i8> undef, undef
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = xor <64 x i8> undef, undef
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = xor <64 x i8> undef, undef
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I1 = xor i1 undef, undef
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I1 = xor <2 x i1> undef, undef
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I1 = xor <4 x i1> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = and i16 undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = and <8 x i16> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = and <16 x i16> undef, undef
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = and <32 x i16> undef, undef
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = and <32 x i16> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = and i8 undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = and <16 x i8> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = and <32 x i8> undef, undef
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = and <64 x i8> undef, undef
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = and <64 x i8> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I1 = and i1 undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I1 = and <2 x i1> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I1 = and <4 x i1> undef, undef
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = and i16 undef, undef
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = and <8 x i16> undef, undef
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = and <16 x i16> undef, undef
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = and <32 x i16> undef, undef
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = and <32 x i16> undef, undef
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = and i8 undef, undef
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = and <16 x i8> undef, undef
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = and <32 x i8> undef, undef
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = and <64 x i8> undef, undef
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = and <64 x i8> undef, undef
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I1 = and i1 undef, undef
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I1 = and <2 x i1> undef, undef
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I1 = and <4 x i1> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 %c16)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; AVX512BW-LABEL: 'var_funnel_i16'
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 %c16)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; SLM-LABEL: 'var_funnel_i16'
; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 %c8)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; AVX512BW-LABEL: 'var_funnel_i8'
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 %c8)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; SLM-LABEL: 'var_funnel_i8'
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; AVX512BW-LABEL: 'splatvar_funnel_i16'
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; SLM-LABEL: 'splatvar_funnel_i16'
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
; AVX512F-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; AVX512BW-LABEL: 'splatvar_funnel_i8'
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; SLM-LABEL: 'splatvar_funnel_i8'
; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 7)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; AVX512BW-LABEL: 'constant_funnel_i16'
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 7)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; SLM-LABEL: 'constant_funnel_i16'
; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 7)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; AVX512BW-LABEL: 'constant_funnel_i8'
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 7)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; SLM-LABEL: 'constant_funnel_i8'
; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 3)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; AVX512BW-LABEL: 'splatconstant_funnel_i16'
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 3)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; SLM-LABEL: 'splatconstant_funnel_i16'
; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; AVX512BW-LABEL: 'splatconstant_funnel_i8'
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; SLM-LABEL: 'splatconstant_funnel_i8'
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 %c16)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; AVX512BW-LABEL: 'var_rotate_i16'
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 %c16)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; SLM-LABEL: 'var_rotate_i16'
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 %c8)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; AVX512BW-LABEL: 'var_rotate_i8'
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 %c8)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; SLM-LABEL: 'var_rotate_i8'
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; AVX512BW-LABEL: 'splatvar_rotate_i16'
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; SLM-LABEL: 'splatvar_rotate_i16'
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
; AVX512F-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; AVX512BW-LABEL: 'splatvar_rotate_i8'
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; SLM-LABEL: 'splatvar_rotate_i8'
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 7)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; AVX512BW-LABEL: 'constant_rotate_i16'
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 7)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; SLM-LABEL: 'constant_rotate_i16'
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 7)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; AVX512BW-LABEL: 'constant_rotate_i8'
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 7)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; SLM-LABEL: 'constant_rotate_i8'
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 3)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; AVX512BW-LABEL: 'splatconstant_rotate_i16'
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 3)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; SLM-LABEL: 'splatconstant_rotate_i16'
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; AVX512BW-LABEL: 'splatconstant_rotate_i8'
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; SLM-LABEL: 'splatconstant_rotate_i8'
; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 %c16)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; AVX512BW-LABEL: 'var_funnel_i16'
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 %c16)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; SLM-LABEL: 'var_funnel_i16'
; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 %c8)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; AVX512BW-LABEL: 'var_funnel_i8'
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 %c8)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; SLM-LABEL: 'var_funnel_i8'
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; AVX512BW-LABEL: 'splatvar_funnel_i16'
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; SLM-LABEL: 'splatvar_funnel_i16'
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
; AVX512F-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; AVX512BW-LABEL: 'splatvar_funnel_i8'
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; SLM-LABEL: 'splatvar_funnel_i8'
; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 7)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; AVX512BW-LABEL: 'constant_funnel_i16'
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 7)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; SLM-LABEL: 'constant_funnel_i16'
; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 7)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; AVX512BW-LABEL: 'constant_funnel_i8'
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 7)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; SLM-LABEL: 'constant_funnel_i8'
; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 3)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; AVX512BW-LABEL: 'splatconstant_funnel_i16'
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 3)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; SLM-LABEL: 'splatconstant_funnel_i16'
; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 3)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; AVX512BW-LABEL: 'splatconstant_funnel_i8'
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 3)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
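The funnel-shift hunks above show the whole visible effect on AVX512F/AVX512DQ targets: the old 512-bit expectations were an exact doubling of the 256-bit cost, and each new one comes in exactly one unit lower, consistent with the 512-bit type now being handled as a single ZMM value rather than two independent YMM halves. A standalone digest of those numbers (values copied from the FileCheck lines above, not produced by any LLVM API) makes the pattern easy to verify:

#include <cassert>

int main() {
  // {256-bit cost, old 512-bit cost, new 512-bit cost} for the splat-constant
  // fshr rows above (the AVX512F and AVX512DQ expectations agree).
  struct Row { unsigned C256, Old512, New512; };
  const Row Fsh[] = {
      {6, 12, 11}, // llvm.fshr.v32i16
      {8, 16, 15}, // llvm.fshr.v64i8
  };
  for (const Row &F : Fsh) {
    assert(F.Old512 == 2 * F.C256);   // old model: flat doubling of YMM cost
    assert(F.New512 == F.Old512 - 1); // new model: one unit cheaper
  }
  return 0;
}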
; SLM-LABEL: 'splatconstant_funnel_i8'
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 %c16)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; AVX512BW-LABEL: 'var_rotate_i16'
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 %c16)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; SLM-LABEL: 'var_rotate_i16'
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 %c8)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; AVX512BW-LABEL: 'var_rotate_i8'
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 %c8)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; SLM-LABEL: 'var_rotate_i8'
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; AVX512BW-LABEL: 'splatvar_rotate_i16'
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; SLM-LABEL: 'splatvar_rotate_i16'
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
; AVX512F-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; AVX512BW-LABEL: 'splatvar_rotate_i8'
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; SLM-LABEL: 'splatvar_rotate_i8'
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 7)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; AVX512BW-LABEL: 'constant_rotate_i16'
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 7)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; SLM-LABEL: 'constant_rotate_i16'
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 7)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; AVX512BW-LABEL: 'constant_rotate_i8'
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 7)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; SLM-LABEL: 'constant_rotate_i8'
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 3)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; AVX512BW-LABEL: 'splatconstant_rotate_i16'
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 3)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; SLM-LABEL: 'splatconstant_rotate_i16'
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; AVX512BW-LABEL: 'splatconstant_rotate_i8'
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; SLM-LABEL: 'splatconstant_rotate_i8'
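The rotate hunks follow the same shape: every old 512-bit expectation was a flat doubling of the corresponding 256-bit cost, and every new one is one or two units lower. The digest below (numbers copied from the AVX512F/AVX512DQ rows above; the grouping comments are editorial) checks both halves of that claim:

#include <cassert>

int main() {
  // {256-bit cost, old 512-bit cost, new 512-bit cost}
  struct Row { unsigned C256, Old512, New512; };
  const Row Rot[] = {
      {23, 46, 44}, {25, 50, 48}, // variable rotate, i16 / i8
      {5, 10, 8},   {25, 50, 48}, // splat-variable rotate, i16 / i8
      {13, 26, 25}, {24, 48, 47}, // per-lane-constant rotate, i16 / i8
      {4, 8, 7},    {6, 12, 11},  // splat-constant rotate, i16 / i8
  };
  for (const Row &R : Rot) {
    assert(R.Old512 == 2 * R.C256); // old model: two independent YMM ops
    assert(R.New512 < R.Old512);    // new model shaves 1-2 off each rotate
  }
  return 0;
}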
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp ne <32 x i8> undef, undef
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp ne <64 x i8> undef, undef
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp ne <128 x i8> undef, undef
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64I8 = icmp ne <64 x i8> undef, undef
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V128I8 = icmp ne <128 x i8> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne <8 x i16> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp ne <16 x i16> undef, undef
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp ne <32 x i16> undef, undef
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp ne <64 x i16> undef, undef
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I16 = icmp ne <32 x i16> undef, undef
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I16 = icmp ne <64 x i16> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ne <4 x i32> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ne <8 x i32> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sge <32 x i8> undef, undef
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sge <64 x i8> undef, undef
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sge <128 x i8> undef, undef
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64I8 = icmp sge <64 x i8> undef, undef
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V128I8 = icmp sge <128 x i8> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sge <16 x i16> undef, undef
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sge <32 x i16> undef, undef
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sge <64 x i16> undef, undef
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I16 = icmp sge <32 x i16> undef, undef
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I16 = icmp sge <64 x i16> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sge <4 x i32> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp sge <8 x i32> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp uge <32 x i8> undef, undef
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp uge <64 x i8> undef, undef
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp uge <128 x i8> undef, undef
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64I8 = icmp uge <64 x i8> undef, undef
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V128I8 = icmp uge <128 x i8> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp uge <8 x i16> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp uge <16 x i16> undef, undef
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp uge <32 x i16> undef, undef
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp uge <64 x i16> undef, undef
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I16 = icmp uge <32 x i16> undef, undef
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I16 = icmp uge <64 x i16> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp uge <4 x i32> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp uge <8 x i32> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp ugt <32 x i8> undef, undef
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp ugt <64 x i8> undef, undef
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp ugt <128 x i8> undef, undef
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp ugt <64 x i8> undef, undef
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp ugt <128 x i8> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ugt <8 x i16> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp ugt <16 x i16> undef, undef
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp ugt <32 x i16> undef, undef
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp ugt <64 x i16> undef, undef
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp ugt <32 x i16> undef, undef
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp ugt <64 x i16> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ugt <4 x i32> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ugt <8 x i32> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sle <32 x i8> undef, undef
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sle <64 x i8> undef, undef
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sle <128 x i8> undef, undef
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64I8 = icmp sle <64 x i8> undef, undef
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V128I8 = icmp sle <128 x i8> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x i16> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sle <16 x i16> undef, undef
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sle <32 x i16> undef, undef
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sle <64 x i16> undef, undef
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I16 = icmp sle <32 x i16> undef, undef
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I16 = icmp sle <64 x i16> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sle <4 x i32> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp sle <8 x i32> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp ule <32 x i8> undef, undef
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp ule <64 x i8> undef, undef
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp ule <128 x i8> undef, undef
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64I8 = icmp ule <64 x i8> undef, undef
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V128I8 = icmp ule <128 x i8> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ule <8 x i16> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp ule <16 x i16> undef, undef
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp ule <32 x i16> undef, undef
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp ule <64 x i16> undef, undef
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I16 = icmp ule <32 x i16> undef, undef
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I16 = icmp ule <64 x i16> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ule <4 x i32> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ule <8 x i32> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp ult <32 x i8> undef, undef
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp ult <64 x i8> undef, undef
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp ult <128 x i8> undef, undef
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp ult <64 x i8> undef, undef
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp ult <128 x i8> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ult <8 x i16> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp ult <16 x i16> undef, undef
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp ult <32 x i16> undef, undef
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp ult <64 x i16> undef, undef
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp ult <32 x i16> undef, undef
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp ult <64 x i16> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ult <4 x i32> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ult <8 x i32> undef, undef
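The icmp changes above settle into two clean invariants across all eight predicates: vectors wider than 512 bits now cost exactly twice their 512-bit counterpart (they split into ZMM-sized halves), and the 512-bit compare itself is cheaper than two independent 256-bit compares. A small check over the values from this hunk (copied verbatim; the per-predicate grouping is an observation, not an LLVM table):

#include <cassert>

int main() {
  // {256-bit, new 512-bit, new 1024-bit} AVX512F icmp costs from above.
  struct Row { unsigned C256, C512, C1024; };
  const Row Cmp[] = {
      {2, 3, 6}, // ne and sge/uge/sle/ule predicates
      {3, 4, 8}, // ugt/ult predicates
  };
  for (const Row &R : Cmp) {
    assert(R.C1024 == 2 * R.C512); // >512-bit vectors split into ZMM halves
    assert(R.C512 < 2 * R.C256);   // one ZMM compare beats two YMM compares
  }
  return 0;
}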
; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* undef, i32 1, <8 x i1> undef, <8 x i32> undef)
; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* undef, i32 1, <4 x i1> undef, <4 x i32> undef)
; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0v32i16(<32 x i16>* undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0v32i16(<32 x i16>* undef, i32 1, <32 x i1> undef, <32 x i16> undef)
; KNL-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0v16i16(<16 x i16>* undef, i32 1, <16 x i1> undef, <16 x i16> undef)
; KNL-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* undef, i32 1, <8 x i1> undef, <8 x i16> undef)
; KNL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 352 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 400 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* undef, i32 1, <64 x i1> undef, <64 x i8> undef)
; KNL-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0v32i8(<32 x i8>* undef, i32 1, <32 x i1> undef, <32 x i8> undef)
; KNL-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* undef, i32 1, <16 x i1> undef, <16 x i8> undef)
; KNL-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* undef, i32 1, <8 x i1> undef, <8 x i8> undef)
; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> undef, <8 x i32>* undef, i32 1, <8 x i1> undef)
; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> undef, <4 x i32>* undef, i32 1, <4 x i1> undef)
; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> undef, <2 x i32>* undef, i32 1, <2 x i1> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.store.v32i16.p0v32i16(<32 x i16> undef, <32 x i16>* undef, i32 1, <32 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 168 for instruction: call void @llvm.masked.store.v32i16.p0v32i16(<32 x i16> undef, <32 x i16>* undef, i32 1, <32 x i1> undef)
; KNL-NEXT: Cost Model: Found an estimated cost of 72 for instruction: call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> undef, <16 x i16>* undef, i32 1, <16 x i1> undef)
; KNL-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> undef, <8 x i16>* undef, i32 1, <8 x i1> undef)
; KNL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> undef, <4 x i16>* undef, i32 1, <4 x i1> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 320 for instruction: call void @llvm.masked.store.v64i8.p0v64i8(<64 x i8> undef, <64 x i8>* undef, i32 1, <64 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 352 for instruction: call void @llvm.masked.store.v64i8.p0v64i8(<64 x i8> undef, <64 x i8>* undef, i32 1, <64 x i1> undef)
; KNL-NEXT: Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.store.v32i8.p0v32i8(<32 x i8> undef, <32 x i8>* undef, i32 1, <32 x i1> undef)
; KNL-NEXT: Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> undef, <16 x i8>* undef, i32 1, <16 x i1> undef)
; KNL-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> undef, <8 x i8>* undef, i32 1, <8 x i1> undef)
; KNL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
; KNL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
; KNL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0i16(<32 x i16*> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0i16(<32 x i16*> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
; KNL-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0i16(<16 x i16*> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
; KNL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
; KNL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0i8(<64 x i8*> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0i8(<64 x i8*> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
; KNL-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
; KNL-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
; KNL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
; KNL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> undef, <8 x i32*> undef, i32 1, <8 x i1> undef)
; KNL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> undef, <4 x i32*> undef, i32 1, <4 x i1> undef)
; KNL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> undef, <2 x i32*> undef, i32 1, <2 x i1> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.v32i16.v32p0i16(<32 x i16> undef, <32 x i16*> undef, i32 1, <32 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 88 for instruction: call void @llvm.masked.scatter.v32i16.v32p0i16(<32 x i16> undef, <32 x i16*> undef, i32 1, <32 x i1> undef)
; KNL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.v16i16.v16p0i16(<16 x i16> undef, <16 x i16*> undef, i32 1, <16 x i1> undef)
; KNL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> undef, <8 x i16*> undef, i32 1, <8 x i1> undef)
; KNL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> undef, <4 x i16*> undef, i32 1, <4 x i1> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.scatter.v64i8.v64p0i8(<64 x i8> undef, <64 x i8*> undef, i32 1, <64 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 176 for instruction: call void @llvm.masked.scatter.v64i8.v64p0i8(<64 x i8> undef, <64 x i8*> undef, i32 1, <64 x i1> undef)
; KNL-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.v32i8.v32p0i8(<32 x i8> undef, <32 x i8*> undef, i32 1, <32 x i1> undef)
; KNL-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> undef, <16 x i8*> undef, i32 1, <16 x i1> undef)
; KNL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8> undef, <8 x i8*> undef, i32 1, <8 x i1> undef)
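The KNL masked load/store/gather/scatter rows move in the opposite direction: every 512-bit i8/i16 cost goes up, and each new total now exceeds twice the corresponding 256-bit cost, presumably reflecting the extra split traffic once the operation starts from a single ZMM value. The digest below collects the before/after pairs from the hunks above (values copied from the KNL expectations; the inequality checks are editorial):

#include <cassert>

int main() {
  // {256-bit cost, old 512-bit cost, new 512-bit cost}
  struct Row { unsigned C256, Old512, New512; };
  const Row Mem[] = {
      {80, 176, 192},  // masked.load  v32i16
      {176, 352, 400}, // masked.load  v64i8
      {72, 160, 168},  // masked.store v32i16
      {160, 320, 352}, // masked.store v64i8
      {48, 96, 112},   // gather  v32i16
      {96, 192, 224},  // gather  v64i8
      {40, 80, 88},    // scatter v32i16
      {80, 160, 176},  // scatter v64i8
  };
  for (const Row &M : Mem) {
    assert(M.New512 > M.Old512);   // every 512-bit cost increased
    assert(M.New512 > 2 * M.C256); // and now exceeds two half-width ops
  }
  return 0;
}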
; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(i8* undef, <8 x i1> undef, <8 x i8> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
;
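The hunk below folds the previously separate KNL and SKX 'masked_expandload' blocks into one shared AVX512 prefix. That merge is only valid because KNL's i8/i16 rows now land on the values SKX already had; every other row was identical before the change. A quick check over the rows that used to differ (costs copied from the deleted and added lines below):

#include <cassert>

int main() {
  // {old KNL cost, old SKX cost, shared new AVX512 cost}
  struct Row { unsigned KNLOld, SKXOld, SharedNew; };
  const Row Exp[] = {
      {96, 112, 112},  // llvm.masked.expandload.v32i16
      {192, 224, 224}, // llvm.masked.expandload.v64i8
  };
  for (const Row &E : Exp) {
    assert(E.SharedNew == E.SKXOld); // SKX rows are unchanged
    assert(E.SharedNew > E.KNLOld);  // KNL i8/i16 rows got more expensive
  }
  return 0;
}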
-; KNL-LABEL: 'masked_expandload'
-; KNL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(double* undef, <8 x i1> undef, <8 x double> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(double* undef, <4 x i1> undef, <4 x double> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(double* undef, <2 x i1> undef, <2 x double> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(double* undef, <1 x i1> undef, <1 x double> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(float* undef, <16 x i1> undef, <16 x float> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(float* undef, <8 x i1> undef, <8 x float> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(float* undef, <4 x i1> undef, <4 x float> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(float* undef, <2 x i1> undef, <2 x float> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(i64* undef, <8 x i1> undef, <8 x i64> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(i64* undef, <4 x i1> undef, <4 x i64> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(i64* undef, <2 x i1> undef, <2 x i64> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(i64* undef, <1 x i1> undef, <1 x i64> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(i32* undef, <16 x i1> undef, <16 x i32> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(i32* undef, <8 x i1> undef, <8 x i32> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(i32* undef, <4 x i1> undef, <4 x i32> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(i32* undef, <2 x i1> undef, <2 x i32> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(i16* undef, <32 x i1> undef, <32 x i16> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(i16* undef, <16 x i1> undef, <16 x i16> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(i16* undef, <8 x i1> undef, <8 x i16> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(i16* undef, <4 x i1> undef, <4 x i16> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(i8* undef, <64 x i1> undef, <64 x i8> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(i8* undef, <32 x i1> undef, <32 x i8> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(i8* undef, <16 x i1> undef, <16 x i8> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(i8* undef, <8 x i1> undef, <8 x i8> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
-;
-; SKX-LABEL: 'masked_expandload'
-; SKX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(double* undef, <8 x i1> undef, <8 x double> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(double* undef, <4 x i1> undef, <4 x double> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(double* undef, <2 x i1> undef, <2 x double> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(double* undef, <1 x i1> undef, <1 x double> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(float* undef, <16 x i1> undef, <16 x float> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(float* undef, <8 x i1> undef, <8 x float> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(float* undef, <4 x i1> undef, <4 x float> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(float* undef, <2 x i1> undef, <2 x float> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(i64* undef, <8 x i1> undef, <8 x i64> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(i64* undef, <4 x i1> undef, <4 x i64> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(i64* undef, <2 x i1> undef, <2 x i64> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(i64* undef, <1 x i1> undef, <1 x i64> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(i32* undef, <16 x i1> undef, <16 x i32> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(i32* undef, <8 x i1> undef, <8 x i32> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(i32* undef, <4 x i1> undef, <4 x i32> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(i32* undef, <2 x i1> undef, <2 x i32> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(i16* undef, <32 x i1> undef, <32 x i16> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(i16* undef, <16 x i1> undef, <16 x i16> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(i16* undef, <8 x i1> undef, <8 x i16> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(i16* undef, <4 x i1> undef, <4 x i16> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(i8* undef, <64 x i1> undef, <64 x i8> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(i8* undef, <32 x i1> undef, <32 x i8> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(i8* undef, <16 x i1> undef, <16 x i8> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(i8* undef, <8 x i1> undef, <8 x i8> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
+; AVX512-LABEL: 'masked_expandload'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(double* undef, <8 x i1> undef, <8 x double> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(double* undef, <4 x i1> undef, <4 x double> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(double* undef, <2 x i1> undef, <2 x double> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(double* undef, <1 x i1> undef, <1 x double> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(float* undef, <16 x i1> undef, <16 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(float* undef, <8 x i1> undef, <8 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(float* undef, <4 x i1> undef, <4 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(float* undef, <2 x i1> undef, <2 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(i64* undef, <8 x i1> undef, <8 x i64> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(i64* undef, <4 x i1> undef, <4 x i64> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(i64* undef, <2 x i1> undef, <2 x i64> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(i64* undef, <1 x i1> undef, <1 x i64> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(i32* undef, <16 x i1> undef, <16 x i32> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(i32* undef, <8 x i1> undef, <8 x i32> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(i32* undef, <4 x i1> undef, <4 x i32> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(i32* undef, <2 x i1> undef, <2 x i32> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(i16* undef, <32 x i1> undef, <32 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(i16* undef, <16 x i1> undef, <16 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(i16* undef, <8 x i1> undef, <8 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(i16* undef, <4 x i1> undef, <4 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(i8* undef, <64 x i1> undef, <64 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(i8* undef, <32 x i1> undef, <32 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(i8* undef, <16 x i1> undef, <16 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(i8* undef, <8 x i1> undef, <8 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
;
%V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(double* undef, <8 x i1> undef, <8 x double> undef)
%V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(double* undef, <4 x i1> undef, <4 x double> undef)
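For reference, expectations like the expandload block above can be reproduced in isolation. The snippet below is a minimal sketch, not taken from the test files themselves: the function name, file name, and exact RUN flags are illustrative, though `opt -cost-model -analyze` with `-mtriple`/`-mattr` is the usual legacy way such cost lines are generated.
; Hypothetical reproducer: query the v32i16 expandload cost on an
; AVX512F-only target, e.g. via:
;   opt -cost-model -analyze -mtriple=x86_64-- -mattr=+avx512f reproducer.ll
declare <32 x i16> @llvm.masked.expandload.v32i16(i16*, <32 x i1>, <32 x i16>)

define <32 x i16> @expandload_v32i16(i16* %ptr, <32 x i1> %mask, <32 x i16> %passthru) {
  %res = call <32 x i16> @llvm.masked.expandload.v32i16(i16* %ptr, <32 x i1> %mask, <32 x i16> %passthru)
  ret <32 x i16> %res
}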
; AVX-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, i8* undef, <8 x i1> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
;
-; KNL-LABEL: 'masked_compressstore'
-; KNL-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, double* undef, <8 x i1> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, double* undef, <4 x i1> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, double* undef, <2 x i1> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, double* undef, <1 x i1> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, float* undef, <16 x i1> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, float* undef, <8 x i1> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, float* undef, <4 x i1> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, float* undef, <2 x i1> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, i64* undef, <8 x i1> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, i64* undef, <4 x i1> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, i64* undef, <2 x i1> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, i64* undef, <1 x i1> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 59 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, i32* undef, <16 x i1> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, i32* undef, <8 x i1> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, i32* undef, <4 x i1> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, i32* undef, <2 x i1> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 111 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, i16* undef, <32 x i1> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, i16* undef, <16 x i1> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, i16* undef, <8 x i1> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, i16* undef, <4 x i1> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 223 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, i8* undef, <64 x i1> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 111 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, i8* undef, <32 x i1> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, i8* undef, <16 x i1> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, i8* undef, <8 x i1> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
-;
-; SKX-LABEL: 'masked_compressstore'
-; SKX-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, double* undef, <8 x i1> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, double* undef, <4 x i1> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, double* undef, <2 x i1> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, double* undef, <1 x i1> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, float* undef, <16 x i1> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, float* undef, <8 x i1> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, float* undef, <4 x i1> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, float* undef, <2 x i1> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, i64* undef, <8 x i1> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, i64* undef, <4 x i1> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, i64* undef, <2 x i1> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, i64* undef, <1 x i1> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 59 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, i32* undef, <16 x i1> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, i32* undef, <8 x i1> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, i32* undef, <4 x i1> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, i32* undef, <2 x i1> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 119 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, i16* undef, <32 x i1> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, i16* undef, <16 x i1> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, i16* undef, <8 x i1> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, i16* undef, <4 x i1> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 239 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, i8* undef, <64 x i1> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 111 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, i8* undef, <32 x i1> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, i8* undef, <16 x i1> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, i8* undef, <8 x i1> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
+; AVX512-LABEL: 'masked_compressstore'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, double* undef, <8 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, double* undef, <4 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, double* undef, <2 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, double* undef, <1 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, float* undef, <16 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, float* undef, <8 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, float* undef, <4 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, float* undef, <2 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, i64* undef, <8 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, i64* undef, <4 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, i64* undef, <2 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, i64* undef, <1 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 59 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, i32* undef, <16 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, i32* undef, <8 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, i32* undef, <4 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, i32* undef, <2 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 119 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, i16* undef, <32 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, i16* undef, <16 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, i16* undef, <8 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, i16* undef, <4 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 239 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, i8* undef, <64 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 111 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, i8* undef, <32 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, i8* undef, <16 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, i8* undef, <8 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
;
call void @llvm.masked.compressstore.v8f64(<8 x double> undef, double* undef, <8 x i1> undef)
call void @llvm.masked.compressstore.v4f64(<4 x double> undef, double* undef, <4 x i1> undef)
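The compressstore costs above exercise the store-side counterpart of expandload. A minimal sketch of one such call follows (again with an illustrative function name); note the operand order of the intrinsic, which is visible in the checked calls: value, pointer, mask, with a void result.
; Hypothetical reproducer for the widest byte case checked above:
declare void @llvm.masked.compressstore.v64i8(<64 x i8>, i8*, <64 x i1>)

define void @compressstore_v64i8(<64 x i8> %val, i8* %ptr, <64 x i1> %mask) {
  call void @llvm.masked.compressstore.v64i8(<64 x i8> %val, i8* %ptr, <64 x i1> %mask)
  ret void
}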
; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512BW-LABEL: 'reduce_i16'
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; SLM-LABEL: 'reduce_i16'
; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512BW-LABEL: 'reduce_i8'
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; SLM-LABEL: 'reduce_i8'
; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.v64i16(<64 x i16> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
-; AVX512F-LABEL: 'reduce_i16'
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.and.v2i16(<2 x i16> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.v8i16(<8 x i16> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.v32i16(<32 x i16> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.v64i16(<64 x i16> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; AVX512BW-LABEL: 'reduce_i16'
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.and.v2i16(<2 x i16> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.v8i16(<8 x i16> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.v32i16(<32 x i16> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.v64i16(<64 x i16> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; AVX512DQ-LABEL: 'reduce_i16'
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.and.v2i16(<2 x i16> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.v8i16(<8 x i16> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.v32i16(<32 x i16> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.v64i16(<64 x i16> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512-LABEL: 'reduce_i16'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.and.v2i16(<2 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.v8i16(<8 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.v32i16(<32 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.v64i16(<64 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
%V2 = call i16 @llvm.experimental.vector.reduce.and.v2i16(<2 x i16> undef)
%V4 = call i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.v128i8(<128 x i8> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
-; AVX512F-LABEL: 'reduce_i8'
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.and.v2i8(<2 x i8> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.and.v4i8(<4 x i8> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.v16i8(<16 x i8> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.v64i8(<64 x i8> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.v128i8(<128 x i8> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; AVX512BW-LABEL: 'reduce_i8'
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.and.v2i8(<2 x i8> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.and.v4i8(<4 x i8> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.v16i8(<16 x i8> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.v64i8(<64 x i8> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.v128i8(<128 x i8> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; AVX512DQ-LABEL: 'reduce_i8'
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.and.v2i8(<2 x i8> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.and.v4i8(<4 x i8> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.v16i8(<16 x i8> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.v64i8(<64 x i8> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.v128i8(<128 x i8> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512-LABEL: 'reduce_i8'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.and.v2i8(<2 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.and.v4i8(<4 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.v16i8(<16 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.v64i8(<64 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.v128i8(<128 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
%V2 = call i8 @llvm.experimental.vector.reduce.and.v2i8(<2 x i8> undef)
%V4 = call i8 @llvm.experimental.vector.reduce.and.v4i8(<4 x i8> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.v32i16(<32 x i16> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.v64i16(<64 x i16> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.v32i16(<32 x i16> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.v64i16(<64 x i16> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512BW-LABEL: 'reduce_i16'
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.v32i16(<32 x i16> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.v64i16(<64 x i16> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.v32i16(<32 x i16> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.v64i16(<64 x i16> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
%V2 = call i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.v128i8(<128 x i8> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.v128i8(<128 x i8> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512BW-LABEL: 'reduce_i8'
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.v128i8(<128 x i8> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.v128i8(<128 x i8> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
%V2 = call i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8> undef)
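The reduction hunks above and below update only the AVX512F and AVX512DQ expectations for i8/i16 vectors of 512 bits and wider, while the AVX512BW lines are left alone. A sketch for checking one such case side by side (illustrative function name; the comparison between feature strings is an assumption about how one would drive it, not a RUN line from the tests):
; Hypothetical reproducer: run once with -mattr=+avx512f and once with
; -mattr=+avx512bw to compare the v64i8 add-reduction cost:
declare i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8>)

define i8 @reduce_add_v64i8(<64 x i8> %v) {
  %r = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> %v)
  ret i8 %r
}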
; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.v64i16(<64 x i16> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
-; AVX512F-LABEL: 'reduce_i16'
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.or.v2i16(<2 x i16> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.v4i16(<4 x i16> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.v8i16(<8 x i16> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.v16i16(<16 x i16> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.v32i16(<32 x i16> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.v64i16(<64 x i16> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; AVX512BW-LABEL: 'reduce_i16'
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.or.v2i16(<2 x i16> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.v4i16(<4 x i16> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.v8i16(<8 x i16> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.v16i16(<16 x i16> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.v32i16(<32 x i16> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.v64i16(<64 x i16> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; AVX512DQ-LABEL: 'reduce_i16'
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.or.v2i16(<2 x i16> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.v4i16(<4 x i16> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.v8i16(<8 x i16> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.v16i16(<16 x i16> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.v32i16(<32 x i16> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.v64i16(<64 x i16> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512-LABEL: 'reduce_i16'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.or.v2i16(<2 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.v4i16(<4 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.v8i16(<8 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.v16i16(<16 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.v32i16(<32 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.v64i16(<64 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
%V2 = call i16 @llvm.experimental.vector.reduce.or.v2i16(<2 x i16> undef)
%V4 = call i16 @llvm.experimental.vector.reduce.or.v4i16(<4 x i16> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.v128i8(<128 x i8> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
-; AVX512F-LABEL: 'reduce_i8'
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.or.v2i8(<2 x i8> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.or.v4i8(<4 x i8> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.v8i8(<8 x i8> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.v16i8(<16 x i8> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.v32i8(<32 x i8> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.v64i8(<64 x i8> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.v128i8(<128 x i8> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; AVX512BW-LABEL: 'reduce_i8'
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.or.v2i8(<2 x i8> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.or.v4i8(<4 x i8> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.v8i8(<8 x i8> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.v16i8(<16 x i8> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.v32i8(<32 x i8> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.v64i8(<64 x i8> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.v128i8(<128 x i8> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; AVX512DQ-LABEL: 'reduce_i8'
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.or.v2i8(<2 x i8> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.or.v4i8(<4 x i8> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.v8i8(<8 x i8> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.v16i8(<16 x i8> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.v32i8(<32 x i8> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.v64i8(<64 x i8> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.v128i8(<128 x i8> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512-LABEL: 'reduce_i8'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.or.v2i8(<2 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.or.v4i8(<4 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.v8i8(<8 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.v16i8(<16 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.v32i8(<32 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.v64i8(<64 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.v128i8(<128 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
%V2 = call i8 @llvm.experimental.vector.reduce.or.v2i8(<2 x i8> undef)
%V4 = call i8 @llvm.experimental.vector.reduce.or.v4i8(<4 x i8> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.v32i16(<32 x i16> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.v64i16(<64 x i16> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.v32i16(<32 x i16> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.v64i16(<64 x i16> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512BW-LABEL: 'reduce_i16'
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.v32i16(<32 x i16> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.v64i16(<64 x i16> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.v32i16(<32 x i16> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.v64i16(<64 x i16> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
%V2 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.v128i8(<128 x i8> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.v128i8(<128 x i8> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512BW-LABEL: 'reduce_i8'
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.v128i8(<128 x i8> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.v128i8(<128 x i8> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
%V2 = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.v32i16(<32 x i16> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.v64i16(<64 x i16> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.v32i16(<32 x i16> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.v64i16(<64 x i16> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512BW-LABEL: 'reduce_i16'
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.v32i16(<32 x i16> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.v64i16(<64 x i16> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.v32i16(<32 x i16> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.v64i16(<64 x i16> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
%V2 = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.v128i8(<128 x i8> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.v128i8(<128 x i8> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512BW-LABEL: 'reduce_i8'
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.v128i8(<128 x i8> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.v128i8(<128 x i8> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
%V2 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.v32i16(<32 x i16> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.v64i16(<64 x i16> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.v32i16(<32 x i16> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.v64i16(<64 x i16> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512BW-LABEL: 'reduce_i16'
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.v32i16(<32 x i16> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.v64i16(<64 x i16> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.v32i16(<32 x i16> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.v64i16(<64 x i16> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
%V2 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.v128i8(<128 x i8> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.v128i8(<128 x i8> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512BW-LABEL: 'reduce_i8'
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.v128i8(<128 x i8> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.v128i8(<128 x i8> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
%V2 = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.v32i16(<32 x i16> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.v64i16(<64 x i16> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.v32i16(<32 x i16> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.v64i16(<64 x i16> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512BW-LABEL: 'reduce_i16'
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.v32i16(<32 x i16> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.v64i16(<64 x i16> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.v32i16(<32 x i16> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.v64i16(<64 x i16> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
%V2 = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.v128i8(<128 x i8> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.v128i8(<128 x i8> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512BW-LABEL: 'reduce_i8'
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.v128i8(<128 x i8> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.v128i8(<128 x i8> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
%V2 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.v64i16(<64 x i16> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
-; AVX512F-LABEL: 'reduce_i16'
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.xor.v2i16(<2 x i16> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.v4i16(<4 x i16> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.v8i16(<8 x i16> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.v16i16(<16 x i16> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.v32i16(<32 x i16> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.v64i16(<64 x i16> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; AVX512BW-LABEL: 'reduce_i16'
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.xor.v2i16(<2 x i16> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.v4i16(<4 x i16> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.v8i16(<8 x i16> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.v16i16(<16 x i16> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.v32i16(<32 x i16> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.v64i16(<64 x i16> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; AVX512DQ-LABEL: 'reduce_i16'
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.xor.v2i16(<2 x i16> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.v4i16(<4 x i16> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.v8i16(<8 x i16> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.v16i16(<16 x i16> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.v32i16(<32 x i16> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.v64i16(<64 x i16> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512-LABEL: 'reduce_i16'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.xor.v2i16(<2 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.v4i16(<4 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.v8i16(<8 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.v16i16(<16 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.v32i16(<32 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.v64i16(<64 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
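+; Note: bitwise reductions need only 512-bit logic ops, which plain AVX512F
+; already provides, so the AVX512F/AVX512BW/AVX512DQ runs share a single
+; common AVX512 check prefix.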
;
%V2 = call i16 @llvm.experimental.vector.reduce.xor.v2i16(<2 x i16> undef)
%V4 = call i16 @llvm.experimental.vector.reduce.xor.v4i16(<4 x i16> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.v128i8(<128 x i8> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
-; AVX512F-LABEL: 'reduce_i8'
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.xor.v2i8(<2 x i8> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.xor.v4i8(<4 x i8> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.v8i8(<8 x i8> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.v16i8(<16 x i8> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.v32i8(<32 x i8> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.v64i8(<64 x i8> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.v128i8(<128 x i8> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; AVX512BW-LABEL: 'reduce_i8'
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.xor.v2i8(<2 x i8> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.xor.v4i8(<4 x i8> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.v8i8(<8 x i8> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.v16i8(<16 x i8> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.v32i8(<32 x i8> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.v64i8(<64 x i8> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.v128i8(<128 x i8> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; AVX512DQ-LABEL: 'reduce_i8'
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.xor.v2i8(<2 x i8> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.xor.v4i8(<4 x i8> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.v8i8(<8 x i8> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.v16i8(<16 x i8> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.v32i8(<32 x i8> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.v64i8(<64 x i8> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.v128i8(<128 x i8> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512-LABEL: 'reduce_i8'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.xor.v2i8(<2 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.xor.v4i8(<4 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.v8i8(<8 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.v16i8(<16 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.v32i8(<32 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.v64i8(<64 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.v128i8(<128 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
%V2 = call i8 @llvm.experimental.vector.reduce.xor.v2i8(<2 x i8> undef)
%V4 = call i8 @llvm.experimental.vector.reduce.xor.v4i8(<4 x i8> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64i8 = urem <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
-; AVX512F-LABEL: 'urem_constpow2'
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = urem i64 undef, 16
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = urem <2 x i64> undef, <i64 8, i64 16>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = urem <4 x i64> undef, <i64 2, i64 4, i64 8, i64 16>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = urem <8 x i64> undef, <i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = urem i32 undef, 16
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = urem <4 x i32> undef, <i32 2, i32 4, i32 8, i32 16>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = urem <8 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = urem <16 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256, i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = urem i16 undef, 16
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = urem <8 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i16 = urem <16 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32i16 = urem <32 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = urem i8 undef, 16
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i8 = urem <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i8 = urem <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64i8 = urem <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; AVX512BW-LABEL: 'urem_constpow2'
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = urem i64 undef, 16
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = urem <2 x i64> undef, <i64 8, i64 16>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = urem <4 x i64> undef, <i64 2, i64 4, i64 8, i64 16>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = urem <8 x i64> undef, <i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = urem i32 undef, 16
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = urem <4 x i32> undef, <i32 2, i32 4, i32 8, i32 16>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = urem <8 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = urem <16 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256, i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = urem i16 undef, 16
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = urem <8 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i16 = urem <16 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i16 = urem <32 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = urem i8 undef, 16
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i8 = urem <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i8 = urem <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i8 = urem <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512-LABEL: 'urem_constpow2'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = urem i64 undef, 16
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = urem <2 x i64> undef, <i64 8, i64 16>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = urem <4 x i64> undef, <i64 2, i64 4, i64 8, i64 16>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = urem <8 x i64> undef, <i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = urem i32 undef, 16
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = urem <4 x i32> undef, <i32 2, i32 4, i32 8, i32 16>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = urem <8 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = urem <16 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256, i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = urem i16 undef, 16
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = urem <8 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i16 = urem <16 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i16 = urem <32 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = urem i8 undef, 16
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i8 = urem <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i8 = urem <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i8 = urem <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
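+; Note: unsigned remainder by a power of two lowers to an AND with (pow2 - 1)
+; per lane, so the estimated cost stays at 1 even for the widest legal vector
+; types.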
;
%I64 = urem i64 undef, 16
%V2i64 = urem <2 x i64> undef, <i64 8, i64 16>
; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64i8 = urem <64 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
-; AVX512F-LABEL: 'urem_uniformconstpow2'
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = urem i64 undef, 16
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = urem <2 x i64> undef, <i64 16, i64 16>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = urem <4 x i64> undef, <i64 16, i64 16, i64 16, i64 16>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = urem <8 x i64> undef, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = urem i32 undef, 16
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = urem <4 x i32> undef, <i32 16, i32 16, i32 16, i32 16>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = urem <8 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = urem <16 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = urem i16 undef, 16
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = urem <8 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i16 = urem <16 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32i16 = urem <32 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = urem i8 undef, 16
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i8 = urem <16 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i8 = urem <32 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64i8 = urem <64 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; AVX512BW-LABEL: 'urem_uniformconstpow2'
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = urem i64 undef, 16
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = urem <2 x i64> undef, <i64 16, i64 16>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = urem <4 x i64> undef, <i64 16, i64 16, i64 16, i64 16>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = urem <8 x i64> undef, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = urem i32 undef, 16
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = urem <4 x i32> undef, <i32 16, i32 16, i32 16, i32 16>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = urem <8 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = urem <16 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = urem i16 undef, 16
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = urem <8 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i16 = urem <16 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i16 = urem <32 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = urem i8 undef, 16
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i8 = urem <16 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i8 = urem <32 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i8 = urem <64 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512-LABEL: 'urem_uniformconstpow2'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = urem i64 undef, 16
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = urem <2 x i64> undef, <i64 16, i64 16>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = urem <4 x i64> undef, <i64 16, i64 16, i64 16, i64 16>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = urem <8 x i64> undef, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = urem i32 undef, 16
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = urem <4 x i32> undef, <i32 16, i32 16, i32 16, i32 16>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = urem <8 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = urem <16 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = urem i16 undef, 16
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = urem <8 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i16 = urem <16 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i16 = urem <32 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = urem i8 undef, 16
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i8 = urem <16 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i8 = urem <32 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i8 = urem <64 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
%I64 = urem i64 undef, 16
%V2i64 = urem <2 x i64> undef, <i64 16, i64 16>
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
-; AVX512F-LABEL: 'test_vXi16'
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 0, i32 1>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 2, i32 3>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 2, i32 3>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 4, i32 5>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 6, i32 7>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 0, i32 1>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 2, i32 3>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 4, i32 5>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_67 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 6, i32 7>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 8, i32 9>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 10, i32 11>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 12, i32 13>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 14, i32 15>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89ABCDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 0, i32 1>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 2, i32 3>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 4, i32 5>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 6, i32 7>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 8, i32 9>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 10, i32 11>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0C_0D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 12, i32 13>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 14, i32 15>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 16, i32 17>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 18, i32 19>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_14_15 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 20, i32 21>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 22, i32 23>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 24, i32 25>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 26, i32 27>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 28, i32 29>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 30, i32 31>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; AVX512BW-LABEL: 'test_vXi16'
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 0, i32 1>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 2, i32 3>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 2, i32 3>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 4, i32 5>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 6, i32 7>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 0, i32 1>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 2, i32 3>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 4, i32 5>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_67 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 6, i32 7>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 8, i32 9>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 10, i32 11>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 12, i32 13>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 14, i32 15>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89ABCDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 0, i32 1>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 2, i32 3>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 4, i32 5>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 6, i32 7>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 8, i32 9>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 10, i32 11>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0C_0D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 12, i32 13>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 14, i32 15>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 16, i32 17>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 18, i32 19>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 20, i32 21>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 22, i32 23>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 24, i32 25>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 26, i32 27>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 28, i32 29>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 30, i32 31>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; AVX512-LABEL: 'test_vXi16'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 4, i32 5>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 6, i32 7>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 4, i32 5>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_67 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 6, i32 7>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 8, i32 9>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 10, i32 11>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 12, i32 13>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 14, i32 15>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89ABCDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 4, i32 5>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 6, i32 7>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 8, i32 9>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 10, i32 11>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0C_0D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 12, i32 13>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 14, i32 15>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 16, i32 17>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 18, i32 19>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 20, i32 21>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 22, i32 23>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 24, i32 25>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 26, i32 27>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 28, i32 29>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 30, i32 31>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; SLM-LABEL: 'test_vXi16'
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 0, i32 1>
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
-; AVX512F-LABEL: 'test_vXi8'
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 0, i32 1>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 2, i32 3>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_45 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 4, i32 5>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_67 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 6, i32 7>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_0123 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_4567 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 2, i32 3>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 4, i32 5>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 6, i32 7>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 8, i32 9>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 10, i32 11>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CD = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 12, i32 13>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_EF = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 14, i32 15>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01234567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89ABCDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 0, i32 1>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 2, i32 3>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 4, i32 5>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 6, i32 7>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 8, i32 9>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 10, i32 11>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 12, i32 13>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 14, i32 15>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 16, i32 17>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 18, i32 19>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_14_15 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 20, i32 21>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 22, i32 23>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 24, i32 25>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 26, i32 27>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1C_1D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 28, i32 29>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 30, i32 31>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 0, i32 1>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 2, i32 3>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 4, i32 5>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 6, i32 7>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 8, i32 9>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 10, i32 11>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 12, i32 13>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 14, i32 15>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 16, i32 17>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 18, i32 19>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 20, i32 21>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 22, i32 23>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 24, i32 25>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 26, i32 27>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 28, i32 29>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 30, i32 31>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 32, i32 33>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 34, i32 35>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_24_25 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 36, i32 37>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 38, i32 39>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_28_29 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 40, i32 41>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 42, i32 43>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_2C_2D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 44, i32 45>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 46, i32 47>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 48, i32 49>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 50, i32 51>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_34_35 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 52, i32 53>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 54, i32 55>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 56, i32 57>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 58, i32 59>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3C_3D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 60, i32 61>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 62, i32 63>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 32, i32 33, i32 34, i32 35>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 36, i32 37, i32 38, i32 39>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_28_29_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 40, i32 41, i32 42, i32 43>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 44, i32 45, i32 46, i32 47>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 48, i32 49, i32 50, i32 51>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 52, i32 53, i32 54, i32 55>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 56, i32 57, i32 58, i32 59>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 60, i32 61, i32 62, i32 63>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; AVX512BW-LABEL: 'test_vXi8'
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 0, i32 1>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 2, i32 3>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_45 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 4, i32 5>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_67 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 6, i32 7>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_0123 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_4567 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 2, i32 3>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 4, i32 5>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 6, i32 7>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 8, i32 9>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 10, i32 11>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CD = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 12, i32 13>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_EF = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 14, i32 15>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01234567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89ABCDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 0, i32 1>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 2, i32 3>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 4, i32 5>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 6, i32 7>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 8, i32 9>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 10, i32 11>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 12, i32 13>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 14, i32 15>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 16, i32 17>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 18, i32 19>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_14_15 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 20, i32 21>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 22, i32 23>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 24, i32 25>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 26, i32 27>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1C_1D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 28, i32 29>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 30, i32 31>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 0, i32 1>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 2, i32 3>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 4, i32 5>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 6, i32 7>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 8, i32 9>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 10, i32 11>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 12, i32 13>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 14, i32 15>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 16, i32 17>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 18, i32 19>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 20, i32 21>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 22, i32 23>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 24, i32 25>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 26, i32 27>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 28, i32 29>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 30, i32 31>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 32, i32 33>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 34, i32 35>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_24_25 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 36, i32 37>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 38, i32 39>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_28_29 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 40, i32 41>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 42, i32 43>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2C_2D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 44, i32 45>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 46, i32 47>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 48, i32 49>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 50, i32 51>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_34_35 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 52, i32 53>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 54, i32 55>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 56, i32 57>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 58, i32 59>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3C_3D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 60, i32 61>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 62, i32 63>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 32, i32 33, i32 34, i32 35>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 36, i32 37, i32 38, i32 39>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_28_29_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 40, i32 41, i32 42, i32 43>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 44, i32 45, i32 46, i32 47>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 48, i32 49, i32 50, i32 51>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 52, i32 53, i32 54, i32 55>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 56, i32 57, i32 58, i32 59>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 60, i32 61, i32 62, i32 63>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; AVX512-LABEL: 'test_vXi8'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_45 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_67 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_0123 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_4567 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 8, i32 9>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 10, i32 11>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CD = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 12, i32 13>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_EF = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 14, i32 15>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01234567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89ABCDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 8, i32 9>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 10, i32 11>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 12, i32 13>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 14, i32 15>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 16, i32 17>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 18, i32 19>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_14_15 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 20, i32 21>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 22, i32 23>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 24, i32 25>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 26, i32 27>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1C_1D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 28, i32 29>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 30, i32 31>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 8, i32 9>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 10, i32 11>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 12, i32 13>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 14, i32 15>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 16, i32 17>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 18, i32 19>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 20, i32 21>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 22, i32 23>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 24, i32 25>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 26, i32 27>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 28, i32 29>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 30, i32 31>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 32, i32 33>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 34, i32 35>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_24_25 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 36, i32 37>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 38, i32 39>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_28_29 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 40, i32 41>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 42, i32 43>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2C_2D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 44, i32 45>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 46, i32 47>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 48, i32 49>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 50, i32 51>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_34_35 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 52, i32 53>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 54, i32 55>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 56, i32 57>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 58, i32 59>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3C_3D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 60, i32 61>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 62, i32 63>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 32, i32 33, i32 34, i32 35>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 36, i32 37, i32 38, i32 39>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_28_29_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 40, i32 41, i32 42, i32 43>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 44, i32 45, i32 46, i32 47>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 48, i32 49, i32 50, i32 51>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 52, i32 53, i32 54, i32 55>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 56, i32 57, i32 58, i32 59>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 60, i32 61, i32 62, i32 63>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; SLM-LABEL: 'test_vXi8'
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 0, i32 1>
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; AVX512BW-LABEL: 'test_vXi16'
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 63, i32 62, i32 61, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 272 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 63, i32 62, i32 61, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; AVX512BW-LABEL: 'test_vXi8'
; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> %src128_1, <8 x i32> <i32 7, i32 6, i32 6, i32 8, i32 9, i32 2, i32 1, i32 0>
; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 15, i32 14, i32 13, i32 20, i32 21, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512F-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 31, i32 30, i32 45, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 38, i32 11, i32 11, i32 9, i32 8, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 196 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 63, i32 62, i32 71, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 20, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 66, i32 2, i32 1, i32 0>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 252 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 63, i32 62, i32 71, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 20, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 66, i32 2, i32 1, i32 0>
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; AVX512BW-LABEL: 'test_vXi16'
; AVX2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i16>
; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
-; AVX512F-LABEL: 'trunc_vXi16'
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i16>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i16>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i16>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i16>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i16>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i16>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i16>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i16>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i16>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i16>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; AVX512BW-LABEL: 'trunc_vXi16'
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i16>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i16>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i16>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i16>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i16>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i16>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i16>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i16>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i16>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i16>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512-LABEL: 'trunc_vXi16'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i16>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i16>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i16>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i16>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i16>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i16>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i16>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i16>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i16>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i16>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; BTVER2-LABEL: 'trunc_vXi16'
; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i16>
; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i8>
; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i8>
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i8>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i8>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i8>
; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i8>
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i8>
; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i8>
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i8>
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i8>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i8>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i8>
; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i8>
; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i16 = trunc <4 x i16> undef to <4 x i8>
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = trunc <8 x i16> undef to <8 x i8>
; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i16_31 = extractelement <32 x i16> undef, i32 31
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
-; AVX512F-LABEL: 'extract_i16'
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_a = extractelement <2 x i16> undef, i32 %arg
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_0 = extractelement <2 x i16> undef, i32 0
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_1 = extractelement <2 x i16> undef, i32 1
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_a = extractelement <4 x i16> undef, i32 %arg
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_0 = extractelement <4 x i16> undef, i32 0
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_3 = extractelement <4 x i16> undef, i32 3
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_a = extractelement <8 x i16> undef, i32 %arg
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_0 = extractelement <8 x i16> undef, i32 0
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_7 = extractelement <8 x i16> undef, i32 7
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_a = extractelement <16 x i16> undef, i32 %arg
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_0 = extractelement <16 x i16> undef, i32 0
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_7 = extractelement <16 x i16> undef, i32 7
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16_8 = extractelement <16 x i16> undef, i32 8
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16_15 = extractelement <16 x i16> undef, i32 15
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_a = extractelement <32 x i16> undef, i32 %arg
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_0 = extractelement <32 x i16> undef, i32 0
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_7 = extractelement <32 x i16> undef, i32 7
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i16_8 = extractelement <32 x i16> undef, i32 8
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i16_15 = extractelement <32 x i16> undef, i32 15
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_16 = extractelement <32 x i16> undef, i32 16
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i16_24 = extractelement <32 x i16> undef, i32 24
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i16_31 = extractelement <32 x i16> undef, i32 31
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; AVX512BW-LABEL: 'extract_i16'
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_a = extractelement <2 x i16> undef, i32 %arg
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_0 = extractelement <2 x i16> undef, i32 0
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_1 = extractelement <2 x i16> undef, i32 1
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_a = extractelement <4 x i16> undef, i32 %arg
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_0 = extractelement <4 x i16> undef, i32 0
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_3 = extractelement <4 x i16> undef, i32 3
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_a = extractelement <8 x i16> undef, i32 %arg
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_0 = extractelement <8 x i16> undef, i32 0
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_7 = extractelement <8 x i16> undef, i32 7
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_a = extractelement <16 x i16> undef, i32 %arg
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_0 = extractelement <16 x i16> undef, i32 0
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_7 = extractelement <16 x i16> undef, i32 7
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16_8 = extractelement <16 x i16> undef, i32 8
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16_15 = extractelement <16 x i16> undef, i32 15
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_a = extractelement <32 x i16> undef, i32 %arg
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_0 = extractelement <32 x i16> undef, i32 0
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_7 = extractelement <32 x i16> undef, i32 7
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i16_8 = extractelement <32 x i16> undef, i32 8
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i16_15 = extractelement <32 x i16> undef, i32 15
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i16_16 = extractelement <32 x i16> undef, i32 16
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i16_24 = extractelement <32 x i16> undef, i32 24
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i16_31 = extractelement <32 x i16> undef, i32 31
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512-LABEL: 'extract_i16'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_a = extractelement <2 x i16> undef, i32 %arg
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_0 = extractelement <2 x i16> undef, i32 0
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_1 = extractelement <2 x i16> undef, i32 1
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_a = extractelement <4 x i16> undef, i32 %arg
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_0 = extractelement <4 x i16> undef, i32 0
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_3 = extractelement <4 x i16> undef, i32 3
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_a = extractelement <8 x i16> undef, i32 %arg
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_0 = extractelement <8 x i16> undef, i32 0
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_7 = extractelement <8 x i16> undef, i32 7
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_a = extractelement <16 x i16> undef, i32 %arg
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_0 = extractelement <16 x i16> undef, i32 0
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_7 = extractelement <16 x i16> undef, i32 7
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16_8 = extractelement <16 x i16> undef, i32 8
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16_15 = extractelement <16 x i16> undef, i32 15
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_a = extractelement <32 x i16> undef, i32 %arg
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_0 = extractelement <32 x i16> undef, i32 0
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_7 = extractelement <32 x i16> undef, i32 7
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i16_8 = extractelement <32 x i16> undef, i32 8
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i16_15 = extractelement <32 x i16> undef, i32 15
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i16_16 = extractelement <32 x i16> undef, i32 16
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i16_24 = extractelement <32 x i16> undef, i32 24
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i16_31 = extractelement <32 x i16> undef, i32 31
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; SLM-LABEL: 'extract_i16'
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_a = extractelement <2 x i16> undef, i32 %arg
; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i8_63 = extractelement <64 x i8> undef, i32 63
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
-; AVX512F-LABEL: 'extract_i8'
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_a = extractelement <2 x i8> undef, i32 %arg
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_0 = extractelement <2 x i8> undef, i32 0
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_1 = extractelement <2 x i8> undef, i32 1
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_a = extractelement <4 x i8> undef, i32 %arg
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_0 = extractelement <4 x i8> undef, i32 0
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_3 = extractelement <4 x i8> undef, i32 3
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_a = extractelement <8 x i8> undef, i32 %arg
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_0 = extractelement <8 x i8> undef, i32 0
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_7 = extractelement <8 x i8> undef, i32 7
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = extractelement <16 x i8> undef, i32 %arg
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = extractelement <16 x i8> undef, i32 0
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_8 = extractelement <16 x i8> undef, i32 8
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_15 = extractelement <16 x i8> undef, i32 15
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_a = extractelement <32 x i8> undef, i32 %arg
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_0 = extractelement <32 x i8> undef, i32 0
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_7 = extractelement <32 x i8> undef, i32 7
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_8 = extractelement <32 x i8> undef, i32 8
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_15 = extractelement <32 x i8> undef, i32 15
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8_24 = extractelement <32 x i8> undef, i32 24
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8_31 = extractelement <32 x i8> undef, i32 31
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = extractelement <64 x i8> undef, i32 %arg
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = extractelement <64 x i8> undef, i32 0
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_7 = extractelement <64 x i8> undef, i32 7
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_8 = extractelement <64 x i8> undef, i32 8
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_15 = extractelement <64 x i8> undef, i32 15
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i8_24 = extractelement <64 x i8> undef, i32 24
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i8_31 = extractelement <64 x i8> undef, i32 31
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_32 = extractelement <64 x i8> undef, i32 32
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i8_48 = extractelement <64 x i8> undef, i32 48
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i8_63 = extractelement <64 x i8> undef, i32 63
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; AVX512BW-LABEL: 'extract_i8'
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_a = extractelement <2 x i8> undef, i32 %arg
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_0 = extractelement <2 x i8> undef, i32 0
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_1 = extractelement <2 x i8> undef, i32 1
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_a = extractelement <4 x i8> undef, i32 %arg
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_0 = extractelement <4 x i8> undef, i32 0
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_3 = extractelement <4 x i8> undef, i32 3
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_a = extractelement <8 x i8> undef, i32 %arg
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_0 = extractelement <8 x i8> undef, i32 0
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_7 = extractelement <8 x i8> undef, i32 7
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = extractelement <16 x i8> undef, i32 %arg
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = extractelement <16 x i8> undef, i32 0
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_8 = extractelement <16 x i8> undef, i32 8
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_15 = extractelement <16 x i8> undef, i32 15
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_a = extractelement <32 x i8> undef, i32 %arg
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_0 = extractelement <32 x i8> undef, i32 0
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_7 = extractelement <32 x i8> undef, i32 7
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_8 = extractelement <32 x i8> undef, i32 8
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_15 = extractelement <32 x i8> undef, i32 15
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8_24 = extractelement <32 x i8> undef, i32 24
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8_31 = extractelement <32 x i8> undef, i32 31
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = extractelement <64 x i8> undef, i32 %arg
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = extractelement <64 x i8> undef, i32 0
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_7 = extractelement <64 x i8> undef, i32 7
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_8 = extractelement <64 x i8> undef, i32 8
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_15 = extractelement <64 x i8> undef, i32 15
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i8_24 = extractelement <64 x i8> undef, i32 24
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i8_31 = extractelement <64 x i8> undef, i32 31
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i8_32 = extractelement <64 x i8> undef, i32 32
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i8_48 = extractelement <64 x i8> undef, i32 48
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i8_63 = extractelement <64 x i8> undef, i32 63
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512-LABEL: 'extract_i8'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_a = extractelement <2 x i8> undef, i32 %arg
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_0 = extractelement <2 x i8> undef, i32 0
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_1 = extractelement <2 x i8> undef, i32 1
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_a = extractelement <4 x i8> undef, i32 %arg
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_0 = extractelement <4 x i8> undef, i32 0
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_3 = extractelement <4 x i8> undef, i32 3
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_a = extractelement <8 x i8> undef, i32 %arg
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_0 = extractelement <8 x i8> undef, i32 0
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_7 = extractelement <8 x i8> undef, i32 7
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = extractelement <16 x i8> undef, i32 %arg
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = extractelement <16 x i8> undef, i32 0
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_8 = extractelement <16 x i8> undef, i32 8
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_15 = extractelement <16 x i8> undef, i32 15
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_a = extractelement <32 x i8> undef, i32 %arg
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_0 = extractelement <32 x i8> undef, i32 0
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_7 = extractelement <32 x i8> undef, i32 7
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_8 = extractelement <32 x i8> undef, i32 8
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_15 = extractelement <32 x i8> undef, i32 15
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8_24 = extractelement <32 x i8> undef, i32 24
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8_31 = extractelement <32 x i8> undef, i32 31
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = extractelement <64 x i8> undef, i32 %arg
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = extractelement <64 x i8> undef, i32 0
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_7 = extractelement <64 x i8> undef, i32 7
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_8 = extractelement <64 x i8> undef, i32 8
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_15 = extractelement <64 x i8> undef, i32 15
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i8_24 = extractelement <64 x i8> undef, i32 24
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i8_31 = extractelement <64 x i8> undef, i32 31
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i8_32 = extractelement <64 x i8> undef, i32 32
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i8_48 = extractelement <64 x i8> undef, i32 48
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i8_63 = extractelement <64 x i8> undef, i32 63
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; SLM-LABEL: 'extract_i8'
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_a = extractelement <2 x i8> undef, i32 %arg
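The extract_i8 checks above collapse from separate AVX512F/AVX512BW prefixes into a single AVX512 prefix: both feature sets now see v64i8 as one legal ZMM type, so their cost tables match. The jump from cost 1 to cost 2 at element 24 of a <32 x i8> corresponds to the extra cross-lane extract; a minimal C sketch of the equivalent sequence (helper name is ours, not from the test):

#include <immintrin.h>
#include <stdint.h>

/* Sketch: byte 24 lives in the upper 128-bit lane of a YMM, so a lane
   extract must precede the scalar vpextrb -- two ops, matching cost 2. */
static uint8_t extract_byte24(__m256i v) {
  __m128i hi = _mm256_extracti128_si256(v, 1); /* upper lane: the extra op */
  return (uint8_t)_mm_extract_epi8(hi, 8);     /* byte 24 == byte 8 of hi */
}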
; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i16_31 = insertelement <32 x i16> undef, i16 undef, i32 31
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
-; AVX512F-LABEL: 'insert_i16'
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_a = insertelement <2 x i16> undef, i16 undef, i32 %arg
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_0 = insertelement <2 x i16> undef, i16 undef, i32 0
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_1 = insertelement <2 x i16> undef, i16 undef, i32 1
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_a = insertelement <4 x i16> undef, i16 undef, i32 %arg
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_0 = insertelement <4 x i16> undef, i16 undef, i32 0
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_3 = insertelement <4 x i16> undef, i16 undef, i32 3
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_a = insertelement <8 x i16> undef, i16 undef, i32 %arg
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_0 = insertelement <8 x i16> undef, i16 undef, i32 0
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_7 = insertelement <8 x i16> undef, i16 undef, i32 7
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_a = insertelement <16 x i16> undef, i16 undef, i32 %arg
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_0 = insertelement <16 x i16> undef, i16 undef, i32 0
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_7 = insertelement <16 x i16> undef, i16 undef, i32 7
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i16_8 = insertelement <16 x i16> undef, i16 undef, i32 8
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i16_15 = insertelement <16 x i16> undef, i16 undef, i32 15
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_a = insertelement <32 x i16> undef, i16 undef, i32 %arg
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_0 = insertelement <32 x i16> undef, i16 undef, i32 0
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_7 = insertelement <32 x i16> undef, i16 undef, i32 7
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i16_8 = insertelement <32 x i16> undef, i16 undef, i32 8
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i16_15 = insertelement <32 x i16> undef, i16 undef, i32 15
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_16 = insertelement <32 x i16> undef, i16 undef, i32 16
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i16_24 = insertelement <32 x i16> undef, i16 undef, i32 24
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i16_31 = insertelement <32 x i16> undef, i16 undef, i32 31
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; AVX512BW-LABEL: 'insert_i16'
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_a = insertelement <2 x i16> undef, i16 undef, i32 %arg
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_0 = insertelement <2 x i16> undef, i16 undef, i32 0
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_1 = insertelement <2 x i16> undef, i16 undef, i32 1
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_a = insertelement <4 x i16> undef, i16 undef, i32 %arg
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_0 = insertelement <4 x i16> undef, i16 undef, i32 0
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_3 = insertelement <4 x i16> undef, i16 undef, i32 3
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_a = insertelement <8 x i16> undef, i16 undef, i32 %arg
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_0 = insertelement <8 x i16> undef, i16 undef, i32 0
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_7 = insertelement <8 x i16> undef, i16 undef, i32 7
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_a = insertelement <16 x i16> undef, i16 undef, i32 %arg
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_0 = insertelement <16 x i16> undef, i16 undef, i32 0
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_7 = insertelement <16 x i16> undef, i16 undef, i32 7
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i16_8 = insertelement <16 x i16> undef, i16 undef, i32 8
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i16_15 = insertelement <16 x i16> undef, i16 undef, i32 15
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_a = insertelement <32 x i16> undef, i16 undef, i32 %arg
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_0 = insertelement <32 x i16> undef, i16 undef, i32 0
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_7 = insertelement <32 x i16> undef, i16 undef, i32 7
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i16_8 = insertelement <32 x i16> undef, i16 undef, i32 8
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i16_15 = insertelement <32 x i16> undef, i16 undef, i32 15
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i16_16 = insertelement <32 x i16> undef, i16 undef, i32 16
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i16_24 = insertelement <32 x i16> undef, i16 undef, i32 24
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i16_31 = insertelement <32 x i16> undef, i16 undef, i32 31
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512-LABEL: 'insert_i16'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_a = insertelement <2 x i16> undef, i16 undef, i32 %arg
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_0 = insertelement <2 x i16> undef, i16 undef, i32 0
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_1 = insertelement <2 x i16> undef, i16 undef, i32 1
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_a = insertelement <4 x i16> undef, i16 undef, i32 %arg
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_0 = insertelement <4 x i16> undef, i16 undef, i32 0
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_3 = insertelement <4 x i16> undef, i16 undef, i32 3
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_a = insertelement <8 x i16> undef, i16 undef, i32 %arg
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_0 = insertelement <8 x i16> undef, i16 undef, i32 0
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_7 = insertelement <8 x i16> undef, i16 undef, i32 7
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_a = insertelement <16 x i16> undef, i16 undef, i32 %arg
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_0 = insertelement <16 x i16> undef, i16 undef, i32 0
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_7 = insertelement <16 x i16> undef, i16 undef, i32 7
+; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i16_8 = insertelement <16 x i16> undef, i16 undef, i32 8
+; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i16_15 = insertelement <16 x i16> undef, i16 undef, i32 15
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_a = insertelement <32 x i16> undef, i16 undef, i32 %arg
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_0 = insertelement <32 x i16> undef, i16 undef, i32 0
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_7 = insertelement <32 x i16> undef, i16 undef, i32 7
+; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i16_8 = insertelement <32 x i16> undef, i16 undef, i32 8
+; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i16_15 = insertelement <32 x i16> undef, i16 undef, i32 15
+; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i16_16 = insertelement <32 x i16> undef, i16 undef, i32 16
+; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i16_24 = insertelement <32 x i16> undef, i16 undef, i32 24
+; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i16_31 = insertelement <32 x i16> undef, i16 undef, i32 31
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
%v2i16_a = insertelement <2 x i16> undef, i16 undef, i32 %arg
%v2i16_0 = insertelement <2 x i16> undef, i16 undef, i32 0
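The insert_i16 block above, and the insert_i8 block below, merge into one AVX512 prefix the same way. The cost of 3 for inserts into an upper lane plausibly reflects an extract/insert/reinsert sequence; a hedged sketch (our helper, assuming that breakdown):

#include <immintrin.h>
#include <stdint.h>

/* Sketch: inserting i16 element 8 of a <16 x i16> needs three ops --
   extract the upper lane, vpinsrw into it, then reinsert the lane. */
static __m256i insert_elt8(__m256i v, int16_t s) {
  __m128i hi = _mm256_extracti128_si256(v, 1);
  hi = _mm_insert_epi16(hi, s, 0);             /* element 8 == hi elt 0 */
  return _mm256_inserti128_si256(v, hi, 1);
}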
; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i8_63 = insertelement <64 x i8> undef, i8 undef, i32 63
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
-; AVX512F-LABEL: 'insert_i8'
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_a = insertelement <2 x i8> undef, i8 undef, i32 %arg
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_0 = insertelement <2 x i8> undef, i8 undef, i32 0
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_3 = insertelement <2 x i8> undef, i8 undef, i32 1
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_a = insertelement <4 x i8> undef, i8 undef, i32 %arg
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_0 = insertelement <4 x i8> undef, i8 undef, i32 0
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_3 = insertelement <4 x i8> undef, i8 undef, i32 3
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_a = insertelement <8 x i8> undef, i8 undef, i32 %arg
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_0 = insertelement <8 x i8> undef, i8 undef, i32 0
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_7 = insertelement <8 x i8> undef, i8 undef, i32 7
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = insertelement <16 x i8> undef, i8 undef, i32 %arg
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = insertelement <16 x i8> undef, i8 undef, i32 0
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_8 = insertelement <16 x i8> undef, i8 undef, i32 8
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_15 = insertelement <16 x i8> undef, i8 undef, i32 15
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_a = insertelement <32 x i8> undef, i8 undef, i32 %arg
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_0 = insertelement <32 x i8> undef, i8 undef, i32 0
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_7 = insertelement <32 x i8> undef, i8 undef, i32 7
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_8 = insertelement <32 x i8> undef, i8 undef, i32 8
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_15 = insertelement <32 x i8> undef, i8 undef, i32 15
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i8_24 = insertelement <32 x i8> undef, i8 undef, i32 24
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i8_31 = insertelement <32 x i8> undef, i8 undef, i32 31
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = insertelement <64 x i8> undef, i8 undef, i32 %arg
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = insertelement <64 x i8> undef, i8 undef, i32 0
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_7 = insertelement <64 x i8> undef, i8 undef, i32 7
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_8 = insertelement <64 x i8> undef, i8 undef, i32 8
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_15 = insertelement <64 x i8> undef, i8 undef, i32 15
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i8_24 = insertelement <64 x i8> undef, i8 undef, i32 24
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i8_31 = insertelement <64 x i8> undef, i8 undef, i32 31
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_32 = insertelement <64 x i8> undef, i8 undef, i32 32
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i8_48 = insertelement <64 x i8> undef, i8 undef, i32 48
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i8_63 = insertelement <64 x i8> undef, i8 undef, i32 63
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; AVX512BW-LABEL: 'insert_i8'
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_a = insertelement <2 x i8> undef, i8 undef, i32 %arg
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_0 = insertelement <2 x i8> undef, i8 undef, i32 0
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_3 = insertelement <2 x i8> undef, i8 undef, i32 1
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_a = insertelement <4 x i8> undef, i8 undef, i32 %arg
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_0 = insertelement <4 x i8> undef, i8 undef, i32 0
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_3 = insertelement <4 x i8> undef, i8 undef, i32 3
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_a = insertelement <8 x i8> undef, i8 undef, i32 %arg
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_0 = insertelement <8 x i8> undef, i8 undef, i32 0
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_7 = insertelement <8 x i8> undef, i8 undef, i32 7
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = insertelement <16 x i8> undef, i8 undef, i32 %arg
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = insertelement <16 x i8> undef, i8 undef, i32 0
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_8 = insertelement <16 x i8> undef, i8 undef, i32 8
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_15 = insertelement <16 x i8> undef, i8 undef, i32 15
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_a = insertelement <32 x i8> undef, i8 undef, i32 %arg
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_0 = insertelement <32 x i8> undef, i8 undef, i32 0
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_7 = insertelement <32 x i8> undef, i8 undef, i32 7
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_8 = insertelement <32 x i8> undef, i8 undef, i32 8
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_15 = insertelement <32 x i8> undef, i8 undef, i32 15
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i8_24 = insertelement <32 x i8> undef, i8 undef, i32 24
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i8_31 = insertelement <32 x i8> undef, i8 undef, i32 31
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = insertelement <64 x i8> undef, i8 undef, i32 %arg
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = insertelement <64 x i8> undef, i8 undef, i32 0
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_7 = insertelement <64 x i8> undef, i8 undef, i32 7
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_8 = insertelement <64 x i8> undef, i8 undef, i32 8
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_15 = insertelement <64 x i8> undef, i8 undef, i32 15
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i8_24 = insertelement <64 x i8> undef, i8 undef, i32 24
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i8_31 = insertelement <64 x i8> undef, i8 undef, i32 31
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i8_32 = insertelement <64 x i8> undef, i8 undef, i32 32
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i8_48 = insertelement <64 x i8> undef, i8 undef, i32 48
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i8_63 = insertelement <64 x i8> undef, i8 undef, i32 63
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512-LABEL: 'insert_i8'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_a = insertelement <2 x i8> undef, i8 undef, i32 %arg
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_0 = insertelement <2 x i8> undef, i8 undef, i32 0
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_3 = insertelement <2 x i8> undef, i8 undef, i32 1
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_a = insertelement <4 x i8> undef, i8 undef, i32 %arg
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_0 = insertelement <4 x i8> undef, i8 undef, i32 0
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_3 = insertelement <4 x i8> undef, i8 undef, i32 3
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_a = insertelement <8 x i8> undef, i8 undef, i32 %arg
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_0 = insertelement <8 x i8> undef, i8 undef, i32 0
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_7 = insertelement <8 x i8> undef, i8 undef, i32 7
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = insertelement <16 x i8> undef, i8 undef, i32 %arg
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = insertelement <16 x i8> undef, i8 undef, i32 0
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_8 = insertelement <16 x i8> undef, i8 undef, i32 8
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_15 = insertelement <16 x i8> undef, i8 undef, i32 15
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_a = insertelement <32 x i8> undef, i8 undef, i32 %arg
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_0 = insertelement <32 x i8> undef, i8 undef, i32 0
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_7 = insertelement <32 x i8> undef, i8 undef, i32 7
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_8 = insertelement <32 x i8> undef, i8 undef, i32 8
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_15 = insertelement <32 x i8> undef, i8 undef, i32 15
+; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i8_24 = insertelement <32 x i8> undef, i8 undef, i32 24
+; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i8_31 = insertelement <32 x i8> undef, i8 undef, i32 31
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = insertelement <64 x i8> undef, i8 undef, i32 %arg
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = insertelement <64 x i8> undef, i8 undef, i32 0
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_7 = insertelement <64 x i8> undef, i8 undef, i32 7
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_8 = insertelement <64 x i8> undef, i8 undef, i32 8
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_15 = insertelement <64 x i8> undef, i8 undef, i32 15
+; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i8_24 = insertelement <64 x i8> undef, i8 undef, i32 24
+; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i8_31 = insertelement <64 x i8> undef, i8 undef, i32 31
+; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i8_32 = insertelement <64 x i8> undef, i8 undef, i32 32
+; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i8_48 = insertelement <64 x i8> undef, i8 undef, i32 48
+; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i8_63 = insertelement <64 x i8> undef, i8 undef, i32 63
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; SLM-LABEL: 'insert_i8'
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_a = insertelement <2 x i8> undef, i8 undef, i32 %arg
define <64 x i8> @avg_v64i8_mask(<64 x i8> %a, <64 x i8> %b, <64 x i8> %src, i64 %mask) nounwind {
; AVX512F-LABEL: avg_v64i8_mask:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm3
; AVX512F-NEXT: movq %rdi, %rax
; AVX512F-NEXT: movl %edi, %ecx
; AVX512F-NEXT: kmovw %edi, %k1
; AVX512F-NEXT: shrq $32, %rdi
; AVX512F-NEXT: shrq $48, %rax
; AVX512F-NEXT: shrl $16, %ecx
-; AVX512F-NEXT: vpavgb %ymm1, %ymm0, %ymm4
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm4
+; AVX512F-NEXT: vpavgb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpavgb %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm1
; AVX512F-NEXT: kmovw %ecx, %k2
; AVX512F-NEXT: kmovw %eax, %k3
; AVX512F-NEXT: kmovw %edi, %k4
-; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k4} {z}
-; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
-; AVX512F-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k3} {z}
-; AVX512F-NEXT: vpmovdb %zmm5, %xmm5
-; AVX512F-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1
-; AVX512F-NEXT: vpblendvb %ymm1, %ymm0, %ymm3, %ymm0
-; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
-; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
+; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k4} {z}
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k3} {z}
; AVX512F-NEXT: vpmovdb %zmm3, %xmm3
-; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpblendvb %ymm1, %ymm4, %ymm2, %ymm1
-; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
+; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
+; AVX512F-NEXT: vpmovdb %zmm3, %xmm3
+; AVX512F-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k2} {z}
+; AVX512F-NEXT: vpmovdb %zmm4, %xmm4
+; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0
+; AVX512F-NEXT: vpternlogq $202, %zmm2, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BWVL-LABEL: avg_v64i8_mask:
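The notable change in the avg_v64i8_mask body above is the tail: instead of two 256-bit vpblendvb blends, the whole 512-bit select is now one vpternlogq with immediate $202 (0xCA), whose truth table is a bitwise a ? b : c with the destination register as the mask. A scalar sketch of that immediate's semantics (plain C, our names):

#include <stdint.h>

/* 0xCA indexed by (a<<2 | b<<1 | c) yields a ? b : c per bit, i.e. the
   classic bitwise select used here to merge the avg result with %src. */
static uint64_t ternlog_0xCA(uint64_t a, uint64_t b, uint64_t c) {
  return (a & b) | (~a & c);
}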
; AVX512F-NEXT: shrq $32, %rdi
; AVX512F-NEXT: shrq $48, %rax
; AVX512F-NEXT: shrl $16, %ecx
-; AVX512F-NEXT: vpavgb %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT: vpavgb %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpavgb %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: kmovw %ecx, %k2
; AVX512F-NEXT: kmovw %eax, %k3
; AVX512F-NEXT: kmovw %edi, %k4
; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k4} {z}
; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
-; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k3} {z}
-; AVX512F-NEXT: vpmovdb %zmm3, %xmm3
-; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpand %ymm0, %ymm1, %ymm0
-; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
+; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k3} {z}
+; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
; AVX512F-NEXT: vpmovdb %zmm3, %xmm3
-; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
+; AVX512F-NEXT: vpandq %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BWVL-LABEL: avg_v64i8_maskz:
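The recurring vpternlogd $255 {k}{z} followed by vpmovdb pairs in these bodies materialize a 16-bit k-register as a vector of 0x00/0xFF bytes, since AVX512F has no byte-granularity masking. Roughly, in intrinsics (our helper name; the compiler emits the pattern directly):

#include <immintrin.h>

/* Sketch: vpternlogd $255 with {k}{z} writes all-ones dwords under the
   mask and zeros elsewhere; vpmovdb truncates each dword to a byte. */
static __m128i kmask16_to_bytes(__mmask16 k) {
  __m512i ones = _mm512_maskz_set1_epi32(k, -1);
  return _mm512_cvtepi32_epi8(ones);
}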
define <32 x i16> @avg_v32i16_mask(<32 x i16> %a, <32 x i16> %b, <32 x i16> %src, i32 %mask) nounwind {
; AVX512F-LABEL: avg_v32i16_mask:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm3
; AVX512F-NEXT: kmovw %edi, %k1
; AVX512F-NEXT: shrl $16, %edi
-; AVX512F-NEXT: vpavgw %ymm1, %ymm0, %ymm4
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm4
+; AVX512F-NEXT: vpavgw %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpavgw %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm1
; AVX512F-NEXT: kmovw %edi, %k2
-; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
-; AVX512F-NEXT: vpmovdw %zmm1, %ymm1
-; AVX512F-NEXT: vpblendvb %ymm1, %ymm0, %ymm3, %ymm0
-; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; AVX512F-NEXT: vpmovdw %zmm1, %ymm1
-; AVX512F-NEXT: vpblendvb %ymm1, %ymm4, %ymm2, %ymm1
-; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
+; AVX512F-NEXT: vpmovdw %zmm3, %ymm3
+; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512F-NEXT: vpternlogq $202, %zmm2, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BWVL-LABEL: avg_v32i16_mask:
; AVX512F: # %bb.0:
; AVX512F-NEXT: kmovw %edi, %k1
; AVX512F-NEXT: shrl $16, %edi
-; AVX512F-NEXT: vpavgw %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT: vpavgw %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpavgw %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: kmovw %edi, %k2
-; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
-; AVX512F-NEXT: vpmovdw %zmm1, %ymm1
-; AVX512F-NEXT: vpand %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT: vpmovdw %zmm1, %ymm1
-; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z}
+; AVX512F-NEXT: vpmovdw %zmm2, %ymm2
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512F-NEXT: vpandq %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BWVL-LABEL: avg_v32i16_maskz:
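For the maskz variants the select degenerates to an AND with the expanded mask, and because the new lowering keeps the averaged value in one ZMM, a single vpandq replaces the old pair of 256-bit vpands. A one-line sketch of the shape (our helper):

#include <immintrin.h>

/* Sketch: zero-masking is just mask & avg once the k-mask has been
   expanded to a full-width 0x00/0xFF vector as above. */
static __m512i maskz_apply(__m512i maskbytes, __m512i avg) {
  return _mm512_and_si512(maskbytes, avg);
}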
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; AVX512F-LABEL: avg_v48i8:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512F-NEXT: vpavgb (%rsi), %xmm0, %xmm0
-; AVX512F-NEXT: vpavgb 16(%rsi), %xmm1, %xmm1
-; AVX512F-NEXT: vpavgb 32(%rsi), %xmm2, %xmm2
-; AVX512F-NEXT: vmovdqu %xmm1, (%rax)
-; AVX512F-NEXT: vmovdqu %xmm0, (%rax)
-; AVX512F-NEXT: vmovdqu %xmm2, (%rax)
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: avg_v48i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512BW-NEXT: vpavgb 32(%rsi), %xmm2, %xmm2
-; AVX512BW-NEXT: vpavgb (%rsi), %xmm0, %xmm0
-; AVX512BW-NEXT: vpavgb 16(%rsi), %xmm1, %xmm1
-; AVX512BW-NEXT: vmovdqu %xmm1, (%rax)
-; AVX512BW-NEXT: vmovdqu %xmm0, (%rax)
-; AVX512BW-NEXT: vmovdqu %xmm2, (%rax)
-; AVX512BW-NEXT: retq
+; AVX512-LABEL: avg_v48i8:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX512-NEXT: vpavgb 32(%rsi), %xmm2, %xmm2
+; AVX512-NEXT: vpavgb (%rsi), %xmm0, %xmm0
+; AVX512-NEXT: vpavgb 16(%rsi), %xmm1, %xmm1
+; AVX512-NEXT: vmovdqu %xmm1, (%rax)
+; AVX512-NEXT: vmovdqu %xmm0, (%rax)
+; AVX512-NEXT: vmovdqu %xmm2, (%rax)
+; AVX512-NEXT: retq
%1 = load <48 x i8>, <48 x i8>* %a
%2 = load <48 x i8>, <48 x i8>* %b
%3 = zext <48 x i8> %1 to <48 x i32>
; AVX512F: # %bb.0:
; AVX512F-NEXT: pushq %rbp
; AVX512F-NEXT: movq %rsp, %rbp
-; AVX512F-NEXT: andq $-32, %rsp
-; AVX512F-NEXT: subq $32, %rsp
+; AVX512F-NEXT: andq $-64, %rsp
+; AVX512F-NEXT: subq $64, %rsp
; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: vmovdqa 240(%rbp), %ymm8
-; AVX512F-NEXT: vmovdqa 208(%rbp), %ymm9
-; AVX512F-NEXT: vmovdqa 176(%rbp), %ymm10
-; AVX512F-NEXT: vmovdqa 144(%rbp), %ymm11
-; AVX512F-NEXT: vmovdqa 112(%rbp), %ymm12
-; AVX512F-NEXT: vmovdqa 80(%rbp), %ymm13
-; AVX512F-NEXT: vmovdqa 48(%rbp), %ymm14
-; AVX512F-NEXT: vmovdqa 16(%rbp), %ymm15
-; AVX512F-NEXT: vpavgb 272(%rbp), %ymm0, %ymm0
-; AVX512F-NEXT: vpavgb 304(%rbp), %ymm1, %ymm1
-; AVX512F-NEXT: vpavgb 336(%rbp), %ymm2, %ymm2
-; AVX512F-NEXT: vpavgb 368(%rbp), %ymm3, %ymm3
-; AVX512F-NEXT: vpavgb 400(%rbp), %ymm4, %ymm4
-; AVX512F-NEXT: vpavgb 432(%rbp), %ymm5, %ymm5
-; AVX512F-NEXT: vpavgb 464(%rbp), %ymm6, %ymm6
+; AVX512F-NEXT: vpavgb 16(%rbp), %ymm0, %ymm8
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; AVX512F-NEXT: vpavgb 48(%rbp), %ymm0, %ymm0
+; AVX512F-NEXT: vpavgb 80(%rbp), %ymm1, %ymm9
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; AVX512F-NEXT: vpavgb 112(%rbp), %ymm1, %ymm1
+; AVX512F-NEXT: vpavgb 144(%rbp), %ymm2, %ymm10
+; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm2
+; AVX512F-NEXT: vpavgb 176(%rbp), %ymm2, %ymm2
+; AVX512F-NEXT: vpavgb 208(%rbp), %ymm3, %ymm11
+; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm3
+; AVX512F-NEXT: vpavgb 240(%rbp), %ymm3, %ymm3
+; AVX512F-NEXT: vpavgb 272(%rbp), %ymm4, %ymm12
+; AVX512F-NEXT: vextracti64x4 $1, %zmm4, %ymm4
+; AVX512F-NEXT: vpavgb 304(%rbp), %ymm4, %ymm4
+; AVX512F-NEXT: vpavgb 336(%rbp), %ymm5, %ymm13
+; AVX512F-NEXT: vextracti64x4 $1, %zmm5, %ymm5
+; AVX512F-NEXT: vpavgb 368(%rbp), %ymm5, %ymm5
+; AVX512F-NEXT: vpavgb 400(%rbp), %ymm6, %ymm14
+; AVX512F-NEXT: vextracti64x4 $1, %zmm6, %ymm6
+; AVX512F-NEXT: vpavgb 432(%rbp), %ymm6, %ymm6
+; AVX512F-NEXT: vpavgb 464(%rbp), %ymm7, %ymm15
+; AVX512F-NEXT: vextracti64x4 $1, %zmm7, %ymm7
; AVX512F-NEXT: vpavgb 496(%rbp), %ymm7, %ymm7
-; AVX512F-NEXT: vpavgb 528(%rbp), %ymm15, %ymm15
-; AVX512F-NEXT: vpavgb 560(%rbp), %ymm14, %ymm14
-; AVX512F-NEXT: vpavgb 592(%rbp), %ymm13, %ymm13
-; AVX512F-NEXT: vpavgb 624(%rbp), %ymm12, %ymm12
-; AVX512F-NEXT: vpavgb 656(%rbp), %ymm11, %ymm11
-; AVX512F-NEXT: vpavgb 688(%rbp), %ymm10, %ymm10
-; AVX512F-NEXT: vpavgb 720(%rbp), %ymm9, %ymm9
-; AVX512F-NEXT: vpavgb 752(%rbp), %ymm8, %ymm8
-; AVX512F-NEXT: vmovdqa %ymm8, 480(%rdi)
-; AVX512F-NEXT: vmovdqa %ymm9, 448(%rdi)
-; AVX512F-NEXT: vmovdqa %ymm10, 416(%rdi)
-; AVX512F-NEXT: vmovdqa %ymm11, 384(%rdi)
-; AVX512F-NEXT: vmovdqa %ymm12, 352(%rdi)
+; AVX512F-NEXT: vmovdqa %ymm7, 480(%rdi)
+; AVX512F-NEXT: vmovdqa %ymm15, 448(%rdi)
+; AVX512F-NEXT: vmovdqa %ymm6, 416(%rdi)
+; AVX512F-NEXT: vmovdqa %ymm14, 384(%rdi)
+; AVX512F-NEXT: vmovdqa %ymm5, 352(%rdi)
; AVX512F-NEXT: vmovdqa %ymm13, 320(%rdi)
-; AVX512F-NEXT: vmovdqa %ymm14, 288(%rdi)
-; AVX512F-NEXT: vmovdqa %ymm15, 256(%rdi)
-; AVX512F-NEXT: vmovdqa %ymm7, 224(%rdi)
-; AVX512F-NEXT: vmovdqa %ymm6, 192(%rdi)
-; AVX512F-NEXT: vmovdqa %ymm5, 160(%rdi)
-; AVX512F-NEXT: vmovdqa %ymm4, 128(%rdi)
-; AVX512F-NEXT: vmovdqa %ymm3, 96(%rdi)
-; AVX512F-NEXT: vmovdqa %ymm2, 64(%rdi)
-; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdi)
-; AVX512F-NEXT: vmovdqa %ymm0, (%rdi)
+; AVX512F-NEXT: vmovdqa %ymm4, 288(%rdi)
+; AVX512F-NEXT: vmovdqa %ymm12, 256(%rdi)
+; AVX512F-NEXT: vmovdqa %ymm3, 224(%rdi)
+; AVX512F-NEXT: vmovdqa %ymm11, 192(%rdi)
+; AVX512F-NEXT: vmovdqa %ymm2, 160(%rdi)
+; AVX512F-NEXT: vmovdqa %ymm10, 128(%rdi)
+; AVX512F-NEXT: vmovdqa %ymm1, 96(%rdi)
+; AVX512F-NEXT: vmovdqa %ymm9, 64(%rdi)
+; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdi)
+; AVX512F-NEXT: vmovdqa %ymm8, (%rdi)
; AVX512F-NEXT: movq %rbp, %rsp
; AVX512F-NEXT: popq %rbp
; AVX512F-NEXT: vzeroupper
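The rewritten body above shows the general pattern for 512-bit byte/word ops without BWI under the new ABI: arguments arrive in ZMM registers, each 256-bit half is processed with the AVX2 instruction, and the halves are recombined. In intrinsics the shape is roughly (hypothetical helper, AVX512F plus AVX2 assumed):

#include <immintrin.h>

/* Sketch: v64i8 average on an AVX512F-only target, mirroring the
   vextracti64x4 / vpavgb / vinserti64x4 sequences in the checks above. */
static __m512i avg_v64i8_no_bwi(__m512i a, __m512i b) {
  __m256i lo = _mm256_avg_epu8(_mm512_castsi512_si256(a),
                               _mm512_castsi512_si256(b));
  __m256i hi = _mm256_avg_epu8(_mm512_extracti64x4_epi64(a, 1),
                               _mm512_extracti64x4_epi64(b, 1));
  return _mm512_inserti64x4(_mm512_castsi256_si512(lo), hi, 1);
}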
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; AVX512F-LABEL: not_avg_v16i8_wide_constants:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: pushq %rbp
-; AVX512F-NEXT: pushq %r15
-; AVX512F-NEXT: pushq %r14
-; AVX512F-NEXT: pushq %r13
-; AVX512F-NEXT: pushq %r12
-; AVX512F-NEXT: pushq %rbx
-; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm0
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm0
-; AVX512F-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm0
-; AVX512F-NEXT: vpextrq $1, %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512F-NEXT: vmovq %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512F-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
-; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm4
-; AVX512F-NEXT: vpextrq $1, %xmm4, %r10
-; AVX512F-NEXT: vmovq %xmm4, %r11
-; AVX512F-NEXT: vpextrq $1, %xmm3, %r14
-; AVX512F-NEXT: vmovq %xmm3, %r13
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX512F-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
-; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm4
-; AVX512F-NEXT: vpextrq $1, %xmm4, %r12
-; AVX512F-NEXT: vmovq %xmm4, %r9
-; AVX512F-NEXT: vpextrq $1, %xmm3, %rdi
-; AVX512F-NEXT: vmovq %xmm3, %rsi
-; AVX512F-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX512F-NEXT: vpextrq $1, %xmm3, %rcx
-; AVX512F-NEXT: vmovq %xmm3, %rdx
-; AVX512F-NEXT: vmovq %xmm2, %rax
-; AVX512F-NEXT: vpextrq $1, %xmm2, %rbp
-; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm2
-; AVX512F-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; AVX512F-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX512F-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; AVX512F-NEXT: vpextrq $1, %xmm4, %rbx
-; AVX512F-NEXT: leaq -1(%rbp,%rbx), %rbp
-; AVX512F-NEXT: vmovq %xmm4, %rbx
-; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm4
-; AVX512F-NEXT: leaq -1(%rax,%rbx), %rax
-; AVX512F-NEXT: vmovq %xmm4, %rbx
-; AVX512F-NEXT: leaq -1(%rdx,%rbx), %rdx
-; AVX512F-NEXT: vpextrq $1, %xmm4, %rbx
-; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm4
-; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1
-; AVX512F-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; AVX512F-NEXT: leaq -1(%rcx,%rbx), %rcx
-; AVX512F-NEXT: vmovq %xmm1, %rbx
-; AVX512F-NEXT: leaq -1(%rsi,%rbx), %rsi
-; AVX512F-NEXT: vpextrq $1, %xmm1, %rbx
-; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1
-; AVX512F-NEXT: leaq -1(%rdi,%rbx), %r8
-; AVX512F-NEXT: vmovq %xmm1, %rbx
-; AVX512F-NEXT: leaq -1(%r9,%rbx), %r15
-; AVX512F-NEXT: vpextrq $1, %xmm1, %rbx
-; AVX512F-NEXT: leaq -1(%r12,%rbx), %r9
-; AVX512F-NEXT: vmovq %xmm3, %rbx
-; AVX512F-NEXT: leaq -1(%r13,%rbx), %r13
-; AVX512F-NEXT: vpextrq $1, %xmm3, %rbx
-; AVX512F-NEXT: leaq -1(%r14,%rbx), %r12
-; AVX512F-NEXT: vmovq %xmm4, %rbx
-; AVX512F-NEXT: leaq -1(%r11,%rbx), %r11
-; AVX512F-NEXT: vpextrq $1, %xmm4, %rbx
-; AVX512F-NEXT: leaq -1(%r10,%rbx), %r14
-; AVX512F-NEXT: vmovq %xmm2, %rbx
-; AVX512F-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; AVX512F-NEXT: leaq -1(%rdi,%rbx), %rdi
-; AVX512F-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512F-NEXT: vpextrq $1, %xmm2, %rbx
-; AVX512F-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; AVX512F-NEXT: leaq -1(%rdi,%rbx), %rdi
-; AVX512F-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512F-NEXT: vmovq %xmm0, %rbx
-; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm1
-; AVX512F-NEXT: vmovq %xmm1, %r10
-; AVX512F-NEXT: leaq -1(%rbx,%r10), %rdi
-; AVX512F-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512F-NEXT: vpextrq $1, %xmm0, %rbx
-; AVX512F-NEXT: vpextrq $1, %xmm1, %r10
-; AVX512F-NEXT: leaq -1(%rbx,%r10), %rdi
-; AVX512F-NEXT: shrq %rax
-; AVX512F-NEXT: vmovd %eax, %xmm0
-; AVX512F-NEXT: shrq %rbp
-; AVX512F-NEXT: vpinsrb $1, %ebp, %xmm0, %xmm0
-; AVX512F-NEXT: shrq %rdx
-; AVX512F-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; AVX512F-NEXT: shrq %rcx
-; AVX512F-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
-; AVX512F-NEXT: shrq %rsi
-; AVX512F-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; AVX512F-NEXT: shrq %r8
-; AVX512F-NEXT: vpinsrb $5, %r8d, %xmm0, %xmm0
-; AVX512F-NEXT: shrq %r15
-; AVX512F-NEXT: vpinsrb $6, %r15d, %xmm0, %xmm0
-; AVX512F-NEXT: shrq %r9
-; AVX512F-NEXT: vpinsrb $7, %r9d, %xmm0, %xmm0
-; AVX512F-NEXT: shrq %r13
-; AVX512F-NEXT: vpinsrb $8, %r13d, %xmm0, %xmm0
-; AVX512F-NEXT: shrq %r12
-; AVX512F-NEXT: vpinsrb $9, %r12d, %xmm0, %xmm0
-; AVX512F-NEXT: shrq %r11
-; AVX512F-NEXT: vpinsrb $10, %r11d, %xmm0, %xmm0
-; AVX512F-NEXT: shrq %r14
-; AVX512F-NEXT: vpinsrb $11, %r14d, %xmm0, %xmm0
-; AVX512F-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512F-NEXT: shrq %rax
-; AVX512F-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
-; AVX512F-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512F-NEXT: shrq %rax
-; AVX512F-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
-; AVX512F-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512F-NEXT: shrq %rax
-; AVX512F-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
-; AVX512F-NEXT: shrq %rdi
-; AVX512F-NEXT: vpinsrb $15, %edi, %xmm0, %xmm0
-; AVX512F-NEXT: vmovdqu %xmm0, (%rax)
-; AVX512F-NEXT: popq %rbx
-; AVX512F-NEXT: popq %r12
-; AVX512F-NEXT: popq %r13
-; AVX512F-NEXT: popq %r14
-; AVX512F-NEXT: popq %r15
-; AVX512F-NEXT: popq %rbp
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: not_avg_v16i8_wide_constants:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: pushq %rbp
-; AVX512BW-NEXT: pushq %r15
-; AVX512BW-NEXT: pushq %r14
-; AVX512BW-NEXT: pushq %r13
-; AVX512BW-NEXT: pushq %r12
-; AVX512BW-NEXT: pushq %rbx
-; AVX512BW-NEXT: subq $24, %rsp
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; AVX512BW-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX512BW-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; AVX512BW-NEXT: vmovq %xmm3, %rbx
-; AVX512BW-NEXT: vpextrq $1, %xmm3, %rbp
-; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm3
-; AVX512BW-NEXT: vmovq %xmm3, %rdi
-; AVX512BW-NEXT: vpextrq $1, %xmm3, %rsi
-; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm2
-; AVX512BW-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; AVX512BW-NEXT: vmovq %xmm2, %rdx
-; AVX512BW-NEXT: vpextrq $1, %xmm2, %r15
-; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm2
-; AVX512BW-NEXT: vmovq %xmm2, %r8
-; AVX512BW-NEXT: vpextrq $1, %xmm2, %r9
-; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
-; AVX512BW-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX512BW-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; AVX512BW-NEXT: vmovq %xmm2, %r11
-; AVX512BW-NEXT: vpextrq $1, %xmm2, %r10
-; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm2
-; AVX512BW-NEXT: vmovq %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512BW-NEXT: vpextrq $1, %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
-; AVX512BW-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; AVX512BW-NEXT: vmovq %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512BW-NEXT: vpextrq $1, %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
-; AVX512BW-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512BW-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; AVX512BW-NEXT: vmovq %xmm3, %rcx
-; AVX512BW-NEXT: addq %rbx, %rcx
-; AVX512BW-NEXT: vpextrq $1, %xmm3, %rax
-; AVX512BW-NEXT: addq %rbp, %rax
-; AVX512BW-NEXT: movq %rax, %rbp
-; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm3
-; AVX512BW-NEXT: vmovq %xmm3, %r14
-; AVX512BW-NEXT: addq %rdi, %r14
-; AVX512BW-NEXT: vpextrq $1, %xmm3, %rax
-; AVX512BW-NEXT: addq %rsi, %rax
-; AVX512BW-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm2
-; AVX512BW-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; AVX512BW-NEXT: vmovq %xmm2, %rax
-; AVX512BW-NEXT: addq %rdx, %rax
-; AVX512BW-NEXT: movq %rax, %rdx
-; AVX512BW-NEXT: vpextrq $1, %xmm2, %r12
-; AVX512BW-NEXT: addq %r15, %r12
-; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm2
-; AVX512BW-NEXT: vmovq %xmm2, %rax
-; AVX512BW-NEXT: addq %r8, %rax
-; AVX512BW-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512BW-NEXT: vpextrq $1, %xmm2, %rax
-; AVX512BW-NEXT: addq %r9, %rax
-; AVX512BW-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512BW-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512BW-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX512BW-NEXT: vmovq %xmm2, %rax
-; AVX512BW-NEXT: addq %r11, %rax
-; AVX512BW-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512BW-NEXT: vpextrq $1, %xmm2, %rax
-; AVX512BW-NEXT: addq %r10, %rax
-; AVX512BW-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm2
-; AVX512BW-NEXT: vmovq %xmm2, %r13
-; AVX512BW-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
-; AVX512BW-NEXT: vpextrq $1, %xmm2, %rbx
-; AVX512BW-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
-; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512BW-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX512BW-NEXT: vmovq %xmm0, %r10
-; AVX512BW-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload
-; AVX512BW-NEXT: vpextrq $1, %xmm0, %r9
-; AVX512BW-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
-; AVX512BW-NEXT: vmovq %xmm1, %rax
-; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512BW-NEXT: vmovq %xmm0, %r8
-; AVX512BW-NEXT: addq %rax, %r8
-; AVX512BW-NEXT: vpextrq $1, %xmm1, %rdi
-; AVX512BW-NEXT: vpextrq $1, %xmm0, %rsi
-; AVX512BW-NEXT: addq %rdi, %rsi
-; AVX512BW-NEXT: addq $-1, %rcx
-; AVX512BW-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512BW-NEXT: movl $0, %r11d
-; AVX512BW-NEXT: adcq $-1, %r11
-; AVX512BW-NEXT: addq $-1, %rbp
-; AVX512BW-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512BW-NEXT: movl $0, %edi
-; AVX512BW-NEXT: adcq $-1, %rdi
-; AVX512BW-NEXT: addq $-1, %r14
-; AVX512BW-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512BW-NEXT: movl $0, %r15d
-; AVX512BW-NEXT: adcq $-1, %r15
-; AVX512BW-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512BW-NEXT: movl $0, %r14d
-; AVX512BW-NEXT: adcq $-1, %r14
-; AVX512BW-NEXT: addq $-1, %rdx
-; AVX512BW-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512BW-NEXT: movl $0, %eax
-; AVX512BW-NEXT: adcq $-1, %rax
-; AVX512BW-NEXT: movq %rax, (%rsp) # 8-byte Spill
-; AVX512BW-NEXT: addq $-1, %r12
-; AVX512BW-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512BW-NEXT: movl $0, %r12d
-; AVX512BW-NEXT: adcq $-1, %r12
-; AVX512BW-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512BW-NEXT: movl $0, %eax
-; AVX512BW-NEXT: adcq $-1, %rax
-; AVX512BW-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512BW-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512BW-NEXT: movl $0, %eax
-; AVX512BW-NEXT: adcq $-1, %rax
-; AVX512BW-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512BW-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512BW-NEXT: movl $0, %eax
-; AVX512BW-NEXT: adcq $-1, %rax
-; AVX512BW-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512BW-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512BW-NEXT: movl $0, %eax
-; AVX512BW-NEXT: adcq $-1, %rax
-; AVX512BW-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512BW-NEXT: addq $-1, %r13
-; AVX512BW-NEXT: movl $0, %eax
-; AVX512BW-NEXT: adcq $-1, %rax
-; AVX512BW-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512BW-NEXT: addq $-1, %rbx
-; AVX512BW-NEXT: movl $0, %eax
-; AVX512BW-NEXT: adcq $-1, %rax
-; AVX512BW-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512BW-NEXT: addq $-1, %r10
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: adcq $-1, %rdx
-; AVX512BW-NEXT: addq $-1, %r9
-; AVX512BW-NEXT: movl $0, %ecx
-; AVX512BW-NEXT: adcq $-1, %rcx
-; AVX512BW-NEXT: addq $-1, %r8
-; AVX512BW-NEXT: movl $0, %eax
-; AVX512BW-NEXT: adcq $-1, %rax
-; AVX512BW-NEXT: addq $-1, %rsi
-; AVX512BW-NEXT: movl $0, %ebp
-; AVX512BW-NEXT: adcq $-1, %rbp
-; AVX512BW-NEXT: shldq $63, %rsi, %rbp
-; AVX512BW-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512BW-NEXT: shldq $63, %r8, %rax
-; AVX512BW-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512BW-NEXT: shldq $63, %r9, %rcx
-; AVX512BW-NEXT: movq %rcx, %rbp
-; AVX512BW-NEXT: shldq $63, %r10, %rdx
-; AVX512BW-NEXT: movq %rdx, %r9
-; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX512BW-NEXT: shldq $63, %rbx, %r10
-; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512BW-NEXT: shldq $63, %r13, %r8
-; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX512BW-NEXT: shldq $63, %rax, %r13
-; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX512BW-NEXT: shldq $63, %rax, %rbx
-; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; AVX512BW-NEXT: shldq $63, %rax, %rsi
-; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; AVX512BW-NEXT: shldq $63, %rax, %rdx
-; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512BW-NEXT: shldq $63, %rax, %r12
-; AVX512BW-NEXT: movq (%rsp), %rcx # 8-byte Reload
-; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512BW-NEXT: shldq $63, %rax, %rcx
-; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512BW-NEXT: shldq $63, %rax, %r14
-; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512BW-NEXT: shldq $63, %rax, %r15
-; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512BW-NEXT: shldq $63, %rax, %rdi
-; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512BW-NEXT: shldq $63, %rax, %r11
-; AVX512BW-NEXT: vmovq %r11, %xmm0
-; AVX512BW-NEXT: vmovq %rdi, %xmm1
-; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT: vmovd %xmm1, %eax
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm0, %xmm1
-; AVX512BW-NEXT: vmovq %r15, %xmm2
-; AVX512BW-NEXT: vmovq %r14, %xmm3
-; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm2
-; AVX512BW-NEXT: vmovd %xmm2, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm0
-; AVX512BW-NEXT: vmovd %xmm0, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm1, %xmm0
-; AVX512BW-NEXT: vmovq %rcx, %xmm1
-; AVX512BW-NEXT: vmovq %r12, %xmm2
-; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX512BW-NEXT: vmovd %xmm1, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
-; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512BW-NEXT: vmovd %xmm2, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; AVX512BW-NEXT: vmovq %rdx, %xmm2
-; AVX512BW-NEXT: vmovq %rsi, %xmm3
-; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm2
-; AVX512BW-NEXT: vmovd %xmm2, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm1
-; AVX512BW-NEXT: vmovd %xmm1, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
-; AVX512BW-NEXT: vmovq %rbx, %xmm1
-; AVX512BW-NEXT: vmovq %r13, %xmm2
-; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX512BW-NEXT: vmovd %xmm1, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
-; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512BW-NEXT: vmovd %xmm2, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
-; AVX512BW-NEXT: vmovq %r8, %xmm2
-; AVX512BW-NEXT: vmovq %r10, %xmm3
-; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm2
-; AVX512BW-NEXT: vmovd %xmm2, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm1
-; AVX512BW-NEXT: vmovd %xmm1, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; AVX512BW-NEXT: vmovq %r9, %xmm1
-; AVX512BW-NEXT: vmovq %rbp, %xmm2
-; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX512BW-NEXT: vmovd %xmm1, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
-; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512BW-NEXT: vmovd %xmm2, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
-; AVX512BW-NEXT: vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 8-byte Folded Reload
-; AVX512BW-NEXT: # xmm2 = mem[0],zero
-; AVX512BW-NEXT: vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 8-byte Folded Reload
-; AVX512BW-NEXT: # xmm3 = mem[0],zero
-; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm2
-; AVX512BW-NEXT: vmovd %xmm2, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm1
-; AVX512BW-NEXT: vmovd %xmm1, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
-; AVX512BW-NEXT: vmovdqu %xmm0, (%rax)
-; AVX512BW-NEXT: addq $24, %rsp
-; AVX512BW-NEXT: popq %rbx
-; AVX512BW-NEXT: popq %r12
-; AVX512BW-NEXT: popq %r13
-; AVX512BW-NEXT: popq %r14
-; AVX512BW-NEXT: popq %r15
-; AVX512BW-NEXT: popq %rbp
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
+; AVX512-LABEL: not_avg_v16i8_wide_constants:
+; AVX512: # %bb.0:
+; AVX512-NEXT: pushq %rbp
+; AVX512-NEXT: pushq %r15
+; AVX512-NEXT: pushq %r14
+; AVX512-NEXT: pushq %r13
+; AVX512-NEXT: pushq %r12
+; AVX512-NEXT: pushq %rbx
+; AVX512-NEXT: subq $24, %rsp
+; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; AVX512-NEXT: vmovq %xmm3, %rbx
+; AVX512-NEXT: vpextrq $1, %xmm3, %rbp
+; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm3
+; AVX512-NEXT: vmovq %xmm3, %rdi
+; AVX512-NEXT: vpextrq $1, %xmm3, %rsi
+; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2
+; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; AVX512-NEXT: vmovq %xmm2, %rdx
+; AVX512-NEXT: vpextrq $1, %xmm2, %r15
+; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2
+; AVX512-NEXT: vmovq %xmm2, %r8
+; AVX512-NEXT: vpextrq $1, %xmm2, %r9
+; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX512-NEXT: vmovq %xmm2, %r11
+; AVX512-NEXT: vpextrq $1, %xmm2, %r10
+; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2
+; AVX512-NEXT: vmovq %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT: vpextrq $1, %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX512-NEXT: vmovq %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT: vpextrq $1, %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; AVX512-NEXT: vmovq %xmm3, %rcx
+; AVX512-NEXT: addq %rbx, %rcx
+; AVX512-NEXT: vpextrq $1, %xmm3, %rax
+; AVX512-NEXT: addq %rbp, %rax
+; AVX512-NEXT: movq %rax, %rbp
+; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm3
+; AVX512-NEXT: vmovq %xmm3, %r14
+; AVX512-NEXT: addq %rdi, %r14
+; AVX512-NEXT: vpextrq $1, %xmm3, %rax
+; AVX512-NEXT: addq %rsi, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2
+; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; AVX512-NEXT: vmovq %xmm2, %rax
+; AVX512-NEXT: addq %rdx, %rax
+; AVX512-NEXT: movq %rax, %rdx
+; AVX512-NEXT: vpextrq $1, %xmm2, %r12
+; AVX512-NEXT: addq %r15, %r12
+; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2
+; AVX512-NEXT: vmovq %xmm2, %rax
+; AVX512-NEXT: addq %r8, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: vpextrq $1, %xmm2, %rax
+; AVX512-NEXT: addq %r9, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX512-NEXT: vmovq %xmm2, %rax
+; AVX512-NEXT: addq %r11, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: vpextrq $1, %xmm2, %rax
+; AVX512-NEXT: addq %r10, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2
+; AVX512-NEXT: vmovq %xmm2, %r13
+; AVX512-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
+; AVX512-NEXT: vpextrq $1, %xmm2, %rbx
+; AVX512-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
+; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX512-NEXT: vmovq %xmm0, %r10
+; AVX512-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload
+; AVX512-NEXT: vpextrq $1, %xmm0, %r9
+; AVX512-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
+; AVX512-NEXT: vmovq %xmm1, %rax
+; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512-NEXT: vmovq %xmm0, %r8
+; AVX512-NEXT: addq %rax, %r8
+; AVX512-NEXT: vpextrq $1, %xmm1, %rdi
+; AVX512-NEXT: vpextrq $1, %xmm0, %rsi
+; AVX512-NEXT: addq %rdi, %rsi
+; AVX512-NEXT: addq $-1, %rcx
+; AVX512-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movl $0, %r11d
+; AVX512-NEXT: adcq $-1, %r11
+; AVX512-NEXT: addq $-1, %rbp
+; AVX512-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movl $0, %edi
+; AVX512-NEXT: adcq $-1, %rdi
+; AVX512-NEXT: addq $-1, %r14
+; AVX512-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movl $0, %r15d
+; AVX512-NEXT: adcq $-1, %r15
+; AVX512-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT: movl $0, %r14d
+; AVX512-NEXT: adcq $-1, %r14
+; AVX512-NEXT: addq $-1, %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movl $0, %eax
+; AVX512-NEXT: adcq $-1, %rax
+; AVX512-NEXT: movq %rax, (%rsp) # 8-byte Spill
+; AVX512-NEXT: addq $-1, %r12
+; AVX512-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movl $0, %r12d
+; AVX512-NEXT: adcq $-1, %r12
+; AVX512-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT: movl $0, %eax
+; AVX512-NEXT: adcq $-1, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT: movl $0, %eax
+; AVX512-NEXT: adcq $-1, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT: movl $0, %eax
+; AVX512-NEXT: adcq $-1, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT: movl $0, %eax
+; AVX512-NEXT: adcq $-1, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: addq $-1, %r13
+; AVX512-NEXT: movl $0, %eax
+; AVX512-NEXT: adcq $-1, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: addq $-1, %rbx
+; AVX512-NEXT: movl $0, %eax
+; AVX512-NEXT: adcq $-1, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: addq $-1, %r10
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: adcq $-1, %rdx
+; AVX512-NEXT: addq $-1, %r9
+; AVX512-NEXT: movl $0, %ecx
+; AVX512-NEXT: adcq $-1, %rcx
+; AVX512-NEXT: addq $-1, %r8
+; AVX512-NEXT: movl $0, %eax
+; AVX512-NEXT: adcq $-1, %rax
+; AVX512-NEXT: addq $-1, %rsi
+; AVX512-NEXT: movl $0, %ebp
+; AVX512-NEXT: adcq $-1, %rbp
+; AVX512-NEXT: shldq $63, %rsi, %rbp
+; AVX512-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq $63, %r8, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq $63, %r9, %rcx
+; AVX512-NEXT: movq %rcx, %rbp
+; AVX512-NEXT: shldq $63, %r10, %rdx
+; AVX512-NEXT: movq %rdx, %r9
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; AVX512-NEXT: shldq $63, %rbx, %r10
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX512-NEXT: shldq $63, %r13, %r8
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; AVX512-NEXT: shldq $63, %rax, %r13
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; AVX512-NEXT: shldq $63, %rax, %rbx
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; AVX512-NEXT: shldq $63, %rax, %rsi
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; AVX512-NEXT: shldq $63, %rax, %rdx
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: shldq $63, %rax, %r12
+; AVX512-NEXT: movq (%rsp), %rcx # 8-byte Reload
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: shldq $63, %rax, %rcx
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: shldq $63, %rax, %r14
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: shldq $63, %rax, %r15
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: shldq $63, %rax, %rdi
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: shldq $63, %rax, %r11
+; AVX512-NEXT: vmovq %r11, %xmm0
+; AVX512-NEXT: vmovq %rdi, %xmm1
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT: vmovd %xmm1, %eax
+; AVX512-NEXT: vpinsrb $1, %eax, %xmm0, %xmm1
+; AVX512-NEXT: vmovq %r15, %xmm2
+; AVX512-NEXT: vmovq %r14, %xmm3
+; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm2
+; AVX512-NEXT: vmovd %xmm2, %eax
+; AVX512-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
+; AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm0
+; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: vpinsrb $3, %eax, %xmm1, %xmm0
+; AVX512-NEXT: vmovq %rcx, %xmm1
+; AVX512-NEXT: vmovq %r12, %xmm2
+; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512-NEXT: vmovd %xmm1, %eax
+; AVX512-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512-NEXT: vmovd %xmm2, %eax
+; AVX512-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; AVX512-NEXT: vmovq %rdx, %xmm2
+; AVX512-NEXT: vmovq %rsi, %xmm3
+; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512-NEXT: vextracti32x4 $2, %zmm1, %xmm2
+; AVX512-NEXT: vmovd %xmm2, %eax
+; AVX512-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; AVX512-NEXT: vextracti32x4 $3, %zmm1, %xmm1
+; AVX512-NEXT: vmovd %xmm1, %eax
+; AVX512-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; AVX512-NEXT: vmovq %rbx, %xmm1
+; AVX512-NEXT: vmovq %r13, %xmm2
+; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512-NEXT: vmovd %xmm1, %eax
+; AVX512-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512-NEXT: vmovd %xmm2, %eax
+; AVX512-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; AVX512-NEXT: vmovq %r8, %xmm2
+; AVX512-NEXT: vmovq %r10, %xmm3
+; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512-NEXT: vextracti32x4 $2, %zmm1, %xmm2
+; AVX512-NEXT: vmovd %xmm2, %eax
+; AVX512-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
+; AVX512-NEXT: vextracti32x4 $3, %zmm1, %xmm1
+; AVX512-NEXT: vmovd %xmm1, %eax
+; AVX512-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; AVX512-NEXT: vmovq %r9, %xmm1
+; AVX512-NEXT: vmovq %rbp, %xmm2
+; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512-NEXT: vmovd %xmm1, %eax
+; AVX512-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512-NEXT: vmovd %xmm2, %eax
+; AVX512-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; AVX512-NEXT: vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 8-byte Folded Reload
+; AVX512-NEXT: # xmm2 = mem[0],zero
+; AVX512-NEXT: vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 8-byte Folded Reload
+; AVX512-NEXT: # xmm3 = mem[0],zero
+; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512-NEXT: vextracti32x4 $2, %zmm1, %xmm2
+; AVX512-NEXT: vmovd %xmm2, %eax
+; AVX512-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; AVX512-NEXT: vextracti32x4 $3, %zmm1, %xmm1
+; AVX512-NEXT: vmovd %xmm1, %eax
+; AVX512-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqu %xmm0, (%rax)
+; AVX512-NEXT: addq $24, %rsp
+; AVX512-NEXT: popq %rbx
+; AVX512-NEXT: popq %r12
+; AVX512-NEXT: popq %r13
+; AVX512-NEXT: popq %r14
+; AVX512-NEXT: popq %r15
+; AVX512-NEXT: popq %rbp
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%1 = load <16 x i8>, <16 x i8>* %a
%2 = load <16 x i8>, <16 x i8>* %b
%3 = zext <16 x i8> %1 to <16 x i128>
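; NOTE: the widening to <16 x i128> (see the name not_avg_v16i8_wide_constants)
; is presumably what keeps this from matching the vpavgb pattern: the checked
; body performs the lane arithmetic with addq/adcq pairs over 128-bit halves
; and the shift by one with shldq $63, so the merged AVX512 prefix carries the
; same long scalar sequence the removed AVX512BW body did.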
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mcpu=knl | FileCheck %s --check-prefix=ALL_X64 --check-prefix=KNL --check-prefix=KNL-NEW
-; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mcpu=knl -x86-enable-old-knl-abi | FileCheck %s --check-prefix=ALL_X64 --check-prefix=KNL --check-prefix=KNL-OLD
+; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mcpu=knl | FileCheck %s --check-prefix=ALL_X64 --check-prefix=KNL
; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mcpu=skx | FileCheck %s --check-prefix=ALL_X64 --check-prefix=SKX
; RUN: llc < %s -mtriple=i686-apple-darwin9 -mcpu=knl | FileCheck %s --check-prefix=KNL_X32
; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mcpu=skx -fast-isel | FileCheck %s --check-prefix=FASTISEL
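; With the -x86-enable-old-knl-abi flag removed, one KNL RUN line suffices and
; the former KNL-NEW/KNL-OLD prefixes collapse back into KNL. As test14 and
; test15 below now check, a <32 x i16> or <64 x i8> value moves through a
; single %zmm0 on KNL. A minimal sketch of IR that exercises this boundary
; (@knl_abi_sketch is hypothetical, not part of this test):
define <32 x i16> @knl_abi_sketch(<32 x i16> %x) {
  ret <32 x i16> %x
}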
}
define void @test14(<32 x i16>* %x) {
-; KNL-NEW-LABEL: test14:
-; KNL-NEW: ## %bb.0:
-; KNL-NEW-NEXT: pushq %rbx
-; KNL-NEW-NEXT: .cfi_def_cfa_offset 16
-; KNL-NEW-NEXT: .cfi_offset %rbx, -16
-; KNL-NEW-NEXT: movq %rdi, %rbx
-; KNL-NEW-NEXT: vmovaps (%rdi), %zmm0
-; KNL-NEW-NEXT: callq _test14_callee
-; KNL-NEW-NEXT: vmovaps %zmm0, (%rbx)
-; KNL-NEW-NEXT: popq %rbx
-; KNL-NEW-NEXT: retq
-;
-; KNL-OLD-LABEL: test14:
-; KNL-OLD: ## %bb.0:
-; KNL-OLD-NEXT: pushq %rbx
-; KNL-OLD-NEXT: .cfi_def_cfa_offset 16
-; KNL-OLD-NEXT: .cfi_offset %rbx, -16
-; KNL-OLD-NEXT: movq %rdi, %rbx
-; KNL-OLD-NEXT: vmovaps (%rdi), %ymm0
-; KNL-OLD-NEXT: vmovaps 32(%rdi), %ymm1
-; KNL-OLD-NEXT: callq _test14_callee
-; KNL-OLD-NEXT: vmovaps %ymm1, 32(%rbx)
-; KNL-OLD-NEXT: vmovaps %ymm0, (%rbx)
-; KNL-OLD-NEXT: popq %rbx
-; KNL-OLD-NEXT: retq
+; KNL-LABEL: test14:
+; KNL: ## %bb.0:
+; KNL-NEXT: pushq %rbx
+; KNL-NEXT: .cfi_def_cfa_offset 16
+; KNL-NEXT: .cfi_offset %rbx, -16
+; KNL-NEXT: movq %rdi, %rbx
+; KNL-NEXT: vmovaps (%rdi), %zmm0
+; KNL-NEXT: callq _test14_callee
+; KNL-NEXT: vmovaps %zmm0, (%rbx)
+; KNL-NEXT: popq %rbx
+; KNL-NEXT: retq
;
; SKX-LABEL: test14:
; SKX: ## %bb.0:
declare <32 x i16> @test14_callee(<32 x i16>)
define void @test15(<64 x i8>* %x) {
-; KNL-NEW-LABEL: test15:
-; KNL-NEW: ## %bb.0:
-; KNL-NEW-NEXT: pushq %rbx
-; KNL-NEW-NEXT: .cfi_def_cfa_offset 16
-; KNL-NEW-NEXT: .cfi_offset %rbx, -16
-; KNL-NEW-NEXT: movq %rdi, %rbx
-; KNL-NEW-NEXT: vmovaps (%rdi), %zmm0
-; KNL-NEW-NEXT: callq _test15_callee
-; KNL-NEW-NEXT: vmovaps %zmm0, (%rbx)
-; KNL-NEW-NEXT: popq %rbx
-; KNL-NEW-NEXT: retq
-;
-; KNL-OLD-LABEL: test15:
-; KNL-OLD: ## %bb.0:
-; KNL-OLD-NEXT: pushq %rbx
-; KNL-OLD-NEXT: .cfi_def_cfa_offset 16
-; KNL-OLD-NEXT: .cfi_offset %rbx, -16
-; KNL-OLD-NEXT: movq %rdi, %rbx
-; KNL-OLD-NEXT: vmovaps (%rdi), %ymm0
-; KNL-OLD-NEXT: vmovaps 32(%rdi), %ymm1
-; KNL-OLD-NEXT: callq _test15_callee
-; KNL-OLD-NEXT: vmovaps %ymm1, 32(%rbx)
-; KNL-OLD-NEXT: vmovaps %ymm0, (%rbx)
-; KNL-OLD-NEXT: popq %rbx
-; KNL-OLD-NEXT: retq
+; KNL-LABEL: test15:
+; KNL: ## %bb.0:
+; KNL-NEXT: pushq %rbx
+; KNL-NEXT: .cfi_def_cfa_offset 16
+; KNL-NEXT: .cfi_offset %rbx, -16
+; KNL-NEXT: movq %rdi, %rbx
+; KNL-NEXT: vmovaps (%rdi), %zmm0
+; KNL-NEXT: callq _test15_callee
+; KNL-NEXT: vmovaps %zmm0, (%rbx)
+; KNL-NEXT: popq %rbx
+; KNL-NEXT: retq
;
; SKX-LABEL: test15:
; SKX: ## %bb.0:
define <32 x i16> @zext_32x8mem_to_32x16(<32 x i8> *%i , <32 x i1> %mask) nounwind readnone {
; KNL-LABEL: zext_32x8mem_to_32x16:
; KNL: # %bb.0:
-; KNL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; KNL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; KNL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; KNL-NEXT: vpmovzxbw {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; KNL-NEXT: vpmovzxbw {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; KNL-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
; KNL-NEXT: vpsllw $15, %ymm0, %ymm0
; KNL-NEXT: vpsraw $15, %ymm0, %ymm0
-; KNL-NEXT: vpand %ymm3, %ymm0, %ymm0
; KNL-NEXT: vpsllw $15, %ymm1, %ymm1
; KNL-NEXT: vpsraw $15, %ymm1, %ymm1
-; KNL-NEXT: vpand %ymm2, %ymm1, %ymm1
-; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; KNL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; KNL-NEXT: vpandq %zmm2, %zmm0, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: zext_32x8mem_to_32x16:
;
; AVX512DQNOBW-LABEL: zext_32x8mem_to_32x16:
; AVX512DQNOBW: # %bb.0:
-; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512DQNOBW-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512DQNOBW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; AVX512DQNOBW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
; AVX512DQNOBW-NEXT: vpsllw $15, %ymm0, %ymm0
; AVX512DQNOBW-NEXT: vpsraw $15, %ymm0, %ymm0
-; AVX512DQNOBW-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX512DQNOBW-NEXT: vpsllw $15, %ymm1, %ymm1
; AVX512DQNOBW-NEXT: vpsraw $15, %ymm1, %ymm1
-; AVX512DQNOBW-NEXT: vpand %ymm2, %ymm1, %ymm1
-; AVX512DQNOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512DQNOBW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512DQNOBW-NEXT: vpandq %zmm2, %zmm0, %zmm0
; AVX512DQNOBW-NEXT: retq
%a = load <32 x i8>,<32 x i8> *%i,align 1
%x = zext <32 x i8> %a to <32 x i16>
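; Without BWI the extend is still done as two 256-bit vpmovzxbw halves, but
; they are now rejoined with vinserti64x4 up front so the select mask is
; applied once at ZMM width (vpandq) instead of twice at YMM width (vpand).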
define <32 x i16> @sext_32x8mem_to_32x16(<32 x i8> *%i , <32 x i1> %mask) nounwind readnone {
; KNL-LABEL: sext_32x8mem_to_32x16:
; KNL: # %bb.0:
-; KNL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; KNL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; KNL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; KNL-NEXT: vpmovsxbw (%rdi), %ymm2
-; KNL-NEXT: vpmovsxbw 16(%rdi), %ymm3
+; KNL-NEXT: vpmovsxbw 16(%rdi), %ymm2
+; KNL-NEXT: vpmovsxbw (%rdi), %ymm3
+; KNL-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
; KNL-NEXT: vpsllw $15, %ymm0, %ymm0
; KNL-NEXT: vpsraw $15, %ymm0, %ymm0
-; KNL-NEXT: vpand %ymm3, %ymm0, %ymm0
; KNL-NEXT: vpsllw $15, %ymm1, %ymm1
; KNL-NEXT: vpsraw $15, %ymm1, %ymm1
-; KNL-NEXT: vpand %ymm2, %ymm1, %ymm1
-; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; KNL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; KNL-NEXT: vpandq %zmm2, %zmm0, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: sext_32x8mem_to_32x16:
;
; AVX512DQNOBW-LABEL: sext_32x8mem_to_32x16:
; AVX512DQNOBW: # %bb.0:
-; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512DQNOBW-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512DQNOBW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512DQNOBW-NEXT: vpmovsxbw (%rdi), %ymm2
-; AVX512DQNOBW-NEXT: vpmovsxbw 16(%rdi), %ymm3
+; AVX512DQNOBW-NEXT: vpmovsxbw 16(%rdi), %ymm2
+; AVX512DQNOBW-NEXT: vpmovsxbw (%rdi), %ymm3
+; AVX512DQNOBW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
; AVX512DQNOBW-NEXT: vpsllw $15, %ymm0, %ymm0
; AVX512DQNOBW-NEXT: vpsraw $15, %ymm0, %ymm0
-; AVX512DQNOBW-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX512DQNOBW-NEXT: vpsllw $15, %ymm1, %ymm1
; AVX512DQNOBW-NEXT: vpsraw $15, %ymm1, %ymm1
-; AVX512DQNOBW-NEXT: vpand %ymm2, %ymm1, %ymm1
-; AVX512DQNOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512DQNOBW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512DQNOBW-NEXT: vpandq %zmm2, %zmm0, %zmm0
; AVX512DQNOBW-NEXT: retq
%a = load <32 x i8>,<32 x i8> *%i,align 1
%x = sext <32 x i8> %a to <32 x i16>
define <32 x i16> @zext_32x8_to_32x16(<32 x i8> %a ) nounwind readnone {
; KNL-LABEL: zext_32x8_to_32x16:
; KNL: # %bb.0:
-; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; KNL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; KNL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
; KNL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; KNL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: zext_32x8_to_32x16:
;
; AVX512DQNOBW-LABEL: zext_32x8_to_32x16:
; AVX512DQNOBW: # %bb.0:
-; AVX512DQNOBW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512DQNOBW-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512DQNOBW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512DQNOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512DQNOBW-NEXT: retq
%x = zext <32 x i8> %a to <32 x i16>
ret <32 x i16> %x
define <32 x i16> @zext_32x8_to_32x16_mask(<32 x i8> %a ,<32 x i1> %mask) nounwind readnone {
; KNL-LABEL: zext_32x8_to_32x16_mask:
; KNL: # %bb.0:
-; KNL-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; KNL-NEXT: vextracti128 $1, %ymm1, %xmm1
+; KNL-NEXT: vextracti128 $1, %ymm1, %xmm2
+; KNL-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
; KNL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; KNL-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
; KNL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0
; KNL-NEXT: vpsllw $15, %ymm1, %ymm1
; KNL-NEXT: vpsraw $15, %ymm1, %ymm1
-; KNL-NEXT: vpand %ymm0, %ymm1, %ymm0
-; KNL-NEXT: vpsllw $15, %ymm2, %ymm1
-; KNL-NEXT: vpsraw $15, %ymm1, %ymm1
-; KNL-NEXT: vpand %ymm3, %ymm1, %ymm1
-; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; KNL-NEXT: vpsllw $15, %ymm2, %ymm2
+; KNL-NEXT: vpsraw $15, %ymm2, %ymm2
+; KNL-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; KNL-NEXT: vpandq %zmm0, %zmm1, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: zext_32x8_to_32x16_mask:
;
; AVX512DQNOBW-LABEL: zext_32x8_to_32x16_mask:
; AVX512DQNOBW: # %bb.0:
-; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512DQNOBW-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512DQNOBW-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512DQNOBW-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512DQNOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0
; AVX512DQNOBW-NEXT: vpsllw $15, %ymm1, %ymm1
; AVX512DQNOBW-NEXT: vpsraw $15, %ymm1, %ymm1
-; AVX512DQNOBW-NEXT: vpand %ymm0, %ymm1, %ymm0
-; AVX512DQNOBW-NEXT: vpsllw $15, %ymm2, %ymm1
-; AVX512DQNOBW-NEXT: vpsraw $15, %ymm1, %ymm1
-; AVX512DQNOBW-NEXT: vpand %ymm3, %ymm1, %ymm1
-; AVX512DQNOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512DQNOBW-NEXT: vpsllw $15, %ymm2, %ymm2
+; AVX512DQNOBW-NEXT: vpsraw $15, %ymm2, %ymm2
+; AVX512DQNOBW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512DQNOBW-NEXT: vpandq %zmm0, %zmm1, %zmm0
; AVX512DQNOBW-NEXT: retq
%x = zext <32 x i8> %a to <32 x i16>
%ret = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> zeroinitializer
define <32 x i16> @sext_32x8_to_32x16(<32 x i8> %a ) nounwind readnone {
; KNL-LABEL: sext_32x8_to_32x16:
; KNL: # %bb.0:
-; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; KNL-NEXT: vpmovsxbw %xmm1, %ymm1
+; KNL-NEXT: vpmovsxbw %xmm0, %ymm1
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
; KNL-NEXT: vpmovsxbw %xmm0, %ymm0
-; KNL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: sext_32x8_to_32x16:
;
; AVX512DQNOBW-LABEL: sext_32x8_to_32x16:
; AVX512DQNOBW: # %bb.0:
-; AVX512DQNOBW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512DQNOBW-NEXT: vpmovsxbw %xmm1, %ymm1
+; AVX512DQNOBW-NEXT: vpmovsxbw %xmm0, %ymm1
+; AVX512DQNOBW-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512DQNOBW-NEXT: vpmovsxbw %xmm0, %ymm0
-; AVX512DQNOBW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512DQNOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512DQNOBW-NEXT: retq
%x = sext <32 x i8> %a to <32 x i16>
ret <32 x i16> %x
define <32 x i16> @sext_32x8_to_32x16_mask(<32 x i8> %a ,<32 x i1> %mask) nounwind readnone {
; KNL-LABEL: sext_32x8_to_32x16_mask:
; KNL: # %bb.0:
-; KNL-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; KNL-NEXT: vextracti128 $1, %ymm1, %xmm1
+; KNL-NEXT: vextracti128 $1, %ymm1, %xmm2
+; KNL-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
; KNL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; KNL-NEXT: vpmovsxbw %xmm0, %ymm3
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
; KNL-NEXT: vpmovsxbw %xmm0, %ymm0
+; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0
; KNL-NEXT: vpsllw $15, %ymm1, %ymm1
; KNL-NEXT: vpsraw $15, %ymm1, %ymm1
-; KNL-NEXT: vpand %ymm0, %ymm1, %ymm0
-; KNL-NEXT: vpsllw $15, %ymm2, %ymm1
-; KNL-NEXT: vpsraw $15, %ymm1, %ymm1
-; KNL-NEXT: vpand %ymm3, %ymm1, %ymm1
-; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; KNL-NEXT: vpsllw $15, %ymm2, %ymm2
+; KNL-NEXT: vpsraw $15, %ymm2, %ymm2
+; KNL-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; KNL-NEXT: vpandq %zmm0, %zmm1, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: sext_32x8_to_32x16_mask:
;
; AVX512DQNOBW-LABEL: sext_32x8_to_32x16_mask:
; AVX512DQNOBW: # %bb.0:
-; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512DQNOBW-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512DQNOBW-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512DQNOBW-NEXT: vpmovsxbw %xmm0, %ymm3
; AVX512DQNOBW-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512DQNOBW-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX512DQNOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0
; AVX512DQNOBW-NEXT: vpsllw $15, %ymm1, %ymm1
; AVX512DQNOBW-NEXT: vpsraw $15, %ymm1, %ymm1
-; AVX512DQNOBW-NEXT: vpand %ymm0, %ymm1, %ymm0
-; AVX512DQNOBW-NEXT: vpsllw $15, %ymm2, %ymm1
-; AVX512DQNOBW-NEXT: vpsraw $15, %ymm1, %ymm1
-; AVX512DQNOBW-NEXT: vpand %ymm3, %ymm1, %ymm1
-; AVX512DQNOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512DQNOBW-NEXT: vpsllw $15, %ymm2, %ymm2
+; AVX512DQNOBW-NEXT: vpsraw $15, %ymm2, %ymm2
+; AVX512DQNOBW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512DQNOBW-NEXT: vpandq %zmm0, %zmm1, %zmm0
; AVX512DQNOBW-NEXT: retq
%x = sext <32 x i8> %a to <32 x i16>
%ret = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> zeroinitializer
; KNL: # %bb.0:
; KNL-NEXT: movw $-3, %ax
; KNL-NEXT: kmovw %eax, %k1
-; KNL-NEXT: kmovw %edi, %k0
+; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT: kmovw %eax, %k0
; KNL-NEXT: kandw %k1, %k0, %k0
; KNL-NEXT: kmovw %k1, %k2
; KNL-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; KNL-NEXT: kmovw %esi, %k1
+; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT: kmovw %eax, %k1
; KNL-NEXT: kshiftlw $15, %k1, %k1
; KNL-NEXT: kshiftrw $14, %k1, %k1
; KNL-NEXT: korw %k1, %k0, %k0
; KNL-NEXT: kmovw %eax, %k1
; KNL-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; KNL-NEXT: kandw %k1, %k0, %k0
-; KNL-NEXT: kmovw %edx, %k1
+; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT: kmovw %eax, %k1
; KNL-NEXT: kshiftlw $15, %k1, %k1
; KNL-NEXT: kshiftrw $13, %k1, %k1
; KNL-NEXT: korw %k1, %k0, %k0
; KNL-NEXT: kmovw %eax, %k1
; KNL-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; KNL-NEXT: kandw %k1, %k0, %k0
-; KNL-NEXT: kmovw %ecx, %k1
+; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT: kmovw %eax, %k1
; KNL-NEXT: kshiftlw $15, %k1, %k1
; KNL-NEXT: kshiftrw $12, %k1, %k1
; KNL-NEXT: korw %k1, %k0, %k0
; KNL-NEXT: kmovw %eax, %k6
; KNL-NEXT: kandw %k6, %k0, %k0
; KNL-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; KNL-NEXT: kmovw %r8d, %k1
+; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT: kmovw %eax, %k1
; KNL-NEXT: kshiftlw $15, %k1, %k1
; KNL-NEXT: kshiftrw $11, %k1, %k1
; KNL-NEXT: korw %k1, %k0, %k0
; KNL-NEXT: kandw %k1, %k0, %k0
; KNL-NEXT: kmovw %k1, %k3
; KNL-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; KNL-NEXT: kmovw %r9d, %k1
+; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT: kmovw %eax, %k1
; KNL-NEXT: kshiftlw $15, %k1, %k1
; KNL-NEXT: kshiftrw $10, %k1, %k1
; KNL-NEXT: korw %k1, %k0, %k0
; KNL-NEXT: kshiftlw $15, %k7, %k7
; KNL-NEXT: korw %k7, %k1, %k1
; KNL-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT: kmovw %eax, %k1
+; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: kandw %k2, %k1, %k1
-; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT: kmovw %eax, %k7
+; KNL-NEXT: kmovw %esi, %k7
; KNL-NEXT: kshiftlw $15, %k7, %k7
; KNL-NEXT: kshiftrw $14, %k7, %k7
; KNL-NEXT: korw %k7, %k1, %k1
; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; KNL-NEXT: kandw %k0, %k1, %k1
-; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT: kmovw %eax, %k7
+; KNL-NEXT: kmovw %edx, %k7
; KNL-NEXT: kshiftlw $15, %k7, %k7
; KNL-NEXT: kshiftrw $13, %k7, %k7
; KNL-NEXT: korw %k7, %k1, %k1
; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; KNL-NEXT: kandw %k2, %k1, %k1
-; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT: kmovw %eax, %k7
+; KNL-NEXT: kmovw %ecx, %k7
; KNL-NEXT: kshiftlw $15, %k7, %k7
; KNL-NEXT: kshiftrw $12, %k7, %k7
; KNL-NEXT: korw %k7, %k1, %k1
; KNL-NEXT: kandw %k6, %k1, %k1
-; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT: kmovw %eax, %k7
+; KNL-NEXT: kmovw %r8d, %k7
; KNL-NEXT: kshiftlw $15, %k7, %k7
; KNL-NEXT: kshiftrw $11, %k7, %k7
; KNL-NEXT: korw %k7, %k1, %k1
; KNL-NEXT: kandw %k3, %k1, %k1
-; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT: kmovw %eax, %k7
+; KNL-NEXT: kmovw %r9d, %k7
; KNL-NEXT: kshiftlw $15, %k7, %k7
; KNL-NEXT: kshiftrw $10, %k7, %k7
; KNL-NEXT: korw %k7, %k1, %k1
; KNL-NEXT: kmovw %eax, %k2
; KNL-NEXT: kshiftlw $15, %k2, %k2
; KNL-NEXT: korw %k2, %k0, %k2
-; KNL-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k2} {z}
-; KNL-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z}
+; KNL-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z}
+; KNL-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; KNL-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k1} {z}
+; KNL-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k1} {z}
; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; KNL-NEXT: vpternlogd $255, %zmm7, %zmm7, %zmm7 {%k1} {z}
-; KNL-NEXT: vpmovdw %zmm4, %ymm4
-; KNL-NEXT: vpand %ymm1, %ymm4, %ymm1
-; KNL-NEXT: vpmovdw %zmm5, %ymm4
-; KNL-NEXT: vpand %ymm2, %ymm4, %ymm2
-; KNL-NEXT: vpmovdw %zmm6, %ymm4
-; KNL-NEXT: vpand %ymm3, %ymm4, %ymm3
-; KNL-NEXT: vpmovdw %zmm7, %ymm4
-; KNL-NEXT: vpand %ymm0, %ymm4, %ymm0
+; KNL-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z}
+; KNL-NEXT: vpmovdw %zmm2, %ymm2
+; KNL-NEXT: vpmovdw %zmm3, %ymm3
+; KNL-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; KNL-NEXT: vpandq %zmm1, %zmm2, %zmm1
+; KNL-NEXT: vpmovdw %zmm4, %ymm2
+; KNL-NEXT: vpmovdw %zmm5, %ymm3
+; KNL-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; KNL-NEXT: vpandq %zmm0, %zmm2, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: test21:
; AVX512DQNOBW: # %bb.0:
; AVX512DQNOBW-NEXT: movw $-3, %ax
; AVX512DQNOBW-NEXT: kmovw %eax, %k1
-; AVX512DQNOBW-NEXT: kmovw %edi, %k0
+; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al
+; AVX512DQNOBW-NEXT: kmovw %eax, %k0
; AVX512DQNOBW-NEXT: kandw %k1, %k0, %k0
; AVX512DQNOBW-NEXT: kmovw %k1, %k2
; AVX512DQNOBW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512DQNOBW-NEXT: kmovw %esi, %k1
+; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al
+; AVX512DQNOBW-NEXT: kmovw %eax, %k1
; AVX512DQNOBW-NEXT: kshiftlw $15, %k1, %k1
; AVX512DQNOBW-NEXT: kshiftrw $14, %k1, %k1
; AVX512DQNOBW-NEXT: korw %k1, %k0, %k0
; AVX512DQNOBW-NEXT: kmovw %eax, %k1
; AVX512DQNOBW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512DQNOBW-NEXT: kandw %k1, %k0, %k0
-; AVX512DQNOBW-NEXT: kmovw %edx, %k1
+; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al
+; AVX512DQNOBW-NEXT: kmovw %eax, %k1
; AVX512DQNOBW-NEXT: kshiftlw $15, %k1, %k1
; AVX512DQNOBW-NEXT: kshiftrw $13, %k1, %k1
; AVX512DQNOBW-NEXT: korw %k1, %k0, %k0
; AVX512DQNOBW-NEXT: kmovw %eax, %k1
; AVX512DQNOBW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512DQNOBW-NEXT: kandw %k1, %k0, %k0
-; AVX512DQNOBW-NEXT: kmovw %ecx, %k1
+; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al
+; AVX512DQNOBW-NEXT: kmovw %eax, %k1
; AVX512DQNOBW-NEXT: kshiftlw $15, %k1, %k1
; AVX512DQNOBW-NEXT: kshiftrw $12, %k1, %k1
; AVX512DQNOBW-NEXT: korw %k1, %k0, %k0
; AVX512DQNOBW-NEXT: kmovw %eax, %k6
; AVX512DQNOBW-NEXT: kandw %k6, %k0, %k0
; AVX512DQNOBW-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512DQNOBW-NEXT: kmovw %r8d, %k1
+; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al
+; AVX512DQNOBW-NEXT: kmovw %eax, %k1
; AVX512DQNOBW-NEXT: kshiftlw $15, %k1, %k1
; AVX512DQNOBW-NEXT: kshiftrw $11, %k1, %k1
; AVX512DQNOBW-NEXT: korw %k1, %k0, %k0
; AVX512DQNOBW-NEXT: kandw %k1, %k0, %k0
; AVX512DQNOBW-NEXT: kmovw %k1, %k3
; AVX512DQNOBW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512DQNOBW-NEXT: kmovw %r9d, %k1
+; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al
+; AVX512DQNOBW-NEXT: kmovw %eax, %k1
; AVX512DQNOBW-NEXT: kshiftlw $15, %k1, %k1
; AVX512DQNOBW-NEXT: kshiftrw $10, %k1, %k1
; AVX512DQNOBW-NEXT: korw %k1, %k0, %k0
; AVX512DQNOBW-NEXT: kshiftlw $15, %k7, %k7
; AVX512DQNOBW-NEXT: korw %k7, %k0, %k0
; AVX512DQNOBW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT: kmovw %eax, %k0
+; AVX512DQNOBW-NEXT: kmovw %edi, %k0
; AVX512DQNOBW-NEXT: kandw %k2, %k0, %k0
-; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT: kmovw %eax, %k7
+; AVX512DQNOBW-NEXT: kmovw %esi, %k7
; AVX512DQNOBW-NEXT: kshiftlw $15, %k7, %k7
; AVX512DQNOBW-NEXT: kshiftrw $14, %k7, %k7
; AVX512DQNOBW-NEXT: korw %k7, %k0, %k0
; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512DQNOBW-NEXT: kandw %k1, %k0, %k0
-; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT: kmovw %eax, %k7
+; AVX512DQNOBW-NEXT: kmovw %edx, %k7
; AVX512DQNOBW-NEXT: kshiftlw $15, %k7, %k7
; AVX512DQNOBW-NEXT: kshiftrw $13, %k7, %k7
; AVX512DQNOBW-NEXT: korw %k7, %k0, %k0
; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512DQNOBW-NEXT: kandw %k2, %k0, %k0
-; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT: kmovw %eax, %k7
+; AVX512DQNOBW-NEXT: kmovw %ecx, %k7
; AVX512DQNOBW-NEXT: kshiftlw $15, %k7, %k7
; AVX512DQNOBW-NEXT: kshiftrw $12, %k7, %k7
; AVX512DQNOBW-NEXT: korw %k7, %k0, %k0
; AVX512DQNOBW-NEXT: kandw %k6, %k0, %k0
-; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT: kmovw %eax, %k7
+; AVX512DQNOBW-NEXT: kmovw %r8d, %k7
; AVX512DQNOBW-NEXT: kshiftlw $15, %k7, %k7
; AVX512DQNOBW-NEXT: kshiftrw $11, %k7, %k7
; AVX512DQNOBW-NEXT: korw %k7, %k0, %k0
; AVX512DQNOBW-NEXT: kandw %k3, %k0, %k0
-; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT: kmovw %eax, %k7
+; AVX512DQNOBW-NEXT: kmovw %r9d, %k7
; AVX512DQNOBW-NEXT: kshiftlw $15, %k7, %k7
; AVX512DQNOBW-NEXT: kshiftrw $10, %k7, %k7
; AVX512DQNOBW-NEXT: korw %k7, %k0, %k0
; AVX512DQNOBW-NEXT: kmovw %eax, %k2
; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2
; AVX512DQNOBW-NEXT: korw %k2, %k1, %k1
-; AVX512DQNOBW-NEXT: vpmovm2d %k1, %zmm4
-; AVX512DQNOBW-NEXT: vpmovm2d %k0, %zmm5
+; AVX512DQNOBW-NEXT: vpmovm2d %k1, %zmm2
+; AVX512DQNOBW-NEXT: vpmovm2d %k0, %zmm3
; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
-; AVX512DQNOBW-NEXT: vpmovm2d %k0, %zmm6
+; AVX512DQNOBW-NEXT: vpmovm2d %k0, %zmm4
; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
-; AVX512DQNOBW-NEXT: vpmovm2d %k0, %zmm7
-; AVX512DQNOBW-NEXT: vpmovdw %zmm4, %ymm4
-; AVX512DQNOBW-NEXT: vpand %ymm1, %ymm4, %ymm1
-; AVX512DQNOBW-NEXT: vpmovdw %zmm5, %ymm4
-; AVX512DQNOBW-NEXT: vpand %ymm2, %ymm4, %ymm2
-; AVX512DQNOBW-NEXT: vpmovdw %zmm6, %ymm4
-; AVX512DQNOBW-NEXT: vpand %ymm3, %ymm4, %ymm3
-; AVX512DQNOBW-NEXT: vpmovdw %zmm7, %ymm4
-; AVX512DQNOBW-NEXT: vpand %ymm0, %ymm4, %ymm0
+; AVX512DQNOBW-NEXT: vpmovm2d %k0, %zmm5
+; AVX512DQNOBW-NEXT: vpmovdw %zmm2, %ymm2
+; AVX512DQNOBW-NEXT: vpmovdw %zmm3, %ymm3
+; AVX512DQNOBW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512DQNOBW-NEXT: vpandq %zmm1, %zmm2, %zmm1
+; AVX512DQNOBW-NEXT: vpmovdw %zmm4, %ymm2
+; AVX512DQNOBW-NEXT: vpmovdw %zmm5, %ymm3
+; AVX512DQNOBW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512DQNOBW-NEXT: vpandq %zmm0, %zmm2, %zmm0
; AVX512DQNOBW-NEXT: retq
%ret = select <64 x i1> %mask, <64 x i16> %x, <64 x i16> zeroinitializer
ret <64 x i16> %ret
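; Each half of the <64 x i1> mask is still materialized per 16 lanes
; (vpternlogd under a k-register on KNL, vpmovm2d on AVX512DQNOBW) and
; truncated with vpmovdw, but the two YMM halves are now concatenated with
; vinserti64x4 so a single vpandq masks each ZMM input.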
; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm2
; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; KNL-NEXT: vpcmpeqb %ymm2, %ymm3, %ymm2
-; KNL-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; KNL-NEXT: vpand %ymm3, %ymm2, %ymm2
; KNL-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
-; KNL-NEXT: vpand %ymm3, %ymm0, %ymm0
; KNL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; KNL-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: zext_64xi1_to_64xi8:
; AVX512DQNOBW-NEXT: vextracti64x4 $1, %zmm1, %ymm2
; AVX512DQNOBW-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; AVX512DQNOBW-NEXT: vpcmpeqb %ymm2, %ymm3, %ymm2
-; AVX512DQNOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX512DQNOBW-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX512DQNOBW-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
-; AVX512DQNOBW-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX512DQNOBW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512DQNOBW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
; AVX512DQNOBW-NEXT: retq
%mask = icmp eq <64 x i8> %x, %y
%1 = zext <64 x i1> %mask to <64 x i8>
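; The zero-extension of the byte-compare mask now folds into one ZMM-wide
; vpandq against a constant-pool operand, replacing the two YMM vpands
; against a materialized vector of 1s.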
; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm2
; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; KNL-NEXT: vpcmpeqw %ymm2, %ymm3, %ymm2
-; KNL-NEXT: vpsrlw $15, %ymm2, %ymm2
; KNL-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
-; KNL-NEXT: vpsrlw $15, %ymm0, %ymm0
; KNL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; KNL-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: zext_32xi1_to_32xi16:
; AVX512DQNOBW-NEXT: vextracti64x4 $1, %zmm1, %ymm2
; AVX512DQNOBW-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; AVX512DQNOBW-NEXT: vpcmpeqw %ymm2, %ymm3, %ymm2
-; AVX512DQNOBW-NEXT: vpsrlw $15, %ymm2, %ymm2
; AVX512DQNOBW-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
-; AVX512DQNOBW-NEXT: vpsrlw $15, %ymm0, %ymm0
; AVX512DQNOBW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512DQNOBW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
; AVX512DQNOBW-NEXT: retq
%mask = icmp eq <32 x i16> %x, %y
%1 = zext <32 x i1> %mask to <32 x i16>
}
define <32 x i16> @insert_v32i16(<32 x i16> %x, i16 %y, i16* %ptr) {
-; KNL-LABEL: insert_v32i16:
-; KNL: ## %bb.0:
-; KNL-NEXT: vpinsrw $1, (%rsi), %xmm0, %xmm1
-; KNL-NEXT: vextracti128 $1, %ymm0, %xmm2
-; KNL-NEXT: vpinsrw $1, %edi, %xmm2, %xmm2
-; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; KNL-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
-; KNL-NEXT: retq
-;
-; SKX-LABEL: insert_v32i16:
-; SKX: ## %bb.0:
-; SKX-NEXT: vpinsrw $1, (%rsi), %xmm0, %xmm1
-; SKX-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm1
-; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0
-; SKX-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0
-; SKX-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0
-; SKX-NEXT: retq
+; CHECK-LABEL: insert_v32i16:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpinsrw $1, (%rsi), %xmm0, %xmm1
+; CHECK-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm1
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0
+; CHECK-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
%val = load i16, i16* %ptr
%r1 = insertelement <32 x i16> %x, i16 %val, i32 1
%r2 = insertelement <32 x i16> %r1, i16 %y, i32 9
}
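; KNL now inserts through the full ZMM value with vinserti32x4, identical to
; the SKX sequence, so the two bodies fold into the shared CHECK block above.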
define <64 x i8> @insert_v64i8(<64 x i8> %x, i8 %y, i8* %ptr) {
-; KNL-LABEL: insert_v64i8:
-; KNL: ## %bb.0:
-; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; KNL-NEXT: vpinsrb $1, (%rsi), %xmm0, %xmm2
-; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; KNL-NEXT: vextracti128 $1, %ymm1, %xmm2
-; KNL-NEXT: vpinsrb $2, %edi, %xmm2, %xmm2
-; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; KNL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; KNL-NEXT: retq
-;
-; SKX-LABEL: insert_v64i8:
-; SKX: ## %bb.0:
-; SKX-NEXT: vpinsrb $1, (%rsi), %xmm0, %xmm1
-; SKX-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm1
-; SKX-NEXT: vextracti32x4 $3, %zmm0, %xmm0
-; SKX-NEXT: vpinsrb $2, %edi, %xmm0, %xmm0
-; SKX-NEXT: vinserti32x4 $3, %xmm0, %zmm1, %zmm0
-; SKX-NEXT: retq
+; CHECK-LABEL: insert_v64i8:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpinsrb $1, (%rsi), %xmm0, %xmm1
+; CHECK-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm1
+; CHECK-NEXT: vextracti32x4 $3, %zmm0, %xmm0
+; CHECK-NEXT: vpinsrb $2, %edi, %xmm0, %xmm0
+; CHECK-NEXT: vinserti32x4 $3, %xmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
%val = load i8, i8* %ptr
%r1 = insertelement <64 x i8> %x, i8 %val, i32 1
%r2 = insertelement <64 x i8> %r1, i8 %y, i32 50
define zeroext i8 @test_extractelement_v64i1(<64 x i8> %a, <64 x i8> %b) {
; KNL-LABEL: test_extractelement_v64i1:
; KNL: ## %bb.0:
-; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm1
; KNL-NEXT: vextracti128 $1, %ymm1, %xmm1
+; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
; KNL-NEXT: vpminub %xmm1, %xmm0, %xmm1
; KNL-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
define zeroext i8 @extractelement_v64i1_alt(<64 x i8> %a, <64 x i8> %b) {
; KNL-LABEL: extractelement_v64i1_alt:
; KNL: ## %bb.0:
-; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm1
; KNL-NEXT: vextracti128 $1, %ymm1, %xmm1
+; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
; KNL-NEXT: vpminub %xmm1, %xmm0, %xmm1
; KNL-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
}
define i16 @test_extractelement_variable_v32i16(<32 x i16> %t1, i32 %index) {
-; KNL-LABEL: test_extractelement_variable_v32i16:
-; KNL: ## %bb.0:
-; KNL-NEXT: pushq %rbp
-; KNL-NEXT: .cfi_def_cfa_offset 16
-; KNL-NEXT: .cfi_offset %rbp, -16
-; KNL-NEXT: movq %rsp, %rbp
-; KNL-NEXT: .cfi_def_cfa_register %rbp
-; KNL-NEXT: andq $-64, %rsp
-; KNL-NEXT: subq $128, %rsp
-; KNL-NEXT: ## kill: def $edi killed $edi def $rdi
-; KNL-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; KNL-NEXT: vmovaps %ymm0, (%rsp)
-; KNL-NEXT: andl $31, %edi
-; KNL-NEXT: movzwl (%rsp,%rdi,2), %eax
-; KNL-NEXT: movq %rbp, %rsp
-; KNL-NEXT: popq %rbp
-; KNL-NEXT: vzeroupper
-; KNL-NEXT: retq
-;
-; SKX-LABEL: test_extractelement_variable_v32i16:
-; SKX: ## %bb.0:
-; SKX-NEXT: pushq %rbp
-; SKX-NEXT: .cfi_def_cfa_offset 16
-; SKX-NEXT: .cfi_offset %rbp, -16
-; SKX-NEXT: movq %rsp, %rbp
-; SKX-NEXT: .cfi_def_cfa_register %rbp
-; SKX-NEXT: andq $-64, %rsp
-; SKX-NEXT: subq $128, %rsp
-; SKX-NEXT: ## kill: def $edi killed $edi def $rdi
-; SKX-NEXT: vmovaps %zmm0, (%rsp)
-; SKX-NEXT: andl $31, %edi
-; SKX-NEXT: movzwl (%rsp,%rdi,2), %eax
-; SKX-NEXT: movq %rbp, %rsp
-; SKX-NEXT: popq %rbp
-; SKX-NEXT: vzeroupper
-; SKX-NEXT: retq
+; CHECK-LABEL: test_extractelement_variable_v32i16:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset %rbp, -16
+; CHECK-NEXT: movq %rsp, %rbp
+; CHECK-NEXT: .cfi_def_cfa_register %rbp
+; CHECK-NEXT: andq $-64, %rsp
+; CHECK-NEXT: subq $128, %rsp
+; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi
+; CHECK-NEXT: vmovaps %zmm0, (%rsp)
+; CHECK-NEXT: andl $31, %edi
+; CHECK-NEXT: movzwl (%rsp,%rdi,2), %eax
+; CHECK-NEXT: movq %rbp, %rsp
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%t2 = extractelement <32 x i16> %t1, i32 %index
ret i16 %t2
}
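; Spilling for the variable index is now a single 64-byte vmovaps of the ZMM
; register on KNL too, rather than vextractf64x4 plus a YMM store, which again
; lets both targets share one CHECK body.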
}
define i8 @test_extractelement_variable_v64i8(<64 x i8> %t1, i32 %index) {
-; KNL-LABEL: test_extractelement_variable_v64i8:
-; KNL: ## %bb.0:
-; KNL-NEXT: pushq %rbp
-; KNL-NEXT: .cfi_def_cfa_offset 16
-; KNL-NEXT: .cfi_offset %rbp, -16
-; KNL-NEXT: movq %rsp, %rbp
-; KNL-NEXT: .cfi_def_cfa_register %rbp
-; KNL-NEXT: andq $-64, %rsp
-; KNL-NEXT: subq $128, %rsp
-; KNL-NEXT: ## kill: def $edi killed $edi def $rdi
-; KNL-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; KNL-NEXT: vmovaps %ymm0, (%rsp)
-; KNL-NEXT: andl $63, %edi
-; KNL-NEXT: movb (%rsp,%rdi), %al
-; KNL-NEXT: movq %rbp, %rsp
-; KNL-NEXT: popq %rbp
-; KNL-NEXT: vzeroupper
-; KNL-NEXT: retq
-;
-; SKX-LABEL: test_extractelement_variable_v64i8:
-; SKX: ## %bb.0:
-; SKX-NEXT: pushq %rbp
-; SKX-NEXT: .cfi_def_cfa_offset 16
-; SKX-NEXT: .cfi_offset %rbp, -16
-; SKX-NEXT: movq %rsp, %rbp
-; SKX-NEXT: .cfi_def_cfa_register %rbp
-; SKX-NEXT: andq $-64, %rsp
-; SKX-NEXT: subq $128, %rsp
-; SKX-NEXT: ## kill: def $edi killed $edi def $rdi
-; SKX-NEXT: vmovaps %zmm0, (%rsp)
-; SKX-NEXT: andl $63, %edi
-; SKX-NEXT: movb (%rsp,%rdi), %al
-; SKX-NEXT: movq %rbp, %rsp
-; SKX-NEXT: popq %rbp
-; SKX-NEXT: vzeroupper
-; SKX-NEXT: retq
+; CHECK-LABEL: test_extractelement_variable_v64i8:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset %rbp, -16
+; CHECK-NEXT: movq %rsp, %rbp
+; CHECK-NEXT: .cfi_def_cfa_register %rbp
+; CHECK-NEXT: andq $-64, %rsp
+; CHECK-NEXT: subq $128, %rsp
+; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi
+; CHECK-NEXT: vmovaps %zmm0, (%rsp)
+; CHECK-NEXT: andl $63, %edi
+; CHECK-NEXT: movb (%rsp,%rdi), %al
+; CHECK-NEXT: movq %rbp, %rsp
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%t2 = extractelement <64 x i8> %t1, i32 %index
ret i8 %t2
}
define i8 @test_extractelement_variable_v64i8_indexi8(<64 x i8> %t1, i8 %index) {
-; KNL-LABEL: test_extractelement_variable_v64i8_indexi8:
-; KNL: ## %bb.0:
-; KNL-NEXT: pushq %rbp
-; KNL-NEXT: .cfi_def_cfa_offset 16
-; KNL-NEXT: .cfi_offset %rbp, -16
-; KNL-NEXT: movq %rsp, %rbp
-; KNL-NEXT: .cfi_def_cfa_register %rbp
-; KNL-NEXT: andq $-64, %rsp
-; KNL-NEXT: subq $128, %rsp
-; KNL-NEXT: addb %dil, %dil
-; KNL-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; KNL-NEXT: vmovaps %ymm0, (%rsp)
-; KNL-NEXT: movzbl %dil, %eax
-; KNL-NEXT: andl $63, %eax
-; KNL-NEXT: movb (%rsp,%rax), %al
-; KNL-NEXT: movq %rbp, %rsp
-; KNL-NEXT: popq %rbp
-; KNL-NEXT: vzeroupper
-; KNL-NEXT: retq
-;
-; SKX-LABEL: test_extractelement_variable_v64i8_indexi8:
-; SKX: ## %bb.0:
-; SKX-NEXT: pushq %rbp
-; SKX-NEXT: .cfi_def_cfa_offset 16
-; SKX-NEXT: .cfi_offset %rbp, -16
-; SKX-NEXT: movq %rsp, %rbp
-; SKX-NEXT: .cfi_def_cfa_register %rbp
-; SKX-NEXT: andq $-64, %rsp
-; SKX-NEXT: subq $128, %rsp
-; SKX-NEXT: addb %dil, %dil
-; SKX-NEXT: vmovaps %zmm0, (%rsp)
-; SKX-NEXT: movzbl %dil, %eax
-; SKX-NEXT: andl $63, %eax
-; SKX-NEXT: movb (%rsp,%rax), %al
-; SKX-NEXT: movq %rbp, %rsp
-; SKX-NEXT: popq %rbp
-; SKX-NEXT: vzeroupper
-; SKX-NEXT: retq
+; CHECK-LABEL: test_extractelement_variable_v64i8_indexi8:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset %rbp, -16
+; CHECK-NEXT: movq %rsp, %rbp
+; CHECK-NEXT: .cfi_def_cfa_register %rbp
+; CHECK-NEXT: andq $-64, %rsp
+; CHECK-NEXT: subq $128, %rsp
+; CHECK-NEXT: addb %dil, %dil
+; CHECK-NEXT: vmovaps %zmm0, (%rsp)
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: andl $63, %eax
+; CHECK-NEXT: movb (%rsp,%rax), %al
+; CHECK-NEXT: movq %rbp, %rsp
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%i = add i8 %index, %index
%t2 = extractelement <64 x i8> %t1, i8 %i
; KNL-NEXT: andq $-64, %rsp
; KNL-NEXT: subq $128, %rsp
; KNL-NEXT: ## kill: def $esi killed $esi def $rsi
-; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; KNL-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0
+; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; KNL-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm2
+; KNL-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2
+; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; KNL-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
; KNL-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
-; KNL-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm1
-; KNL-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1
; KNL-NEXT: andl $63, %esi
; KNL-NEXT: testb %dil, %dil
-; KNL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp)
-; KNL-NEXT: vmovdqa %ymm0, (%rsp)
+; KNL-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp)
+; KNL-NEXT: vmovdqa %ymm2, (%rsp)
; KNL-NEXT: setne (%rsp,%rsi)
; KNL-NEXT: vpmovsxbd (%rsp), %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: .cfi_def_cfa_register %rbp
; KNL-NEXT: andq $-128, %rsp
; KNL-NEXT: subq $256, %rsp ## imm = 0x100
-; KNL-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; KNL-NEXT: vpinsrb $1, 232(%rbp), %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $2, 240(%rbp), %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $3, 248(%rbp), %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $4, 256(%rbp), %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $5, 264(%rbp), %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $6, 272(%rbp), %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $7, 280(%rbp), %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $8, 288(%rbp), %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $9, 296(%rbp), %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $10, 304(%rbp), %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $11, 312(%rbp), %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $12, 320(%rbp), %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $13, 328(%rbp), %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $14, 336(%rbp), %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $15, 344(%rbp), %xmm0, %xmm0
+; KNL-NEXT: movl 744(%rbp), %eax
+; KNL-NEXT: andl $127, %eax
+; KNL-NEXT: vmovd %edi, %xmm0
+; KNL-NEXT: vpinsrb $1, %esi, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $4, %r8d, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $5, %r9d, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $6, 16(%rbp), %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $7, 24(%rbp), %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $8, 32(%rbp), %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $9, 40(%rbp), %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $10, 48(%rbp), %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $11, 56(%rbp), %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $12, 64(%rbp), %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $13, 72(%rbp), %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $14, 80(%rbp), %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $15, 88(%rbp), %xmm0, %xmm0
; KNL-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; KNL-NEXT: vpinsrb $1, 360(%rbp), %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $2, 368(%rbp), %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $3, 376(%rbp), %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $4, 384(%rbp), %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $5, 392(%rbp), %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $6, 400(%rbp), %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $7, 408(%rbp), %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $8, 416(%rbp), %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $9, 424(%rbp), %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $10, 432(%rbp), %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $11, 440(%rbp), %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $12, 448(%rbp), %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $13, 456(%rbp), %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $14, 464(%rbp), %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $15, 472(%rbp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $1, 104(%rbp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $2, 112(%rbp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $3, 120(%rbp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $4, 128(%rbp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $5, 136(%rbp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $6, 144(%rbp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $7, 152(%rbp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $8, 160(%rbp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $9, 168(%rbp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $10, 176(%rbp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $11, 184(%rbp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $12, 192(%rbp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $13, 200(%rbp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $14, 208(%rbp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $15, 216(%rbp), %xmm1, %xmm1
; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; KNL-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; KNL-NEXT: vpinsrb $1, 488(%rbp), %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $2, 496(%rbp), %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $3, 504(%rbp), %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $4, 512(%rbp), %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $5, 520(%rbp), %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $6, 528(%rbp), %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $7, 536(%rbp), %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $8, 544(%rbp), %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $9, 552(%rbp), %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $10, 560(%rbp), %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $11, 568(%rbp), %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $12, 576(%rbp), %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $13, 584(%rbp), %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $14, 592(%rbp), %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $15, 600(%rbp), %xmm1, %xmm1
+; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; KNL-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
+; KNL-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; KNL-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; KNL-NEXT: vpinsrb $1, 616(%rbp), %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $2, 624(%rbp), %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $3, 632(%rbp), %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $4, 640(%rbp), %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $5, 648(%rbp), %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $6, 656(%rbp), %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $7, 664(%rbp), %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $8, 672(%rbp), %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $9, 680(%rbp), %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $10, 688(%rbp), %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $11, 696(%rbp), %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $12, 704(%rbp), %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $13, 712(%rbp), %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $14, 720(%rbp), %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $15, 728(%rbp), %xmm2, %xmm2
-; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; KNL-NEXT: vmovd %edi, %xmm2
-; KNL-NEXT: vpinsrb $1, %esi, %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $2, %edx, %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $3, %ecx, %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $4, %r8d, %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $5, %r9d, %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $6, 16(%rbp), %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $7, 24(%rbp), %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $8, 32(%rbp), %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $9, 40(%rbp), %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $10, 48(%rbp), %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $11, 56(%rbp), %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $12, 64(%rbp), %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $13, 72(%rbp), %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $14, 80(%rbp), %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $15, 88(%rbp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $1, 232(%rbp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $2, 240(%rbp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $3, 248(%rbp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $4, 256(%rbp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $5, 264(%rbp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $6, 272(%rbp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $7, 280(%rbp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $8, 288(%rbp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $9, 296(%rbp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $10, 304(%rbp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $11, 312(%rbp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $12, 320(%rbp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $13, 328(%rbp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $14, 336(%rbp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $15, 344(%rbp), %xmm2, %xmm2
; KNL-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; KNL-NEXT: vpinsrb $1, 104(%rbp), %xmm3, %xmm3
-; KNL-NEXT: vpinsrb $2, 112(%rbp), %xmm3, %xmm3
-; KNL-NEXT: vpinsrb $3, 120(%rbp), %xmm3, %xmm3
-; KNL-NEXT: vpinsrb $4, 128(%rbp), %xmm3, %xmm3
-; KNL-NEXT: vpinsrb $5, 136(%rbp), %xmm3, %xmm3
-; KNL-NEXT: vpinsrb $6, 144(%rbp), %xmm3, %xmm3
-; KNL-NEXT: vpinsrb $7, 152(%rbp), %xmm3, %xmm3
-; KNL-NEXT: vpinsrb $8, 160(%rbp), %xmm3, %xmm3
-; KNL-NEXT: vpinsrb $9, 168(%rbp), %xmm3, %xmm3
-; KNL-NEXT: vpinsrb $10, 176(%rbp), %xmm3, %xmm3
-; KNL-NEXT: vpinsrb $11, 184(%rbp), %xmm3, %xmm3
-; KNL-NEXT: vpinsrb $12, 192(%rbp), %xmm3, %xmm3
-; KNL-NEXT: vpinsrb $13, 200(%rbp), %xmm3, %xmm3
-; KNL-NEXT: vpinsrb $14, 208(%rbp), %xmm3, %xmm3
-; KNL-NEXT: vpinsrb $15, 216(%rbp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $1, 360(%rbp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $2, 368(%rbp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $3, 376(%rbp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $4, 384(%rbp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $5, 392(%rbp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $6, 400(%rbp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $7, 408(%rbp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $8, 416(%rbp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $9, 424(%rbp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $10, 432(%rbp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $11, 440(%rbp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $12, 448(%rbp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $13, 456(%rbp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $14, 464(%rbp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $15, 472(%rbp), %xmm3, %xmm3
; KNL-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; KNL-NEXT: movl 744(%rbp), %eax
-; KNL-NEXT: andl $127, %eax
-; KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; KNL-NEXT: vpcmpeqb %ymm3, %ymm2, %ymm2
+; KNL-NEXT: vpcmpeqb %ymm1, %ymm2, %ymm2
; KNL-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2
-; KNL-NEXT: vpcmpeqb %ymm3, %ymm1, %ymm1
+; KNL-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; KNL-NEXT: vpinsrb $1, 488(%rbp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $2, 496(%rbp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $3, 504(%rbp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $4, 512(%rbp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $5, 520(%rbp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $6, 528(%rbp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $7, 536(%rbp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $8, 544(%rbp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $9, 552(%rbp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $10, 560(%rbp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $11, 568(%rbp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $12, 576(%rbp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $13, 584(%rbp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $14, 592(%rbp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $15, 600(%rbp), %xmm3, %xmm3
+; KNL-NEXT: vmovd {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; KNL-NEXT: vpinsrb $1, 616(%rbp), %xmm4, %xmm4
+; KNL-NEXT: vpinsrb $2, 624(%rbp), %xmm4, %xmm4
+; KNL-NEXT: vpinsrb $3, 632(%rbp), %xmm4, %xmm4
+; KNL-NEXT: vpinsrb $4, 640(%rbp), %xmm4, %xmm4
+; KNL-NEXT: vpinsrb $5, 648(%rbp), %xmm4, %xmm4
+; KNL-NEXT: vpinsrb $6, 656(%rbp), %xmm4, %xmm4
+; KNL-NEXT: vpinsrb $7, 664(%rbp), %xmm4, %xmm4
+; KNL-NEXT: vpinsrb $8, 672(%rbp), %xmm4, %xmm4
+; KNL-NEXT: vpinsrb $9, 680(%rbp), %xmm4, %xmm4
+; KNL-NEXT: vpinsrb $10, 688(%rbp), %xmm4, %xmm4
+; KNL-NEXT: vpinsrb $11, 696(%rbp), %xmm4, %xmm4
+; KNL-NEXT: vpinsrb $12, 704(%rbp), %xmm4, %xmm4
+; KNL-NEXT: vpinsrb $13, 712(%rbp), %xmm4, %xmm4
+; KNL-NEXT: vpinsrb $14, 720(%rbp), %xmm4, %xmm4
+; KNL-NEXT: vpinsrb $15, 728(%rbp), %xmm4, %xmm4
+; KNL-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; KNL-NEXT: vpcmpeqb %ymm1, %ymm3, %ymm1
; KNL-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1
-; KNL-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm0
-; KNL-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; KNL-NEXT: cmpb $0, 736(%rbp)
-; KNL-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp)
; KNL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp)
-; KNL-NEXT: vmovdqa %ymm2, (%rsp)
+; KNL-NEXT: vmovdqa %ymm2, {{[0-9]+}}(%rsp)
+; KNL-NEXT: vmovdqa %ymm0, (%rsp)
; KNL-NEXT: setne (%rsp,%rax)
; KNL-NEXT: vpmovsxbd (%rsp), %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: andq $-128, %rsp
; KNL-NEXT: subq $256, %rsp ## imm = 0x100
; KNL-NEXT: ## kill: def $esi killed $esi def $rsi
-; KNL-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; KNL-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm0
+; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; KNL-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm3
+; KNL-NEXT: vpternlogq $15, %zmm3, %zmm3, %zmm3
+; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; KNL-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0
; KNL-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
-; KNL-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm1
+; KNL-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm4
+; KNL-NEXT: vpternlogq $15, %zmm4, %zmm4, %zmm4
+; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; KNL-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm1
; KNL-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1
-; KNL-NEXT: vpcmpeqb %ymm4, %ymm2, %ymm2
-; KNL-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2
-; KNL-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm3
-; KNL-NEXT: vpternlogq $15, %zmm3, %zmm3, %zmm3
; KNL-NEXT: andl $127, %esi
; KNL-NEXT: testb %dil, %dil
-; KNL-NEXT: vmovdqa %ymm3, {{[0-9]+}}(%rsp)
-; KNL-NEXT: vmovdqa %ymm2, {{[0-9]+}}(%rsp)
; KNL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp)
-; KNL-NEXT: vmovdqa %ymm0, (%rsp)
+; KNL-NEXT: vmovdqa %ymm4, {{[0-9]+}}(%rsp)
+; KNL-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp)
+; KNL-NEXT: vmovdqa %ymm3, (%rsp)
; KNL-NEXT: setne (%rsp,%rsi)
; KNL-NEXT: vpmovsxbd (%rsp), %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
define <64 x i8> @and_v64i8(<64 x i8> %a, <64 x i8> %b) {
; KNL-LABEL: and_v64i8:
; KNL: ## %bb.0:
-; KNL-NEXT: vpandd %zmm1, %zmm0, %zmm0
+; KNL-NEXT: vpandq %zmm1, %zmm0, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: and_v64i8:
define <64 x i8> @andn_v64i8(<64 x i8> %a, <64 x i8> %b) {
; KNL-LABEL: andn_v64i8:
; KNL: ## %bb.0:
-; KNL-NEXT: vextractf64x4 $1, %zmm0, %ymm2
-; KNL-NEXT: vextractf64x4 $1, %zmm1, %ymm3
-; KNL-NEXT: vandnps %ymm2, %ymm3, %ymm2
-; KNL-NEXT: vandnps %ymm0, %ymm1, %ymm0
-; KNL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; KNL-NEXT: vpandnq %zmm0, %zmm1, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: andn_v64i8:
define <64 x i8> @or_v64i8(<64 x i8> %a, <64 x i8> %b) {
; KNL-LABEL: or_v64i8:
; KNL: ## %bb.0:
-; KNL-NEXT: vpord %zmm1, %zmm0, %zmm0
+; KNL-NEXT: vporq %zmm1, %zmm0, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: or_v64i8:
define <64 x i8> @xor_v64i8(<64 x i8> %a, <64 x i8> %b) {
; KNL-LABEL: xor_v64i8:
; KNL: ## %bb.0:
-; KNL-NEXT: vpxord %zmm1, %zmm0, %zmm0
+; KNL-NEXT: vpxorq %zmm1, %zmm0, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: xor_v64i8:
define <32 x i16> @and_v32i16(<32 x i16> %a, <32 x i16> %b) {
; KNL-LABEL: and_v32i16:
; KNL: ## %bb.0:
-; KNL-NEXT: vpandd %zmm1, %zmm0, %zmm0
+; KNL-NEXT: vpandq %zmm1, %zmm0, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: and_v32i16:
define <32 x i16> @andn_v32i16(<32 x i16> %a, <32 x i16> %b) {
; KNL-LABEL: andn_v32i16:
; KNL: ## %bb.0:
-; KNL-NEXT: vextractf64x4 $1, %zmm0, %ymm2
-; KNL-NEXT: vextractf64x4 $1, %zmm1, %ymm3
-; KNL-NEXT: vandnps %ymm2, %ymm3, %ymm2
-; KNL-NEXT: vandnps %ymm0, %ymm1, %ymm0
-; KNL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; KNL-NEXT: vpandnq %zmm0, %zmm1, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: andn_v32i16:
define <32 x i16> @or_v32i16(<32 x i16> %a, <32 x i16> %b) {
; KNL-LABEL: or_v32i16:
; KNL: ## %bb.0:
-; KNL-NEXT: vpord %zmm1, %zmm0, %zmm0
+; KNL-NEXT: vporq %zmm1, %zmm0, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: or_v32i16:
define <32 x i16> @xor_v32i16(<32 x i16> %a, <32 x i16> %b) {
; KNL-LABEL: xor_v32i16:
; KNL: ## %bb.0:
-; KNL-NEXT: vpxord %zmm1, %zmm0, %zmm0
+; KNL-NEXT: vpxorq %zmm1, %zmm0, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: xor_v32i16:
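The bitwise tests above reflect that logic operations are element-width agnostic: with v64i8 and v32i16 kept in a single ZMM register on AVX512F-only targets, and/or/xor/andn select the q-suffixed 512-bit instructions directly instead of splitting into YMM halves. As a minimal sketch (hypothetical function, not part of this patch), one would expect the following to compile to two vporq instructions on a KNL-class target:

define <64 x i8> @or3_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c) {
  ; Both ors stay in one ZMM register; the element type is irrelevant to
  ; the bitwise lowering, so vporq is usable even without AVX512BW.
  %t = or <64 x i8> %a, %b
  %r = or <64 x i8> %t, %c
  ret <64 x i8> %r
}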
define <32 x i16> @test21(<32 x i16> %x , <32 x i1> %mask) nounwind readnone {
; KNL-LABEL: test21:
; KNL: ## %bb.0:
-; KNL-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; KNL-NEXT: vextracti128 $1, %ymm1, %xmm1
+; KNL-NEXT: vextracti128 $1, %ymm1, %xmm2
+; KNL-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
; KNL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; KNL-NEXT: vpsllw $15, %ymm1, %ymm1
; KNL-NEXT: vpsraw $15, %ymm1, %ymm1
-; KNL-NEXT: vpand %ymm3, %ymm1, %ymm1
; KNL-NEXT: vpsllw $15, %ymm2, %ymm2
; KNL-NEXT: vpsraw $15, %ymm2, %ymm2
-; KNL-NEXT: vpand %ymm0, %ymm2, %ymm0
-; KNL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; KNL-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; KNL-NEXT: vpandq %zmm0, %zmm1, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: test21:
;
; AVX512DQ-LABEL: test21:
; AVX512DQ: ## %bb.0:
-; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512DQ-NEXT: vpsllw $15, %ymm1, %ymm1
; AVX512DQ-NEXT: vpsraw $15, %ymm1, %ymm1
-; AVX512DQ-NEXT: vpand %ymm3, %ymm1, %ymm1
; AVX512DQ-NEXT: vpsllw $15, %ymm2, %ymm2
; AVX512DQ-NEXT: vpsraw $15, %ymm2, %ymm2
-; AVX512DQ-NEXT: vpand %ymm0, %ymm2, %ymm0
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512DQ-NEXT: vpandq %zmm0, %zmm1, %zmm0
; AVX512DQ-NEXT: retq
;
; X86-LABEL: test21:
define <32 x i16> @test_build_vec_v32i1(<32 x i16> %x) {
; KNL-LABEL: test_build_vec_v32i1:
; KNL: ## %bb.0:
-; KNL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
-; KNL-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1
+; KNL-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm1
+; KNL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
; KNL-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
-; KNL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; KNL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: test_build_vec_v32i1:
;
; AVX512DQ-LABEL: test_build_vec_v32i1:
; AVX512DQ: ## %bb.0:
-; AVX512DQ-NEXT: vextractf64x4 $1, %zmm0, %ymm1
-; AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1
+; AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm1
+; AVX512DQ-NEXT: vextractf64x4 $1, %zmm0, %ymm0
; AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
-; AVX512DQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512DQ-NEXT: retq
;
; X86-LABEL: test_build_vec_v32i1:
define <32 x i16> @test_build_vec_v32i1_optsize(<32 x i16> %x) optsize {
; KNL-LABEL: test_build_vec_v32i1_optsize:
; KNL: ## %bb.0:
-; KNL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
-; KNL-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1
+; KNL-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm1
+; KNL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
; KNL-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
-; KNL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; KNL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: test_build_vec_v32i1_optsize:
;
; AVX512DQ-LABEL: test_build_vec_v32i1_optsize:
; AVX512DQ: ## %bb.0:
-; AVX512DQ-NEXT: vextractf64x4 $1, %zmm0, %ymm1
-; AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1
+; AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm1
+; AVX512DQ-NEXT: vextractf64x4 $1, %zmm0, %ymm0
; AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
-; AVX512DQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512DQ-NEXT: retq
;
; X86-LABEL: test_build_vec_v32i1_optsize:
define <32 x i16> @test_build_vec_v32i1_pgso(<32 x i16> %x) !prof !14 {
; KNL-LABEL: test_build_vec_v32i1_pgso:
; KNL: ## %bb.0:
-; KNL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
-; KNL-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1
+; KNL-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm1
+; KNL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
; KNL-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
-; KNL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; KNL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: test_build_vec_v32i1_pgso:
;
; AVX512DQ-LABEL: test_build_vec_v32i1_pgso:
; AVX512DQ: ## %bb.0:
-; AVX512DQ-NEXT: vextractf64x4 $1, %zmm0, %ymm1
-; AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1
+; AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm1
+; AVX512DQ-NEXT: vextractf64x4 $1, %zmm0, %ymm0
; AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
-; AVX512DQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512DQ-NEXT: retq
;
; X86-LABEL: test_build_vec_v32i1_pgso:
define <64 x i8> @test_build_vec_v64i1(<64 x i8> %x) {
; KNL-LABEL: test_build_vec_v64i1:
; KNL: ## %bb.0:
-; KNL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
-; KNL-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1
+; KNL-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm1
+; KNL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
; KNL-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
-; KNL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; KNL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: test_build_vec_v64i1:
;
; AVX512DQ-LABEL: test_build_vec_v64i1:
; AVX512DQ: ## %bb.0:
-; AVX512DQ-NEXT: vextractf64x4 $1, %zmm0, %ymm1
-; AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1
+; AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm1
+; AVX512DQ-NEXT: vextractf64x4 $1, %zmm0, %ymm0
; AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
-; AVX512DQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512DQ-NEXT: retq
;
; X86-LABEL: test_build_vec_v64i1:
; KNL: ## %bb.0:
; KNL-NEXT: kmovw (%rdi), %k1
; KNL-NEXT: kmovw 2(%rdi), %k2
-; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
+; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovdw %zmm0, %ymm0
-; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
; KNL-NEXT: vpmovdw %zmm1, %ymm1
-; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; KNL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: load_32i1:
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: kmovw (%rdi), %k0
; AVX512DQ-NEXT: kmovw 2(%rdi), %k1
-; AVX512DQ-NEXT: vpmovm2d %k1, %zmm0
+; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1
+; AVX512DQ-NEXT: vpmovm2d %k1, %zmm1
; AVX512DQ-NEXT: vpmovdw %zmm1, %ymm1
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
;
; X86-LABEL: load_32i1:
define void @store_32i1_1(<32 x i1>* %a, <32 x i16> %v) {
; KNL-LABEL: store_32i1_1:
; KNL: ## %bb.0:
-; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; KNL-NEXT: vpmovsxwd %ymm0, %zmm1
+; KNL-NEXT: vpslld $31, %zmm1, %zmm1
+; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
-; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
-; KNL-NEXT: vpmovsxwd %ymm1, %zmm0
-; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
; KNL-NEXT: kmovw %k1, 2(%rdi)
; KNL-NEXT: kmovw %k0, (%rdi)
;
; AVX512DQ-LABEL: store_32i1_1:
; AVX512DQ: ## %bb.0:
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm1
+; AVX512DQ-NEXT: vpslld $31, %zmm1, %zmm1
+; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpslld $31, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0
-; AVX512DQ-NEXT: vpmovsxwd %ymm1, %zmm0
-; AVX512DQ-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1
; AVX512DQ-NEXT: kmovw %k1, 2(%rdi)
; AVX512DQ-NEXT: kmovw %k0, (%rdi)
; KNL: ## %bb.0:
; KNL-NEXT: pushq %rax
; KNL-NEXT: .cfi_def_cfa_offset 16
-; KNL-NEXT: vextracti64x4 $1, %zmm3, %ymm4
-; KNL-NEXT: vextracti64x4 $1, %zmm2, %ymm5
-; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm6
-; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm7
-; KNL-NEXT: vpxor %xmm8, %xmm8, %xmm8
-; KNL-NEXT: vpcmpeqw %ymm0, %ymm8, %ymm0
-; KNL-NEXT: vpcmpeqw %ymm7, %ymm8, %ymm7
-; KNL-NEXT: vpcmpeqw %ymm1, %ymm8, %ymm1
-; KNL-NEXT: vpor %ymm1, %ymm0, %ymm0
-; KNL-NEXT: vpcmpeqw %ymm6, %ymm8, %ymm1
-; KNL-NEXT: vpor %ymm1, %ymm7, %ymm1
-; KNL-NEXT: vpcmpeqw %ymm2, %ymm8, %ymm2
-; KNL-NEXT: vpcmpeqw %ymm5, %ymm8, %ymm5
-; KNL-NEXT: vpcmpeqw %ymm3, %ymm8, %ymm3
-; KNL-NEXT: vpor %ymm3, %ymm2, %ymm2
-; KNL-NEXT: vpand %ymm2, %ymm0, %ymm0
-; KNL-NEXT: vpcmpeqw %ymm4, %ymm8, %ymm2
-; KNL-NEXT: vpor %ymm2, %ymm5, %ymm2
-; KNL-NEXT: vpand %ymm2, %ymm1, %ymm1
-; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
-; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm4
+; KNL-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; KNL-NEXT: vpcmpeqw %ymm5, %ymm4, %ymm4
+; KNL-NEXT: vpcmpeqw %ymm5, %ymm0, %ymm0
+; KNL-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
+; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; KNL-NEXT: vpcmpeqw %ymm5, %ymm4, %ymm4
+; KNL-NEXT: vpcmpeqw %ymm5, %ymm1, %ymm1
+; KNL-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1
+; KNL-NEXT: vporq %zmm1, %zmm0, %zmm0
+; KNL-NEXT: vextracti64x4 $1, %zmm2, %ymm1
+; KNL-NEXT: vpcmpeqw %ymm5, %ymm1, %ymm1
+; KNL-NEXT: vpcmpeqw %ymm5, %ymm2, %ymm2
+; KNL-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
+; KNL-NEXT: vextracti64x4 $1, %zmm3, %ymm2
+; KNL-NEXT: vpcmpeqw %ymm5, %ymm2, %ymm2
+; KNL-NEXT: vpcmpeqw %ymm5, %ymm3, %ymm3
+; KNL-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; KNL-NEXT: vporq %zmm2, %zmm1, %zmm1
+; KNL-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; KNL-NEXT: vpmovsxwd %ymm0, %zmm1
+; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpmovsxwd %ymm1, %zmm0
+; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %ecx
; KNL-NEXT: shll $16, %ecx
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: pushq %rax
; AVX512DQ-NEXT: .cfi_def_cfa_offset 16
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm3, %ymm4
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm5
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm6
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm7
-; AVX512DQ-NEXT: vpxor %xmm8, %xmm8, %xmm8
-; AVX512DQ-NEXT: vpcmpeqw %ymm0, %ymm8, %ymm0
-; AVX512DQ-NEXT: vpcmpeqw %ymm7, %ymm8, %ymm7
-; AVX512DQ-NEXT: vpcmpeqw %ymm1, %ymm8, %ymm1
-; AVX512DQ-NEXT: vpor %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpcmpeqw %ymm6, %ymm8, %ymm1
-; AVX512DQ-NEXT: vpor %ymm1, %ymm7, %ymm1
-; AVX512DQ-NEXT: vpcmpeqw %ymm2, %ymm8, %ymm2
-; AVX512DQ-NEXT: vpcmpeqw %ymm5, %ymm8, %ymm5
-; AVX512DQ-NEXT: vpcmpeqw %ymm3, %ymm8, %ymm3
-; AVX512DQ-NEXT: vpor %ymm3, %ymm2, %ymm2
-; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpcmpeqw %ymm4, %ymm8, %ymm2
-; AVX512DQ-NEXT: vpor %ymm2, %ymm5, %ymm2
-; AVX512DQ-NEXT: vpand %ymm2, %ymm1, %ymm1
-; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
-; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm4
+; AVX512DQ-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; AVX512DQ-NEXT: vpcmpeqw %ymm5, %ymm4, %ymm4
+; AVX512DQ-NEXT: vpcmpeqw %ymm5, %ymm0, %ymm0
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512DQ-NEXT: vpcmpeqw %ymm5, %ymm4, %ymm4
+; AVX512DQ-NEXT: vpcmpeqw %ymm5, %ymm1, %ymm1
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1
+; AVX512DQ-NEXT: vporq %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm1
+; AVX512DQ-NEXT: vpcmpeqw %ymm5, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpcmpeqw %ymm5, %ymm2, %ymm2
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm3, %ymm2
+; AVX512DQ-NEXT: vpcmpeqw %ymm5, %ymm2, %ymm2
+; AVX512DQ-NEXT: vpcmpeqw %ymm5, %ymm3, %ymm3
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512DQ-NEXT: vporq %zmm2, %zmm1, %zmm1
+; AVX512DQ-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm1
+; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0
; AVX512DQ-NEXT: kmovw %k0, %eax
-; AVX512DQ-NEXT: vpmovsxwd %ymm1, %zmm0
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0
; AVX512DQ-NEXT: kmovw %k0, %ecx
; AVX512DQ-NEXT: shll $16, %ecx
; KNL: ## %bb.0:
; KNL-NEXT: pushq %rax
; KNL-NEXT: .cfi_def_cfa_offset 16
-; KNL-NEXT: vextracti64x4 $1, %zmm3, %ymm9
-; KNL-NEXT: vextracti64x4 $1, %zmm2, %ymm10
-; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm11
-; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm7
-; KNL-NEXT: vpxor %xmm8, %xmm8, %xmm8
-; KNL-NEXT: vpcmpeqb %ymm0, %ymm8, %ymm13
-; KNL-NEXT: vextracti128 $1, %ymm13, %xmm4
-; KNL-NEXT: vpcmpeqb %ymm7, %ymm8, %ymm7
-; KNL-NEXT: vextracti128 $1, %ymm7, %xmm5
-; KNL-NEXT: vpcmpeqb %ymm1, %ymm8, %ymm1
-; KNL-NEXT: vextracti128 $1, %ymm1, %xmm6
-; KNL-NEXT: vpor %xmm6, %xmm4, %xmm12
-; KNL-NEXT: vpcmpeqb %ymm8, %ymm11, %ymm6
-; KNL-NEXT: vextracti128 $1, %ymm6, %xmm4
-; KNL-NEXT: vpor %xmm4, %xmm5, %xmm11
-; KNL-NEXT: vpcmpeqb %ymm2, %ymm8, %ymm2
-; KNL-NEXT: vextracti128 $1, %ymm2, %xmm5
-; KNL-NEXT: vpcmpeqb %ymm8, %ymm10, %ymm10
-; KNL-NEXT: vextracti128 $1, %ymm10, %xmm4
-; KNL-NEXT: vpcmpeqb %ymm3, %ymm8, %ymm3
-; KNL-NEXT: vextracti128 $1, %ymm3, %xmm0
-; KNL-NEXT: vpor %xmm0, %xmm5, %xmm0
-; KNL-NEXT: vpand %xmm0, %xmm12, %xmm12
-; KNL-NEXT: vpcmpeqb %ymm8, %ymm9, %ymm5
-; KNL-NEXT: vextracti128 $1, %ymm5, %xmm0
-; KNL-NEXT: vpor %xmm0, %xmm4, %xmm0
-; KNL-NEXT: vpand %xmm0, %xmm11, %xmm0
-; KNL-NEXT: vpor %xmm6, %xmm7, %xmm4
-; KNL-NEXT: vpor %xmm1, %xmm13, %xmm1
-; KNL-NEXT: vpor %xmm5, %xmm10, %xmm5
-; KNL-NEXT: vpand %xmm5, %xmm4, %xmm4
-; KNL-NEXT: vpor %xmm3, %xmm2, %xmm2
-; KNL-NEXT: vpand %xmm2, %xmm1, %xmm1
-; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
-; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpmovsxbd %xmm12, %zmm1
-; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
-; KNL-NEXT: kmovw %k0, %ecx
-; KNL-NEXT: shll $16, %ecx
-; KNL-NEXT: orl %eax, %ecx
-; KNL-NEXT: vpmovsxbd %xmm4, %zmm1
-; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
-; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
-; KNL-NEXT: kmovw %k0, %edx
-; KNL-NEXT: shll $16, %edx
-; KNL-NEXT: orl %eax, %edx
-; KNL-NEXT: shlq $32, %rdx
-; KNL-NEXT: orq %rcx, %rdx
+; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm4
+; KNL-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; KNL-NEXT: vpcmpeqb %ymm5, %ymm4, %ymm4
+; KNL-NEXT: vpcmpeqb %ymm5, %ymm0, %ymm0
+; KNL-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
+; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; KNL-NEXT: vpcmpeqb %ymm5, %ymm4, %ymm4
+; KNL-NEXT: vpcmpeqb %ymm5, %ymm1, %ymm1
+; KNL-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1
+; KNL-NEXT: vporq %zmm1, %zmm0, %zmm0
+; KNL-NEXT: vextracti64x4 $1, %zmm2, %ymm1
+; KNL-NEXT: vpcmpeqb %ymm5, %ymm1, %ymm1
+; KNL-NEXT: vpcmpeqb %ymm5, %ymm2, %ymm2
+; KNL-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
+; KNL-NEXT: vextracti64x4 $1, %zmm3, %ymm2
+; KNL-NEXT: vpcmpeqb %ymm5, %ymm2, %ymm2
+; KNL-NEXT: vpcmpeqb %ymm5, %ymm3, %ymm3
+; KNL-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; KNL-NEXT: vporq %zmm2, %zmm1, %zmm1
+; KNL-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; KNL-NEXT: vpmovmskb %ymm0, %eax
+; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; KNL-NEXT: vpmovmskb %ymm0, %ecx
+; KNL-NEXT: shlq $32, %rcx
+; KNL-NEXT: orq %rax, %rcx
; KNL-NEXT: je LBB78_1
; KNL-NEXT: ## %bb.2: ## %exit
; KNL-NEXT: popq %rax
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: pushq %rax
; AVX512DQ-NEXT: .cfi_def_cfa_offset 16
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm3, %ymm9
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm10
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm11
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm7
-; AVX512DQ-NEXT: vpxor %xmm8, %xmm8, %xmm8
-; AVX512DQ-NEXT: vpcmpeqb %ymm0, %ymm8, %ymm13
-; AVX512DQ-NEXT: vextracti128 $1, %ymm13, %xmm4
-; AVX512DQ-NEXT: vpcmpeqb %ymm7, %ymm8, %ymm7
-; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm5
-; AVX512DQ-NEXT: vpcmpeqb %ymm1, %ymm8, %ymm1
-; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm6
-; AVX512DQ-NEXT: vpor %xmm6, %xmm4, %xmm12
-; AVX512DQ-NEXT: vpcmpeqb %ymm8, %ymm11, %ymm6
-; AVX512DQ-NEXT: vextracti128 $1, %ymm6, %xmm4
-; AVX512DQ-NEXT: vpor %xmm4, %xmm5, %xmm11
-; AVX512DQ-NEXT: vpcmpeqb %ymm2, %ymm8, %ymm2
-; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm5
-; AVX512DQ-NEXT: vpcmpeqb %ymm8, %ymm10, %ymm10
-; AVX512DQ-NEXT: vextracti128 $1, %ymm10, %xmm4
-; AVX512DQ-NEXT: vpcmpeqb %ymm3, %ymm8, %ymm3
-; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm0
-; AVX512DQ-NEXT: vpor %xmm0, %xmm5, %xmm0
-; AVX512DQ-NEXT: vpand %xmm0, %xmm12, %xmm12
-; AVX512DQ-NEXT: vpcmpeqb %ymm8, %ymm9, %ymm5
-; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm0
-; AVX512DQ-NEXT: vpor %xmm0, %xmm4, %xmm0
-; AVX512DQ-NEXT: vpand %xmm0, %xmm11, %xmm0
-; AVX512DQ-NEXT: vpor %xmm6, %xmm7, %xmm4
-; AVX512DQ-NEXT: vpor %xmm1, %xmm13, %xmm1
-; AVX512DQ-NEXT: vpor %xmm5, %xmm10, %xmm5
-; AVX512DQ-NEXT: vpand %xmm5, %xmm4, %xmm4
-; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX512DQ-NEXT: vpand %xmm2, %xmm1, %xmm1
-; AVX512DQ-NEXT: vpmovsxbd %xmm1, %zmm1
-; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0
-; AVX512DQ-NEXT: kmovw %k0, %eax
-; AVX512DQ-NEXT: vpmovsxbd %xmm12, %zmm1
-; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0
-; AVX512DQ-NEXT: kmovw %k0, %ecx
-; AVX512DQ-NEXT: shll $16, %ecx
-; AVX512DQ-NEXT: orl %eax, %ecx
-; AVX512DQ-NEXT: vpmovsxbd %xmm4, %zmm1
-; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0
-; AVX512DQ-NEXT: kmovw %k0, %eax
-; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0
-; AVX512DQ-NEXT: kmovw %k0, %edx
-; AVX512DQ-NEXT: shll $16, %edx
-; AVX512DQ-NEXT: orl %eax, %edx
-; AVX512DQ-NEXT: shlq $32, %rdx
-; AVX512DQ-NEXT: orq %rcx, %rdx
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm4
+; AVX512DQ-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; AVX512DQ-NEXT: vpcmpeqb %ymm5, %ymm4, %ymm4
+; AVX512DQ-NEXT: vpcmpeqb %ymm5, %ymm0, %ymm0
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512DQ-NEXT: vpcmpeqb %ymm5, %ymm4, %ymm4
+; AVX512DQ-NEXT: vpcmpeqb %ymm5, %ymm1, %ymm1
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1
+; AVX512DQ-NEXT: vporq %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm1
+; AVX512DQ-NEXT: vpcmpeqb %ymm5, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpcmpeqb %ymm5, %ymm2, %ymm2
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm3, %ymm2
+; AVX512DQ-NEXT: vpcmpeqb %ymm5, %ymm2, %ymm2
+; AVX512DQ-NEXT: vpcmpeqb %ymm5, %ymm3, %ymm3
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512DQ-NEXT: vporq %zmm2, %zmm1, %zmm1
+; AVX512DQ-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovmskb %ymm0, %eax
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; AVX512DQ-NEXT: vpmovmskb %ymm0, %ecx
+; AVX512DQ-NEXT: shlq $32, %rcx
+; AVX512DQ-NEXT: orq %rax, %rcx
; AVX512DQ-NEXT: je LBB78_1
; AVX512DQ-NEXT: ## %bb.2: ## %exit
; AVX512DQ-NEXT: popq %rax
}
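The preceding hunks also change how a v64i1 compare result is materialized as a scalar: rather than extracting 128-bit quarters and combining four 16-bit kmovw masks, the byte-compare results stay in two 256-bit registers and are converted with two vpmovmskb plus a shlq/orq combine. A minimal sketch of the underlying pattern (hypothetical function name; the exact sequence depends on how the compare feeds its users):

define i64 @mask_from_cmp_v64i8(<64 x i8> %a) {
  ; On an AVX512F-only target this <64 x i1> to i64 conversion is expected
  ; to use vpmovmskb per 256-bit half, as in the checks above.
  %c = icmp eq <64 x i8> %a, zeroinitializer
  %m = bitcast <64 x i1> %c to i64
  ret i64 %m
}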
define <32 x i16> @pr42355_v32i16(i1 %c, <32 x i16> %x, <32 x i16> %y) {
-; X86-AVX512F-LABEL: pr42355_v32i16:
-; X86-AVX512F: # %bb.0:
-; X86-AVX512F-NEXT: testb $1, {{[0-9]+}}(%esp)
-; X86-AVX512F-NEXT: jne .LBB14_1
-; X86-AVX512F-NEXT: # %bb.2:
-; X86-AVX512F-NEXT: vextractf64x4 $1, %zmm1, %ymm2
-; X86-AVX512F-NEXT: vmovaps %ymm1, %ymm0
-; X86-AVX512F-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
-; X86-AVX512F-NEXT: retl
-; X86-AVX512F-NEXT: .LBB14_1:
-; X86-AVX512F-NEXT: vextractf64x4 $1, %zmm0, %ymm2
-; X86-AVX512F-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
-; X86-AVX512F-NEXT: retl
-;
-; X64-AVX512F-LABEL: pr42355_v32i16:
-; X64-AVX512F: # %bb.0:
-; X64-AVX512F-NEXT: testb $1, %dil
-; X64-AVX512F-NEXT: jne .LBB14_1
-; X64-AVX512F-NEXT: # %bb.2:
-; X64-AVX512F-NEXT: vextractf64x4 $1, %zmm1, %ymm2
-; X64-AVX512F-NEXT: vmovaps %ymm1, %ymm0
-; X64-AVX512F-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
-; X64-AVX512F-NEXT: retq
-; X64-AVX512F-NEXT: .LBB14_1:
-; X64-AVX512F-NEXT: vextractf64x4 $1, %zmm0, %ymm2
-; X64-AVX512F-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
-; X64-AVX512F-NEXT: retq
-;
-; X86-AVX512BW-LABEL: pr42355_v32i16:
-; X86-AVX512BW: # %bb.0:
-; X86-AVX512BW-NEXT: testb $1, {{[0-9]+}}(%esp)
-; X86-AVX512BW-NEXT: jne .LBB14_2
-; X86-AVX512BW-NEXT: # %bb.1:
-; X86-AVX512BW-NEXT: vmovaps %zmm1, %zmm0
-; X86-AVX512BW-NEXT: .LBB14_2:
-; X86-AVX512BW-NEXT: retl
+; X86-LABEL: pr42355_v32i16:
+; X86: # %bb.0:
+; X86-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT: jne .LBB14_2
+; X86-NEXT: # %bb.1:
+; X86-NEXT: vmovaps %zmm1, %zmm0
+; X86-NEXT: .LBB14_2:
+; X86-NEXT: retl
;
-; X64-AVX512BW-LABEL: pr42355_v32i16:
-; X64-AVX512BW: # %bb.0:
-; X64-AVX512BW-NEXT: testb $1, %dil
-; X64-AVX512BW-NEXT: jne .LBB14_2
-; X64-AVX512BW-NEXT: # %bb.1:
-; X64-AVX512BW-NEXT: vmovaps %zmm1, %zmm0
-; X64-AVX512BW-NEXT: .LBB14_2:
-; X64-AVX512BW-NEXT: retq
+; X64-LABEL: pr42355_v32i16:
+; X64: # %bb.0:
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: jne .LBB14_2
+; X64-NEXT: # %bb.1:
+; X64-NEXT: vmovaps %zmm1, %zmm0
+; X64-NEXT: .LBB14_2:
+; X64-NEXT: retq
%a = select i1 %c, <32 x i16> %x, <32 x i16> %y
ret <32 x i16> %a
}
define <64 x i8> @pr42355_v64i8(i1 %c, <64 x i8> %x, <64 x i8> %y) {
-; X86-AVX512F-LABEL: pr42355_v64i8:
-; X86-AVX512F: # %bb.0:
-; X86-AVX512F-NEXT: testb $1, {{[0-9]+}}(%esp)
-; X86-AVX512F-NEXT: jne .LBB15_1
-; X86-AVX512F-NEXT: # %bb.2:
-; X86-AVX512F-NEXT: vextractf64x4 $1, %zmm1, %ymm2
-; X86-AVX512F-NEXT: vmovaps %ymm1, %ymm0
-; X86-AVX512F-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
-; X86-AVX512F-NEXT: retl
-; X86-AVX512F-NEXT: .LBB15_1:
-; X86-AVX512F-NEXT: vextractf64x4 $1, %zmm0, %ymm2
-; X86-AVX512F-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
-; X86-AVX512F-NEXT: retl
-;
-; X64-AVX512F-LABEL: pr42355_v64i8:
-; X64-AVX512F: # %bb.0:
-; X64-AVX512F-NEXT: testb $1, %dil
-; X64-AVX512F-NEXT: jne .LBB15_1
-; X64-AVX512F-NEXT: # %bb.2:
-; X64-AVX512F-NEXT: vextractf64x4 $1, %zmm1, %ymm2
-; X64-AVX512F-NEXT: vmovaps %ymm1, %ymm0
-; X64-AVX512F-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
-; X64-AVX512F-NEXT: retq
-; X64-AVX512F-NEXT: .LBB15_1:
-; X64-AVX512F-NEXT: vextractf64x4 $1, %zmm0, %ymm2
-; X64-AVX512F-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
-; X64-AVX512F-NEXT: retq
-;
-; X86-AVX512BW-LABEL: pr42355_v64i8:
-; X86-AVX512BW: # %bb.0:
-; X86-AVX512BW-NEXT: testb $1, {{[0-9]+}}(%esp)
-; X86-AVX512BW-NEXT: jne .LBB15_2
-; X86-AVX512BW-NEXT: # %bb.1:
-; X86-AVX512BW-NEXT: vmovaps %zmm1, %zmm0
-; X86-AVX512BW-NEXT: .LBB15_2:
-; X86-AVX512BW-NEXT: retl
+; X86-LABEL: pr42355_v64i8:
+; X86: # %bb.0:
+; X86-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT: jne .LBB15_2
+; X86-NEXT: # %bb.1:
+; X86-NEXT: vmovaps %zmm1, %zmm0
+; X86-NEXT: .LBB15_2:
+; X86-NEXT: retl
;
-; X64-AVX512BW-LABEL: pr42355_v64i8:
-; X64-AVX512BW: # %bb.0:
-; X64-AVX512BW-NEXT: testb $1, %dil
-; X64-AVX512BW-NEXT: jne .LBB15_2
-; X64-AVX512BW-NEXT: # %bb.1:
-; X64-AVX512BW-NEXT: vmovaps %zmm1, %zmm0
-; X64-AVX512BW-NEXT: .LBB15_2:
-; X64-AVX512BW-NEXT: retq
+; X64-LABEL: pr42355_v64i8:
+; X64: # %bb.0:
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: jne .LBB15_2
+; X64-NEXT: # %bb.1:
+; X64-NEXT: vmovaps %zmm1, %zmm0
+; X64-NEXT: .LBB15_2:
+; X64-NEXT: retq
%a = select i1 %c, <64 x i8> %x, <64 x i8> %y
ret <64 x i8> %a
}
define <32 x i8> @trunc_wb_512(<32 x i16> %i) #0 {
; KNL-LABEL: trunc_wb_512:
; KNL: ## %bb.0:
-; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; KNL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; KNL-NEXT: vpmovdb %zmm1, %xmm1
+; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; KNL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; KNL-NEXT: vpmovdb %zmm0, %xmm0
-; KNL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
-; KNL-NEXT: vpmovdb %zmm1, %xmm1
-; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; KNL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; KNL-NEXT: retq
;
; SKX-LABEL: trunc_wb_512:
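trunc_wb_512 shows the truncation strategy without AVX512BW: each 256-bit half of the v32i16 input is zero-extended to v16i32, narrowed with vpmovdb, and the two v16i8 halves are reassembled with vinserti128. A minimal sketch of the operation being tested (attributes from the original test omitted):

define <32 x i8> @trunc_v32i16_sketch(<32 x i16> %x) {
  ; Without BWI there is no vpmovwb, so the lowering goes w->d->b per half.
  %t = trunc <32 x i16> %x to <32 x i8>
  ret <32 x i8> %t
}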
define <32 x i16> @test_broadcast_8i16_32i16(<8 x i16> *%p) nounwind {
; X64-AVX512VL-LABEL: test_broadcast_8i16_32i16:
; X64-AVX512VL: ## %bb.0:
-; X64-AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; X64-AVX512VL-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512VL-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm1
+; X64-AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; X64-AVX512VL-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0
-; X64-AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; X64-AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; X64-AVX512VL-NEXT: retq
;
; X64-AVX512BWVL-LABEL: test_broadcast_8i16_32i16:
;
; X64-AVX512DQVL-LABEL: test_broadcast_8i16_32i16:
; X64-AVX512DQVL: ## %bb.0:
-; X64-AVX512DQVL-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; X64-AVX512DQVL-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512DQVL-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm1
+; X64-AVX512DQVL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; X64-AVX512DQVL-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0
-; X64-AVX512DQVL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; X64-AVX512DQVL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; X64-AVX512DQVL-NEXT: retq
%1 = load <8 x i16>, <8 x i16> *%p
%2 = shufflevector <8 x i16> %1, <8 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
define <64 x i8> @test_broadcast_16i8_64i8(<16 x i8> *%p) nounwind {
; X64-AVX512VL-LABEL: test_broadcast_16i8_64i8:
; X64-AVX512VL: ## %bb.0:
-; X64-AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; X64-AVX512VL-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512VL-NEXT: vpaddb {{.*}}(%rip), %ymm0, %ymm1
+; X64-AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; X64-AVX512VL-NEXT: vpaddb {{.*}}(%rip), %ymm0, %ymm0
-; X64-AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; X64-AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; X64-AVX512VL-NEXT: retq
;
; X64-AVX512BWVL-LABEL: test_broadcast_16i8_64i8:
;
; X64-AVX512DQVL-LABEL: test_broadcast_16i8_64i8:
; X64-AVX512DQVL: ## %bb.0:
-; X64-AVX512DQVL-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; X64-AVX512DQVL-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512DQVL-NEXT: vpaddb {{.*}}(%rip), %ymm0, %ymm1
+; X64-AVX512DQVL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; X64-AVX512DQVL-NEXT: vpaddb {{.*}}(%rip), %ymm0, %ymm0
-; X64-AVX512DQVL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; X64-AVX512DQVL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; X64-AVX512DQVL-NEXT: retq
%1 = load <16 x i8>, <16 x i8> *%p
%2 = shufflevector <16 x i8> %1, <16 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
define <32 x i16> @test_broadcast_16i16_32i16(<16 x i16> *%p) nounwind {
; X64-AVX512VL-LABEL: test_broadcast_16i16_32i16:
; X64-AVX512VL: ## %bb.0:
-; X64-AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
+; X64-AVX512VL-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512VL-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm1
+; X64-AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; X64-AVX512VL-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0
-; X64-AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; X64-AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; X64-AVX512VL-NEXT: retq
;
; X64-AVX512BWVL-LABEL: test_broadcast_16i16_32i16:
;
; X64-AVX512DQVL-LABEL: test_broadcast_16i16_32i16:
; X64-AVX512DQVL: ## %bb.0:
-; X64-AVX512DQVL-NEXT: vmovdqa (%rdi), %ymm0
+; X64-AVX512DQVL-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512DQVL-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm1
+; X64-AVX512DQVL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; X64-AVX512DQVL-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0
-; X64-AVX512DQVL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; X64-AVX512DQVL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; X64-AVX512DQVL-NEXT: retq
%1 = load <16 x i16>, <16 x i16> *%p
%2 = shufflevector <16 x i16> %1, <16 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
define <64 x i8> @test_broadcast_32i8_64i8(<32 x i8> *%p) nounwind {
; X64-AVX512VL-LABEL: test_broadcast_32i8_64i8:
; X64-AVX512VL: ## %bb.0:
-; X64-AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
+; X64-AVX512VL-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512VL-NEXT: vpaddb {{.*}}(%rip), %ymm0, %ymm1
+; X64-AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; X64-AVX512VL-NEXT: vpaddb {{.*}}(%rip), %ymm0, %ymm0
-; X64-AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; X64-AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; X64-AVX512VL-NEXT: retq
;
; X64-AVX512BWVL-LABEL: test_broadcast_32i8_64i8:
;
; X64-AVX512DQVL-LABEL: test_broadcast_32i8_64i8:
; X64-AVX512DQVL: ## %bb.0:
-; X64-AVX512DQVL-NEXT: vmovdqa (%rdi), %ymm0
+; X64-AVX512DQVL-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512DQVL-NEXT: vpaddb {{.*}}(%rip), %ymm0, %ymm1
+; X64-AVX512DQVL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; X64-AVX512DQVL-NEXT: vpaddb {{.*}}(%rip), %ymm0, %ymm0
-; X64-AVX512DQVL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; X64-AVX512DQVL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; X64-AVX512DQVL-NEXT: retq
%1 = load <32 x i8>, <32 x i8> *%p
%2 = shufflevector <32 x i8> %1, <32 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
define i64 @test12_v64i16(<64 x i16> %a, <64 x i16> %b) nounwind {
; KNL-LABEL: test12_v64i16:
; KNL: ## %bb.0:
-; KNL-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x75,0xc4]
-; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x23,0xc0]
-; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 ## encoding: [0x62,0xf2,0x7d,0x48,0x27,0xc0]
+; KNL-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm4 ## encoding: [0xc5,0xfd,0x75,0xe2]
+; KNL-NEXT: vpmovsxwd %ymm4, %zmm4 ## encoding: [0x62,0xf2,0x7d,0x48,0x23,0xe4]
+; KNL-NEXT: vptestmd %zmm4, %zmm4, %k0 ## encoding: [0x62,0xf2,0x5d,0x48,0x27,0xc4]
; KNL-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
-; KNL-NEXT: vpcmpeqw %ymm5, %ymm1, %ymm0 ## encoding: [0xc5,0xf5,0x75,0xc5]
+; KNL-NEXT: vextracti64x4 $1, %zmm2, %ymm2 ## encoding: [0x62,0xf3,0xfd,0x48,0x3b,0xd2,0x01]
+; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ## encoding: [0x62,0xf3,0xfd,0x48,0x3b,0xc0,0x01]
+; KNL-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x75,0xc2]
; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x23,0xc0]
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 ## encoding: [0x62,0xf2,0x7d,0x48,0x27,0xc0]
; KNL-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
; KNL-NEXT: shll $16, %ecx ## encoding: [0xc1,0xe1,0x10]
; KNL-NEXT: orl %eax, %ecx ## encoding: [0x09,0xc1]
-; KNL-NEXT: vpcmpeqw %ymm6, %ymm2, %ymm0 ## encoding: [0xc5,0xed,0x75,0xc6]
+; KNL-NEXT: vpcmpeqw %ymm3, %ymm1, %ymm0 ## encoding: [0xc5,0xf5,0x75,0xc3]
; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x23,0xc0]
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 ## encoding: [0x62,0xf2,0x7d,0x48,0x27,0xc0]
; KNL-NEXT: kmovw %k0, %edx ## encoding: [0xc5,0xf8,0x93,0xd0]
-; KNL-NEXT: vpcmpeqw %ymm7, %ymm3, %ymm0 ## encoding: [0xc5,0xe5,0x75,0xc7]
+; KNL-NEXT: vextracti64x4 $1, %zmm3, %ymm0 ## encoding: [0x62,0xf3,0xfd,0x48,0x3b,0xd8,0x01]
+; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ## encoding: [0x62,0xf3,0xfd,0x48,0x3b,0xc9,0x01]
+; KNL-NEXT: vpcmpeqw %ymm0, %ymm1, %ymm0 ## encoding: [0xc5,0xf5,0x75,0xc0]
; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x23,0xc0]
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 ## encoding: [0x62,0xf2,0x7d,0x48,0x27,0xc0]
; KNL-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
;
; CHECK-KNL-LABEL: test8:
; CHECK-KNL: # %bb.0:
-; CHECK-KNL-NEXT: pushq %rbp
-; CHECK-KNL-NEXT: .cfi_def_cfa_offset 16
-; CHECK-KNL-NEXT: .cfi_offset %rbp, -16
-; CHECK-KNL-NEXT: movq %rsp, %rbp
-; CHECK-KNL-NEXT: .cfi_def_cfa_register %rbp
-; CHECK-KNL-NEXT: andq $-32, %rsp
-; CHECK-KNL-NEXT: subq $32, %rsp
-; CHECK-KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm8
-; CHECK-KNL-NEXT: vmovdqa 16(%rbp), %ymm9
-; CHECK-KNL-NEXT: vpxor %xmm10, %xmm10, %xmm10
-; CHECK-KNL-NEXT: vpcmpeqb %ymm0, %ymm10, %ymm11
-; CHECK-KNL-NEXT: vpmovsxbw %xmm11, %ymm0
-; CHECK-KNL-NEXT: vpblendvb %ymm0, %ymm1, %ymm5, %ymm0
-; CHECK-KNL-NEXT: vextracti128 $1, %ymm11, %xmm1
-; CHECK-KNL-NEXT: vpmovsxbw %xmm1, %ymm1
-; CHECK-KNL-NEXT: vpblendvb %ymm1, %ymm2, %ymm6, %ymm1
-; CHECK-KNL-NEXT: vpcmpeqb %ymm10, %ymm8, %ymm5
-; CHECK-KNL-NEXT: vpmovsxbw %xmm5, %ymm2
-; CHECK-KNL-NEXT: vpblendvb %ymm2, %ymm3, %ymm7, %ymm2
+; CHECK-KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm5
+; CHECK-KNL-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; CHECK-KNL-NEXT: vpcmpeqb %ymm6, %ymm5, %ymm5
+; CHECK-KNL-NEXT: vpcmpeqb %ymm6, %ymm0, %ymm0
+; CHECK-KNL-NEXT: vpmovsxbw %xmm0, %ymm6
+; CHECK-KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
+; CHECK-KNL-NEXT: vpmovsxbw %xmm0, %ymm0
+; CHECK-KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm0
+; CHECK-KNL-NEXT: vpternlogq $202, %zmm3, %zmm1, %zmm0
+; CHECK-KNL-NEXT: vpmovsxbw %xmm5, %ymm1
; CHECK-KNL-NEXT: vextracti128 $1, %ymm5, %xmm3
; CHECK-KNL-NEXT: vpmovsxbw %xmm3, %ymm3
-; CHECK-KNL-NEXT: vpblendvb %ymm3, %ymm4, %ymm9, %ymm3
-; CHECK-KNL-NEXT: movq %rbp, %rsp
-; CHECK-KNL-NEXT: popq %rbp
-; CHECK-KNL-NEXT: .cfi_def_cfa %rsp, 8
+; CHECK-KNL-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
+; CHECK-KNL-NEXT: vpternlogq $202, %zmm4, %zmm2, %zmm1
; CHECK-KNL-NEXT: retq
%c = icmp eq <64 x i8> %x, zeroinitializer
%ret = select <64 x i1> %c, <64 x i16> %a, <64 x i16> %b
;
; NoVLX-LABEL: test_vpcmpeqw_v32i1_v64i1_mask:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; NoVLX-NEXT: vextracti64x4 $1, %zmm1, %ymm3
-; NoVLX-NEXT: vpcmpeqw %ymm3, %ymm2, %ymm2
+; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm2
+; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; NoVLX-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: shll $16, %eax
; NoVLX-NEXT: orl %ecx, %eax
;
; NoVLX-LABEL: test_vpcmpeqw_v32i1_v64i1_mask_mem:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; NoVLX-NEXT: vpcmpeqw (%rdi), %ymm0, %ymm0
-; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpcmpeqw (%rdi), %ymm0, %ymm1
+; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vpcmpeqw 32(%rdi), %ymm1, %ymm0
+; NoVLX-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; NoVLX-NEXT: vpcmpeqw 32(%rdi), %ymm0, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: andl %edi, %eax
; NoVLX-NEXT: shrl $16, %edi
-; NoVLX-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; NoVLX-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; NoVLX-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
;
; NoVLX-LABEL: test_vpcmpsgtw_v32i1_v64i1_mask:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; NoVLX-NEXT: vextracti64x4 $1, %zmm1, %ymm3
-; NoVLX-NEXT: vpcmpgtw %ymm3, %ymm2, %ymm2
+; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm2
+; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; NoVLX-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: shll $16, %eax
; NoVLX-NEXT: orl %ecx, %eax
;
; NoVLX-LABEL: test_vpcmpsgtw_v32i1_v64i1_mask_mem:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; NoVLX-NEXT: vpcmpgtw (%rdi), %ymm0, %ymm0
-; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpcmpgtw (%rdi), %ymm0, %ymm1
+; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vpcmpgtw 32(%rdi), %ymm1, %ymm0
+; NoVLX-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; NoVLX-NEXT: vpcmpgtw 32(%rdi), %ymm0, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: andl %edi, %eax
; NoVLX-NEXT: shrl $16, %edi
-; NoVLX-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; NoVLX-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; NoVLX-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
;
; NoVLX-LABEL: test_vpcmpsgew_v32i1_v64i1_mask:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; NoVLX-NEXT: vextracti64x4 $1, %zmm1, %ymm3
-; NoVLX-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm2
+; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm2
+; NoVLX-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2
+; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; NoVLX-NEXT: vextracti64x4 $1, %zmm1, %ymm1
; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2
-; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: shll $16, %eax
; NoVLX-NEXT: orl %ecx, %eax
;
; NoVLX-LABEL: test_vpcmpsgew_v32i1_v64i1_mask_mem:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; NoVLX-NEXT: vmovdqa (%rdi), %ymm2
-; NoVLX-NEXT: vmovdqa 32(%rdi), %ymm3
-; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm3, %ymm1
+; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
+; NoVLX-NEXT: vmovdqa 32(%rdi), %ymm2
+; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm1
+; NoVLX-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0
; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: shll $16, %eax
; NoVLX-NEXT: orl %ecx, %eax
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: andl %edi, %eax
; NoVLX-NEXT: shrl $16, %edi
-; NoVLX-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; NoVLX-NEXT: vmovdqa 32(%rsi), %ymm1
+; NoVLX-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
;
; NoVLX-LABEL: test_vpcmpultw_v32i1_v64i1_mask:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; NoVLX-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; NoVLX-NEXT: vpmaxuw %ymm1, %ymm0, %ymm2
+; NoVLX-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm2
+; NoVLX-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2
+; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; NoVLX-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; NoVLX-NEXT: vpmaxuw %ymm1, %ymm0, %ymm1
; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vpmaxuw %ymm3, %ymm2, %ymm0
-; NoVLX-NEXT: vpcmpeqw %ymm0, %ymm2, %ymm0
-; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: shll $16, %eax
; NoVLX-NEXT: orl %ecx, %eax
;
; NoVLX-LABEL: test_vpcmpultw_v32i1_v64i1_mask_mem:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; NoVLX-NEXT: vpmaxuw (%rdi), %ymm0, %ymm2
-; NoVLX-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0
-; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpmaxuw (%rdi), %ymm0, %ymm1
+; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm1
+; NoVLX-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vpmaxuw 32(%rdi), %ymm1, %ymm0
-; NoVLX-NEXT: vpcmpeqw %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; NoVLX-NEXT: vpmaxuw 32(%rdi), %ymm0, %ymm1
+; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: andl %edi, %eax
; NoVLX-NEXT: shrl $16, %edi
-; NoVLX-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; NoVLX-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; NoVLX-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; NoVLX-NEXT: vpmaxuw %ymm1, %ymm0, %ymm1
; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
;
; AVX512F-LABEL: v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm5
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm5
; AVX512F-NEXT: vpcmpgtw %ymm4, %ymm5, %ymm4
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm5
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm6
-; AVX512F-NEXT: vpcmpgtw %ymm5, %ymm6, %ymm5
-; AVX512F-NEXT: vpand %ymm4, %ymm5, %ymm4
; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpcmpgtw %ymm3, %ymm2, %ymm1
-; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm1
+; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm5
+; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm5, %ymm1
+; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm1
+; AVX512F-NEXT: vpcmpgtw %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %ecx
-; AVX512F-NEXT: vpmovsxwd %ymm4, %zmm0
+; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: shll $16, %eax
;
; AVX512F-LABEL: v64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm5
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm5
; AVX512F-NEXT: vpcmpgtb %ymm4, %ymm5, %ymm4
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm5
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm6
-; AVX512F-NEXT: vpcmpgtb %ymm5, %ymm6, %ymm5
-; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm6
; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm7
-; AVX512F-NEXT: vpand %xmm7, %xmm6, %xmm6
+; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm1
+; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm5
+; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm5, %ymm1
+; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm1
; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX512F-NEXT: vpand %xmm3, %xmm1, %xmm1
-; AVX512F-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm0
-; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
-; AVX512F-NEXT: kmovw %k0, %ecx
-; AVX512F-NEXT: shll $16, %ecx
-; AVX512F-NEXT: orl %eax, %ecx
-; AVX512F-NEXT: vpand %xmm4, %xmm5, %xmm0
-; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
-; AVX512F-NEXT: kmovw %k0, %edx
-; AVX512F-NEXT: vpmovsxbd %xmm6, %zmm0
-; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: shll $16, %eax
-; AVX512F-NEXT: orl %edx, %eax
+; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpmovmskb %ymm0, %ecx
+; AVX512F-NEXT: vpmovmskb %ymm1, %eax
; AVX512F-NEXT: shlq $32, %rax
; AVX512F-NEXT: orq %rcx, %rax
; AVX512F-NEXT: vzeroupper
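For v64i8 the new lowering is noticeably shorter: vpmovmskb already yields a 32-bit mask per 256-bit half, so the four vpmovsxbd/vptestmd round trips collapse into two movmsk results joined in a 64-bit GPR. A sketch of that path, with the register roles taken from the checks above (the helper name is mine):

#include <immintrin.h>
#include <cstdint>

// Build the v64i1 mask from the two ymm halves the way the AVX512F
// checks above do it.
uint64_t byteMask512(__m256i lo, __m256i hi) {
  uint64_t l = (uint32_t)_mm256_movemask_epi8(lo); // vpmovmskb %ymm0, %ecx
  uint64_t h = (uint32_t)_mm256_movemask_epi8(hi); // vpmovmskb %ymm1, %eax
  return (h << 32) | l;                            // shlq $32, %rax; orq %rcx, %rax
}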
; AVX512F-NEXT: kmovw %edi, %k1
; AVX512F-NEXT: shrl $16, %edi
; AVX512F-NEXT: kmovw %edi, %k2
-; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
+; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: vpsrlw $15, %ymm0, %ymm0
-; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
; AVX512F-NEXT: vpmovdw %zmm1, %ymm1
; AVX512F-NEXT: vpsrlw $15, %ymm1, %ymm1
-; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
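With the value now a single ZMM, the low 16 mask bits feed %k1 and the high 16 feed %k2, so the halves can be reassembled with one vinserti64x4 in the natural low/high order. Per lane the function appears to compute, as a hedged scalar model of the vpternlogd-under-mask plus vpsrlw $15 sequence:

#include <cstdint>

// Lane i of the <32 x i16> result is the i-th bit of the input mask
// (vpternlogd builds all-ones under the k-mask, vpsrlw $15 reduces to 0/1).
uint16_t extLane(uint32_t mask, unsigned i) {
  return (mask >> i) & 1;
}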
; AVX512VLBW-LABEL: ext_i32_32i16:
;
; AVX512F-LABEL: v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm2
+; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2
+; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0
+; AVX512F-NEXT: kmovw %k0, %ecx
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
-; AVX512F-NEXT: kmovw %k0, %ecx
-; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm0
-; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: shll $16, %eax
; AVX512F-NEXT: orl %ecx, %eax
;
; AVX512F-LABEL: bitcast_64i8_store:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm0
-; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm3
+; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm2
+; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm3
; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
-; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm2, %ymm0
+; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm2
+; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2
+; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k1
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm1
; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k2
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
;
; AVX512F-LABEL: bitcast_32i16_store:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0
-; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
-; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
-; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm0
+; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm2
+; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2
+; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
; AVX512F-NEXT: kmovw %k1, 2(%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
-; AVX512VL-LABEL: test_nt64xi8:
-; AVX512VL: # %bb.0: # %entry
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512VL-NEXT: vmovntdq %ymm0, (%rdi)
-; AVX512VL-NEXT: vmovntdq %ymm1, 32(%rdi)
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512F-LABEL: test_nt64xi8:
-; AVX512F: # %bb.0: # %entry
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512F-NEXT: vmovntdq %ymm0, (%rdi)
-; AVX512F-NEXT: vmovntdq %ymm1, 32(%rdi)
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: test_nt64xi8:
-; AVX512BW: # %bb.0: # %entry
-; AVX512BW-NEXT: vmovntdq %zmm0, (%rdi)
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
+; AVX512-LABEL: test_nt64xi8:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vmovntdq %zmm0, (%rdi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
entry:
store <64 x i8> %X, <64 x i8>* %ptr, align 64, !nontemporal !1
ret void
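Because a <64 x i8> argument now arrives in a single ZMM register even without AVX512BW, the nontemporal store no longer needs to split into two 256-bit vmovntdq instructions, and the three check prefixes collapse into one. The intrinsic-level equivalent (a sketch, not taken from the test file) is:

#include <immintrin.h>

// One 512-bit nontemporal store instead of two 256-bit halves.
void ntStore64xi8(__m512i *ptr, __m512i x) {
  _mm512_stream_si512(ptr, x); // vmovntdq %zmm0, (%rdi)
}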
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
-; AVX512VL-LABEL: test_nt32xi16:
-; AVX512VL: # %bb.0: # %entry
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512VL-NEXT: vmovntdq %ymm0, (%rdi)
-; AVX512VL-NEXT: vmovntdq %ymm1, 32(%rdi)
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512F-LABEL: test_nt32xi16:
-; AVX512F: # %bb.0: # %entry
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512F-NEXT: vmovntdq %ymm0, (%rdi)
-; AVX512F-NEXT: vmovntdq %ymm1, 32(%rdi)
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: test_nt32xi16:
-; AVX512BW: # %bb.0: # %entry
-; AVX512BW-NEXT: vmovntdq %zmm0, (%rdi)
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
+; AVX512-LABEL: test_nt32xi16:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vmovntdq %zmm0, (%rdi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
entry:
store <32 x i16> %X, <32 x i16>* %ptr, align 64, !nontemporal !1
ret void
define i32 @kshiftl_v32i1_1(<32 x i16> %x, <32 x i16> %y) {
; KNL-LABEL: kshiftl_v32i1_1:
; KNL: # %bb.0:
-; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; KNL-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; KNL-NEXT: vpcmpeqw %ymm4, %ymm3, %ymm3
-; KNL-NEXT: vpmovsxwd %ymm3, %zmm3
-; KNL-NEXT: vptestmd %zmm3, %zmm3, %k1
-; KNL-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm0
+; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; KNL-NEXT: vpcmpeqw %ymm3, %ymm2, %ymm2
+; KNL-NEXT: vpmovsxwd %ymm2, %zmm2
+; KNL-NEXT: vptestmd %zmm2, %zmm2, %k1
+; KNL-NEXT: vpcmpeqw %ymm3, %ymm0, %ymm0
; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k2
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
-; KNL-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
-; KNL-NEXT: valignd {{.*#+}} zmm0 = zmm0[15],zmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+; KNL-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; KNL-NEXT: valignd {{.*#+}} zmm0 = zmm0[15],zmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
; KNL-NEXT: kshiftlw $1, %k2, %k1
-; KNL-NEXT: vpcmpeqw %ymm4, %ymm2, %ymm2
+; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; KNL-NEXT: vpcmpeqw %ymm3, %ymm2, %ymm2
; KNL-NEXT: vpmovsxwd %ymm2, %zmm2
; KNL-NEXT: vptestmd %zmm2, %zmm2, %k2
-; KNL-NEXT: vpcmpeqw %ymm4, %ymm1, %ymm1
+; KNL-NEXT: vpcmpeqw %ymm3, %ymm1, %ymm1
; KNL-NEXT: vpmovsxwd %ymm1, %zmm1
; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 {%k1}
; KNL-NEXT: kmovw %k0, %ecx
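On KNL the 32-bit mask only exists as two 16-bit k-registers, and kshiftlw cannot carry a bit between them, so the cross-half carry is done on the vectorized mask with valignd while the in-half shift stays in the k-domain. A scalar model of the shift-by-one under that two-half representation (a sketch):

#include <cstdint>

// Shift a 32-bit mask held as {lo, hi} 16-bit halves left by one.
void kshiftl1(uint16_t &lo, uint16_t &hi) {
  hi = (uint16_t)((hi << 1) | (lo >> 15)); // valignd carries lo's top bit over
  lo = (uint16_t)(lo << 1);                // kshiftlw $1
}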
define i64 @kshiftl_v64i1_1(<64 x i8> %x, <64 x i8> %y) {
; KNL-LABEL: kshiftl_v64i1_1:
; KNL: # %bb.0:
-; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; KNL-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; KNL-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm0
-; KNL-NEXT: vpmovsxbd %xmm0, %zmm5
-; KNL-NEXT: vptestmd %zmm5, %zmm5, %k1
-; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
-; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
-; KNL-NEXT: vptestmd %zmm0, %zmm0, %k2
-; KNL-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm0
+; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; KNL-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm3
+; KNL-NEXT: vpmovsxbd %xmm3, %zmm4
+; KNL-NEXT: vptestmd %zmm4, %zmm4, %k1
+; KNL-NEXT: vextracti128 $1, %ymm3, %xmm3
+; KNL-NEXT: vpmovsxbd %xmm3, %zmm3
+; KNL-NEXT: vptestmd %zmm3, %zmm3, %k2
+; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; KNL-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm3
; KNL-NEXT: vpmovsxbd %xmm3, %zmm3
; KNL-NEXT: vptestmd %zmm3, %zmm3, %k3
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k4} {z}
; KNL-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k3} {z}
; KNL-NEXT: valignd {{.*#+}} zmm3 = zmm0[15],zmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
-; KNL-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k2} {z}
-; KNL-NEXT: valignd {{.*#+}} zmm0 = zmm5[15],zmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
-; KNL-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k1} {z}
-; KNL-NEXT: valignd {{.*#+}} zmm5 = zmm6[15],zmm5[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+; KNL-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k2} {z}
+; KNL-NEXT: valignd {{.*#+}} zmm0 = zmm4[15],zmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+; KNL-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z}
+; KNL-NEXT: valignd {{.*#+}} zmm4 = zmm5[15],zmm4[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
; KNL-NEXT: kshiftlw $1, %k1, %k3
-; KNL-NEXT: vpcmpeqb %ymm4, %ymm2, %ymm2
-; KNL-NEXT: vextracti128 $1, %ymm2, %xmm6
+; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm5
+; KNL-NEXT: vpcmpeqb %ymm2, %ymm5, %ymm5
+; KNL-NEXT: vextracti128 $1, %ymm5, %xmm6
; KNL-NEXT: vpmovsxbd %xmm6, %zmm6
; KNL-NEXT: vptestmd %zmm6, %zmm6, %k1
-; KNL-NEXT: vpmovsxbd %xmm2, %zmm2
-; KNL-NEXT: vptestmd %zmm2, %zmm2, %k2
-; KNL-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm1
+; KNL-NEXT: vpmovsxbd %xmm5, %zmm5
+; KNL-NEXT: vptestmd %zmm5, %zmm5, %k2
+; KNL-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm1
; KNL-NEXT: vextracti128 $1, %ymm1, %xmm2
; KNL-NEXT: vpmovsxbd %xmm2, %zmm2
; KNL-NEXT: vptestmd %zmm2, %zmm2, %k4
; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 {%k3}
; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vptestmd %zmm5, %zmm5, %k0 {%k4}
+; KNL-NEXT: vptestmd %zmm4, %zmm4, %k0 {%k4}
; KNL-NEXT: kmovw %k0, %ecx
; KNL-NEXT: shll $16, %ecx
; KNL-NEXT: orl %eax, %ecx
define i32 @kshiftl_v32i1_31(<32 x i16> %x, <32 x i16> %y) {
; KNL-LABEL: kshiftl_v32i1_31:
; KNL: # %bb.0:
-; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm1
; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2
; KNL-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0
; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kshiftlw $15, %k0, %k1
-; KNL-NEXT: vpcmpeqw %ymm2, %ymm1, %ymm0
+; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm0
+; KNL-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0
; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
; KNL-NEXT: kmovw %k0, %eax
define i64 @kshiftl_v64i1_63(<64 x i8> %x, <64 x i8> %y) {
; KNL-LABEL: kshiftl_v64i1_63:
; KNL: # %bb.0:
-; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm1
; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2
; KNL-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kshiftlw $15, %k0, %k1
-; KNL-NEXT: vextracti128 $1, %ymm1, %xmm0
+; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm0
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
; KNL-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
define i32 @kshiftr_v32i1_1(<32 x i16> %x, <32 x i16> %y) {
; KNL-LABEL: kshiftr_v32i1_1:
; KNL: # %bb.0:
-; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; KNL-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; KNL-NEXT: vpcmpeqw %ymm4, %ymm3, %ymm3
-; KNL-NEXT: vpmovsxwd %ymm3, %zmm3
-; KNL-NEXT: vptestmd %zmm3, %zmm3, %k1
-; KNL-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm0
+; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; KNL-NEXT: vpcmpeqw %ymm3, %ymm2, %ymm2
+; KNL-NEXT: vpmovsxwd %ymm2, %zmm2
+; KNL-NEXT: vptestmd %zmm2, %zmm2, %k1
+; KNL-NEXT: vpcmpeqw %ymm3, %ymm0, %ymm0
; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k2
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
-; KNL-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
-; KNL-NEXT: valignd {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm3[0]
+; KNL-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; KNL-NEXT: valignd {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm2[0]
; KNL-NEXT: kshiftrw $1, %k1, %k1
-; KNL-NEXT: vpcmpeqw %ymm4, %ymm1, %ymm1
-; KNL-NEXT: vpmovsxwd %ymm1, %zmm1
-; KNL-NEXT: vptestmd %zmm1, %zmm1, %k2
-; KNL-NEXT: vpcmpeqw %ymm4, %ymm2, %ymm1
+; KNL-NEXT: vpcmpeqw %ymm3, %ymm1, %ymm2
+; KNL-NEXT: vpmovsxwd %ymm2, %zmm2
+; KNL-NEXT: vptestmd %zmm2, %zmm2, %k2
+; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; KNL-NEXT: vpcmpeqw %ymm3, %ymm1, %ymm1
; KNL-NEXT: vpmovsxwd %ymm1, %zmm1
; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 {%k1}
; KNL-NEXT: kmovw %k0, %ecx
define i64 @kshiftr_v64i1_1(<64 x i8> %x, <64 x i8> %y) {
; KNL-LABEL: kshiftr_v64i1_1:
; KNL: # %bb.0:
-; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm2
; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; KNL-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; KNL-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm3
-; KNL-NEXT: vextracti128 $1, %ymm3, %xmm5
-; KNL-NEXT: vpmovsxbd %xmm5, %zmm5
-; KNL-NEXT: vptestmd %zmm5, %zmm5, %k1
-; KNL-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm0
-; KNL-NEXT: vpmovsxbd %xmm0, %zmm5
-; KNL-NEXT: vptestmd %zmm5, %zmm5, %k2
+; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; KNL-NEXT: vpcmpeqb %ymm2, %ymm3, %ymm3
+; KNL-NEXT: vextracti128 $1, %ymm3, %xmm4
+; KNL-NEXT: vpmovsxbd %xmm4, %zmm4
+; KNL-NEXT: vptestmd %zmm4, %zmm4, %k1
+; KNL-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0
+; KNL-NEXT: vpmovsxbd %xmm0, %zmm4
+; KNL-NEXT: vptestmd %zmm4, %zmm4, %k2
; KNL-NEXT: vpmovsxbd %xmm3, %zmm3
; KNL-NEXT: vptestmd %zmm3, %zmm3, %k3
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k4
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k4} {z}
; KNL-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k3} {z}
-; KNL-NEXT: valignd {{.*#+}} zmm5 = zmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm3[0]
-; KNL-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k2} {z}
-; KNL-NEXT: valignd {{.*#+}} zmm0 = zmm6[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0]
-; KNL-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k1} {z}
-; KNL-NEXT: valignd {{.*#+}} zmm3 = zmm3[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm6[0]
+; KNL-NEXT: valignd {{.*#+}} zmm4 = zmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm3[0]
+; KNL-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k2} {z}
+; KNL-NEXT: valignd {{.*#+}} zmm0 = zmm5[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0]
+; KNL-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z}
+; KNL-NEXT: valignd {{.*#+}} zmm3 = zmm3[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm5[0]
; KNL-NEXT: kshiftrw $1, %k1, %k3
-; KNL-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm1
-; KNL-NEXT: vextracti128 $1, %ymm1, %xmm6
+; KNL-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm5
+; KNL-NEXT: vextracti128 $1, %ymm5, %xmm6
; KNL-NEXT: vpmovsxbd %xmm6, %zmm6
; KNL-NEXT: vptestmd %zmm6, %zmm6, %k1
-; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
-; KNL-NEXT: vptestmd %zmm1, %zmm1, %k2
-; KNL-NEXT: vpcmpeqb %ymm4, %ymm2, %ymm1
+; KNL-NEXT: vpmovsxbd %xmm5, %zmm5
+; KNL-NEXT: vptestmd %zmm5, %zmm5, %k2
+; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; KNL-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm1
; KNL-NEXT: vpmovsxbd %xmm1, %zmm2
; KNL-NEXT: vptestmd %zmm2, %zmm2, %k4
; KNL-NEXT: vextracti128 $1, %ymm1, %xmm1
; KNL-NEXT: shlq $32, %rcx
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k2}
; KNL-NEXT: kmovw %k0, %edx
-; KNL-NEXT: vptestmd %zmm5, %zmm5, %k0 {%k1}
+; KNL-NEXT: vptestmd %zmm4, %zmm4, %k0 {%k1}
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: shll $16, %eax
; KNL-NEXT: orl %edx, %eax
;
; AVX512F-LABEL: jumbled_indices32:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpmaddwd %ymm5, %ymm1, %ymm1
-; AVX512F-NEXT: vpmaddwd %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpmaddwd %ymm7, %ymm3, %ymm1
-; AVX512F-NEXT: vpmaddwd %ymm6, %ymm2, %ymm2
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
+; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm4
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm5
+; AVX512F-NEXT: vpmaddwd %ymm4, %ymm5, %ymm4
+; AVX512F-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
+; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-NEXT: vpmaddwd %ymm2, %ymm4, %ymm2
+; AVX512F-NEXT: vpmaddwd %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: jumbled_indices32:
;
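512-bit vpmaddwd requires AVX512BW, so with the inputs now in ZMM registers the AVX512F lowering splits each operand back into YMM halves, multiplies per half, and reassembles the result; only where the halves come from changes (vextracti64x4 instead of separate argument registers). A sketch of the split strategy with AVX2/AVX512F intrinsics (the helper name is mine):

#include <immintrin.h>

// 512-bit pmaddwd composed from two 256-bit halves, as in the checks above.
__m512i madd512ViaYmm(__m512i a, __m512i b) {
  __m256i lo = _mm256_madd_epi16(_mm512_castsi512_si256(a),
                                 _mm512_castsi512_si256(b));
  __m256i hi = _mm256_madd_epi16(_mm512_extracti64x4_epi64(a, 1),
                                 _mm512_extracti64x4_epi64(b, 1));
  return _mm512_inserti64x4(_mm512_castsi256_si512(lo), hi, 1);
}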
; AVX512F-LABEL: truncstore_v32i16_v32i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512F-NEXT: vpcmpeqb %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512F-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
-; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
-; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
; AVX512F-NEXT: vpmovmskb %ymm1, %eax
; AVX512F-NEXT: notl %eax
; AVX512F-NEXT: testb $1, %al
;
; AVX512F-LABEL: truncstore_v32i16_v32i8:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512F-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512F-NEXT: vpcmpeqb %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpminuw %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpminuw %ymm3, %ymm0, %ymm0
}
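The word-to-byte truncation in these stores has the same shape: vpmovwb is an AVX512BW instruction, so each <16 x i16> half is zero-extended to dwords and narrowed with the AVX512F vpmovdb instead, and the two XMM results are rejoined with vinserti128. Per half that is, as a sketch:

#include <immintrin.h>

// Truncate one <16 x i16> half to <16 x i8> without AVX512BW.
__m128i truncWordsToBytes(__m256i v) {
  return _mm512_cvtepi32_epi8(_mm512_cvtepu16_epi32(v)); // vpmovzxwd + vpmovdb
}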
define <32 x i16> @merge_32i16_i16_12u4uuuuuuuuuuuuuuuuuuuuuuuuuuzz(i16* %ptr) nounwind uwtable noinline ssp {
-; AVX512F-LABEL: merge_32i16_i16_12u4uuuuuuuuuuuuuuuuuuuuuuuuuuzz:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512F-NEXT: vmovaps %ymm0, %ymm0
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: merge_32i16_i16_12u4uuuuuuuuuuuuuuuuuuuuuuuuuuzz:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512BW-NEXT: retq
+; ALL-LABEL: merge_32i16_i16_12u4uuuuuuuuuuuuuuuuuuuuuuuuuuzz:
+; ALL: # %bb.0:
+; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; ALL-NEXT: retq
;
; X32-AVX512F-LABEL: merge_32i16_i16_12u4uuuuuuuuuuuuuuuuuuuuuuuuuuzz:
; X32-AVX512F: # %bb.0:
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X32-AVX512F-NEXT: vmovaps %ymm0, %ymm0
; X32-AVX512F-NEXT: retl
%ptr0 = getelementptr inbounds i16, i16* %ptr, i64 1
%ptr1 = getelementptr inbounds i16, i16* %ptr, i64 2
}
define <32 x i16> @merge_32i16_i16_23uzuuuuuuuuuuzzzzuuuuuuuuuuuuuu(i16* %ptr) nounwind uwtable noinline ssp {
-; AVX512F-LABEL: merge_32i16_i16_23uzuuuuuuuuuuzzzzuuuuuuuuuuuuuu:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX512F-NEXT: vmovaps %ymm0, %ymm0
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: merge_32i16_i16_23uzuuuuuuuuuuzzzzuuuuuuuuuuuuuu:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX512BW-NEXT: retq
+; ALL-LABEL: merge_32i16_i16_23uzuuuuuuuuuuzzzzuuuuuuuuuuuuuu:
+; ALL: # %bb.0:
+; ALL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; ALL-NEXT: retq
;
; X32-AVX512F-LABEL: merge_32i16_i16_23uzuuuuuuuuuuzzzzuuuuuuuuuuuuuu:
; X32-AVX512F: # %bb.0:
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-AVX512F-NEXT: vmovaps %ymm0, %ymm0
; X32-AVX512F-NEXT: retl
%ptr0 = getelementptr inbounds i16, i16* %ptr, i64 2
%ptr1 = getelementptr inbounds i16, i16* %ptr, i64 3
}
define <64 x i8> @merge_64i8_i8_12u4uuu8uuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz(i8* %ptr) nounwind uwtable noinline ssp {
-; AVX512F-LABEL: merge_64i8_i8_12u4uuu8uuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512F-NEXT: vmovaps %ymm0, %ymm0
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: merge_64i8_i8_12u4uuu8uuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512BW-NEXT: retq
+; ALL-LABEL: merge_64i8_i8_12u4uuu8uuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz:
+; ALL: # %bb.0:
+; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; ALL-NEXT: retq
;
; X32-AVX512F-LABEL: merge_64i8_i8_12u4uuu8uuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz:
; X32-AVX512F: # %bb.0:
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X32-AVX512F-NEXT: vmovaps %ymm0, %ymm0
; X32-AVX512F-NEXT: retl
%ptr0 = getelementptr inbounds i8, i8* %ptr, i64 1
%ptr1 = getelementptr inbounds i8, i8* %ptr, i64 2
}
define <64 x i8> @merge_64i8_i8_12u4uuuuuuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz(i8* %ptr) nounwind uwtable noinline ssp {
-; AVX512F-LABEL: merge_64i8_i8_12u4uuuuuuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX512F-NEXT: vmovaps %ymm0, %ymm0
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: merge_64i8_i8_12u4uuuuuuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX512BW-NEXT: retq
+; ALL-LABEL: merge_64i8_i8_12u4uuuuuuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz:
+; ALL: # %bb.0:
+; ALL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; ALL-NEXT: retq
;
; X32-AVX512F-LABEL: merge_64i8_i8_12u4uuuuuuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz:
; X32-AVX512F: # %bb.0:
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-AVX512F-NEXT: vmovaps %ymm0, %ymm0
; X32-AVX512F-NEXT: retl
%ptr0 = getelementptr inbounds i8, i8* %ptr, i64 1
%ptr1 = getelementptr inbounds i8, i8* %ptr, i64 2
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm4
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX512F-NEXT: vpor %ymm5, %ymm4, %ymm4
-; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm6
-; AVX512F-NEXT: vpor %ymm5, %ymm6, %ymm5
+; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm5
; AVX512F-NEXT: vpminsw %ymm2, %ymm3, %ymm6
; AVX512F-NEXT: vpminsw %ymm1, %ymm0, %ymm7
; AVX512F-NEXT: vpmaxsw %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1
; AVX512F-NEXT: vpsubw %ymm7, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2
-; AVX512F-NEXT: vpmullw %ymm4, %ymm2, %ymm2
; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512F-NEXT: vpor %ymm6, %ymm5, %ymm5
; AVX512F-NEXT: vpmullw %ymm5, %ymm1, %ymm1
+; AVX512F-NEXT: vpor %ymm6, %ymm4, %ymm4
+; AVX512F-NEXT: vpmullw %ymm4, %ymm2, %ymm2
; AVX512F-NEXT: vpaddw %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpaddw %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm4
-; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX512VL-FALLBACK-NEXT: vpor %ymm5, %ymm4, %ymm4
-; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm6
-; AVX512VL-FALLBACK-NEXT: vpor %ymm5, %ymm6, %ymm5
+; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm5
; AVX512VL-FALLBACK-NEXT: vpminsw %ymm2, %ymm3, %ymm6
; AVX512VL-FALLBACK-NEXT: vpminsw %ymm1, %ymm0, %ymm7
; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm2, %ymm3, %ymm2
; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1
; AVX512VL-FALLBACK-NEXT: vpsubw %ymm7, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512VL-FALLBACK-NEXT: vpor %ymm6, %ymm5, %ymm5
; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpor %ymm6, %ymm4, %ymm4
+; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm1, %ymm0
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
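These vec512_i16_* checks all implement the same overflow-safe integer midpoint: the halved absolute difference is multiplied by +1 or -1 depending on the operand order (vpcmpgtw gives 0/-1, which the vpor with the all-ones vector turns into 1/-1) and added back to the first operand; the new output only moves the vpor/vpmullw pair after the shifts. A scalar model of one 16-bit lane (a sketch; the wraparound matches the 16-bit lane arithmetic):

#include <algorithm>
#include <cstdint>

int16_t midpoint16(int16_t a, int16_t b) {
  uint16_t half = (uint16_t)(std::max(a, b) - std::min(a, b)) >> 1; // vpsubw + vpsrlw $1
  int16_t sign = a > b ? -1 : 1;                                    // vpcmpgtw, then vpor with 1
  return (int16_t)(a + sign * half);                                // vpmullw + vpaddw
}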
; AVX512F-NEXT: vpminuw %ymm2, %ymm3, %ymm4
; AVX512F-NEXT: vpcmpeqw %ymm4, %ymm3, %ymm5
; AVX512F-NEXT: vpternlogq $15, %zmm5, %zmm5, %zmm5
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX512F-NEXT: vpor %ymm6, %ymm5, %ymm5
-; AVX512F-NEXT: vpminuw %ymm1, %ymm0, %ymm7
-; AVX512F-NEXT: vpcmpeqw %ymm7, %ymm0, %ymm8
-; AVX512F-NEXT: vpternlogq $15, %zmm8, %zmm8, %zmm8
-; AVX512F-NEXT: vpor %ymm6, %ymm8, %ymm6
+; AVX512F-NEXT: vpminuw %ymm1, %ymm0, %ymm6
+; AVX512F-NEXT: vpcmpeqw %ymm6, %ymm0, %ymm7
+; AVX512F-NEXT: vpternlogq $15, %zmm7, %zmm7, %zmm7
; AVX512F-NEXT: vpmaxuw %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpmaxuw %ymm1, %ymm0, %ymm1
-; AVX512F-NEXT: vpsubw %ymm7, %ymm1, %ymm1
+; AVX512F-NEXT: vpsubw %ymm6, %ymm1, %ymm1
; AVX512F-NEXT: vpsubw %ymm4, %ymm2, %ymm2
; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2
-; AVX512F-NEXT: vpmullw %ymm5, %ymm2, %ymm2
; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512F-NEXT: vpor %ymm4, %ymm7, %ymm6
; AVX512F-NEXT: vpmullw %ymm6, %ymm1, %ymm1
+; AVX512F-NEXT: vpor %ymm4, %ymm5, %ymm4
+; AVX512F-NEXT: vpmullw %ymm4, %ymm2, %ymm2
; AVX512F-NEXT: vpaddw %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpaddw %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-FALLBACK-NEXT: vpminuw %ymm2, %ymm3, %ymm4
; AVX512VL-FALLBACK-NEXT: vpcmpeqw %ymm4, %ymm3, %ymm5
; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %ymm5, %ymm5, %ymm5
-; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX512VL-FALLBACK-NEXT: vpor %ymm6, %ymm5, %ymm5
-; AVX512VL-FALLBACK-NEXT: vpminuw %ymm1, %ymm0, %ymm7
-; AVX512VL-FALLBACK-NEXT: vpcmpeqw %ymm7, %ymm0, %ymm8
-; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %ymm8, %ymm8, %ymm8
-; AVX512VL-FALLBACK-NEXT: vpor %ymm6, %ymm8, %ymm6
+; AVX512VL-FALLBACK-NEXT: vpminuw %ymm1, %ymm0, %ymm6
+; AVX512VL-FALLBACK-NEXT: vpcmpeqw %ymm6, %ymm0, %ymm7
+; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %ymm7, %ymm7, %ymm7
; AVX512VL-FALLBACK-NEXT: vpmaxuw %ymm2, %ymm3, %ymm2
; AVX512VL-FALLBACK-NEXT: vpmaxuw %ymm1, %ymm0, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpsubw %ymm7, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpsubw %ymm6, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vpsubw %ymm4, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512VL-FALLBACK-NEXT: vpor %ymm4, %ymm7, %ymm6
; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpor %ymm4, %ymm5, %ymm4
+; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm1, %ymm0
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: vmovdqa (%rdi), %ymm2
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3
; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm3, %ymm4
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX512F-NEXT: vpor %ymm5, %ymm4, %ymm4
-; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm6
-; AVX512F-NEXT: vpor %ymm5, %ymm6, %ymm5
+; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm5
; AVX512F-NEXT: vpminsw %ymm1, %ymm3, %ymm6
; AVX512F-NEXT: vpminsw %ymm0, %ymm2, %ymm7
; AVX512F-NEXT: vpmaxsw %ymm1, %ymm3, %ymm1
; AVX512F-NEXT: vpmaxsw %ymm0, %ymm2, %ymm0
; AVX512F-NEXT: vpsubw %ymm7, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
-; AVX512F-NEXT: vpmullw %ymm4, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512F-NEXT: vpor %ymm6, %ymm5, %ymm5
; AVX512F-NEXT: vpmullw %ymm5, %ymm0, %ymm0
+; AVX512F-NEXT: vpor %ymm6, %ymm4, %ymm4
+; AVX512F-NEXT: vpmullw %ymm4, %ymm1, %ymm1
; AVX512F-NEXT: vpaddw %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm2
; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm3
; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm1, %ymm3, %ymm4
-; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX512VL-FALLBACK-NEXT: vpor %ymm5, %ymm4, %ymm4
-; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm6
-; AVX512VL-FALLBACK-NEXT: vpor %ymm5, %ymm6, %ymm5
+; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm5
; AVX512VL-FALLBACK-NEXT: vpminsw %ymm1, %ymm3, %ymm6
; AVX512VL-FALLBACK-NEXT: vpminsw %ymm0, %ymm2, %ymm7
; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm3, %ymm1
; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm0, %ymm2, %ymm0
; AVX512VL-FALLBACK-NEXT: vpsubw %ymm7, %ymm0, %ymm0
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm0
+; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512VL-FALLBACK-NEXT: vpor %ymm6, %ymm5, %ymm5
; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm0, %ymm0
+; AVX512VL-FALLBACK-NEXT: vpor %ymm6, %ymm4, %ymm4
+; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vpaddw %ymm2, %ymm0, %ymm0
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
define <32 x i16> @vec512_i16_signed_reg_mem(<32 x i16> %a1, <32 x i16>* %a2_addr) nounwind {
; AVX512F-LABEL: vec512_i16_signed_reg_mem:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa (%rdi), %ymm2
-; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3
-; AVX512F-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm4
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX512F-NEXT: vpor %ymm5, %ymm4, %ymm4
-; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm6
-; AVX512F-NEXT: vpor %ymm5, %ymm6, %ymm5
-; AVX512F-NEXT: vpminsw %ymm3, %ymm1, %ymm6
-; AVX512F-NEXT: vpminsw %ymm2, %ymm0, %ymm7
-; AVX512F-NEXT: vpmaxsw %ymm3, %ymm1, %ymm3
-; AVX512F-NEXT: vpsubw %ymm6, %ymm3, %ymm3
-; AVX512F-NEXT: vpmaxsw %ymm2, %ymm0, %ymm2
-; AVX512F-NEXT: vpsubw %ymm7, %ymm2, %ymm2
-; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3
-; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm4
+; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm5
+; AVX512F-NEXT: vpminsw %ymm2, %ymm3, %ymm6
+; AVX512F-NEXT: vpminsw %ymm1, %ymm0, %ymm7
+; AVX512F-NEXT: vpmaxsw %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpsubw %ymm6, %ymm2, %ymm2
+; AVX512F-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1
+; AVX512F-NEXT: vpsubw %ymm7, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2
-; AVX512F-NEXT: vpmullw %ymm5, %ymm2, %ymm2
-; AVX512F-NEXT: vpaddw %ymm1, %ymm3, %ymm1
-; AVX512F-NEXT: vpaddw %ymm0, %ymm2, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512F-NEXT: vpor %ymm6, %ymm5, %ymm5
+; AVX512F-NEXT: vpmullw %ymm5, %ymm1, %ymm1
+; AVX512F-NEXT: vpor %ymm6, %ymm4, %ymm4
+; AVX512F-NEXT: vpmullw %ymm4, %ymm2, %ymm2
+; AVX512F-NEXT: vpaddw %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpaddw %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-FALLBACK-LABEL: vec512_i16_signed_reg_mem:
; AVX512VL-FALLBACK: # %bb.0:
-; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm2
-; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm3
-; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm4
-; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX512VL-FALLBACK-NEXT: vpor %ymm5, %ymm4, %ymm4
-; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm6
-; AVX512VL-FALLBACK-NEXT: vpor %ymm5, %ymm6, %ymm5
-; AVX512VL-FALLBACK-NEXT: vpminsw %ymm3, %ymm1, %ymm6
-; AVX512VL-FALLBACK-NEXT: vpminsw %ymm2, %ymm0, %ymm7
-; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm3, %ymm1, %ymm3
-; AVX512VL-FALLBACK-NEXT: vpsubw %ymm6, %ymm3, %ymm3
-; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm2, %ymm0, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpsubw %ymm7, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm3, %ymm3
-; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm3, %ymm3
+; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm2
+; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm4
+; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm5
+; AVX512VL-FALLBACK-NEXT: vpminsw %ymm2, %ymm3, %ymm6
+; AVX512VL-FALLBACK-NEXT: vpminsw %ymm1, %ymm0, %ymm7
+; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm2, %ymm3, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpsubw %ymm6, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpsubw %ymm7, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpaddw %ymm1, %ymm3, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm2, %ymm0
-; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512VL-FALLBACK-NEXT: vpor %ymm6, %ymm5, %ymm5
+; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpor %ymm6, %ymm4, %ymm4
+; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm1, %ymm0
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-FALLBACK-NEXT: retq
;
; AVX512BW-LABEL: vec512_i16_signed_reg_mem:
define <32 x i16> @vec512_i16_signed_mem_mem(<32 x i16>* %a1_addr, <32 x i16>* %a2_addr) nounwind {
; AVX512F-LABEL: vec512_i16_signed_mem_mem:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512F-NEXT: vmovdqa (%rsi), %ymm2
-; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm3
-; AVX512F-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm4
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX512F-NEXT: vpor %ymm5, %ymm4, %ymm4
-; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm6
-; AVX512F-NEXT: vpor %ymm5, %ymm6, %ymm5
-; AVX512F-NEXT: vpminsw %ymm3, %ymm1, %ymm6
-; AVX512F-NEXT: vpminsw %ymm2, %ymm0, %ymm7
-; AVX512F-NEXT: vpmaxsw %ymm3, %ymm1, %ymm3
-; AVX512F-NEXT: vpsubw %ymm6, %ymm3, %ymm3
-; AVX512F-NEXT: vpmaxsw %ymm2, %ymm0, %ymm2
-; AVX512F-NEXT: vpsubw %ymm7, %ymm2, %ymm2
-; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3
-; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3
-; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2
-; AVX512F-NEXT: vpmullw %ymm5, %ymm2, %ymm2
-; AVX512F-NEXT: vpaddw %ymm1, %ymm3, %ymm1
-; AVX512F-NEXT: vpaddw %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT: vmovdqa (%rsi), %ymm0
+; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm1
+; AVX512F-NEXT: vmovdqa (%rdi), %ymm2
+; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3
+; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm3, %ymm4
+; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm5
+; AVX512F-NEXT: vpminsw %ymm1, %ymm3, %ymm6
+; AVX512F-NEXT: vpminsw %ymm0, %ymm2, %ymm7
+; AVX512F-NEXT: vpmaxsw %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT: vpsubw %ymm6, %ymm1, %ymm1
+; AVX512F-NEXT: vpmaxsw %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT: vpsubw %ymm7, %ymm0, %ymm0
+; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512F-NEXT: vpor %ymm6, %ymm5, %ymm5
+; AVX512F-NEXT: vpmullw %ymm5, %ymm0, %ymm0
+; AVX512F-NEXT: vpor %ymm6, %ymm4, %ymm4
+; AVX512F-NEXT: vpmullw %ymm4, %ymm1, %ymm1
+; AVX512F-NEXT: vpaddw %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-FALLBACK-LABEL: vec512_i16_signed_mem_mem:
; AVX512VL-FALLBACK: # %bb.0:
-; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512VL-FALLBACK-NEXT: vmovdqa (%rsi), %ymm2
-; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rsi), %ymm3
-; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm4
-; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX512VL-FALLBACK-NEXT: vpor %ymm5, %ymm4, %ymm4
-; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm6
-; AVX512VL-FALLBACK-NEXT: vpor %ymm5, %ymm6, %ymm5
-; AVX512VL-FALLBACK-NEXT: vpminsw %ymm3, %ymm1, %ymm6
-; AVX512VL-FALLBACK-NEXT: vpminsw %ymm2, %ymm0, %ymm7
-; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm3, %ymm1, %ymm3
-; AVX512VL-FALLBACK-NEXT: vpsubw %ymm6, %ymm3, %ymm3
-; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm2, %ymm0, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpsubw %ymm7, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm3, %ymm3
-; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm3, %ymm3
-; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpaddw %ymm1, %ymm3, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm2, %ymm0
+; AVX512VL-FALLBACK-NEXT: vmovdqa (%rsi), %ymm0
+; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rsi), %ymm1
+; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm2
+; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm3
+; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm1, %ymm3, %ymm4
+; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm5
+; AVX512VL-FALLBACK-NEXT: vpminsw %ymm1, %ymm3, %ymm6
+; AVX512VL-FALLBACK-NEXT: vpminsw %ymm0, %ymm2, %ymm7
+; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm3, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpsubw %ymm6, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm0, %ymm2, %ymm0
+; AVX512VL-FALLBACK-NEXT: vpsubw %ymm7, %ymm0, %ymm0
+; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm0
+; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512VL-FALLBACK-NEXT: vpor %ymm6, %ymm5, %ymm5
+; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm0, %ymm0
+; AVX512VL-FALLBACK-NEXT: vpor %ymm6, %ymm4, %ymm4
+; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpaddw %ymm2, %ymm0, %ymm0
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512VL-FALLBACK-NEXT: retq
;
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm2, %ymm4
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX512F-NEXT: vpor %ymm5, %ymm4, %ymm4
-; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm6
-; AVX512F-NEXT: vpor %ymm5, %ymm6, %ymm5
+; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm5
; AVX512F-NEXT: vpminsb %ymm3, %ymm2, %ymm6
; AVX512F-NEXT: vpminsb %ymm1, %ymm0, %ymm7
; AVX512F-NEXT: vpmaxsb %ymm3, %ymm2, %ymm3
; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm5[8],ymm0[8],ymm5[9],ymm0[9],ymm5[10],ymm0[10],ymm5[11],ymm0[11],ymm5[12],ymm0[12],ymm5[13],ymm0[13],ymm5[14],ymm0[14],ymm5[15],ymm0[15],ymm5[24],ymm0[24],ymm5[25],ymm0[25],ymm5[26],ymm0[26],ymm5[27],ymm0[27],ymm5[28],ymm0[28],ymm5[29],ymm0[29],ymm5[30],ymm0[30],ymm5[31],ymm0[31]
-; AVX512F-NEXT: vpmullw %ymm7, %ymm6, %ymm6
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-NEXT: vpand %ymm7, %ymm6, %ymm6
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512F-NEXT: vpor %ymm7, %ymm5, %ymm5
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm5[8],ymm0[8],ymm5[9],ymm0[9],ymm5[10],ymm0[10],ymm5[11],ymm0[11],ymm5[12],ymm0[12],ymm5[13],ymm0[13],ymm5[14],ymm0[14],ymm5[15],ymm0[15],ymm5[24],ymm0[24],ymm5[25],ymm0[25],ymm5[26],ymm0[26],ymm5[27],ymm0[27],ymm5[28],ymm0[28],ymm5[29],ymm0[29],ymm5[30],ymm0[30],ymm5[31],ymm0[31]
+; AVX512F-NEXT: vpmullw %ymm6, %ymm8, %ymm6
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpand %ymm6, %ymm8, %ymm6
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm0[0],ymm5[1],ymm0[1],ymm5[2],ymm0[2],ymm5[3],ymm0[3],ymm5[4],ymm0[4],ymm5[5],ymm0[5],ymm5[6],ymm0[6],ymm5[7],ymm0[7],ymm5[16],ymm0[16],ymm5[17],ymm0[17],ymm5[18],ymm0[18],ymm5[19],ymm0[19],ymm5[20],ymm0[20],ymm5[21],ymm0[21],ymm5[22],ymm0[22],ymm5[23],ymm0[23]
; AVX512F-NEXT: vpmullw %ymm5, %ymm1, %ymm1
-; AVX512F-NEXT: vpand %ymm7, %ymm1, %ymm1
+; AVX512F-NEXT: vpand %ymm1, %ymm8, %ymm1
; AVX512F-NEXT: vpackuswb %ymm6, %ymm1, %ymm1
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31]
+; AVX512F-NEXT: vpor %ymm7, %ymm4, %ymm4
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm4[8],ymm0[8],ymm4[9],ymm0[9],ymm4[10],ymm0[10],ymm4[11],ymm0[11],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15],ymm4[24],ymm0[24],ymm4[25],ymm0[25],ymm4[26],ymm0[26],ymm4[27],ymm0[27],ymm4[28],ymm0[28],ymm4[29],ymm0[29],ymm4[30],ymm0[30],ymm4[31],ymm0[31]
; AVX512F-NEXT: vpmullw %ymm6, %ymm5, %ymm5
-; AVX512F-NEXT: vpand %ymm7, %ymm5, %ymm5
+; AVX512F-NEXT: vpand %ymm5, %ymm8, %ymm5
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[16],ymm0[16],ymm4[17],ymm0[17],ymm4[18],ymm0[18],ymm4[19],ymm0[19],ymm4[20],ymm0[20],ymm4[21],ymm0[21],ymm4[22],ymm0[22],ymm4[23],ymm0[23]
; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3
-; AVX512F-NEXT: vpand %ymm7, %ymm3, %ymm3
+; AVX512F-NEXT: vpand %ymm3, %ymm8, %ymm3
; AVX512F-NEXT: vpackuswb %ymm5, %ymm3, %ymm3
; AVX512F-NEXT: vpaddb %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm3
; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm3, %ymm2, %ymm4
-; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX512VL-FALLBACK-NEXT: vpor %ymm5, %ymm4, %ymm4
-; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm6
-; AVX512VL-FALLBACK-NEXT: vpor %ymm5, %ymm6, %ymm5
+; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm5
; AVX512VL-FALLBACK-NEXT: vpminsb %ymm3, %ymm2, %ymm6
; AVX512VL-FALLBACK-NEXT: vpminsb %ymm1, %ymm0, %ymm7
; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm3, %ymm2, %ymm3
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
-; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm5[8],ymm0[8],ymm5[9],ymm0[9],ymm5[10],ymm0[10],ymm5[11],ymm0[11],ymm5[12],ymm0[12],ymm5[13],ymm0[13],ymm5[14],ymm0[14],ymm5[15],ymm0[15],ymm5[24],ymm0[24],ymm5[25],ymm0[25],ymm5[26],ymm0[26],ymm5[27],ymm0[27],ymm5[28],ymm0[28],ymm5[29],ymm0[29],ymm5[30],ymm0[30],ymm5[31],ymm0[31]
-; AVX512VL-FALLBACK-NEXT: vpmullw %ymm7, %ymm6, %ymm6
-; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm6, %ymm6
+; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512VL-FALLBACK-NEXT: vpor %ymm7, %ymm5, %ymm5
+; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm5[8],ymm0[8],ymm5[9],ymm0[9],ymm5[10],ymm0[10],ymm5[11],ymm0[11],ymm5[12],ymm0[12],ymm5[13],ymm0[13],ymm5[14],ymm0[14],ymm5[15],ymm0[15],ymm5[24],ymm0[24],ymm5[25],ymm0[25],ymm5[26],ymm0[26],ymm5[27],ymm0[27],ymm5[28],ymm0[28],ymm5[29],ymm0[29],ymm5[30],ymm0[30],ymm5[31],ymm0[31]
+; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm8, %ymm6
+; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm8, %ymm6
; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm0[0],ymm5[1],ymm0[1],ymm5[2],ymm0[2],ymm5[3],ymm0[3],ymm5[4],ymm0[4],ymm5[5],ymm0[5],ymm5[6],ymm0[6],ymm5[7],ymm0[7],ymm5[16],ymm0[16],ymm5[17],ymm0[17],ymm5[18],ymm0[18],ymm5[19],ymm0[19],ymm5[20],ymm0[20],ymm5[21],ymm0[21],ymm5[22],ymm0[22],ymm5[23],ymm0[23]
; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpand %ymm1, %ymm8, %ymm1
; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm6, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31]
+; AVX512VL-FALLBACK-NEXT: vpor %ymm7, %ymm4, %ymm4
; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm4[8],ymm0[8],ymm4[9],ymm0[9],ymm4[10],ymm0[10],ymm4[11],ymm0[11],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15],ymm4[24],ymm0[24],ymm4[25],ymm0[25],ymm4[26],ymm0[26],ymm4[27],ymm0[27],ymm4[28],ymm0[28],ymm4[29],ymm0[29],ymm4[30],ymm0[30],ymm4[31],ymm0[31]
; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm5, %ymm5
-; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm5, %ymm5
+; AVX512VL-FALLBACK-NEXT: vpand %ymm5, %ymm8, %ymm5
; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[16],ymm0[16],ymm4[17],ymm0[17],ymm4[18],ymm0[18],ymm4[19],ymm0[19],ymm4[20],ymm0[20],ymm4[21],ymm0[21],ymm4[22],ymm0[22],ymm4[23],ymm0[23]
; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm3, %ymm3
-; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm3, %ymm3
+; AVX512VL-FALLBACK-NEXT: vpand %ymm3, %ymm8, %ymm3
; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm5, %ymm3, %ymm3
; AVX512VL-FALLBACK-NEXT: vpaddb %ymm2, %ymm3, %ymm2
; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0
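The v64i8 variants need the same +-1 multiply, but there is no byte multiply instruction, so each half is widened with vpunpck{l,h}bw, multiplied as words, masked back to the low bytes, and repacked with vpackuswb (the unpack's second operand only feeds the high byte of each word, which the mask discards, so it can be garbage). A sketch of that synthesized byte multiply for one 256-bit half:

#include <immintrin.h>

// Lane-wise 8-bit multiply built from 16-bit multiplies (no vpmullb exists).
__m256i mulloEpi8(__m256i a, __m256i b) {
  const __m256i zero = _mm256_setzero_si256();
  const __m256i loByte = _mm256_set1_epi16(0x00FF);
  __m256i lo = _mm256_mullo_epi16(_mm256_unpacklo_epi8(a, zero),
                                  _mm256_unpacklo_epi8(b, zero));
  __m256i hi = _mm256_mullo_epi16(_mm256_unpackhi_epi8(a, zero),
                                  _mm256_unpackhi_epi8(b, zero));
  return _mm256_packus_epi16(_mm256_and_si256(lo, loByte),
                             _mm256_and_si256(hi, loByte));
}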
define <64 x i8> @vec512_i8_unsigned_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounwind {
; AVX512F-LABEL: vec512_i8_unsigned_reg_reg:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm4
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512F-NEXT: vpminub %ymm3, %ymm2, %ymm4
-; AVX512F-NEXT: vpcmpeqb %ymm4, %ymm2, %ymm5
-; AVX512F-NEXT: vpternlogq $15, %zmm5, %zmm5, %zmm5
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX512F-NEXT: vpor %ymm6, %ymm5, %ymm5
-; AVX512F-NEXT: vpminub %ymm1, %ymm0, %ymm7
-; AVX512F-NEXT: vpcmpeqb %ymm7, %ymm0, %ymm8
-; AVX512F-NEXT: vpternlogq $15, %zmm8, %zmm8, %zmm8
-; AVX512F-NEXT: vpor %ymm6, %ymm8, %ymm6
-; AVX512F-NEXT: vpmaxub %ymm3, %ymm2, %ymm3
+; AVX512F-NEXT: vpminub %ymm4, %ymm2, %ymm5
+; AVX512F-NEXT: vpcmpeqb %ymm5, %ymm2, %ymm3
+; AVX512F-NEXT: vpternlogq $15, %zmm3, %zmm3, %zmm3
+; AVX512F-NEXT: vpminub %ymm1, %ymm0, %ymm6
+; AVX512F-NEXT: vpcmpeqb %ymm6, %ymm0, %ymm7
+; AVX512F-NEXT: vpternlogq $15, %zmm7, %zmm7, %zmm7
+; AVX512F-NEXT: vpmaxub %ymm4, %ymm2, %ymm4
; AVX512F-NEXT: vpmaxub %ymm1, %ymm0, %ymm1
-; AVX512F-NEXT: vpsubb %ymm7, %ymm1, %ymm1
-; AVX512F-NEXT: vpsubb %ymm4, %ymm3, %ymm3
-; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpsubb %ymm6, %ymm1, %ymm1
+; AVX512F-NEXT: vpsubb %ymm5, %ymm4, %ymm4
+; AVX512F-NEXT: vpsrlw $1, %ymm4, %ymm4
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4
; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
-; AVX512F-NEXT: vpand %ymm4, %ymm1, %ymm1
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm6[8],ymm0[8],ymm6[9],ymm0[9],ymm6[10],ymm0[10],ymm6[11],ymm0[11],ymm6[12],ymm0[12],ymm6[13],ymm0[13],ymm6[14],ymm0[14],ymm6[15],ymm0[15],ymm6[24],ymm0[24],ymm6[25],ymm0[25],ymm6[26],ymm0[26],ymm6[27],ymm0[27],ymm6[28],ymm0[28],ymm6[29],ymm0[29],ymm6[30],ymm0[30],ymm6[31],ymm0[31]
-; AVX512F-NEXT: vpmullw %ymm7, %ymm4, %ymm4
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-NEXT: vpand %ymm7, %ymm4, %ymm4
+; AVX512F-NEXT: vpand %ymm5, %ymm1, %ymm1
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512F-NEXT: vpor %ymm6, %ymm7, %ymm7
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm7[8],ymm0[8],ymm7[9],ymm0[9],ymm7[10],ymm0[10],ymm7[11],ymm0[11],ymm7[12],ymm0[12],ymm7[13],ymm0[13],ymm7[14],ymm0[14],ymm7[15],ymm0[15],ymm7[24],ymm0[24],ymm7[25],ymm0[25],ymm7[26],ymm0[26],ymm7[27],ymm0[27],ymm7[28],ymm0[28],ymm7[29],ymm0[29],ymm7[30],ymm0[30],ymm7[31],ymm0[31]
+; AVX512F-NEXT: vpmullw %ymm5, %ymm8, %ymm5
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpand %ymm5, %ymm8, %ymm5
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm6[0],ymm0[0],ymm6[1],ymm0[1],ymm6[2],ymm0[2],ymm6[3],ymm0[3],ymm6[4],ymm0[4],ymm6[5],ymm0[5],ymm6[6],ymm0[6],ymm6[7],ymm0[7],ymm6[16],ymm0[16],ymm6[17],ymm0[17],ymm6[18],ymm0[18],ymm6[19],ymm0[19],ymm6[20],ymm0[20],ymm6[21],ymm0[21],ymm6[22],ymm0[22],ymm6[23],ymm0[23]
-; AVX512F-NEXT: vpmullw %ymm6, %ymm1, %ymm1
-; AVX512F-NEXT: vpand %ymm7, %ymm1, %ymm1
-; AVX512F-NEXT: vpackuswb %ymm4, %ymm1, %ymm1
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31]
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm5[8],ymm0[8],ymm5[9],ymm0[9],ymm5[10],ymm0[10],ymm5[11],ymm0[11],ymm5[12],ymm0[12],ymm5[13],ymm0[13],ymm5[14],ymm0[14],ymm5[15],ymm0[15],ymm5[24],ymm0[24],ymm5[25],ymm0[25],ymm5[26],ymm0[26],ymm5[27],ymm0[27],ymm5[28],ymm0[28],ymm5[29],ymm0[29],ymm5[30],ymm0[30],ymm5[31],ymm0[31]
-; AVX512F-NEXT: vpmullw %ymm6, %ymm4, %ymm4
-; AVX512F-NEXT: vpand %ymm7, %ymm4, %ymm4
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm7 = ymm7[0],ymm0[0],ymm7[1],ymm0[1],ymm7[2],ymm0[2],ymm7[3],ymm0[3],ymm7[4],ymm0[4],ymm7[5],ymm0[5],ymm7[6],ymm0[6],ymm7[7],ymm0[7],ymm7[16],ymm0[16],ymm7[17],ymm0[17],ymm7[18],ymm0[18],ymm7[19],ymm0[19],ymm7[20],ymm0[20],ymm7[21],ymm0[21],ymm7[22],ymm0[22],ymm7[23],ymm0[23]
+; AVX512F-NEXT: vpmullw %ymm7, %ymm1, %ymm1
+; AVX512F-NEXT: vpand %ymm1, %ymm8, %ymm1
+; AVX512F-NEXT: vpackuswb %ymm5, %ymm1, %ymm1
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm0[8],ymm4[9],ymm0[9],ymm4[10],ymm0[10],ymm4[11],ymm0[11],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15],ymm4[24],ymm0[24],ymm4[25],ymm0[25],ymm4[26],ymm0[26],ymm4[27],ymm0[27],ymm4[28],ymm0[28],ymm4[29],ymm0[29],ymm4[30],ymm0[30],ymm4[31],ymm0[31]
+; AVX512F-NEXT: vpor %ymm6, %ymm3, %ymm3
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31]
+; AVX512F-NEXT: vpmullw %ymm6, %ymm5, %ymm5
+; AVX512F-NEXT: vpand %ymm5, %ymm8, %ymm5
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[16],ymm0[16],ymm4[17],ymm0[17],ymm4[18],ymm0[18],ymm4[19],ymm0[19],ymm4[20],ymm0[20],ymm4[21],ymm0[21],ymm4[22],ymm0[22],ymm4[23],ymm0[23]
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm0[0],ymm5[1],ymm0[1],ymm5[2],ymm0[2],ymm5[3],ymm0[3],ymm5[4],ymm0[4],ymm5[5],ymm0[5],ymm5[6],ymm0[6],ymm5[7],ymm0[7],ymm5[16],ymm0[16],ymm5[17],ymm0[17],ymm5[18],ymm0[18],ymm5[19],ymm0[19],ymm5[20],ymm0[20],ymm5[21],ymm0[21],ymm5[22],ymm0[22],ymm5[23],ymm0[23]
-; AVX512F-NEXT: vpmullw %ymm5, %ymm3, %ymm3
-; AVX512F-NEXT: vpand %ymm7, %ymm3, %ymm3
-; AVX512F-NEXT: vpackuswb %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpmullw %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT: vpand %ymm3, %ymm8, %ymm3
+; AVX512F-NEXT: vpackuswb %ymm5, %ymm3, %ymm3
; AVX512F-NEXT: vpaddb %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-FALLBACK-NEXT: vpminub %ymm3, %ymm2, %ymm4
; AVX512VL-FALLBACK-NEXT: vpcmpeqb %ymm4, %ymm2, %ymm5
; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %ymm5, %ymm5, %ymm5
-; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX512VL-FALLBACK-NEXT: vpor %ymm6, %ymm5, %ymm5
-; AVX512VL-FALLBACK-NEXT: vpminub %ymm1, %ymm0, %ymm7
-; AVX512VL-FALLBACK-NEXT: vpcmpeqb %ymm7, %ymm0, %ymm8
-; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %ymm8, %ymm8, %ymm8
-; AVX512VL-FALLBACK-NEXT: vpor %ymm6, %ymm8, %ymm6
+; AVX512VL-FALLBACK-NEXT: vpminub %ymm1, %ymm0, %ymm6
+; AVX512VL-FALLBACK-NEXT: vpcmpeqb %ymm6, %ymm0, %ymm7
+; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %ymm7, %ymm7, %ymm7
; AVX512VL-FALLBACK-NEXT: vpmaxub %ymm3, %ymm2, %ymm3
; AVX512VL-FALLBACK-NEXT: vpmaxub %ymm1, %ymm0, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm7, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm6, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vpsubb %ymm4, %ymm3, %ymm3
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm3, %ymm3
; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vpand %ymm4, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
-; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm6[8],ymm0[8],ymm6[9],ymm0[9],ymm6[10],ymm0[10],ymm6[11],ymm0[11],ymm6[12],ymm0[12],ymm6[13],ymm0[13],ymm6[14],ymm0[14],ymm6[15],ymm0[15],ymm6[24],ymm0[24],ymm6[25],ymm0[25],ymm6[26],ymm0[26],ymm6[27],ymm0[27],ymm6[28],ymm0[28],ymm6[29],ymm0[29],ymm6[30],ymm0[30],ymm6[31],ymm0[31]
-; AVX512VL-FALLBACK-NEXT: vpmullw %ymm7, %ymm4, %ymm4
-; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm4, %ymm4
+; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512VL-FALLBACK-NEXT: vpor %ymm6, %ymm7, %ymm7
+; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm7[8],ymm0[8],ymm7[9],ymm0[9],ymm7[10],ymm0[10],ymm7[11],ymm0[11],ymm7[12],ymm0[12],ymm7[13],ymm0[13],ymm7[14],ymm0[14],ymm7[15],ymm0[15],ymm7[24],ymm0[24],ymm7[25],ymm0[25],ymm7[26],ymm0[26],ymm7[27],ymm0[27],ymm7[28],ymm0[28],ymm7[29],ymm0[29],ymm7[30],ymm0[30],ymm7[31],ymm0[31]
+; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm8, %ymm4
+; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512VL-FALLBACK-NEXT: vpand %ymm4, %ymm8, %ymm4
; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
-; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm6[0],ymm0[0],ymm6[1],ymm0[1],ymm6[2],ymm0[2],ymm6[3],ymm0[3],ymm6[4],ymm0[4],ymm6[5],ymm0[5],ymm6[6],ymm0[6],ymm6[7],ymm0[7],ymm6[16],ymm0[16],ymm6[17],ymm0[17],ymm6[18],ymm0[18],ymm6[19],ymm0[19],ymm6[20],ymm0[20],ymm6[21],ymm0[21],ymm6[22],ymm0[22],ymm6[23],ymm0[23]
-; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm7 = ymm7[0],ymm0[0],ymm7[1],ymm0[1],ymm7[2],ymm0[2],ymm7[3],ymm0[3],ymm7[4],ymm0[4],ymm7[5],ymm0[5],ymm7[6],ymm0[6],ymm7[7],ymm0[7],ymm7[16],ymm0[16],ymm7[17],ymm0[17],ymm7[18],ymm0[18],ymm7[19],ymm0[19],ymm7[20],ymm0[20],ymm7[21],ymm0[21],ymm7[22],ymm0[22],ymm7[23],ymm0[23]
+; AVX512VL-FALLBACK-NEXT: vpmullw %ymm7, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpand %ymm1, %ymm8, %ymm1
; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm4, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31]
+; AVX512VL-FALLBACK-NEXT: vpor %ymm6, %ymm5, %ymm5
; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm5[8],ymm0[8],ymm5[9],ymm0[9],ymm5[10],ymm0[10],ymm5[11],ymm0[11],ymm5[12],ymm0[12],ymm5[13],ymm0[13],ymm5[14],ymm0[14],ymm5[15],ymm0[15],ymm5[24],ymm0[24],ymm5[25],ymm0[25],ymm5[26],ymm0[26],ymm5[27],ymm0[27],ymm5[28],ymm0[28],ymm5[29],ymm0[29],ymm5[30],ymm0[30],ymm5[31],ymm0[31]
; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm4, %ymm4
-; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm4, %ymm4
+; AVX512VL-FALLBACK-NEXT: vpand %ymm4, %ymm8, %ymm4
; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm0[0],ymm5[1],ymm0[1],ymm5[2],ymm0[2],ymm5[3],ymm0[3],ymm5[4],ymm0[4],ymm5[5],ymm0[5],ymm5[6],ymm0[6],ymm5[7],ymm0[7],ymm5[16],ymm0[16],ymm5[17],ymm0[17],ymm5[18],ymm0[18],ymm5[19],ymm0[19],ymm5[20],ymm0[20],ymm5[21],ymm0[21],ymm5[22],ymm0[22],ymm5[23],ymm0[23]
; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm3, %ymm3
-; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm3, %ymm3
+; AVX512VL-FALLBACK-NEXT: vpand %ymm3, %ymm8, %ymm3
; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm4, %ymm3, %ymm3
; AVX512VL-FALLBACK-NEXT: vpaddb %ymm2, %ymm3, %ymm2
; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vmovdqa (%rdi), %ymm1
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm2
; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm2, %ymm4
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX512F-NEXT: vpor %ymm5, %ymm4, %ymm4
-; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm6
-; AVX512F-NEXT: vpor %ymm5, %ymm6, %ymm5
+; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm5
; AVX512F-NEXT: vpminsb %ymm3, %ymm2, %ymm6
; AVX512F-NEXT: vpminsb %ymm0, %ymm1, %ymm7
; AVX512F-NEXT: vpmaxsb %ymm3, %ymm2, %ymm3
; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0
; AVX512F-NEXT: vpand %ymm6, %ymm0, %ymm0
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm5[8],ymm0[8],ymm5[9],ymm0[9],ymm5[10],ymm0[10],ymm5[11],ymm0[11],ymm5[12],ymm0[12],ymm5[13],ymm0[13],ymm5[14],ymm0[14],ymm5[15],ymm0[15],ymm5[24],ymm0[24],ymm5[25],ymm0[25],ymm5[26],ymm0[26],ymm5[27],ymm0[27],ymm5[28],ymm0[28],ymm5[29],ymm0[29],ymm5[30],ymm0[30],ymm5[31],ymm0[31]
-; AVX512F-NEXT: vpmullw %ymm7, %ymm6, %ymm6
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-NEXT: vpand %ymm7, %ymm6, %ymm6
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512F-NEXT: vpor %ymm7, %ymm5, %ymm5
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm5[8],ymm0[8],ymm5[9],ymm0[9],ymm5[10],ymm0[10],ymm5[11],ymm0[11],ymm5[12],ymm0[12],ymm5[13],ymm0[13],ymm5[14],ymm0[14],ymm5[15],ymm0[15],ymm5[24],ymm0[24],ymm5[25],ymm0[25],ymm5[26],ymm0[26],ymm5[27],ymm0[27],ymm5[28],ymm0[28],ymm5[29],ymm0[29],ymm5[30],ymm0[30],ymm5[31],ymm0[31]
+; AVX512F-NEXT: vpmullw %ymm6, %ymm8, %ymm6
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpand %ymm6, %ymm8, %ymm6
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm0[0],ymm5[1],ymm0[1],ymm5[2],ymm0[2],ymm5[3],ymm0[3],ymm5[4],ymm0[4],ymm5[5],ymm0[5],ymm5[6],ymm0[6],ymm5[7],ymm0[7],ymm5[16],ymm0[16],ymm5[17],ymm0[17],ymm5[18],ymm0[18],ymm5[19],ymm0[19],ymm5[20],ymm0[20],ymm5[21],ymm0[21],ymm5[22],ymm0[22],ymm5[23],ymm0[23]
; AVX512F-NEXT: vpmullw %ymm5, %ymm0, %ymm0
-; AVX512F-NEXT: vpand %ymm7, %ymm0, %ymm0
+; AVX512F-NEXT: vpand %ymm0, %ymm8, %ymm0
; AVX512F-NEXT: vpackuswb %ymm6, %ymm0, %ymm0
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31]
+; AVX512F-NEXT: vpor %ymm7, %ymm4, %ymm4
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm4[8],ymm0[8],ymm4[9],ymm0[9],ymm4[10],ymm0[10],ymm4[11],ymm0[11],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15],ymm4[24],ymm0[24],ymm4[25],ymm0[25],ymm4[26],ymm0[26],ymm4[27],ymm0[27],ymm4[28],ymm0[28],ymm4[29],ymm0[29],ymm4[30],ymm0[30],ymm4[31],ymm0[31]
; AVX512F-NEXT: vpmullw %ymm6, %ymm5, %ymm5
-; AVX512F-NEXT: vpand %ymm7, %ymm5, %ymm5
+; AVX512F-NEXT: vpand %ymm5, %ymm8, %ymm5
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[16],ymm0[16],ymm4[17],ymm0[17],ymm4[18],ymm0[18],ymm4[19],ymm0[19],ymm4[20],ymm0[20],ymm4[21],ymm0[21],ymm4[22],ymm0[22],ymm4[23],ymm0[23]
; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3
-; AVX512F-NEXT: vpand %ymm7, %ymm3, %ymm3
+; AVX512F-NEXT: vpand %ymm3, %ymm8, %ymm3
; AVX512F-NEXT: vpackuswb %ymm5, %ymm3, %ymm3
; AVX512F-NEXT: vpaddb %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm1
; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm2
; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm3, %ymm2, %ymm4
-; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX512VL-FALLBACK-NEXT: vpor %ymm5, %ymm4, %ymm4
-; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm6
-; AVX512VL-FALLBACK-NEXT: vpor %ymm5, %ymm6, %ymm5
+; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm5
; AVX512VL-FALLBACK-NEXT: vpminsb %ymm3, %ymm2, %ymm6
; AVX512VL-FALLBACK-NEXT: vpminsb %ymm0, %ymm1, %ymm7
; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm3, %ymm2, %ymm3
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm0
; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm0, %ymm0
; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm5[8],ymm0[8],ymm5[9],ymm0[9],ymm5[10],ymm0[10],ymm5[11],ymm0[11],ymm5[12],ymm0[12],ymm5[13],ymm0[13],ymm5[14],ymm0[14],ymm5[15],ymm0[15],ymm5[24],ymm0[24],ymm5[25],ymm0[25],ymm5[26],ymm0[26],ymm5[27],ymm0[27],ymm5[28],ymm0[28],ymm5[29],ymm0[29],ymm5[30],ymm0[30],ymm5[31],ymm0[31]
-; AVX512VL-FALLBACK-NEXT: vpmullw %ymm7, %ymm6, %ymm6
-; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm6, %ymm6
+; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512VL-FALLBACK-NEXT: vpor %ymm7, %ymm5, %ymm5
+; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm5[8],ymm0[8],ymm5[9],ymm0[9],ymm5[10],ymm0[10],ymm5[11],ymm0[11],ymm5[12],ymm0[12],ymm5[13],ymm0[13],ymm5[14],ymm0[14],ymm5[15],ymm0[15],ymm5[24],ymm0[24],ymm5[25],ymm0[25],ymm5[26],ymm0[26],ymm5[27],ymm0[27],ymm5[28],ymm0[28],ymm5[29],ymm0[29],ymm5[30],ymm0[30],ymm5[31],ymm0[31]
+; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm8, %ymm6
+; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm8, %ymm6
; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm0[0],ymm5[1],ymm0[1],ymm5[2],ymm0[2],ymm5[3],ymm0[3],ymm5[4],ymm0[4],ymm5[5],ymm0[5],ymm5[6],ymm0[6],ymm5[7],ymm0[7],ymm5[16],ymm0[16],ymm5[17],ymm0[17],ymm5[18],ymm0[18],ymm5[19],ymm0[19],ymm5[20],ymm0[20],ymm5[21],ymm0[21],ymm5[22],ymm0[22],ymm5[23],ymm0[23]
; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm0, %ymm0
-; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm0, %ymm0
+; AVX512VL-FALLBACK-NEXT: vpand %ymm0, %ymm8, %ymm0
; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm6, %ymm0, %ymm0
; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31]
+; AVX512VL-FALLBACK-NEXT: vpor %ymm7, %ymm4, %ymm4
; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm4[8],ymm0[8],ymm4[9],ymm0[9],ymm4[10],ymm0[10],ymm4[11],ymm0[11],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15],ymm4[24],ymm0[24],ymm4[25],ymm0[25],ymm4[26],ymm0[26],ymm4[27],ymm0[27],ymm4[28],ymm0[28],ymm4[29],ymm0[29],ymm4[30],ymm0[30],ymm4[31],ymm0[31]
; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm5, %ymm5
-; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm5, %ymm5
+; AVX512VL-FALLBACK-NEXT: vpand %ymm5, %ymm8, %ymm5
; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[16],ymm0[16],ymm4[17],ymm0[17],ymm4[18],ymm0[18],ymm4[19],ymm0[19],ymm4[20],ymm0[20],ymm4[21],ymm0[21],ymm4[22],ymm0[22],ymm4[23],ymm0[23]
; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm3, %ymm3
-; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm3, %ymm3
+; AVX512VL-FALLBACK-NEXT: vpand %ymm3, %ymm8, %ymm3
; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm5, %ymm3, %ymm3
; AVX512VL-FALLBACK-NEXT: vpaddb %ymm2, %ymm3, %ymm2
; AVX512VL-FALLBACK-NEXT: vpaddb %ymm1, %ymm0, %ymm0
define <64 x i8> @vec512_i8_signed_reg_mem(<64 x i8> %a1, <64 x i8>* %a2_addr) nounwind {
; AVX512F-LABEL: vec512_i8_signed_reg_mem:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vmovdqa (%rdi), %ymm2
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm4
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX512F-NEXT: vpor %ymm5, %ymm4, %ymm4
-; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm6
-; AVX512F-NEXT: vpor %ymm5, %ymm6, %ymm5
+; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm5
; AVX512F-NEXT: vpminsb %ymm3, %ymm1, %ymm6
; AVX512F-NEXT: vpminsb %ymm2, %ymm0, %ymm7
; AVX512F-NEXT: vpmaxsb %ymm3, %ymm1, %ymm3
; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2
; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm5[8],ymm0[8],ymm5[9],ymm0[9],ymm5[10],ymm0[10],ymm5[11],ymm0[11],ymm5[12],ymm0[12],ymm5[13],ymm0[13],ymm5[14],ymm0[14],ymm5[15],ymm0[15],ymm5[24],ymm0[24],ymm5[25],ymm0[25],ymm5[26],ymm0[26],ymm5[27],ymm0[27],ymm5[28],ymm0[28],ymm5[29],ymm0[29],ymm5[30],ymm0[30],ymm5[31],ymm0[31]
-; AVX512F-NEXT: vpmullw %ymm7, %ymm6, %ymm6
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-NEXT: vpand %ymm7, %ymm6, %ymm6
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512F-NEXT: vpor %ymm7, %ymm5, %ymm5
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm5[8],ymm0[8],ymm5[9],ymm0[9],ymm5[10],ymm0[10],ymm5[11],ymm0[11],ymm5[12],ymm0[12],ymm5[13],ymm0[13],ymm5[14],ymm0[14],ymm5[15],ymm0[15],ymm5[24],ymm0[24],ymm5[25],ymm0[25],ymm5[26],ymm0[26],ymm5[27],ymm0[27],ymm5[28],ymm0[28],ymm5[29],ymm0[29],ymm5[30],ymm0[30],ymm5[31],ymm0[31]
+; AVX512F-NEXT: vpmullw %ymm6, %ymm8, %ymm6
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpand %ymm6, %ymm8, %ymm6
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm0[0],ymm5[1],ymm0[1],ymm5[2],ymm0[2],ymm5[3],ymm0[3],ymm5[4],ymm0[4],ymm5[5],ymm0[5],ymm5[6],ymm0[6],ymm5[7],ymm0[7],ymm5[16],ymm0[16],ymm5[17],ymm0[17],ymm5[18],ymm0[18],ymm5[19],ymm0[19],ymm5[20],ymm0[20],ymm5[21],ymm0[21],ymm5[22],ymm0[22],ymm5[23],ymm0[23]
; AVX512F-NEXT: vpmullw %ymm5, %ymm2, %ymm2
-; AVX512F-NEXT: vpand %ymm7, %ymm2, %ymm2
+; AVX512F-NEXT: vpand %ymm2, %ymm8, %ymm2
; AVX512F-NEXT: vpackuswb %ymm6, %ymm2, %ymm2
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31]
+; AVX512F-NEXT: vpor %ymm7, %ymm4, %ymm4
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm4[8],ymm0[8],ymm4[9],ymm0[9],ymm4[10],ymm0[10],ymm4[11],ymm0[11],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15],ymm4[24],ymm0[24],ymm4[25],ymm0[25],ymm4[26],ymm0[26],ymm4[27],ymm0[27],ymm4[28],ymm0[28],ymm4[29],ymm0[29],ymm4[30],ymm0[30],ymm4[31],ymm0[31]
; AVX512F-NEXT: vpmullw %ymm6, %ymm5, %ymm5
-; AVX512F-NEXT: vpand %ymm7, %ymm5, %ymm5
+; AVX512F-NEXT: vpand %ymm5, %ymm8, %ymm5
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[16],ymm0[16],ymm4[17],ymm0[17],ymm4[18],ymm0[18],ymm4[19],ymm0[19],ymm4[20],ymm0[20],ymm4[21],ymm0[21],ymm4[22],ymm0[22],ymm4[23],ymm0[23]
; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3
-; AVX512F-NEXT: vpand %ymm7, %ymm3, %ymm3
+; AVX512F-NEXT: vpand %ymm3, %ymm8, %ymm3
; AVX512F-NEXT: vpackuswb %ymm5, %ymm3, %ymm3
; AVX512F-NEXT: vpaddb %ymm1, %ymm3, %ymm1
; AVX512F-NEXT: vpaddb %ymm0, %ymm2, %ymm0
;
; AVX512VL-FALLBACK-LABEL: vec512_i8_signed_reg_mem:
; AVX512VL-FALLBACK: # %bb.0:
-; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm2
; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm3
+; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm4
-; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX512VL-FALLBACK-NEXT: vpor %ymm5, %ymm4, %ymm4
-; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm6
-; AVX512VL-FALLBACK-NEXT: vpor %ymm5, %ymm6, %ymm5
+; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm5
; AVX512VL-FALLBACK-NEXT: vpminsb %ymm3, %ymm1, %ymm6
; AVX512VL-FALLBACK-NEXT: vpminsb %ymm2, %ymm0, %ymm7
; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm3, %ymm1, %ymm3
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
-; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm5[8],ymm0[8],ymm5[9],ymm0[9],ymm5[10],ymm0[10],ymm5[11],ymm0[11],ymm5[12],ymm0[12],ymm5[13],ymm0[13],ymm5[14],ymm0[14],ymm5[15],ymm0[15],ymm5[24],ymm0[24],ymm5[25],ymm0[25],ymm5[26],ymm0[26],ymm5[27],ymm0[27],ymm5[28],ymm0[28],ymm5[29],ymm0[29],ymm5[30],ymm0[30],ymm5[31],ymm0[31]
-; AVX512VL-FALLBACK-NEXT: vpmullw %ymm7, %ymm6, %ymm6
-; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm6, %ymm6
+; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512VL-FALLBACK-NEXT: vpor %ymm7, %ymm5, %ymm5
+; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm5[8],ymm0[8],ymm5[9],ymm0[9],ymm5[10],ymm0[10],ymm5[11],ymm0[11],ymm5[12],ymm0[12],ymm5[13],ymm0[13],ymm5[14],ymm0[14],ymm5[15],ymm0[15],ymm5[24],ymm0[24],ymm5[25],ymm0[25],ymm5[26],ymm0[26],ymm5[27],ymm0[27],ymm5[28],ymm0[28],ymm5[29],ymm0[29],ymm5[30],ymm0[30],ymm5[31],ymm0[31]
+; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm8, %ymm6
+; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm8, %ymm6
; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm0[0],ymm5[1],ymm0[1],ymm5[2],ymm0[2],ymm5[3],ymm0[3],ymm5[4],ymm0[4],ymm5[5],ymm0[5],ymm5[6],ymm0[6],ymm5[7],ymm0[7],ymm5[16],ymm0[16],ymm5[17],ymm0[17],ymm5[18],ymm0[18],ymm5[19],ymm0[19],ymm5[20],ymm0[20],ymm5[21],ymm0[21],ymm5[22],ymm0[22],ymm5[23],ymm0[23]
; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpand %ymm2, %ymm8, %ymm2
; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm6, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31]
+; AVX512VL-FALLBACK-NEXT: vpor %ymm7, %ymm4, %ymm4
; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm4[8],ymm0[8],ymm4[9],ymm0[9],ymm4[10],ymm0[10],ymm4[11],ymm0[11],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15],ymm4[24],ymm0[24],ymm4[25],ymm0[25],ymm4[26],ymm0[26],ymm4[27],ymm0[27],ymm4[28],ymm0[28],ymm4[29],ymm0[29],ymm4[30],ymm0[30],ymm4[31],ymm0[31]
; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm5, %ymm5
-; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm5, %ymm5
+; AVX512VL-FALLBACK-NEXT: vpand %ymm5, %ymm8, %ymm5
; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[16],ymm0[16],ymm4[17],ymm0[17],ymm4[18],ymm0[18],ymm4[19],ymm0[19],ymm4[20],ymm0[20],ymm4[21],ymm0[21],ymm4[22],ymm0[22],ymm4[23],ymm0[23]
; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm3, %ymm3
-; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm3, %ymm3
+; AVX512VL-FALLBACK-NEXT: vpand %ymm3, %ymm8, %ymm3
; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm5, %ymm3, %ymm3
; AVX512VL-FALLBACK-NEXT: vpaddb %ymm1, %ymm3, %ymm1
; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm2, %ymm0
define <64 x i8> @vec512_i8_signed_mem_mem(<64 x i8>* %a1_addr, <64 x i8>* %a2_addr) nounwind {
; AVX512F-LABEL: vec512_i8_signed_mem_mem:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT: vmovdqa (%rsi), %ymm2
; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm3
+; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm4
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX512F-NEXT: vpor %ymm5, %ymm4, %ymm4
-; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm6
-; AVX512F-NEXT: vpor %ymm5, %ymm6, %ymm5
+; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm5
; AVX512F-NEXT: vpminsb %ymm3, %ymm1, %ymm6
; AVX512F-NEXT: vpminsb %ymm2, %ymm0, %ymm7
; AVX512F-NEXT: vpmaxsb %ymm3, %ymm1, %ymm3
; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2
; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm5[8],ymm0[8],ymm5[9],ymm0[9],ymm5[10],ymm0[10],ymm5[11],ymm0[11],ymm5[12],ymm0[12],ymm5[13],ymm0[13],ymm5[14],ymm0[14],ymm5[15],ymm0[15],ymm5[24],ymm0[24],ymm5[25],ymm0[25],ymm5[26],ymm0[26],ymm5[27],ymm0[27],ymm5[28],ymm0[28],ymm5[29],ymm0[29],ymm5[30],ymm0[30],ymm5[31],ymm0[31]
-; AVX512F-NEXT: vpmullw %ymm7, %ymm6, %ymm6
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-NEXT: vpand %ymm7, %ymm6, %ymm6
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512F-NEXT: vpor %ymm7, %ymm5, %ymm5
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm5[8],ymm0[8],ymm5[9],ymm0[9],ymm5[10],ymm0[10],ymm5[11],ymm0[11],ymm5[12],ymm0[12],ymm5[13],ymm0[13],ymm5[14],ymm0[14],ymm5[15],ymm0[15],ymm5[24],ymm0[24],ymm5[25],ymm0[25],ymm5[26],ymm0[26],ymm5[27],ymm0[27],ymm5[28],ymm0[28],ymm5[29],ymm0[29],ymm5[30],ymm0[30],ymm5[31],ymm0[31]
+; AVX512F-NEXT: vpmullw %ymm6, %ymm8, %ymm6
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpand %ymm6, %ymm8, %ymm6
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm0[0],ymm5[1],ymm0[1],ymm5[2],ymm0[2],ymm5[3],ymm0[3],ymm5[4],ymm0[4],ymm5[5],ymm0[5],ymm5[6],ymm0[6],ymm5[7],ymm0[7],ymm5[16],ymm0[16],ymm5[17],ymm0[17],ymm5[18],ymm0[18],ymm5[19],ymm0[19],ymm5[20],ymm0[20],ymm5[21],ymm0[21],ymm5[22],ymm0[22],ymm5[23],ymm0[23]
; AVX512F-NEXT: vpmullw %ymm5, %ymm2, %ymm2
-; AVX512F-NEXT: vpand %ymm7, %ymm2, %ymm2
+; AVX512F-NEXT: vpand %ymm2, %ymm8, %ymm2
; AVX512F-NEXT: vpackuswb %ymm6, %ymm2, %ymm2
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31]
+; AVX512F-NEXT: vpor %ymm7, %ymm4, %ymm4
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm4[8],ymm0[8],ymm4[9],ymm0[9],ymm4[10],ymm0[10],ymm4[11],ymm0[11],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15],ymm4[24],ymm0[24],ymm4[25],ymm0[25],ymm4[26],ymm0[26],ymm4[27],ymm0[27],ymm4[28],ymm0[28],ymm4[29],ymm0[29],ymm4[30],ymm0[30],ymm4[31],ymm0[31]
; AVX512F-NEXT: vpmullw %ymm6, %ymm5, %ymm5
-; AVX512F-NEXT: vpand %ymm7, %ymm5, %ymm5
+; AVX512F-NEXT: vpand %ymm5, %ymm8, %ymm5
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[16],ymm0[16],ymm4[17],ymm0[17],ymm4[18],ymm0[18],ymm4[19],ymm0[19],ymm4[20],ymm0[20],ymm4[21],ymm0[21],ymm4[22],ymm0[22],ymm4[23],ymm0[23]
; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3
-; AVX512F-NEXT: vpand %ymm7, %ymm3, %ymm3
+; AVX512F-NEXT: vpand %ymm3, %ymm8, %ymm3
; AVX512F-NEXT: vpackuswb %ymm5, %ymm3, %ymm3
; AVX512F-NEXT: vpaddb %ymm1, %ymm3, %ymm1
; AVX512F-NEXT: vpaddb %ymm0, %ymm2, %ymm0
;
; AVX512VL-FALLBACK-LABEL: vec512_i8_signed_mem_mem:
; AVX512VL-FALLBACK: # %bb.0:
-; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512VL-FALLBACK-NEXT: vmovdqa (%rsi), %ymm2
; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rsi), %ymm3
+; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm4
-; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX512VL-FALLBACK-NEXT: vpor %ymm5, %ymm4, %ymm4
-; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm6
-; AVX512VL-FALLBACK-NEXT: vpor %ymm5, %ymm6, %ymm5
+; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm5
; AVX512VL-FALLBACK-NEXT: vpminsb %ymm3, %ymm1, %ymm6
; AVX512VL-FALLBACK-NEXT: vpminsb %ymm2, %ymm0, %ymm7
; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm3, %ymm1, %ymm3
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
-; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm5[8],ymm0[8],ymm5[9],ymm0[9],ymm5[10],ymm0[10],ymm5[11],ymm0[11],ymm5[12],ymm0[12],ymm5[13],ymm0[13],ymm5[14],ymm0[14],ymm5[15],ymm0[15],ymm5[24],ymm0[24],ymm5[25],ymm0[25],ymm5[26],ymm0[26],ymm5[27],ymm0[27],ymm5[28],ymm0[28],ymm5[29],ymm0[29],ymm5[30],ymm0[30],ymm5[31],ymm0[31]
-; AVX512VL-FALLBACK-NEXT: vpmullw %ymm7, %ymm6, %ymm6
-; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm6, %ymm6
+; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512VL-FALLBACK-NEXT: vpor %ymm7, %ymm5, %ymm5
+; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm5[8],ymm0[8],ymm5[9],ymm0[9],ymm5[10],ymm0[10],ymm5[11],ymm0[11],ymm5[12],ymm0[12],ymm5[13],ymm0[13],ymm5[14],ymm0[14],ymm5[15],ymm0[15],ymm5[24],ymm0[24],ymm5[25],ymm0[25],ymm5[26],ymm0[26],ymm5[27],ymm0[27],ymm5[28],ymm0[28],ymm5[29],ymm0[29],ymm5[30],ymm0[30],ymm5[31],ymm0[31]
+; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm8, %ymm6
+; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm8, %ymm6
; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm0[0],ymm5[1],ymm0[1],ymm5[2],ymm0[2],ymm5[3],ymm0[3],ymm5[4],ymm0[4],ymm5[5],ymm0[5],ymm5[6],ymm0[6],ymm5[7],ymm0[7],ymm5[16],ymm0[16],ymm5[17],ymm0[17],ymm5[18],ymm0[18],ymm5[19],ymm0[19],ymm5[20],ymm0[20],ymm5[21],ymm0[21],ymm5[22],ymm0[22],ymm5[23],ymm0[23]
; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpand %ymm2, %ymm8, %ymm2
; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm6, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31]
+; AVX512VL-FALLBACK-NEXT: vpor %ymm7, %ymm4, %ymm4
; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm4[8],ymm0[8],ymm4[9],ymm0[9],ymm4[10],ymm0[10],ymm4[11],ymm0[11],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15],ymm4[24],ymm0[24],ymm4[25],ymm0[25],ymm4[26],ymm0[26],ymm4[27],ymm0[27],ymm4[28],ymm0[28],ymm4[29],ymm0[29],ymm4[30],ymm0[30],ymm4[31],ymm0[31]
; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm5, %ymm5
-; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm5, %ymm5
+; AVX512VL-FALLBACK-NEXT: vpand %ymm5, %ymm8, %ymm5
; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[16],ymm0[16],ymm4[17],ymm0[17],ymm4[18],ymm0[18],ymm4[19],ymm0[19],ymm4[20],ymm0[20],ymm4[21],ymm0[21],ymm4[22],ymm0[22],ymm4[23],ymm0[23]
; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm3, %ymm3
-; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm3, %ymm3
+; AVX512VL-FALLBACK-NEXT: vpand %ymm3, %ymm8, %ymm3
; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm5, %ymm3, %ymm3
; AVX512VL-FALLBACK-NEXT: vpaddb %ymm1, %ymm3, %ymm1
; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm2, %ymm0
;
; KNL-LABEL: allones_v32i16_sign:
; KNL: # %bb.0:
-; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; KNL-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0
-; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
-; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; KNL-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm2
+; KNL-NEXT: vpmovsxwd %ymm2, %zmm2
+; KNL-NEXT: vptestmd %zmm2, %zmm2, %k0
; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm0
+; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; KNL-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %ecx
;
; KNL-LABEL: allzeros_v32i16_sign:
; KNL: # %bb.0:
-; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; KNL-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0
-; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
-; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; KNL-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm2
+; KNL-NEXT: vpmovsxwd %ymm2, %zmm2
+; KNL-NEXT: vptestmd %zmm2, %zmm2, %k0
; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm0
+; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; KNL-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %ecx
;
; KNL-LABEL: allones_v64i8_and1:
; KNL: # %bb.0:
-; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; KNL-NEXT: vpsllw $7, %ymm0, %ymm1
+; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; KNL-NEXT: vpsllw $7, %ymm0, %ymm0
-; KNL-NEXT: vpsllw $7, %ymm1, %ymm1
-; KNL-NEXT: vpmovmskb %ymm1, %eax
+; KNL-NEXT: vpmovmskb %ymm0, %eax
; KNL-NEXT: shlq $32, %rax
-; KNL-NEXT: vpmovmskb %ymm0, %ecx
+; KNL-NEXT: vpmovmskb %ymm1, %ecx
; KNL-NEXT: orq %rax, %rcx
; KNL-NEXT: cmpq $-1, %rcx
; KNL-NEXT: sete %al
;
; KNL-LABEL: allzeros_v64i8_and1:
; KNL: # %bb.0:
-; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; KNL-NEXT: vpsllw $7, %ymm0, %ymm1
+; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; KNL-NEXT: vpsllw $7, %ymm0, %ymm0
-; KNL-NEXT: vpsllw $7, %ymm1, %ymm1
-; KNL-NEXT: vpmovmskb %ymm1, %eax
+; KNL-NEXT: vpmovmskb %ymm0, %eax
; KNL-NEXT: shlq $32, %rax
-; KNL-NEXT: vpmovmskb %ymm0, %ecx
+; KNL-NEXT: vpmovmskb %ymm1, %ecx
; KNL-NEXT: orq %rax, %rcx
; KNL-NEXT: sete %al
; KNL-NEXT: vzeroupper
;
; KNL-LABEL: allones_v32i16_and1:
; KNL: # %bb.0:
-; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; KNL-NEXT: vpsllw $15, %ymm0, %ymm0
-; KNL-NEXT: vpsraw $15, %ymm0, %ymm0
-; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
-; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT: vpsllw $15, %ymm0, %ymm1
+; KNL-NEXT: vpsraw $15, %ymm1, %ymm1
+; KNL-NEXT: vpmovsxwd %ymm1, %zmm1
+; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpsllw $15, %ymm1, %ymm0
+; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; KNL-NEXT: vpsllw $15, %ymm0, %ymm0
; KNL-NEXT: vpsraw $15, %ymm0, %ymm0
; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
;
; KNL-LABEL: allzeros_v32i16_and1:
; KNL: # %bb.0:
-; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; KNL-NEXT: vpsllw $15, %ymm0, %ymm0
-; KNL-NEXT: vpsraw $15, %ymm0, %ymm0
-; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
-; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT: vpsllw $15, %ymm0, %ymm1
+; KNL-NEXT: vpsraw $15, %ymm1, %ymm1
+; KNL-NEXT: vpmovsxwd %ymm1, %zmm1
+; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpsllw $15, %ymm1, %ymm0
+; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; KNL-NEXT: vpsllw $15, %ymm0, %ymm0
; KNL-NEXT: vpsraw $15, %ymm0, %ymm0
; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
;
; KNL-LABEL: allones_v64i8_and4:
; KNL: # %bb.0:
-; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; KNL-NEXT: vpsllw $5, %ymm0, %ymm1
+; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; KNL-NEXT: vpsllw $5, %ymm0, %ymm0
-; KNL-NEXT: vpsllw $5, %ymm1, %ymm1
-; KNL-NEXT: vpmovmskb %ymm1, %eax
+; KNL-NEXT: vpmovmskb %ymm0, %eax
; KNL-NEXT: shlq $32, %rax
-; KNL-NEXT: vpmovmskb %ymm0, %ecx
+; KNL-NEXT: vpmovmskb %ymm1, %ecx
; KNL-NEXT: orq %rax, %rcx
; KNL-NEXT: cmpq $-1, %rcx
; KNL-NEXT: sete %al
;
; KNL-LABEL: allzeros_v64i8_and4:
; KNL: # %bb.0:
-; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; KNL-NEXT: vpsllw $5, %ymm0, %ymm1
+; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; KNL-NEXT: vpsllw $5, %ymm0, %ymm0
-; KNL-NEXT: vpsllw $5, %ymm1, %ymm1
-; KNL-NEXT: vpmovmskb %ymm1, %eax
+; KNL-NEXT: vpmovmskb %ymm0, %eax
; KNL-NEXT: shlq $32, %rax
-; KNL-NEXT: vpmovmskb %ymm0, %ecx
+; KNL-NEXT: vpmovmskb %ymm1, %ecx
; KNL-NEXT: orq %rax, %rcx
; KNL-NEXT: sete %al
; KNL-NEXT: vzeroupper
;
; KNL-LABEL: allones_v32i16_and4:
; KNL: # %bb.0:
-; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; KNL-NEXT: vpsllw $13, %ymm0, %ymm0
-; KNL-NEXT: vpsraw $15, %ymm0, %ymm0
-; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
-; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT: vpsllw $13, %ymm0, %ymm1
+; KNL-NEXT: vpsraw $15, %ymm1, %ymm1
+; KNL-NEXT: vpmovsxwd %ymm1, %zmm1
+; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpsllw $13, %ymm1, %ymm0
+; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; KNL-NEXT: vpsllw $13, %ymm0, %ymm0
; KNL-NEXT: vpsraw $15, %ymm0, %ymm0
; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
;
; KNL-LABEL: allzeros_v32i16_and4:
; KNL: # %bb.0:
-; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; KNL-NEXT: vpsllw $13, %ymm0, %ymm0
-; KNL-NEXT: vpsraw $15, %ymm0, %ymm0
-; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
-; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT: vpsllw $13, %ymm0, %ymm1
+; KNL-NEXT: vpsraw $15, %ymm1, %ymm1
+; KNL-NEXT: vpmovsxwd %ymm1, %zmm1
+; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpsllw $13, %ymm1, %ymm0
+; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; KNL-NEXT: vpsllw $13, %ymm0, %ymm0
; KNL-NEXT: vpsraw $15, %ymm0, %ymm0
; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX2-NEXT: popq %rbp
; AVX2-NEXT: retq
;
-; AVX512DQ-LABEL: test_v32i16_align16:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: pushq %rbp
-; AVX512DQ-NEXT: movq %rsp, %rbp
-; AVX512DQ-NEXT: andq $-32, %rsp
-; AVX512DQ-NEXT: subq $96, %rsp
-; AVX512DQ-NEXT: vmovntdqa 16(%rdi), %xmm0
-; AVX512DQ-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
-; AVX512DQ-NEXT: vmovntdqa (%rdi), %xmm0
-; AVX512DQ-NEXT: vmovdqa %xmm0, (%rsp)
-; AVX512DQ-NEXT: vmovntdqa 48(%rdi), %xmm0
-; AVX512DQ-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
-; AVX512DQ-NEXT: vmovntdqa 32(%rdi), %xmm0
-; AVX512DQ-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
-; AVX512DQ-NEXT: vmovaps (%rsp), %ymm0
-; AVX512DQ-NEXT: vinsertf64x4 $1, {{[0-9]+}}(%rsp), %zmm0, %zmm0
-; AVX512DQ-NEXT: movq %rbp, %rsp
-; AVX512DQ-NEXT: popq %rbp
-; AVX512DQ-NEXT: retq
-;
-; AVX512BW-LABEL: test_v32i16_align16:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: pushq %rbp
-; AVX512BW-NEXT: movq %rsp, %rbp
-; AVX512BW-NEXT: andq $-64, %rsp
-; AVX512BW-NEXT: subq $128, %rsp
-; AVX512BW-NEXT: vmovntdqa 48(%rdi), %xmm0
-; AVX512BW-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
-; AVX512BW-NEXT: vmovntdqa 32(%rdi), %xmm0
-; AVX512BW-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
-; AVX512BW-NEXT: vmovntdqa 16(%rdi), %xmm0
-; AVX512BW-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
-; AVX512BW-NEXT: vmovntdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vmovdqa %xmm0, (%rsp)
-; AVX512BW-NEXT: vmovaps (%rsp), %zmm0
-; AVX512BW-NEXT: movq %rbp, %rsp
-; AVX512BW-NEXT: popq %rbp
-; AVX512BW-NEXT: retq
+; AVX512-LABEL: test_v32i16_align16:
+; AVX512: # %bb.0:
+; AVX512-NEXT: pushq %rbp
+; AVX512-NEXT: movq %rsp, %rbp
+; AVX512-NEXT: andq $-64, %rsp
+; AVX512-NEXT: subq $128, %rsp
+; AVX512-NEXT: vmovntdqa 48(%rdi), %xmm0
+; AVX512-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovntdqa 32(%rdi), %xmm0
+; AVX512-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovntdqa 16(%rdi), %xmm0
+; AVX512-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovntdqa (%rdi), %xmm0
+; AVX512-NEXT: vmovdqa %xmm0, (%rsp)
+; AVX512-NEXT: vmovaps (%rsp), %zmm0
+; AVX512-NEXT: movq %rbp, %rsp
+; AVX512-NEXT: popq %rbp
+; AVX512-NEXT: retq
%1 = load <32 x i16>, <32 x i16>* %src, align 16, !nontemporal !1
ret <32 x i16> %1
}
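The "!nontemporal !1" operand above refers to metadata defined elsewhere in the test file, outside this hunk. For context, a minimal self-contained version of this test might look as follows; the RUN line and check prefix here are assumptions modeled on the other RUN lines in this patch, while the load itself is copied from the test above. LLVM encodes the nontemporal hint as the metadata constant i32 1:

; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f | FileCheck %s --check-prefix=AVX512

define <32 x i16> @test_v32i16_align16(<32 x i16>* %src) nounwind {
  ; A 16-byte-aligned nontemporal load of a 512-bit vector. The hint requests
  ; vmovntdqa, which needs only 16-byte alignment in its 128-bit form, so the
  ; merged AVX512 output above loads four 128-bit pieces nontemporally and
  ; reassembles them through a 64-byte-aligned stack slot.
  %1 = load <32 x i16>, <32 x i16>* %src, align 16, !nontemporal !1
  ret <32 x i16> %1
}

; The nontemporal marker is the i32 constant 1.
!1 = !{i32 1}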
; AVX2-NEXT: popq %rbp
; AVX2-NEXT: retq
;
-; AVX512DQ-LABEL: test_v64i8_align16:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: pushq %rbp
-; AVX512DQ-NEXT: movq %rsp, %rbp
-; AVX512DQ-NEXT: andq $-32, %rsp
-; AVX512DQ-NEXT: subq $96, %rsp
-; AVX512DQ-NEXT: vmovntdqa 16(%rdi), %xmm0
-; AVX512DQ-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
-; AVX512DQ-NEXT: vmovntdqa (%rdi), %xmm0
-; AVX512DQ-NEXT: vmovdqa %xmm0, (%rsp)
-; AVX512DQ-NEXT: vmovntdqa 48(%rdi), %xmm0
-; AVX512DQ-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
-; AVX512DQ-NEXT: vmovntdqa 32(%rdi), %xmm0
-; AVX512DQ-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
-; AVX512DQ-NEXT: vmovaps (%rsp), %ymm0
-; AVX512DQ-NEXT: vinsertf64x4 $1, {{[0-9]+}}(%rsp), %zmm0, %zmm0
-; AVX512DQ-NEXT: movq %rbp, %rsp
-; AVX512DQ-NEXT: popq %rbp
-; AVX512DQ-NEXT: retq
-;
-; AVX512BW-LABEL: test_v64i8_align16:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: pushq %rbp
-; AVX512BW-NEXT: movq %rsp, %rbp
-; AVX512BW-NEXT: andq $-64, %rsp
-; AVX512BW-NEXT: subq $128, %rsp
-; AVX512BW-NEXT: vmovntdqa 48(%rdi), %xmm0
-; AVX512BW-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
-; AVX512BW-NEXT: vmovntdqa 32(%rdi), %xmm0
-; AVX512BW-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
-; AVX512BW-NEXT: vmovntdqa 16(%rdi), %xmm0
-; AVX512BW-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
-; AVX512BW-NEXT: vmovntdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vmovdqa %xmm0, (%rsp)
-; AVX512BW-NEXT: vmovaps (%rsp), %zmm0
-; AVX512BW-NEXT: movq %rbp, %rsp
-; AVX512BW-NEXT: popq %rbp
-; AVX512BW-NEXT: retq
+; AVX512-LABEL: test_v64i8_align16:
+; AVX512: # %bb.0:
+; AVX512-NEXT: pushq %rbp
+; AVX512-NEXT: movq %rsp, %rbp
+; AVX512-NEXT: andq $-64, %rsp
+; AVX512-NEXT: subq $128, %rsp
+; AVX512-NEXT: vmovntdqa 48(%rdi), %xmm0
+; AVX512-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovntdqa 32(%rdi), %xmm0
+; AVX512-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovntdqa 16(%rdi), %xmm0
+; AVX512-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovntdqa (%rdi), %xmm0
+; AVX512-NEXT: vmovdqa %xmm0, (%rsp)
+; AVX512-NEXT: vmovaps (%rsp), %zmm0
+; AVX512-NEXT: movq %rbp, %rsp
+; AVX512-NEXT: popq %rbp
+; AVX512-NEXT: retq
%1 = load <64 x i8>, <64 x i8>* %src, align 16, !nontemporal !1
ret <64 x i8> %1
}
; AVX2-NEXT: vmovntdqa 32(%rdi), %ymm1
; AVX2-NEXT: retq
;
-; AVX512DQ-LABEL: test_v32i16_align32:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vmovntdqa (%rdi), %ymm0
-; AVX512DQ-NEXT: vmovntdqa 32(%rdi), %ymm1
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: retq
-;
-; AVX512BW-LABEL: test_v32i16_align32:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: pushq %rbp
-; AVX512BW-NEXT: movq %rsp, %rbp
-; AVX512BW-NEXT: andq $-64, %rsp
-; AVX512BW-NEXT: subq $128, %rsp
-; AVX512BW-NEXT: vmovntdqa 32(%rdi), %ymm0
-; AVX512BW-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp)
-; AVX512BW-NEXT: vmovntdqa (%rdi), %ymm0
-; AVX512BW-NEXT: vmovdqa %ymm0, (%rsp)
-; AVX512BW-NEXT: vmovaps (%rsp), %zmm0
-; AVX512BW-NEXT: movq %rbp, %rsp
-; AVX512BW-NEXT: popq %rbp
-; AVX512BW-NEXT: retq
+; AVX512-LABEL: test_v32i16_align32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: pushq %rbp
+; AVX512-NEXT: movq %rsp, %rbp
+; AVX512-NEXT: andq $-64, %rsp
+; AVX512-NEXT: subq $128, %rsp
+; AVX512-NEXT: vmovntdqa 32(%rdi), %ymm0
+; AVX512-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX512-NEXT: vmovdqa %ymm0, (%rsp)
+; AVX512-NEXT: vmovaps (%rsp), %zmm0
+; AVX512-NEXT: movq %rbp, %rsp
+; AVX512-NEXT: popq %rbp
+; AVX512-NEXT: retq
%1 = load <32 x i16>, <32 x i16>* %src, align 32, !nontemporal !1
ret <32 x i16> %1
}
; AVX2-NEXT: vmovntdqa 32(%rdi), %ymm1
; AVX2-NEXT: retq
;
-; AVX512DQ-LABEL: test_v64i8_align32:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vmovntdqa (%rdi), %ymm0
-; AVX512DQ-NEXT: vmovntdqa 32(%rdi), %ymm1
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: retq
-;
-; AVX512BW-LABEL: test_v64i8_align32:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: pushq %rbp
-; AVX512BW-NEXT: movq %rsp, %rbp
-; AVX512BW-NEXT: andq $-64, %rsp
-; AVX512BW-NEXT: subq $128, %rsp
-; AVX512BW-NEXT: vmovntdqa 32(%rdi), %ymm0
-; AVX512BW-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp)
-; AVX512BW-NEXT: vmovntdqa (%rdi), %ymm0
-; AVX512BW-NEXT: vmovdqa %ymm0, (%rsp)
-; AVX512BW-NEXT: vmovaps (%rsp), %zmm0
-; AVX512BW-NEXT: movq %rbp, %rsp
-; AVX512BW-NEXT: popq %rbp
-; AVX512BW-NEXT: retq
+; AVX512-LABEL: test_v64i8_align32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: pushq %rbp
+; AVX512-NEXT: movq %rsp, %rbp
+; AVX512-NEXT: andq $-64, %rsp
+; AVX512-NEXT: subq $128, %rsp
+; AVX512-NEXT: vmovntdqa 32(%rdi), %ymm0
+; AVX512-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX512-NEXT: vmovdqa %ymm0, (%rsp)
+; AVX512-NEXT: vmovaps (%rsp), %zmm0
+; AVX512-NEXT: movq %rbp, %rsp
+; AVX512-NEXT: popq %rbp
+; AVX512-NEXT: retq
%1 = load <64 x i8>, <64 x i8>* %src, align 32, !nontemporal !1
ret <64 x i8> %1
}
;
; AVX512F-LABEL: test_arg_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512F-NEXT: vmovntdqa 32(%rdi), %ymm1
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512F-NEXT: vpaddw %ymm1, %ymm2, %ymm1
; AVX512F-NEXT: vmovntdqa (%rdi), %ymm2
-; AVX512F-NEXT: vmovntdqa 32(%rdi), %ymm3
-; AVX512F-NEXT: vpaddw %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: test_arg_v32i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512VL-NEXT: vmovntdqa 32(%rdi), %ymm1
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512VL-NEXT: vpaddw %ymm1, %ymm2, %ymm1
; AVX512VL-NEXT: vmovntdqa (%rdi), %ymm2
-; AVX512VL-NEXT: vmovntdqa 32(%rdi), %ymm3
-; AVX512VL-NEXT: vpaddw %ymm3, %ymm1, %ymm1
; AVX512VL-NEXT: vpaddw %ymm2, %ymm0, %ymm0
; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512F-LABEL: test_arg_v64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512F-NEXT: vmovntdqa 32(%rdi), %ymm1
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512F-NEXT: vpaddb %ymm1, %ymm2, %ymm1
; AVX512F-NEXT: vmovntdqa (%rdi), %ymm2
-; AVX512F-NEXT: vmovntdqa 32(%rdi), %ymm3
-; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: test_arg_v64i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512VL-NEXT: vmovntdqa 32(%rdi), %ymm1
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512VL-NEXT: vpaddb %ymm1, %ymm2, %ymm1
; AVX512VL-NEXT: vmovntdqa (%rdi), %ymm2
-; AVX512VL-NEXT: vmovntdqa 32(%rdi), %ymm3
-; AVX512VL-NEXT: vpaddb %ymm3, %ymm1, %ymm1
; AVX512VL-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512VL-NEXT: retq
; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm1
; AVX512F-NEXT: vmovdqa 64(%rsi), %ymm2
; AVX512F-NEXT: vmovdqa 96(%rsi), %ymm3
-; AVX512F-NEXT: vpmaddubsw (%rdi), %ymm0, %ymm0
; AVX512F-NEXT: vpmaddubsw 32(%rdi), %ymm1, %ymm1
+; AVX512F-NEXT: vpmaddubsw (%rdi), %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpmaddubsw 96(%rdi), %ymm3, %ymm1
; AVX512F-NEXT: vpmaddubsw 64(%rdi), %ymm2, %ymm2
-; AVX512F-NEXT: vpmaddubsw 96(%rdi), %ymm3, %ymm3
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: pmaddubsw_512:
;
; AVX512F-LABEL: mul_v64i8:
; AVX512F: # %bb.0: # %entry
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31]
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
-; AVX512F-NEXT: vpmullw %ymm4, %ymm5, %ymm4
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm4
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm0[8],ymm4[9],ymm0[9],ymm4[10],ymm0[10],ymm4[11],ymm0[11],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15],ymm4[24],ymm0[24],ymm4[25],ymm0[25],ymm4[26],ymm0[26],ymm4[27],ymm0[27],ymm4[28],ymm0[28],ymm4[29],ymm0[29],ymm4[30],ymm0[30],ymm4[31],ymm0[31]
+; AVX512F-NEXT: vpmullw %ymm3, %ymm5, %ymm3
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
+; AVX512F-NEXT: vpand %ymm5, %ymm3, %ymm3
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
-; AVX512F-NEXT: vpmullw %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[16],ymm0[16],ymm4[17],ymm0[17],ymm4[18],ymm0[18],ymm4[19],ymm0[19],ymm4[20],ymm0[20],ymm4[21],ymm0[21],ymm4[22],ymm0[22],ymm4[23],ymm0[23]
+; AVX512F-NEXT: vpmullw %ymm2, %ymm4, %ymm2
; AVX512F-NEXT: vpand %ymm5, %ymm2, %ymm2
-; AVX512F-NEXT: vpackuswb %ymm4, %ymm2, %ymm2
+; AVX512F-NEXT: vpackuswb %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512F-NEXT: vpmullw %ymm3, %ymm4, %ymm3
;
; AVX512F-LABEL: mulhuw_v64i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpmulhuw %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vpmulhuw %ymm5, %ymm1, %ymm1
-; AVX512F-NEXT: vpmulhuw %ymm6, %ymm2, %ymm2
-; AVX512F-NEXT: vpmulhuw %ymm7, %ymm3, %ymm3
+; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm4
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm5
+; AVX512F-NEXT: vpmulhuw %ymm4, %ymm5, %ymm4
+; AVX512F-NEXT: vpmulhuw %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
+; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-NEXT: vpmulhuw %ymm2, %ymm4, %ymm2
+; AVX512F-NEXT: vpmulhuw %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: mulhuw_v64i16:
;
; AVX512F-LABEL: mulhw_v64i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpmulhw %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vpmulhw %ymm5, %ymm1, %ymm1
-; AVX512F-NEXT: vpmulhw %ymm6, %ymm2, %ymm2
-; AVX512F-NEXT: vpmulhw %ymm7, %ymm3, %ymm3
+; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm4
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm5
+; AVX512F-NEXT: vpmulhw %ymm4, %ymm5, %ymm4
+; AVX512F-NEXT: vpmulhw %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
+; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-NEXT: vpmulhw %ymm2, %ymm4, %ymm2
+; AVX512F-NEXT: vpmulhw %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: mulhw_v64i16:
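
For reference, tests like mulhuw_v64i16/mulhw_v64i16 typically express the high-half multiply as a widening multiply followed by a shift and truncate, which the backend matches to MULHU/MULHS. A minimal sketch of that pattern (hypothetical function name, not taken from this patch):

define <64 x i16> @mulhuw_v64i16_sketch(<64 x i16> %a, <64 x i16> %b) {
  ; Widen both operands, multiply in i32, then keep the high 16 bits.
  %xa = zext <64 x i16> %a to <64 x i32>
  %xb = zext <64 x i16> %b to <64 x i32>
  %m = mul <64 x i32> %xa, %xb
  %hi = lshr <64 x i32> %m,
    <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16,
     i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16,
     i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16,
     i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16,
     i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16,
     i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16,
     i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16,
     i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %r = trunc <64 x i32> %hi to <64 x i16>
  ret <64 x i16> %r
}

Consistent with the new calling convention, the AVX512F checks above now show each 1024-bit argument arriving in two ZMM registers and being split with vextracti64x4 around vpmulhuw/vpmulhw, instead of arriving in four YMM registers.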
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,X64
define <16 x float> @PR45443() {
-; CHECK-LABEL: PR45443:
-; CHECK: # %bb.0: # %bb
-; CHECK-NEXT: vfmadd231ps {{.*#+}} zmm0 = (zmm0 * mem) + zmm0
-; CHECK-NEXT: ret{{[l|q]}}
+; X86-LABEL: PR45443:
+; X86: # %bb.0: # %bb
+; X86-NEXT: vpbroadcastd {{.*#+}} zmm1 = [2181038080,2181038080,2181038080,2181038080,2181038080,2181038080,2181038080,2181038080,2181038080,2181038080,2181038080,2181038080,2181038080,2181038080,2181038080,2181038080]
+; X86-NEXT: vfmadd231ps {{.*#+}} zmm0 = (zmm0 * mem) + zmm0
+; X86-NEXT: vpcmpltud {{\.LCPI.*}}{1to16}, %zmm1, %k1
+; X86-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; X86-NEXT: vpbroadcastd {{.*#+}} ymm3 = [16777215,16777215,16777215,16777215,16777215,16777215,16777215,16777215]
+; X86-NEXT: vpand %ymm3, %ymm2, %ymm2
+; X86-NEXT: vpand %ymm3, %ymm1, %ymm1
+; X86-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; X86-NEXT: vptestmd %zmm1, %zmm1, %k1 {%k1}
+; X86-NEXT: vbroadcastss {{\.LCPI.*}}, %zmm0 {%k1}
+; X86-NEXT: retl
+;
+; X64-LABEL: PR45443:
+; X64: # %bb.0: # %bb
+; X64-NEXT: vpbroadcastd {{.*#+}} zmm1 = [2181038080,2181038080,2181038080,2181038080,2181038080,2181038080,2181038080,2181038080,2181038080,2181038080,2181038080,2181038080,2181038080,2181038080,2181038080,2181038080]
+; X64-NEXT: vfmadd231ps {{.*#+}} zmm0 = (zmm0 * mem) + zmm0
+; X64-NEXT: vpcmpltud {{.*}}(%rip){1to16}, %zmm1, %k1
+; X64-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; X64-NEXT: vpbroadcastd {{.*#+}} ymm3 = [16777215,16777215,16777215,16777215,16777215,16777215,16777215,16777215]
+; X64-NEXT: vpand %ymm3, %ymm2, %ymm2
+; X64-NEXT: vpand %ymm3, %ymm1, %ymm1
+; X64-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; X64-NEXT: vptestmd %zmm1, %zmm1, %k1 {%k1}
+; X64-NEXT: vbroadcastss {{.*}}(%rip), %zmm0 {%k1}
+; X64-NEXT: retq
bb:
%tmp = tail call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> <i32 1090519040, i32 1090519040, i32 1090519040, i32 1090519040, i32 1090519040, i32 1090519040, i32 1090519040, i32 1090519040, i32 1090519040, i32 1090519040, i32 1090519040, i32 1090519040, i32 1090519040, i32 1090519040, i32 1090519040, i32 1090519040>, <4 x i32> <i32 1, i32 0, i32 undef, i32 undef>)
%tmp4 = tail call fast <16 x float> @llvm.fma.v16f32(<16 x float> undef, <16 x float> <float 0x3FE6300000000000, float 0x3FE6300000000000, float 0x3FE6300000000000, float 0x3FE6300000000000, float 0x3FE6300000000000, float 0x3FE6300000000000, float 0x3FE6300000000000, float 0x3FE6300000000000, float 0x3FE6300000000000, float 0x3FE6300000000000, float 0x3FE6300000000000, float 0x3FE6300000000000, float 0x3FE6300000000000, float 0x3FE6300000000000, float 0x3FE6300000000000, float 0x3FE6300000000000>, <16 x float> undef)
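; Declarations for the two intrinsics used in the snippet above; these are
; their standard signatures, added here so the excerpt is self-contained:
declare <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32>, <4 x i32>)
declare <16 x float> @llvm.fma.v16f32(<16 x float>, <16 x float>, <16 x float>)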
; AVX512F-NEXT: pushq %rbp
; AVX512F-NEXT: movq %rsp, %rbp
; AVX512F-NEXT: andq $-64, %rsp
-; AVX512F-NEXT: subq $2112, %rsp # imm = 0x840
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3
-; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm4
+; AVX512F-NEXT: subq $128, %rsp
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vextracti32x4 $2, %zmm1, %xmm3
+; AVX512F-NEXT: vextracti32x4 $3, %zmm1, %xmm4
; AVX512F-NEXT: vmovd %xmm4, %eax
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, (%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vmovaps %zmm0, (%rsp)
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movzwl 1536(%rsp,%rax,2), %eax
+; AVX512F-NEXT: movzwl (%rsp,%rax,2), %eax
; AVX512F-NEXT: vmovd %eax, %xmm0
; AVX512F-NEXT: vpextrw $1, %xmm4, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: vpinsrw $1, 1600(%rsp,%rax,2), %xmm0, %xmm0
+; AVX512F-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm0, %xmm0
; AVX512F-NEXT: vpextrw $2, %xmm4, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: vpinsrw $2, 1664(%rsp,%rax,2), %xmm0, %xmm0
+; AVX512F-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm0, %xmm0
; AVX512F-NEXT: vpextrw $3, %xmm4, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: vpinsrw $3, 1728(%rsp,%rax,2), %xmm0, %xmm0
+; AVX512F-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm0, %xmm0
; AVX512F-NEXT: vpextrw $4, %xmm4, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: vpinsrw $4, 1792(%rsp,%rax,2), %xmm0, %xmm0
+; AVX512F-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm0, %xmm0
; AVX512F-NEXT: vpextrw $5, %xmm4, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: vpinsrw $5, 1856(%rsp,%rax,2), %xmm0, %xmm0
+; AVX512F-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm0, %xmm0
; AVX512F-NEXT: vpextrw $6, %xmm4, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: vpinsrw $6, 1920(%rsp,%rax,2), %xmm0, %xmm0
+; AVX512F-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm0, %xmm0
; AVX512F-NEXT: vpextrw $7, %xmm4, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: vpinsrw $7, 1984(%rsp,%rax,2), %xmm0, %xmm0
-; AVX512F-NEXT: vmovd %xmm2, %eax
+; AVX512F-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX512F-NEXT: vmovd %xmm3, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: movzwl 1024(%rsp,%rax,2), %eax
+; AVX512F-NEXT: movzwl (%rsp,%rax,2), %eax
; AVX512F-NEXT: vmovd %eax, %xmm4
-; AVX512F-NEXT: vpextrw $1, %xmm2, %eax
+; AVX512F-NEXT: vpextrw $1, %xmm3, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: vpinsrw $1, 1088(%rsp,%rax,2), %xmm4, %xmm4
-; AVX512F-NEXT: vpextrw $2, %xmm2, %eax
+; AVX512F-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm4, %xmm4
+; AVX512F-NEXT: vpextrw $2, %xmm3, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: vpinsrw $2, 1152(%rsp,%rax,2), %xmm4, %xmm4
-; AVX512F-NEXT: vpextrw $3, %xmm2, %eax
+; AVX512F-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm4, %xmm4
+; AVX512F-NEXT: vpextrw $3, %xmm3, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: vpinsrw $3, 1216(%rsp,%rax,2), %xmm4, %xmm4
-; AVX512F-NEXT: vpextrw $4, %xmm2, %eax
+; AVX512F-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm4, %xmm4
+; AVX512F-NEXT: vpextrw $4, %xmm3, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: vpinsrw $4, 1280(%rsp,%rax,2), %xmm4, %xmm4
-; AVX512F-NEXT: vpextrw $5, %xmm2, %eax
+; AVX512F-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm4, %xmm4
+; AVX512F-NEXT: vpextrw $5, %xmm3, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: vpinsrw $5, 1344(%rsp,%rax,2), %xmm4, %xmm4
-; AVX512F-NEXT: vpextrw $6, %xmm2, %eax
+; AVX512F-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm4, %xmm4
+; AVX512F-NEXT: vpextrw $6, %xmm3, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: vpinsrw $6, 1408(%rsp,%rax,2), %xmm4, %xmm4
-; AVX512F-NEXT: vpextrw $7, %xmm2, %eax
+; AVX512F-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm4, %xmm4
+; AVX512F-NEXT: vpextrw $7, %xmm3, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: vpinsrw $7, 1472(%rsp,%rax,2), %xmm4, %xmm2
-; AVX512F-NEXT: vmovd %xmm3, %eax
+; AVX512F-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm4, %xmm3
+; AVX512F-NEXT: vmovd %xmm2, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: movzwl 512(%rsp,%rax,2), %eax
+; AVX512F-NEXT: movzwl (%rsp,%rax,2), %eax
; AVX512F-NEXT: vmovd %eax, %xmm4
-; AVX512F-NEXT: vpextrw $1, %xmm3, %eax
+; AVX512F-NEXT: vpextrw $1, %xmm2, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: vpinsrw $1, 576(%rsp,%rax,2), %xmm4, %xmm4
-; AVX512F-NEXT: vpextrw $2, %xmm3, %eax
+; AVX512F-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm4, %xmm4
+; AVX512F-NEXT: vpextrw $2, %xmm2, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: vpinsrw $2, 640(%rsp,%rax,2), %xmm4, %xmm4
-; AVX512F-NEXT: vpextrw $3, %xmm3, %eax
+; AVX512F-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm4, %xmm4
+; AVX512F-NEXT: vpextrw $3, %xmm2, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: vpinsrw $3, 704(%rsp,%rax,2), %xmm4, %xmm4
-; AVX512F-NEXT: vpextrw $4, %xmm3, %eax
+; AVX512F-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm4, %xmm4
+; AVX512F-NEXT: vpextrw $4, %xmm2, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: vpinsrw $4, 768(%rsp,%rax,2), %xmm4, %xmm4
-; AVX512F-NEXT: vpextrw $5, %xmm3, %eax
+; AVX512F-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm4, %xmm4
+; AVX512F-NEXT: vpextrw $5, %xmm2, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: vpinsrw $5, 832(%rsp,%rax,2), %xmm4, %xmm4
-; AVX512F-NEXT: vpextrw $6, %xmm3, %eax
+; AVX512F-NEXT: movzwl (%rsp,%rax,2), %eax
+; AVX512F-NEXT: vpinsrw $5, %eax, %xmm4, %xmm4
+; AVX512F-NEXT: vpextrw $6, %xmm2, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: vpinsrw $6, 896(%rsp,%rax,2), %xmm4, %xmm4
-; AVX512F-NEXT: vpextrw $7, %xmm3, %eax
+; AVX512F-NEXT: movzwl (%rsp,%rax,2), %eax
+; AVX512F-NEXT: vpinsrw $6, %eax, %xmm4, %xmm4
+; AVX512F-NEXT: vpextrw $7, %xmm2, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: vpinsrw $7, 960(%rsp,%rax,2), %xmm4, %xmm3
+; AVX512F-NEXT: movzwl (%rsp,%rax,2), %eax
+; AVX512F-NEXT: vpinsrw $7, %eax, %xmm4, %xmm2
; AVX512F-NEXT: vmovd %xmm1, %eax
; AVX512F-NEXT: andl $31, %eax
; AVX512F-NEXT: movzwl (%rsp,%rax,2), %eax
; AVX512F-NEXT: vmovd %eax, %xmm4
; AVX512F-NEXT: vpextrw $1, %xmm1, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: vpinsrw $1, 64(%rsp,%rax,2), %xmm4, %xmm4
+; AVX512F-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm4, %xmm4
; AVX512F-NEXT: vpextrw $2, %xmm1, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: vpinsrw $2, 128(%rsp,%rax,2), %xmm4, %xmm4
+; AVX512F-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm4, %xmm4
; AVX512F-NEXT: vpextrw $3, %xmm1, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: vpinsrw $3, 192(%rsp,%rax,2), %xmm4, %xmm4
+; AVX512F-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm4, %xmm4
; AVX512F-NEXT: vpextrw $4, %xmm1, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: vpinsrw $4, 256(%rsp,%rax,2), %xmm4, %xmm4
+; AVX512F-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm4, %xmm4
; AVX512F-NEXT: vpextrw $5, %xmm1, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: vpinsrw $5, 320(%rsp,%rax,2), %xmm4, %xmm4
+; AVX512F-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm4, %xmm4
; AVX512F-NEXT: vpextrw $6, %xmm1, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: vpinsrw $6, 384(%rsp,%rax,2), %xmm4, %xmm4
+; AVX512F-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm4, %xmm4
; AVX512F-NEXT: vpextrw $7, %xmm1, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: vpinsrw $7, 448(%rsp,%rax,2), %xmm4, %xmm1
-; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
-; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm4, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512F-NEXT: movq %rbp, %rsp
; AVX512F-NEXT: popq %rbp
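
The idiom in these checks, spill the source vector to a 64-byte-aligned stack slot, mask each index with andl $31, then load the selected element, is the same lowering used for a single variable-index extract. A minimal sketch (hypothetical function, not part of this patch):

define i16 @var_extract_sketch(<32 x i16> %v, i32 %idx) {
  ; A variable-index extract on AVX512F lowers to a stack spill plus an
  ; index-masked scalar load, the same way the shuffle above is expanded.
  %e = extractelement <32 x i16> %v, i32 %idx
  ret i16 %e
}

The improvement in this hunk is that the vector is now stored once with a single vmovaps %zmm0 instead of being re-spilled before every element lookup, shrinking the frame from 2112 bytes to 128.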
; AVX512F-NEXT: pushq %rbp
; AVX512F-NEXT: movq %rsp, %rbp
; AVX512F-NEXT: andq $-64, %rsp
-; AVX512F-NEXT: subq $4160, %rsp # imm = 0x1040
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3
-; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm4
+; AVX512F-NEXT: subq $128, %rsp
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vextracti32x4 $2, %zmm1, %xmm3
+; AVX512F-NEXT: vextracti32x4 $3, %zmm1, %xmm4
; AVX512F-NEXT: vmovd %xmm4, %eax
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, (%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movzbl 3072(%rsp,%rax), %eax
+; AVX512F-NEXT: vmovaps %zmm0, (%rsp)
+; AVX512F-NEXT: andl $63, %eax
+; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
; AVX512F-NEXT: vmovd %eax, %xmm0
; AVX512F-NEXT: vpextrb $1, %xmm4, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $1, 3136(%rsp,%rax), %xmm0, %xmm0
+; AVX512F-NEXT: vpinsrb $1, (%rsp,%rax), %xmm0, %xmm0
; AVX512F-NEXT: vpextrb $2, %xmm4, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $2, 3200(%rsp,%rax), %xmm0, %xmm0
+; AVX512F-NEXT: vpinsrb $2, (%rsp,%rax), %xmm0, %xmm0
; AVX512F-NEXT: vpextrb $3, %xmm4, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $3, 3264(%rsp,%rax), %xmm0, %xmm0
+; AVX512F-NEXT: vpinsrb $3, (%rsp,%rax), %xmm0, %xmm0
; AVX512F-NEXT: vpextrb $4, %xmm4, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $4, 3328(%rsp,%rax), %xmm0, %xmm0
+; AVX512F-NEXT: vpinsrb $4, (%rsp,%rax), %xmm0, %xmm0
; AVX512F-NEXT: vpextrb $5, %xmm4, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $5, 3392(%rsp,%rax), %xmm0, %xmm0
+; AVX512F-NEXT: vpinsrb $5, (%rsp,%rax), %xmm0, %xmm0
; AVX512F-NEXT: vpextrb $6, %xmm4, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $6, 3456(%rsp,%rax), %xmm0, %xmm0
+; AVX512F-NEXT: vpinsrb $6, (%rsp,%rax), %xmm0, %xmm0
; AVX512F-NEXT: vpextrb $7, %xmm4, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $7, 3520(%rsp,%rax), %xmm0, %xmm0
+; AVX512F-NEXT: vpinsrb $7, (%rsp,%rax), %xmm0, %xmm0
; AVX512F-NEXT: vpextrb $8, %xmm4, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $8, 3584(%rsp,%rax), %xmm0, %xmm0
+; AVX512F-NEXT: vpinsrb $8, (%rsp,%rax), %xmm0, %xmm0
; AVX512F-NEXT: vpextrb $9, %xmm4, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $9, 3648(%rsp,%rax), %xmm0, %xmm0
+; AVX512F-NEXT: vpinsrb $9, (%rsp,%rax), %xmm0, %xmm0
; AVX512F-NEXT: vpextrb $10, %xmm4, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $10, 3712(%rsp,%rax), %xmm0, %xmm0
+; AVX512F-NEXT: vpinsrb $10, (%rsp,%rax), %xmm0, %xmm0
; AVX512F-NEXT: vpextrb $11, %xmm4, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $11, 3776(%rsp,%rax), %xmm0, %xmm0
+; AVX512F-NEXT: vpinsrb $11, (%rsp,%rax), %xmm0, %xmm0
; AVX512F-NEXT: vpextrb $12, %xmm4, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $12, 3840(%rsp,%rax), %xmm0, %xmm0
+; AVX512F-NEXT: vpinsrb $12, (%rsp,%rax), %xmm0, %xmm0
; AVX512F-NEXT: vpextrb $13, %xmm4, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $13, 3904(%rsp,%rax), %xmm0, %xmm0
+; AVX512F-NEXT: vpinsrb $13, (%rsp,%rax), %xmm0, %xmm0
; AVX512F-NEXT: vpextrb $14, %xmm4, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $14, 3968(%rsp,%rax), %xmm0, %xmm0
+; AVX512F-NEXT: vpinsrb $14, (%rsp,%rax), %xmm0, %xmm0
; AVX512F-NEXT: vpextrb $15, %xmm4, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $15, 4032(%rsp,%rax), %xmm0, %xmm0
-; AVX512F-NEXT: vmovd %xmm2, %eax
+; AVX512F-NEXT: vpinsrb $15, (%rsp,%rax), %xmm0, %xmm0
+; AVX512F-NEXT: vmovd %xmm3, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: movzbl 2048(%rsp,%rax), %eax
+; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
; AVX512F-NEXT: vmovd %eax, %xmm4
-; AVX512F-NEXT: vpextrb $1, %xmm2, %eax
+; AVX512F-NEXT: vpextrb $1, %xmm3, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $1, 2112(%rsp,%rax), %xmm4, %xmm4
-; AVX512F-NEXT: vpextrb $2, %xmm2, %eax
+; AVX512F-NEXT: vpinsrb $1, (%rsp,%rax), %xmm4, %xmm4
+; AVX512F-NEXT: vpextrb $2, %xmm3, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $2, 2176(%rsp,%rax), %xmm4, %xmm4
-; AVX512F-NEXT: vpextrb $3, %xmm2, %eax
+; AVX512F-NEXT: vpinsrb $2, (%rsp,%rax), %xmm4, %xmm4
+; AVX512F-NEXT: vpextrb $3, %xmm3, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $3, 2240(%rsp,%rax), %xmm4, %xmm4
-; AVX512F-NEXT: vpextrb $4, %xmm2, %eax
+; AVX512F-NEXT: vpinsrb $3, (%rsp,%rax), %xmm4, %xmm4
+; AVX512F-NEXT: vpextrb $4, %xmm3, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $4, 2304(%rsp,%rax), %xmm4, %xmm4
-; AVX512F-NEXT: vpextrb $5, %xmm2, %eax
+; AVX512F-NEXT: vpinsrb $4, (%rsp,%rax), %xmm4, %xmm4
+; AVX512F-NEXT: vpextrb $5, %xmm3, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $5, 2368(%rsp,%rax), %xmm4, %xmm4
-; AVX512F-NEXT: vpextrb $6, %xmm2, %eax
+; AVX512F-NEXT: vpinsrb $5, (%rsp,%rax), %xmm4, %xmm4
+; AVX512F-NEXT: vpextrb $6, %xmm3, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $6, 2432(%rsp,%rax), %xmm4, %xmm4
-; AVX512F-NEXT: vpextrb $7, %xmm2, %eax
+; AVX512F-NEXT: vpinsrb $6, (%rsp,%rax), %xmm4, %xmm4
+; AVX512F-NEXT: vpextrb $7, %xmm3, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $7, 2496(%rsp,%rax), %xmm4, %xmm4
-; AVX512F-NEXT: vpextrb $8, %xmm2, %eax
+; AVX512F-NEXT: vpinsrb $7, (%rsp,%rax), %xmm4, %xmm4
+; AVX512F-NEXT: vpextrb $8, %xmm3, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $8, 2560(%rsp,%rax), %xmm4, %xmm4
-; AVX512F-NEXT: vpextrb $9, %xmm2, %eax
+; AVX512F-NEXT: vpinsrb $8, (%rsp,%rax), %xmm4, %xmm4
+; AVX512F-NEXT: vpextrb $9, %xmm3, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $9, 2624(%rsp,%rax), %xmm4, %xmm4
-; AVX512F-NEXT: vpextrb $10, %xmm2, %eax
+; AVX512F-NEXT: vpinsrb $9, (%rsp,%rax), %xmm4, %xmm4
+; AVX512F-NEXT: vpextrb $10, %xmm3, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $10, 2688(%rsp,%rax), %xmm4, %xmm4
-; AVX512F-NEXT: vpextrb $11, %xmm2, %eax
+; AVX512F-NEXT: vpinsrb $10, (%rsp,%rax), %xmm4, %xmm4
+; AVX512F-NEXT: vpextrb $11, %xmm3, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $11, 2752(%rsp,%rax), %xmm4, %xmm4
-; AVX512F-NEXT: vpextrb $12, %xmm2, %eax
+; AVX512F-NEXT: vpinsrb $11, (%rsp,%rax), %xmm4, %xmm4
+; AVX512F-NEXT: vpextrb $12, %xmm3, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $12, 2816(%rsp,%rax), %xmm4, %xmm4
-; AVX512F-NEXT: vpextrb $13, %xmm2, %eax
+; AVX512F-NEXT: vpinsrb $12, (%rsp,%rax), %xmm4, %xmm4
+; AVX512F-NEXT: vpextrb $13, %xmm3, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $13, 2880(%rsp,%rax), %xmm4, %xmm4
-; AVX512F-NEXT: vpextrb $14, %xmm2, %eax
+; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
+; AVX512F-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4
+; AVX512F-NEXT: vpextrb $14, %xmm3, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $14, 2944(%rsp,%rax), %xmm4, %xmm4
-; AVX512F-NEXT: vpextrb $15, %xmm2, %eax
+; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
+; AVX512F-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4
+; AVX512F-NEXT: vpextrb $15, %xmm3, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $15, 3008(%rsp,%rax), %xmm4, %xmm2
-; AVX512F-NEXT: vmovd %xmm3, %eax
+; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
+; AVX512F-NEXT: vpinsrb $15, %eax, %xmm4, %xmm3
+; AVX512F-NEXT: vmovd %xmm2, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: movzbl 1024(%rsp,%rax), %eax
+; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
; AVX512F-NEXT: vmovd %eax, %xmm4
-; AVX512F-NEXT: vpextrb $1, %xmm3, %eax
+; AVX512F-NEXT: vpextrb $1, %xmm2, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $1, 1088(%rsp,%rax), %xmm4, %xmm4
-; AVX512F-NEXT: vpextrb $2, %xmm3, %eax
+; AVX512F-NEXT: vpinsrb $1, (%rsp,%rax), %xmm4, %xmm4
+; AVX512F-NEXT: vpextrb $2, %xmm2, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $2, 1152(%rsp,%rax), %xmm4, %xmm4
-; AVX512F-NEXT: vpextrb $3, %xmm3, %eax
+; AVX512F-NEXT: vpinsrb $2, (%rsp,%rax), %xmm4, %xmm4
+; AVX512F-NEXT: vpextrb $3, %xmm2, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $3, 1216(%rsp,%rax), %xmm4, %xmm4
-; AVX512F-NEXT: vpextrb $4, %xmm3, %eax
+; AVX512F-NEXT: vpinsrb $3, (%rsp,%rax), %xmm4, %xmm4
+; AVX512F-NEXT: vpextrb $4, %xmm2, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $4, 1280(%rsp,%rax), %xmm4, %xmm4
-; AVX512F-NEXT: vpextrb $5, %xmm3, %eax
+; AVX512F-NEXT: vpinsrb $4, (%rsp,%rax), %xmm4, %xmm4
+; AVX512F-NEXT: vpextrb $5, %xmm2, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $5, 1344(%rsp,%rax), %xmm4, %xmm4
-; AVX512F-NEXT: vpextrb $6, %xmm3, %eax
+; AVX512F-NEXT: vpinsrb $5, (%rsp,%rax), %xmm4, %xmm4
+; AVX512F-NEXT: vpextrb $6, %xmm2, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $6, 1408(%rsp,%rax), %xmm4, %xmm4
-; AVX512F-NEXT: vpextrb $7, %xmm3, %eax
+; AVX512F-NEXT: vpinsrb $6, (%rsp,%rax), %xmm4, %xmm4
+; AVX512F-NEXT: vpextrb $7, %xmm2, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $7, 1472(%rsp,%rax), %xmm4, %xmm4
-; AVX512F-NEXT: vpextrb $8, %xmm3, %eax
+; AVX512F-NEXT: vpinsrb $7, (%rsp,%rax), %xmm4, %xmm4
+; AVX512F-NEXT: vpextrb $8, %xmm2, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $8, 1536(%rsp,%rax), %xmm4, %xmm4
-; AVX512F-NEXT: vpextrb $9, %xmm3, %eax
+; AVX512F-NEXT: vpinsrb $8, (%rsp,%rax), %xmm4, %xmm4
+; AVX512F-NEXT: vpextrb $9, %xmm2, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $9, 1600(%rsp,%rax), %xmm4, %xmm4
-; AVX512F-NEXT: vpextrb $10, %xmm3, %eax
+; AVX512F-NEXT: vpinsrb $9, (%rsp,%rax), %xmm4, %xmm4
+; AVX512F-NEXT: vpextrb $10, %xmm2, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $10, 1664(%rsp,%rax), %xmm4, %xmm4
-; AVX512F-NEXT: vpextrb $11, %xmm3, %eax
+; AVX512F-NEXT: vpinsrb $10, (%rsp,%rax), %xmm4, %xmm4
+; AVX512F-NEXT: vpextrb $11, %xmm2, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $11, 1728(%rsp,%rax), %xmm4, %xmm4
-; AVX512F-NEXT: vpextrb $12, %xmm3, %eax
+; AVX512F-NEXT: vpinsrb $11, (%rsp,%rax), %xmm4, %xmm4
+; AVX512F-NEXT: vpextrb $12, %xmm2, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $12, 1792(%rsp,%rax), %xmm4, %xmm4
-; AVX512F-NEXT: vpextrb $13, %xmm3, %eax
+; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
+; AVX512F-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4
+; AVX512F-NEXT: vpextrb $13, %xmm2, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $13, 1856(%rsp,%rax), %xmm4, %xmm4
-; AVX512F-NEXT: vpextrb $14, %xmm3, %eax
+; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
+; AVX512F-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4
+; AVX512F-NEXT: vpextrb $14, %xmm2, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $14, 1920(%rsp,%rax), %xmm4, %xmm4
-; AVX512F-NEXT: vpextrb $15, %xmm3, %eax
+; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
+; AVX512F-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4
+; AVX512F-NEXT: vpextrb $15, %xmm2, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $15, 1984(%rsp,%rax), %xmm4, %xmm3
+; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
+; AVX512F-NEXT: vpinsrb $15, %eax, %xmm4, %xmm2
; AVX512F-NEXT: vmovd %xmm1, %eax
; AVX512F-NEXT: andl $63, %eax
; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
; AVX512F-NEXT: vmovd %eax, %xmm4
; AVX512F-NEXT: vpextrb $1, %xmm1, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $1, 64(%rsp,%rax), %xmm4, %xmm4
+; AVX512F-NEXT: vpinsrb $1, (%rsp,%rax), %xmm4, %xmm4
; AVX512F-NEXT: vpextrb $2, %xmm1, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $2, 128(%rsp,%rax), %xmm4, %xmm4
+; AVX512F-NEXT: vpinsrb $2, (%rsp,%rax), %xmm4, %xmm4
; AVX512F-NEXT: vpextrb $3, %xmm1, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $3, 192(%rsp,%rax), %xmm4, %xmm4
+; AVX512F-NEXT: vpinsrb $3, (%rsp,%rax), %xmm4, %xmm4
; AVX512F-NEXT: vpextrb $4, %xmm1, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $4, 256(%rsp,%rax), %xmm4, %xmm4
+; AVX512F-NEXT: vpinsrb $4, (%rsp,%rax), %xmm4, %xmm4
; AVX512F-NEXT: vpextrb $5, %xmm1, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $5, 320(%rsp,%rax), %xmm4, %xmm4
+; AVX512F-NEXT: vpinsrb $5, (%rsp,%rax), %xmm4, %xmm4
; AVX512F-NEXT: vpextrb $6, %xmm1, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $6, 384(%rsp,%rax), %xmm4, %xmm4
+; AVX512F-NEXT: vpinsrb $6, (%rsp,%rax), %xmm4, %xmm4
; AVX512F-NEXT: vpextrb $7, %xmm1, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $7, 448(%rsp,%rax), %xmm4, %xmm4
+; AVX512F-NEXT: vpinsrb $7, (%rsp,%rax), %xmm4, %xmm4
; AVX512F-NEXT: vpextrb $8, %xmm1, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $8, 512(%rsp,%rax), %xmm4, %xmm4
+; AVX512F-NEXT: vpinsrb $8, (%rsp,%rax), %xmm4, %xmm4
; AVX512F-NEXT: vpextrb $9, %xmm1, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $9, 576(%rsp,%rax), %xmm4, %xmm4
+; AVX512F-NEXT: vpinsrb $9, (%rsp,%rax), %xmm4, %xmm4
; AVX512F-NEXT: vpextrb $10, %xmm1, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $10, 640(%rsp,%rax), %xmm4, %xmm4
+; AVX512F-NEXT: vpinsrb $10, (%rsp,%rax), %xmm4, %xmm4
; AVX512F-NEXT: vpextrb $11, %xmm1, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $11, 704(%rsp,%rax), %xmm4, %xmm4
+; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
+; AVX512F-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4
; AVX512F-NEXT: vpextrb $12, %xmm1, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $12, 768(%rsp,%rax), %xmm4, %xmm4
+; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
+; AVX512F-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4
; AVX512F-NEXT: vpextrb $13, %xmm1, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $13, 832(%rsp,%rax), %xmm4, %xmm4
+; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
+; AVX512F-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4
; AVX512F-NEXT: vpextrb $14, %xmm1, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $14, 896(%rsp,%rax), %xmm4, %xmm4
+; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
+; AVX512F-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4
; AVX512F-NEXT: vpextrb $15, %xmm1, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $15, 960(%rsp,%rax), %xmm4, %xmm1
-; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
-; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
+; AVX512F-NEXT: vpinsrb $15, %eax, %xmm4, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512F-NEXT: movq %rbp, %rsp
; AVX512F-NEXT: popq %rbp
; AVX512F-NEXT: pushq %rbp
; AVX512F-NEXT: movq %rsp, %rbp
; AVX512F-NEXT: andq $-64, %rsp
-; AVX512F-NEXT: subq $4160, %rsp # imm = 0x1040
+; AVX512F-NEXT: subq $128, %rsp
; AVX512F-NEXT: # kill: def $esi killed $esi def $rsi
-; AVX512F-NEXT: vpbroadcastd %esi, %zmm4
-; AVX512F-NEXT: vpaddd {{.*}}(%rip), %zmm4, %zmm1
-; AVX512F-NEXT: vpaddd {{.*}}(%rip), %zmm4, %zmm2
-; AVX512F-NEXT: vpaddd {{.*}}(%rip), %zmm4, %zmm3
-; AVX512F-NEXT: vpaddd {{.*}}(%rip), %zmm4, %zmm4
-; AVX512F-NEXT: vmovd %xmm4, %eax
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %ymm0, (%rsp)
-; AVX512F-NEXT: movzbl 3968(%rsp,%rax), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm0
-; AVX512F-NEXT: vpextrd $1, %xmm4, %eax
-; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $1, 3904(%rsp,%rax), %xmm0, %xmm0
-; AVX512F-NEXT: vpextrd $2, %xmm4, %eax
-; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $2, 3840(%rsp,%rax), %xmm0, %xmm0
-; AVX512F-NEXT: vpextrd $3, %xmm4, %eax
-; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm5
-; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $3, 3776(%rsp,%rax), %xmm0, %xmm0
-; AVX512F-NEXT: vmovd %xmm5, %eax
+; AVX512F-NEXT: vpbroadcastd %esi, %zmm2
+; AVX512F-NEXT: vpaddd {{.*}}(%rip), %zmm2, %zmm1
+; AVX512F-NEXT: vmovd %xmm1, %eax
+; AVX512F-NEXT: vmovaps %zmm0, (%rsp)
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $4, 3712(%rsp,%rax), %xmm0, %xmm0
-; AVX512F-NEXT: vpextrd $1, %xmm5, %eax
+; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
+; AVX512F-NEXT: vmovd %eax, %xmm0
+; AVX512F-NEXT: vpextrd $1, %xmm1, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $5, 3648(%rsp,%rax), %xmm0, %xmm0
-; AVX512F-NEXT: vpextrd $2, %xmm5, %eax
+; AVX512F-NEXT: vpinsrb $1, (%rsp,%rax), %xmm0, %xmm0
+; AVX512F-NEXT: vpextrd $2, %xmm1, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $6, 3584(%rsp,%rax), %xmm0, %xmm0
-; AVX512F-NEXT: vpextrd $3, %xmm5, %eax
-; AVX512F-NEXT: vextracti32x4 $2, %zmm4, %xmm5
+; AVX512F-NEXT: vpinsrb $2, (%rsp,%rax), %xmm0, %xmm0
+; AVX512F-NEXT: vpextrd $3, %xmm1, %eax
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $7, 3520(%rsp,%rax), %xmm0, %xmm0
-; AVX512F-NEXT: vmovd %xmm5, %eax
+; AVX512F-NEXT: vpinsrb $3, (%rsp,%rax), %xmm0, %xmm0
+; AVX512F-NEXT: vmovd %xmm3, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $8, 3456(%rsp,%rax), %xmm0, %xmm0
-; AVX512F-NEXT: vpextrd $1, %xmm5, %eax
+; AVX512F-NEXT: vpinsrb $4, (%rsp,%rax), %xmm0, %xmm0
+; AVX512F-NEXT: vpextrd $1, %xmm3, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $9, 3392(%rsp,%rax), %xmm0, %xmm0
-; AVX512F-NEXT: vpextrd $2, %xmm5, %eax
+; AVX512F-NEXT: vpinsrb $5, (%rsp,%rax), %xmm0, %xmm0
+; AVX512F-NEXT: vpextrd $2, %xmm3, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $10, 3328(%rsp,%rax), %xmm0, %xmm0
-; AVX512F-NEXT: vpextrd $3, %xmm5, %eax
-; AVX512F-NEXT: vextracti32x4 $3, %zmm4, %xmm4
+; AVX512F-NEXT: vpinsrb $6, (%rsp,%rax), %xmm0, %xmm0
+; AVX512F-NEXT: vpextrd $3, %xmm3, %eax
+; AVX512F-NEXT: vextracti32x4 $2, %zmm1, %xmm4
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $11, 3264(%rsp,%rax), %xmm0, %xmm0
+; AVX512F-NEXT: vpinsrb $7, (%rsp,%rax), %xmm0, %xmm0
; AVX512F-NEXT: vmovd %xmm4, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $12, 3200(%rsp,%rax), %xmm0, %xmm0
+; AVX512F-NEXT: vpinsrb $8, (%rsp,%rax), %xmm0, %xmm0
; AVX512F-NEXT: vpextrd $1, %xmm4, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $13, 3136(%rsp,%rax), %xmm0, %xmm0
-; AVX512F-NEXT: vpextrd $2, %xmm4, %eax
-; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $14, 3072(%rsp,%rax), %xmm0, %xmm0
+; AVX512F-NEXT: vpinsrb $9, (%rsp,%rax), %xmm0, %xmm0
+; AVX512F-NEXT: vpaddd {{.*}}(%rip), %zmm2, %zmm5
+; AVX512F-NEXT: vpaddd {{.*}}(%rip), %zmm2, %zmm3
+; AVX512F-NEXT: vpaddd {{.*}}(%rip), %zmm2, %zmm2
+; AVX512F-NEXT: andl $63, %esi
+; AVX512F-NEXT: vpinsrb $10, (%rsp,%rsi), %xmm0, %xmm0
; AVX512F-NEXT: vpextrd $3, %xmm4, %eax
+; AVX512F-NEXT: vextracti32x4 $3, %zmm1, %xmm1
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $15, 3008(%rsp,%rax), %xmm0, %xmm0
-; AVX512F-NEXT: vmovd %xmm3, %eax
+; AVX512F-NEXT: vpinsrb $11, (%rsp,%rax), %xmm0, %xmm0
+; AVX512F-NEXT: vmovd %xmm1, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: movzbl 2944(%rsp,%rax), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm4
-; AVX512F-NEXT: vpextrd $1, %xmm3, %eax
+; AVX512F-NEXT: vpinsrb $12, (%rsp,%rax), %xmm0, %xmm0
+; AVX512F-NEXT: vpextrd $1, %xmm1, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $1, 2880(%rsp,%rax), %xmm4, %xmm4
-; AVX512F-NEXT: vpextrd $2, %xmm3, %eax
+; AVX512F-NEXT: vpinsrb $13, (%rsp,%rax), %xmm0, %xmm0
+; AVX512F-NEXT: vpextrd $2, %xmm1, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $2, 2816(%rsp,%rax), %xmm4, %xmm4
-; AVX512F-NEXT: vpextrd $3, %xmm3, %eax
-; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm5
+; AVX512F-NEXT: vpinsrb $14, (%rsp,%rax), %xmm0, %xmm0
+; AVX512F-NEXT: vpextrd $3, %xmm1, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $3, 2752(%rsp,%rax), %xmm4, %xmm4
+; AVX512F-NEXT: vpinsrb $15, (%rsp,%rax), %xmm0, %xmm8
; AVX512F-NEXT: vmovd %xmm5, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $4, 2688(%rsp,%rax), %xmm4, %xmm4
+; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
+; AVX512F-NEXT: vmovd %eax, %xmm1
; AVX512F-NEXT: vpextrd $1, %xmm5, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $5, 2624(%rsp,%rax), %xmm4, %xmm4
+; AVX512F-NEXT: vpinsrb $1, (%rsp,%rax), %xmm1, %xmm1
; AVX512F-NEXT: vpextrd $2, %xmm5, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $6, 2560(%rsp,%rax), %xmm4, %xmm4
+; AVX512F-NEXT: vpinsrb $2, (%rsp,%rax), %xmm1, %xmm1
; AVX512F-NEXT: vpextrd $3, %xmm5, %eax
-; AVX512F-NEXT: vextracti32x4 $2, %zmm3, %xmm5
+; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm4
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $7, 2496(%rsp,%rax), %xmm4, %xmm4
-; AVX512F-NEXT: vmovd %xmm5, %eax
+; AVX512F-NEXT: vpinsrb $3, (%rsp,%rax), %xmm1, %xmm1
+; AVX512F-NEXT: vmovd %xmm4, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $8, 2432(%rsp,%rax), %xmm4, %xmm4
-; AVX512F-NEXT: vpextrd $1, %xmm5, %eax
+; AVX512F-NEXT: vpinsrb $4, (%rsp,%rax), %xmm1, %xmm1
+; AVX512F-NEXT: vpextrd $1, %xmm4, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $9, 2368(%rsp,%rax), %xmm4, %xmm4
-; AVX512F-NEXT: vpextrd $2, %xmm5, %eax
+; AVX512F-NEXT: vpinsrb $5, (%rsp,%rax), %xmm1, %xmm1
+; AVX512F-NEXT: vpextrd $2, %xmm4, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $10, 2304(%rsp,%rax), %xmm4, %xmm4
-; AVX512F-NEXT: vpextrd $3, %xmm5, %eax
-; AVX512F-NEXT: vextracti32x4 $3, %zmm3, %xmm3
+; AVX512F-NEXT: vpinsrb $6, (%rsp,%rax), %xmm1, %xmm1
+; AVX512F-NEXT: vpextrd $3, %xmm4, %eax
+; AVX512F-NEXT: vextracti32x4 $2, %zmm5, %xmm4
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $11, 2240(%rsp,%rax), %xmm4, %xmm4
+; AVX512F-NEXT: vpinsrb $7, (%rsp,%rax), %xmm1, %xmm1
+; AVX512F-NEXT: vmovd %xmm4, %eax
+; AVX512F-NEXT: andl $63, %eax
+; AVX512F-NEXT: vpinsrb $8, (%rsp,%rax), %xmm1, %xmm1
+; AVX512F-NEXT: vpextrd $1, %xmm4, %eax
+; AVX512F-NEXT: andl $63, %eax
+; AVX512F-NEXT: vpinsrb $9, (%rsp,%rax), %xmm1, %xmm1
+; AVX512F-NEXT: vpextrd $2, %xmm4, %eax
+; AVX512F-NEXT: andl $63, %eax
+; AVX512F-NEXT: vpinsrb $10, (%rsp,%rax), %xmm1, %xmm6
+; AVX512F-NEXT: vpextrd $3, %xmm4, %eax
+; AVX512F-NEXT: vextracti32x4 $3, %zmm5, %xmm1
+; AVX512F-NEXT: andl $63, %eax
+; AVX512F-NEXT: vpinsrb $11, (%rsp,%rax), %xmm6, %xmm4
+; AVX512F-NEXT: vmovd %xmm1, %eax
+; AVX512F-NEXT: andl $63, %eax
+; AVX512F-NEXT: vpinsrb $12, (%rsp,%rax), %xmm4, %xmm4
; AVX512F-NEXT: vmovd %xmm3, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $12, 2176(%rsp,%rax), %xmm4, %xmm4
+; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
+; AVX512F-NEXT: vmovd %eax, %xmm5
; AVX512F-NEXT: vpextrd $1, %xmm3, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $13, 2112(%rsp,%rax), %xmm4, %xmm4
+; AVX512F-NEXT: vpinsrb $1, (%rsp,%rax), %xmm5, %xmm5
; AVX512F-NEXT: vpextrd $2, %xmm3, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $14, 2048(%rsp,%rax), %xmm4, %xmm4
+; AVX512F-NEXT: vpinsrb $2, (%rsp,%rax), %xmm5, %xmm5
; AVX512F-NEXT: vpextrd $3, %xmm3, %eax
+; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm6
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $15, 1984(%rsp,%rax), %xmm4, %xmm3
-; AVX512F-NEXT: vmovd %xmm2, %eax
+; AVX512F-NEXT: vpinsrb $3, (%rsp,%rax), %xmm5, %xmm5
+; AVX512F-NEXT: vmovd %xmm6, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: movzbl 1920(%rsp,%rax), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm4
-; AVX512F-NEXT: vpextrd $1, %xmm2, %eax
+; AVX512F-NEXT: vpinsrb $4, (%rsp,%rax), %xmm5, %xmm5
+; AVX512F-NEXT: vpextrd $1, %xmm6, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $1, 1856(%rsp,%rax), %xmm4, %xmm4
-; AVX512F-NEXT: vpextrd $2, %xmm2, %eax
+; AVX512F-NEXT: vpinsrb $5, (%rsp,%rax), %xmm5, %xmm5
+; AVX512F-NEXT: vpextrd $2, %xmm6, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $2, 1792(%rsp,%rax), %xmm4, %xmm4
-; AVX512F-NEXT: vpextrd $3, %xmm2, %eax
-; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm5
+; AVX512F-NEXT: vpinsrb $6, (%rsp,%rax), %xmm5, %xmm7
+; AVX512F-NEXT: vpextrd $3, %xmm6, %eax
+; AVX512F-NEXT: vextracti32x4 $2, %zmm3, %xmm5
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $3, 1728(%rsp,%rax), %xmm4, %xmm4
+; AVX512F-NEXT: vpinsrb $7, (%rsp,%rax), %xmm7, %xmm6
; AVX512F-NEXT: vmovd %xmm5, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $4, 1664(%rsp,%rax), %xmm4, %xmm4
+; AVX512F-NEXT: vpinsrb $8, (%rsp,%rax), %xmm6, %xmm6
; AVX512F-NEXT: vpextrd $1, %xmm5, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $5, 1600(%rsp,%rax), %xmm4, %xmm4
+; AVX512F-NEXT: vpinsrb $9, (%rsp,%rax), %xmm6, %xmm6
; AVX512F-NEXT: vpextrd $2, %xmm5, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $6, 1536(%rsp,%rax), %xmm4, %xmm4
-; AVX512F-NEXT: vpextrd $3, %xmm5, %eax
-; AVX512F-NEXT: vextracti32x4 $2, %zmm2, %xmm5
+; AVX512F-NEXT: vpinsrb $10, (%rsp,%rax), %xmm6, %xmm6
+; AVX512F-NEXT: vmovd %xmm2, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $7, 1472(%rsp,%rax), %xmm4, %xmm4
-; AVX512F-NEXT: vmovd %xmm5, %eax
+; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
+; AVX512F-NEXT: vmovd %eax, %xmm7
+; AVX512F-NEXT: vpextrd $1, %xmm2, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $8, 1408(%rsp,%rax), %xmm4, %xmm4
-; AVX512F-NEXT: vpextrd $1, %xmm5, %eax
+; AVX512F-NEXT: vpinsrb $1, (%rsp,%rax), %xmm7, %xmm7
+; AVX512F-NEXT: vpextrd $2, %xmm2, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $9, 1344(%rsp,%rax), %xmm4, %xmm4
-; AVX512F-NEXT: vpextrd $2, %xmm5, %eax
+; AVX512F-NEXT: vpinsrb $2, (%rsp,%rax), %xmm7, %xmm7
+; AVX512F-NEXT: vpextrd $3, %xmm2, %eax
+; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm0
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $10, 1280(%rsp,%rax), %xmm4, %xmm4
-; AVX512F-NEXT: vpextrd $3, %xmm5, %eax
-; AVX512F-NEXT: vextracti32x4 $3, %zmm2, %xmm2
+; AVX512F-NEXT: vpinsrb $3, (%rsp,%rax), %xmm7, %xmm7
+; AVX512F-NEXT: vmovd %xmm0, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $11, 1216(%rsp,%rax), %xmm4, %xmm4
-; AVX512F-NEXT: vmovd %xmm2, %eax
+; AVX512F-NEXT: vpinsrb $4, (%rsp,%rax), %xmm7, %xmm7
+; AVX512F-NEXT: vpextrd $1, %xmm0, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $12, 1152(%rsp,%rax), %xmm4, %xmm4
-; AVX512F-NEXT: vpextrd $1, %xmm2, %eax
+; AVX512F-NEXT: vpinsrb $5, (%rsp,%rax), %xmm7, %xmm7
+; AVX512F-NEXT: vpextrd $2, %xmm0, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $13, 1088(%rsp,%rax), %xmm4, %xmm4
-; AVX512F-NEXT: vpextrd $2, %xmm2, %eax
+; AVX512F-NEXT: vpinsrb $6, (%rsp,%rax), %xmm7, %xmm7
+; AVX512F-NEXT: vpextrd $3, %xmm0, %eax
+; AVX512F-NEXT: vextracti32x4 $2, %zmm2, %xmm0
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $14, 1024(%rsp,%rax), %xmm4, %xmm4
-; AVX512F-NEXT: vpextrd $3, %xmm2, %eax
+; AVX512F-NEXT: vpinsrb $7, (%rsp,%rax), %xmm7, %xmm7
+; AVX512F-NEXT: vmovd %xmm0, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $15, 960(%rsp,%rax), %xmm4, %xmm2
-; AVX512F-NEXT: vmovd %xmm1, %eax
+; AVX512F-NEXT: vpinsrb $8, (%rsp,%rax), %xmm7, %xmm7
+; AVX512F-NEXT: vpextrd $1, %xmm0, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: movzbl 896(%rsp,%rax), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm4
-; AVX512F-NEXT: vpextrd $1, %xmm1, %eax
+; AVX512F-NEXT: vpinsrb $9, (%rsp,%rax), %xmm7, %xmm7
+; AVX512F-NEXT: vpextrd $2, %xmm0, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $1, 832(%rsp,%rax), %xmm4, %xmm4
-; AVX512F-NEXT: vpextrd $2, %xmm1, %eax
+; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
+; AVX512F-NEXT: vpinsrb $10, %eax, %xmm7, %xmm7
+; AVX512F-NEXT: vpextrd $3, %xmm0, %eax
+; AVX512F-NEXT: vextracti32x4 $3, %zmm2, %xmm0
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $2, 768(%rsp,%rax), %xmm4, %xmm4
-; AVX512F-NEXT: vpextrd $3, %xmm1, %eax
-; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm5
+; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
+; AVX512F-NEXT: vpinsrb $11, %eax, %xmm7, %xmm2
+; AVX512F-NEXT: vmovd %xmm0, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $3, 704(%rsp,%rax), %xmm4, %xmm4
-; AVX512F-NEXT: vmovd %xmm5, %eax
+; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
+; AVX512F-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; AVX512F-NEXT: vpextrd $1, %xmm0, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $4, 640(%rsp,%rax), %xmm4, %xmm4
-; AVX512F-NEXT: vpextrd $1, %xmm5, %eax
+; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
+; AVX512F-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
+; AVX512F-NEXT: vpextrd $2, %xmm0, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $5, 576(%rsp,%rax), %xmm4, %xmm4
-; AVX512F-NEXT: vpextrd $2, %xmm5, %eax
+; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
+; AVX512F-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
+; AVX512F-NEXT: vpextrd $3, %xmm0, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $6, 512(%rsp,%rax), %xmm4, %xmm4
+; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
+; AVX512F-NEXT: vpinsrb $15, %eax, %xmm2, %xmm0
; AVX512F-NEXT: vpextrd $3, %xmm5, %eax
-; AVX512F-NEXT: vextracti32x4 $2, %zmm1, %xmm5
+; AVX512F-NEXT: vextracti32x4 $3, %zmm3, %xmm2
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $7, 448(%rsp,%rax), %xmm4, %xmm4
-; AVX512F-NEXT: vmovd %xmm5, %eax
+; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
+; AVX512F-NEXT: vpinsrb $11, %eax, %xmm6, %xmm3
+; AVX512F-NEXT: vmovd %xmm2, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $8, 384(%rsp,%rax), %xmm4, %xmm4
-; AVX512F-NEXT: vpextrd $1, %xmm5, %eax
+; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
+; AVX512F-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
+; AVX512F-NEXT: vpextrd $1, %xmm2, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $9, 320(%rsp,%rax), %xmm4, %xmm4
-; AVX512F-NEXT: andl $63, %esi
-; AVX512F-NEXT: vpinsrb $10, 4032(%rsp,%rsi), %xmm4, %xmm4
-; AVX512F-NEXT: vpextrd $3, %xmm5, %eax
-; AVX512F-NEXT: vextracti32x4 $3, %zmm1, %xmm1
+; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
+; AVX512F-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
+; AVX512F-NEXT: vpextrd $2, %xmm2, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $11, 256(%rsp,%rax), %xmm4, %xmm4
-; AVX512F-NEXT: vmovd %xmm1, %eax
+; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
+; AVX512F-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
+; AVX512F-NEXT: vpextrd $3, %xmm2, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $12, 192(%rsp,%rax), %xmm4, %xmm4
+; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
+; AVX512F-NEXT: vpinsrb $15, %eax, %xmm3, %xmm2
; AVX512F-NEXT: vpextrd $1, %xmm1, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $13, 128(%rsp,%rax), %xmm4, %xmm4
+; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
+; AVX512F-NEXT: vpinsrb $13, %eax, %xmm4, %xmm3
; AVX512F-NEXT: vpextrd $2, %xmm1, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $14, 64(%rsp,%rax), %xmm4, %xmm4
+; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
+; AVX512F-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
; AVX512F-NEXT: vpextrd $3, %xmm1, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $15, (%rsp,%rax), %xmm4, %xmm1
+; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
+; AVX512F-NEXT: vpinsrb $15, %eax, %xmm3, %xmm1
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512F-NEXT: vcvtdq2ps %zmm0, %zmm0
-; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3
-; AVX512F-NEXT: vcvtdq2ps %zmm3, %zmm3
; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2
; AVX512F-NEXT: vcvtdq2ps %zmm2, %zmm2
; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1
; AVX512F-NEXT: vcvtdq2ps %zmm1, %zmm1
-; AVX512F-NEXT: vmovaps %zmm1, 192(%rdi)
-; AVX512F-NEXT: vmovaps %zmm2, 128(%rdi)
-; AVX512F-NEXT: vmovaps %zmm3, 64(%rdi)
+; AVX512F-NEXT: vpmovsxbd %xmm8, %zmm3
+; AVX512F-NEXT: vcvtdq2ps %zmm3, %zmm3
+; AVX512F-NEXT: vmovaps %zmm3, 192(%rdi)
+; AVX512F-NEXT: vmovaps %zmm1, 128(%rdi)
+; AVX512F-NEXT: vmovaps %zmm2, 64(%rdi)
; AVX512F-NEXT: vmovaps %zmm0, (%rdi)
; AVX512F-NEXT: movq %rbp, %rsp
; AVX512F-NEXT: popq %rbp
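The hunk above is from a variable v64i8 element-lookup test: each index is masked with andl $63 and used to load one byte from a stack slot, and the gathered bytes are sign-extended (vpmovsxbd) and converted to float (vcvtdq2ps) sixteen at a time. The new code indexes a single 64-byte slot at (%rsp,%rax) for every lane, where the old code addressed what appears to be a separate spilled copy per lane (1600(%rsp,%rax), 1536(%rsp,%rax), and so on, 64 bytes apart), so the frame shrinks considerably.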
; AVX512F-LABEL: test_cmp_v64i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm2
+; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm3
+; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0
+; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm2
+; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2
+; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k1
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm1
-; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
+; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k2
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
-; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm0
-; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k2
-; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm0
-; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k3
; AVX512F-NEXT: kmovw %k3, 6(%rdi)
; AVX512F-NEXT: kmovw %k2, 4(%rdi)
; AVX512DQ-LABEL: test_cmp_v64i8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: movq %rdi, %rax
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; AVX512DQ-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm2
+; AVX512DQ-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm2
+; AVX512DQ-NEXT: vpmovsxbd %xmm2, %zmm3
+; AVX512DQ-NEXT: vpmovd2m %zmm3, %k0
+; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm2
+; AVX512DQ-NEXT: vpmovsxbd %xmm2, %zmm2
+; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512DQ-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm1
-; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0
+; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1
-; AVX512DQ-NEXT: vpmovsxbd %xmm2, %zmm0
-; AVX512DQ-NEXT: vpmovd2m %zmm0, %k2
-; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm0
-; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k3
; AVX512DQ-NEXT: kmovw %k3, 6(%rdi)
; AVX512DQ-NEXT: kmovw %k2, 4(%rdi)
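Both test_cmp_v64i8 hunks above are a scheduling change rather than an ABI change: the operands already arrive in zmm0 and zmm1, but the low 256-bit halves are now compared first and the high halves extracted with vextracti64x4 only when needed, keeping fewer YMM temporaries live. A minimal sketch of the IR behind these checks, assuming the usual compare-results form with illustrative names:

define <64 x i1> @test_cmp_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind {
  ; One signed compare; each vpcmpgtb above covers one 256-bit half.
  %cmp = icmp sgt <64 x i8> %a0, %a1
  ret <64 x i1> %cmp
}

The <64 x i1> result is returned indirectly: the pointer comes in %rdi (copied to %rax for the return value) and each 16-bit piece of the mask is stored with kmovw.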
; AVX512F-LABEL: test_cmp_v64i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: vpcmpgtw %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
-; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
-; AVX512F-NEXT: vpcmpgtw %ymm5, %ymm1, %ymm0
+; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm4
+; AVX512F-NEXT: vpmovsxwd %ymm4, %zmm4
+; AVX512F-NEXT: vptestmd %zmm4, %zmm4, %k0
+; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
-; AVX512F-NEXT: vpcmpgtw %ymm6, %ymm2, %ymm0
+; AVX512F-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k2
-; AVX512F-NEXT: vpcmpgtw %ymm7, %ymm3, %ymm0
+; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm0
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k3
; AVX512F-NEXT: kmovw %k3, 6(%rdi)
; AVX512DQ-LABEL: test_cmp_v64i16:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: movq %rdi, %rax
-; AVX512DQ-NEXT: vpcmpgtw %ymm4, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
-; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0
-; AVX512DQ-NEXT: vpcmpgtw %ymm5, %ymm1, %ymm0
+; AVX512DQ-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm4
+; AVX512DQ-NEXT: vpmovsxwd %ymm4, %zmm4
+; AVX512DQ-NEXT: vpmovd2m %zmm4, %k0
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm2
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; AVX512DQ-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1
-; AVX512DQ-NEXT: vpcmpgtw %ymm6, %ymm2, %ymm0
+; AVX512DQ-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm0
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k2
-; AVX512DQ-NEXT: vpcmpgtw %ymm7, %ymm3, %ymm0
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm3, %ymm0
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; AVX512DQ-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k3
; AVX512DQ-NEXT: kmovw %k3, 6(%rdi)
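test_cmp_v64i16 is wider than 512 bits per operand, and these hunks show the new argument passing directly: each <64 x i16> operand now arrives in two ZMM registers (zmm0/zmm1 and zmm2/zmm3) with the upper halves extracted inline by vextracti64x4, where the old checks expected eight pre-split YMM registers, ymm0 through ymm7. The source is the same shape as above (again a sketch, not copied from the test file):

define <64 x i1> @test_cmp_v64i16(<64 x i16> %a0, <64 x i16> %a1) nounwind {
  %cmp = icmp sgt <64 x i16> %a0, %a1
  ret <64 x i1> %cmp
}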
; AVX512F-LABEL: test_cmp_v128i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: vpcmpgtb %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm4
-; AVX512F-NEXT: vptestmd %zmm4, %zmm4, %k0
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
-; AVX512F-NEXT: vpcmpgtb %ymm5, %ymm1, %ymm0
-; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm1
-; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k2
+; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm4
+; AVX512F-NEXT: vpmovsxbd %xmm4, %zmm5
+; AVX512F-NEXT: vptestmd %zmm5, %zmm5, %k0
+; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm4
+; AVX512F-NEXT: vpmovsxbd %xmm4, %zmm4
+; AVX512F-NEXT: vptestmd %zmm4, %zmm4, %k1
+; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm2
+; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k2
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k3
-; AVX512F-NEXT: vpcmpgtb %ymm6, %ymm2, %ymm0
-; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm1
-; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k4
+; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm0
+; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm2
+; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k4
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k5
-; AVX512F-NEXT: vpcmpgtb %ymm7, %ymm3, %ymm0
+; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm0
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm1
; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k6
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512DQ-LABEL: test_cmp_v128i8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: movq %rdi, %rax
-; AVX512DQ-NEXT: vpcmpgtb %ymm4, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm4
-; AVX512DQ-NEXT: vpmovd2m %zmm4, %k0
-; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1
-; AVX512DQ-NEXT: vpcmpgtb %ymm5, %ymm1, %ymm0
-; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm1
-; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2
+; AVX512DQ-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm4
+; AVX512DQ-NEXT: vpmovsxbd %xmm4, %zmm5
+; AVX512DQ-NEXT: vpmovd2m %zmm5, %k0
+; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm4
+; AVX512DQ-NEXT: vpmovsxbd %xmm4, %zmm4
+; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm2
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; AVX512DQ-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm2
+; AVX512DQ-NEXT: vpmovd2m %zmm2, %k2
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k3
-; AVX512DQ-NEXT: vpcmpgtb %ymm6, %ymm2, %ymm0
-; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm1
-; AVX512DQ-NEXT: vpmovd2m %zmm1, %k4
+; AVX512DQ-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm0
+; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm2
+; AVX512DQ-NEXT: vpmovd2m %zmm2, %k4
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k5
-; AVX512DQ-NEXT: vpcmpgtb %ymm7, %ymm3, %ymm0
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm3, %ymm0
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; AVX512DQ-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k6
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0
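test_cmp_v128i8 makes the same change at twice the width: each <128 x i8> operand now comes in as two ZMM registers instead of four YMMs, and every 256-bit half is compared with vpcmpgtb, sign-extended one 128-bit quarter at a time with vpmovsxbd, and tested into a mask register before the 16-bit pieces are stored.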
define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %amt) nounwind {
; AVX512F-LABEL: var_funnnel_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512F-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512F-NEXT: vpsllvd %zmm4, %zmm3, %zmm3
+; AVX512F-NEXT: vpmovdw %zmm3, %ymm3
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm4
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm5
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm7 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm8 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
-; AVX512F-NEXT: vpsllvd %zmm7, %zmm8, %zmm7
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512F-NEXT: vpsubw %ymm5, %ymm8, %ymm9
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm9 = ymm9[0],zero,ymm9[1],zero,ymm9[2],zero,ymm9[3],zero,ymm9[4],zero,ymm9[5],zero,ymm9[6],zero,ymm9[7],zero,ymm9[8],zero,ymm9[9],zero,ymm9[10],zero,ymm9[11],zero,ymm9[12],zero,ymm9[13],zero,ymm9[14],zero,ymm9[15],zero
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
-; AVX512F-NEXT: vpsrlvd %zmm9, %zmm3, %zmm3
-; AVX512F-NEXT: vpord %zmm3, %zmm7, %zmm3
-; AVX512F-NEXT: vpmovdw %zmm3, %ymm3
-; AVX512F-NEXT: vpxor %xmm7, %xmm7, %xmm7
-; AVX512F-NEXT: vpcmpeqw %ymm7, %ymm5, %ymm5
-; AVX512F-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3
-; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512F-NEXT: vpsllvd %zmm4, %zmm5, %zmm4
-; AVX512F-NEXT: vpsubw %ymm2, %ymm8, %ymm5
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm6 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero
+; AVX512F-NEXT: vpsllvd %zmm6, %zmm4, %zmm4
+; AVX512F-NEXT: vpmovdw %zmm4, %ymm4
+; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512F-NEXT: vpsubw %ymm2, %ymm4, %ymm6
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm6 = ymm6[0],zero,ymm6[1],zero,ymm6[2],zero,ymm6[3],zero,ymm6[4],zero,ymm6[5],zero,ymm6[6],zero,ymm6[7],zero,ymm6[8],zero,ymm6[9],zero,ymm6[10],zero,ymm6[11],zero,ymm6[12],zero,ymm6[13],zero,ymm6[14],zero,ymm6[15],zero
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm7 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512F-NEXT: vpsrlvd %zmm6, %zmm7, %zmm6
+; AVX512F-NEXT: vpmovdw %zmm6, %ymm6
+; AVX512F-NEXT: vpsubw %ymm5, %ymm4, %ymm4
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
-; AVX512F-NEXT: vpsrlvd %zmm5, %zmm1, %zmm1
-; AVX512F-NEXT: vpord %zmm1, %zmm4, %zmm1
+; AVX512F-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1
; AVX512F-NEXT: vpmovdw %zmm1, %ymm1
-; AVX512F-NEXT: vpcmpeqw %ymm7, %ymm2, %ymm2
-; AVX512F-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm6, %zmm1
+; AVX512F-NEXT: vporq %zmm1, %zmm3, %zmm1
+; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX512F-NEXT: vpcmpeqw %ymm3, %ymm5, %ymm4
+; AVX512F-NEXT: vpcmpeqw %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2
+; AVX512F-NEXT: vpternlogq $226, %zmm1, %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v32i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512VL-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512VL-NEXT: vpsllvd %zmm4, %zmm3, %zmm3
+; AVX512VL-NEXT: vpmovdw %zmm3, %ymm3
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm4
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
; AVX512VL-NEXT: vextracti64x4 $1, %zmm2, %ymm5
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VL-NEXT: vpand %ymm6, %ymm5, %ymm5
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm7 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm8 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
-; AVX512VL-NEXT: vpsllvd %zmm7, %zmm8, %zmm7
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: vpsubw %ymm5, %ymm8, %ymm9
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm9 = ymm9[0],zero,ymm9[1],zero,ymm9[2],zero,ymm9[3],zero,ymm9[4],zero,ymm9[5],zero,ymm9[6],zero,ymm9[7],zero,ymm9[8],zero,ymm9[9],zero,ymm9[10],zero,ymm9[11],zero,ymm9[12],zero,ymm9[13],zero,ymm9[14],zero,ymm9[15],zero
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
-; AVX512VL-NEXT: vpsrlvd %zmm9, %zmm3, %zmm3
-; AVX512VL-NEXT: vpord %zmm3, %zmm7, %zmm3
-; AVX512VL-NEXT: vpmovdw %zmm3, %ymm3
-; AVX512VL-NEXT: vpxor %xmm7, %xmm7, %xmm7
-; AVX512VL-NEXT: vpcmpeqw %ymm7, %ymm5, %ymm5
-; AVX512VL-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3
-; AVX512VL-NEXT: vpand %ymm6, %ymm2, %ymm2
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512VL-NEXT: vpsllvd %zmm4, %zmm5, %zmm4
-; AVX512VL-NEXT: vpsubw %ymm2, %ymm8, %ymm5
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm6 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero
+; AVX512VL-NEXT: vpsllvd %zmm6, %zmm4, %zmm4
+; AVX512VL-NEXT: vpmovdw %zmm4, %ymm4
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VL-NEXT: vpsubw %ymm2, %ymm4, %ymm6
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm6 = ymm6[0],zero,ymm6[1],zero,ymm6[2],zero,ymm6[3],zero,ymm6[4],zero,ymm6[5],zero,ymm6[6],zero,ymm6[7],zero,ymm6[8],zero,ymm6[9],zero,ymm6[10],zero,ymm6[11],zero,ymm6[12],zero,ymm6[13],zero,ymm6[14],zero,ymm6[15],zero
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm7 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512VL-NEXT: vpsrlvd %zmm6, %zmm7, %zmm6
+; AVX512VL-NEXT: vpmovdw %zmm6, %ymm6
+; AVX512VL-NEXT: vpsubw %ymm5, %ymm4, %ymm4
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm1
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
-; AVX512VL-NEXT: vpsrlvd %zmm5, %zmm1, %zmm1
-; AVX512VL-NEXT: vpord %zmm1, %zmm4, %zmm1
+; AVX512VL-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1
; AVX512VL-NEXT: vpmovdw %zmm1, %ymm1
-; AVX512VL-NEXT: vpcmpeqw %ymm7, %ymm2, %ymm2
-; AVX512VL-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm6, %zmm1
+; AVX512VL-NEXT: vporq %zmm1, %zmm3, %zmm1
+; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT: vpcmpeqw %ymm3, %ymm5, %ymm4
+; AVX512VL-NEXT: vpcmpeqw %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2
+; AVX512VL-NEXT: vpternlogq $226, %zmm1, %zmm2, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v32i16:
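In var_funnnel_v32i16 the AVX512F/AVX512VL lowering keeps the old fallback strategy (zero-extend each 16-bit half to v16i32, shift with vpsllvd/vpsrlvd, narrow back with vpmovdw) but now reassembles the halves into 512-bit values with vinserti64x4, so the amount masking (vpandq), the OR of the two shifted parts (vporq), and the amt==0 select (vpternlogq) each run once on a full ZMM rather than per YMM half with vpblendvb. A minimal sketch of the test being lowered, assuming it exercises the funnel-shift intrinsic (the triple-n "funnnel" spelling is the test's own; the body is reconstructed, not copied from this diff):

declare <32 x i16> @llvm.fshl.v32i16(<32 x i16>, <32 x i16>, <32 x i16>)

define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %amt) nounwind {
  ; fshl concatenates x:y and shifts left by amt modulo 16; a zero amount
  ; returns x, which is what the final vpternlogq select implements.
  %res = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %amt)
  ret <32 x i16> %res
}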
define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt) nounwind {
; AVX512F-LABEL: var_funnnel_v64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm8
-; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm6
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; AVX512F-NEXT: vpsllw $4, %ymm3, %ymm5
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; AVX512F-NEXT: vpand %ymm4, %ymm5, %ymm7
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512F-NEXT: vpand %ymm5, %ymm6, %ymm9
-; AVX512F-NEXT: vpsllw $5, %ymm9, %ymm10
-; AVX512F-NEXT: vpblendvb %ymm10, %ymm7, %ymm3, %ymm7
-; AVX512F-NEXT: vpsllw $2, %ymm7, %ymm11
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
-; AVX512F-NEXT: vpand %ymm6, %ymm11, %ymm11
-; AVX512F-NEXT: vpaddb %ymm10, %ymm10, %ymm10
-; AVX512F-NEXT: vpblendvb %ymm10, %ymm11, %ymm7, %ymm7
-; AVX512F-NEXT: vpaddb %ymm7, %ymm7, %ymm11
-; AVX512F-NEXT: vpaddb %ymm10, %ymm10, %ymm10
-; AVX512F-NEXT: vpblendvb %ymm10, %ymm11, %ymm7, %ymm10
-; AVX512F-NEXT: vpsrlw $4, %ymm8, %ymm11
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512F-NEXT: vpand %ymm7, %ymm11, %ymm11
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm12 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512F-NEXT: vpsubb %ymm9, %ymm12, %ymm13
-; AVX512F-NEXT: vpsllw $5, %ymm13, %ymm13
-; AVX512F-NEXT: vpblendvb %ymm13, %ymm11, %ymm8, %ymm8
-; AVX512F-NEXT: vpsrlw $2, %ymm8, %ymm11
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm14 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
-; AVX512F-NEXT: vpand %ymm14, %ymm11, %ymm11
-; AVX512F-NEXT: vpaddb %ymm13, %ymm13, %ymm13
-; AVX512F-NEXT: vpblendvb %ymm13, %ymm11, %ymm8, %ymm8
-; AVX512F-NEXT: vpsrlw $1, %ymm8, %ymm11
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm15 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512F-NEXT: vpand %ymm15, %ymm11, %ymm11
-; AVX512F-NEXT: vpaddb %ymm13, %ymm13, %ymm13
-; AVX512F-NEXT: vpblendvb %ymm13, %ymm11, %ymm8, %ymm8
-; AVX512F-NEXT: vpor %ymm8, %ymm10, %ymm8
-; AVX512F-NEXT: vpxor %xmm10, %xmm10, %xmm10
-; AVX512F-NEXT: vpcmpeqb %ymm10, %ymm9, %ymm9
-; AVX512F-NEXT: vpblendvb %ymm9, %ymm3, %ymm8, %ymm3
-; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm8
-; AVX512F-NEXT: vpand %ymm4, %ymm8, %ymm4
-; AVX512F-NEXT: vpand %ymm5, %ymm2, %ymm2
-; AVX512F-NEXT: vpsllw $5, %ymm2, %ymm5
-; AVX512F-NEXT: vpblendvb %ymm5, %ymm4, %ymm0, %ymm4
-; AVX512F-NEXT: vpsllw $2, %ymm4, %ymm8
-; AVX512F-NEXT: vpand %ymm6, %ymm8, %ymm6
-; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm5
-; AVX512F-NEXT: vpblendvb %ymm5, %ymm6, %ymm4, %ymm4
-; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm6
-; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm5
-; AVX512F-NEXT: vpblendvb %ymm5, %ymm6, %ymm4, %ymm4
-; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm5
-; AVX512F-NEXT: vpand %ymm7, %ymm5, %ymm5
-; AVX512F-NEXT: vpsubb %ymm2, %ymm12, %ymm6
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-NEXT: vpsrlw $4, %ymm4, %ymm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpand %ymm5, %ymm3, %ymm6
+; AVX512F-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512F-NEXT: vpsubb %ymm3, %ymm7, %ymm8
+; AVX512F-NEXT: vpsllw $5, %ymm8, %ymm8
+; AVX512F-NEXT: vpblendvb %ymm8, %ymm6, %ymm4, %ymm4
+; AVX512F-NEXT: vpsrlw $2, %ymm4, %ymm6
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX512F-NEXT: vpand %ymm6, %ymm9, %ymm6
+; AVX512F-NEXT: vpaddb %ymm8, %ymm8, %ymm8
+; AVX512F-NEXT: vpblendvb %ymm8, %ymm6, %ymm4, %ymm4
+; AVX512F-NEXT: vpsrlw $1, %ymm4, %ymm6
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm10 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT: vpand %ymm6, %ymm10, %ymm6
+; AVX512F-NEXT: vpaddb %ymm8, %ymm8, %ymm8
+; AVX512F-NEXT: vpblendvb %ymm8, %ymm6, %ymm4, %ymm4
+; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm6
+; AVX512F-NEXT: vpand %ymm5, %ymm6, %ymm5
+; AVX512F-NEXT: vpsubb %ymm2, %ymm7, %ymm6
; AVX512F-NEXT: vpsllw $5, %ymm6, %ymm6
; AVX512F-NEXT: vpblendvb %ymm6, %ymm5, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw $2, %ymm1, %ymm5
-; AVX512F-NEXT: vpand %ymm5, %ymm14, %ymm5
+; AVX512F-NEXT: vpand %ymm5, %ymm9, %ymm5
; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm6
; AVX512F-NEXT: vpblendvb %ymm6, %ymm5, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm5
-; AVX512F-NEXT: vpand %ymm5, %ymm15, %ymm5
+; AVX512F-NEXT: vpand %ymm5, %ymm10, %ymm5
; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm6
; AVX512F-NEXT: vpblendvb %ymm6, %ymm5, %ymm1, %ymm1
-; AVX512F-NEXT: vpor %ymm1, %ymm4, %ymm1
-; AVX512F-NEXT: vpcmpeqb %ymm2, %ymm10, %ymm2
-; AVX512F-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm4
+; AVX512F-NEXT: vpsllw $4, %ymm4, %ymm5
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5
+; AVX512F-NEXT: vpsllw $5, %ymm3, %ymm7
+; AVX512F-NEXT: vpblendvb %ymm7, %ymm5, %ymm4, %ymm4
+; AVX512F-NEXT: vpsllw $2, %ymm4, %ymm5
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX512F-NEXT: vpand %ymm5, %ymm8, %ymm5
+; AVX512F-NEXT: vpaddb %ymm7, %ymm7, %ymm7
+; AVX512F-NEXT: vpblendvb %ymm7, %ymm5, %ymm4, %ymm4
+; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm5
+; AVX512F-NEXT: vpaddb %ymm7, %ymm7, %ymm7
+; AVX512F-NEXT: vpblendvb %ymm7, %ymm5, %ymm4, %ymm4
+; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm5
+; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5
+; AVX512F-NEXT: vpsllw $5, %ymm2, %ymm6
+; AVX512F-NEXT: vpblendvb %ymm6, %ymm5, %ymm0, %ymm5
+; AVX512F-NEXT: vpsllw $2, %ymm5, %ymm7
+; AVX512F-NEXT: vpand %ymm7, %ymm8, %ymm7
+; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm6
+; AVX512F-NEXT: vpblendvb %ymm6, %ymm7, %ymm5, %ymm5
+; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm7
+; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm6
+; AVX512F-NEXT: vpblendvb %ymm6, %ymm7, %ymm5, %ymm5
+; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
+; AVX512F-NEXT: vporq %zmm1, %zmm4, %zmm1
+; AVX512F-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX512F-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpcmpeqb %ymm4, %ymm2, %ymm2
+; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpternlogq $226, %zmm1, %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v64i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm2, %ymm5
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; AVX512VL-NEXT: vpsllw $4, %ymm3, %ymm6
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; AVX512VL-NEXT: vpand %ymm7, %ymm6, %ymm6
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VL-NEXT: vpand %ymm5, %ymm8, %ymm5
-; AVX512VL-NEXT: vpsllw $5, %ymm5, %ymm9
-; AVX512VL-NEXT: vpblendvb %ymm9, %ymm6, %ymm3, %ymm6
-; AVX512VL-NEXT: vpsllw $2, %ymm6, %ymm10
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm11 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
-; AVX512VL-NEXT: vpand %ymm11, %ymm10, %ymm10
-; AVX512VL-NEXT: vpaddb %ymm9, %ymm9, %ymm9
-; AVX512VL-NEXT: vpblendvb %ymm9, %ymm10, %ymm6, %ymm6
-; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm10
-; AVX512VL-NEXT: vpaddb %ymm9, %ymm9, %ymm9
-; AVX512VL-NEXT: vpblendvb %ymm9, %ymm10, %ymm6, %ymm6
-; AVX512VL-NEXT: vpsrlw $4, %ymm4, %ymm9
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm10 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VL-NEXT: vpand %ymm10, %ymm9, %ymm9
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm12 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VL-NEXT: vpsubb %ymm5, %ymm12, %ymm13
-; AVX512VL-NEXT: vpsllw $5, %ymm13, %ymm13
-; AVX512VL-NEXT: vpblendvb %ymm13, %ymm9, %ymm4, %ymm4
-; AVX512VL-NEXT: vpsrlw $2, %ymm4, %ymm9
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm14 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
-; AVX512VL-NEXT: vpand %ymm14, %ymm9, %ymm9
-; AVX512VL-NEXT: vpaddb %ymm13, %ymm13, %ymm13
-; AVX512VL-NEXT: vpblendvb %ymm13, %ymm9, %ymm4, %ymm4
-; AVX512VL-NEXT: vpsrlw $1, %ymm4, %ymm9
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm15 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512VL-NEXT: vpand %ymm15, %ymm9, %ymm9
-; AVX512VL-NEXT: vpaddb %ymm13, %ymm13, %ymm13
-; AVX512VL-NEXT: vpblendvb %ymm13, %ymm9, %ymm4, %ymm4
-; AVX512VL-NEXT: vpor %ymm4, %ymm6, %ymm4
-; AVX512VL-NEXT: vpxor %xmm6, %xmm6, %xmm6
-; AVX512VL-NEXT: vpcmpeqb %ymm6, %ymm5, %ymm5
-; AVX512VL-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
-; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm4
-; AVX512VL-NEXT: vpand %ymm7, %ymm4, %ymm4
-; AVX512VL-NEXT: vpand %ymm2, %ymm8, %ymm2
-; AVX512VL-NEXT: vpsllw $5, %ymm2, %ymm5
-; AVX512VL-NEXT: vpblendvb %ymm5, %ymm4, %ymm0, %ymm4
-; AVX512VL-NEXT: vpsllw $2, %ymm4, %ymm7
-; AVX512VL-NEXT: vpand %ymm7, %ymm11, %ymm7
-; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5
-; AVX512VL-NEXT: vpblendvb %ymm5, %ymm7, %ymm4, %ymm4
-; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm7
-; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5
-; AVX512VL-NEXT: vpblendvb %ymm5, %ymm7, %ymm4, %ymm4
-; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm5
-; AVX512VL-NEXT: vpand %ymm5, %ymm10, %ymm5
-; AVX512VL-NEXT: vpsubb %ymm2, %ymm12, %ymm7
-; AVX512VL-NEXT: vpsllw $5, %ymm7, %ymm7
-; AVX512VL-NEXT: vpblendvb %ymm7, %ymm5, %ymm1, %ymm1
+; AVX512VL-NEXT: vpsrlw $4, %ymm4, %ymm3
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT: vpand %ymm5, %ymm3, %ymm6
+; AVX512VL-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm2, %ymm3
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VL-NEXT: vpsubb %ymm3, %ymm7, %ymm8
+; AVX512VL-NEXT: vpsllw $5, %ymm8, %ymm8
+; AVX512VL-NEXT: vpblendvb %ymm8, %ymm6, %ymm4, %ymm4
+; AVX512VL-NEXT: vpsrlw $2, %ymm4, %ymm6
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm9 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX512VL-NEXT: vpand %ymm6, %ymm9, %ymm6
+; AVX512VL-NEXT: vpaddb %ymm8, %ymm8, %ymm8
+; AVX512VL-NEXT: vpblendvb %ymm8, %ymm6, %ymm4, %ymm4
+; AVX512VL-NEXT: vpsrlw $1, %ymm4, %ymm6
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm10 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512VL-NEXT: vpand %ymm6, %ymm10, %ymm6
+; AVX512VL-NEXT: vpaddb %ymm8, %ymm8, %ymm8
+; AVX512VL-NEXT: vpblendvb %ymm8, %ymm6, %ymm4, %ymm4
+; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm6
+; AVX512VL-NEXT: vpand %ymm5, %ymm6, %ymm5
+; AVX512VL-NEXT: vpsubb %ymm2, %ymm7, %ymm6
+; AVX512VL-NEXT: vpsllw $5, %ymm6, %ymm6
+; AVX512VL-NEXT: vpblendvb %ymm6, %ymm5, %ymm1, %ymm1
; AVX512VL-NEXT: vpsrlw $2, %ymm1, %ymm5
-; AVX512VL-NEXT: vpand %ymm5, %ymm14, %ymm5
-; AVX512VL-NEXT: vpaddb %ymm7, %ymm7, %ymm7
-; AVX512VL-NEXT: vpblendvb %ymm7, %ymm5, %ymm1, %ymm1
+; AVX512VL-NEXT: vpand %ymm5, %ymm9, %ymm5
+; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm6
+; AVX512VL-NEXT: vpblendvb %ymm6, %ymm5, %ymm1, %ymm1
; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm5
-; AVX512VL-NEXT: vpand %ymm5, %ymm15, %ymm5
+; AVX512VL-NEXT: vpand %ymm5, %ymm10, %ymm5
+; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm6
+; AVX512VL-NEXT: vpblendvb %ymm6, %ymm5, %ymm1, %ymm1
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm4
+; AVX512VL-NEXT: vpsllw $4, %ymm4, %ymm5
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512VL-NEXT: vpand %ymm6, %ymm5, %ymm5
+; AVX512VL-NEXT: vpsllw $5, %ymm3, %ymm7
+; AVX512VL-NEXT: vpblendvb %ymm7, %ymm5, %ymm4, %ymm4
+; AVX512VL-NEXT: vpsllw $2, %ymm4, %ymm5
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX512VL-NEXT: vpand %ymm5, %ymm8, %ymm5
; AVX512VL-NEXT: vpaddb %ymm7, %ymm7, %ymm7
-; AVX512VL-NEXT: vpblendvb %ymm7, %ymm5, %ymm1, %ymm1
-; AVX512VL-NEXT: vpor %ymm1, %ymm4, %ymm1
-; AVX512VL-NEXT: vpcmpeqb %ymm6, %ymm2, %ymm2
-; AVX512VL-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512VL-NEXT: vpblendvb %ymm7, %ymm5, %ymm4, %ymm4
+; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm5
+; AVX512VL-NEXT: vpaddb %ymm7, %ymm7, %ymm7
+; AVX512VL-NEXT: vpblendvb %ymm7, %ymm5, %ymm4, %ymm4
+; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm5
+; AVX512VL-NEXT: vpand %ymm6, %ymm5, %ymm5
+; AVX512VL-NEXT: vpsllw $5, %ymm2, %ymm6
+; AVX512VL-NEXT: vpblendvb %ymm6, %ymm5, %ymm0, %ymm5
+; AVX512VL-NEXT: vpsllw $2, %ymm5, %ymm7
+; AVX512VL-NEXT: vpand %ymm7, %ymm8, %ymm7
+; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm6
+; AVX512VL-NEXT: vpblendvb %ymm6, %ymm7, %ymm5, %ymm5
+; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm7
+; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm6
+; AVX512VL-NEXT: vpblendvb %ymm6, %ymm7, %ymm5, %ymm5
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
+; AVX512VL-NEXT: vporq %zmm1, %zmm4, %zmm1
+; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm3
+; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm2, %ymm2
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512VL-NEXT: vpternlogq $226, %zmm1, %zmm2, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v64i8:
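var_funnnel_v64i8 follows the same shape as the v32i16 case. There is no variable byte shift, so each half is still shifted with vpsllw/vpsrlw bit-ladders steered by vpblendvb on YMM, but the amount masking (a vpandq against an in-memory constant, replacing the old in-register [7,7,...] vpand), the OR of the shifted halves (vporq), and the zero-amount select (vpternlogq) are now single full-width ZMM operations, which also frees the ymm8 through ymm15 temporaries the old sequence needed.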
define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %amt) nounwind {
; AVX512F-LABEL: splatvar_funnnel_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm4
; AVX512F-NEXT: vpbroadcastw %xmm2, %ymm2
-; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX512F-NEXT: vpsllw %xmm5, %ymm4, %ymm6
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = [16,16,16,16,16,16,16,16]
-; AVX512F-NEXT: vpsubw %xmm2, %xmm7, %xmm7
-; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero
-; AVX512F-NEXT: vpsrlw %xmm7, %ymm3, %ymm3
-; AVX512F-NEXT: vpor %ymm3, %ymm6, %ymm3
-; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6
-; AVX512F-NEXT: vpcmpeqw %ymm6, %ymm2, %ymm2
-; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
-; AVX512F-NEXT: vpsllw %xmm5, %ymm0, %ymm4
-; AVX512F-NEXT: vpsrlw %xmm7, %ymm1, %ymm1
-; AVX512F-NEXT: vpor %ymm1, %ymm4, %ymm1
-; AVX512F-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2
+; AVX512F-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm4
+; AVX512F-NEXT: vpsllw %xmm3, %ymm4, %ymm4
+; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm3
+; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3
+; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm4
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
+; AVX512F-NEXT: vpsubw %xmm4, %xmm5, %xmm6
+; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm7
+; AVX512F-NEXT: vpsrlw %xmm6, %ymm7, %ymm6
+; AVX512F-NEXT: vpsubw %xmm2, %xmm5, %xmm5
+; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero
+; AVX512F-NEXT: vpsrlw %xmm5, %ymm1, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm6, %zmm1, %zmm1
+; AVX512F-NEXT: vporq %zmm1, %zmm3, %zmm1
+; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX512F-NEXT: vpcmpeqw %ymm3, %ymm4, %ymm4
+; AVX512F-NEXT: vpcmpeqw %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2
+; AVX512F-NEXT: vpternlogq $226, %zmm1, %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_funnnel_v32i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm4
; AVX512VL-NEXT: vpbroadcastw %xmm2, %ymm2
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX512VL-NEXT: vpsllw %xmm5, %ymm4, %ymm6
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm7 = [16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: vpsubw %xmm2, %xmm7, %xmm7
-; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero
-; AVX512VL-NEXT: vpsrlw %xmm7, %ymm3, %ymm3
-; AVX512VL-NEXT: vpor %ymm3, %ymm6, %ymm3
-; AVX512VL-NEXT: vpxor %xmm6, %xmm6, %xmm6
-; AVX512VL-NEXT: vpcmpeqw %ymm6, %ymm2, %ymm2
-; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
-; AVX512VL-NEXT: vpsllw %xmm5, %ymm0, %ymm4
-; AVX512VL-NEXT: vpsrlw %xmm7, %ymm1, %ymm1
-; AVX512VL-NEXT: vpor %ymm1, %ymm4, %ymm1
-; AVX512VL-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2
+; AVX512VL-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm4
+; AVX512VL-NEXT: vpsllw %xmm3, %ymm4, %ymm4
+; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm3
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm2, %ymm4
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
+; AVX512VL-NEXT: vpsubw %xmm4, %xmm5, %xmm6
+; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm7
+; AVX512VL-NEXT: vpsrlw %xmm6, %ymm7, %ymm6
+; AVX512VL-NEXT: vpsubw %xmm2, %xmm5, %xmm5
+; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero
+; AVX512VL-NEXT: vpsrlw %xmm5, %ymm1, %ymm1
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm6, %zmm1, %zmm1
+; AVX512VL-NEXT: vporq %zmm1, %zmm3, %zmm1
+; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT: vpcmpeqw %ymm3, %ymm4, %ymm4
+; AVX512VL-NEXT: vpcmpeqw %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2
+; AVX512VL-NEXT: vpternlogq $226, %zmm1, %zmm2, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_funnnel_v32i16:
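The splatvar variants use a single scalar shift amount. The new sequence broadcasts it (vpbroadcastw), widens the broadcast to ZMM with vinserti64x4, and masks it with one vpandq; the shifts themselves still take an XMM count through vpsllw/vpsrlw, and the zero-amount select is again a single vpternlogq. Such tests are typically written with an explicit splat, sketched here under the same fshl assumption with illustrative names:

declare <32 x i16> @llvm.fshl.v32i16(<32 x i16>, <32 x i16>, <32 x i16>)

define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %amt) nounwind {
  ; Splat lane 0 of %amt across all 32 lanes, then funnel-shift uniformly.
  %splat = shufflevector <32 x i16> %amt, <32 x i16> undef, <32 x i32> zeroinitializer
  %res = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %splat)
  ret <32 x i16> %res
}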
define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt) nounwind {
; AVX512F-LABEL: splatvar_funnnel_v64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm9
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm4
; AVX512F-NEXT: vpbroadcastb %xmm2, %ymm2
-; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT: vpsllw %xmm5, %ymm4, %ymm6
-; AVX512F-NEXT: vpcmpeqd %xmm8, %xmm8, %xmm8
-; AVX512F-NEXT: vpsllw %xmm5, %xmm8, %xmm7
-; AVX512F-NEXT: vpbroadcastb %xmm7, %ymm7
-; AVX512F-NEXT: vpand %ymm7, %ymm6, %ymm10
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512F-NEXT: vpsubb %xmm2, %xmm3, %xmm3
-; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT: vpsrlw %xmm3, %ymm9, %ymm9
-; AVX512F-NEXT: vpsrlw %xmm3, %xmm8, %xmm6
-; AVX512F-NEXT: vpsrlw $8, %xmm6, %xmm6
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2
+; AVX512F-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm4
+; AVX512F-NEXT: vpsllw %xmm3, %ymm4, %ymm4
+; AVX512F-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
+; AVX512F-NEXT: vpsllw %xmm3, %xmm5, %xmm6
; AVX512F-NEXT: vpbroadcastb %xmm6, %ymm6
-; AVX512F-NEXT: vpand %ymm6, %ymm9, %ymm8
-; AVX512F-NEXT: vpor %ymm8, %ymm10, %ymm8
-; AVX512F-NEXT: vpxor %xmm9, %xmm9, %xmm9
-; AVX512F-NEXT: vpcmpeqb %ymm2, %ymm9, %ymm2
-; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm8, %ymm4
-; AVX512F-NEXT: vpsllw %xmm5, %ymm0, %ymm5
-; AVX512F-NEXT: vpand %ymm7, %ymm5, %ymm5
-; AVX512F-NEXT: vpsrlw %xmm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1
-; AVX512F-NEXT: vpor %ymm1, %ymm5, %ymm1
-; AVX512F-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
+; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4
+; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm3
+; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3
+; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3
+; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm4
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512F-NEXT: vpsubb %xmm4, %xmm6, %xmm7
+; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,zero,zero,zero,zero,xmm7[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm8
+; AVX512F-NEXT: vpsrlw %xmm7, %ymm8, %ymm8
+; AVX512F-NEXT: vpsrlw %xmm7, %xmm5, %xmm7
+; AVX512F-NEXT: vpsrlw $8, %xmm7, %xmm7
+; AVX512F-NEXT: vpbroadcastb %xmm7, %ymm7
+; AVX512F-NEXT: vpand %ymm7, %ymm8, %ymm7
+; AVX512F-NEXT: vpsubb %xmm2, %xmm6, %xmm6
+; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,zero,zero,zero,zero,xmm6[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT: vpsrlw %xmm6, %ymm1, %ymm1
+; AVX512F-NEXT: vpsrlw %xmm6, %xmm5, %xmm5
+; AVX512F-NEXT: vpsrlw $8, %xmm5, %xmm5
+; AVX512F-NEXT: vpbroadcastb %xmm5, %ymm5
+; AVX512F-NEXT: vpand %ymm5, %ymm1, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm7, %zmm1, %zmm1
+; AVX512F-NEXT: vporq %zmm1, %zmm3, %zmm1
+; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX512F-NEXT: vpcmpeqb %ymm3, %ymm4, %ymm4
+; AVX512F-NEXT: vpcmpeqb %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2
+; AVX512F-NEXT: vpternlogq $226, %zmm1, %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_funnnel_v64i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm9
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm4
; AVX512VL-NEXT: vpbroadcastb %xmm2, %ymm2
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT: vpsllw %xmm5, %ymm4, %ymm6
-; AVX512VL-NEXT: vpcmpeqd %xmm8, %xmm8, %xmm8
-; AVX512VL-NEXT: vpsllw %xmm5, %xmm8, %xmm7
-; AVX512VL-NEXT: vpbroadcastb %xmm7, %ymm7
-; AVX512VL-NEXT: vpand %ymm7, %ymm6, %ymm10
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VL-NEXT: vpsubb %xmm2, %xmm3, %xmm3
-; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT: vpsrlw %xmm3, %ymm9, %ymm9
-; AVX512VL-NEXT: vpsrlw %xmm3, %xmm8, %xmm6
-; AVX512VL-NEXT: vpsrlw $8, %xmm6, %xmm6
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2
+; AVX512VL-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm4
+; AVX512VL-NEXT: vpsllw %xmm3, %ymm4, %ymm4
+; AVX512VL-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
+; AVX512VL-NEXT: vpsllw %xmm3, %xmm5, %xmm6
; AVX512VL-NEXT: vpbroadcastb %xmm6, %ymm6
-; AVX512VL-NEXT: vpand %ymm6, %ymm9, %ymm8
-; AVX512VL-NEXT: vpor %ymm8, %ymm10, %ymm8
-; AVX512VL-NEXT: vpxor %xmm9, %xmm9, %xmm9
-; AVX512VL-NEXT: vpcmpeqb %ymm2, %ymm9, %ymm2
-; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm8, %ymm4
-; AVX512VL-NEXT: vpsllw %xmm5, %ymm0, %ymm5
-; AVX512VL-NEXT: vpand %ymm7, %ymm5, %ymm5
-; AVX512VL-NEXT: vpsrlw %xmm3, %ymm1, %ymm1
-; AVX512VL-NEXT: vpand %ymm6, %ymm1, %ymm1
-; AVX512VL-NEXT: vpor %ymm1, %ymm5, %ymm1
-; AVX512VL-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
+; AVX512VL-NEXT: vpand %ymm6, %ymm4, %ymm4
+; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm3
+; AVX512VL-NEXT: vpand %ymm6, %ymm3, %ymm3
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm2, %ymm4
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VL-NEXT: vpsubb %xmm4, %xmm6, %xmm7
+; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,zero,zero,zero,zero,xmm7[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm8
+; AVX512VL-NEXT: vpsrlw %xmm7, %ymm8, %ymm8
+; AVX512VL-NEXT: vpsrlw %xmm7, %xmm5, %xmm7
+; AVX512VL-NEXT: vpsrlw $8, %xmm7, %xmm7
+; AVX512VL-NEXT: vpbroadcastb %xmm7, %ymm7
+; AVX512VL-NEXT: vpand %ymm7, %ymm8, %ymm7
+; AVX512VL-NEXT: vpsubb %xmm2, %xmm6, %xmm6
+; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,zero,zero,zero,zero,xmm6[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT: vpsrlw %xmm6, %ymm1, %ymm1
+; AVX512VL-NEXT: vpsrlw %xmm6, %xmm5, %xmm5
+; AVX512VL-NEXT: vpsrlw $8, %xmm5, %xmm5
+; AVX512VL-NEXT: vpbroadcastb %xmm5, %ymm5
+; AVX512VL-NEXT: vpand %ymm5, %ymm1, %ymm1
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm7, %zmm1, %zmm1
+; AVX512VL-NEXT: vporq %zmm1, %zmm3, %zmm1
+; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT: vpcmpeqb %ymm3, %ymm4, %ymm4
+; AVX512VL-NEXT: vpcmpeqb %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2
+; AVX512VL-NEXT: vpternlogq $226, %zmm1, %zmm2, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_funnnel_v64i8:
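splatvar_funnnel_v64i8 mirrors the v32i16 splat case with the usual byte-shift fixup: vpsllw/vpsrlw operate on words, so the shifted-in garbage is cleared by applying the same count to an all-ones XMM (materialized with vpcmpeqd), broadcasting the result with vpbroadcastb, and ANDing it into each half; the combine and the zero-amount select still happen once at ZMM width with vporq and vpternlogq.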
define <32 x i16> @constant_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y) nounwind {
; AVX512F-LABEL: constant_funnnel_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = <u,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768>
-; AVX512F-NEXT: vpmulhuw %ymm4, %ymm3, %ymm3
-; AVX512F-NEXT: vpmullw %ymm4, %ymm2, %ymm5
-; AVX512F-NEXT: vpor %ymm3, %ymm5, %ymm3
-; AVX512F-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3,4,5,6,7]
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-NEXT: vpmulhuw %ymm4, %ymm1, %ymm1
-; AVX512F-NEXT: vpmullw %ymm4, %ymm0, %ymm3
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = <u,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768>
+; AVX512F-NEXT: vpmulhuw %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm4
+; AVX512F-NEXT: vpmullw %ymm3, %ymm4, %ymm5
+; AVX512F-NEXT: vpor %ymm2, %ymm5, %ymm2
+; AVX512F-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm2[1,2,3,4,5,6,7]
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
+; AVX512F-NEXT: vpmulhuw %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpmullw %ymm3, %ymm0, %ymm3
; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1
; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
;
; AVX512VL-LABEL: constant_funnnel_v32i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = <u,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768>
-; AVX512VL-NEXT: vpmulhuw %ymm4, %ymm3, %ymm3
-; AVX512VL-NEXT: vpmullw %ymm4, %ymm2, %ymm5
-; AVX512VL-NEXT: vpor %ymm3, %ymm5, %ymm3
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512VL-NEXT: vpmulhuw %ymm4, %ymm1, %ymm1
-; AVX512VL-NEXT: vpmullw %ymm4, %ymm0, %ymm3
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = <u,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768>
+; AVX512VL-NEXT: vpmulhuw %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm4
+; AVX512VL-NEXT: vpmullw %ymm3, %ymm4, %ymm5
+; AVX512VL-NEXT: vpor %ymm2, %ymm5, %ymm2
+; AVX512VL-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm2[1,2,3,4,5,6,7]
+; AVX512VL-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
+; AVX512VL-NEXT: vpmulhuw %ymm3, %ymm1, %ymm1
+; AVX512VL-NEXT: vpmullw %ymm3, %ymm0, %ymm3
; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1
; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
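For constant amounts the v32i16 shifts fold into multiplies: vpmullw by a power-of-two constant produces the left-shifted part and vpmulhuw the right-shifted part, and lane 0 (shift amount 0, the 'u' in the constant vector) is re-blended from the original value via vpblendw and vpblendd. Both inputs already fit one ZMM each, so the diff only moves the vextracti64x4 of each upper half next to its first use instead of hoisting both to the top.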
define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
; AVX512F-LABEL: constant_funnnel_v64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; AVX512F-NEXT: vpsllw $4, %ymm3, %ymm4
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4
-; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
-; AVX512F-NEXT: # ymm6 = mem[0,1,0,1]
-; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm4
-; AVX512F-NEXT: vpsllw $2, %ymm4, %ymm7
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
-; AVX512F-NEXT: vpand %ymm7, %ymm8, %ymm7
-; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm9
-; AVX512F-NEXT: vpblendvb %ymm9, %ymm7, %ymm4, %ymm4
-; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm7
-; AVX512F-NEXT: vpaddb %ymm9, %ymm9, %ymm10
-; AVX512F-NEXT: vpblendvb %ymm10, %ymm7, %ymm4, %ymm4
-; AVX512F-NEXT: vpxor %xmm7, %xmm7, %xmm7
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm11 = ymm2[8],ymm7[8],ymm2[9],ymm7[9],ymm2[10],ymm7[10],ymm2[11],ymm7[11],ymm2[12],ymm7[12],ymm2[13],ymm7[13],ymm2[14],ymm7[14],ymm2[15],ymm7[15],ymm2[24],ymm7[24],ymm2[25],ymm7[25],ymm2[26],ymm7[26],ymm2[27],ymm7[27],ymm2[28],ymm7[28],ymm2[29],ymm7[29],ymm2[30],ymm7[30],ymm2[31],ymm7[31]
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512F-NEXT: vpsllw $4, %ymm2, %ymm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
+; AVX512F-NEXT: # ymm5 = mem[0,1,0,1]
+; AVX512F-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm3
+; AVX512F-NEXT: vpsllw $2, %ymm3, %ymm6
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX512F-NEXT: vpand %ymm7, %ymm6, %ymm6
+; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm8
+; AVX512F-NEXT: vpblendvb %ymm8, %ymm6, %ymm3, %ymm3
+; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm6
+; AVX512F-NEXT: vpaddb %ymm8, %ymm8, %ymm9
+; AVX512F-NEXT: vpblendvb %ymm9, %ymm6, %ymm3, %ymm3
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm6
+; AVX512F-NEXT: vpxor %xmm10, %xmm10, %xmm10
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm11 = ymm6[8],ymm10[8],ymm6[9],ymm10[9],ymm6[10],ymm10[10],ymm6[11],ymm10[11],ymm6[12],ymm10[12],ymm6[13],ymm10[13],ymm6[14],ymm10[14],ymm6[15],ymm10[15],ymm6[24],ymm10[24],ymm6[25],ymm10[25],ymm6[26],ymm10[26],ymm6[27],ymm10[27],ymm6[28],ymm10[28],ymm6[29],ymm10[29],ymm6[30],ymm10[30],ymm6[31],ymm10[31]
; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
; AVX512F-NEXT: # ymm12 = mem[0,1,0,1]
; AVX512F-NEXT: vpmullw %ymm12, %ymm11, %ymm11
; AVX512F-NEXT: vpsrlw $8, %ymm11, %ymm11
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm7[0],ymm2[1],ymm7[1],ymm2[2],ymm7[2],ymm2[3],ymm7[3],ymm2[4],ymm7[4],ymm2[5],ymm7[5],ymm2[6],ymm7[6],ymm2[7],ymm7[7],ymm2[16],ymm7[16],ymm2[17],ymm7[17],ymm2[18],ymm7[18],ymm2[19],ymm7[19],ymm2[20],ymm7[20],ymm2[21],ymm7[21],ymm2[22],ymm7[22],ymm2[23],ymm7[23]
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm6[0],ymm10[0],ymm6[1],ymm10[1],ymm6[2],ymm10[2],ymm6[3],ymm10[3],ymm6[4],ymm10[4],ymm6[5],ymm10[5],ymm6[6],ymm10[6],ymm6[7],ymm10[7],ymm6[16],ymm10[16],ymm6[17],ymm10[17],ymm6[18],ymm10[18],ymm6[19],ymm10[19],ymm6[20],ymm10[20],ymm6[21],ymm10[21],ymm6[22],ymm10[22],ymm6[23],ymm10[23]
; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128]
; AVX512F-NEXT: # ymm13 = mem[0,1,0,1]
-; AVX512F-NEXT: vpmullw %ymm2, %ymm13, %ymm2
-; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
-; AVX512F-NEXT: vpackuswb %ymm11, %ymm2, %ymm2
-; AVX512F-NEXT: vpor %ymm2, %ymm4, %ymm2
-; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm4 = [18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360]
-; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpmullw %ymm6, %ymm13, %ymm6
+; AVX512F-NEXT: vpsrlw $8, %ymm6, %ymm6
+; AVX512F-NEXT: vpackuswb %ymm11, %ymm6, %ymm6
+; AVX512F-NEXT: vpor %ymm6, %ymm3, %ymm3
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm6 = [18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360]
+; AVX512F-NEXT: vpblendvb %ymm6, %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm3
-; AVX512F-NEXT: vpand %ymm5, %ymm3, %ymm3
-; AVX512F-NEXT: vpblendvb %ymm6, %ymm3, %ymm0, %ymm3
-; AVX512F-NEXT: vpsllw $2, %ymm3, %ymm5
-; AVX512F-NEXT: vpand %ymm5, %ymm8, %ymm5
-; AVX512F-NEXT: vpblendvb %ymm9, %ymm5, %ymm3, %ymm3
-; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm5
-; AVX512F-NEXT: vpblendvb %ymm10, %ymm5, %ymm3, %ymm3
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8],ymm7[8],ymm1[9],ymm7[9],ymm1[10],ymm7[10],ymm1[11],ymm7[11],ymm1[12],ymm7[12],ymm1[13],ymm7[13],ymm1[14],ymm7[14],ymm1[15],ymm7[15],ymm1[24],ymm7[24],ymm1[25],ymm7[25],ymm1[26],ymm7[26],ymm1[27],ymm7[27],ymm1[28],ymm7[28],ymm1[29],ymm7[29],ymm1[30],ymm7[30],ymm1[31],ymm7[31]
-; AVX512F-NEXT: vpmullw %ymm5, %ymm12, %ymm5
-; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm7[0],ymm1[1],ymm7[1],ymm1[2],ymm7[2],ymm1[3],ymm7[3],ymm1[4],ymm7[4],ymm1[5],ymm7[5],ymm1[6],ymm7[6],ymm1[7],ymm7[7],ymm1[16],ymm7[16],ymm1[17],ymm7[17],ymm1[18],ymm7[18],ymm1[19],ymm7[19],ymm1[20],ymm7[20],ymm1[21],ymm7[21],ymm1[22],ymm7[22],ymm1[23],ymm7[23]
+; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpblendvb %ymm5, %ymm3, %ymm0, %ymm3
+; AVX512F-NEXT: vpsllw $2, %ymm3, %ymm4
+; AVX512F-NEXT: vpand %ymm7, %ymm4, %ymm4
+; AVX512F-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm4
+; AVX512F-NEXT: vpblendvb %ymm9, %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm10[8],ymm1[9],ymm10[9],ymm1[10],ymm10[10],ymm1[11],ymm10[11],ymm1[12],ymm10[12],ymm1[13],ymm10[13],ymm1[14],ymm10[14],ymm1[15],ymm10[15],ymm1[24],ymm10[24],ymm1[25],ymm10[25],ymm1[26],ymm10[26],ymm1[27],ymm10[27],ymm1[28],ymm10[28],ymm1[29],ymm10[29],ymm1[30],ymm10[30],ymm1[31],ymm10[31]
+; AVX512F-NEXT: vpmullw %ymm4, %ymm12, %ymm4
+; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm10[0],ymm1[1],ymm10[1],ymm1[2],ymm10[2],ymm1[3],ymm10[3],ymm1[4],ymm10[4],ymm1[5],ymm10[5],ymm1[6],ymm10[6],ymm1[7],ymm10[7],ymm1[16],ymm10[16],ymm1[17],ymm10[17],ymm1[18],ymm10[18],ymm1[19],ymm10[19],ymm1[20],ymm10[20],ymm1[21],ymm10[21],ymm1[22],ymm10[22],ymm1[23],ymm10[23]
; AVX512F-NEXT: vpmullw %ymm1, %ymm13, %ymm1
; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX512F-NEXT: vpackuswb %ymm5, %ymm1, %ymm1
+; AVX512F-NEXT: vpackuswb %ymm4, %ymm1, %ymm1
; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1
-; AVX512F-NEXT: vpblendvb %ymm4, %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: constant_funnnel_v64i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; AVX512VL-NEXT: vpsllw $4, %ymm3, %ymm4
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4
-; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
-; AVX512VL-NEXT: # ymm6 = mem[0,1,0,1]
-; AVX512VL-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm4
-; AVX512VL-NEXT: vpsllw $2, %ymm4, %ymm7
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
-; AVX512VL-NEXT: vpand %ymm7, %ymm8, %ymm7
-; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm9
-; AVX512VL-NEXT: vpblendvb %ymm9, %ymm7, %ymm4, %ymm4
-; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm7
-; AVX512VL-NEXT: vpaddb %ymm9, %ymm9, %ymm10
-; AVX512VL-NEXT: vpblendvb %ymm10, %ymm7, %ymm4, %ymm4
-; AVX512VL-NEXT: vpxor %xmm7, %xmm7, %xmm7
-; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm11 = ymm2[8],ymm7[8],ymm2[9],ymm7[9],ymm2[10],ymm7[10],ymm2[11],ymm7[11],ymm2[12],ymm7[12],ymm2[13],ymm7[13],ymm2[14],ymm7[14],ymm2[15],ymm7[15],ymm2[24],ymm7[24],ymm2[25],ymm7[25],ymm2[26],ymm7[26],ymm2[27],ymm7[27],ymm2[28],ymm7[28],ymm2[29],ymm7[29],ymm2[30],ymm7[30],ymm2[31],ymm7[31]
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512VL-NEXT: vpsllw $4, %ymm2, %ymm3
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512VL-NEXT: vpand %ymm4, %ymm3, %ymm3
+; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
+; AVX512VL-NEXT: # ymm5 = mem[0,1,0,1]
+; AVX512VL-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm3
+; AVX512VL-NEXT: vpsllw $2, %ymm3, %ymm6
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX512VL-NEXT: vpand %ymm7, %ymm6, %ymm6
+; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm8
+; AVX512VL-NEXT: vpblendvb %ymm8, %ymm6, %ymm3, %ymm3
+; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm6
+; AVX512VL-NEXT: vpaddb %ymm8, %ymm8, %ymm9
+; AVX512VL-NEXT: vpblendvb %ymm9, %ymm6, %ymm3, %ymm3
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm6
+; AVX512VL-NEXT: vpxor %xmm10, %xmm10, %xmm10
+; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm11 = ymm6[8],ymm10[8],ymm6[9],ymm10[9],ymm6[10],ymm10[10],ymm6[11],ymm10[11],ymm6[12],ymm10[12],ymm6[13],ymm10[13],ymm6[14],ymm10[14],ymm6[15],ymm10[15],ymm6[24],ymm10[24],ymm6[25],ymm10[25],ymm6[26],ymm10[26],ymm6[27],ymm10[27],ymm6[28],ymm10[28],ymm6[29],ymm10[29],ymm6[30],ymm10[30],ymm6[31],ymm10[31]
; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
; AVX512VL-NEXT: # ymm12 = mem[0,1,0,1]
; AVX512VL-NEXT: vpmullw %ymm12, %ymm11, %ymm11
; AVX512VL-NEXT: vpsrlw $8, %ymm11, %ymm11
-; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm7[0],ymm2[1],ymm7[1],ymm2[2],ymm7[2],ymm2[3],ymm7[3],ymm2[4],ymm7[4],ymm2[5],ymm7[5],ymm2[6],ymm7[6],ymm2[7],ymm7[7],ymm2[16],ymm7[16],ymm2[17],ymm7[17],ymm2[18],ymm7[18],ymm2[19],ymm7[19],ymm2[20],ymm7[20],ymm2[21],ymm7[21],ymm2[22],ymm7[22],ymm2[23],ymm7[23]
+; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm6[0],ymm10[0],ymm6[1],ymm10[1],ymm6[2],ymm10[2],ymm6[3],ymm10[3],ymm6[4],ymm10[4],ymm6[5],ymm10[5],ymm6[6],ymm10[6],ymm6[7],ymm10[7],ymm6[16],ymm10[16],ymm6[17],ymm10[17],ymm6[18],ymm10[18],ymm6[19],ymm10[19],ymm6[20],ymm10[20],ymm6[21],ymm10[21],ymm6[22],ymm10[22],ymm6[23],ymm10[23]
; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128]
; AVX512VL-NEXT: # ymm13 = mem[0,1,0,1]
-; AVX512VL-NEXT: vpmullw %ymm2, %ymm13, %ymm2
-; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2
-; AVX512VL-NEXT: vpackuswb %ymm11, %ymm2, %ymm2
-; AVX512VL-NEXT: vpor %ymm2, %ymm4, %ymm2
-; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm4 = [18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360]
-; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
+; AVX512VL-NEXT: vpmullw %ymm6, %ymm13, %ymm6
+; AVX512VL-NEXT: vpsrlw $8, %ymm6, %ymm6
+; AVX512VL-NEXT: vpackuswb %ymm11, %ymm6, %ymm6
+; AVX512VL-NEXT: vpor %ymm6, %ymm3, %ymm3
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm6 = [18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360]
+; AVX512VL-NEXT: vpblendvb %ymm6, %ymm3, %ymm2, %ymm2
; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm3
-; AVX512VL-NEXT: vpand %ymm5, %ymm3, %ymm3
-; AVX512VL-NEXT: vpblendvb %ymm6, %ymm3, %ymm0, %ymm3
-; AVX512VL-NEXT: vpsllw $2, %ymm3, %ymm5
-; AVX512VL-NEXT: vpand %ymm5, %ymm8, %ymm5
-; AVX512VL-NEXT: vpblendvb %ymm9, %ymm5, %ymm3, %ymm3
-; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm5
-; AVX512VL-NEXT: vpblendvb %ymm10, %ymm5, %ymm3, %ymm3
-; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8],ymm7[8],ymm1[9],ymm7[9],ymm1[10],ymm7[10],ymm1[11],ymm7[11],ymm1[12],ymm7[12],ymm1[13],ymm7[13],ymm1[14],ymm7[14],ymm1[15],ymm7[15],ymm1[24],ymm7[24],ymm1[25],ymm7[25],ymm1[26],ymm7[26],ymm1[27],ymm7[27],ymm1[28],ymm7[28],ymm1[29],ymm7[29],ymm1[30],ymm7[30],ymm1[31],ymm7[31]
-; AVX512VL-NEXT: vpmullw %ymm5, %ymm12, %ymm5
-; AVX512VL-NEXT: vpsrlw $8, %ymm5, %ymm5
-; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm7[0],ymm1[1],ymm7[1],ymm1[2],ymm7[2],ymm1[3],ymm7[3],ymm1[4],ymm7[4],ymm1[5],ymm7[5],ymm1[6],ymm7[6],ymm1[7],ymm7[7],ymm1[16],ymm7[16],ymm1[17],ymm7[17],ymm1[18],ymm7[18],ymm1[19],ymm7[19],ymm1[20],ymm7[20],ymm1[21],ymm7[21],ymm1[22],ymm7[22],ymm1[23],ymm7[23]
+; AVX512VL-NEXT: vpand %ymm4, %ymm3, %ymm3
+; AVX512VL-NEXT: vpblendvb %ymm5, %ymm3, %ymm0, %ymm3
+; AVX512VL-NEXT: vpsllw $2, %ymm3, %ymm4
+; AVX512VL-NEXT: vpand %ymm7, %ymm4, %ymm4
+; AVX512VL-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm3
+; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm4
+; AVX512VL-NEXT: vpblendvb %ymm9, %ymm4, %ymm3, %ymm3
+; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm10[8],ymm1[9],ymm10[9],ymm1[10],ymm10[10],ymm1[11],ymm10[11],ymm1[12],ymm10[12],ymm1[13],ymm10[13],ymm1[14],ymm10[14],ymm1[15],ymm10[15],ymm1[24],ymm10[24],ymm1[25],ymm10[25],ymm1[26],ymm10[26],ymm1[27],ymm10[27],ymm1[28],ymm10[28],ymm1[29],ymm10[29],ymm1[30],ymm10[30],ymm1[31],ymm10[31]
+; AVX512VL-NEXT: vpmullw %ymm4, %ymm12, %ymm4
+; AVX512VL-NEXT: vpsrlw $8, %ymm4, %ymm4
+; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm10[0],ymm1[1],ymm10[1],ymm1[2],ymm10[2],ymm1[3],ymm10[3],ymm1[4],ymm10[4],ymm1[5],ymm10[5],ymm1[6],ymm10[6],ymm1[7],ymm10[7],ymm1[16],ymm10[16],ymm1[17],ymm10[17],ymm1[18],ymm10[18],ymm1[19],ymm10[19],ymm1[20],ymm10[20],ymm1[21],ymm10[21],ymm1[22],ymm10[22],ymm1[23],ymm10[23]
; AVX512VL-NEXT: vpmullw %ymm1, %ymm13, %ymm1
; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX512VL-NEXT: vpackuswb %ymm5, %ymm1, %ymm1
+; AVX512VL-NEXT: vpackuswb %ymm4, %ymm1, %ymm1
; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1
-; AVX512VL-NEXT: vpblendvb %ymm4, %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
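In the constant v64i8 case above, the y contribution is computed by
zero-extending each byte half to words (vpunpckhbw/vpunpcklbw against zero),
multiplying by per-lane powers of two, and repacking the high bytes
(vpsrlw $8 + vpackuswb), while the x << c chain runs through the
vpsllw $4 / vpsllw $2 / vpaddb ladder of vpblendvb selects; the
0xFFFFFFFFFFFFFF00 blend mask appears to patch the one byte per 8-byte group
whose amount the trick cannot express. The bodies of these tests are elided by
the diff, but each is presumably a single funnel-shift intrinsic call of this
shape (an illustrative stand-in, not the exact test body)::

  define <64 x i8> @fsh_shape(<64 x i8> %x, <64 x i8> %y, <64 x i8> %z) {
    %r = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %z)
    ret <64 x i8> %r
  }
  declare <64 x i8> @llvm.fshl.v64i8(<64 x i8>, <64 x i8>, <64 x i8>)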
define <32 x i16> @splatconstant_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y) nounwind {
; AVX512F-LABEL: splatconstant_funnnel_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
-; AVX512F-NEXT: vpsrlw $9, %ymm3, %ymm3
-; AVX512F-NEXT: vpsllw $7, %ymm2, %ymm2
-; AVX512F-NEXT: vpor %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpsrlw $9, %ymm1, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1
; AVX512F-NEXT: vpsrlw $9, %ymm1, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
+; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
+; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_funnnel_v32i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
-; AVX512VL-NEXT: vpsrlw $9, %ymm3, %ymm3
-; AVX512VL-NEXT: vpsllw $7, %ymm2, %ymm2
-; AVX512VL-NEXT: vpor %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT: vpsrlw $9, %ymm1, %ymm2
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm1
; AVX512VL-NEXT: vpsrlw $9, %ymm1, %ymm1
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
+; AVX512VL-NEXT: vpsllw $7, %ymm0, %ymm2
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512VL-NEXT: vpsllw $7, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
+; AVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_funnnel_v32i16:
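With the splatted amount of 7, the v32i16 funnel shift reduces per lane to a
pair of shifts and an OR. The new sequence shifts both YMM halves of each
operand, reassembles them into ZMM registers with vinserti64x4, and combines
them with a single 512-bit vporq, instead of OR'ing per half and concatenating
afterwards. Per lane this computes (a minimal scalar sketch)::

  define i16 @fshl_by_7(i16 %x, i16 %y) {
    %hi = shl i16 %x, 7    ; the vpsllw $7 halves above
    %lo = lshr i16 %y, 9   ; 16 - 7 = 9, the vpsrlw $9 halves
    %r = or i16 %hi, %lo   ; now one zmm-wide vporq
    ret i16 %r
  }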
define <64 x i8> @splatconstant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
; AVX512F-LABEL: splatconstant_funnnel_v64i8:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512F-NEXT: vpsrlw $4, %ymm2, %ymm2
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
+; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
-; AVX512F-NEXT: vpsrlw $4, %ymm3, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; AVX512F-NEXT: vpandn %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsllw $4, %ymm2, %ymm2
-; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm2
-; AVX512F-NEXT: vpor %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
-; AVX512F-NEXT: vpandn %ymm1, %ymm4, %ymm1
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm0
-; AVX512F-NEXT: vpand %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_funnnel_v64i8:
; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512VL-NEXT: vpsrlw $4, %ymm2, %ymm2
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm1
+; AVX512VL-NEXT: vpand %ymm3, %ymm1, %ymm1
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
-; AVX512VL-NEXT: vpsrlw $4, %ymm3, %ymm3
; AVX512VL-NEXT: vpsllw $4, %ymm2, %ymm2
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; AVX512VL-NEXT: vpternlogq $226, %ymm3, %ymm4, %ymm2
-; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm1
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpternlogq $226, %ymm1, %ymm4, %ymm0
+; AVX512VL-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_funnnel_v64i8:
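x86 has no byte-granular vector shifts, so the splat-by-4 byte case runs each
half through word shifts plus masks: << 4 as vpsllw $4 with an 0xF0 mask, and
>> 4 as vpsrlw $4 with an 0x0F mask. The rewrite drops the fused per-half
combine (vpandn + vpand + vpor, or vpternlogq $226 on the VL target) in favor
of plain masks on each half and one zmm vporq at the end. The same
word-shift-plus-mask idiom can be seen in isolation with::

  define <16 x i8> @shl4_bytes(<16 x i8> %v) {
    ; selected as vpsllw $4 plus an 0xF0 byte mask, since no vpsllb exists
    %r = shl <16 x i8> %v, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
    ret <16 x i8> %r
  }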
define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind {
; AVX512F-LABEL: var_funnnel_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
-; AVX512F-NEXT: vpsllvd %zmm5, %zmm2, %zmm5
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512F-NEXT: vpsubw %ymm3, %ymm6, %ymm3
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
-; AVX512F-NEXT: vpsrlvd %zmm3, %zmm2, %zmm2
-; AVX512F-NEXT: vpord %zmm2, %zmm5, %zmm2
-; AVX512F-NEXT: vpmovdw %zmm2, %ymm2
-; AVX512F-NEXT: vpand %ymm4, %ymm1, %ymm1
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512F-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm3
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
+; AVX512F-NEXT: vpsllvd %zmm4, %zmm2, %zmm4
+; AVX512F-NEXT: vpmovdw %zmm4, %ymm4
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm3
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
; AVX512F-NEXT: vpsllvd %zmm3, %zmm0, %zmm3
-; AVX512F-NEXT: vpsubw %ymm1, %ymm6, %ymm1
+; AVX512F-NEXT: vpmovdw %zmm3, %ymm3
+; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3
+; AVX512F-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX512F-NEXT: vpsubw %ymm1, %ymm4, %ymm5
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero
+; AVX512F-NEXT: vpsrlvd %zmm5, %zmm2, %zmm2
+; AVX512F-NEXT: vpmovdw %zmm2, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; AVX512F-NEXT: vpsubw %ymm1, %ymm4, %ymm1
+; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512F-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpord %zmm0, %zmm3, %zmm0
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
+; AVX512F-NEXT: vporq %zmm0, %zmm3, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v32i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VL-NEXT: vpand %ymm4, %ymm3, %ymm3
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
-; AVX512VL-NEXT: vpsllvd %zmm5, %zmm2, %zmm5
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: vpsubw %ymm3, %ymm6, %ymm3
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
-; AVX512VL-NEXT: vpsrlvd %zmm3, %zmm2, %zmm2
-; AVX512VL-NEXT: vpord %zmm2, %zmm5, %zmm2
-; AVX512VL-NEXT: vpmovdw %zmm2, %ymm2
-; AVX512VL-NEXT: vpand %ymm4, %ymm1, %ymm1
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512VL-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm3
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
+; AVX512VL-NEXT: vpsllvd %zmm4, %zmm2, %zmm4
+; AVX512VL-NEXT: vpmovdw %zmm4, %ymm4
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm3, %ymm3
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
; AVX512VL-NEXT: vpsllvd %zmm3, %zmm0, %zmm3
-; AVX512VL-NEXT: vpsubw %ymm1, %ymm6, %ymm1
+; AVX512VL-NEXT: vpmovdw %zmm3, %ymm3
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3
+; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX512VL-NEXT: vpsubw %ymm1, %ymm4, %ymm5
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT: vpand %ymm6, %ymm5, %ymm5
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero
+; AVX512VL-NEXT: vpsrlvd %zmm5, %zmm2, %zmm2
+; AVX512VL-NEXT: vpmovdw %zmm2, %ymm2
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; AVX512VL-NEXT: vpsubw %ymm1, %ymm4, %ymm1
+; AVX512VL-NEXT: vpand %ymm6, %ymm1, %ymm1
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512VL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
-; AVX512VL-NEXT: vpord %zmm0, %zmm3, %zmm0
; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
+; AVX512VL-NEXT: vporq %zmm0, %zmm3, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v32i16:
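The var_ and splatvar_ tests that take a single %amt operand are rotates, i.e.
funnel shifts with both data operands equal. Without vpsllvw (an AVX512BW
instruction), each v16i16 half is zero-extended to i32 lanes, shifted with
vpsllvd/vpsrlvd, and truncated back with vpmovdw; the new code masks the
amounts once with a 512-bit vpandq, keeps both halves of each partial result
live as one ZMM, and merges them with a final vporq. Per lane the lowering
computes (a scalar sketch of the amount arithmetic visible above)::

  define i16 @rotl16(i16 %x, i16 %a) {
    %shamt = and i16 %a, 15        ; the vpandq {{.*}}(%rip) mask
    %shl = shl i16 %x, %shamt      ; vpsllvd on the widened lanes
    %neg = sub i16 0, %a           ; vpsubw from zero
    %ramt = and i16 %neg, 15
    %lshr = lshr i16 %x, %ramt     ; vpsrlvd on the widened lanes
    %r = or i16 %shl, %lshr        ; the final vporq
    ret i16 %r
  }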
define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind {
; AVX512F-LABEL: var_funnnel_v64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; AVX512F-NEXT: vpsrlw $4, %ymm3, %ymm4
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512F-NEXT: vpsrlw $4, %ymm2, %ymm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm5
+; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX512F-NEXT: vpsubb %ymm5, %ymm6, %ymm5
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512F-NEXT: vpand %ymm7, %ymm5, %ymm5
+; AVX512F-NEXT: vpsllw $5, %ymm5, %ymm5
+; AVX512F-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm3
+; AVX512F-NEXT: vpsrlw $2, %ymm3, %ymm8
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX512F-NEXT: vpand %ymm9, %ymm8, %ymm8
+; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm5
+; AVX512F-NEXT: vpblendvb %ymm5, %ymm8, %ymm3, %ymm3
+; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm8
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm10 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT: vpand %ymm10, %ymm8, %ymm8
+; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm5
+; AVX512F-NEXT: vpblendvb %ymm5, %ymm8, %ymm3, %ymm3
+; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm5
+; AVX512F-NEXT: vpand %ymm4, %ymm5, %ymm4
+; AVX512F-NEXT: vpsubb %ymm1, %ymm6, %ymm5
+; AVX512F-NEXT: vpand %ymm7, %ymm5, %ymm5
+; AVX512F-NEXT: vpsllw $5, %ymm5, %ymm5
+; AVX512F-NEXT: vpblendvb %ymm5, %ymm4, %ymm0, %ymm4
+; AVX512F-NEXT: vpsrlw $2, %ymm4, %ymm6
+; AVX512F-NEXT: vpand %ymm6, %ymm9, %ymm6
+; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm5
+; AVX512F-NEXT: vpblendvb %ymm5, %ymm6, %ymm4, %ymm4
+; AVX512F-NEXT: vpsrlw $1, %ymm4, %ymm6
+; AVX512F-NEXT: vpand %ymm6, %ymm10, %ymm6
+; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm5
+; AVX512F-NEXT: vpblendvb %ymm5, %ymm6, %ymm4, %ymm4
+; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3
+; AVX512F-NEXT: vpsllw $4, %ymm2, %ymm4
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; AVX512F-NEXT: vpandn %ymm4, %ymm5, %ymm4
-; AVX512F-NEXT: vpsllw $4, %ymm3, %ymm6
-; AVX512F-NEXT: vpand %ymm5, %ymm6, %ymm6
-; AVX512F-NEXT: vpor %ymm4, %ymm6, %ymm4
-; AVX512F-NEXT: vpsllw $5, %ymm2, %ymm2
-; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
-; AVX512F-NEXT: vpsrlw $6, %ymm3, %ymm4
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
-; AVX512F-NEXT: vpandn %ymm4, %ymm6, %ymm4
-; AVX512F-NEXT: vpsllw $2, %ymm3, %ymm7
-; AVX512F-NEXT: vpand %ymm6, %ymm7, %ymm7
-; AVX512F-NEXT: vpor %ymm4, %ymm7, %ymm4
-; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2
-; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
-; AVX512F-NEXT: vpsrlw $7, %ymm3, %ymm4
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4
+; AVX512F-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm6
+; AVX512F-NEXT: vpsllw $5, %ymm6, %ymm6
+; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm2, %ymm2
+; AVX512F-NEXT: vpsllw $2, %ymm2, %ymm4
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX512F-NEXT: vpand %ymm7, %ymm4, %ymm4
-; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm8
-; AVX512F-NEXT: vpor %ymm4, %ymm8, %ymm4
-; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2
-; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
-; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm3
-; AVX512F-NEXT: vpandn %ymm3, %ymm5, %ymm3
+; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm6
+; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm2, %ymm2
+; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm4
+; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm6
+; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm2, %ymm2
; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm4
; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4
-; AVX512F-NEXT: vpor %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm1
-; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw $6, %ymm0, %ymm3
-; AVX512F-NEXT: vpandn %ymm3, %ymm6, %ymm3
+; AVX512F-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm4
-; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4
-; AVX512F-NEXT: vpor %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT: vpand %ymm7, %ymm4, %ymm4
; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
-; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm3
-; AVX512F-NEXT: vpand %ymm7, %ymm3, %ymm3
+; AVX512F-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm4
-; AVX512F-NEXT: vpor %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
-; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512F-NEXT: vporq %zmm3, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v64i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; AVX512VL-NEXT: vpsrlw $4, %ymm3, %ymm4
-; AVX512VL-NEXT: vpsllw $4, %ymm3, %ymm5
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; AVX512VL-NEXT: vpternlogq $226, %ymm4, %ymm6, %ymm5
-; AVX512VL-NEXT: vpsllw $5, %ymm2, %ymm2
-; AVX512VL-NEXT: vpblendvb %ymm2, %ymm5, %ymm3, %ymm3
-; AVX512VL-NEXT: vpsrlw $6, %ymm3, %ymm4
-; AVX512VL-NEXT: vpsllw $2, %ymm3, %ymm5
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
-; AVX512VL-NEXT: vpternlogq $226, %ymm4, %ymm7, %ymm5
-; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2
-; AVX512VL-NEXT: vpblendvb %ymm2, %ymm5, %ymm3, %ymm3
-; AVX512VL-NEXT: vpsrlw $7, %ymm3, %ymm4
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512VL-NEXT: vpsrlw $4, %ymm2, %ymm3
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT: vpand %ymm4, %ymm3, %ymm3
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm5
+; AVX512VL-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX512VL-NEXT: vpsubb %ymm5, %ymm6, %ymm5
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VL-NEXT: vpand %ymm7, %ymm5, %ymm5
+; AVX512VL-NEXT: vpsllw $5, %ymm5, %ymm5
+; AVX512VL-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm3
+; AVX512VL-NEXT: vpsrlw $2, %ymm3, %ymm8
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm9 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX512VL-NEXT: vpand %ymm9, %ymm8, %ymm8
+; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5
+; AVX512VL-NEXT: vpblendvb %ymm5, %ymm8, %ymm3, %ymm3
+; AVX512VL-NEXT: vpsrlw $1, %ymm3, %ymm8
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm10 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512VL-NEXT: vpand %ymm10, %ymm8, %ymm8
+; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5
+; AVX512VL-NEXT: vpblendvb %ymm5, %ymm8, %ymm3, %ymm3
+; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm5
+; AVX512VL-NEXT: vpand %ymm4, %ymm5, %ymm4
+; AVX512VL-NEXT: vpsubb %ymm1, %ymm6, %ymm5
+; AVX512VL-NEXT: vpand %ymm7, %ymm5, %ymm5
+; AVX512VL-NEXT: vpsllw $5, %ymm5, %ymm5
+; AVX512VL-NEXT: vpblendvb %ymm5, %ymm4, %ymm0, %ymm4
+; AVX512VL-NEXT: vpsrlw $2, %ymm4, %ymm6
+; AVX512VL-NEXT: vpand %ymm6, %ymm9, %ymm6
+; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5
+; AVX512VL-NEXT: vpblendvb %ymm5, %ymm6, %ymm4, %ymm4
+; AVX512VL-NEXT: vpsrlw $1, %ymm4, %ymm6
+; AVX512VL-NEXT: vpand %ymm6, %ymm10, %ymm6
+; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5
+; AVX512VL-NEXT: vpblendvb %ymm5, %ymm6, %ymm4, %ymm4
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3
+; AVX512VL-NEXT: vpsllw $4, %ymm2, %ymm4
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4
-; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm8
-; AVX512VL-NEXT: vpor %ymm4, %ymm8, %ymm4
-; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2
-; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
-; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3
+; AVX512VL-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm6
+; AVX512VL-NEXT: vpsllw $5, %ymm6, %ymm6
+; AVX512VL-NEXT: vpblendvb %ymm6, %ymm4, %ymm2, %ymm2
+; AVX512VL-NEXT: vpsllw $2, %ymm2, %ymm4
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX512VL-NEXT: vpand %ymm7, %ymm4, %ymm4
+; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm6
+; AVX512VL-NEXT: vpblendvb %ymm6, %ymm4, %ymm2, %ymm2
+; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm4
+; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm6
+; AVX512VL-NEXT: vpblendvb %ymm6, %ymm4, %ymm2, %ymm2
; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm4
-; AVX512VL-NEXT: vpternlogq $226, %ymm3, %ymm6, %ymm4
+; AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4
; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1
; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpsrlw $6, %ymm0, %ymm3
; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm4
-; AVX512VL-NEXT: vpternlogq $226, %ymm3, %ymm7, %ymm4
+; AVX512VL-NEXT: vpand %ymm7, %ymm4, %ymm4
; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm3
-; AVX512VL-NEXT: vpand %ymm5, %ymm3, %ymm3
; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm4
-; AVX512VL-NEXT: vpor %ymm3, %ymm4, %ymm3
; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
-; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512VL-NEXT: vporq %zmm3, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v64i8:
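For variable byte rotates there is no byte-granular variable shift at all, so
each right-shift step above is bit-serial: vpsllw $5 moves the next bit of the
amount into every byte's sign bit, and vpblendvb conditionally applies >> 4,
then (after vpaddb doubles the amounts) >> 2 and >> 1, each result masked to
clear the bits shifted in from the neighboring byte. The same chain falls out
of a plain variable byte shift::

  define <32 x i8> @lshr_var_bytes(<32 x i8> %v, <32 x i8> %a) {
    ; lowered as vpsllw $5 on the amounts plus three vpblendvb-selected
    ; steps of >> 4, >> 2, and >> 1
    %r = lshr <32 x i8> %v, %a
    ret <32 x i8> %r
  }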
define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind {
; AVX512F-LABEL: splatvar_funnnel_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512F-NEXT: vpbroadcastw %xmm1, %xmm1
+; AVX512F-NEXT: vpbroadcastw %xmm1, %ymm2
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX512F-NEXT: vpsllw %xmm3, %ymm2, %ymm4
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
-; AVX512F-NEXT: vpsubw %xmm1, %xmm5, %xmm1
; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX512F-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
-; AVX512F-NEXT: vpor %ymm2, %ymm4, %ymm2
-; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm3
-; AVX512F-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm0, %ymm3, %ymm0
+; AVX512F-NEXT: vpsllw %xmm1, %ymm3, %ymm4
+; AVX512F-NEXT: vpsllw %xmm1, %ymm0, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1
+; AVX512F-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX512F-NEXT: vpsubw %ymm2, %ymm4, %ymm5
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512F-NEXT: vpsrlvd %zmm5, %zmm0, %zmm0
+; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm2
+; AVX512F-NEXT: vpsubw %ymm2, %ymm4, %ymm2
+; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
+; AVX512F-NEXT: vpsrlvd %zmm2, %zmm3, %zmm2
+; AVX512F-NEXT: vpmovdw %zmm2, %ymm2
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512F-NEXT: vporq %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_funnnel_v32i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512VL-NEXT: vpbroadcastw %xmm1, %xmm1
+; AVX512VL-NEXT: vpbroadcastw %xmm1, %ymm2
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX512VL-NEXT: vpsllw %xmm3, %ymm2, %ymm4
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: vpsubw %xmm1, %xmm5, %xmm1
; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX512VL-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
-; AVX512VL-NEXT: vpor %ymm2, %ymm4, %ymm2
-; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm3
-; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm0, %ymm3, %ymm0
+; AVX512VL-NEXT: vpsllw %xmm1, %ymm3, %ymm4
+; AVX512VL-NEXT: vpsllw %xmm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1
+; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX512VL-NEXT: vpsubw %ymm2, %ymm4, %ymm5
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT: vpand %ymm6, %ymm5, %ymm5
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512VL-NEXT: vpsrlvd %zmm5, %zmm0, %zmm0
+; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm2, %ymm2
+; AVX512VL-NEXT: vpsubw %ymm2, %ymm4, %ymm2
+; AVX512VL-NEXT: vpand %ymm6, %ymm2, %ymm2
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
+; AVX512VL-NEXT: vpsrlvd %zmm2, %zmm3, %zmm2
+; AVX512VL-NEXT: vpmovdw %zmm2, %ymm2
; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512VL-NEXT: vporq %zmm0, %zmm1, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_funnnel_v32i16:
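When the rotate amount is a splat, the left-shift half can use the
uniform-count vpsllw with the amount held in an XMM register, but the negated
right-shift amounts are materialized as a full vector and still take the
widen/vpsrlvd path before the single vporq merge. A hypothetical source shape
for such a test (the actual test presumably splats lane 0 of a vector
operand)::

  declare <32 x i16> @llvm.fshl.v32i16(<32 x i16>, <32 x i16>, <32 x i16>)

  define <32 x i16> @rotl_splat(<32 x i16> %x, i16 %a) {
    %ins = insertelement <32 x i16> undef, i16 %a, i32 0
    %amt = shufflevector <32 x i16> %ins, <32 x i16> undef, <32 x i32> zeroinitializer
    %r = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %x, <32 x i16> %x, <32 x i16> %amt)
    ret <32 x i16> %r
  }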
define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind {
; AVX512F-LABEL: splatvar_funnnel_v64i8:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpbroadcastb %xmm1, %ymm2
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm3
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512F-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512F-NEXT: vpsrlw $4, %ymm2, %ymm4
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4
+; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm6
+; AVX512F-NEXT: vpxor %xmm7, %xmm7, %xmm7
+; AVX512F-NEXT: vpsubb %ymm6, %ymm7, %ymm6
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512F-NEXT: vpand %ymm6, %ymm8, %ymm6
+; AVX512F-NEXT: vpsllw $5, %ymm6, %ymm6
+; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm2, %ymm4
+; AVX512F-NEXT: vpsrlw $2, %ymm4, %ymm9
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX512F-NEXT: vpand %ymm10, %ymm9, %ymm9
+; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm6
+; AVX512F-NEXT: vpblendvb %ymm6, %ymm9, %ymm4, %ymm4
+; AVX512F-NEXT: vpsrlw $1, %ymm4, %ymm9
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm11 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT: vpand %ymm11, %ymm9, %ymm9
+; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm6
+; AVX512F-NEXT: vpblendvb %ymm6, %ymm9, %ymm4, %ymm4
+; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm6
+; AVX512F-NEXT: vpand %ymm5, %ymm6, %ymm5
+; AVX512F-NEXT: vpsubb %ymm3, %ymm7, %ymm3
+; AVX512F-NEXT: vpand %ymm3, %ymm8, %ymm3
+; AVX512F-NEXT: vpsllw $5, %ymm3, %ymm3
+; AVX512F-NEXT: vpblendvb %ymm3, %ymm5, %ymm0, %ymm5
+; AVX512F-NEXT: vpsrlw $2, %ymm5, %ymm6
+; AVX512F-NEXT: vpand %ymm6, %ymm10, %ymm6
+; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3
+; AVX512F-NEXT: vpblendvb %ymm3, %ymm6, %ymm5, %ymm5
+; AVX512F-NEXT: vpsrlw $1, %ymm5, %ymm6
+; AVX512F-NEXT: vpand %ymm6, %ymm11, %ymm6
+; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3
+; AVX512F-NEXT: vpblendvb %ymm3, %ymm6, %ymm5, %ymm3
+; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3
; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT: vpsllw %xmm3, %ymm2, %ymm4
-; AVX512F-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX512F-NEXT: vpsllw %xmm3, %xmm5, %xmm6
-; AVX512F-NEXT: vpbroadcastb %xmm6, %ymm6
-; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512F-NEXT: vpsubb %xmm1, %xmm7, %xmm1
; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
-; AVX512F-NEXT: vpsrlw %xmm1, %xmm5, %xmm5
-; AVX512F-NEXT: vpsrlw $8, %xmm5, %xmm5
-; AVX512F-NEXT: vpbroadcastb %xmm5, %ymm5
-; AVX512F-NEXT: vpand %ymm5, %ymm2, %ymm2
-; AVX512F-NEXT: vpor %ymm2, %ymm4, %ymm2
-; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm3
-; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3
-; AVX512F-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpand %ymm5, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm0, %ymm3, %ymm0
+; AVX512F-NEXT: vpsllw %xmm1, %ymm2, %ymm2
+; AVX512F-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX512F-NEXT: vpsllw %xmm1, %xmm4, %xmm4
+; AVX512F-NEXT: vpbroadcastb %xmm4, %ymm4
+; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm2
+; AVX512F-NEXT: vpsllw %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpand %ymm4, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512F-NEXT: vporq %zmm3, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_funnnel_v64i8:
; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpbroadcastb %xmm1, %ymm2
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm3
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512VL-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512VL-NEXT: vpsrlw $4, %ymm2, %ymm4
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm3, %ymm6
+; AVX512VL-NEXT: vpxor %xmm7, %xmm7, %xmm7
+; AVX512VL-NEXT: vpsubb %ymm6, %ymm7, %ymm6
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VL-NEXT: vpand %ymm6, %ymm8, %ymm6
+; AVX512VL-NEXT: vpsllw $5, %ymm6, %ymm6
+; AVX512VL-NEXT: vpblendvb %ymm6, %ymm4, %ymm2, %ymm4
+; AVX512VL-NEXT: vpsrlw $2, %ymm4, %ymm9
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX512VL-NEXT: vpand %ymm10, %ymm9, %ymm9
+; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm6
+; AVX512VL-NEXT: vpblendvb %ymm6, %ymm9, %ymm4, %ymm4
+; AVX512VL-NEXT: vpsrlw $1, %ymm4, %ymm9
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm11 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512VL-NEXT: vpand %ymm11, %ymm9, %ymm9
+; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm6
+; AVX512VL-NEXT: vpblendvb %ymm6, %ymm9, %ymm4, %ymm4
+; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm6
+; AVX512VL-NEXT: vpand %ymm5, %ymm6, %ymm5
+; AVX512VL-NEXT: vpsubb %ymm3, %ymm7, %ymm3
+; AVX512VL-NEXT: vpand %ymm3, %ymm8, %ymm3
+; AVX512VL-NEXT: vpsllw $5, %ymm3, %ymm3
+; AVX512VL-NEXT: vpblendvb %ymm3, %ymm5, %ymm0, %ymm5
+; AVX512VL-NEXT: vpsrlw $2, %ymm5, %ymm6
+; AVX512VL-NEXT: vpand %ymm6, %ymm10, %ymm6
+; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3
+; AVX512VL-NEXT: vpblendvb %ymm3, %ymm6, %ymm5, %ymm5
+; AVX512VL-NEXT: vpsrlw $1, %ymm5, %ymm6
+; AVX512VL-NEXT: vpand %ymm6, %ymm11, %ymm6
+; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3
+; AVX512VL-NEXT: vpblendvb %ymm3, %ymm6, %ymm5, %ymm3
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3
; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT: vpsllw %xmm3, %ymm2, %ymm4
-; AVX512VL-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX512VL-NEXT: vpsllw %xmm3, %xmm5, %xmm6
-; AVX512VL-NEXT: vpbroadcastb %xmm6, %ymm6
-; AVX512VL-NEXT: vpand %ymm6, %ymm4, %ymm4
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VL-NEXT: vpsubb %xmm1, %xmm7, %xmm1
; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
-; AVX512VL-NEXT: vpsrlw %xmm1, %xmm5, %xmm5
-; AVX512VL-NEXT: vpsrlw $8, %xmm5, %xmm5
-; AVX512VL-NEXT: vpbroadcastb %xmm5, %ymm5
-; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2
-; AVX512VL-NEXT: vpor %ymm2, %ymm4, %ymm2
-; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm3
-; AVX512VL-NEXT: vpand %ymm6, %ymm3, %ymm3
-; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT: vpand %ymm5, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm0, %ymm3, %ymm0
+; AVX512VL-NEXT: vpsllw %xmm1, %ymm2, %ymm2
+; AVX512VL-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX512VL-NEXT: vpsllw %xmm1, %xmm4, %xmm4
+; AVX512VL-NEXT: vpbroadcastb %xmm4, %ymm4
+; AVX512VL-NEXT: vpand %ymm4, %ymm2, %ymm2
+; AVX512VL-NEXT: vpsllw %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vpand %ymm4, %ymm0, %ymm0
; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512VL-NEXT: vporq %zmm3, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_funnnel_v64i8:
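The splatted v64i8 case shows the uniform byte-shift trick: vpsllw shifts
whole words by the scalar count, and the mask that clears the bits crossing
byte boundaries is built by applying the same shift to an all-ones register
(vpcmpeqd; vpsllw; vpbroadcastb) rather than loading a constant, since the
count is only known at run time. In isolation::

  define <32 x i8> @shl_splat_bytes(<32 x i8> %v, i8 %a) {
    %ins = insertelement <32 x i8> undef, i8 %a, i32 0
    %amt = shufflevector <32 x i8> %ins, <32 x i8> undef, <32 x i32> zeroinitializer
    ; vpsllw with the scalar count, masked by (all-ones << count) per byte
    %r = shl <32 x i8> %v, %amt
    ret <32 x i8> %r
  }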
; AVX512F-NEXT: vpmulhuw %ymm2, %ymm1, %ymm3
; AVX512F-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0],xmm3[1,2,3,4,5,6,7]
; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
-; AVX512F-NEXT: vpmullw %ymm4, %ymm1, %ymm1
-; AVX512F-NEXT: vpor %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpmulhuw %ymm2, %ymm0, %ymm2
-; AVX512F-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0],xmm2[1,2,3,4,5,6,7]
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX512F-NEXT: vpmullw %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1,2,3,4,5,6,7]
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
+; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
+; AVX512F-NEXT: vpmullw %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpmullw %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vporq %zmm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: constant_funnnel_v32i16:
; AVX512VL-NEXT: vpmulhuw %ymm2, %ymm1, %ymm3
; AVX512VL-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0],xmm3[1,2,3,4,5,6,7]
; AVX512VL-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
-; AVX512VL-NEXT: vpmullw %ymm4, %ymm1, %ymm1
-; AVX512VL-NEXT: vpor %ymm3, %ymm1, %ymm1
; AVX512VL-NEXT: vpmulhuw %ymm2, %ymm0, %ymm2
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0],xmm2[1,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX512VL-NEXT: vpmullw %ymm4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1,2,3,4,5,6,7]
+; AVX512VL-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
+; AVX512VL-NEXT: vpmullw %ymm3, %ymm1, %ymm1
+; AVX512VL-NEXT: vpmullw %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vporq %zmm2, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: constant_funnnel_v32i16:
; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm5
; AVX512F-NEXT: vpaddb %ymm7, %ymm7, %ymm8
; AVX512F-NEXT: vpblendvb %ymm8, %ymm5, %ymm2, %ymm2
-; AVX512F-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm9 = ymm1[8],ymm5[8],ymm1[9],ymm5[9],ymm1[10],ymm5[10],ymm1[11],ymm5[11],ymm1[12],ymm5[12],ymm1[13],ymm5[13],ymm1[14],ymm5[14],ymm1[15],ymm5[15],ymm1[24],ymm5[24],ymm1[25],ymm5[25],ymm1[26],ymm5[26],ymm1[27],ymm5[27],ymm1[28],ymm5[28],ymm1[29],ymm5[29],ymm1[30],ymm5[30],ymm1[31],ymm5[31]
-; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
-; AVX512F-NEXT: # ymm10 = mem[0,1,0,1]
-; AVX512F-NEXT: vpmullw %ymm10, %ymm9, %ymm9
-; AVX512F-NEXT: vpsrlw $8, %ymm9, %ymm9
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[4],ymm5[4],ymm1[5],ymm5[5],ymm1[6],ymm5[6],ymm1[7],ymm5[7],ymm1[16],ymm5[16],ymm1[17],ymm5[17],ymm1[18],ymm5[18],ymm1[19],ymm5[19],ymm1[20],ymm5[20],ymm1[21],ymm5[21],ymm1[22],ymm5[22],ymm1[23],ymm5[23]
-; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128]
-; AVX512F-NEXT: # ymm11 = mem[0,1,0,1]
-; AVX512F-NEXT: vpmullw %ymm1, %ymm11, %ymm1
+; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm5
+; AVX512F-NEXT: vpand %ymm3, %ymm5, %ymm3
+; AVX512F-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm3
+; AVX512F-NEXT: vpsllw $2, %ymm3, %ymm4
+; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4
+; AVX512F-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm4
+; AVX512F-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
+; AVX512F-NEXT: # ymm5 = mem[0,1,0,1]
+; AVX512F-NEXT: vpmullw %ymm5, %ymm4, %ymm4
+; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128]
+; AVX512F-NEXT: # ymm6 = mem[0,1,0,1]
+; AVX512F-NEXT: vpmullw %ymm6, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX512F-NEXT: vpackuswb %ymm9, %ymm1, %ymm1
-; AVX512F-NEXT: vpor %ymm1, %ymm2, %ymm1
-; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm2
-; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm2
-; AVX512F-NEXT: vpsllw $2, %ymm2, %ymm3
-; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3
-; AVX512F-NEXT: vpblendvb %ymm7, %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm3
-; AVX512F-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm5[8],ymm0[9],ymm5[9],ymm0[10],ymm5[10],ymm0[11],ymm5[11],ymm0[12],ymm5[12],ymm0[13],ymm5[13],ymm0[14],ymm5[14],ymm0[15],ymm5[15],ymm0[24],ymm5[24],ymm0[25],ymm5[25],ymm0[26],ymm5[26],ymm0[27],ymm5[27],ymm0[28],ymm5[28],ymm0[29],ymm5[29],ymm0[30],ymm5[30],ymm0[31],ymm5[31]
-; AVX512F-NEXT: vpmullw %ymm3, %ymm10, %ymm3
-; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[2],ymm5[2],ymm0[3],ymm5[3],ymm0[4],ymm5[4],ymm0[5],ymm5[5],ymm0[6],ymm5[6],ymm0[7],ymm5[7],ymm0[16],ymm5[16],ymm0[17],ymm5[17],ymm0[18],ymm5[18],ymm0[19],ymm5[19],ymm0[20],ymm5[20],ymm0[21],ymm5[21],ymm0[22],ymm5[22],ymm0[23],ymm5[23]
-; AVX512F-NEXT: vpmullw %ymm0, %ymm11, %ymm0
+; AVX512F-NEXT: vpackuswb %ymm4, %ymm1, %ymm1
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8],ymm3[8],ymm0[9],ymm3[9],ymm0[10],ymm3[10],ymm0[11],ymm3[11],ymm0[12],ymm3[12],ymm0[13],ymm3[13],ymm0[14],ymm3[14],ymm0[15],ymm3[15],ymm0[24],ymm3[24],ymm0[25],ymm3[25],ymm0[26],ymm3[26],ymm0[27],ymm3[27],ymm0[28],ymm3[28],ymm0[29],ymm3[29],ymm0[30],ymm3[30],ymm0[31],ymm3[31]
+; AVX512F-NEXT: vpmullw %ymm5, %ymm4, %ymm4
+; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[4],ymm3[4],ymm0[5],ymm3[5],ymm0[6],ymm3[6],ymm0[7],ymm3[7],ymm0[16],ymm3[16],ymm0[17],ymm3[17],ymm0[18],ymm3[18],ymm0[19],ymm3[19],ymm0[20],ymm3[20],ymm0[21],ymm3[21],ymm0[22],ymm3[22],ymm0[23],ymm3[23]
+; AVX512F-NEXT: vpmullw %ymm6, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT: vpackuswb %ymm4, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vporq %zmm0, %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: constant_funnnel_v64i8:
; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm5
; AVX512VL-NEXT: vpaddb %ymm7, %ymm7, %ymm8
; AVX512VL-NEXT: vpblendvb %ymm8, %ymm5, %ymm2, %ymm2
-; AVX512VL-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm9 = ymm1[8],ymm5[8],ymm1[9],ymm5[9],ymm1[10],ymm5[10],ymm1[11],ymm5[11],ymm1[12],ymm5[12],ymm1[13],ymm5[13],ymm1[14],ymm5[14],ymm1[15],ymm5[15],ymm1[24],ymm5[24],ymm1[25],ymm5[25],ymm1[26],ymm5[26],ymm1[27],ymm5[27],ymm1[28],ymm5[28],ymm1[29],ymm5[29],ymm1[30],ymm5[30],ymm1[31],ymm5[31]
-; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
-; AVX512VL-NEXT: # ymm10 = mem[0,1,0,1]
-; AVX512VL-NEXT: vpmullw %ymm10, %ymm9, %ymm9
-; AVX512VL-NEXT: vpsrlw $8, %ymm9, %ymm9
-; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[4],ymm5[4],ymm1[5],ymm5[5],ymm1[6],ymm5[6],ymm1[7],ymm5[7],ymm1[16],ymm5[16],ymm1[17],ymm5[17],ymm1[18],ymm5[18],ymm1[19],ymm5[19],ymm1[20],ymm5[20],ymm1[21],ymm5[21],ymm1[22],ymm5[22],ymm1[23],ymm5[23]
-; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128]
-; AVX512VL-NEXT: # ymm11 = mem[0,1,0,1]
-; AVX512VL-NEXT: vpmullw %ymm1, %ymm11, %ymm1
+; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm5
+; AVX512VL-NEXT: vpand %ymm3, %ymm5, %ymm3
+; AVX512VL-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm3
+; AVX512VL-NEXT: vpsllw $2, %ymm3, %ymm4
+; AVX512VL-NEXT: vpand %ymm6, %ymm4, %ymm4
+; AVX512VL-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm3
+; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm4
+; AVX512VL-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm3
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31]
+; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
+; AVX512VL-NEXT: # ymm5 = mem[0,1,0,1]
+; AVX512VL-NEXT: vpmullw %ymm5, %ymm4, %ymm4
+; AVX512VL-NEXT: vpsrlw $8, %ymm4, %ymm4
+; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23]
+; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128]
+; AVX512VL-NEXT: # ymm6 = mem[0,1,0,1]
+; AVX512VL-NEXT: vpmullw %ymm6, %ymm1, %ymm1
; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX512VL-NEXT: vpackuswb %ymm9, %ymm1, %ymm1
-; AVX512VL-NEXT: vpor %ymm1, %ymm2, %ymm1
-; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2
-; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
-; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm2
-; AVX512VL-NEXT: vpsllw $2, %ymm2, %ymm3
-; AVX512VL-NEXT: vpand %ymm6, %ymm3, %ymm3
-; AVX512VL-NEXT: vpblendvb %ymm7, %ymm3, %ymm2, %ymm2
-; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm3
-; AVX512VL-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2
-; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm5[8],ymm0[9],ymm5[9],ymm0[10],ymm5[10],ymm0[11],ymm5[11],ymm0[12],ymm5[12],ymm0[13],ymm5[13],ymm0[14],ymm5[14],ymm0[15],ymm5[15],ymm0[24],ymm5[24],ymm0[25],ymm5[25],ymm0[26],ymm5[26],ymm0[27],ymm5[27],ymm0[28],ymm5[28],ymm0[29],ymm5[29],ymm0[30],ymm5[30],ymm0[31],ymm5[31]
-; AVX512VL-NEXT: vpmullw %ymm3, %ymm10, %ymm3
-; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[2],ymm5[2],ymm0[3],ymm5[3],ymm0[4],ymm5[4],ymm0[5],ymm5[5],ymm0[6],ymm5[6],ymm0[7],ymm5[7],ymm0[16],ymm5[16],ymm0[17],ymm5[17],ymm0[18],ymm5[18],ymm0[19],ymm5[19],ymm0[20],ymm5[20],ymm0[21],ymm5[21],ymm0[22],ymm5[22],ymm0[23],ymm5[23]
-; AVX512VL-NEXT: vpmullw %ymm0, %ymm11, %ymm0
+; AVX512VL-NEXT: vpackuswb %ymm4, %ymm1, %ymm1
+; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8],ymm3[8],ymm0[9],ymm3[9],ymm0[10],ymm3[10],ymm0[11],ymm3[11],ymm0[12],ymm3[12],ymm0[13],ymm3[13],ymm0[14],ymm3[14],ymm0[15],ymm3[15],ymm0[24],ymm3[24],ymm0[25],ymm3[25],ymm0[26],ymm3[26],ymm0[27],ymm3[27],ymm0[28],ymm3[28],ymm0[29],ymm3[29],ymm0[30],ymm3[30],ymm0[31],ymm3[31]
+; AVX512VL-NEXT: vpmullw %ymm5, %ymm4, %ymm4
+; AVX512VL-NEXT: vpsrlw $8, %ymm4, %ymm4
+; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[4],ymm3[4],ymm0[5],ymm3[5],ymm0[6],ymm3[6],ymm0[7],ymm3[7],ymm0[16],ymm3[16],ymm0[17],ymm3[17],ymm0[18],ymm3[18],ymm0[19],ymm3[19],ymm0[20],ymm3[20],ymm0[21],ymm3[21],ymm0[22],ymm3[22],ymm0[23],ymm3[23]
+; AVX512VL-NEXT: vpmullw %ymm6, %ymm0, %ymm0
; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512VL-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0
+; AVX512VL-NEXT: vpackuswb %ymm4, %ymm0, %ymm0
; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vporq %zmm0, %zmm2, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: constant_funnnel_v64i8:
define <32 x i16> @splatconstant_funnnel_v32i16(<32 x i16> %x) nounwind {
; AVX512F-LABEL: splatconstant_funnnel_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vpsrlw $9, %ymm1, %ymm2
-; AVX512F-NEXT: vpsllw $7, %ymm1, %ymm1
-; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlw $9, %ymm0, %ymm2
+; AVX512F-NEXT: vpsrlw $9, %ymm0, %ymm1
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512F-NEXT: vpsrlw $9, %ymm2, %ymm3
+; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpsllw $7, %ymm2, %ymm2
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_funnnel_v32i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VL-NEXT: vpsrlw $9, %ymm1, %ymm2
-; AVX512VL-NEXT: vpsllw $7, %ymm1, %ymm1
-; AVX512VL-NEXT: vpor %ymm2, %ymm1, %ymm1
-; AVX512VL-NEXT: vpsrlw $9, %ymm0, %ymm2
+; AVX512VL-NEXT: vpsrlw $9, %ymm0, %ymm1
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512VL-NEXT: vpsrlw $9, %ymm2, %ymm3
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
; AVX512VL-NEXT: vpsllw $7, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vpsllw $7, %ymm2, %ymm2
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_funnnel_v32i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4
+; AVX512F-NEXT: vpand %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm1
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm2
-; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm0
; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vporq %zmm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_funnnel_v64i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm2
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm4
+; AVX512VL-NEXT: vpand %ymm3, %ymm4, %ymm3
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm1
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; AVX512VL-NEXT: vpternlogq $226, %ymm2, %ymm3, %ymm1
-; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm2
+; AVX512VL-NEXT: vpand %ymm3, %ymm1, %ymm1
; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpternlogq $226, %ymm2, %ymm3, %ymm0
+; AVX512VL-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vporq %zmm2, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_funnnel_v64i8:
define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %amt) nounwind {
; AVX512F-LABEL: var_funnnel_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512F-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512F-NEXT: vpsrlvd %zmm4, %zmm3, %zmm3
+; AVX512F-NEXT: vpmovdw %zmm3, %ymm3
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm5
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm7 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm8 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
-; AVX512F-NEXT: vpsrlvd %zmm7, %zmm8, %zmm7
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512F-NEXT: vpsubw %ymm5, %ymm8, %ymm9
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm9 = ymm9[0],zero,ymm9[1],zero,ymm9[2],zero,ymm9[3],zero,ymm9[4],zero,ymm9[5],zero,ymm9[6],zero,ymm9[7],zero,ymm9[8],zero,ymm9[9],zero,ymm9[10],zero,ymm9[11],zero,ymm9[12],zero,ymm9[13],zero,ymm9[14],zero,ymm9[15],zero
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
-; AVX512F-NEXT: vpsllvd %zmm9, %zmm3, %zmm3
-; AVX512F-NEXT: vpord %zmm7, %zmm3, %zmm3
-; AVX512F-NEXT: vpmovdw %zmm3, %ymm3
-; AVX512F-NEXT: vpxor %xmm7, %xmm7, %xmm7
-; AVX512F-NEXT: vpcmpeqw %ymm7, %ymm5, %ymm5
-; AVX512F-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3
-; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
-; AVX512F-NEXT: vpsrlvd %zmm4, %zmm5, %zmm4
-; AVX512F-NEXT: vpsubw %ymm2, %ymm8, %ymm5
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm6 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero
+; AVX512F-NEXT: vpsrlvd %zmm6, %zmm4, %zmm4
+; AVX512F-NEXT: vpmovdw %zmm4, %ymm4
+; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512F-NEXT: vpsubw %ymm2, %ymm4, %ymm6
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm6 = ymm6[0],zero,ymm6[1],zero,ymm6[2],zero,ymm6[3],zero,ymm6[4],zero,ymm6[5],zero,ymm6[6],zero,ymm6[7],zero,ymm6[8],zero,ymm6[9],zero,ymm6[10],zero,ymm6[11],zero,ymm6[12],zero,ymm6[13],zero,ymm6[14],zero,ymm6[15],zero
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm7 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512F-NEXT: vpsllvd %zmm6, %zmm7, %zmm6
+; AVX512F-NEXT: vpmovdw %zmm6, %ymm6
+; AVX512F-NEXT: vpsubw %ymm5, %ymm4, %ymm4
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512F-NEXT: vpsllvd %zmm5, %zmm0, %zmm0
-; AVX512F-NEXT: vpord %zmm4, %zmm0, %zmm0
+; AVX512F-NEXT: vpsllvd %zmm4, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512F-NEXT: vpcmpeqw %ymm7, %ymm2, %ymm2
-; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm0
+; AVX512F-NEXT: vporq %zmm3, %zmm0, %zmm3
+; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT: vpcmpeqw %ymm0, %ymm5, %ymm4
+; AVX512F-NEXT: vpcmpeqw %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
+; AVX512F-NEXT: vpternlogq $202, %zmm3, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v32i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512VL-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512VL-NEXT: vpsrlvd %zmm4, %zmm3, %zmm3
+; AVX512VL-NEXT: vpmovdw %zmm3, %ymm3
; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
; AVX512VL-NEXT: vextracti64x4 $1, %zmm2, %ymm5
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VL-NEXT: vpand %ymm6, %ymm5, %ymm5
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm7 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm8 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
-; AVX512VL-NEXT: vpsrlvd %zmm7, %zmm8, %zmm7
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: vpsubw %ymm5, %ymm8, %ymm9
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm9 = ymm9[0],zero,ymm9[1],zero,ymm9[2],zero,ymm9[3],zero,ymm9[4],zero,ymm9[5],zero,ymm9[6],zero,ymm9[7],zero,ymm9[8],zero,ymm9[9],zero,ymm9[10],zero,ymm9[11],zero,ymm9[12],zero,ymm9[13],zero,ymm9[14],zero,ymm9[15],zero
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
-; AVX512VL-NEXT: vpsllvd %zmm9, %zmm3, %zmm3
-; AVX512VL-NEXT: vpord %zmm7, %zmm3, %zmm3
-; AVX512VL-NEXT: vpmovdw %zmm3, %ymm3
-; AVX512VL-NEXT: vpxor %xmm7, %xmm7, %xmm7
-; AVX512VL-NEXT: vpcmpeqw %ymm7, %ymm5, %ymm5
-; AVX512VL-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3
-; AVX512VL-NEXT: vpand %ymm6, %ymm2, %ymm2
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
-; AVX512VL-NEXT: vpsrlvd %zmm4, %zmm5, %zmm4
-; AVX512VL-NEXT: vpsubw %ymm2, %ymm8, %ymm5
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm6 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero
+; AVX512VL-NEXT: vpsrlvd %zmm6, %zmm4, %zmm4
+; AVX512VL-NEXT: vpmovdw %zmm4, %ymm4
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VL-NEXT: vpsubw %ymm2, %ymm4, %ymm6
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm6 = ymm6[0],zero,ymm6[1],zero,ymm6[2],zero,ymm6[3],zero,ymm6[4],zero,ymm6[5],zero,ymm6[6],zero,ymm6[7],zero,ymm6[8],zero,ymm6[9],zero,ymm6[10],zero,ymm6[11],zero,ymm6[12],zero,ymm6[13],zero,ymm6[14],zero,ymm6[15],zero
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm7 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512VL-NEXT: vpsllvd %zmm6, %zmm7, %zmm6
+; AVX512VL-NEXT: vpmovdw %zmm6, %ymm6
+; AVX512VL-NEXT: vpsubw %ymm5, %ymm4, %ymm4
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512VL-NEXT: vpsllvd %zmm5, %zmm0, %zmm0
-; AVX512VL-NEXT: vpord %zmm4, %zmm0, %zmm0
+; AVX512VL-NEXT: vpsllvd %zmm4, %zmm0, %zmm0
; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VL-NEXT: vpcmpeqw %ymm7, %ymm2, %ymm2
-; AVX512VL-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm0
+; AVX512VL-NEXT: vporq %zmm3, %zmm0, %zmm3
+; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX512VL-NEXT: vpcmpeqw %ymm0, %ymm5, %ymm4
+; AVX512VL-NEXT: vpcmpeqw %ymm0, %ymm2, %ymm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
+; AVX512VL-NEXT: vpternlogq $202, %zmm3, %zmm1, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v32i16:
define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt) nounwind {
; AVX512F-LABEL: var_funnnel_v64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm8
-; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm6
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
-; AVX512F-NEXT: vpsrlw $4, %ymm3, %ymm5
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512F-NEXT: vpand %ymm4, %ymm5, %ymm7
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512F-NEXT: vpand %ymm5, %ymm6, %ymm9
-; AVX512F-NEXT: vpsllw $5, %ymm9, %ymm10
-; AVX512F-NEXT: vpblendvb %ymm10, %ymm7, %ymm3, %ymm7
-; AVX512F-NEXT: vpsrlw $2, %ymm7, %ymm11
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
-; AVX512F-NEXT: vpand %ymm6, %ymm11, %ymm11
-; AVX512F-NEXT: vpaddb %ymm10, %ymm10, %ymm10
-; AVX512F-NEXT: vpblendvb %ymm10, %ymm11, %ymm7, %ymm11
-; AVX512F-NEXT: vpsrlw $1, %ymm11, %ymm12
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512F-NEXT: vpand %ymm7, %ymm12, %ymm12
-; AVX512F-NEXT: vpaddb %ymm10, %ymm10, %ymm10
-; AVX512F-NEXT: vpblendvb %ymm10, %ymm12, %ymm11, %ymm10
-; AVX512F-NEXT: vpsllw $4, %ymm8, %ymm11
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm12 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; AVX512F-NEXT: vpand %ymm12, %ymm11, %ymm11
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm13 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512F-NEXT: vpsubb %ymm9, %ymm13, %ymm14
-; AVX512F-NEXT: vpsllw $5, %ymm14, %ymm14
-; AVX512F-NEXT: vpblendvb %ymm14, %ymm11, %ymm8, %ymm8
-; AVX512F-NEXT: vpsllw $2, %ymm8, %ymm11
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm15 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
-; AVX512F-NEXT: vpand %ymm15, %ymm11, %ymm11
-; AVX512F-NEXT: vpaddb %ymm14, %ymm14, %ymm14
-; AVX512F-NEXT: vpblendvb %ymm14, %ymm11, %ymm8, %ymm8
-; AVX512F-NEXT: vpaddb %ymm8, %ymm8, %ymm11
-; AVX512F-NEXT: vpaddb %ymm14, %ymm14, %ymm14
-; AVX512F-NEXT: vpblendvb %ymm14, %ymm11, %ymm8, %ymm8
-; AVX512F-NEXT: vpor %ymm10, %ymm8, %ymm8
-; AVX512F-NEXT: vpxor %xmm10, %xmm10, %xmm10
-; AVX512F-NEXT: vpcmpeqb %ymm10, %ymm9, %ymm9
-; AVX512F-NEXT: vpblendvb %ymm9, %ymm3, %ymm8, %ymm3
-; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm8
-; AVX512F-NEXT: vpand %ymm4, %ymm8, %ymm4
-; AVX512F-NEXT: vpand %ymm5, %ymm2, %ymm2
-; AVX512F-NEXT: vpsllw $5, %ymm2, %ymm5
-; AVX512F-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm4
-; AVX512F-NEXT: vpsrlw $2, %ymm4, %ymm8
-; AVX512F-NEXT: vpand %ymm6, %ymm8, %ymm6
-; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm5
-; AVX512F-NEXT: vpblendvb %ymm5, %ymm6, %ymm4, %ymm4
-; AVX512F-NEXT: vpsrlw $1, %ymm4, %ymm6
-; AVX512F-NEXT: vpand %ymm7, %ymm6, %ymm6
-; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm5
-; AVX512F-NEXT: vpblendvb %ymm5, %ymm6, %ymm4, %ymm4
-; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm5
-; AVX512F-NEXT: vpand %ymm5, %ymm12, %ymm5
-; AVX512F-NEXT: vpsubb %ymm2, %ymm13, %ymm6
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm4
+; AVX512F-NEXT: vpsllw $4, %ymm4, %ymm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512F-NEXT: vpand %ymm5, %ymm3, %ymm6
+; AVX512F-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512F-NEXT: vpsubb %ymm3, %ymm7, %ymm8
+; AVX512F-NEXT: vpsllw $5, %ymm8, %ymm8
+; AVX512F-NEXT: vpblendvb %ymm8, %ymm6, %ymm4, %ymm4
+; AVX512F-NEXT: vpsllw $2, %ymm4, %ymm6
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX512F-NEXT: vpand %ymm6, %ymm9, %ymm6
+; AVX512F-NEXT: vpaddb %ymm8, %ymm8, %ymm8
+; AVX512F-NEXT: vpblendvb %ymm8, %ymm6, %ymm4, %ymm4
+; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm6
+; AVX512F-NEXT: vpaddb %ymm8, %ymm8, %ymm8
+; AVX512F-NEXT: vpblendvb %ymm8, %ymm6, %ymm4, %ymm4
+; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm6
+; AVX512F-NEXT: vpand %ymm5, %ymm6, %ymm5
+; AVX512F-NEXT: vpsubb %ymm2, %ymm7, %ymm6
; AVX512F-NEXT: vpsllw $5, %ymm6, %ymm6
; AVX512F-NEXT: vpblendvb %ymm6, %ymm5, %ymm0, %ymm0
; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm5
-; AVX512F-NEXT: vpand %ymm5, %ymm15, %ymm5
+; AVX512F-NEXT: vpand %ymm5, %ymm9, %ymm5
; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm6
; AVX512F-NEXT: vpblendvb %ymm6, %ymm5, %ymm0, %ymm0
; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm5
; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm6
; AVX512F-NEXT: vpblendvb %ymm6, %ymm5, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vpcmpeqb %ymm2, %ymm10, %ymm2
-; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-NEXT: vpsrlw $4, %ymm4, %ymm5
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5
+; AVX512F-NEXT: vpsllw $5, %ymm3, %ymm7
+; AVX512F-NEXT: vpblendvb %ymm7, %ymm5, %ymm4, %ymm4
+; AVX512F-NEXT: vpsrlw $2, %ymm4, %ymm5
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX512F-NEXT: vpand %ymm5, %ymm8, %ymm5
+; AVX512F-NEXT: vpaddb %ymm7, %ymm7, %ymm7
+; AVX512F-NEXT: vpblendvb %ymm7, %ymm5, %ymm4, %ymm4
+; AVX512F-NEXT: vpsrlw $1, %ymm4, %ymm5
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT: vpand %ymm5, %ymm9, %ymm5
+; AVX512F-NEXT: vpaddb %ymm7, %ymm7, %ymm7
+; AVX512F-NEXT: vpblendvb %ymm7, %ymm5, %ymm4, %ymm4
+; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm5
+; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5
+; AVX512F-NEXT: vpsllw $5, %ymm2, %ymm6
+; AVX512F-NEXT: vpblendvb %ymm6, %ymm5, %ymm1, %ymm5
+; AVX512F-NEXT: vpsrlw $2, %ymm5, %ymm7
+; AVX512F-NEXT: vpand %ymm7, %ymm8, %ymm7
+; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm6
+; AVX512F-NEXT: vpblendvb %ymm6, %ymm7, %ymm5, %ymm5
+; AVX512F-NEXT: vpsrlw $1, %ymm5, %ymm7
+; AVX512F-NEXT: vpand %ymm7, %ymm9, %ymm7
+; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm6
+; AVX512F-NEXT: vpblendvb %ymm6, %ymm7, %ymm5, %ymm5
+; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
+; AVX512F-NEXT: vporq %zmm4, %zmm0, %zmm4
+; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT: vpcmpeqb %ymm0, %ymm3, %ymm3
+; AVX512F-NEXT: vpcmpeqb %ymm0, %ymm2, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512F-NEXT: vpternlogq $202, %zmm4, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v64i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm4
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm2, %ymm5
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
-; AVX512VL-NEXT: vpsrlw $4, %ymm3, %ymm6
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VL-NEXT: vpand %ymm7, %ymm6, %ymm6
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VL-NEXT: vpand %ymm5, %ymm8, %ymm5
-; AVX512VL-NEXT: vpsllw $5, %ymm5, %ymm9
-; AVX512VL-NEXT: vpblendvb %ymm9, %ymm6, %ymm3, %ymm6
-; AVX512VL-NEXT: vpsrlw $2, %ymm6, %ymm10
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm11 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
-; AVX512VL-NEXT: vpand %ymm11, %ymm10, %ymm10
-; AVX512VL-NEXT: vpaddb %ymm9, %ymm9, %ymm9
-; AVX512VL-NEXT: vpblendvb %ymm9, %ymm10, %ymm6, %ymm6
-; AVX512VL-NEXT: vpsrlw $1, %ymm6, %ymm10
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm12 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512VL-NEXT: vpand %ymm12, %ymm10, %ymm10
-; AVX512VL-NEXT: vpaddb %ymm9, %ymm9, %ymm9
-; AVX512VL-NEXT: vpblendvb %ymm9, %ymm10, %ymm6, %ymm6
-; AVX512VL-NEXT: vpsllw $4, %ymm4, %ymm9
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm10 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; AVX512VL-NEXT: vpand %ymm10, %ymm9, %ymm9
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm13 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VL-NEXT: vpsubb %ymm5, %ymm13, %ymm14
-; AVX512VL-NEXT: vpsllw $5, %ymm14, %ymm14
-; AVX512VL-NEXT: vpblendvb %ymm14, %ymm9, %ymm4, %ymm4
-; AVX512VL-NEXT: vpsllw $2, %ymm4, %ymm9
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm15 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
-; AVX512VL-NEXT: vpand %ymm15, %ymm9, %ymm9
-; AVX512VL-NEXT: vpaddb %ymm14, %ymm14, %ymm14
-; AVX512VL-NEXT: vpblendvb %ymm14, %ymm9, %ymm4, %ymm4
-; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm9
-; AVX512VL-NEXT: vpaddb %ymm14, %ymm14, %ymm14
-; AVX512VL-NEXT: vpblendvb %ymm14, %ymm9, %ymm4, %ymm4
-; AVX512VL-NEXT: vpor %ymm6, %ymm4, %ymm4
-; AVX512VL-NEXT: vpxor %xmm6, %xmm6, %xmm6
-; AVX512VL-NEXT: vpcmpeqb %ymm6, %ymm5, %ymm5
-; AVX512VL-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
-; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm4
-; AVX512VL-NEXT: vpand %ymm7, %ymm4, %ymm4
-; AVX512VL-NEXT: vpand %ymm2, %ymm8, %ymm2
-; AVX512VL-NEXT: vpsllw $5, %ymm2, %ymm5
-; AVX512VL-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm4
-; AVX512VL-NEXT: vpsrlw $2, %ymm4, %ymm7
-; AVX512VL-NEXT: vpand %ymm7, %ymm11, %ymm7
-; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5
-; AVX512VL-NEXT: vpblendvb %ymm5, %ymm7, %ymm4, %ymm4
-; AVX512VL-NEXT: vpsrlw $1, %ymm4, %ymm7
-; AVX512VL-NEXT: vpand %ymm7, %ymm12, %ymm7
-; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5
-; AVX512VL-NEXT: vpblendvb %ymm5, %ymm7, %ymm4, %ymm4
-; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm5
-; AVX512VL-NEXT: vpand %ymm5, %ymm10, %ymm5
-; AVX512VL-NEXT: vpsubb %ymm2, %ymm13, %ymm7
-; AVX512VL-NEXT: vpsllw $5, %ymm7, %ymm7
-; AVX512VL-NEXT: vpblendvb %ymm7, %ymm5, %ymm0, %ymm0
+; AVX512VL-NEXT: vpsllw $4, %ymm4, %ymm3
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512VL-NEXT: vpand %ymm5, %ymm3, %ymm6
+; AVX512VL-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm2, %ymm3
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VL-NEXT: vpsubb %ymm3, %ymm7, %ymm8
+; AVX512VL-NEXT: vpsllw $5, %ymm8, %ymm8
+; AVX512VL-NEXT: vpblendvb %ymm8, %ymm6, %ymm4, %ymm4
+; AVX512VL-NEXT: vpsllw $2, %ymm4, %ymm6
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX512VL-NEXT: vpand %ymm6, %ymm9, %ymm6
+; AVX512VL-NEXT: vpaddb %ymm8, %ymm8, %ymm8
+; AVX512VL-NEXT: vpblendvb %ymm8, %ymm6, %ymm4, %ymm4
+; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm6
+; AVX512VL-NEXT: vpaddb %ymm8, %ymm8, %ymm8
+; AVX512VL-NEXT: vpblendvb %ymm8, %ymm6, %ymm4, %ymm4
+; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm6
+; AVX512VL-NEXT: vpand %ymm5, %ymm6, %ymm5
+; AVX512VL-NEXT: vpsubb %ymm2, %ymm7, %ymm6
+; AVX512VL-NEXT: vpsllw $5, %ymm6, %ymm6
+; AVX512VL-NEXT: vpblendvb %ymm6, %ymm5, %ymm0, %ymm0
; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm5
-; AVX512VL-NEXT: vpand %ymm5, %ymm15, %ymm5
-; AVX512VL-NEXT: vpaddb %ymm7, %ymm7, %ymm7
-; AVX512VL-NEXT: vpblendvb %ymm7, %ymm5, %ymm0, %ymm0
+; AVX512VL-NEXT: vpand %ymm5, %ymm9, %ymm5
+; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm6
+; AVX512VL-NEXT: vpblendvb %ymm6, %ymm5, %ymm0, %ymm0
; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm5
+; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm6
+; AVX512VL-NEXT: vpblendvb %ymm6, %ymm5, %ymm0, %ymm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512VL-NEXT: vpsrlw $4, %ymm4, %ymm5
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT: vpand %ymm6, %ymm5, %ymm5
+; AVX512VL-NEXT: vpsllw $5, %ymm3, %ymm7
+; AVX512VL-NEXT: vpblendvb %ymm7, %ymm5, %ymm4, %ymm4
+; AVX512VL-NEXT: vpsrlw $2, %ymm4, %ymm5
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX512VL-NEXT: vpand %ymm5, %ymm8, %ymm5
; AVX512VL-NEXT: vpaddb %ymm7, %ymm7, %ymm7
-; AVX512VL-NEXT: vpblendvb %ymm7, %ymm5, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpcmpeqb %ymm6, %ymm2, %ymm2
-; AVX512VL-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vpblendvb %ymm7, %ymm5, %ymm4, %ymm4
+; AVX512VL-NEXT: vpsrlw $1, %ymm4, %ymm5
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm9 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512VL-NEXT: vpand %ymm5, %ymm9, %ymm5
+; AVX512VL-NEXT: vpaddb %ymm7, %ymm7, %ymm7
+; AVX512VL-NEXT: vpblendvb %ymm7, %ymm5, %ymm4, %ymm4
+; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm5
+; AVX512VL-NEXT: vpand %ymm6, %ymm5, %ymm5
+; AVX512VL-NEXT: vpsllw $5, %ymm2, %ymm6
+; AVX512VL-NEXT: vpblendvb %ymm6, %ymm5, %ymm1, %ymm5
+; AVX512VL-NEXT: vpsrlw $2, %ymm5, %ymm7
+; AVX512VL-NEXT: vpand %ymm7, %ymm8, %ymm7
+; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm6
+; AVX512VL-NEXT: vpblendvb %ymm6, %ymm7, %ymm5, %ymm5
+; AVX512VL-NEXT: vpsrlw $1, %ymm5, %ymm7
+; AVX512VL-NEXT: vpand %ymm7, %ymm9, %ymm7
+; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm6
+; AVX512VL-NEXT: vpblendvb %ymm6, %ymm7, %ymm5, %ymm5
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
+; AVX512VL-NEXT: vporq %zmm4, %zmm0, %zmm4
+; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX512VL-NEXT: vpcmpeqb %ymm0, %ymm3, %ymm3
+; AVX512VL-NEXT: vpcmpeqb %ymm0, %ymm2, %ymm0
; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512VL-NEXT: vpternlogq $202, %zmm4, %zmm1, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v64i8:
define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %amt) nounwind {
; AVX512F-LABEL: splatvar_funnnel_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm4
; AVX512F-NEXT: vpbroadcastw %xmm2, %ymm2
-; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX512F-NEXT: vpsrlw %xmm5, %ymm4, %ymm6
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = [16,16,16,16,16,16,16,16]
-; AVX512F-NEXT: vpsubw %xmm2, %xmm7, %xmm7
-; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero
-; AVX512F-NEXT: vpsllw %xmm7, %ymm3, %ymm3
-; AVX512F-NEXT: vpor %ymm6, %ymm3, %ymm3
-; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6
-; AVX512F-NEXT: vpcmpeqw %ymm6, %ymm2, %ymm2
-; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
-; AVX512F-NEXT: vpsrlw %xmm5, %ymm1, %ymm4
-; AVX512F-NEXT: vpsllw %xmm7, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2
+; AVX512F-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-NEXT: vpsrlw %xmm3, %ymm4, %ymm4
+; AVX512F-NEXT: vpsrlw %xmm3, %ymm1, %ymm3
+; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3
+; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm4
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
+; AVX512F-NEXT: vpsubw %xmm4, %xmm5, %xmm6
+; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm7
+; AVX512F-NEXT: vpsllw %xmm6, %ymm7, %ymm6
+; AVX512F-NEXT: vpsubw %xmm2, %xmm5, %xmm5
+; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero
+; AVX512F-NEXT: vpsllw %xmm5, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0
+; AVX512F-NEXT: vporq %zmm3, %zmm0, %zmm3
+; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT: vpcmpeqw %ymm0, %ymm4, %ymm4
+; AVX512F-NEXT: vpcmpeqw %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
+; AVX512F-NEXT: vpternlogq $202, %zmm3, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_funnnel_v32i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm4
; AVX512VL-NEXT: vpbroadcastw %xmm2, %ymm2
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX512VL-NEXT: vpsrlw %xmm5, %ymm4, %ymm6
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm7 = [16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: vpsubw %xmm2, %xmm7, %xmm7
-; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero
-; AVX512VL-NEXT: vpsllw %xmm7, %ymm3, %ymm3
-; AVX512VL-NEXT: vpor %ymm6, %ymm3, %ymm3
-; AVX512VL-NEXT: vpxor %xmm6, %xmm6, %xmm6
-; AVX512VL-NEXT: vpcmpeqw %ymm6, %ymm2, %ymm2
-; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
-; AVX512VL-NEXT: vpsrlw %xmm5, %ymm1, %ymm4
-; AVX512VL-NEXT: vpsllw %xmm7, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2
+; AVX512VL-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512VL-NEXT: vpsrlw %xmm3, %ymm4, %ymm4
+; AVX512VL-NEXT: vpsrlw %xmm3, %ymm1, %ymm3
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm2, %ymm4
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
+; AVX512VL-NEXT: vpsubw %xmm4, %xmm5, %xmm6
+; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm7
+; AVX512VL-NEXT: vpsllw %xmm6, %ymm7, %ymm6
+; AVX512VL-NEXT: vpsubw %xmm2, %xmm5, %xmm5
+; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero
+; AVX512VL-NEXT: vpsllw %xmm5, %ymm0, %ymm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0
+; AVX512VL-NEXT: vporq %zmm3, %zmm0, %zmm3
+; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX512VL-NEXT: vpcmpeqw %ymm0, %ymm4, %ymm4
+; AVX512VL-NEXT: vpcmpeqw %ymm0, %ymm2, %ymm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
+; AVX512VL-NEXT: vpternlogq $202, %zmm3, %zmm1, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_funnnel_v32i16:
define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt) nounwind {
; AVX512F-LABEL: splatvar_funnnel_v64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm9
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm4
; AVX512F-NEXT: vpbroadcastb %xmm2, %ymm2
-; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT: vpsrlw %xmm5, %ymm4, %ymm6
-; AVX512F-NEXT: vpcmpeqd %xmm8, %xmm8, %xmm8
-; AVX512F-NEXT: vpsrlw %xmm5, %xmm8, %xmm7
-; AVX512F-NEXT: vpsrlw $8, %xmm7, %xmm7
-; AVX512F-NEXT: vpbroadcastb %xmm7, %ymm7
-; AVX512F-NEXT: vpand %ymm7, %ymm6, %ymm10
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512F-NEXT: vpsubb %xmm2, %xmm3, %xmm3
-; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT: vpsllw %xmm3, %ymm9, %ymm9
-; AVX512F-NEXT: vpsllw %xmm3, %xmm8, %xmm6
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2
+; AVX512F-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-NEXT: vpsrlw %xmm3, %ymm4, %ymm4
+; AVX512F-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
+; AVX512F-NEXT: vpsrlw %xmm3, %xmm5, %xmm6
+; AVX512F-NEXT: vpsrlw $8, %xmm6, %xmm6
; AVX512F-NEXT: vpbroadcastb %xmm6, %ymm6
-; AVX512F-NEXT: vpand %ymm6, %ymm9, %ymm8
-; AVX512F-NEXT: vpor %ymm10, %ymm8, %ymm8
-; AVX512F-NEXT: vpxor %xmm9, %xmm9, %xmm9
-; AVX512F-NEXT: vpcmpeqb %ymm2, %ymm9, %ymm2
-; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm8, %ymm4
-; AVX512F-NEXT: vpsrlw %xmm5, %ymm1, %ymm5
-; AVX512F-NEXT: vpand %ymm7, %ymm5, %ymm5
-; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpand %ymm6, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm5, %ymm0, %ymm0
-; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4
+; AVX512F-NEXT: vpsrlw %xmm3, %ymm1, %ymm3
+; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3
+; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3
+; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm4
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512F-NEXT: vpsubb %xmm4, %xmm6, %xmm7
+; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,zero,zero,zero,zero,xmm7[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm8
+; AVX512F-NEXT: vpsllw %xmm7, %ymm8, %ymm8
+; AVX512F-NEXT: vpsllw %xmm7, %xmm5, %xmm7
+; AVX512F-NEXT: vpbroadcastb %xmm7, %ymm7
+; AVX512F-NEXT: vpand %ymm7, %ymm8, %ymm7
+; AVX512F-NEXT: vpsubb %xmm2, %xmm6, %xmm6
+; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,zero,zero,zero,zero,xmm6[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT: vpsllw %xmm6, %ymm0, %ymm0
+; AVX512F-NEXT: vpsllw %xmm6, %xmm5, %xmm5
+; AVX512F-NEXT: vpbroadcastb %xmm5, %ymm5
+; AVX512F-NEXT: vpand %ymm5, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm0
+; AVX512F-NEXT: vporq %zmm3, %zmm0, %zmm3
+; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT: vpcmpeqb %ymm0, %ymm4, %ymm4
+; AVX512F-NEXT: vpcmpeqb %ymm0, %ymm2, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
+; AVX512F-NEXT: vpternlogq $202, %zmm3, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_funnnel_v64i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm9
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm4
; AVX512VL-NEXT: vpbroadcastb %xmm2, %ymm2
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT: vpsrlw %xmm5, %ymm4, %ymm6
-; AVX512VL-NEXT: vpcmpeqd %xmm8, %xmm8, %xmm8
-; AVX512VL-NEXT: vpsrlw %xmm5, %xmm8, %xmm7
-; AVX512VL-NEXT: vpsrlw $8, %xmm7, %xmm7
-; AVX512VL-NEXT: vpbroadcastb %xmm7, %ymm7
-; AVX512VL-NEXT: vpand %ymm7, %ymm6, %ymm10
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VL-NEXT: vpsubb %xmm2, %xmm3, %xmm3
-; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT: vpsllw %xmm3, %ymm9, %ymm9
-; AVX512VL-NEXT: vpsllw %xmm3, %xmm8, %xmm6
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2
+; AVX512VL-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512VL-NEXT: vpsrlw %xmm3, %ymm4, %ymm4
+; AVX512VL-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
+; AVX512VL-NEXT: vpsrlw %xmm3, %xmm5, %xmm6
+; AVX512VL-NEXT: vpsrlw $8, %xmm6, %xmm6
; AVX512VL-NEXT: vpbroadcastb %xmm6, %ymm6
-; AVX512VL-NEXT: vpand %ymm6, %ymm9, %ymm8
-; AVX512VL-NEXT: vpor %ymm10, %ymm8, %ymm8
-; AVX512VL-NEXT: vpxor %xmm9, %xmm9, %xmm9
-; AVX512VL-NEXT: vpcmpeqb %ymm2, %ymm9, %ymm2
-; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm8, %ymm4
-; AVX512VL-NEXT: vpsrlw %xmm5, %ymm1, %ymm5
-; AVX512VL-NEXT: vpand %ymm7, %ymm5, %ymm5
-; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm0
-; AVX512VL-NEXT: vpand %ymm6, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm5, %ymm0, %ymm0
-; AVX512VL-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vpand %ymm6, %ymm4, %ymm4
+; AVX512VL-NEXT: vpsrlw %xmm3, %ymm1, %ymm3
+; AVX512VL-NEXT: vpand %ymm6, %ymm3, %ymm3
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm2, %ymm4
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VL-NEXT: vpsubb %xmm4, %xmm6, %xmm7
+; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,zero,zero,zero,zero,xmm7[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm8
+; AVX512VL-NEXT: vpsllw %xmm7, %ymm8, %ymm8
+; AVX512VL-NEXT: vpsllw %xmm7, %xmm5, %xmm7
+; AVX512VL-NEXT: vpbroadcastb %xmm7, %ymm7
+; AVX512VL-NEXT: vpand %ymm7, %ymm8, %ymm7
+; AVX512VL-NEXT: vpsubb %xmm2, %xmm6, %xmm6
+; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,zero,zero,zero,zero,xmm6[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT: vpsllw %xmm6, %ymm0, %ymm0
+; AVX512VL-NEXT: vpsllw %xmm6, %xmm5, %xmm5
+; AVX512VL-NEXT: vpbroadcastb %xmm5, %ymm5
+; AVX512VL-NEXT: vpand %ymm5, %ymm0, %ymm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm0
+; AVX512VL-NEXT: vporq %zmm3, %zmm0, %zmm3
+; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX512VL-NEXT: vpcmpeqb %ymm0, %ymm4, %ymm4
+; AVX512VL-NEXT: vpcmpeqb %ymm0, %ymm2, %ymm0
; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
+; AVX512VL-NEXT: vpternlogq $202, %zmm3, %zmm1, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_funnnel_v64i8:
define <32 x i16> @constant_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y) nounwind {
; AVX512F-LABEL: constant_funnnel_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = <u,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2>
-; AVX512F-NEXT: vpmulhuw %ymm4, %ymm3, %ymm5
-; AVX512F-NEXT: vpmullw %ymm4, %ymm2, %ymm2
-; AVX512F-NEXT: vpor %ymm5, %ymm2, %ymm2
-; AVX512F-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1,2,3,4,5,6,7]
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX512F-NEXT: vpmulhuw %ymm4, %ymm1, %ymm3
-; AVX512F-NEXT: vpmullw %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = <u,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2>
+; AVX512F-NEXT: vpmulhuw %ymm3, %ymm2, %ymm4
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm5
+; AVX512F-NEXT: vpmullw %ymm3, %ymm5, %ymm5
+; AVX512F-NEXT: vpor %ymm4, %ymm5, %ymm4
+; AVX512F-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3,4,5,6,7]
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512F-NEXT: vpmulhuw %ymm3, %ymm1, %ymm4
+; AVX512F-NEXT: vpmullw %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vpor %ymm4, %ymm0, %ymm0
; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3,4,5,6,7]
; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
;
; AVX512VL-LABEL: constant_funnnel_v32i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = <u,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2>
-; AVX512VL-NEXT: vpmulhuw %ymm4, %ymm3, %ymm5
-; AVX512VL-NEXT: vpmullw %ymm4, %ymm2, %ymm2
-; AVX512VL-NEXT: vpor %ymm5, %ymm2, %ymm2
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX512VL-NEXT: vpmulhuw %ymm4, %ymm1, %ymm3
-; AVX512VL-NEXT: vpmullw %ymm4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = <u,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2>
+; AVX512VL-NEXT: vpmulhuw %ymm3, %ymm2, %ymm4
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm5
+; AVX512VL-NEXT: vpmullw %ymm3, %ymm5, %ymm5
+; AVX512VL-NEXT: vpor %ymm4, %ymm5, %ymm4
+; AVX512VL-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3,4,5,6,7]
+; AVX512VL-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512VL-NEXT: vpmulhuw %ymm3, %ymm1, %ymm4
+; AVX512VL-NEXT: vpmullw %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT: vpor %ymm4, %ymm0, %ymm0
; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3,4,5,6,7]
; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
; AVX512F-LABEL: constant_funnnel_v64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; AVX512F-NEXT: vpsllw $4, %ymm3, %ymm4
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4
-; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536]
-; AVX512F-NEXT: # ymm6 = mem[0,1,0,1]
-; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm3
-; AVX512F-NEXT: vpsllw $2, %ymm3, %ymm4
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
-; AVX512F-NEXT: vpand %ymm7, %ymm4, %ymm4
-; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm8
-; AVX512F-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm3
-; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm4
-; AVX512F-NEXT: vpaddb %ymm8, %ymm8, %ymm9
-; AVX512F-NEXT: vpblendvb %ymm9, %ymm4, %ymm3, %ymm3
-; AVX512F-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm10 = ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15],ymm2[24],ymm4[24],ymm2[25],ymm4[25],ymm2[26],ymm4[26],ymm2[27],ymm4[27],ymm2[28],ymm4[28],ymm2[29],ymm4[29],ymm2[30],ymm4[30],ymm2[31],ymm4[31]
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512F-NEXT: vpsllw $4, %ymm2, %ymm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536]
+; AVX512F-NEXT: # ymm5 = mem[0,1,0,1]
+; AVX512F-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpsllw $2, %ymm2, %ymm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3
+; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm7
+; AVX512F-NEXT: vpblendvb %ymm7, %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm3
+; AVX512F-NEXT: vpaddb %ymm7, %ymm7, %ymm8
+; AVX512F-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512F-NEXT: vpxor %xmm9, %xmm9, %xmm9
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm10 = ymm3[8],ymm9[8],ymm3[9],ymm9[9],ymm3[10],ymm9[10],ymm3[11],ymm9[11],ymm3[12],ymm9[12],ymm3[13],ymm9[13],ymm3[14],ymm9[14],ymm3[15],ymm9[15],ymm3[24],ymm9[24],ymm3[25],ymm9[25],ymm3[26],ymm9[26],ymm3[27],ymm9[27],ymm3[28],ymm9[28],ymm3[29],ymm9[29],ymm3[30],ymm9[30],ymm3[31],ymm9[31]
; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128]
; AVX512F-NEXT: # ymm11 = mem[0,1,0,1]
; AVX512F-NEXT: vpmullw %ymm11, %ymm10, %ymm10
; AVX512F-NEXT: vpsrlw $8, %ymm10, %ymm10
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm12 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[16],ymm4[16],ymm2[17],ymm4[17],ymm2[18],ymm4[18],ymm2[19],ymm4[19],ymm2[20],ymm4[20],ymm2[21],ymm4[21],ymm2[22],ymm4[22],ymm2[23],ymm4[23]
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm12 = ymm3[0],ymm9[0],ymm3[1],ymm9[1],ymm3[2],ymm9[2],ymm3[3],ymm9[3],ymm3[4],ymm9[4],ymm3[5],ymm9[5],ymm3[6],ymm9[6],ymm3[7],ymm9[7],ymm3[16],ymm9[16],ymm3[17],ymm9[17],ymm3[18],ymm9[18],ymm3[19],ymm9[19],ymm3[20],ymm9[20],ymm3[21],ymm9[21],ymm3[22],ymm9[22],ymm3[23],ymm9[23]
; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
; AVX512F-NEXT: # ymm13 = mem[0,1,0,1]
; AVX512F-NEXT: vpmullw %ymm13, %ymm12, %ymm12
; AVX512F-NEXT: vpsrlw $8, %ymm12, %ymm12
; AVX512F-NEXT: vpackuswb %ymm10, %ymm12, %ymm10
-; AVX512F-NEXT: vpor %ymm3, %ymm10, %ymm3
+; AVX512F-NEXT: vpor %ymm2, %ymm10, %ymm2
; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm10 = [18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360]
-; AVX512F-NEXT: vpblendvb %ymm10, %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpblendvb %ymm10, %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm3
-; AVX512F-NEXT: vpand %ymm5, %ymm3, %ymm3
-; AVX512F-NEXT: vpblendvb %ymm6, %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpblendvb %ymm5, %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm3
-; AVX512F-NEXT: vpand %ymm7, %ymm3, %ymm3
-; AVX512F-NEXT: vpblendvb %ymm8, %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3
+; AVX512F-NEXT: vpblendvb %ymm7, %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm3
-; AVX512F-NEXT: vpblendvb %ymm9, %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm4[8],ymm1[9],ymm4[9],ymm1[10],ymm4[10],ymm1[11],ymm4[11],ymm1[12],ymm4[12],ymm1[13],ymm4[13],ymm1[14],ymm4[14],ymm1[15],ymm4[15],ymm1[24],ymm4[24],ymm1[25],ymm4[25],ymm1[26],ymm4[26],ymm1[27],ymm4[27],ymm1[28],ymm4[28],ymm1[29],ymm4[29],ymm1[30],ymm4[30],ymm1[31],ymm4[31]
+; AVX512F-NEXT: vpblendvb %ymm8, %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm9[8],ymm1[9],ymm9[9],ymm1[10],ymm9[10],ymm1[11],ymm9[11],ymm1[12],ymm9[12],ymm1[13],ymm9[13],ymm1[14],ymm9[14],ymm1[15],ymm9[15],ymm1[24],ymm9[24],ymm1[25],ymm9[25],ymm1[26],ymm9[26],ymm1[27],ymm9[27],ymm1[28],ymm9[28],ymm1[29],ymm9[29],ymm1[30],ymm9[30],ymm1[31],ymm9[31]
; AVX512F-NEXT: vpmullw %ymm3, %ymm11, %ymm3
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[4],ymm4[4],ymm1[5],ymm4[5],ymm1[6],ymm4[6],ymm1[7],ymm4[7],ymm1[16],ymm4[16],ymm1[17],ymm4[17],ymm1[18],ymm4[18],ymm1[19],ymm4[19],ymm1[20],ymm4[20],ymm1[21],ymm4[21],ymm1[22],ymm4[22],ymm1[23],ymm4[23]
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm1[0],ymm9[0],ymm1[1],ymm9[1],ymm1[2],ymm9[2],ymm1[3],ymm9[3],ymm1[4],ymm9[4],ymm1[5],ymm9[5],ymm1[6],ymm9[6],ymm1[7],ymm9[7],ymm1[16],ymm9[16],ymm1[17],ymm9[17],ymm1[18],ymm9[18],ymm1[19],ymm9[19],ymm1[20],ymm9[20],ymm1[21],ymm9[21],ymm1[22],ymm9[22],ymm1[23],ymm9[23]
; AVX512F-NEXT: vpmullw %ymm4, %ymm13, %ymm4
; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
; AVX512F-NEXT: vpackuswb %ymm3, %ymm4, %ymm3
;
; AVX512VL-LABEL: constant_funnnel_v64i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; AVX512VL-NEXT: vpsllw $4, %ymm3, %ymm4
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4
-; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536]
-; AVX512VL-NEXT: # ymm6 = mem[0,1,0,1]
-; AVX512VL-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm3
-; AVX512VL-NEXT: vpsllw $2, %ymm3, %ymm4
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
-; AVX512VL-NEXT: vpand %ymm7, %ymm4, %ymm4
-; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm8
-; AVX512VL-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm3
-; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm4
-; AVX512VL-NEXT: vpaddb %ymm8, %ymm8, %ymm9
-; AVX512VL-NEXT: vpblendvb %ymm9, %ymm4, %ymm3, %ymm3
-; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm10 = ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15],ymm2[24],ymm4[24],ymm2[25],ymm4[25],ymm2[26],ymm4[26],ymm2[27],ymm4[27],ymm2[28],ymm4[28],ymm2[29],ymm4[29],ymm2[30],ymm4[30],ymm2[31],ymm4[31]
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512VL-NEXT: vpsllw $4, %ymm2, %ymm3
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512VL-NEXT: vpand %ymm4, %ymm3, %ymm3
+; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536]
+; AVX512VL-NEXT: # ymm5 = mem[0,1,0,1]
+; AVX512VL-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT: vpsllw $2, %ymm2, %ymm3
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX512VL-NEXT: vpand %ymm6, %ymm3, %ymm3
+; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm7
+; AVX512VL-NEXT: vpblendvb %ymm7, %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm3
+; AVX512VL-NEXT: vpaddb %ymm7, %ymm7, %ymm8
+; AVX512VL-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512VL-NEXT: vpxor %xmm9, %xmm9, %xmm9
+; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm10 = ymm3[8],ymm9[8],ymm3[9],ymm9[9],ymm3[10],ymm9[10],ymm3[11],ymm9[11],ymm3[12],ymm9[12],ymm3[13],ymm9[13],ymm3[14],ymm9[14],ymm3[15],ymm9[15],ymm3[24],ymm9[24],ymm3[25],ymm9[25],ymm3[26],ymm9[26],ymm3[27],ymm9[27],ymm3[28],ymm9[28],ymm3[29],ymm9[29],ymm3[30],ymm9[30],ymm3[31],ymm9[31]
; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128]
; AVX512VL-NEXT: # ymm11 = mem[0,1,0,1]
; AVX512VL-NEXT: vpmullw %ymm11, %ymm10, %ymm10
; AVX512VL-NEXT: vpsrlw $8, %ymm10, %ymm10
-; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm12 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[16],ymm4[16],ymm2[17],ymm4[17],ymm2[18],ymm4[18],ymm2[19],ymm4[19],ymm2[20],ymm4[20],ymm2[21],ymm4[21],ymm2[22],ymm4[22],ymm2[23],ymm4[23]
+; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm12 = ymm3[0],ymm9[0],ymm3[1],ymm9[1],ymm3[2],ymm9[2],ymm3[3],ymm9[3],ymm3[4],ymm9[4],ymm3[5],ymm9[5],ymm3[6],ymm9[6],ymm3[7],ymm9[7],ymm3[16],ymm9[16],ymm3[17],ymm9[17],ymm3[18],ymm9[18],ymm3[19],ymm9[19],ymm3[20],ymm9[20],ymm3[21],ymm9[21],ymm3[22],ymm9[22],ymm3[23],ymm9[23]
; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
; AVX512VL-NEXT: # ymm13 = mem[0,1,0,1]
; AVX512VL-NEXT: vpmullw %ymm13, %ymm12, %ymm12
; AVX512VL-NEXT: vpsrlw $8, %ymm12, %ymm12
; AVX512VL-NEXT: vpackuswb %ymm10, %ymm12, %ymm10
-; AVX512VL-NEXT: vpor %ymm3, %ymm10, %ymm3
+; AVX512VL-NEXT: vpor %ymm2, %ymm10, %ymm2
; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm10 = [18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360]
-; AVX512VL-NEXT: vpblendvb %ymm10, %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT: vpblendvb %ymm10, %ymm2, %ymm3, %ymm2
; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm3
-; AVX512VL-NEXT: vpand %ymm5, %ymm3, %ymm3
-; AVX512VL-NEXT: vpblendvb %ymm6, %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT: vpand %ymm4, %ymm3, %ymm3
+; AVX512VL-NEXT: vpblendvb %ymm5, %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm3
-; AVX512VL-NEXT: vpand %ymm7, %ymm3, %ymm3
-; AVX512VL-NEXT: vpblendvb %ymm8, %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT: vpand %ymm6, %ymm3, %ymm3
+; AVX512VL-NEXT: vpblendvb %ymm7, %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm3
-; AVX512VL-NEXT: vpblendvb %ymm9, %ymm3, %ymm0, %ymm0
-; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm4[8],ymm1[9],ymm4[9],ymm1[10],ymm4[10],ymm1[11],ymm4[11],ymm1[12],ymm4[12],ymm1[13],ymm4[13],ymm1[14],ymm4[14],ymm1[15],ymm4[15],ymm1[24],ymm4[24],ymm1[25],ymm4[25],ymm1[26],ymm4[26],ymm1[27],ymm4[27],ymm1[28],ymm4[28],ymm1[29],ymm4[29],ymm1[30],ymm4[30],ymm1[31],ymm4[31]
+; AVX512VL-NEXT: vpblendvb %ymm8, %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm9[8],ymm1[9],ymm9[9],ymm1[10],ymm9[10],ymm1[11],ymm9[11],ymm1[12],ymm9[12],ymm1[13],ymm9[13],ymm1[14],ymm9[14],ymm1[15],ymm9[15],ymm1[24],ymm9[24],ymm1[25],ymm9[25],ymm1[26],ymm9[26],ymm1[27],ymm9[27],ymm1[28],ymm9[28],ymm1[29],ymm9[29],ymm1[30],ymm9[30],ymm1[31],ymm9[31]
; AVX512VL-NEXT: vpmullw %ymm3, %ymm11, %ymm3
; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[4],ymm4[4],ymm1[5],ymm4[5],ymm1[6],ymm4[6],ymm1[7],ymm4[7],ymm1[16],ymm4[16],ymm1[17],ymm4[17],ymm1[18],ymm4[18],ymm1[19],ymm4[19],ymm1[20],ymm4[20],ymm1[21],ymm4[21],ymm1[22],ymm4[22],ymm1[23],ymm4[23]
+; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm1[0],ymm9[0],ymm1[1],ymm9[1],ymm1[2],ymm9[2],ymm1[3],ymm9[3],ymm1[4],ymm9[4],ymm1[5],ymm9[5],ymm1[6],ymm9[6],ymm1[7],ymm9[7],ymm1[16],ymm9[16],ymm1[17],ymm9[17],ymm1[18],ymm9[18],ymm1[19],ymm9[19],ymm1[20],ymm9[20],ymm1[21],ymm9[21],ymm1[22],ymm9[22],ymm1[23],ymm9[23]
; AVX512VL-NEXT: vpmullw %ymm4, %ymm13, %ymm4
; AVX512VL-NEXT: vpsrlw $8, %ymm4, %ymm4
; AVX512VL-NEXT: vpackuswb %ymm3, %ymm4, %ymm3
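The hunks above and below are mechanical churn in the auto-generated assertions for the 512-bit funnel-shift and rotate tests (presumably regenerated with utils/update_llc_test_checks.py): with v32i16/v64i8 now legal in a single ZMM register even without AVX512BW, the per-YMM arithmetic is unchanged, but the halves are reassembled with vinserti64x4 first and the final OR of the two shifted sides is done once as a 512-bit vporq instead of twice as 256-bit vpors. For orientation, a minimal sketch of the kind of IR these tests compile; the function name, the fshl direction, and the amount 9 are illustrative assumptions rather than the test's actual source (fshl by 9 and fshr by 7 lower to the same i16 shift pair):

; Hypothetical example (not from the patch): per i16 lane, funnel-shift the
; concatenation x:y left by 9 and keep the high half: (x << 9) | (y >> 7).
define <32 x i16> @example_fshl_v32i16(<32 x i16> %x, <32 x i16> %y) nounwind {
  %r = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %x, <32 x i16> %y,
         <32 x i16> <i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9,
                     i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9,
                     i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9,
                     i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9>)
  ret <32 x i16> %r
}
declare <32 x i16> @llvm.fshl.v32i16(<32 x i16>, <32 x i16>, <32 x i16>)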
define <32 x i16> @splatconstant_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y) nounwind {
; AVX512F-LABEL: splatconstant_funnnel_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
-; AVX512F-NEXT: vpsrlw $7, %ymm3, %ymm3
-; AVX512F-NEXT: vpsllw $9, %ymm2, %ymm2
-; AVX512F-NEXT: vpor %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpsrlw $7, %ymm1, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1
; AVX512F-NEXT: vpsrlw $7, %ymm1, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
+; AVX512F-NEXT: vpsllw $9, %ymm0, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512F-NEXT: vpsllw $9, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
+; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_funnnel_v32i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
-; AVX512VL-NEXT: vpsrlw $7, %ymm3, %ymm3
-; AVX512VL-NEXT: vpsllw $9, %ymm2, %ymm2
-; AVX512VL-NEXT: vpor %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT: vpsrlw $7, %ymm1, %ymm2
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm1
; AVX512VL-NEXT: vpsrlw $7, %ymm1, %ymm1
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
+; AVX512VL-NEXT: vpsllw $9, %ymm0, %ymm2
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512VL-NEXT: vpsllw $9, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
+; AVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_funnnel_v32i16:
define <64 x i8> @splatconstant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
; AVX512F-LABEL: splatconstant_funnnel_v64i8:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512F-NEXT: vpsrlw $4, %ymm2, %ymm2
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
+; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
-; AVX512F-NEXT: vpsrlw $4, %ymm3, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; AVX512F-NEXT: vpandn %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsllw $4, %ymm2, %ymm2
-; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm2
-; AVX512F-NEXT: vpor %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
-; AVX512F-NEXT: vpandn %ymm1, %ymm4, %ymm1
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm0
-; AVX512F-NEXT: vpand %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_funnnel_v64i8:
; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512VL-NEXT: vpsrlw $4, %ymm2, %ymm2
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm1
+; AVX512VL-NEXT: vpand %ymm3, %ymm1, %ymm1
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
-; AVX512VL-NEXT: vpsrlw $4, %ymm3, %ymm3
; AVX512VL-NEXT: vpsllw $4, %ymm2, %ymm2
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; AVX512VL-NEXT: vpternlogq $226, %ymm3, %ymm4, %ymm2
-; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm1
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpternlogq $226, %ymm1, %ymm4, %ymm0
+; AVX512VL-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_funnnel_v64i8:
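The next hunks cover fully variable per-element amounts. Without AVX512BW there is no variable 16-bit shift, so each YMM half is zero-extended to i32 lanes in a ZMM (vpmovzxwd), shifted with vpsllvd/vpsrlvd, and truncated back (vpmovdw); the rotate is built from the identity rot(x, a) = (x >> (a & 15)) | (x << ((0 - a) & 15)). Under the new ABI both halves are rebuilt into full ZMMs before a single vporq combines the two sides. A hypothetical source-level equivalent, assuming the rotate flavor of the test (a funnel shift of a value with itself; the fshr direction is an assumption):

; Hypothetical example (not from the patch): variable per-element rotate of
; 16-bit lanes, expressed as a funnel shift with both inputs equal.
define <32 x i16> @example_var_rot_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind {
  %r = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %x, <32 x i16> %x,
                                         <32 x i16> %amt)
  ret <32 x i16> %r
}
declare <32 x i16> @llvm.fshr.v32i16(<32 x i16>, <32 x i16>, <32 x i16>)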
define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind {
; AVX512F-LABEL: var_funnnel_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
-; AVX512F-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX512F-NEXT: vpsubw %ymm3, %ymm4, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512F-NEXT: vpand %ymm5, %ymm3, %ymm3
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm6 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
-; AVX512F-NEXT: vpsllvd %zmm6, %zmm2, %zmm6
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512F-NEXT: vpsubw %ymm3, %ymm7, %ymm3
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512F-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm3
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
+; AVX512F-NEXT: vpsrlvd %zmm4, %zmm2, %zmm4
+; AVX512F-NEXT: vpmovdw %zmm4, %ymm4
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm3
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
-; AVX512F-NEXT: vpsrlvd %zmm3, %zmm2, %zmm2
-; AVX512F-NEXT: vpord %zmm2, %zmm6, %zmm2
+; AVX512F-NEXT: vpsrlvd %zmm3, %zmm0, %zmm3
+; AVX512F-NEXT: vpmovdw %zmm3, %ymm3
+; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3
+; AVX512F-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX512F-NEXT: vpsubw %ymm1, %ymm4, %ymm5
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero
+; AVX512F-NEXT: vpsllvd %zmm5, %zmm2, %zmm2
; AVX512F-NEXT: vpmovdw %zmm2, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1
; AVX512F-NEXT: vpsubw %ymm1, %ymm4, %ymm1
-; AVX512F-NEXT: vpand %ymm5, %ymm1, %ymm1
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512F-NEXT: vpsllvd %zmm3, %zmm0, %zmm3
-; AVX512F-NEXT: vpsubw %ymm1, %ymm7, %ymm1
+; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
-; AVX512F-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpord %zmm0, %zmm3, %zmm0
+; AVX512F-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
+; AVX512F-NEXT: vporq %zmm3, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v32i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
-; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX512VL-NEXT: vpsubw %ymm3, %ymm4, %ymm3
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VL-NEXT: vpand %ymm5, %ymm3, %ymm3
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm6 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
-; AVX512VL-NEXT: vpsllvd %zmm6, %zmm2, %zmm6
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: vpsubw %ymm3, %ymm7, %ymm3
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512VL-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm3
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
+; AVX512VL-NEXT: vpsrlvd %zmm4, %zmm2, %zmm4
+; AVX512VL-NEXT: vpmovdw %zmm4, %ymm4
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm3, %ymm3
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
-; AVX512VL-NEXT: vpsrlvd %zmm3, %zmm2, %zmm2
-; AVX512VL-NEXT: vpord %zmm2, %zmm6, %zmm2
+; AVX512VL-NEXT: vpsrlvd %zmm3, %zmm0, %zmm3
+; AVX512VL-NEXT: vpmovdw %zmm3, %ymm3
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3
+; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX512VL-NEXT: vpsubw %ymm1, %ymm4, %ymm5
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT: vpand %ymm6, %ymm5, %ymm5
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero
+; AVX512VL-NEXT: vpsllvd %zmm5, %zmm2, %zmm2
; AVX512VL-NEXT: vpmovdw %zmm2, %ymm2
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm1
; AVX512VL-NEXT: vpsubw %ymm1, %ymm4, %ymm1
-; AVX512VL-NEXT: vpand %ymm5, %ymm1, %ymm1
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512VL-NEXT: vpsllvd %zmm3, %zmm0, %zmm3
-; AVX512VL-NEXT: vpsubw %ymm1, %ymm7, %ymm1
+; AVX512VL-NEXT: vpand %ymm6, %ymm1, %ymm1
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
-; AVX512VL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
-; AVX512VL-NEXT: vpord %zmm0, %zmm3, %zmm0
+; AVX512VL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
+; AVX512VL-NEXT: vporq %zmm3, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v32i16:
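var_funnnel_v64i8 is the hardest case: x86 has no per-byte shifts at all. The expansion shifts 16-bit words by immediates 4, 2 and 1, masks off the bits that crossed byte boundaries (the 240/252 and 15/63/127 constants), and uses vpblendvb to keep the shifted byte only where the corresponding amount bit is set. vpsllw $5 first parks the 3-bit amount in each byte's sign bit, which is what vpblendvb tests, and vpaddb doubles the amount between steps to expose the next bit; the ladder runs once for the left-shift side and once for the right-shift side of each half before the final vporq. A scalar model of the decomposition, with hypothetical names (any rotate by a 3-bit amount is the composition of optional rotates by 4, 2 and 1):

; Hypothetical scalar model (not from the patch) of the conditional-shift
; ladder: apply a rotate of 4, 2 and 1 for each set bit of the amount.
define i8 @example_rot_ladder(i8 %x, i8 %amt) nounwind {
  %a  = and i8 %amt, 7
  %b2 = and i8 %a, 4
  %c2 = icmp ne i8 %b2, 0
  %s4 = call i8 @llvm.fshr.i8(i8 %x, i8 %x, i8 4)
  %x1 = select i1 %c2, i8 %s4, i8 %x
  %b1 = and i8 %a, 2
  %c1 = icmp ne i8 %b1, 0
  %s2 = call i8 @llvm.fshr.i8(i8 %x1, i8 %x1, i8 2)
  %x2 = select i1 %c1, i8 %s2, i8 %x1
  %b0 = and i8 %a, 1
  %c0 = icmp ne i8 %b0, 0
  %s1 = call i8 @llvm.fshr.i8(i8 %x2, i8 %x2, i8 1)
  %x3 = select i1 %c0, i8 %s1, i8 %x2
  ret i8 %x3
}
declare i8 @llvm.fshr.i8(i8, i8, i8)

Three blend steps suffice because a byte rotate amount fits in 3 bits.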
define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind {
; AVX512F-LABEL: var_funnnel_v64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; AVX512F-NEXT: vpsrlw $4, %ymm3, %ymm4
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; AVX512F-NEXT: vpandn %ymm4, %ymm5, %ymm4
-; AVX512F-NEXT: vpsllw $4, %ymm3, %ymm6
-; AVX512F-NEXT: vpand %ymm5, %ymm6, %ymm6
-; AVX512F-NEXT: vpor %ymm4, %ymm6, %ymm4
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512F-NEXT: vpsllw $4, %ymm2, %ymm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm5
; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6
-; AVX512F-NEXT: vpsubb %ymm2, %ymm6, %ymm2
-; AVX512F-NEXT: vpsllw $5, %ymm2, %ymm2
-; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
-; AVX512F-NEXT: vpsrlw $6, %ymm3, %ymm4
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
-; AVX512F-NEXT: vpandn %ymm4, %ymm7, %ymm4
+; AVX512F-NEXT: vpsubb %ymm5, %ymm6, %ymm5
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512F-NEXT: vpand %ymm7, %ymm5, %ymm5
+; AVX512F-NEXT: vpsllw $5, %ymm5, %ymm5
+; AVX512F-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm3
; AVX512F-NEXT: vpsllw $2, %ymm3, %ymm8
-; AVX512F-NEXT: vpand %ymm7, %ymm8, %ymm8
-; AVX512F-NEXT: vpor %ymm4, %ymm8, %ymm4
-; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2
-; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
-; AVX512F-NEXT: vpsrlw $7, %ymm3, %ymm4
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX512F-NEXT: vpand %ymm9, %ymm8, %ymm8
+; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm5
+; AVX512F-NEXT: vpblendvb %ymm5, %ymm8, %ymm3, %ymm3
+; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm8
+; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm5
+; AVX512F-NEXT: vpblendvb %ymm5, %ymm8, %ymm3, %ymm3
+; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm5
+; AVX512F-NEXT: vpand %ymm4, %ymm5, %ymm4
+; AVX512F-NEXT: vpsubb %ymm1, %ymm6, %ymm5
+; AVX512F-NEXT: vpand %ymm7, %ymm5, %ymm5
+; AVX512F-NEXT: vpsllw $5, %ymm5, %ymm5
+; AVX512F-NEXT: vpblendvb %ymm5, %ymm4, %ymm0, %ymm4
+; AVX512F-NEXT: vpsllw $2, %ymm4, %ymm6
+; AVX512F-NEXT: vpand %ymm6, %ymm9, %ymm6
+; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm5
+; AVX512F-NEXT: vpblendvb %ymm5, %ymm6, %ymm4, %ymm4
+; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm6
+; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm5
+; AVX512F-NEXT: vpblendvb %ymm5, %ymm6, %ymm4, %ymm4
+; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3
+; AVX512F-NEXT: vpsrlw $4, %ymm2, %ymm4
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4
+; AVX512F-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm6
+; AVX512F-NEXT: vpsllw $5, %ymm6, %ymm6
+; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm2, %ymm2
+; AVX512F-NEXT: vpsrlw $2, %ymm2, %ymm4
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX512F-NEXT: vpand %ymm7, %ymm4, %ymm4
+; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm6
+; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm2, %ymm2
+; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm4
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX512F-NEXT: vpand %ymm4, %ymm8, %ymm4
-; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm9
-; AVX512F-NEXT: vpor %ymm4, %ymm9, %ymm4
-; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2
-; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
-; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm3
-; AVX512F-NEXT: vpandn %ymm3, %ymm5, %ymm3
-; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm4
+; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm6
+; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm2, %ymm2
+; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4
-; AVX512F-NEXT: vpor %ymm3, %ymm4, %ymm3
-; AVX512F-NEXT: vpsubb %ymm1, %ymm6, %ymm1
; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm1
-; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw $6, %ymm0, %ymm3
-; AVX512F-NEXT: vpandn %ymm3, %ymm7, %ymm3
-; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm4
+; AVX512F-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT: vpsrlw $2, %ymm0, %ymm4
; AVX512F-NEXT: vpand %ymm7, %ymm4, %ymm4
-; AVX512F-NEXT: vpor %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
-; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm3
-; AVX512F-NEXT: vpand %ymm3, %ymm8, %ymm3
-; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm4
-; AVX512F-NEXT: vpor %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm4
+; AVX512F-NEXT: vpand %ymm4, %ymm8, %ymm4
; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
-; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512F-NEXT: vporq %zmm0, %zmm3, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v64i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; AVX512VL-NEXT: vpsrlw $4, %ymm3, %ymm4
-; AVX512VL-NEXT: vpsllw $4, %ymm3, %ymm5
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; AVX512VL-NEXT: vpternlogq $226, %ymm4, %ymm6, %ymm5
-; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX512VL-NEXT: vpsubb %ymm2, %ymm4, %ymm2
-; AVX512VL-NEXT: vpsllw $5, %ymm2, %ymm2
-; AVX512VL-NEXT: vpblendvb %ymm2, %ymm5, %ymm3, %ymm3
-; AVX512VL-NEXT: vpsrlw $6, %ymm3, %ymm5
-; AVX512VL-NEXT: vpsllw $2, %ymm3, %ymm7
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
-; AVX512VL-NEXT: vpternlogq $226, %ymm5, %ymm8, %ymm7
-; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2
-; AVX512VL-NEXT: vpblendvb %ymm2, %ymm7, %ymm3, %ymm3
-; AVX512VL-NEXT: vpsrlw $7, %ymm3, %ymm5
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512VL-NEXT: vpsllw $4, %ymm2, %ymm3
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512VL-NEXT: vpand %ymm4, %ymm3, %ymm3
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm5
+; AVX512VL-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX512VL-NEXT: vpsubb %ymm5, %ymm6, %ymm5
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512VL-NEXT: vpand %ymm7, %ymm5, %ymm5
-; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm9
-; AVX512VL-NEXT: vpor %ymm5, %ymm9, %ymm5
-; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2
-; AVX512VL-NEXT: vpblendvb %ymm2, %ymm5, %ymm3, %ymm2
-; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3
+; AVX512VL-NEXT: vpsllw $5, %ymm5, %ymm5
+; AVX512VL-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm3
+; AVX512VL-NEXT: vpsllw $2, %ymm3, %ymm8
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX512VL-NEXT: vpand %ymm9, %ymm8, %ymm8
+; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5
+; AVX512VL-NEXT: vpblendvb %ymm5, %ymm8, %ymm3, %ymm3
+; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm8
+; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5
+; AVX512VL-NEXT: vpblendvb %ymm5, %ymm8, %ymm3, %ymm3
; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm5
-; AVX512VL-NEXT: vpternlogq $226, %ymm3, %ymm6, %ymm5
-; AVX512VL-NEXT: vpsubb %ymm1, %ymm4, %ymm1
+; AVX512VL-NEXT: vpand %ymm4, %ymm5, %ymm4
+; AVX512VL-NEXT: vpsubb %ymm1, %ymm6, %ymm5
+; AVX512VL-NEXT: vpand %ymm7, %ymm5, %ymm5
+; AVX512VL-NEXT: vpsllw $5, %ymm5, %ymm5
+; AVX512VL-NEXT: vpblendvb %ymm5, %ymm4, %ymm0, %ymm4
+; AVX512VL-NEXT: vpsllw $2, %ymm4, %ymm6
+; AVX512VL-NEXT: vpand %ymm6, %ymm9, %ymm6
+; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5
+; AVX512VL-NEXT: vpblendvb %ymm5, %ymm6, %ymm4, %ymm4
+; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm6
+; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5
+; AVX512VL-NEXT: vpblendvb %ymm5, %ymm6, %ymm4, %ymm4
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3
+; AVX512VL-NEXT: vpsrlw $4, %ymm2, %ymm4
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4
+; AVX512VL-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm6
+; AVX512VL-NEXT: vpsllw $5, %ymm6, %ymm6
+; AVX512VL-NEXT: vpblendvb %ymm6, %ymm4, %ymm2, %ymm2
+; AVX512VL-NEXT: vpsrlw $2, %ymm2, %ymm4
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX512VL-NEXT: vpand %ymm7, %ymm4, %ymm4
+; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm6
+; AVX512VL-NEXT: vpblendvb %ymm6, %ymm4, %ymm2, %ymm2
+; AVX512VL-NEXT: vpsrlw $1, %ymm2, %ymm4
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512VL-NEXT: vpand %ymm4, %ymm8, %ymm4
+; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm6
+; AVX512VL-NEXT: vpblendvb %ymm6, %ymm4, %ymm2, %ymm2
+; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm4
+; AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4
; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1
-; AVX512VL-NEXT: vpblendvb %ymm1, %ymm5, %ymm0, %ymm0
-; AVX512VL-NEXT: vpsrlw $6, %ymm0, %ymm3
-; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm4
-; AVX512VL-NEXT: vpternlogq $226, %ymm3, %ymm8, %ymm4
+; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
+; AVX512VL-NEXT: vpsrlw $2, %ymm0, %ymm4
+; AVX512VL-NEXT: vpand %ymm7, %ymm4, %ymm4
; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm3
-; AVX512VL-NEXT: vpand %ymm7, %ymm3, %ymm3
-; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm4
-; AVX512VL-NEXT: vpor %ymm3, %ymm4, %ymm3
+; AVX512VL-NEXT: vpsrlw $1, %ymm0, %ymm4
+; AVX512VL-NEXT: vpand %ymm4, %ymm8, %ymm4
; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
-; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512VL-NEXT: vporq %zmm0, %zmm3, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v64i8:
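splatvar_funnnel below uses a runtime amount that is uniform across lanes, broadcast from element 0. A uniform count allows the count-in-XMM shift forms, which is why one side of the expansion collapses to plain vpsrlw with an %xmm count while the other side still goes through the widened vpsllvd path. A hypothetical source form, again assuming the rotate variant:

; Hypothetical example (not from the patch): rotate every i16 lane by the same
; runtime amount, splatted from element 0 of %amt.
define <32 x i16> @example_splatvar_rot_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind {
  %s = shufflevector <32 x i16> %amt, <32 x i16> undef, <32 x i32> zeroinitializer
  %r = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %x, <32 x i16> %x, <32 x i16> %s)
  ret <32 x i16> %r
}
declare <32 x i16> @llvm.fshr.v32i16(<32 x i16>, <32 x i16>, <32 x i16>)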
define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind {
; AVX512F-LABEL: splatvar_funnnel_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512F-NEXT: vpbroadcastw %xmm1, %xmm1
-; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512F-NEXT: vpsubw %xmm1, %xmm3, %xmm1
+; AVX512F-NEXT: vpbroadcastw %xmm1, %ymm2
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX512F-NEXT: vpsllw %xmm3, %ymm2, %ymm4
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
-; AVX512F-NEXT: vpsubw %xmm1, %xmm5, %xmm1
; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX512F-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
-; AVX512F-NEXT: vpor %ymm2, %ymm4, %ymm2
-; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm3
-; AVX512F-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm0, %ymm3, %ymm0
+; AVX512F-NEXT: vpsrlw %xmm1, %ymm3, %ymm4
+; AVX512F-NEXT: vpsrlw %xmm1, %ymm0, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1
+; AVX512F-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX512F-NEXT: vpsubw %ymm2, %ymm4, %ymm5
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512F-NEXT: vpsllvd %zmm5, %zmm0, %zmm0
+; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm2
+; AVX512F-NEXT: vpsubw %ymm2, %ymm4, %ymm2
+; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
+; AVX512F-NEXT: vpsllvd %zmm2, %zmm3, %zmm2
+; AVX512F-NEXT: vpmovdw %zmm2, %ymm2
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_funnnel_v32i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512VL-NEXT: vpbroadcastw %xmm1, %xmm1
-; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512VL-NEXT: vpsubw %xmm1, %xmm3, %xmm1
+; AVX512VL-NEXT: vpbroadcastw %xmm1, %ymm2
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX512VL-NEXT: vpsllw %xmm3, %ymm2, %ymm4
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: vpsubw %xmm1, %xmm5, %xmm1
; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX512VL-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
-; AVX512VL-NEXT: vpor %ymm2, %ymm4, %ymm2
-; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm3
-; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm0, %ymm3, %ymm0
+; AVX512VL-NEXT: vpsrlw %xmm1, %ymm3, %ymm4
+; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1
+; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX512VL-NEXT: vpsubw %ymm2, %ymm4, %ymm5
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT: vpand %ymm6, %ymm5, %ymm5
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512VL-NEXT: vpsllvd %zmm5, %zmm0, %zmm0
+; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm2, %ymm2
+; AVX512VL-NEXT: vpsubw %ymm2, %ymm4, %ymm2
+; AVX512VL-NEXT: vpand %ymm6, %ymm2, %ymm2
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
+; AVX512VL-NEXT: vpsllvd %zmm2, %zmm3, %zmm2
+; AVX512VL-NEXT: vpmovdw %zmm2, %ymm2
; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_funnnel_v32i16:
define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind {
; AVX512F-LABEL: splatvar_funnnel_v64i8:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpbroadcastb %xmm1, %ymm2
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm3
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512F-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512F-NEXT: vpsubb %xmm1, %xmm3, %xmm1
+; AVX512F-NEXT: vpsllw $4, %ymm2, %ymm4
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4
+; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm6
+; AVX512F-NEXT: vpxor %xmm7, %xmm7, %xmm7
+; AVX512F-NEXT: vpsubb %ymm6, %ymm7, %ymm6
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512F-NEXT: vpand %ymm6, %ymm8, %ymm6
+; AVX512F-NEXT: vpsllw $5, %ymm6, %ymm6
+; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm2, %ymm4
+; AVX512F-NEXT: vpsllw $2, %ymm4, %ymm9
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm10 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX512F-NEXT: vpand %ymm10, %ymm9, %ymm9
+; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm6
+; AVX512F-NEXT: vpblendvb %ymm6, %ymm9, %ymm4, %ymm4
+; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm9
+; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm6
+; AVX512F-NEXT: vpblendvb %ymm6, %ymm9, %ymm4, %ymm4
+; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm6
+; AVX512F-NEXT: vpand %ymm5, %ymm6, %ymm5
+; AVX512F-NEXT: vpsubb %ymm3, %ymm7, %ymm3
+; AVX512F-NEXT: vpand %ymm3, %ymm8, %ymm3
+; AVX512F-NEXT: vpsllw $5, %ymm3, %ymm3
+; AVX512F-NEXT: vpblendvb %ymm3, %ymm5, %ymm0, %ymm5
+; AVX512F-NEXT: vpsllw $2, %ymm5, %ymm6
+; AVX512F-NEXT: vpand %ymm6, %ymm10, %ymm6
+; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3
+; AVX512F-NEXT: vpblendvb %ymm3, %ymm6, %ymm5, %ymm5
+; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm6
+; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3
+; AVX512F-NEXT: vpblendvb %ymm3, %ymm6, %ymm5, %ymm3
+; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3
; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT: vpsllw %xmm3, %ymm2, %ymm4
-; AVX512F-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX512F-NEXT: vpsllw %xmm3, %xmm5, %xmm6
-; AVX512F-NEXT: vpbroadcastb %xmm6, %ymm6
-; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512F-NEXT: vpsubb %xmm1, %xmm7, %xmm1
; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
-; AVX512F-NEXT: vpsrlw %xmm1, %xmm5, %xmm5
-; AVX512F-NEXT: vpsrlw $8, %xmm5, %xmm5
-; AVX512F-NEXT: vpbroadcastb %xmm5, %ymm5
-; AVX512F-NEXT: vpand %ymm5, %ymm2, %ymm2
-; AVX512F-NEXT: vpor %ymm2, %ymm4, %ymm2
-; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm3
-; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3
+; AVX512F-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX512F-NEXT: vpsrlw %xmm1, %xmm4, %xmm4
+; AVX512F-NEXT: vpsrlw $8, %xmm4, %xmm4
+; AVX512F-NEXT: vpbroadcastb %xmm4, %ymm4
+; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm2
; AVX512F-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpand %ymm5, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm0, %ymm3, %ymm0
+; AVX512F-NEXT: vpand %ymm4, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512F-NEXT: vporq %zmm0, %zmm3, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_funnnel_v64i8:
; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpbroadcastb %xmm1, %ymm2
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm3
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512VL-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512VL-NEXT: vpsubb %xmm1, %xmm3, %xmm1
+; AVX512VL-NEXT: vpsllw $4, %ymm2, %ymm4
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm3, %ymm6
+; AVX512VL-NEXT: vpxor %xmm7, %xmm7, %xmm7
+; AVX512VL-NEXT: vpsubb %ymm6, %ymm7, %ymm6
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VL-NEXT: vpand %ymm6, %ymm8, %ymm6
+; AVX512VL-NEXT: vpsllw $5, %ymm6, %ymm6
+; AVX512VL-NEXT: vpblendvb %ymm6, %ymm4, %ymm2, %ymm4
+; AVX512VL-NEXT: vpsllw $2, %ymm4, %ymm9
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm10 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX512VL-NEXT: vpand %ymm10, %ymm9, %ymm9
+; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm6
+; AVX512VL-NEXT: vpblendvb %ymm6, %ymm9, %ymm4, %ymm4
+; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm9
+; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm6
+; AVX512VL-NEXT: vpblendvb %ymm6, %ymm9, %ymm4, %ymm4
+; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm6
+; AVX512VL-NEXT: vpand %ymm5, %ymm6, %ymm5
+; AVX512VL-NEXT: vpsubb %ymm3, %ymm7, %ymm3
+; AVX512VL-NEXT: vpand %ymm3, %ymm8, %ymm3
+; AVX512VL-NEXT: vpsllw $5, %ymm3, %ymm3
+; AVX512VL-NEXT: vpblendvb %ymm3, %ymm5, %ymm0, %ymm5
+; AVX512VL-NEXT: vpsllw $2, %ymm5, %ymm6
+; AVX512VL-NEXT: vpand %ymm6, %ymm10, %ymm6
+; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3
+; AVX512VL-NEXT: vpblendvb %ymm3, %ymm6, %ymm5, %ymm5
+; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm6
+; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3
+; AVX512VL-NEXT: vpblendvb %ymm3, %ymm6, %ymm5, %ymm3
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3
; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT: vpsllw %xmm3, %ymm2, %ymm4
-; AVX512VL-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX512VL-NEXT: vpsllw %xmm3, %xmm5, %xmm6
-; AVX512VL-NEXT: vpbroadcastb %xmm6, %ymm6
-; AVX512VL-NEXT: vpand %ymm6, %ymm4, %ymm4
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VL-NEXT: vpsubb %xmm1, %xmm7, %xmm1
; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VL-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
-; AVX512VL-NEXT: vpsrlw %xmm1, %xmm5, %xmm5
-; AVX512VL-NEXT: vpsrlw $8, %xmm5, %xmm5
-; AVX512VL-NEXT: vpbroadcastb %xmm5, %ymm5
-; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2
-; AVX512VL-NEXT: vpor %ymm2, %ymm4, %ymm2
-; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm3
-; AVX512VL-NEXT: vpand %ymm6, %ymm3, %ymm3
+; AVX512VL-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX512VL-NEXT: vpsrlw %xmm1, %xmm4, %xmm4
+; AVX512VL-NEXT: vpsrlw $8, %xmm4, %xmm4
+; AVX512VL-NEXT: vpbroadcastb %xmm4, %ymm4
+; AVX512VL-NEXT: vpand %ymm4, %ymm2, %ymm2
; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT: vpand %ymm5, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm0, %ymm3, %ymm0
+; AVX512VL-NEXT: vpand %ymm4, %ymm0, %ymm0
; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512VL-NEXT: vporq %zmm0, %zmm3, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_funnnel_v64i8:
; AVX512F-NEXT: vpmulhuw %ymm2, %ymm1, %ymm3
; AVX512F-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0],xmm3[1,2,3,4,5,6,7]
; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [1,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2]
-; AVX512F-NEXT: vpmullw %ymm4, %ymm1, %ymm1
-; AVX512F-NEXT: vpor %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpmulhuw %ymm2, %ymm0, %ymm2
-; AVX512F-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0],xmm2[1,2,3,4,5,6,7]
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX512F-NEXT: vpmullw %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1,2,3,4,5,6,7]
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
+; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [1,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2]
+; AVX512F-NEXT: vpmullw %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpmullw %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vporq %zmm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: constant_funnnel_v32i16:
; AVX512VL-NEXT: vpmulhuw %ymm2, %ymm1, %ymm3
; AVX512VL-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0],xmm3[1,2,3,4,5,6,7]
; AVX512VL-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [1,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2]
-; AVX512VL-NEXT: vpmullw %ymm4, %ymm1, %ymm1
-; AVX512VL-NEXT: vpor %ymm3, %ymm1, %ymm1
; AVX512VL-NEXT: vpmulhuw %ymm2, %ymm0, %ymm2
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0],xmm2[1,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX512VL-NEXT: vpmullw %ymm4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1,2,3,4,5,6,7]
+; AVX512VL-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [1,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2]
+; AVX512VL-NEXT: vpmullw %ymm3, %ymm1, %ymm1
+; AVX512VL-NEXT: vpmullw %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vporq %zmm2, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: constant_funnnel_v32i16:
; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm5
; AVX512F-NEXT: vpaddb %ymm7, %ymm7, %ymm8
; AVX512F-NEXT: vpblendvb %ymm8, %ymm5, %ymm2, %ymm2
-; AVX512F-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm9 = ymm1[8],ymm5[8],ymm1[9],ymm5[9],ymm1[10],ymm5[10],ymm1[11],ymm5[11],ymm1[12],ymm5[12],ymm1[13],ymm5[13],ymm1[14],ymm5[14],ymm1[15],ymm5[15],ymm1[24],ymm5[24],ymm1[25],ymm5[25],ymm1[26],ymm5[26],ymm1[27],ymm5[27],ymm1[28],ymm5[28],ymm1[29],ymm5[29],ymm1[30],ymm5[30],ymm1[31],ymm5[31]
-; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128]
-; AVX512F-NEXT: # ymm10 = mem[0,1,0,1]
-; AVX512F-NEXT: vpmullw %ymm10, %ymm9, %ymm9
-; AVX512F-NEXT: vpsrlw $8, %ymm9, %ymm9
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[4],ymm5[4],ymm1[5],ymm5[5],ymm1[6],ymm5[6],ymm1[7],ymm5[7],ymm1[16],ymm5[16],ymm1[17],ymm5[17],ymm1[18],ymm5[18],ymm1[19],ymm5[19],ymm1[20],ymm5[20],ymm1[21],ymm5[21],ymm1[22],ymm5[22],ymm1[23],ymm5[23]
-; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
-; AVX512F-NEXT: # ymm11 = mem[0,1,0,1]
-; AVX512F-NEXT: vpmullw %ymm1, %ymm11, %ymm1
+; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm5
+; AVX512F-NEXT: vpand %ymm3, %ymm5, %ymm3
+; AVX512F-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm3
+; AVX512F-NEXT: vpsllw $2, %ymm3, %ymm4
+; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4
+; AVX512F-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm4
+; AVX512F-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128]
+; AVX512F-NEXT: # ymm5 = mem[0,1,0,1]
+; AVX512F-NEXT: vpmullw %ymm5, %ymm4, %ymm4
+; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
+; AVX512F-NEXT: # ymm6 = mem[0,1,0,1]
+; AVX512F-NEXT: vpmullw %ymm6, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX512F-NEXT: vpackuswb %ymm9, %ymm1, %ymm1
-; AVX512F-NEXT: vpor %ymm1, %ymm2, %ymm1
-; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm2
-; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm2
-; AVX512F-NEXT: vpsllw $2, %ymm2, %ymm3
-; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3
-; AVX512F-NEXT: vpblendvb %ymm7, %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm3
-; AVX512F-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm5[8],ymm0[9],ymm5[9],ymm0[10],ymm5[10],ymm0[11],ymm5[11],ymm0[12],ymm5[12],ymm0[13],ymm5[13],ymm0[14],ymm5[14],ymm0[15],ymm5[15],ymm0[24],ymm5[24],ymm0[25],ymm5[25],ymm0[26],ymm5[26],ymm0[27],ymm5[27],ymm0[28],ymm5[28],ymm0[29],ymm5[29],ymm0[30],ymm5[30],ymm0[31],ymm5[31]
-; AVX512F-NEXT: vpmullw %ymm3, %ymm10, %ymm3
-; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[2],ymm5[2],ymm0[3],ymm5[3],ymm0[4],ymm5[4],ymm0[5],ymm5[5],ymm0[6],ymm5[6],ymm0[7],ymm5[7],ymm0[16],ymm5[16],ymm0[17],ymm5[17],ymm0[18],ymm5[18],ymm0[19],ymm5[19],ymm0[20],ymm5[20],ymm0[21],ymm5[21],ymm0[22],ymm5[22],ymm0[23],ymm5[23]
-; AVX512F-NEXT: vpmullw %ymm0, %ymm11, %ymm0
+; AVX512F-NEXT: vpackuswb %ymm4, %ymm1, %ymm1
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8],ymm3[8],ymm0[9],ymm3[9],ymm0[10],ymm3[10],ymm0[11],ymm3[11],ymm0[12],ymm3[12],ymm0[13],ymm3[13],ymm0[14],ymm3[14],ymm0[15],ymm3[15],ymm0[24],ymm3[24],ymm0[25],ymm3[25],ymm0[26],ymm3[26],ymm0[27],ymm3[27],ymm0[28],ymm3[28],ymm0[29],ymm3[29],ymm0[30],ymm3[30],ymm0[31],ymm3[31]
+; AVX512F-NEXT: vpmullw %ymm5, %ymm4, %ymm4
+; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[4],ymm3[4],ymm0[5],ymm3[5],ymm0[6],ymm3[6],ymm0[7],ymm3[7],ymm0[16],ymm3[16],ymm0[17],ymm3[17],ymm0[18],ymm3[18],ymm0[19],ymm3[19],ymm0[20],ymm3[20],ymm0[21],ymm3[21],ymm0[22],ymm3[22],ymm0[23],ymm3[23]
+; AVX512F-NEXT: vpmullw %ymm6, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT: vpackuswb %ymm4, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vporq %zmm0, %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: constant_funnnel_v64i8:
; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm5
; AVX512VL-NEXT: vpaddb %ymm7, %ymm7, %ymm8
; AVX512VL-NEXT: vpblendvb %ymm8, %ymm5, %ymm2, %ymm2
-; AVX512VL-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm9 = ymm1[8],ymm5[8],ymm1[9],ymm5[9],ymm1[10],ymm5[10],ymm1[11],ymm5[11],ymm1[12],ymm5[12],ymm1[13],ymm5[13],ymm1[14],ymm5[14],ymm1[15],ymm5[15],ymm1[24],ymm5[24],ymm1[25],ymm5[25],ymm1[26],ymm5[26],ymm1[27],ymm5[27],ymm1[28],ymm5[28],ymm1[29],ymm5[29],ymm1[30],ymm5[30],ymm1[31],ymm5[31]
-; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128]
-; AVX512VL-NEXT: # ymm10 = mem[0,1,0,1]
-; AVX512VL-NEXT: vpmullw %ymm10, %ymm9, %ymm9
-; AVX512VL-NEXT: vpsrlw $8, %ymm9, %ymm9
-; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[4],ymm5[4],ymm1[5],ymm5[5],ymm1[6],ymm5[6],ymm1[7],ymm5[7],ymm1[16],ymm5[16],ymm1[17],ymm5[17],ymm1[18],ymm5[18],ymm1[19],ymm5[19],ymm1[20],ymm5[20],ymm1[21],ymm5[21],ymm1[22],ymm5[22],ymm1[23],ymm5[23]
-; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
-; AVX512VL-NEXT: # ymm11 = mem[0,1,0,1]
-; AVX512VL-NEXT: vpmullw %ymm1, %ymm11, %ymm1
+; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm5
+; AVX512VL-NEXT: vpand %ymm3, %ymm5, %ymm3
+; AVX512VL-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm3
+; AVX512VL-NEXT: vpsllw $2, %ymm3, %ymm4
+; AVX512VL-NEXT: vpand %ymm6, %ymm4, %ymm4
+; AVX512VL-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm3
+; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm4
+; AVX512VL-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm3
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31]
+; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128]
+; AVX512VL-NEXT: # ymm5 = mem[0,1,0,1]
+; AVX512VL-NEXT: vpmullw %ymm5, %ymm4, %ymm4
+; AVX512VL-NEXT: vpsrlw $8, %ymm4, %ymm4
+; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23]
+; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
+; AVX512VL-NEXT: # ymm6 = mem[0,1,0,1]
+; AVX512VL-NEXT: vpmullw %ymm6, %ymm1, %ymm1
; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX512VL-NEXT: vpackuswb %ymm9, %ymm1, %ymm1
-; AVX512VL-NEXT: vpor %ymm1, %ymm2, %ymm1
-; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2
-; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
-; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm2
-; AVX512VL-NEXT: vpsllw $2, %ymm2, %ymm3
-; AVX512VL-NEXT: vpand %ymm6, %ymm3, %ymm3
-; AVX512VL-NEXT: vpblendvb %ymm7, %ymm3, %ymm2, %ymm2
-; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm3
-; AVX512VL-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2
-; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm5[8],ymm0[9],ymm5[9],ymm0[10],ymm5[10],ymm0[11],ymm5[11],ymm0[12],ymm5[12],ymm0[13],ymm5[13],ymm0[14],ymm5[14],ymm0[15],ymm5[15],ymm0[24],ymm5[24],ymm0[25],ymm5[25],ymm0[26],ymm5[26],ymm0[27],ymm5[27],ymm0[28],ymm5[28],ymm0[29],ymm5[29],ymm0[30],ymm5[30],ymm0[31],ymm5[31]
-; AVX512VL-NEXT: vpmullw %ymm3, %ymm10, %ymm3
-; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[2],ymm5[2],ymm0[3],ymm5[3],ymm0[4],ymm5[4],ymm0[5],ymm5[5],ymm0[6],ymm5[6],ymm0[7],ymm5[7],ymm0[16],ymm5[16],ymm0[17],ymm5[17],ymm0[18],ymm5[18],ymm0[19],ymm5[19],ymm0[20],ymm5[20],ymm0[21],ymm5[21],ymm0[22],ymm5[22],ymm0[23],ymm5[23]
-; AVX512VL-NEXT: vpmullw %ymm0, %ymm11, %ymm0
+; AVX512VL-NEXT: vpackuswb %ymm4, %ymm1, %ymm1
+; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8],ymm3[8],ymm0[9],ymm3[9],ymm0[10],ymm3[10],ymm0[11],ymm3[11],ymm0[12],ymm3[12],ymm0[13],ymm3[13],ymm0[14],ymm3[14],ymm0[15],ymm3[15],ymm0[24],ymm3[24],ymm0[25],ymm3[25],ymm0[26],ymm3[26],ymm0[27],ymm3[27],ymm0[28],ymm3[28],ymm0[29],ymm3[29],ymm0[30],ymm3[30],ymm0[31],ymm3[31]
+; AVX512VL-NEXT: vpmullw %ymm5, %ymm4, %ymm4
+; AVX512VL-NEXT: vpsrlw $8, %ymm4, %ymm4
+; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[4],ymm3[4],ymm0[5],ymm3[5],ymm0[6],ymm3[6],ymm0[7],ymm3[7],ymm0[16],ymm3[16],ymm0[17],ymm3[17],ymm0[18],ymm3[18],ymm0[19],ymm3[19],ymm0[20],ymm3[20],ymm0[21],ymm3[21],ymm0[22],ymm3[22],ymm0[23],ymm3[23]
+; AVX512VL-NEXT: vpmullw %ymm6, %ymm0, %ymm0
; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512VL-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0
+; AVX512VL-NEXT: vpackuswb %ymm4, %ymm0, %ymm0
; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vporq %zmm0, %zmm2, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: constant_funnnel_v64i8:
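Note on the constant_funnnel_v64i8 hunks above: without BWI the two 256-bit halves are still shifted independently, but the final combine is restructured. Each half used to be finished with its own 256-bit vpor; now the shifted halves are concatenated with vinserti64x4 first, and a single 512-bit vporq merges the "left" and "right" contributions. A reduced 8-wide sketch of the kind of IR such a test exercises (hypothetical function name, assuming the funnel-shift intrinsic):

  declare <8 x i8> @llvm.fshl.v8i8(<8 x i8>, <8 x i8>, <8 x i8>)

  define <8 x i8> @constant_funnel_sketch(<8 x i8> %x, <8 x i8> %y) {
    ; Per-element constant amounts, mirroring the constant_funnnel tests.
    %r = call <8 x i8> @llvm.fshl.v8i8(<8 x i8> %x, <8 x i8> %y,
                                       <8 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
    ret <8 x i8> %r
  }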
define <32 x i16> @splatconstant_funnnel_v32i16(<32 x i16> %x) nounwind {
; AVX512F-LABEL: splatconstant_funnnel_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vpsrlw $7, %ymm1, %ymm2
-; AVX512F-NEXT: vpsllw $9, %ymm1, %ymm1
-; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm2
+; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm1
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512F-NEXT: vpsrlw $7, %ymm2, %ymm3
+; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
; AVX512F-NEXT: vpsllw $9, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpsllw $9, %ymm2, %ymm2
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_funnnel_v32i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VL-NEXT: vpsrlw $7, %ymm1, %ymm2
-; AVX512VL-NEXT: vpsllw $9, %ymm1, %ymm1
-; AVX512VL-NEXT: vpor %ymm2, %ymm1, %ymm1
-; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm2
+; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm1
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512VL-NEXT: vpsrlw $7, %ymm2, %ymm3
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
; AVX512VL-NEXT: vpsllw $9, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vpsllw $9, %ymm2, %ymm2
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_funnnel_v32i16:
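The splatconstant_funnnel_v32i16 change above is the same reassociation at a uniform amount: vpsrlw $7 and vpsllw $9 are applied to both halves, each result pair is concatenated into a zmm, and one vporq replaces the two per-half vpor instructions. The per-element computation, shown as an equivalent reduced IR sketch (hypothetical name):

  define <8 x i16> @splat_funnel_sketch(<8 x i16> %x) {
    ; (x << 9) | (x >> 7), i.e. a rotate-left by 9 of each 16-bit lane.
    %l = shl <8 x i16> %x, <i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9>
    %r = lshr <8 x i16> %x, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
    %o = or <8 x i16> %l, %r
    ret <8 x i16> %o
  }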
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4
+; AVX512F-NEXT: vpand %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm1
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm2
-; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm0
; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vporq %zmm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_funnnel_v64i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm2
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm4
+; AVX512VL-NEXT: vpand %ymm3, %ymm4, %ymm3
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm1
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; AVX512VL-NEXT: vpternlogq $226, %ymm2, %ymm3, %ymm1
-; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm2
+; AVX512VL-NEXT: vpand %ymm3, %ymm1, %ymm1
; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpternlogq $226, %ymm2, %ymm3, %ymm0
+; AVX512VL-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vporq %zmm2, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_funnnel_v64i8:
; AVX512F-NEXT: vpsrlw $15, %ymm3, %ymm4
; AVX512F-NEXT: vpsraw $1, %ymm3, %ymm3
; AVX512F-NEXT: vpaddw %ymm4, %ymm3, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3
-; AVX512F-NEXT: vpsubw %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpsllw $3, %ymm3, %ymm4
+; AVX512F-NEXT: vpsubw %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpaddw %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpmulhw %ymm2, %ymm0, %ymm2
; AVX512F-NEXT: vpsrlw $15, %ymm2, %ymm3
; AVX512F-NEXT: vpsraw $1, %ymm2, %ymm2
; AVX512F-NEXT: vpaddw %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpmullw %ymm4, %ymm2, %ymm2
-; AVX512F-NEXT: vpsubw %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpsllw $3, %ymm2, %ymm3
+; AVX512F-NEXT: vpsubw %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
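In the remainder-by-7 hunk above, the multiply by the splat-7 constant is strength-reduced: 7*q becomes (q << 3) - q, so vpmullw and its constant load disappear in favour of vpsllw $3 and vpsubw. Note the operand order: vpsubw %ymm4, %ymm3 computes q - 8*q = -7*q, so the final x - 7*q is emitted as a vpaddw. A reduced sketch of the source pattern (hypothetical name; the hunk's vpmulhw/vpsraw sequence indicates a signed remainder):

  define <8 x i16> @rem7_sketch(<8 x i16> %x) {
    %r = srem <8 x i16> %x, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
    ret <8 x i16> %r
  }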
define <32 x i16> @test_div7_32i16(<32 x i16> %a) nounwind {
; AVX512F-LABEL: test_div7_32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363]
-; AVX512F-NEXT: vpmulhuw %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vpsubw %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
-; AVX512F-NEXT: vpaddw %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlw $2, %ymm1, %ymm1
-; AVX512F-NEXT: vpmulhuw %ymm2, %ymm0, %ymm2
-; AVX512F-NEXT: vpsubw %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363]
+; AVX512F-NEXT: vpmulhuw %ymm1, %ymm0, %ymm2
+; AVX512F-NEXT: vpsubw %ymm2, %ymm0, %ymm3
+; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3
+; AVX512F-NEXT: vpaddw %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpsrlw $2, %ymm2, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; AVX512F-NEXT: vpmulhuw %ymm1, %ymm0, %ymm1
+; AVX512F-NEXT: vpsubw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0
-; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $2, %ymm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: test_div7_32i16:
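test_div7_32i16 above keeps the classic unsigned magic-number expansion, t = mulhu(x, 9363) followed by q = (((x - t) >> 1) + t) >> 2, where 9363 = ceil(2^16 / 7). The diff only reorders it: the low half is divided to completion before vextracti64x4 pulls out the high half, so the upper ymm no longer stays live across the whole sequence. Reduced sketch (hypothetical name):

  define <8 x i16> @div7_sketch(<8 x i16> %x) {
    ; Lowered as: t = mulhu(x, 9363); q = (((x - t) >> 1) + t) >> 2
    %q = udiv <8 x i16> %x, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
    ret <8 x i16> %q
  }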
; AVX512F-NEXT: vpsrlw $1, %ymm4, %ymm4
; AVX512F-NEXT: vpaddw %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $2, %ymm3, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3
-; AVX512F-NEXT: vpsubw %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpsllw $3, %ymm3, %ymm4
+; AVX512F-NEXT: vpsubw %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpaddw %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpmulhuw %ymm2, %ymm0, %ymm2
; AVX512F-NEXT: vpsubw %ymm2, %ymm0, %ymm3
; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3
; AVX512F-NEXT: vpaddw %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpsrlw $2, %ymm2, %ymm2
-; AVX512F-NEXT: vpmullw %ymm4, %ymm2, %ymm2
-; AVX512F-NEXT: vpsubw %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpsllw $3, %ymm2, %ymm3
+; AVX512F-NEXT: vpsubw %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
; AVX512F-LABEL: testv32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
-; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
-; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1
-; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpsllw $8, %ymm1, %ymm3
-; AVX512F-NEXT: vpaddb %ymm1, %ymm3, %ymm1
-; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm3
-; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4
+; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4
+; AVX512F-NEXT: vpshufb %ymm4, %ymm3, %ymm4
+; AVX512F-NEXT: vpaddb %ymm2, %ymm4, %ymm2
+; AVX512F-NEXT: vpsllw $8, %ymm2, %ymm4
+; AVX512F-NEXT: vpaddb %ymm2, %ymm4, %ymm2
+; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm4
+; AVX512F-NEXT: vpshufb %ymm4, %ymm3, %ymm4
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0
-; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0
-; AVX512F-NEXT: vpaddb %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm2
-; AVX512F-NEXT: vpaddb %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpshufb %ymm0, %ymm3, %ymm0
+; AVX512F-NEXT: vpaddb %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1
+; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: testv32i16:
;
; AVX512VPOPCNTDQ-NOBW-LABEL: testv32i16:
; AVX512VPOPCNTDQ-NOBW: # %bb.0:
-; AVX512VPOPCNTDQ-NOBW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm1, %zmm1
; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm1, %ymm1
+; AVX512VPOPCNTDQ-NOBW-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512VPOPCNTDQ-NOBW-NEXT: retq
;
; AVX512VPOPCNTDQ-BW-LABEL: testv32i16:
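testv32i16 is the vpshufb nibble-LUT population count: each byte is split into low and high nibbles with the 0x0F mask, both nibbles are looked up in the 16-entry table [0,1,1,2,...] holding the popcounts of 0..15, the lookups are summed with vpaddb, and the byte pair of each i16 lane is folded via vpsllw $8 / vpaddb / vpsrlw $8. The algorithm is unchanged; as in the hunks above, the low half now finishes before the high half is extracted. The VPOPCNTDQ-NOBW path instead zero-extends to i32, uses vpopcntd, and truncates back. Sketch of the source (hypothetical name):

  declare <32 x i16> @llvm.ctpop.v32i16(<32 x i16>)

  define <32 x i16> @ctpop_sketch(<32 x i16> %in) {
    %c = call <32 x i16> @llvm.ctpop.v32i16(<32 x i16> %in)
    ret <32 x i16> %c
  }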
; AVX512F-LABEL: icmp_v32i16_v32i1:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512F-NEXT: vpcmpeqw %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kshiftrw $8, %k0, %k1
; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kshiftrw $8, %k0, %k1
; AVX512F-NEXT: kandw %k1, %k0, %k0
;
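Two things change in the compare-reduction hunks above. First, icmp_v32i16_v32i1 now compares each 256-bit half against zero and ANDs the results instead of ORing the halves and comparing once; per lane these are equivalent, since (a | b) == 0 holds exactly when a == 0 and b == 0. A minimal sketch of the identity (hypothetical name):

  define <16 x i1> @allzero_sketch(<16 x i16> %lo, <16 x i16> %hi) {
    ; Per-lane equivalent of: icmp eq (or %lo, %hi), 0
    %c0 = icmp eq <16 x i16> %lo, zeroinitializer
    %c1 = icmp eq <16 x i16> %hi, zeroinitializer
    %c = and <16 x i1> %c0, %c1
    ret <16 x i1> %c
  }

Second, a vpslld $31 now precedes vptestmd when materializing the mask, so the test keys only on bit 0 of each element rather than on any set bit, the conservative reading when the producer only guarantees the boolean in the low bit.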
; AVX512DQ-LABEL: test_v64i16:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpmullw %ymm3, %ymm1, %ymm1
-; AVX512DQ-NEXT: vpmullw %ymm1, %ymm2, %ymm1
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512DQ-NEXT: vpmullw %ymm2, %ymm3, %ymm2
+; AVX512DQ-NEXT: vpmullw %ymm2, %ymm1, %ymm1
; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
;
; AVX512DQVL-LABEL: test_v64i16:
; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpmullw %ymm3, %ymm1, %ymm1
-; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm2, %ymm1
+; AVX512DQVL-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512DQVL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512DQVL-NEXT: vpmullw %ymm2, %ymm3, %ymm2
+; AVX512DQVL-NEXT: vpmullw %ymm2, %ymm1, %ymm1
; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
-; AVX512DQ-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX512DQ-NEXT: vpmullw %xmm0, %xmm3, %xmm0
+; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm1
+; AVX512DQ-NEXT: vpmullw %xmm0, %xmm1, %xmm0
; AVX512DQ-NEXT: vpmullw %xmm0, %xmm2, %xmm0
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
; AVX512DQ-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX512DQ-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX512DQ-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
-; AVX512DQVL-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX512DQVL-NEXT: vpmullw %xmm0, %xmm3, %xmm0
+; AVX512DQVL-NEXT: vextracti128 $1, %ymm2, %xmm1
+; AVX512DQVL-NEXT: vpmullw %xmm0, %xmm1, %xmm0
; AVX512DQVL-NEXT: vpmullw %xmm0, %xmm2, %xmm0
+; AVX512DQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
; AVX512DQVL-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX512DQVL-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX512DQVL-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
;
; AVX512DQ-LABEL: test_v128i8:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31]
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm4
+; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm0[8],ymm4[9],ymm0[9],ymm4[10],ymm0[10],ymm4[11],ymm0[11],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15],ymm4[24],ymm0[24],ymm4[25],ymm0[25],ymm4[26],ymm0[26],ymm4[27],ymm0[27],ymm4[28],ymm0[28],ymm4[29],ymm0[29],ymm4[30],ymm0[30],ymm4[31],ymm0[31]
+; AVX512DQ-NEXT: vpmullw %ymm3, %ymm5, %ymm3
; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
-; AVX512DQ-NEXT: vpmullw %ymm4, %ymm5, %ymm4
-; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
-; AVX512DQ-NEXT: vpmullw %ymm4, %ymm5, %ymm4
+; AVX512DQ-NEXT: vpmullw %ymm3, %ymm5, %ymm3
; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; AVX512DQ-NEXT: vpmullw %ymm4, %ymm5, %ymm4
-; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
-; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
-; AVX512DQ-NEXT: vpmullw %ymm3, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpmullw %ymm3, %ymm5, %ymm3
; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
-; AVX512DQ-NEXT: vpmullw %ymm1, %ymm2, %ymm1
+; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[16],ymm0[16],ymm4[17],ymm0[17],ymm4[18],ymm0[18],ymm4[19],ymm0[19],ymm4[20],ymm0[20],ymm4[21],ymm0[21],ymm4[22],ymm0[22],ymm4[23],ymm0[23]
+; AVX512DQ-NEXT: vpmullw %ymm2, %ymm4, %ymm2
+; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; AVX512DQ-NEXT: vpmullw %ymm2, %ymm1, %ymm1
; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm1
-; AVX512DQ-NEXT: vpmullw %xmm1, %xmm4, %xmm1
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; AVX512DQ-NEXT: vpand %xmm2, %xmm1, %xmm1
-; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm3
+; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm1
; AVX512DQ-NEXT: vpmullw %xmm1, %xmm3, %xmm1
+; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512DQ-NEXT: vpmullw %xmm1, %xmm2, %xmm1
; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpand %xmm2, %xmm0, %xmm1
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
+; AVX512DQ-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX512DQ-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512DQ-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpand %xmm2, %xmm0, %xmm1
-; AVX512DQ-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
-; AVX512DQ-NEXT: vpsrld $16, %xmm1, %xmm1
-; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX512DQ-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
+; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX512DQ-NEXT: vpmullw %xmm2, %xmm0, %xmm0
+; AVX512DQ-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX512DQ-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
+; AVX512DQ-NEXT: vpsrld $16, %xmm2, %xmm2
+; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX512DQ-NEXT: vpmullw %xmm2, %xmm0, %xmm0
+; AVX512DQ-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512DQ-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
; AVX512DQ-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
;
; AVX512DQVL-LABEL: test_v128i8:
; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31]
+; AVX512DQVL-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
+; AVX512DQVL-NEXT: vextracti64x4 $1, %zmm0, %ymm4
+; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm0[8],ymm4[9],ymm0[9],ymm4[10],ymm0[10],ymm4[11],ymm0[11],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15],ymm4[24],ymm0[24],ymm4[25],ymm0[25],ymm4[26],ymm0[26],ymm4[27],ymm0[27],ymm4[28],ymm0[28],ymm4[29],ymm0[29],ymm4[30],ymm0[30],ymm4[31],ymm0[31]
+; AVX512DQVL-NEXT: vpmullw %ymm3, %ymm5, %ymm3
; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
-; AVX512DQVL-NEXT: vpmullw %ymm4, %ymm5, %ymm4
-; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
-; AVX512DQVL-NEXT: vpmullw %ymm4, %ymm5, %ymm4
+; AVX512DQVL-NEXT: vpmullw %ymm3, %ymm5, %ymm3
; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; AVX512DQVL-NEXT: vpmullw %ymm4, %ymm5, %ymm4
-; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
-; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
-; AVX512DQVL-NEXT: vpmullw %ymm3, %ymm1, %ymm1
+; AVX512DQVL-NEXT: vpmullw %ymm3, %ymm5, %ymm3
; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
-; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm2, %ymm1
+; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[16],ymm0[16],ymm4[17],ymm0[17],ymm4[18],ymm0[18],ymm4[19],ymm0[19],ymm4[20],ymm0[20],ymm4[21],ymm0[21],ymm4[22],ymm0[22],ymm4[23],ymm0[23]
+; AVX512DQVL-NEXT: vpmullw %ymm2, %ymm4, %ymm2
+; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; AVX512DQVL-NEXT: vpmullw %ymm2, %ymm1, %ymm1
; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
-; AVX512DQVL-NEXT: vextracti128 $1, %ymm4, %xmm1
-; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm4, %xmm1
-; AVX512DQVL-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; AVX512DQVL-NEXT: vpand %xmm2, %xmm1, %xmm1
-; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm3
+; AVX512DQVL-NEXT: vextracti128 $1, %ymm3, %xmm1
; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm3, %xmm1
+; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm2, %xmm1
; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vpand %xmm2, %xmm0, %xmm1
+; AVX512DQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
+; AVX512DQVL-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX512DQVL-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512DQVL-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
-; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; AVX512DQVL-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vpand %xmm2, %xmm0, %xmm1
-; AVX512DQVL-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
-; AVX512DQVL-NEXT: vpsrld $16, %xmm1, %xmm1
-; AVX512DQVL-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX512DQVL-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
+; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
+; AVX512DQVL-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX512DQVL-NEXT: vpmullw %xmm2, %xmm0, %xmm0
+; AVX512DQVL-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX512DQVL-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
+; AVX512DQVL-NEXT: vpsrld $16, %xmm2, %xmm2
+; AVX512DQVL-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX512DQVL-NEXT: vpmullw %xmm2, %xmm0, %xmm0
+; AVX512DQVL-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512DQVL-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
; AVX512DQVL-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
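The test_v64i16/test_v128i8 hunks show the wider-than-512-bit arguments now arriving in ZMM registers: the old checks consumed four incoming YMMs (%ymm0..%ymm3), while the new code takes two ZMM arguments and splits them itself with vextracti64x4. The multiply-reduction tree is otherwise unchanged: halve the width with vpmullw (bytes are widened to words around each multiply), then fold 256 -> 128 -> 64 bits and onward down to a scalar. Sketch of the source, assuming this release's experimental reduce-intrinsic spelling (hypothetical function name):

  declare i16 @llvm.experimental.vector.reduce.mul.v64i16(<64 x i16>)

  define i16 @reduce_mul_sketch(<64 x i16> %a) {
    %r = call i16 @llvm.experimental.vector.reduce.mul.v64i16(<64 x i16> %a)
    ret i16 %r
  }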
; AVX512F-NEXT: vpor %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kshiftrw $8, %k0, %k1
; AVX512F-NEXT: korw %k1, %k0, %k0
; AVX512F-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kshiftrw $8, %k0, %k1
; AVX512F-NEXT: kxorw %k1, %k0, %k0
define <32 x i16> @var_rotate_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512F-LABEL: var_rotate_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
-; AVX512F-NEXT: vpsllvd %zmm5, %zmm2, %zmm5
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512F-NEXT: vpsubw %ymm3, %ymm6, %ymm3
+; AVX512F-NEXT: vpsubw %ymm3, %ymm2, %ymm4
+; AVX512F-NEXT: vpsubw %ymm1, %ymm2, %ymm2
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512F-NEXT: vpsllvd %zmm1, %zmm5, %zmm1
+; AVX512F-NEXT: vpmovdw %zmm1, %ymm1
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
-; AVX512F-NEXT: vpsrlvd %zmm3, %zmm2, %zmm2
-; AVX512F-NEXT: vpord %zmm2, %zmm5, %zmm2
-; AVX512F-NEXT: vpmovdw %zmm2, %ymm2
-; AVX512F-NEXT: vpand %ymm4, %ymm1, %ymm1
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vpsllvd %zmm3, %zmm0, %zmm3
-; AVX512F-NEXT: vpsubw %ymm1, %ymm6, %ymm1
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
-; AVX512F-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpord %zmm0, %zmm3, %zmm0
+; AVX512F-NEXT: vpmovdw %zmm3, %ymm3
+; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512F-NEXT: vpsrlvd %zmm2, %zmm5, %zmm2
+; AVX512F-NEXT: vpmovdw %zmm2, %ymm2
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
+; AVX512F-NEXT: vpsrlvd %zmm3, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
+; AVX512F-NEXT: vporq %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_rotate_v32i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VL-NEXT: vpand %ymm4, %ymm3, %ymm3
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
-; AVX512VL-NEXT: vpsllvd %zmm5, %zmm2, %zmm5
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: vpsubw %ymm3, %ymm6, %ymm3
+; AVX512VL-NEXT: vpsubw %ymm3, %ymm2, %ymm4
+; AVX512VL-NEXT: vpsubw %ymm1, %ymm2, %ymm2
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512VL-NEXT: vpsllvd %zmm1, %zmm5, %zmm1
+; AVX512VL-NEXT: vpmovdw %zmm1, %ymm1
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
-; AVX512VL-NEXT: vpsrlvd %zmm3, %zmm2, %zmm2
-; AVX512VL-NEXT: vpord %zmm2, %zmm5, %zmm2
-; AVX512VL-NEXT: vpmovdw %zmm2, %ymm2
-; AVX512VL-NEXT: vpand %ymm4, %ymm1, %ymm1
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VL-NEXT: vpsllvd %zmm3, %zmm0, %zmm3
-; AVX512VL-NEXT: vpsubw %ymm1, %ymm6, %ymm1
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
-; AVX512VL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
-; AVX512VL-NEXT: vpord %zmm0, %zmm3, %zmm0
+; AVX512VL-NEXT: vpmovdw %zmm3, %ymm3
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512VL-NEXT: vpsrlvd %zmm2, %zmm5, %zmm2
+; AVX512VL-NEXT: vpmovdw %zmm2, %ymm2
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
+; AVX512VL-NEXT: vpsrlvd %zmm3, %zmm0, %zmm0
; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
+; AVX512VL-NEXT: vporq %zmm0, %zmm1, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_rotate_v32i16:
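var_rotate_v32i16, lacking a 512-bit vpsllvw without BWI, widens each half to i32, shifts with vpsllvd/vpsrlvd, and narrows back with vpmovdw. The restructuring concatenates the two left-shift halves into one zmm and the two right-shift halves into another, then merges them with a single vporq; the vpand-by-15 pre-masking of the amounts is gone, with 16 - b formed directly by vpsubw. Reduced sketch of the rotate pattern the assembly implements (hypothetical name):

  define <8 x i16> @var_rotate_sketch(<8 x i16> %a, <8 x i16> %b) {
    %b16 = sub <8 x i16> <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>, %b
    %shl = shl <8 x i16> %a, %b
    %srl = lshr <8 x i16> %a, %b16
    %rot = or <8 x i16> %shl, %srl
    ret <8 x i16> %rot
  }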
define <64 x i8> @var_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512F-LABEL: var_rotate_v64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; AVX512F-NEXT: vpsrlw $4, %ymm3, %ymm4
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; AVX512F-NEXT: vpandn %ymm4, %ymm5, %ymm4
-; AVX512F-NEXT: vpsllw $4, %ymm3, %ymm6
-; AVX512F-NEXT: vpand %ymm5, %ymm6, %ymm6
-; AVX512F-NEXT: vpor %ymm4, %ymm6, %ymm4
-; AVX512F-NEXT: vpsllw $5, %ymm2, %ymm2
-; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
-; AVX512F-NEXT: vpsrlw $6, %ymm3, %ymm4
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
-; AVX512F-NEXT: vpandn %ymm4, %ymm6, %ymm4
-; AVX512F-NEXT: vpsllw $2, %ymm3, %ymm7
-; AVX512F-NEXT: vpand %ymm6, %ymm7, %ymm7
-; AVX512F-NEXT: vpor %ymm4, %ymm7, %ymm4
-; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2
-; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
-; AVX512F-NEXT: vpsrlw $7, %ymm3, %ymm4
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX512F-NEXT: vpand %ymm7, %ymm4, %ymm4
-; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm8
-; AVX512F-NEXT: vpor %ymm4, %ymm8, %ymm4
-; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2
-; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
-; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm3
-; AVX512F-NEXT: vpandn %ymm3, %ymm5, %ymm3
-; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm4
-; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4
-; AVX512F-NEXT: vpor %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512F-NEXT: vpsubb %ymm1, %ymm3, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-NEXT: vpsubb %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm5
+; AVX512F-NEXT: vpsllw $4, %ymm5, %ymm6
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512F-NEXT: vpand %ymm7, %ymm6, %ymm6
+; AVX512F-NEXT: vpsllw $5, %ymm4, %ymm4
+; AVX512F-NEXT: vpblendvb %ymm4, %ymm6, %ymm5, %ymm6
+; AVX512F-NEXT: vpsllw $2, %ymm6, %ymm8
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX512F-NEXT: vpand %ymm9, %ymm8, %ymm8
+; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm4
+; AVX512F-NEXT: vpblendvb %ymm4, %ymm8, %ymm6, %ymm6
+; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm8
+; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm4
+; AVX512F-NEXT: vpblendvb %ymm4, %ymm8, %ymm6, %ymm4
+; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm6
+; AVX512F-NEXT: vpand %ymm7, %ymm6, %ymm6
; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm1
-; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw $6, %ymm0, %ymm3
-; AVX512F-NEXT: vpandn %ymm3, %ymm6, %ymm3
-; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm4
-; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4
-; AVX512F-NEXT: vpor %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT: vpblendvb %ymm1, %ymm6, %ymm0, %ymm6
+; AVX512F-NEXT: vpsllw $2, %ymm6, %ymm7
+; AVX512F-NEXT: vpand %ymm7, %ymm9, %ymm7
; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
-; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm3
-; AVX512F-NEXT: vpand %ymm7, %ymm3, %ymm3
-; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm4
-; AVX512F-NEXT: vpor %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT: vpblendvb %ymm1, %ymm7, %ymm6, %ymm6
+; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm7
; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
-; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512F-NEXT: vpblendvb %ymm1, %ymm7, %ymm6, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1
+; AVX512F-NEXT: vpsrlw $4, %ymm5, %ymm4
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4
+; AVX512F-NEXT: vpsllw $5, %ymm3, %ymm3
+; AVX512F-NEXT: vpblendvb %ymm3, %ymm4, %ymm5, %ymm4
+; AVX512F-NEXT: vpsrlw $2, %ymm4, %ymm5
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX512F-NEXT: vpand %ymm7, %ymm5, %ymm5
+; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3
+; AVX512F-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm4
+; AVX512F-NEXT: vpsrlw $1, %ymm4, %ymm5
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT: vpand %ymm5, %ymm8, %ymm5
+; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3
+; AVX512F-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm3
+; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4
+; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4
+; AVX512F-NEXT: vpsllw $5, %ymm2, %ymm2
+; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT: vpsrlw $2, %ymm0, %ymm4
+; AVX512F-NEXT: vpand %ymm7, %ymm4, %ymm4
+; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2
+; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm4
+; AVX512F-NEXT: vpand %ymm4, %ymm8, %ymm4
+; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2
+; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512F-NEXT: vporq %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_rotate_v64i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; AVX512VL-NEXT: vpsrlw $4, %ymm3, %ymm4
-; AVX512VL-NEXT: vpsllw $4, %ymm3, %ymm5
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; AVX512VL-NEXT: vpternlogq $226, %ymm4, %ymm6, %ymm5
-; AVX512VL-NEXT: vpsllw $5, %ymm2, %ymm2
-; AVX512VL-NEXT: vpblendvb %ymm2, %ymm5, %ymm3, %ymm3
-; AVX512VL-NEXT: vpsrlw $6, %ymm3, %ymm4
-; AVX512VL-NEXT: vpsllw $2, %ymm3, %ymm5
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
-; AVX512VL-NEXT: vpternlogq $226, %ymm4, %ymm7, %ymm5
-; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2
-; AVX512VL-NEXT: vpblendvb %ymm2, %ymm5, %ymm3, %ymm3
-; AVX512VL-NEXT: vpsrlw $7, %ymm3, %ymm4
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4
-; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm8
-; AVX512VL-NEXT: vpor %ymm4, %ymm8, %ymm4
-; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2
-; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
-; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3
-; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm4
-; AVX512VL-NEXT: vpternlogq $226, %ymm3, %ymm6, %ymm4
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VL-NEXT: vpsubb %ymm1, %ymm3, %ymm2
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512VL-NEXT: vpsubb %ymm4, %ymm3, %ymm3
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm5
+; AVX512VL-NEXT: vpsllw $4, %ymm5, %ymm6
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512VL-NEXT: vpand %ymm7, %ymm6, %ymm6
+; AVX512VL-NEXT: vpsllw $5, %ymm4, %ymm4
+; AVX512VL-NEXT: vpblendvb %ymm4, %ymm6, %ymm5, %ymm6
+; AVX512VL-NEXT: vpsllw $2, %ymm6, %ymm8
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX512VL-NEXT: vpand %ymm9, %ymm8, %ymm8
+; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm4
+; AVX512VL-NEXT: vpblendvb %ymm4, %ymm8, %ymm6, %ymm6
+; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm8
+; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm4
+; AVX512VL-NEXT: vpblendvb %ymm4, %ymm8, %ymm6, %ymm4
+; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm6
+; AVX512VL-NEXT: vpand %ymm7, %ymm6, %ymm6
; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1
-; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpsrlw $6, %ymm0, %ymm3
-; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm4
-; AVX512VL-NEXT: vpternlogq $226, %ymm3, %ymm7, %ymm4
+; AVX512VL-NEXT: vpblendvb %ymm1, %ymm6, %ymm0, %ymm6
+; AVX512VL-NEXT: vpsllw $2, %ymm6, %ymm7
+; AVX512VL-NEXT: vpand %ymm7, %ymm9, %ymm7
; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
-; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm3
-; AVX512VL-NEXT: vpand %ymm5, %ymm3, %ymm3
-; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm4
-; AVX512VL-NEXT: vpor %ymm3, %ymm4, %ymm3
+; AVX512VL-NEXT: vpblendvb %ymm1, %ymm7, %ymm6, %ymm6
+; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm7
; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
-; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512VL-NEXT: vpblendvb %ymm1, %ymm7, %ymm6, %ymm1
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1
+; AVX512VL-NEXT: vpsrlw $4, %ymm5, %ymm4
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT: vpand %ymm6, %ymm4, %ymm4
+; AVX512VL-NEXT: vpsllw $5, %ymm3, %ymm3
+; AVX512VL-NEXT: vpblendvb %ymm3, %ymm4, %ymm5, %ymm4
+; AVX512VL-NEXT: vpsrlw $2, %ymm4, %ymm5
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX512VL-NEXT: vpand %ymm7, %ymm5, %ymm5
+; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3
+; AVX512VL-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm4
+; AVX512VL-NEXT: vpsrlw $1, %ymm4, %ymm5
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512VL-NEXT: vpand %ymm5, %ymm8, %ymm5
+; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3
+; AVX512VL-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm3
+; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm4
+; AVX512VL-NEXT: vpand %ymm6, %ymm4, %ymm4
+; AVX512VL-NEXT: vpsllw $5, %ymm2, %ymm2
+; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
+; AVX512VL-NEXT: vpsrlw $2, %ymm0, %ymm4
+; AVX512VL-NEXT: vpand %ymm7, %ymm4, %ymm4
+; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2
+; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
+; AVX512VL-NEXT: vpsrlw $1, %ymm0, %ymm4
+; AVX512VL-NEXT: vpand %ymm4, %ymm8, %ymm4
+; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2
+; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512VL-NEXT: vporq %zmm0, %zmm1, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_rotate_v64i8:
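var_rotate_v64i8 has no byte-granularity variable shift to fall back on, so each half goes through the usual vpblendvb ladder: vpsllw $5 moves the amount bits into the sign-bit position, and three blend rounds conditionally shift by 4, 2 and 1, with vpand clearing the bits that would cross byte lanes. The new code runs one ladder for the left shift by b and one for the right shift by 8 - b (vpsubb from splat 8), then concatenates the halves and ORs once at 512 bits; it is the byte analogue of the v32i16 sketch above, with splat 8 in place of splat 16.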
define <32 x i16> @splatvar_rotate_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512F-LABEL: splatvar_rotate_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
+; AVX512F-NEXT: vpsubw %xmm1, %xmm3, %xmm4
+; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
; AVX512F-NEXT: vpbroadcastw %xmm1, %xmm1
-; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX512F-NEXT: vpsllw %xmm3, %ymm2, %ymm4
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
-; AVX512F-NEXT: vpsubw %xmm1, %xmm5, %xmm1
+; AVX512F-NEXT: vpsubw %xmm1, %xmm3, %xmm1
; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX512F-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
-; AVX512F-NEXT: vpor %ymm2, %ymm4, %ymm2
-; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm3
-; AVX512F-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm0, %ymm3, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT: vpsllw %xmm2, %ymm3, %ymm5
+; AVX512F-NEXT: vpsllw %xmm2, %ymm0, %ymm2
+; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlw %xmm1, %ymm3, %ymm1
+; AVX512F-NEXT: vpsrlw %xmm4, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vporq %zmm0, %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_rotate_v32i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
+; AVX512VL-NEXT: vpsubw %xmm1, %xmm3, %xmm4
+; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
; AVX512VL-NEXT: vpbroadcastw %xmm1, %xmm1
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX512VL-NEXT: vpsllw %xmm3, %ymm2, %ymm4
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: vpsubw %xmm1, %xmm5, %xmm1
+; AVX512VL-NEXT: vpsubw %xmm1, %xmm3, %xmm1
; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX512VL-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
-; AVX512VL-NEXT: vpor %ymm2, %ymm4, %ymm2
-; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm3
-; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm0, %ymm3, %ymm0
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512VL-NEXT: vpsllw %xmm2, %ymm3, %ymm5
+; AVX512VL-NEXT: vpsllw %xmm2, %ymm0, %ymm2
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2
+; AVX512VL-NEXT: vpsrlw %xmm1, %ymm3, %ymm1
+; AVX512VL-NEXT: vpsrlw %xmm4, %ymm0, %ymm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vporq %zmm0, %zmm2, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_rotate_v32i16:
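For splatvar_rotate_v32i16 the amount is uniform, so the scalar count is zero-extended into an xmm (vpmovzxwq) and vpsllw/vpsrlw shift every lane by it. As above, the two left-shifted halves and the two right-shifted halves are each combined into a zmm before a single vporq; the old vpand masking of the amount against 15 is dropped. Reduced sketch (hypothetical name):

  define <8 x i16> @splatvar_rotate_sketch(<8 x i16> %a, <8 x i16> %b) {
    %amt = shufflevector <8 x i16> %b, <8 x i16> undef, <8 x i32> zeroinitializer
    %b16 = sub <8 x i16> <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>, %amt
    %shl = shl <8 x i16> %a, %amt
    %srl = lshr <8 x i16> %a, %b16
    %rot = or <8 x i16> %shl, %srl
    ret <8 x i16> %rot
  }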
define <64 x i8> @splatvar_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512F-LABEL: splatvar_rotate_v64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512F-NEXT: vpsubb %xmm1, %xmm3, %xmm4
+; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT: vpsllw %xmm3, %ymm2, %ymm4
-; AVX512F-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX512F-NEXT: vpsllw %xmm3, %xmm5, %xmm6
-; AVX512F-NEXT: vpbroadcastb %xmm6, %ymm6
-; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512F-NEXT: vpsubb %xmm1, %xmm7, %xmm1
+; AVX512F-NEXT: vpsubb %xmm1, %xmm3, %xmm1
; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
-; AVX512F-NEXT: vpsrlw %xmm1, %xmm5, %xmm5
-; AVX512F-NEXT: vpsrlw $8, %xmm5, %xmm5
-; AVX512F-NEXT: vpbroadcastb %xmm5, %ymm5
-; AVX512F-NEXT: vpand %ymm5, %ymm2, %ymm2
-; AVX512F-NEXT: vpor %ymm2, %ymm4, %ymm2
-; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm3
-; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3
-; AVX512F-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpand %ymm5, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm0, %ymm3, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT: vpsllw %xmm2, %ymm3, %ymm5
+; AVX512F-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6
+; AVX512F-NEXT: vpsllw %xmm2, %xmm6, %xmm7
+; AVX512F-NEXT: vpbroadcastb %xmm7, %ymm7
+; AVX512F-NEXT: vpand %ymm7, %ymm5, %ymm5
+; AVX512F-NEXT: vpsllw %xmm2, %ymm0, %ymm2
+; AVX512F-NEXT: vpand %ymm7, %ymm2, %ymm2
+; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlw %xmm1, %ymm3, %ymm3
+; AVX512F-NEXT: vpsrlw %xmm1, %xmm6, %xmm1
+; AVX512F-NEXT: vpsrlw $8, %xmm1, %xmm1
+; AVX512F-NEXT: vpbroadcastb %xmm1, %ymm1
+; AVX512F-NEXT: vpand %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT: vpsrlw %xmm4, %ymm0, %ymm0
+; AVX512F-NEXT: vpsrlw %xmm4, %xmm6, %xmm3
+; AVX512F-NEXT: vpsrlw $8, %xmm3, %xmm3
+; AVX512F-NEXT: vpbroadcastb %xmm3, %ymm3
+; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vporq %zmm0, %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_rotate_v64i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VL-NEXT: vpsubb %xmm1, %xmm3, %xmm4
+; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VL-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT: vpsllw %xmm3, %ymm2, %ymm4
-; AVX512VL-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX512VL-NEXT: vpsllw %xmm3, %xmm5, %xmm6
-; AVX512VL-NEXT: vpbroadcastb %xmm6, %ymm6
-; AVX512VL-NEXT: vpand %ymm6, %ymm4, %ymm4
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VL-NEXT: vpsubb %xmm1, %xmm7, %xmm1
+; AVX512VL-NEXT: vpsubb %xmm1, %xmm3, %xmm1
; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
-; AVX512VL-NEXT: vpsrlw %xmm1, %xmm5, %xmm5
-; AVX512VL-NEXT: vpsrlw $8, %xmm5, %xmm5
-; AVX512VL-NEXT: vpbroadcastb %xmm5, %ymm5
-; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2
-; AVX512VL-NEXT: vpor %ymm2, %ymm4, %ymm2
-; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm3
-; AVX512VL-NEXT: vpand %ymm6, %ymm3, %ymm3
-; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT: vpand %ymm5, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm0, %ymm3, %ymm0
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512VL-NEXT: vpsllw %xmm2, %ymm3, %ymm5
+; AVX512VL-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6
+; AVX512VL-NEXT: vpsllw %xmm2, %xmm6, %xmm7
+; AVX512VL-NEXT: vpbroadcastb %xmm7, %ymm7
+; AVX512VL-NEXT: vpand %ymm7, %ymm5, %ymm5
+; AVX512VL-NEXT: vpsllw %xmm2, %ymm0, %ymm2
+; AVX512VL-NEXT: vpand %ymm7, %ymm2, %ymm2
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2
+; AVX512VL-NEXT: vpsrlw %xmm1, %ymm3, %ymm3
+; AVX512VL-NEXT: vpsrlw %xmm1, %xmm6, %xmm1
+; AVX512VL-NEXT: vpsrlw $8, %xmm1, %xmm1
+; AVX512VL-NEXT: vpbroadcastb %xmm1, %ymm1
+; AVX512VL-NEXT: vpand %ymm1, %ymm3, %ymm1
+; AVX512VL-NEXT: vpsrlw %xmm4, %ymm0, %ymm0
+; AVX512VL-NEXT: vpsrlw %xmm4, %xmm6, %xmm3
+; AVX512VL-NEXT: vpsrlw $8, %xmm3, %xmm3
+; AVX512VL-NEXT: vpbroadcastb %xmm3, %ymm3
+; AVX512VL-NEXT: vpand %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vporq %zmm0, %zmm2, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_rotate_v64i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
-; AVX512F-NEXT: vpmulhuw %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vpmullw %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vpor %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpmulhuw %ymm2, %ymm0, %ymm3
-; AVX512F-NEXT: vpmullw %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vpmullw %ymm2, %ymm1, %ymm3
+; AVX512F-NEXT: vpmullw %ymm2, %ymm0, %ymm4
+; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3
+; AVX512F-NEXT: vpmulhuw %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpmulhuw %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vporq %zmm0, %zmm3, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: constant_rotate_v32i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
-; AVX512VL-NEXT: vpmulhuw %ymm2, %ymm1, %ymm3
-; AVX512VL-NEXT: vpmullw %ymm2, %ymm1, %ymm1
-; AVX512VL-NEXT: vpor %ymm3, %ymm1, %ymm1
-; AVX512VL-NEXT: vpmulhuw %ymm2, %ymm0, %ymm3
-; AVX512VL-NEXT: vpmullw %ymm2, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT: vpmullw %ymm2, %ymm1, %ymm3
+; AVX512VL-NEXT: vpmullw %ymm2, %ymm0, %ymm4
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3
+; AVX512VL-NEXT: vpmulhuw %ymm2, %ymm1, %ymm1
+; AVX512VL-NEXT: vpmulhuw %ymm2, %ymm0, %ymm0
; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vporq %zmm0, %zmm3, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: constant_rotate_v32i16:
; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm2
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256]
; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm2
; AVX512F-NEXT: vpsllw $2, %ymm2, %ymm5
; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm5
; AVX512F-NEXT: vpaddb %ymm7, %ymm7, %ymm8
; AVX512F-NEXT: vpblendvb %ymm8, %ymm5, %ymm2, %ymm2
-; AVX512F-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm9 = ymm1[8],ymm5[8],ymm1[9],ymm5[9],ymm1[10],ymm5[10],ymm1[11],ymm5[11],ymm1[12],ymm5[12],ymm1[13],ymm5[13],ymm1[14],ymm5[14],ymm1[15],ymm5[15],ymm1[24],ymm5[24],ymm1[25],ymm5[25],ymm1[26],ymm5[26],ymm1[27],ymm5[27],ymm1[28],ymm5[28],ymm1[29],ymm5[29],ymm1[30],ymm5[30],ymm1[31],ymm5[31]
-; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
-; AVX512F-NEXT: # ymm10 = mem[0,1,0,1]
-; AVX512F-NEXT: vpmullw %ymm10, %ymm9, %ymm9
-; AVX512F-NEXT: vpsrlw $8, %ymm9, %ymm9
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[4],ymm5[4],ymm1[5],ymm5[5],ymm1[6],ymm5[6],ymm1[7],ymm5[7],ymm1[16],ymm5[16],ymm1[17],ymm5[17],ymm1[18],ymm5[18],ymm1[19],ymm5[19],ymm1[20],ymm5[20],ymm1[21],ymm5[21],ymm1[22],ymm5[22],ymm1[23],ymm5[23]
-; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128]
-; AVX512F-NEXT: # ymm11 = mem[0,1,0,1]
-; AVX512F-NEXT: vpmullw %ymm1, %ymm11, %ymm1
+; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm5
+; AVX512F-NEXT: vpand %ymm3, %ymm5, %ymm3
+; AVX512F-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm3
+; AVX512F-NEXT: vpsllw $2, %ymm3, %ymm4
+; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4
+; AVX512F-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm4
+; AVX512F-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
+; AVX512F-NEXT: # ymm5 = mem[0,1,0,1]
+; AVX512F-NEXT: vpmullw %ymm5, %ymm4, %ymm4
+; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
+; AVX512F-NEXT: # ymm6 = mem[0,1,0,1]
+; AVX512F-NEXT: vpmullw %ymm6, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX512F-NEXT: vpackuswb %ymm9, %ymm1, %ymm1
-; AVX512F-NEXT: vpor %ymm1, %ymm2, %ymm1
-; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm2
-; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm2
-; AVX512F-NEXT: vpsllw $2, %ymm2, %ymm3
-; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3
-; AVX512F-NEXT: vpblendvb %ymm7, %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm3
-; AVX512F-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm5[8],ymm0[9],ymm5[9],ymm0[10],ymm5[10],ymm0[11],ymm5[11],ymm0[12],ymm5[12],ymm0[13],ymm5[13],ymm0[14],ymm5[14],ymm0[15],ymm5[15],ymm0[24],ymm5[24],ymm0[25],ymm5[25],ymm0[26],ymm5[26],ymm0[27],ymm5[27],ymm0[28],ymm5[28],ymm0[29],ymm5[29],ymm0[30],ymm5[30],ymm0[31],ymm5[31]
-; AVX512F-NEXT: vpmullw %ymm3, %ymm10, %ymm3
-; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[2],ymm5[2],ymm0[3],ymm5[3],ymm0[4],ymm5[4],ymm0[5],ymm5[5],ymm0[6],ymm5[6],ymm0[7],ymm5[7],ymm0[16],ymm5[16],ymm0[17],ymm5[17],ymm0[18],ymm5[18],ymm0[19],ymm5[19],ymm0[20],ymm5[20],ymm0[21],ymm5[21],ymm0[22],ymm5[22],ymm0[23],ymm5[23]
-; AVX512F-NEXT: vpmullw %ymm0, %ymm11, %ymm0
+; AVX512F-NEXT: vpackuswb %ymm4, %ymm1, %ymm1
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8],ymm3[8],ymm0[9],ymm3[9],ymm0[10],ymm3[10],ymm0[11],ymm3[11],ymm0[12],ymm3[12],ymm0[13],ymm3[13],ymm0[14],ymm3[14],ymm0[15],ymm3[15],ymm0[24],ymm3[24],ymm0[25],ymm3[25],ymm0[26],ymm3[26],ymm0[27],ymm3[27],ymm0[28],ymm3[28],ymm0[29],ymm3[29],ymm0[30],ymm3[30],ymm0[31],ymm3[31]
+; AVX512F-NEXT: vpmullw %ymm5, %ymm4, %ymm4
+; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[4],ymm3[4],ymm0[5],ymm3[5],ymm0[6],ymm3[6],ymm0[7],ymm3[7],ymm0[16],ymm3[16],ymm0[17],ymm3[17],ymm0[18],ymm3[18],ymm0[19],ymm3[19],ymm0[20],ymm3[20],ymm0[21],ymm3[21],ymm0[22],ymm3[22],ymm0[23],ymm3[23]
+; AVX512F-NEXT: vpmullw %ymm6, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT: vpackuswb %ymm4, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vporq %zmm0, %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: constant_rotate_v64i8:
; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm2
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
-; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
+; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256]
; AVX512VL-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm2
; AVX512VL-NEXT: vpsllw $2, %ymm2, %ymm5
; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm5
; AVX512VL-NEXT: vpaddb %ymm7, %ymm7, %ymm8
; AVX512VL-NEXT: vpblendvb %ymm8, %ymm5, %ymm2, %ymm2
-; AVX512VL-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm9 = ymm1[8],ymm5[8],ymm1[9],ymm5[9],ymm1[10],ymm5[10],ymm1[11],ymm5[11],ymm1[12],ymm5[12],ymm1[13],ymm5[13],ymm1[14],ymm5[14],ymm1[15],ymm5[15],ymm1[24],ymm5[24],ymm1[25],ymm5[25],ymm1[26],ymm5[26],ymm1[27],ymm5[27],ymm1[28],ymm5[28],ymm1[29],ymm5[29],ymm1[30],ymm5[30],ymm1[31],ymm5[31]
-; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
-; AVX512VL-NEXT: # ymm10 = mem[0,1,0,1]
-; AVX512VL-NEXT: vpmullw %ymm10, %ymm9, %ymm9
-; AVX512VL-NEXT: vpsrlw $8, %ymm9, %ymm9
-; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[4],ymm5[4],ymm1[5],ymm5[5],ymm1[6],ymm5[6],ymm1[7],ymm5[7],ymm1[16],ymm5[16],ymm1[17],ymm5[17],ymm1[18],ymm5[18],ymm1[19],ymm5[19],ymm1[20],ymm5[20],ymm1[21],ymm5[21],ymm1[22],ymm5[22],ymm1[23],ymm5[23]
-; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128]
-; AVX512VL-NEXT: # ymm11 = mem[0,1,0,1]
-; AVX512VL-NEXT: vpmullw %ymm1, %ymm11, %ymm1
+; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm5
+; AVX512VL-NEXT: vpand %ymm3, %ymm5, %ymm3
+; AVX512VL-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm3
+; AVX512VL-NEXT: vpsllw $2, %ymm3, %ymm4
+; AVX512VL-NEXT: vpand %ymm6, %ymm4, %ymm4
+; AVX512VL-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm3
+; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm4
+; AVX512VL-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm3
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31]
+; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
+; AVX512VL-NEXT: # ymm5 = mem[0,1,0,1]
+; AVX512VL-NEXT: vpmullw %ymm5, %ymm4, %ymm4
+; AVX512VL-NEXT: vpsrlw $8, %ymm4, %ymm4
+; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23]
+; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
+; AVX512VL-NEXT: # ymm6 = mem[0,1,0,1]
+; AVX512VL-NEXT: vpmullw %ymm6, %ymm1, %ymm1
; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX512VL-NEXT: vpackuswb %ymm9, %ymm1, %ymm1
-; AVX512VL-NEXT: vpor %ymm1, %ymm2, %ymm1
-; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2
-; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
-; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm2
-; AVX512VL-NEXT: vpsllw $2, %ymm2, %ymm3
-; AVX512VL-NEXT: vpand %ymm6, %ymm3, %ymm3
-; AVX512VL-NEXT: vpblendvb %ymm7, %ymm3, %ymm2, %ymm2
-; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm3
-; AVX512VL-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2
-; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm5[8],ymm0[9],ymm5[9],ymm0[10],ymm5[10],ymm0[11],ymm5[11],ymm0[12],ymm5[12],ymm0[13],ymm5[13],ymm0[14],ymm5[14],ymm0[15],ymm5[15],ymm0[24],ymm5[24],ymm0[25],ymm5[25],ymm0[26],ymm5[26],ymm0[27],ymm5[27],ymm0[28],ymm5[28],ymm0[29],ymm5[29],ymm0[30],ymm5[30],ymm0[31],ymm5[31]
-; AVX512VL-NEXT: vpmullw %ymm3, %ymm10, %ymm3
-; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[2],ymm5[2],ymm0[3],ymm5[3],ymm0[4],ymm5[4],ymm0[5],ymm5[5],ymm0[6],ymm5[6],ymm0[7],ymm5[7],ymm0[16],ymm5[16],ymm0[17],ymm5[17],ymm0[18],ymm5[18],ymm0[19],ymm5[19],ymm0[20],ymm5[20],ymm0[21],ymm5[21],ymm0[22],ymm5[22],ymm0[23],ymm5[23]
-; AVX512VL-NEXT: vpmullw %ymm0, %ymm11, %ymm0
+; AVX512VL-NEXT: vpackuswb %ymm4, %ymm1, %ymm1
+; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8],ymm3[8],ymm0[9],ymm3[9],ymm0[10],ymm3[10],ymm0[11],ymm3[11],ymm0[12],ymm3[12],ymm0[13],ymm3[13],ymm0[14],ymm3[14],ymm0[15],ymm3[15],ymm0[24],ymm3[24],ymm0[25],ymm3[25],ymm0[26],ymm3[26],ymm0[27],ymm3[27],ymm0[28],ymm3[28],ymm0[29],ymm3[29],ymm0[30],ymm3[30],ymm0[31],ymm3[31]
+; AVX512VL-NEXT: vpmullw %ymm5, %ymm4, %ymm4
+; AVX512VL-NEXT: vpsrlw $8, %ymm4, %ymm4
+; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[4],ymm3[4],ymm0[5],ymm3[5],ymm0[6],ymm3[6],ymm0[7],ymm3[7],ymm0[16],ymm3[16],ymm0[17],ymm3[17],ymm0[18],ymm3[18],ymm0[19],ymm3[19],ymm0[20],ymm3[20],ymm0[21],ymm3[21],ymm0[22],ymm3[22],ymm0[23],ymm3[23]
+; AVX512VL-NEXT: vpmullw %ymm6, %ymm0, %ymm0
; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512VL-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0
+; AVX512VL-NEXT: vpackuswb %ymm4, %ymm0, %ymm0
; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vporq %zmm0, %zmm2, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: constant_rotate_v64i8:
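The splat-constant rotate tests that follow all reduce to a rotate-by-immediate in every lane. As a point of reference, a minimal IR reduction of the pattern being lowered might look like the sketch below (an assumed shape with a hypothetical name; the actual test bodies are elided from this excerpt). The notable change in the new output is that the two 256-bit halves are shifted, reassembled into a ZMM value, and combined with a single vporq, where the old output or'ed each YMM half separately.

; Hypothetical reduction: rotate every i16 lane left by 7, i.e.
; (a << 7) | (a >> 9), matching the vpsllw $7 / vpsrlw $9 pairs below.
define <32 x i16> @rot7_v32i16_sketch(<32 x i16> %a) nounwind {
  %shl = shl <32 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7,
                             i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  %shr = lshr <32 x i16> %a, <i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9,
                              i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9>
  %rot = or <32 x i16> %shl, %shr
  ret <32 x i16> %rot
}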
define <32 x i16> @splatconstant_rotate_v32i16(<32 x i16> %a) nounwind {
; AVX512F-LABEL: splatconstant_rotate_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vpsrlw $9, %ymm1, %ymm2
-; AVX512F-NEXT: vpsllw $7, %ymm1, %ymm1
-; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlw $9, %ymm0, %ymm2
-; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm1
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512F-NEXT: vpsllw $7, %ymm2, %ymm3
+; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
+; AVX512F-NEXT: vpsrlw $9, %ymm0, %ymm0
+; AVX512F-NEXT: vpsrlw $9, %ymm2, %ymm2
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512F-NEXT: vporq %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_rotate_v32i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VL-NEXT: vpsrlw $9, %ymm1, %ymm2
-; AVX512VL-NEXT: vpsllw $7, %ymm1, %ymm1
-; AVX512VL-NEXT: vpor %ymm2, %ymm1, %ymm1
-; AVX512VL-NEXT: vpsrlw $9, %ymm0, %ymm2
-; AVX512VL-NEXT: vpsllw $7, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vpsllw $7, %ymm0, %ymm1
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512VL-NEXT: vpsllw $7, %ymm2, %ymm3
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
+; AVX512VL-NEXT: vpsrlw $9, %ymm0, %ymm0
+; AVX512VL-NEXT: vpsrlw $9, %ymm2, %ymm2
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512VL-NEXT: vporq %zmm0, %zmm1, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_rotate_v32i16:
; AVX512F-LABEL: splatconstant_rotate_v64i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm2
+; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm2
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2
-; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm1
+; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm4
+; AVX512F-NEXT: vpand %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm2
-; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2
-; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm0
+; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vporq %zmm0, %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_rotate_v64i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm2
-; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm1
+; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm2
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; AVX512VL-NEXT: vpternlogq $226, %ymm2, %ymm3, %ymm1
-; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm2
-; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpternlogq $226, %ymm2, %ymm3, %ymm0
+; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm4
+; AVX512VL-NEXT: vpand %ymm3, %ymm4, %ymm3
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm1
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT: vpand %ymm3, %ymm1, %ymm1
+; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512VL-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vporq %zmm0, %zmm2, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_rotate_v64i8:
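The rotate-and-mask variants below apply an AND mask on top of the rotate; in the new output the masking is done with 512-bit vpandq against memory constants followed by a single vporq, instead of the per-half vpor/vpand of the old output. A plausible reduction is sketched here (assumed shape and hypothetical name; only the [55,...] constant is visible in the old check lines, so masking both shifted halves with 55 is an assumption):

; Rotate each i16 lane left by 5 and mask both shifted halves with 55,
; matching the vpsllw $5 / vpsrlw $11 pairs below.
define <32 x i16> @rot5_mask55_v32i16_sketch(<32 x i16> %a) nounwind {
  %shl = shl <32 x i16> %a, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5,
                             i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
  %shr = lshr <32 x i16> %a, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11,
                              i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
  %lmask = and <32 x i16> %shl, <i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55,
                                 i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55>
  %rmask = and <32 x i16> %shr, <i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55,
                                 i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55>
  %rot = or <32 x i16> %lmask, %rmask
  ret <32 x i16> %rot
}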
define <32 x i16> @splatconstant_rotate_mask_v32i16(<32 x i16> %a) nounwind {
; AVX512F-LABEL: splatconstant_rotate_mask_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55]
-; AVX512F-NEXT: vpsrlw $11, %ymm1, %ymm3
-; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm1
-; AVX512F-NEXT: vpor %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlw $11, %ymm0, %ymm3
-; AVX512F-NEXT: vpsllw $5, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpsllw $5, %ymm0, %ymm1
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512F-NEXT: vpsllw $5, %ymm2, %ymm3
+; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
+; AVX512F-NEXT: vpsrlw $11, %ymm0, %ymm0
+; AVX512F-NEXT: vpsrlw $11, %ymm2, %ymm2
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512F-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512F-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512F-NEXT: vporq %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_rotate_mask_v32i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55]
-; AVX512VL-NEXT: vpsrlw $11, %ymm1, %ymm3
-; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1
-; AVX512VL-NEXT: vpor %ymm3, %ymm1, %ymm1
-; AVX512VL-NEXT: vpand %ymm2, %ymm1, %ymm1
-; AVX512VL-NEXT: vpsrlw $11, %ymm0, %ymm3
-; AVX512VL-NEXT: vpsllw $5, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0
-; AVX512VL-NEXT: vpand %ymm2, %ymm0, %ymm0
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vpsllw $5, %ymm0, %ymm1
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512VL-NEXT: vpsllw $5, %ymm2, %ymm3
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
+; AVX512VL-NEXT: vpsrlw $11, %ymm0, %ymm0
+; AVX512VL-NEXT: vpsrlw $11, %ymm2, %ymm2
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512VL-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512VL-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512VL-NEXT: vporq %zmm0, %zmm1, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_rotate_mask_v32i16:
; AVX512F-LABEL: splatconstant_rotate_mask_v64i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm2
+; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm2
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2
-; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm1
+; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm4
+; AVX512F-NEXT: vpand %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39]
-; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4
-; AVX512F-NEXT: vpandn %ymm4, %ymm3, %ymm4
-; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm0
+; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512F-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm1
+; AVX512F-NEXT: vporq %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_rotate_mask_v64i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm2
-; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm1
+; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm2
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; AVX512VL-NEXT: vpternlogq $226, %ymm2, %ymm3, %ymm1
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39]
-; AVX512VL-NEXT: vpand %ymm2, %ymm1, %ymm1
-; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm4
-; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpternlogq $226, %ymm4, %ymm3, %ymm0
-; AVX512VL-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm4
+; AVX512VL-NEXT: vpand %ymm3, %ymm4, %ymm3
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm1
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT: vpand %ymm3, %ymm1, %ymm1
+; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512VL-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512VL-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm1
+; AVX512VL-NEXT: vporq %zmm0, %zmm1, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_rotate_mask_v64i8:
;
; AVX512F-LABEL: sext_32i8_to_32i16:
; AVX512F: # %bb.0: # %entry
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512F-NEXT: vpmovsxbw %xmm1, %ymm1
+; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm1
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: sext_32i8_to_32i16:
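The shift tests that follow repeat the same split-and-reassemble pattern for variable, splat-variable, constant, and splat-constant shift amounts. Note that the three copies of var_shift_v32i16/var_shift_v64i8 and friends below are not duplicates: they come from the arithmetic-right, logical-right, and left-shift test files respectively (vpsravd, then vpsrlvd, then vpsllvd). A minimal reduction of the variable arithmetic case is a plain per-lane ashr (assumed shape, hypothetical name; the test bodies are elided):

; Without AVX512BW there is no native v32i16 shift, so each 256-bit half is
; sign-extended to v16i32, shifted with vpsravd, and truncated back via vpmovdw.
define <32 x i16> @ashr_var_v32i16_sketch(<32 x i16> %a, <32 x i16> %b) nounwind {
  %r = ashr <32 x i16> %a, %b
  ret <32 x i16> %r
}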
define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512DQ-LABEL: var_shift_v32i16:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm3
-; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
-; AVX512DQ-NEXT: vpmovsxwd %ymm2, %zmm2
-; AVX512DQ-NEXT: vpsravd %zmm3, %zmm2, %zmm2
+; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm3
+; AVX512DQ-NEXT: vpsravd %zmm2, %zmm3, %zmm2
; AVX512DQ-NEXT: vpmovdw %zmm2, %ymm2
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm1
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpsravd %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: var_shift_v32i16:
define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512DQ-LABEL: var_shift_v64i8:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm3
-; AVX512DQ-NEXT: vpsllw $5, %ymm3, %ymm3
-; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8],ymm3[8],ymm0[9],ymm3[9],ymm0[10],ymm3[10],ymm0[11],ymm3[11],ymm0[12],ymm3[12],ymm0[13],ymm3[13],ymm0[14],ymm3[14],ymm0[15],ymm3[15],ymm0[24],ymm3[24],ymm0[25],ymm3[25],ymm0[26],ymm3[26],ymm0[27],ymm3[27],ymm0[28],ymm3[28],ymm0[29],ymm3[29],ymm0[30],ymm3[30],ymm0[31],ymm3[31]
-; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512DQ-NEXT: vpsllw $5, %ymm2, %ymm2
+; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm4
+; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm0[8],ymm4[8],ymm0[9],ymm4[9],ymm0[10],ymm4[10],ymm0[11],ymm4[11],ymm0[12],ymm4[12],ymm0[13],ymm4[13],ymm0[14],ymm4[14],ymm0[15],ymm4[15],ymm0[24],ymm4[24],ymm0[25],ymm4[25],ymm0[26],ymm4[26],ymm0[27],ymm4[27],ymm0[28],ymm4[28],ymm0[29],ymm4[29],ymm0[30],ymm4[30],ymm0[31],ymm4[31]
; AVX512DQ-NEXT: vpsraw $4, %ymm5, %ymm6
-; AVX512DQ-NEXT: vpblendvb %ymm4, %ymm6, %ymm5, %ymm5
+; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm6, %ymm5, %ymm5
; AVX512DQ-NEXT: vpsraw $2, %ymm5, %ymm6
-; AVX512DQ-NEXT: vpaddw %ymm4, %ymm4, %ymm4
-; AVX512DQ-NEXT: vpblendvb %ymm4, %ymm6, %ymm5, %ymm5
-; AVX512DQ-NEXT: vpsraw $1, %ymm5, %ymm6
-; AVX512DQ-NEXT: vpaddw %ymm4, %ymm4, %ymm4
-; AVX512DQ-NEXT: vpblendvb %ymm4, %ymm6, %ymm5, %ymm4
-; AVX512DQ-NEXT: vpsrlw $8, %ymm4, %ymm4
-; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[4],ymm3[4],ymm0[5],ymm3[5],ymm0[6],ymm3[6],ymm0[7],ymm3[7],ymm0[16],ymm3[16],ymm0[17],ymm3[17],ymm0[18],ymm3[18],ymm0[19],ymm3[19],ymm0[20],ymm3[20],ymm0[21],ymm3[21],ymm0[22],ymm3[22],ymm0[23],ymm3[23]
-; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
-; AVX512DQ-NEXT: vpsraw $4, %ymm2, %ymm5
-; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm5, %ymm2, %ymm2
-; AVX512DQ-NEXT: vpsraw $2, %ymm2, %ymm5
; AVX512DQ-NEXT: vpaddw %ymm3, %ymm3, %ymm3
-; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm5, %ymm2, %ymm2
-; AVX512DQ-NEXT: vpsraw $1, %ymm2, %ymm5
+; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm6, %ymm5, %ymm5
+; AVX512DQ-NEXT: vpsraw $1, %ymm5, %ymm6
; AVX512DQ-NEXT: vpaddw %ymm3, %ymm3, %ymm3
-; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm5, %ymm2, %ymm2
+; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm6, %ymm5, %ymm3
+; AVX512DQ-NEXT: vpsrlw $8, %ymm3, %ymm3
+; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
+; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[4],ymm4[4],ymm0[5],ymm4[5],ymm0[6],ymm4[6],ymm0[7],ymm4[7],ymm0[16],ymm4[16],ymm0[17],ymm4[17],ymm0[18],ymm4[18],ymm0[19],ymm4[19],ymm0[20],ymm4[20],ymm0[21],ymm4[21],ymm0[22],ymm4[22],ymm0[23],ymm4[23]
+; AVX512DQ-NEXT: vpsraw $4, %ymm4, %ymm5
+; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm5, %ymm4, %ymm4
+; AVX512DQ-NEXT: vpsraw $2, %ymm4, %ymm5
+; AVX512DQ-NEXT: vpaddw %ymm2, %ymm2, %ymm2
+; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm5, %ymm4, %ymm4
+; AVX512DQ-NEXT: vpsraw $1, %ymm4, %ymm5
+; AVX512DQ-NEXT: vpaddw %ymm2, %ymm2, %ymm2
+; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm5, %ymm4, %ymm2
; AVX512DQ-NEXT: vpsrlw $8, %ymm2, %ymm2
-; AVX512DQ-NEXT: vpackuswb %ymm4, %ymm2, %ymm2
+; AVX512DQ-NEXT: vpackuswb %ymm3, %ymm2, %ymm2
; AVX512DQ-NEXT: vpsllw $5, %ymm1, %ymm1
; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512DQ-LABEL: splatvar_shift_v32i16:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512DQ-NEXT: vpsraw %xmm1, %ymm2, %ymm2
; AVX512DQ-NEXT: vpsraw %xmm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512DQ-LABEL: splatvar_shift_v64i8:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
; AVX512DQ-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX512DQ-NEXT: vpsrlw %xmm1, %xmm3, %xmm3
define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) nounwind {
; AVX512DQ-LABEL: constant_shift_v32i16:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512DQ-NEXT: vpmovsxwd %ymm1, %zmm1
+; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm1
; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512DQ-NEXT: vpsravd %zmm2, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovdw %zmm1, %ymm1
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpsravd %zmm2, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: constant_shift_v32i16:
define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) nounwind {
; AVX512DQ-LABEL: splatconstant_shift_v32i16:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512DQ-NEXT: vpsraw $3, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpsraw $3, %ymm0, %ymm1
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512DQ-NEXT: vpsraw $3, %ymm0, %ymm0
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_shift_v32i16:
define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512DQ-LABEL: var_shift_v32i16:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm3
-; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
-; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
-; AVX512DQ-NEXT: vpsrlvd %zmm3, %zmm2, %zmm2
+; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512DQ-NEXT: vpsrlvd %zmm2, %zmm3, %zmm2
; AVX512DQ-NEXT: vpmovdw %zmm2, %ymm2
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm1
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512DQ-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: var_shift_v32i16:
define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512DQ-LABEL: var_shift_v64i8:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; AVX512DQ-NEXT: vpsrlw $4, %ymm3, %ymm4
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512DQ-NEXT: vpand %ymm5, %ymm4, %ymm4
-; AVX512DQ-NEXT: vpsllw $5, %ymm2, %ymm2
-; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
-; AVX512DQ-NEXT: vpsrlw $2, %ymm3, %ymm4
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512DQ-NEXT: vpsrlw $4, %ymm2, %ymm3
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512DQ-NEXT: vpand %ymm4, %ymm3, %ymm3
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm5
+; AVX512DQ-NEXT: vpsllw $5, %ymm5, %ymm5
+; AVX512DQ-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2
+; AVX512DQ-NEXT: vpsrlw $2, %ymm2, %ymm3
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
-; AVX512DQ-NEXT: vpand %ymm6, %ymm4, %ymm4
-; AVX512DQ-NEXT: vpaddb %ymm2, %ymm2, %ymm2
-; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
-; AVX512DQ-NEXT: vpsrlw $1, %ymm3, %ymm4
+; AVX512DQ-NEXT: vpand %ymm6, %ymm3, %ymm3
+; AVX512DQ-NEXT: vpaddb %ymm5, %ymm5, %ymm5
+; AVX512DQ-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2
+; AVX512DQ-NEXT: vpsrlw $1, %ymm2, %ymm3
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512DQ-NEXT: vpand %ymm7, %ymm4, %ymm4
-; AVX512DQ-NEXT: vpaddb %ymm2, %ymm2, %ymm2
-; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
+; AVX512DQ-NEXT: vpand %ymm7, %ymm3, %ymm3
+; AVX512DQ-NEXT: vpaddb %ymm5, %ymm5, %ymm5
+; AVX512DQ-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2
; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm3
-; AVX512DQ-NEXT: vpand %ymm5, %ymm3, %ymm3
+; AVX512DQ-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX512DQ-NEXT: vpsllw $5, %ymm1, %ymm1
; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT: vpsrlw $2, %ymm0, %ymm3
define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512DQ-LABEL: splatvar_shift_v32i16:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512DQ-LABEL: splatvar_shift_v64i8:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
; AVX512DQ-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX512DQ-NEXT: vpsrlw %xmm1, %xmm3, %xmm3
define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) nounwind {
; AVX512DQ-LABEL: splatconstant_shift_v32i16:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512DQ-NEXT: vpsrlw $3, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpsrlw $3, %ymm0, %ymm1
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512DQ-NEXT: vpsrlw $3, %ymm0, %ymm0
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_shift_v32i16:
define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512DQ-LABEL: var_shift_v32i16:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm3
-; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
-; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
-; AVX512DQ-NEXT: vpsllvd %zmm3, %zmm2, %zmm2
+; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512DQ-NEXT: vpsllvd %zmm2, %zmm3, %zmm2
; AVX512DQ-NEXT: vpmovdw %zmm2, %ymm2
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm1
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512DQ-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: var_shift_v32i16:
define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512DQ-LABEL: var_shift_v64i8:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; AVX512DQ-NEXT: vpsllw $4, %ymm3, %ymm4
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; AVX512DQ-NEXT: vpand %ymm5, %ymm4, %ymm4
-; AVX512DQ-NEXT: vpsllw $5, %ymm2, %ymm2
-; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
-; AVX512DQ-NEXT: vpsllw $2, %ymm3, %ymm4
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512DQ-NEXT: vpsllw $4, %ymm2, %ymm3
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512DQ-NEXT: vpand %ymm4, %ymm3, %ymm3
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm5
+; AVX512DQ-NEXT: vpsllw $5, %ymm5, %ymm5
+; AVX512DQ-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2
+; AVX512DQ-NEXT: vpsllw $2, %ymm2, %ymm3
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
-; AVX512DQ-NEXT: vpand %ymm6, %ymm4, %ymm4
-; AVX512DQ-NEXT: vpaddb %ymm2, %ymm2, %ymm2
-; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
-; AVX512DQ-NEXT: vpaddb %ymm3, %ymm3, %ymm4
-; AVX512DQ-NEXT: vpaddb %ymm2, %ymm2, %ymm2
-; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
+; AVX512DQ-NEXT: vpand %ymm6, %ymm3, %ymm3
+; AVX512DQ-NEXT: vpaddb %ymm5, %ymm5, %ymm5
+; AVX512DQ-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2
+; AVX512DQ-NEXT: vpaddb %ymm2, %ymm2, %ymm3
+; AVX512DQ-NEXT: vpaddb %ymm5, %ymm5, %ymm5
+; AVX512DQ-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2
; AVX512DQ-NEXT: vpsllw $4, %ymm0, %ymm3
-; AVX512DQ-NEXT: vpand %ymm5, %ymm3, %ymm3
+; AVX512DQ-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX512DQ-NEXT: vpsllw $5, %ymm1, %ymm1
; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT: vpsllw $2, %ymm0, %ymm3
define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512DQ-LABEL: splatvar_shift_v32i16:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512DQ-NEXT: vpsllw %xmm1, %ymm2, %ymm2
; AVX512DQ-NEXT: vpsllw %xmm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512DQ-LABEL: splatvar_shift_v64i8:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512DQ-NEXT: vpsllw %xmm1, %ymm2, %ymm2
; AVX512DQ-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX512DQ-NEXT: vpsllw %xmm1, %xmm3, %xmm3
define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) nounwind {
; AVX512DQ-LABEL: splatconstant_shift_v32i16:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512DQ-NEXT: vpsllw $3, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpsllw $3, %ymm0, %ymm1
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512DQ-NEXT: vpsllw $3, %ymm0, %ymm0
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_shift_v32i16:
; KNL-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[4,5,10,11,4,5,6,7,14,15,2,3,4,5,2,3,20,21,26,27,20,21,22,23,30,31,18,19,20,21,18,19]
; KNL-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
; KNL-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[0,1,10,11,8,9,8,9,14,15,6,7,4,5,14,15,16,17,26,27,24,25,24,25,30,31,22,23,20,21,30,31]
-; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; KNL-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,255,255,u,u,u,u,255,255,u,u,0,0,255,255,0,0,0,0,u,u,0,0,0,0,u,u,255,255,u,u>
; KNL-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm3
+; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; KNL-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm0[7],ymm3[8,9,10,11,12,13,14],ymm0[15]
; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
; KNL-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,10,11,8,9,8,9,14,15,2,3,4,5,2,3,16,17,26,27,24,25,24,25,30,31,18,19,20,21,18,19]
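The v32i16 shuffle tests encode the entire shuffle mask in the function name, so the IR can be read off directly. For the first test below the mask selects [1,1,0,0] in the low quadword of each 128-bit quarter, which is exactly an in-lane vpshuflw per YMM half; spelled out (reconstructed from the mask in the name, with a hypothetical function name):

; Duplicates lane 1 into elements 0-1 and lane 0 into elements 2-3 of each
; low quadword; the high quadwords pass through unchanged.
define <32 x i16> @shuffle_1_1_0_0_sketch(<32 x i16> %a) {
  %s = shufflevector <32 x i16> %a, <32 x i16> undef,
       <32 x i32> <i32 1, i32 1, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7,
                   i32 9, i32 9, i32 8, i32 8, i32 12, i32 13, i32 14, i32 15,
                   i32 17, i32 17, i32 16, i32 16, i32 20, i32 21, i32 22, i32 23,
                   i32 25, i32 25, i32 24, i32 24, i32 28, i32 29, i32 30, i32 31>
  ret <32 x i16> %s
}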
define <32 x i16> @shuffle_v32i16_1_1_0_0_4_5_6_7_9_9_8_8_12_13_14_15_17_17_16_16_20_21_22_23_25_25_24_24_28_29_30_31(<32 x i16> %a, <32 x i16> %b) {
; KNL-LABEL: shuffle_v32i16_1_1_0_0_4_5_6_7_9_9_8_8_12_13_14_15_17_17_16_16_20_21_22_23_25_25_24_24_28_29_30_31:
; KNL: ## %bb.0:
-; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; KNL-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15]
+; KNL-NEXT: vpshuflw {{.*#+}} ymm1 = ymm0[1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15]
+; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; KNL-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15]
-; KNL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: shuffle_v32i16_1_1_0_0_4_5_6_7_9_9_8_8_12_13_14_15_17_17_16_16_20_21_22_23_25_25_24_24_28_29_30_31:
define <32 x i16> @shuffle_v32i16_0_1_2_3_5_5_4_4_8_9_10_11_13_13_12_12_16_17_18_19_21_21_20_20_24_25_26_27_29_29_28_28(<32 x i16> %a, <32 x i16> %b) {
; KNL-LABEL: shuffle_v32i16_0_1_2_3_5_5_4_4_8_9_10_11_13_13_12_12_16_17_18_19_21_21_20_20_24_25_26_27_29_29_28_28:
; KNL: ## %bb.0:
-; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; KNL-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,5,5,4,4,8,9,10,11,13,13,12,12]
+; KNL-NEXT: vpshufhw {{.*#+}} ymm1 = ymm0[0,1,2,3,5,5,4,4,8,9,10,11,13,13,12,12]
+; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; KNL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,5,4,4,8,9,10,11,13,13,12,12]
-; KNL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: shuffle_v32i16_0_1_2_3_5_5_4_4_8_9_10_11_13_13_12_12_16_17_18_19_21_21_20_20_24_25_26_27_29_29_28_28:
define <32 x i16> @shuffle_v32i16_1_1_0_0_5_5_4_4_9_9_11_11_13_13_12_12_17_17_19_19_21_21_20_20_25_25_27_27_29_29_28_28(<32 x i16> %a, <32 x i16> %b) {
; KNL-LABEL: shuffle_v32i16_1_1_0_0_5_5_4_4_9_9_11_11_13_13_12_12_17_17_19_19_21_21_20_20_25_25_27_27_29_29_28_28:
; KNL: ## %bb.0:
-; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; KNL-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15]
+; KNL-NEXT: vpshuflw {{.*#+}} ymm1 = ymm0[1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15]
; KNL-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,5,5,4,4,8,9,10,11,13,13,12,12]
+; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; KNL-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15]
; KNL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,5,4,4,8,9,10,11,13,13,12,12]
-; KNL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: shuffle_v32i16_1_1_0_0_5_5_4_4_9_9_11_11_13_13_12_12_17_17_19_19_21_21_20_20_25_25_27_27_29_29_28_28:
; KNL: ## %bb.0:
; KNL-NEXT: vpsrad $25, %zmm0, %zmm0
; KNL-NEXT: vpsrad $25, %zmm1, %zmm1
-; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
-; KNL-NEXT: vpackssdw %ymm3, %ymm2, %ymm2
+; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; KNL-NEXT: vpackssdw %ymm2, %ymm3, %ymm2
; KNL-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
; KNL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; KNL-NEXT: retq
; KNL: ## %bb.0:
; KNL-NEXT: vpsrld $25, %zmm0, %zmm0
; KNL-NEXT: vpsrld $25, %zmm1, %zmm1
-; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
-; KNL-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
+; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; KNL-NEXT: vpackusdw %ymm2, %ymm3, %ymm2
; KNL-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; KNL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; KNL-NEXT: retq
define <32 x i16> @shuffle_v32i16_32_zz_zz_zz_33_zz_zz_zz_34_zz_zz_zz_35_zz_zz_zz_36_zz_zz_zz_37_zz_zz_zz_38_zz_zz_zz_39_zz_zz_zz(<32 x i16> %a) {
; KNL-LABEL: shuffle_v32i16_32_zz_zz_zz_33_zz_zz_zz_34_zz_zz_zz_35_zz_zz_zz_36_zz_zz_zz_37_zz_zz_zz_38_zz_zz_zz_39_zz_zz_zz:
; KNL: ## %bb.0:
-; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; KNL-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; KNL-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; KNL-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; KNL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: shuffle_v32i16_32_zz_zz_zz_33_zz_zz_zz_34_zz_zz_zz_35_zz_zz_zz_36_zz_zz_zz_37_zz_zz_zz_38_zz_zz_zz_39_zz_zz_zz:
define <32 x i16> @shuffle_v32i16_32_zz_33_zz_34_zz_35_zz_36_zz_37_zz_38_zz_39_zz_40_zz_41_zz_42_zz_43_zz_44_zz_45_zz_46_zz_47_zz(<32 x i16> %a) {
; KNL-LABEL: shuffle_v32i16_32_zz_33_zz_34_zz_35_zz_36_zz_37_zz_38_zz_39_zz_40_zz_41_zz_42_zz_43_zz_44_zz_45_zz_46_zz_47_zz:
; KNL: ## %bb.0:
-; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; KNL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; KNL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
; KNL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; KNL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: shuffle_v32i16_32_zz_33_zz_34_zz_35_zz_36_zz_37_zz_38_zz_39_zz_40_zz_41_zz_42_zz_43_zz_44_zz_45_zz_46_zz_47_zz:
define <32 x i16> @shuffle_v32i16_07_zz_05_zz_03_zz_01_zz_15_zz_13_zz_11_zz_09_zz_23_zz_21_zz_19_zz_17_zz_31_zz_29_zz_27_zz_25_zz(<32 x i16> %a) {
; KNL-LABEL: shuffle_v32i16_07_zz_05_zz_03_zz_01_zz_15_zz_13_zz_11_zz_09_zz_23_zz_21_zz_19_zz_17_zz_31_zz_29_zz_27_zz_25_zz:
; KNL: ## %bb.0:
-; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; KNL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[14,15],zero,zero,ymm1[10,11],zero,zero,ymm1[6,7],zero,zero,ymm1[2,3],zero,zero,ymm1[30,31],zero,zero,ymm1[26,27],zero,zero,ymm1[22,23],zero,zero,ymm1[20,21],zero,zero
-; KNL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15],zero,zero,ymm0[10,11],zero,zero,ymm0[6,7],zero,zero,ymm0[2,3],zero,zero,ymm0[30,31],zero,zero,ymm0[26,27],zero,zero,ymm0[22,23],zero,zero,ymm0[18,19],zero,zero
-; KNL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; KNL-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[14,15],zero,zero,ymm0[10,11],zero,zero,ymm0[6,7],zero,zero,ymm0[2,3],zero,zero,ymm0[30,31],zero,zero,ymm0[26,27],zero,zero,ymm0[22,23],zero,zero,ymm0[18,19],zero,zero
+; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; KNL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15],zero,zero,ymm0[10,11],zero,zero,ymm0[6,7],zero,zero,ymm0[2,3],zero,zero,ymm0[30,31],zero,zero,ymm0[26,27],zero,zero,ymm0[22,23],zero,zero,ymm0[20,21],zero,zero
+; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: shuffle_v32i16_07_zz_05_zz_03_zz_01_zz_15_zz_13_zz_11_zz_09_zz_23_zz_21_zz_19_zz_17_zz_31_zz_29_zz_27_zz_25_zz:
define <64 x i8> @shuffle_v64i8_zz_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_zz_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_zz_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_zz_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62(<64 x i8> %a, <64 x i8> %b) {
; AVX512F-LABEL: shuffle_v64i8_zz_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_zz_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_zz_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_zz_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vpslldq {{.*#+}} ymm1 = zero,ymm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,ymm1[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
+; AVX512F-NEXT: vpslldq {{.*#+}} ymm1 = zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512F-NEXT: vpslldq {{.*#+}} ymm0 = zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_zz_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_zz_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_zz_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_zz_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62:
;
; AVX512DQ-LABEL: shuffle_v64i8_zz_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_zz_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_zz_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_zz_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512DQ-NEXT: vpslldq {{.*#+}} ymm1 = zero,ymm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,ymm1[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
+; AVX512DQ-NEXT: vpslldq {{.*#+}} ymm1 = zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512DQ-NEXT: vpslldq {{.*#+}} ymm0 = zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_zz_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_zz_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_zz_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_zz_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62:
define <64 x i8> @shuffle_v64i8_02_03_04_05_06_07_08_09_10_11_12_13_14_15_zz_zz_18_19_20_21_22_23_24_25_26_27_28_29_30_31_zz_zz_34_35_36_37_38_39_40_41_42_43_44_45_46_47_zz_zz_50_51_52_53_54_55_56_57_58_59_60_61_62_63_zz_zz(<64 x i8> %a, <64 x i8> %b) {
; AVX512F-LABEL: shuffle_v64i8_02_03_04_05_06_07_08_09_10_11_12_13_14_15_zz_zz_18_19_20_21_22_23_24_25_26_27_28_29_30_31_zz_zz_34_35_36_37_38_39_40_41_42_43_44_45_46_47_zz_zz_50_51_52_53_54_55_56_57_58_59_60_61_62_63_zz_zz:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,ymm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero
+; AVX512F-NEXT: vpsrldq {{.*#+}} ymm1 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512F-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_02_03_04_05_06_07_08_09_10_11_12_13_14_15_zz_zz_18_19_20_21_22_23_24_25_26_27_28_29_30_31_zz_zz_34_35_36_37_38_39_40_41_42_43_44_45_46_47_zz_zz_50_51_52_53_54_55_56_57_58_59_60_61_62_63_zz_zz:
;
; AVX512DQ-LABEL: shuffle_v64i8_02_03_04_05_06_07_08_09_10_11_12_13_14_15_zz_zz_18_19_20_21_22_23_24_25_26_27_28_29_30_31_zz_zz_34_35_36_37_38_39_40_41_42_43_44_45_46_47_zz_zz_50_51_52_53_54_55_56_57_58_59_60_61_62_63_zz_zz:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512DQ-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,ymm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero
+; AVX512DQ-NEXT: vpsrldq {{.*#+}} ymm1 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512DQ-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_02_03_04_05_06_07_08_09_10_11_12_13_14_15_zz_zz_18_19_20_21_22_23_24_25_26_27_28_29_30_31_zz_zz_34_35_36_37_38_39_40_41_42_43_44_45_46_47_zz_zz_50_51_52_53_54_55_56_57_58_59_60_61_62_63_zz_zz:
define <64 x i8> @shuffle_v64i8_63_62_61_60_59_58_57_56_55_54_53_52_51_50_49_48_47_46_45_44_43_42_41_40_39_38_37_36_35_34_33_32_31_30_29_28_27_26_25_24_23_22_21_20_19_18_17_16_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00(<64 x i8> %a) {
; AVX512F-LABEL: shuffle_v64i8_63_62_61_60_59_58_57_56_55_54_53_52_51_50_49_48_47_46_45_44_43_42_41_40_39_38_37_36_35_34_33_32_31_30_29_28_27_26_25_24_23_22_21_20_19_18_17_16_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
-; AVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; AVX512F-NEXT: vpshufb %ymm1, %ymm0, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; AVX512F-NEXT: vpshufb %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5]
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_63_62_61_60_59_58_57_56_55_54_53_52_51_50_49_48_47_46_45_44_43_42_41_40_39_38_37_36_35_34_33_32_31_30_29_28_27_26_25_24_23_22_21_20_19_18_17_16_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00:
;
; AVX512DQ-LABEL: shuffle_v64i8_63_62_61_60_59_58_57_56_55_54_53_52_51_50_49_48_47_46_45_44_43_42_41_40_39_38_37_36_35_34_33_32_31_30_29_28_27_26_25_24_23_22_21_20_19_18_17_16_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
-; AVX512DQ-NEXT: vpshufb %ymm2, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpshufb %ymm2, %ymm1, %ymm1
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; AVX512DQ-NEXT: vpshufb %ymm1, %ymm0, %ymm2
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; AVX512DQ-NEXT: vpshufb %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5]
; AVX512DQ-NEXT: retq
;
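; NOTE: without BWI, the full 512-bit byte reverse above is assembled from
; three steps: vpshufb reverses bytes within each 128-bit lane, the
; vinserti64x4 places the shuffled low half above the shuffled high half
; (swapping the 256-bit halves), and the final vpermq [2,3,0,1,6,7,4,5]
; swaps the two 128-bit lanes inside each 256-bit half.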
define <64 x i8> @shuffle_v64i8_64_zz_zz_zz_zz_zz_zz_zz_65_zz_zz_zz_zz_zz_zz_zz_66_zz_zz_zz_zz_zz_zz_zz_67_zz_zz_zz_zz_zz_zz_zz_68_zz_zz_zz_zz_zz_zz_zz_69_zz_zz_zz_zz_zz_zz_zz_70_zz_zz_zz_zz_zz_zz_zz_71_zz_zz_zz_zz_zz_zz_zz(<64 x i8> %a) {
; AVX512F-LABEL: shuffle_v64i8_64_zz_zz_zz_zz_zz_zz_zz_65_zz_zz_zz_zz_zz_zz_zz_66_zz_zz_zz_zz_zz_zz_zz_67_zz_zz_zz_zz_zz_zz_zz_68_zz_zz_zz_zz_zz_zz_zz_69_zz_zz_zz_zz_zz_zz_zz_70_zz_zz_zz_zz_zz_zz_zz_71_zz_zz_zz_zz_zz_zz_zz:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512F-NEXT: vpmovzxbq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero,xmm1[2],zero,zero,zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT: vpmovzxbq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX512F-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_64_zz_zz_zz_zz_zz_zz_zz_65_zz_zz_zz_zz_zz_zz_zz_66_zz_zz_zz_zz_zz_zz_zz_67_zz_zz_zz_zz_zz_zz_zz_68_zz_zz_zz_zz_zz_zz_zz_69_zz_zz_zz_zz_zz_zz_zz_70_zz_zz_zz_zz_zz_zz_zz_71_zz_zz_zz_zz_zz_zz_zz:
;
; AVX512DQ-LABEL: shuffle_v64i8_64_zz_zz_zz_zz_zz_zz_zz_65_zz_zz_zz_zz_zz_zz_zz_66_zz_zz_zz_zz_zz_zz_zz_67_zz_zz_zz_zz_zz_zz_zz_68_zz_zz_zz_zz_zz_zz_zz_69_zz_zz_zz_zz_zz_zz_zz_70_zz_zz_zz_zz_zz_zz_zz_71_zz_zz_zz_zz_zz_zz_zz:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero,xmm1[2],zero,zero,zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_64_zz_zz_zz_zz_zz_zz_zz_65_zz_zz_zz_zz_zz_zz_zz_66_zz_zz_zz_zz_zz_zz_zz_67_zz_zz_zz_zz_zz_zz_zz_68_zz_zz_zz_zz_zz_zz_zz_69_zz_zz_zz_zz_zz_zz_zz_70_zz_zz_zz_zz_zz_zz_zz_71_zz_zz_zz_zz_zz_zz_zz:
define <64 x i8> @shuffle_v64i8_64_zz_zz_zz_65_zz_zz_zz_66_zz_zz_zz_67_zz_zz_zz_68_zz_zz_zz_69_zz_zz_zz_70_zz_zz_zz_71_zz_zz_zz_72_zz_zz_zz_73_zz_zz_zz_74_zz_zz_zz_75_zz_zz_zz_76_zz_zz_zz_77_zz_zz_zz_78_zz_zz_zz_79_zz_zz_zz(<64 x i8> %a) {
; AVX512F-LABEL: shuffle_v64i8_64_zz_zz_zz_65_zz_zz_zz_66_zz_zz_zz_67_zz_zz_zz_68_zz_zz_zz_69_zz_zz_zz_70_zz_zz_zz_71_zz_zz_zz_72_zz_zz_zz_73_zz_zz_zz_74_zz_zz_zz_75_zz_zz_zz_76_zz_zz_zz_77_zz_zz_zz_78_zz_zz_zz_79_zz_zz_zz:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
+; AVX512F-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX512F-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_64_zz_zz_zz_65_zz_zz_zz_66_zz_zz_zz_67_zz_zz_zz_68_zz_zz_zz_69_zz_zz_zz_70_zz_zz_zz_71_zz_zz_zz_72_zz_zz_zz_73_zz_zz_zz_74_zz_zz_zz_75_zz_zz_zz_76_zz_zz_zz_77_zz_zz_zz_78_zz_zz_zz_79_zz_zz_zz:
;
; AVX512DQ-LABEL: shuffle_v64i8_64_zz_zz_zz_65_zz_zz_zz_66_zz_zz_zz_67_zz_zz_zz_68_zz_zz_zz_69_zz_zz_zz_70_zz_zz_zz_71_zz_zz_zz_72_zz_zz_zz_73_zz_zz_zz_74_zz_zz_zz_75_zz_zz_zz_76_zz_zz_zz_77_zz_zz_zz_78_zz_zz_zz_79_zz_zz_zz:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_64_zz_zz_zz_65_zz_zz_zz_66_zz_zz_zz_67_zz_zz_zz_68_zz_zz_zz_69_zz_zz_zz_70_zz_zz_zz_71_zz_zz_zz_72_zz_zz_zz_73_zz_zz_zz_74_zz_zz_zz_75_zz_zz_zz_76_zz_zz_zz_77_zz_zz_zz_78_zz_zz_zz_79_zz_zz_zz:
define <64 x i8> @shuffle_v64i8_64_zz_65_zz_66_zz_67_zz_68_zz_69_zz_70_zz_71_zz_72_zz_73_zz_74_zz_75_zz_76_zz_77_zz_78_zz_79_zz_80_zz_81_zz_82_zz_83_zz_84_zz_85_zz_86_zz_87_zz_88_zz_89_zz_90_zz_91_zz_92_zz_93_zz_94_zz_95_zz(<64 x i8> %a) {
; AVX512F-LABEL: shuffle_v64i8_64_zz_65_zz_66_zz_67_zz_68_zz_69_zz_70_zz_71_zz_72_zz_73_zz_74_zz_75_zz_76_zz_77_zz_78_zz_79_zz_80_zz_81_zz_82_zz_83_zz_84_zz_85_zz_86_zz_87_zz_88_zz_89_zz_90_zz_91_zz_92_zz_93_zz_94_zz_95_zz:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_64_zz_65_zz_66_zz_67_zz_68_zz_69_zz_70_zz_71_zz_72_zz_73_zz_74_zz_75_zz_76_zz_77_zz_78_zz_79_zz_80_zz_81_zz_82_zz_83_zz_84_zz_85_zz_86_zz_87_zz_88_zz_89_zz_90_zz_91_zz_92_zz_93_zz_94_zz_95_zz:
;
; AVX512DQ-LABEL: shuffle_v64i8_64_zz_65_zz_66_zz_67_zz_68_zz_69_zz_70_zz_71_zz_72_zz_73_zz_74_zz_75_zz_76_zz_77_zz_78_zz_79_zz_80_zz_81_zz_82_zz_83_zz_84_zz_85_zz_86_zz_87_zz_88_zz_89_zz_90_zz_91_zz_92_zz_93_zz_94_zz_95_zz:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_64_zz_65_zz_66_zz_67_zz_68_zz_69_zz_70_zz_71_zz_72_zz_73_zz_74_zz_75_zz_76_zz_77_zz_78_zz_79_zz_80_zz_81_zz_82_zz_83_zz_84_zz_85_zz_86_zz_87_zz_88_zz_89_zz_90_zz_91_zz_92_zz_93_zz_94_zz_95_zz:
define <64 x i8> @shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_47_zz_45_zz_43_zz_41_zz_39_zz_37_zz_35_zz_33_zz_31_zz_29_zz_27_zz_25_zz_23_zz_21_zz_19_zz_17_zz_15_zz_13_zz_11_zz_9_zz_7_zz_5_zz_3_zz_1_zz(<64 x i8> %a) {
; AVX512F-LABEL: shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_47_zz_45_zz_43_zz_41_zz_39_zz_37_zz_35_zz_33_zz_31_zz_29_zz_27_zz_25_zz_23_zz_21_zz_19_zz_17_zz_15_zz_13_zz_11_zz_9_zz_7_zz_5_zz_3_zz_1_zz:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512F-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128,15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128]
-; AVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_47_zz_45_zz_43_zz_41_zz_39_zz_37_zz_35_zz_33_zz_31_zz_29_zz_27_zz_25_zz_23_zz_21_zz_19_zz_17_zz_15_zz_13_zz_11_zz_9_zz_7_zz_5_zz_3_zz_1_zz:
;
; AVX512DQ-LABEL: shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_47_zz_45_zz_43_zz_41_zz_39_zz_37_zz_35_zz_33_zz_31_zz_29_zz_27_zz_25_zz_23_zz_21_zz_19_zz_17_zz_15_zz_13_zz_11_zz_9_zz_7_zz_5_zz_3_zz_1_zz:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128,15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128]
-; AVX512DQ-NEXT: vpshufb %ymm2, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
; AVX512DQ-NEXT: vpshufb %ymm2, %ymm1, %ymm1
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512DQ-NEXT: vpshufb %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_47_zz_45_zz_43_zz_41_zz_39_zz_37_zz_35_zz_33_zz_31_zz_29_zz_27_zz_25_zz_23_zz_21_zz_19_zz_17_zz_15_zz_13_zz_11_zz_9_zz_7_zz_5_zz_3_zz_1_zz:
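; NOTE: vpshufb writes zero to any result byte whose control byte has bit 7
; set, so the control [15,128,13,128,...] above both selects the odd source
; bytes in reverse order and zeroes the interleaved lanes in a single
; instruction.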
define <64 x i8> @shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_47_80_45_82_43_84_41_86_39_88_37_90_35_92_33_94_31_96_29_98_27_100_25_102_23_104_21_106_19_108_17_110_15_112_13_114_11_116_9_118_7_120_5_122_3_124_1_126(<64 x i8> %a, <64 x i8> %b) {
; AVX512F-LABEL: shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_47_80_45_82_43_84_41_86_39_88_37_90_35_92_33_94_31_96_29_98_27_100_25_102_23_104_21_106_19_108_17_110_15_112_13_114_11_116_9_118_7_120_5_122_3_124_1_126:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
-; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512F-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1]
; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14,15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14]
+; AVX512F-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512F-NEXT: vpblendvb %ymm4, %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1]
-; AVX512F-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1
-; AVX512F-NEXT: vpshufb %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_47_80_45_82_43_84_41_86_39_88_37_90_35_92_33_94_31_96_29_98_27_100_25_102_23_104_21_106_19_108_17_110_15_112_13_114_11_116_9_118_7_120_5_122_3_124_1_126:
;
; AVX512DQ-LABEL: shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_47_80_45_82_43_84_41_86_39_88_37_90_35_92_33_94_31_96_29_98_27_100_25_102_23_104_21_106_19_108_17_110_15_112_13_114_11_116_9_118_7_120_5_122_3_124_1_126:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm3
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1]
; AVX512DQ-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512DQ-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14,15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14]
+; AVX512DQ-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512DQ-NEXT: vpblendvb %ymm4, %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1]
-; AVX512DQ-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1
-; AVX512DQ-NEXT: vpshufb %ymm3, %ymm1, %ymm1
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_47_80_45_82_43_84_41_86_39_88_37_90_35_92_33_94_31_96_29_98_27_100_25_102_23_104_21_106_19_108_17_110_15_112_13_114_11_116_9_118_7_120_5_122_3_124_1_126:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsrad $25, %zmm0, %zmm0
; AVX512F-NEXT: vpsrad $25, %zmm1, %zmm1
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
-; AVX512F-NEXT: vpackssdw %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT: vpackssdw %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpsrad $25, %zmm0, %zmm0
; AVX512DQ-NEXT: vpsrad $25, %zmm1, %zmm1
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm3
-; AVX512DQ-NEXT: vpackssdw %ymm3, %ymm2, %ymm2
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512DQ-NEXT: vpackssdw %ymm2, %ymm3, %ymm2
; AVX512DQ-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
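; NOTE: vpsrad $25 leaves each i32 element in the signed 7-bit range, so the
; saturating vpackssdw (i32->i16) above behaves as an exact truncation; the
; reordered extracts feed the same packs with one fewer register live.
; Minimal sketch of the operation being packed (illustrative only, not part
; of the original tests):
define <16 x i16> @trunc_ashr_sketch(<16 x i32> %a) {
  %s = ashr <16 x i32> %a, <i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25>
  %t = trunc <16 x i32> %s to <16 x i16>
  ret <16 x i16> %t
}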
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsrad $25, %zmm0, %zmm0
; AVX512F-NEXT: vpsrad $25, %zmm1, %zmm1
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
-; AVX512F-NEXT: vpackssdw %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpackssdw %ymm1, %ymm0, %ymm2
; AVX512F-NEXT: vpacksswb %ymm2, %ymm2, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512F-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpacksswb %ymm0, %ymm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_ashr_00_04_08_12_64_68_72_76_00_04_08_12_64_68_72_76_16_20_24_28_80_84_88_92_16_20_24_28_80_84_88_92_32_36_40_44_96_100_104_108_32_36_40_44_96_100_104_108_48_52_56_60_112_116_120_124_48_52_56_60_112_116_120_124:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpsrad $25, %zmm0, %zmm0
; AVX512DQ-NEXT: vpsrad $25, %zmm1, %zmm1
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm3
-; AVX512DQ-NEXT: vpackssdw %ymm3, %ymm2, %ymm2
+; AVX512DQ-NEXT: vpackssdw %ymm1, %ymm0, %ymm2
; AVX512DQ-NEXT: vpacksswb %ymm2, %ymm2, %ymm2
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512DQ-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpacksswb %ymm0, %ymm0, %ymm0
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_ashr_00_04_08_12_64_68_72_76_00_04_08_12_64_68_72_76_16_20_24_28_80_84_88_92_16_20_24_28_80_84_88_92_32_36_40_44_96_100_104_108_32_36_40_44_96_100_104_108_48_52_56_60_112_116_120_124_48_52_56_60_112_116_120_124:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsrld $25, %zmm0, %zmm0
; AVX512F-NEXT: vpsrld $25, %zmm1, %zmm1
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
-; AVX512F-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT: vpackusdw %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpsrld $25, %zmm0, %zmm0
; AVX512DQ-NEXT: vpsrld $25, %zmm1, %zmm1
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm3
-; AVX512DQ-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512DQ-NEXT: vpackusdw %ymm2, %ymm3, %ymm2
; AVX512DQ-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsrld $25, %zmm0, %zmm0
; AVX512F-NEXT: vpsrld $25, %zmm1, %zmm1
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
-; AVX512F-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpackusdw %ymm1, %ymm0, %ymm2
; AVX512F-NEXT: vpackuswb %ymm2, %ymm2, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512F-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpackuswb %ymm0, %ymm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_lshr_00_04_08_12_64_68_72_76_00_04_08_12_64_68_72_76_16_20_24_28_80_84_88_092_16_20_24_28_80_84_88_92_32_36_40_44_96_100_104_108_32_36_40_44_96_100_104_108_48_52_56_60_112_116_120_124_48_52_56_60_112_116_120_124:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpsrld $25, %zmm0, %zmm0
; AVX512DQ-NEXT: vpsrld $25, %zmm1, %zmm1
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm3
-; AVX512DQ-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
+; AVX512DQ-NEXT: vpackusdw %ymm1, %ymm0, %ymm2
; AVX512DQ-NEXT: vpackuswb %ymm2, %ymm2, %ymm2
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512DQ-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpackuswb %ymm0, %ymm0, %ymm0
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_lshr_00_04_08_12_64_68_72_76_00_04_08_12_64_68_72_76_16_20_24_28_80_84_88_092_16_20_24_28_80_84_88_92_32_36_40_44_96_100_104_108_32_36_40_44_96_100_104_108_48_52_56_60_112_116_120_124_48_52_56_60_112_116_120_124:
define <64 x i8> @shuffle_v64i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62_64_66_68_70_72_74_76_78_80_82_84_86_88_90_92_94_96_98_100_102_104_106_108_110_112_114_116_118_120_122_124_126(<32 x i16> %a0, <32 x i16> %a1) {
; AVX512F-LABEL: shuffle_v64i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62_64_66_68_70_72_74_76_78_80_82_84_86_88_90_92_94_96_98_100_102_104_106_108_110_112_114_116_118_120_122_124_126:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX512F-NEXT: vpackuswb %ymm2, %ymm1, %ymm1
;
; AVX512DQ-LABEL: shuffle_v64i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62_64_66_68_70_72_74_76_78_80_82_84_86_88_90_92_94_96_98_100_102_104_106_108_110_112_114_116_118_120_122_124_126:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; AVX512DQ-NEXT: vpsrlw $8, %ymm3, %ymm3
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512DQ-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX512DQ-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm2
; AVX512DQ-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX512DQ-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX512DQ-NEXT: vpackuswb %ymm2, %ymm1, %ymm1
define <64 x i8> @shuffle_v64i8_shift_00_02_04_06_08_10_12_14_64_66_68_70_72_74_76_78_16_18_20_22_24_26_28_30_80_82_84_86_88_90_92_94_32_34_36_38_40_42_44_46_96_98_100_102_104_106_108_110_48_50_52_54_56_58_60_62_112_114_116_118_120_122_124_126(<32 x i16> %a0, <32 x i16> %a1) {
; AVX512F-LABEL: shuffle_v64i8_shift_00_02_04_06_08_10_12_14_64_66_68_70_72_74_76_78_16_18_20_22_24_26_28_30_80_82_84_86_88_90_92_94_32_34_36_38_40_42_44_46_96_98_100_102_104_106_108_110_48_50_52_54_56_58_60_62_112_114_116_118_120_122_124_126:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
+; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm3
+; AVX512F-NEXT: vpackuswb %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1
; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX512F-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm1
-; AVX512F-NEXT: vpackuswb %ymm1, %ymm3, %ymm1
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_shift_00_02_04_06_08_10_12_14_64_66_68_70_72_74_76_78_16_18_20_22_24_26_28_30_80_82_84_86_88_90_92_94_32_34_36_38_40_42_44_46_96_98_100_102_104_106_108_110_48_50_52_54_56_58_60_62_112_114_116_118_120_122_124_126:
;
; AVX512DQ-LABEL: shuffle_v64i8_shift_00_02_04_06_08_10_12_14_64_66_68_70_72_74_76_78_16_18_20_22_24_26_28_30_80_82_84_86_88_90_92_94_32_34_36_38_40_42_44_46_96_98_100_102_104_106_108_110_48_50_52_54_56_58_60_62_112_114_116_118_120_122_124_126:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512DQ-NEXT: vpsrlw $8, %ymm0, %ymm2
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512DQ-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpsrlw $8, %ymm3, %ymm3
+; AVX512DQ-NEXT: vpsrlw $8, %ymm1, %ymm3
+; AVX512DQ-NEXT: vpackuswb %ymm3, %ymm2, %ymm2
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm1
; AVX512DQ-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX512DQ-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpsrlw $8, %ymm2, %ymm1
-; AVX512DQ-NEXT: vpackuswb %ymm1, %ymm3, %ymm1
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_shift_00_02_04_06_08_10_12_14_64_66_68_70_72_74_76_78_16_18_20_22_24_26_28_30_80_82_84_86_88_90_92_94_32_34_36_38_40_42_44_46_96_98_100_102_104_106_108_110_48_50_52_54_56_58_60_62_112_114_116_118_120_122_124_126:
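; NOTE: taking the odd bytes of each i16 element is done by vpsrlw $8, which
; moves each high byte into the low byte with a zeroed upper half; the
; unsigned saturation in vpackuswb is then a no-op, so the packs concatenate
; the bytes losslessly.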
define <32 x i16> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16(<32 x i16> %a, <32 x i16> %c, <32 x i16> %d) {
; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
-; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm4
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm5
-; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6
-; AVX512F-NEXT: vpcmpeqw %ymm6, %ymm0, %ymm0
-; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
-; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
-; AVX512F-NEXT: vpcmpeqw %ymm6, %ymm5, %ymm0
+; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX512F-NEXT: vpcmpeqw %ymm3, %ymm0, %ymm4
+; AVX512F-NEXT: vpmovsxwd %ymm4, %zmm4
+; AVX512F-NEXT: vptestmd %zmm4, %zmm4, %k1
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; AVX512F-NEXT: vpcmpeqw %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k2
; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
-; AVX512F-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z}
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
-; AVX512F-NEXT: vpermi2d %zmm0, %zmm5, %zmm6
-; AVX512F-NEXT: vptestmd %zmm6, %zmm6, %k1
+; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
+; AVX512F-NEXT: vpermi2d %zmm0, %zmm3, %zmm4
+; AVX512F-NEXT: vptestmd %zmm4, %zmm4, %k1
; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512F-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm3
-; AVX512F-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; AVX512F-NEXT: vpternlogq $202, %zmm2, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm2, %ymm4
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm5
-; AVX512VL-NEXT: vpxor %xmm6, %xmm6, %xmm6
-; AVX512VL-NEXT: vpcmpeqw %ymm6, %ymm0, %ymm0
-; AVX512VL-NEXT: vpmovsxwd %ymm0, %zmm0
-; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k1
-; AVX512VL-NEXT: vpcmpeqw %ymm6, %ymm5, %ymm0
+; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT: vpcmpeqw %ymm3, %ymm0, %ymm4
+; AVX512VL-NEXT: vpmovsxwd %ymm4, %zmm4
+; AVX512VL-NEXT: vptestmd %zmm4, %zmm4, %k1
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; AVX512VL-NEXT: vpcmpeqw %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k2
; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
-; AVX512VL-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z}
-; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm6 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
-; AVX512VL-NEXT: vpermi2d %zmm0, %zmm5, %zmm6
-; AVX512VL-NEXT: vptestmd %zmm6, %zmm6, %k1
+; AVX512VL-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
+; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
+; AVX512VL-NEXT: vpermi2d %zmm0, %zmm3, %zmm4
+; AVX512VL-NEXT: vptestmd %zmm4, %zmm4, %k1
; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VL-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm3
-; AVX512VL-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; AVX512VL-NEXT: vpternlogq $202, %zmm2, %zmm1, %zmm0
; AVX512VL-NEXT: retq
;
; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16:
define <32 x i16> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16_split(<16 x i32> %a, <16 x i32> %b, <32 x i16> %c, <32 x i16> %d) {
; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16_split:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm4
-; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm5
; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k1
; AVX512F-NEXT: vptestnmd %zmm1, %zmm1, %k2
; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
-; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm6
-; AVX512F-NEXT: vptestmd %zmm6, %zmm6, %k1
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
+; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm4
+; AVX512F-NEXT: vptestmd %zmm4, %zmm4, %k1
; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512F-NEXT: vpblendvb %ymm0, %ymm4, %ymm5, %ymm1
-; AVX512F-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; AVX512F-NEXT: vpternlogq $202, %zmm3, %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16_split:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm2, %ymm4
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm3, %ymm5
; AVX512VL-NEXT: vptestnmd %zmm0, %zmm0, %k1
; AVX512VL-NEXT: vptestnmd %zmm1, %zmm1, %k2
; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512VL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm6 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
-; AVX512VL-NEXT: vpermi2d %zmm0, %zmm1, %zmm6
-; AVX512VL-NEXT: vptestmd %zmm6, %zmm6, %k1
+; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
+; AVX512VL-NEXT: vpermi2d %zmm0, %zmm1, %zmm4
+; AVX512VL-NEXT: vptestmd %zmm4, %zmm4, %k1
; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VL-NEXT: vpblendvb %ymm0, %ymm4, %ymm5, %ymm1
-; AVX512VL-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm0
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; AVX512VL-NEXT: vpternlogq $202, %zmm3, %zmm2, %zmm0
; AVX512VL-NEXT: retq
;
; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16_split:
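; NOTE: vpternlogq with imm8 202 (0xCA) computes the bitwise select
; A ? B : C of its three operands, so a single 512-bit vpternlogq replaces
; the previous pair of 256-bit vpblendvb ops; the vinserti64x4 of %ymm0 into
; %zmm0 just before it broadcasts the 256-bit condition mask into both
; halves. Because the compare results are all-ones/all-zeros per element,
; the bitwise select is equivalent to a vector select.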
define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
; AVX512CD-LABEL: testv32i16:
; AVX512CD: # %bb.0:
-; AVX512CD-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512CD-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512CD-NEXT: vpaddw %ymm2, %ymm1, %ymm3
-; AVX512CD-NEXT: vpandn %ymm3, %ymm1, %ymm1
+; AVX512CD-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512CD-NEXT: vpaddw %ymm1, %ymm0, %ymm2
+; AVX512CD-NEXT: vpandn %ymm2, %ymm0, %ymm2
; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512CD-NEXT: vpand %ymm3, %ymm1, %ymm4
+; AVX512CD-NEXT: vpand %ymm3, %ymm2, %ymm4
; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CD-NEXT: vpshufb %ymm4, %ymm5, %ymm4
-; AVX512CD-NEXT: vpsrlw $4, %ymm1, %ymm1
-; AVX512CD-NEXT: vpand %ymm3, %ymm1, %ymm1
-; AVX512CD-NEXT: vpshufb %ymm1, %ymm5, %ymm1
-; AVX512CD-NEXT: vpaddb %ymm4, %ymm1, %ymm1
-; AVX512CD-NEXT: vpsllw $8, %ymm1, %ymm4
-; AVX512CD-NEXT: vpaddb %ymm1, %ymm4, %ymm1
-; AVX512CD-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX512CD-NEXT: vpaddw %ymm2, %ymm0, %ymm2
-; AVX512CD-NEXT: vpandn %ymm2, %ymm0, %ymm0
-; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm2
+; AVX512CD-NEXT: vpsrlw $4, %ymm2, %ymm2
+; AVX512CD-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX512CD-NEXT: vpshufb %ymm2, %ymm5, %ymm2
+; AVX512CD-NEXT: vpaddb %ymm4, %ymm2, %ymm2
+; AVX512CD-NEXT: vpsllw $8, %ymm2, %ymm4
+; AVX512CD-NEXT: vpaddb %ymm2, %ymm4, %ymm2
+; AVX512CD-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX512CD-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; AVX512CD-NEXT: vpaddw %ymm1, %ymm0, %ymm1
+; AVX512CD-NEXT: vpandn %ymm1, %ymm0, %ymm0
+; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm1
+; AVX512CD-NEXT: vpshufb %ymm1, %ymm5, %ymm1
; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX512CD-NEXT: vpshufb %ymm0, %ymm5, %ymm0
-; AVX512CD-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; AVX512CD-NEXT: vpsllw $8, %ymm0, %ymm2
-; AVX512CD-NEXT: vpaddb %ymm0, %ymm2, %ymm0
+; AVX512CD-NEXT: vpaddb %ymm1, %ymm0, %ymm0
+; AVX512CD-NEXT: vpsllw $8, %ymm0, %ymm1
+; AVX512CD-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512CD-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512CD-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512CD-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
; AVX512CD-NEXT: retq
;
; AVX512CDBW-LABEL: testv32i16:
;
; AVX512VPOPCNTDQ-LABEL: testv32i16:
; AVX512VPOPCNTDQ: # %bb.0:
-; AVX512VPOPCNTDQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm2, %ymm1, %ymm3
-; AVX512VPOPCNTDQ-NEXT: vpandn %ymm3, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
-; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm1, %zmm1
-; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm2, %ymm0, %ymm2
-; AVX512VPOPCNTDQ-NEXT: vpandn %ymm2, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm1, %ymm0, %ymm2
+; AVX512VPOPCNTDQ-NEXT: vpandn %ymm2, %ymm0, %ymm2
+; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm2, %zmm2
+; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm2, %ymm2
+; AVX512VPOPCNTDQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm1, %ymm0, %ymm1
+; AVX512VPOPCNTDQ-NEXT: vpandn %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
; AVX512VPOPCNTDQ-NEXT: retq
;
; BITALG-LABEL: testv32i16:
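; NOTE: the cttz lowering above first forms (x - 1) & ~x (vpaddw with the
; all-ones vector, then vpandn), which sets exactly the bits below the
; lowest set bit of x; counting those bits gives the trailing-zero count.
; AVX512CD counts them with the nibble-LUT vpshufb popcount, while
; AVX512VPOPCNTDQ zero-extends to i32 and uses vpopcntd directly.
; Minimal sketch of the tested operation (illustrative only):
declare <32 x i16> @llvm.cttz.v32i16(<32 x i16>, i1)
define <32 x i16> @cttz_sketch(<32 x i16> %x) {
  %r = call <32 x i16> @llvm.cttz.v32i16(<32 x i16> %x, i1 false)
  ret <32 x i16> %r
}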
define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind {
; AVX512CD-LABEL: testv32i16u:
; AVX512CD: # %bb.0:
-; AVX512CD-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512CD-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512CD-NEXT: vpaddw %ymm2, %ymm1, %ymm3
-; AVX512CD-NEXT: vpandn %ymm3, %ymm1, %ymm1
+; AVX512CD-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512CD-NEXT: vpaddw %ymm1, %ymm0, %ymm2
+; AVX512CD-NEXT: vpandn %ymm2, %ymm0, %ymm2
; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512CD-NEXT: vpand %ymm3, %ymm1, %ymm4
+; AVX512CD-NEXT: vpand %ymm3, %ymm2, %ymm4
; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CD-NEXT: vpshufb %ymm4, %ymm5, %ymm4
-; AVX512CD-NEXT: vpsrlw $4, %ymm1, %ymm1
-; AVX512CD-NEXT: vpand %ymm3, %ymm1, %ymm1
-; AVX512CD-NEXT: vpshufb %ymm1, %ymm5, %ymm1
-; AVX512CD-NEXT: vpaddb %ymm4, %ymm1, %ymm1
-; AVX512CD-NEXT: vpsllw $8, %ymm1, %ymm4
-; AVX512CD-NEXT: vpaddb %ymm1, %ymm4, %ymm1
-; AVX512CD-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX512CD-NEXT: vpaddw %ymm2, %ymm0, %ymm2
-; AVX512CD-NEXT: vpandn %ymm2, %ymm0, %ymm0
-; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm2
+; AVX512CD-NEXT: vpsrlw $4, %ymm2, %ymm2
+; AVX512CD-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX512CD-NEXT: vpshufb %ymm2, %ymm5, %ymm2
+; AVX512CD-NEXT: vpaddb %ymm4, %ymm2, %ymm2
+; AVX512CD-NEXT: vpsllw $8, %ymm2, %ymm4
+; AVX512CD-NEXT: vpaddb %ymm2, %ymm4, %ymm2
+; AVX512CD-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX512CD-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; AVX512CD-NEXT: vpaddw %ymm1, %ymm0, %ymm1
+; AVX512CD-NEXT: vpandn %ymm1, %ymm0, %ymm0
+; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm1
+; AVX512CD-NEXT: vpshufb %ymm1, %ymm5, %ymm1
; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX512CD-NEXT: vpshufb %ymm0, %ymm5, %ymm0
-; AVX512CD-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; AVX512CD-NEXT: vpsllw $8, %ymm0, %ymm2
-; AVX512CD-NEXT: vpaddb %ymm0, %ymm2, %ymm0
+; AVX512CD-NEXT: vpaddb %ymm1, %ymm0, %ymm0
+; AVX512CD-NEXT: vpsllw $8, %ymm0, %ymm1
+; AVX512CD-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512CD-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512CD-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512CD-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
; AVX512CD-NEXT: retq
;
; AVX512CDBW-LABEL: testv32i16u:
;
; AVX512VPOPCNTDQ-LABEL: testv32i16u:
; AVX512VPOPCNTDQ: # %bb.0:
-; AVX512VPOPCNTDQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm2, %ymm1, %ymm3
-; AVX512VPOPCNTDQ-NEXT: vpandn %ymm3, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
-; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm1, %zmm1
-; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm2, %ymm0, %ymm2
-; AVX512VPOPCNTDQ-NEXT: vpandn %ymm2, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm1, %ymm0, %ymm2
+; AVX512VPOPCNTDQ-NEXT: vpandn %ymm2, %ymm0, %ymm2
+; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm2, %zmm2
+; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm2, %ymm2
+; AVX512VPOPCNTDQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm1, %ymm0, %ymm1
+; AVX512VPOPCNTDQ-NEXT: vpandn %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
; AVX512VPOPCNTDQ-NEXT: retq
;
; BITALG-LABEL: testv32i16u:
;
; AVX512F-LABEL: zext_32i8_to_32i16:
; AVX512F: # %bb.0: # %entry
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: zext_32i8_to_32i16:
;
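; NOTE: the abs tests below also assert the instruction encodings; the third
; EVEX byte of the vinserti64x4 line changing from 0xfd to 0xf5 reflects the
; inverted vvvv field now naming %zmm1 rather than %zmm0 as the register
; source.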
; AVX512F-LABEL: test_abs_lt_v64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 # encoding: [0x62,0xf3,0xfd,0x48,0x3b,0xc1,0x01]
-; AVX512F-NEXT: vpabsb %ymm1, %ymm1 # encoding: [0xc4,0xe2,0x7d,0x1c,0xc9]
+; AVX512F-NEXT: vpabsb %ymm0, %ymm1 # encoding: [0xc4,0xe2,0x7d,0x1c,0xc8]
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 # encoding: [0x62,0xf3,0xfd,0x48,0x3b,0xc0,0x01]
; AVX512F-NEXT: vpabsb %ymm0, %ymm0 # encoding: [0xc4,0xe2,0x7d,0x1c,0xc0]
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0x3a,0xc1,0x01]
+; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 # encoding: [0x62,0xf3,0xf5,0x48,0x3a,0xc0,0x01]
; AVX512F-NEXT: retq # encoding: [0xc3]
;
; AVX512BW-LABEL: test_abs_lt_v64i8:
;
; AVX512F-LABEL: test_abs_gt_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 # encoding: [0x62,0xf3,0xfd,0x48,0x3b,0xc1,0x01]
-; AVX512F-NEXT: vpabsw %ymm1, %ymm1 # encoding: [0xc4,0xe2,0x7d,0x1d,0xc9]
+; AVX512F-NEXT: vpabsw %ymm0, %ymm1 # encoding: [0xc4,0xe2,0x7d,0x1d,0xc8]
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 # encoding: [0x62,0xf3,0xfd,0x48,0x3b,0xc0,0x01]
; AVX512F-NEXT: vpabsw %ymm0, %ymm0 # encoding: [0xc4,0xe2,0x7d,0x1d,0xc0]
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0x3a,0xc1,0x01]
+; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 # encoding: [0x62,0xf3,0xf5,0x48,0x3a,0xc0,0x01]
; AVX512F-NEXT: retq # encoding: [0xc3]
;
; AVX512BW-LABEL: test_abs_gt_v32i16: