Noticed in D150143/D150526 - we currently create scalar Constant values using the broadcast instruction width, which can be wider than the original build vector width, making it hard to recognise the original per-element constant data.
If we have widened the broadcast value, it's much more useful for the asm comments if we instead create a ConstantVector with the original element data, add that to the constant pool and load it with the same (wider) broadcast instruction.
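For context, the relationship the new comments expose: the widened broadcast scalar is simply the original narrow elements packed little-endian, and APInt::extractBits recovers them (mirroring what getConstantVector does below). A minimal standalone sketch, illustrative only and not part of the patch:

// Decode a widened i64 splat back into its original i8 elements.
// 0x0706050403020100 (506097522914230528) is the value that previously
// appeared verbatim in the broadcast asm comments.
#include "llvm/ADT/APInt.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

int main() {
  APInt Splat(64, 0x0706050403020100ULL); // widened broadcast value
  unsigned EltBits = 8;                   // original build vector element width
  for (unsigned I = 0, E = 64 / EltBits; I != E; ++I) {
    APInt Elt = Splat.extractBits(EltBits, I * EltBits);
    outs() << Elt.getZExtValue() << (I + 1 == E ? "\n" : ",");
  }
  return 0; // prints: 0,1,2,3,4,5,6,7
}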
Mask = CFP->getValueAPF().bitcastToAPInt();
return true;
}
+ if (auto *CDS = dyn_cast<ConstantDataSequential>(Cst)) {
+ Type *Ty = CDS->getType();
+ Mask = APInt::getZero(Ty->getPrimitiveSizeInBits());
+ unsigned EltBits = CDS->getElementType()->getPrimitiveSizeInBits();
+ for (unsigned I = 0, E = CDS->getNumElements(); I != E; ++I)
+ Mask.insertBits(CDS->getElementAsAPInt(I), I * EltBits);
+ return true;
+ }
return false;
};
if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD &&
EltSizeInBits <= VT.getScalarSizeInBits()) {
auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
- if (MemIntr->getMemoryVT().getScalarSizeInBits() != VT.getScalarSizeInBits())
+ if (MemIntr->getMemoryVT().getStoreSizeInBits() != VT.getScalarSizeInBits())
return false;
SDValue Ptr = MemIntr->getBasePtr();
if (const Constant *C = getTargetConstantFromBasePtr(Ptr)) {
- unsigned SrcEltSizeInBits = C->getType()->getScalarSizeInBits();
+ unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
APInt UndefSrcElts(NumSrcElts, 0);
static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
unsigned SplatBitSize, LLVMContext &C) {
unsigned ScalarSize = VT.getScalarSizeInBits();
- unsigned NumElm = SplatBitSize / ScalarSize;
- SmallVector<Constant *, 32> ConstantVec;
- for (unsigned i = 0; i < NumElm; i++) {
- APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i);
- Constant *Const;
+ auto getConstantScalar = [&](const APInt &Val) -> Constant * {
if (VT.isFloatingPoint()) {
- if (ScalarSize == 16) {
- Const = ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
- } else if (ScalarSize == 32) {
- Const = ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
- } else {
- assert(ScalarSize == 64 && "Unsupported floating point scalar size");
- Const = ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
- }
- } else
- Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
- ConstantVec.push_back(Const);
+ if (ScalarSize == 16)
+ return ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
+ if (ScalarSize == 32)
+ return ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
+ assert(ScalarSize == 64 && "Unsupported floating point scalar size");
+ return ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
+ }
+ return Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
+ };
+
+ if (ScalarSize == SplatBitSize)
+ return getConstantScalar(SplatValue);
+
+ unsigned NumElm = SplatBitSize / ScalarSize;
+ SmallVector<Constant *, 32> ConstantVec;
+ for (unsigned I = 0; I != NumElm; ++I) {
+ APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * I);
+ ConstantVec.push_back(getConstantScalar(Val));
}
return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
}
if (Subtarget.hasAVX()) {
if (SplatBitSize == 32 || SplatBitSize == 64 ||
(SplatBitSize < 32 && Subtarget.hasAVX2())) {
- // Splatted value can fit in one INTEGER constant in constant pool.
- // Load the constant and broadcast it.
+ // Load the constant scalar/subvector and broadcast it.
MVT CVT = MVT::getIntegerVT(SplatBitSize);
- Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
- Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
+ Constant *C = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx);
SDValue CP = DAG.getConstantPool(C, PVT);
unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
printConstant(CI->getValue(), CS);
} else if (auto *CF = dyn_cast<ConstantFP>(COp)) {
printConstant(CF->getValueAPF(), CS);
+ } else if (auto *CDS = dyn_cast<ConstantDataSequential>(COp)) {
+ Type *EltTy = CDS->getElementType();
+ bool IsInteger = EltTy->isIntegerTy();
+ bool IsFP = EltTy->isHalfTy() || EltTy->isFloatTy() || EltTy->isDoubleTy();
+ for (unsigned I = 0, E = CDS->getNumElements(); I != E; ++I) {
+ if (I != 0)
+ CS << ",";
+ if (IsInteger)
+ printConstant(CDS->getElementAsAPInt(I), CS);
+ else if (IsFP)
+ printConstant(CDS->getElementAsAPFloat(I), CS);
+ else
+ CS << "?";
+ }
} else {
CS << "?";
}
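The test diffs below show the effect: broadcast constant comments now print the original narrow-element data instead of the widened integer value, e.g. (from the avg_v32i8_const update further down):

-; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = [506097522914230528,506097522914230528]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]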
; AVX512F-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm0
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294967040,4294967040,4294967040,4294967040,4294967040,4294967040,4294967040,4294967040]
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
; AVX512F-NEXT: vpternlogd $202, (%rdi){1to8}, %ymm0, %ymm1
; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512DQ-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm0
-; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294967040,4294967040,4294967040,4294967040,4294967040,4294967040,4294967040,4294967040]
+; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} ymm1 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
; AVX512DQ-NEXT: vpternlogd $202, (%rdi){1to8}, %ymm0, %ymm1
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512F-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm0
-; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360]
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpternlogq $202, (%rdi){1to4}, %ymm0, %ymm1
; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512DQ-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm0
-; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360]
+; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; AVX512DQ-NEXT: vpternlogq $202, (%rdi){1to4}, %ymm0, %ymm1
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512F-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm0
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294967040,4294967040,4294967040,4294967040,4294967040,4294967040,4294967040,4294967040]
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
; AVX512F-NEXT: vpternlogd $202, (%rdi){1to8}, %ymm0, %ymm1
; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; AVX512DQ-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm0
-; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294967040,4294967040,4294967040,4294967040,4294967040,4294967040,4294967040,4294967040]
+; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} ymm1 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
; AVX512DQ-NEXT: vpternlogd $202, (%rdi){1to8}, %ymm0, %ymm1
; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm0
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; AVX512F-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm0
-; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360]
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpternlogq $202, (%rdi){1to4}, %ymm0, %ymm1
; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; AVX512DQ-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm0
-; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360]
+; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; AVX512DQ-NEXT: vpternlogq $202, (%rdi){1to4}, %ymm0, %ymm1
; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm0
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
;
; AVX1-LABEL: avg_v32i8_const:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = [506097522914230528,506097522914230528]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
; AVX1-NEXT: # xmm0 = mem[0,0]
; AVX1-NEXT: vpavgb (%rdi), %xmm0, %xmm1
; AVX1-NEXT: vpavgb 16(%rdi), %xmm0, %xmm0
;
; AVX1-LABEL: avg_v64i8_const:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = [506097522914230528,506097522914230528]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
; AVX1-NEXT: # xmm0 = mem[0,0]
; AVX1-NEXT: vpavgb (%rdi), %xmm0, %xmm1
; AVX1-NEXT: vpavgb 16(%rdi), %xmm0, %xmm2
;
; AVX2-LABEL: avg_v64i8_const:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm0 = [506097522914230528,506097522914230528,506097522914230528,506097522914230528]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
; AVX2-NEXT: vpavgb (%rdi), %ymm0, %ymm1
; AVX2-NEXT: vpavgb 32(%rdi), %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
;
; AVX512F-LABEL: avg_v64i8_const:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm0 = [506097522914230528,506097522914230528,506097522914230528,506097522914230528]
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
; AVX512F-NEXT: vpavgb (%rdi), %ymm0, %ymm1
; AVX512F-NEXT: vpavgb 32(%rdi), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4 x float> %vec2, <4 x float> %mask) {
; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovddup {{.*#+}} xmm2 = [60129542148,60129542148]
+; CHECK-NEXT: vmovddup {{.*#+}} xmm2 = [4,14,4,14]
; CHECK-NEXT: # xmm2 = mem[0,0]
; CHECK-NEXT: vmovaps 32(%rdi), %ymm3
; CHECK-NEXT: vpermt2ps (%rdi), %ymm2, %ymm3
define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4 x float> %mask) {
; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovddup {{.*#+}} xmm2 = [60129542148,60129542148]
+; CHECK-NEXT: vmovddup {{.*#+}} xmm2 = [4,14,4,14]
; CHECK-NEXT: # xmm2 = mem[0,0]
; CHECK-NEXT: vmovaps 32(%rdi), %ymm1
; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
; AVX1: # %bb.0:
; AVX1-NEXT: vmovd %edi, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
-; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [9241421688590303745,9241421688590303745]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
; AVX2: # %bb.0:
; AVX2-NEXT: vmovd %edi, %xmm0
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
-; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [9241421688590303745,9241421688590303745]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9241421688590303745,9241421688590303745]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vmovd %edi, %xmm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19]
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[2,2,3,3,4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
-; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpcmpeqb %xmm2, %xmm3, %xmm3
; AVX2-NEXT: vmovq %rdi, %xmm0
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19]
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,22,22,22,22,22,22,22,22,23,23,23,23,23,23,23,23]
; AVX1: # %bb.0:
; AVX1-NEXT: vmovd %edi, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
-; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [9241421688590303745,9241421688590303745]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
; AVX2: # %bb.0:
; AVX2-NEXT: vmovd %edi, %xmm0
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
-; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [9241421688590303745,9241421688590303745]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpsrlw $7, %xmm0, %xmm0
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9241421688590303745,9241421688590303745]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $7, %xmm1, %xmm1
; AVX2-NEXT: vmovd %edi, %xmm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19]
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpsrlw $7, %ymm0, %ymm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[2,2,3,3,4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
-; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpcmpeqb %xmm2, %xmm3, %xmm3
; AVX2-NEXT: vmovq %rdi, %xmm0
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19]
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsrlw $7, %ymm0, %ymm0
; AVX1: # %bb.0:
; AVX1-NEXT: vmovd %edi, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
-; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [9241421688590303745,9241421688590303745]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
; AVX2: # %bb.0:
; AVX2-NEXT: vmovd %edi, %xmm0
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
-; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [9241421688590303745,9241421688590303745]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpsrlw $7, %xmm0, %xmm0
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9241421688590303745,9241421688590303745]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $7, %xmm1, %xmm1
; AVX2-NEXT: vmovd %edi, %xmm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19]
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpsrlw $7, %ymm0, %ymm0
;
; ALL32-LABEL: f16xi8_i16:
; ALL32: # %bb.0:
-; ALL32-NEXT: vpbroadcastw {{.*#+}} xmm1 = [256,256,256,256,256,256,256,256]
+; ALL32-NEXT: vpbroadcastw {{.*#+}} xmm1 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; ALL32-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; ALL32-NEXT: vpand %xmm1, %xmm0, %xmm0
; ALL32-NEXT: retl
;
; ALL64-LABEL: f16xi8_i16:
; ALL64: # %bb.0:
-; ALL64-NEXT: vpbroadcastw {{.*#+}} xmm1 = [256,256,256,256,256,256,256,256]
+; ALL64-NEXT: vpbroadcastw {{.*#+}} xmm1 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; ALL64-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; ALL64-NEXT: vpand %xmm1, %xmm0, %xmm0
; ALL64-NEXT: retq
define <16 x i8> @f16xi8_i32(<16 x i8> %a) {
; AVX-LABEL: f16xi8_i32:
; AVX: # %bb.0:
-; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [50462976,50462976,50462976,50462976]
+; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: retl
;
; ALL32-LABEL: f16xi8_i32:
; ALL32: # %bb.0:
-; ALL32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [50462976,50462976,50462976,50462976]
+; ALL32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; ALL32-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; ALL32-NEXT: vpand %xmm1, %xmm0, %xmm0
; ALL32-NEXT: retl
;
; AVX-64-LABEL: f16xi8_i32:
; AVX-64: # %bb.0:
-; AVX-64-NEXT: vbroadcastss {{.*#+}} xmm1 = [50462976,50462976,50462976,50462976]
+; AVX-64-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX-64-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX-64-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-64-NEXT: retq
;
; ALL64-LABEL: f16xi8_i32:
; ALL64: # %bb.0:
-; ALL64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [50462976,50462976,50462976,50462976]
+; ALL64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; ALL64-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; ALL64-NEXT: vpand %xmm1, %xmm0, %xmm0
; ALL64-NEXT: retq
define <16 x i8> @f16xi8_i64(<16 x i8> %a) {
; AVX-LABEL: f16xi8_i64:
; AVX: # %bb.0:
-; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [506097522914230528,506097522914230528]
+; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
; AVX-NEXT: # xmm1 = mem[0,0]
; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
;
; ALL32-LABEL: f16xi8_i64:
; ALL32: # %bb.0:
-; ALL32-NEXT: vpbroadcastq {{.*#+}} xmm1 = [506097522914230528,506097522914230528]
+; ALL32-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
; ALL32-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; ALL32-NEXT: vpand %xmm1, %xmm0, %xmm0
; ALL32-NEXT: retl
;
; AVX-64-LABEL: f16xi8_i64:
; AVX-64: # %bb.0:
-; AVX-64-NEXT: vmovddup {{.*#+}} xmm1 = [506097522914230528,506097522914230528]
+; AVX-64-NEXT: vmovddup {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
; AVX-64-NEXT: # xmm1 = mem[0,0]
; AVX-64-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX-64-NEXT: vpand %xmm1, %xmm0, %xmm0
;
; ALL64-LABEL: f16xi8_i64:
; ALL64: # %bb.0:
-; ALL64-NEXT: vpbroadcastq {{.*#+}} xmm1 = [506097522914230528,506097522914230528]
+; ALL64-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
; ALL64-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; ALL64-NEXT: vpand %xmm1, %xmm0, %xmm0
; ALL64-NEXT: retq
;
; ALL32-LABEL: f32xi8_i16:
; ALL32: # %bb.0:
-; ALL32-NEXT: vpbroadcastw {{.*#+}} ymm1 = [256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256]
+; ALL32-NEXT: vpbroadcastw {{.*#+}} ymm1 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; ALL32-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0
; ALL32-NEXT: retl
;
; ALL64-LABEL: f32xi8_i16:
; ALL64: # %bb.0:
-; ALL64-NEXT: vpbroadcastw {{.*#+}} ymm1 = [256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256]
+; ALL64-NEXT: vpbroadcastw {{.*#+}} ymm1 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; ALL64-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0
; ALL64-NEXT: retq
; AVX-LABEL: f32xi8_i32:
; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [50462976,50462976,50462976,50462976]
+; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
;
; ALL32-LABEL: f32xi8_i32:
; ALL32: # %bb.0:
-; ALL32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976]
+; ALL32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; ALL32-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0
; ALL32-NEXT: retl
; AVX-64-LABEL: f32xi8_i32:
; AVX-64: # %bb.0:
; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-64-NEXT: vbroadcastss {{.*#+}} xmm2 = [50462976,50462976,50462976,50462976]
+; AVX-64-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX-64-NEXT: vpaddb %xmm2, %xmm1, %xmm1
; AVX-64-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
;
; ALL64-LABEL: f32xi8_i32:
; ALL64: # %bb.0:
-; ALL64-NEXT: vpbroadcastd {{.*#+}} ymm1 = [50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976]
+; ALL64-NEXT: vpbroadcastd {{.*#+}} ymm1 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; ALL64-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0
; ALL64-NEXT: retq
; AVX-LABEL: f32xi8_i64:
; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT: vmovddup {{.*#+}} xmm2 = [506097522914230528,506097522914230528]
+; AVX-NEXT: vmovddup {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
; AVX-NEXT: # xmm2 = mem[0,0]
; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
;
; ALL32-LABEL: f32xi8_i64:
; ALL32: # %bb.0:
-; ALL32-NEXT: vpbroadcastq {{.*#+}} ymm1 = [506097522914230528,506097522914230528,506097522914230528,506097522914230528]
+; ALL32-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
; ALL32-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0
; ALL32-NEXT: retl
; AVX-64-LABEL: f32xi8_i64:
; AVX-64: # %bb.0:
; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-64-NEXT: vmovddup {{.*#+}} xmm2 = [506097522914230528,506097522914230528]
+; AVX-64-NEXT: vmovddup {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
; AVX-64-NEXT: # xmm2 = mem[0,0]
; AVX-64-NEXT: vpaddb %xmm2, %xmm1, %xmm1
; AVX-64-NEXT: vpaddb %xmm2, %xmm0, %xmm0
;
; ALL64-LABEL: f32xi8_i64:
; ALL64: # %bb.0:
-; ALL64-NEXT: vpbroadcastq {{.*#+}} ymm1 = [506097522914230528,506097522914230528,506097522914230528,506097522914230528]
+; ALL64-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
; ALL64-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0
; ALL64-NEXT: retq
;
; AVX2-LABEL: f64xi8_i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX2-NEXT: vpaddb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
;
; AVX512BW-LABEL: f64xi8_i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm1 = [256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm1 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retl
;
; AVX2-64-LABEL: f64xi8_i16:
; AVX2-64: # %bb.0:
-; AVX2-64-NEXT: vpbroadcastw {{.*#+}} ymm2 = [256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256]
+; AVX2-64-NEXT: vpbroadcastw {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX2-64-NEXT: vpaddb %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT: vpand %ymm2, %ymm0, %ymm0
;
; AVX512BW-64-LABEL: f64xi8_i16:
; AVX512BW-64: # %bb.0:
-; AVX512BW-64-NEXT: vpbroadcastw {{.*#+}} zmm1 = [256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256]
+; AVX512BW-64-NEXT: vpbroadcastw {{.*#+}} zmm1 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX512BW-64-NEXT: vpaddb %zmm1, %zmm0, %zmm0
; AVX512BW-64-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-64-NEXT: retq
define <64 x i8> @f64i8_i32(<64 x i8> %a) {
; AVX-LABEL: f64i8_i32:
; AVX: # %bb.0:
-; AVX-NEXT: vbroadcastss {{.*#+}} ymm2 = [50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976]
+; AVX-NEXT: vbroadcastss {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX-NEXT: vpaddb %xmm2, %xmm3, %xmm3
; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1
;
; AVX2-LABEL: f64i8_i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX2-NEXT: vpaddb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
;
; AVX512BW-LABEL: f64i8_i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} zmm1 = [50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976]
+; AVX512BW-NEXT: vpbroadcastd {{.*#+}} zmm1 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retl
;
; AVX-64-LABEL: f64i8_i32:
; AVX-64: # %bb.0:
-; AVX-64-NEXT: vbroadcastss {{.*#+}} ymm2 = [50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976]
+; AVX-64-NEXT: vbroadcastss {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX-64-NEXT: vpaddb %xmm2, %xmm3, %xmm3
; AVX-64-NEXT: vpaddb %xmm2, %xmm1, %xmm1
;
; AVX2-64-LABEL: f64i8_i32:
; AVX2-64: # %bb.0:
-; AVX2-64-NEXT: vpbroadcastd {{.*#+}} ymm2 = [50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976]
+; AVX2-64-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX2-64-NEXT: vpaddb %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT: vpand %ymm2, %ymm0, %ymm0
;
; AVX512BW-64-LABEL: f64i8_i32:
; AVX512BW-64: # %bb.0:
-; AVX512BW-64-NEXT: vpbroadcastd {{.*#+}} zmm1 = [50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976]
+; AVX512BW-64-NEXT: vpbroadcastd {{.*#+}} zmm1 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-64-NEXT: vpaddb %zmm1, %zmm0, %zmm0
; AVX512BW-64-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-64-NEXT: retq
define <64 x i8> @f64xi8_i64(<64 x i8> %a) {
; AVX-LABEL: f64xi8_i64:
; AVX: # %bb.0:
-; AVX-NEXT: vbroadcastsd {{.*#+}} ymm2 = [506097522914230528,506097522914230528,506097522914230528,506097522914230528]
+; AVX-NEXT: vbroadcastsd {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX-NEXT: vpaddb %xmm2, %xmm3, %xmm3
; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1
;
; AVX2-LABEL: f64xi8_i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [506097522914230528,506097522914230528,506097522914230528,506097522914230528]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
; AVX2-NEXT: vpaddb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
;
; AVX512BW-LABEL: f64xi8_i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm1 = [506097522914230528,506097522914230528,506097522914230528,506097522914230528,506097522914230528,506097522914230528,506097522914230528,506097522914230528]
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retl
;
; AVX-64-LABEL: f64xi8_i64:
; AVX-64: # %bb.0:
-; AVX-64-NEXT: vbroadcastsd {{.*#+}} ymm2 = [506097522914230528,506097522914230528,506097522914230528,506097522914230528]
+; AVX-64-NEXT: vbroadcastsd {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX-64-NEXT: vpaddb %xmm2, %xmm3, %xmm3
; AVX-64-NEXT: vpaddb %xmm2, %xmm1, %xmm1
;
; AVX2-64-LABEL: f64xi8_i64:
; AVX2-64: # %bb.0:
-; AVX2-64-NEXT: vpbroadcastq {{.*#+}} ymm2 = [506097522914230528,506097522914230528,506097522914230528,506097522914230528]
+; AVX2-64-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
; AVX2-64-NEXT: vpaddb %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT: vpand %ymm2, %ymm0, %ymm0
;
; AVX512BW-64-LABEL: f64xi8_i64:
; AVX512BW-64: # %bb.0:
-; AVX512BW-64-NEXT: vpbroadcastq {{.*#+}} zmm1 = [506097522914230528,506097522914230528,506097522914230528,506097522914230528,506097522914230528,506097522914230528,506097522914230528,506097522914230528]
+; AVX512BW-64-NEXT: vpbroadcastq {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
; AVX512BW-64-NEXT: vpaddb %zmm1, %zmm0, %zmm0
; AVX512BW-64-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-64-NEXT: retq
define <8 x i16> @f8xi16_i32(<8 x i16> %a) {
; AVX-LABEL: f8xi16_i32:
; AVX: # %bb.0:
-; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [65536,65536,65536,65536]
+; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,1,0,1,0,1,0,1]
; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: retl
;
; ALL32-LABEL: f8xi16_i32:
; ALL32: # %bb.0:
-; ALL32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [65536,65536,65536,65536]
+; ALL32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,1,0,1,0,1,0,1]
; ALL32-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; ALL32-NEXT: vpand %xmm1, %xmm0, %xmm0
; ALL32-NEXT: retl
;
; AVX-64-LABEL: f8xi16_i32:
; AVX-64: # %bb.0:
-; AVX-64-NEXT: vbroadcastss {{.*#+}} xmm1 = [65536,65536,65536,65536]
+; AVX-64-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,1,0,1,0,1,0,1]
; AVX-64-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX-64-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-64-NEXT: retq
;
; ALL64-LABEL: f8xi16_i32:
; ALL64: # %bb.0:
-; ALL64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [65536,65536,65536,65536]
+; ALL64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,1,0,1,0,1,0,1]
; ALL64-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; ALL64-NEXT: vpand %xmm1, %xmm0, %xmm0
; ALL64-NEXT: retq
define <8 x i16> @f8xi16_i64(<8 x i16> %a) {
; AVX-LABEL: f8xi16_i64:
; AVX: # %bb.0:
-; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [844433520132096,844433520132096]
+; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [0,1,2,3,0,1,2,3]
; AVX-NEXT: # xmm1 = mem[0,0]
; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
;
; ALL32-LABEL: f8xi16_i64:
; ALL32: # %bb.0:
-; ALL32-NEXT: vpbroadcastq {{.*#+}} xmm1 = [844433520132096,844433520132096]
+; ALL32-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,1,2,3,0,1,2,3]
; ALL32-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; ALL32-NEXT: vpand %xmm1, %xmm0, %xmm0
; ALL32-NEXT: retl
;
; AVX-64-LABEL: f8xi16_i64:
; AVX-64: # %bb.0:
-; AVX-64-NEXT: vmovddup {{.*#+}} xmm1 = [844433520132096,844433520132096]
+; AVX-64-NEXT: vmovddup {{.*#+}} xmm1 = [0,1,2,3,0,1,2,3]
; AVX-64-NEXT: # xmm1 = mem[0,0]
; AVX-64-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX-64-NEXT: vpand %xmm1, %xmm0, %xmm0
;
; ALL64-LABEL: f8xi16_i64:
; ALL64: # %bb.0:
-; ALL64-NEXT: vpbroadcastq {{.*#+}} xmm1 = [844433520132096,844433520132096]
+; ALL64-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,1,2,3,0,1,2,3]
; ALL64-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; ALL64-NEXT: vpand %xmm1, %xmm0, %xmm0
; ALL64-NEXT: retq
; AVX-LABEL: f16xi16_i32:
; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [65536,65536,65536,65536]
+; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1]
; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpaddw %xmm2, %xmm0, %xmm0
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
;
; ALL32-LABEL: f16xi16_i32:
; ALL32: # %bb.0:
-; ALL32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [65536,65536,65536,65536,65536,65536,65536,65536]
+; ALL32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; ALL32-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0
; ALL32-NEXT: retl
; AVX-64-LABEL: f16xi16_i32:
; AVX-64: # %bb.0:
; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-64-NEXT: vbroadcastss {{.*#+}} xmm2 = [65536,65536,65536,65536]
+; AVX-64-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1]
; AVX-64-NEXT: vpaddw %xmm2, %xmm1, %xmm1
; AVX-64-NEXT: vpaddw %xmm2, %xmm0, %xmm0
; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
;
; ALL64-LABEL: f16xi16_i32:
; ALL64: # %bb.0:
-; ALL64-NEXT: vpbroadcastd {{.*#+}} ymm1 = [65536,65536,65536,65536,65536,65536,65536,65536]
+; ALL64-NEXT: vpbroadcastd {{.*#+}} ymm1 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; ALL64-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0
; ALL64-NEXT: retq
; AVX-LABEL: f16xi16_i64:
; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT: vmovddup {{.*#+}} xmm2 = [844433520132096,844433520132096]
+; AVX-NEXT: vmovddup {{.*#+}} xmm2 = [0,1,2,3,0,1,2,3]
; AVX-NEXT: # xmm2 = mem[0,0]
; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpaddw %xmm2, %xmm0, %xmm0
;
; ALL32-LABEL: f16xi16_i64:
; ALL32: # %bb.0:
-; ALL32-NEXT: vpbroadcastq {{.*#+}} ymm1 = [844433520132096,844433520132096,844433520132096,844433520132096]
+; ALL32-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; ALL32-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0
; ALL32-NEXT: retl
; AVX-64-LABEL: f16xi16_i64:
; AVX-64: # %bb.0:
; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-64-NEXT: vmovddup {{.*#+}} xmm2 = [844433520132096,844433520132096]
+; AVX-64-NEXT: vmovddup {{.*#+}} xmm2 = [0,1,2,3,0,1,2,3]
; AVX-64-NEXT: # xmm2 = mem[0,0]
; AVX-64-NEXT: vpaddw %xmm2, %xmm1, %xmm1
; AVX-64-NEXT: vpaddw %xmm2, %xmm0, %xmm0
;
; ALL64-LABEL: f16xi16_i64:
; ALL64: # %bb.0:
-; ALL64-NEXT: vpbroadcastq {{.*#+}} ymm1 = [844433520132096,844433520132096,844433520132096,844433520132096]
+; ALL64-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; ALL64-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0
; ALL64-NEXT: retq
define <32 x i16> @f32xi16_i32(<32 x i16> %a) {
; AVX-LABEL: f32xi16_i32:
; AVX: # %bb.0:
-; AVX-NEXT: vbroadcastss {{.*#+}} ymm2 = [65536,65536,65536,65536,65536,65536,65536,65536]
+; AVX-NEXT: vbroadcastss {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX-NEXT: vpaddw %xmm2, %xmm3, %xmm3
; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1
;
; AVX2-LABEL: f32xi16_i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65536,65536,65536,65536,65536,65536,65536,65536]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX2-NEXT: vpaddw %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpaddw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
;
; AVX512BW-LABEL: f32xi16_i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} zmm1 = [65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536]
+; AVX512BW-NEXT: vpbroadcastd {{.*#+}} zmm1 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retl
;
; AVX-64-LABEL: f32xi16_i32:
; AVX-64: # %bb.0:
-; AVX-64-NEXT: vbroadcastss {{.*#+}} ymm2 = [65536,65536,65536,65536,65536,65536,65536,65536]
+; AVX-64-NEXT: vbroadcastss {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX-64-NEXT: vpaddw %xmm2, %xmm3, %xmm3
; AVX-64-NEXT: vpaddw %xmm2, %xmm1, %xmm1
;
; AVX2-64-LABEL: f32xi16_i32:
; AVX2-64: # %bb.0:
-; AVX2-64-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65536,65536,65536,65536,65536,65536,65536,65536]
+; AVX2-64-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX2-64-NEXT: vpaddw %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT: vpaddw %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT: vpand %ymm2, %ymm0, %ymm0
;
; AVX512BW-64-LABEL: f32xi16_i32:
; AVX512BW-64: # %bb.0:
-; AVX512BW-64-NEXT: vpbroadcastd {{.*#+}} zmm1 = [65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536]
+; AVX512BW-64-NEXT: vpbroadcastd {{.*#+}} zmm1 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX512BW-64-NEXT: vpaddw %zmm1, %zmm0, %zmm0
; AVX512BW-64-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-64-NEXT: retq
define <32 x i16> @f32xi16_i64(<32 x i16> %a) {
; AVX-LABEL: f32xi16_i64:
; AVX: # %bb.0:
-; AVX-NEXT: vbroadcastsd {{.*#+}} ymm2 = [844433520132096,844433520132096,844433520132096,844433520132096]
+; AVX-NEXT: vbroadcastsd {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX-NEXT: vpaddw %xmm2, %xmm3, %xmm3
; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1
;
; AVX2-LABEL: f32xi16_i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [844433520132096,844433520132096,844433520132096,844433520132096]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX2-NEXT: vpaddw %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpaddw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
;
; AVX512BW-LABEL: f32xi16_i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm1 = [844433520132096,844433520132096,844433520132096,844433520132096,844433520132096,844433520132096,844433520132096,844433520132096]
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm1 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retl
;
; AVX-64-LABEL: f32xi16_i64:
; AVX-64: # %bb.0:
-; AVX-64-NEXT: vbroadcastsd {{.*#+}} ymm2 = [844433520132096,844433520132096,844433520132096,844433520132096]
+; AVX-64-NEXT: vbroadcastsd {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX-64-NEXT: vpaddw %xmm2, %xmm3, %xmm3
; AVX-64-NEXT: vpaddw %xmm2, %xmm1, %xmm1
;
; AVX2-64-LABEL: f32xi16_i64:
; AVX2-64: # %bb.0:
-; AVX2-64-NEXT: vpbroadcastq {{.*#+}} ymm2 = [844433520132096,844433520132096,844433520132096,844433520132096]
+; AVX2-64-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX2-64-NEXT: vpaddw %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT: vpaddw %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT: vpand %ymm2, %ymm0, %ymm0
;
; AVX512BW-64-LABEL: f32xi16_i64:
; AVX512BW-64: # %bb.0:
-; AVX512BW-64-NEXT: vpbroadcastq {{.*#+}} zmm1 = [844433520132096,844433520132096,844433520132096,844433520132096,844433520132096,844433520132096,844433520132096,844433520132096]
+; AVX512BW-64-NEXT: vpbroadcastq {{.*#+}} zmm1 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-64-NEXT: vpaddw %zmm1, %zmm0, %zmm0
; AVX512BW-64-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-64-NEXT: retq
define <4 x i32> @f4xi32_i64(<4 x i32> %a) {
; AVX-LABEL: f4xi32_i64:
; AVX: # %bb.0:
-; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [4294967296,4294967296]
+; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [0,1,0,1]
; AVX-NEXT: # xmm1 = mem[0,0]
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
;
; ALL32-LABEL: f4xi32_i64:
; ALL32: # %bb.0:
-; ALL32-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4294967296,4294967296]
+; ALL32-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,1,0,1]
; ALL32-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; ALL32-NEXT: vpand %xmm1, %xmm0, %xmm0
; ALL32-NEXT: retl
;
; AVX-64-LABEL: f4xi32_i64:
; AVX-64: # %bb.0:
-; AVX-64-NEXT: vmovddup {{.*#+}} xmm1 = [4294967296,4294967296]
+; AVX-64-NEXT: vmovddup {{.*#+}} xmm1 = [0,1,0,1]
; AVX-64-NEXT: # xmm1 = mem[0,0]
; AVX-64-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-64-NEXT: vpand %xmm1, %xmm0, %xmm0
;
; ALL64-LABEL: f4xi32_i64:
; ALL64: # %bb.0:
-; ALL64-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4294967296,4294967296]
+; ALL64-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,1,0,1]
; ALL64-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; ALL64-NEXT: vpand %xmm1, %xmm0, %xmm0
; ALL64-NEXT: retq
; AVX-LABEL: f8xi32_i64:
; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT: vmovddup {{.*#+}} xmm2 = [4294967296,4294967296]
+; AVX-NEXT: vmovddup {{.*#+}} xmm2 = [0,1,0,1]
; AVX-NEXT: # xmm2 = mem[0,0]
; AVX-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpaddd %xmm2, %xmm0, %xmm0
;
; ALL32-LABEL: f8xi32_i64:
; ALL32: # %bb.0:
-; ALL32-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967296,4294967296,4294967296,4294967296]
+; ALL32-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,1,0,1,0,1,0,1]
; ALL32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0
; ALL32-NEXT: retl
; AVX-64-LABEL: f8xi32_i64:
; AVX-64: # %bb.0:
; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-64-NEXT: vmovddup {{.*#+}} xmm2 = [4294967296,4294967296]
+; AVX-64-NEXT: vmovddup {{.*#+}} xmm2 = [0,1,0,1]
; AVX-64-NEXT: # xmm2 = mem[0,0]
; AVX-64-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX-64-NEXT: vpaddd %xmm2, %xmm0, %xmm0
;
; ALL64-LABEL: f8xi32_i64:
; ALL64: # %bb.0:
-; ALL64-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967296,4294967296,4294967296,4294967296]
+; ALL64-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,1,0,1,0,1,0,1]
; ALL64-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0
; ALL64-NEXT: retq
define <16 x i32> @f16xi32_i64(<16 x i32> %a) {
; AVX-LABEL: f16xi32_i64:
; AVX: # %bb.0:
-; AVX-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4294967296,4294967296,4294967296,4294967296]
+; AVX-NEXT: vbroadcastsd {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1]
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX-NEXT: vpaddd %xmm2, %xmm3, %xmm3
; AVX-NEXT: vpaddd %xmm2, %xmm1, %xmm1
;
; AVX2-LABEL: f16xi32_i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4294967296,4294967296,4294967296,4294967296]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1]
; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
;
; AVX512-LABEL: f16xi32_i64:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm1 = [4294967296,4294967296,4294967296,4294967296,4294967296,4294967296,4294967296,4294967296]
+; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm1 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: retl
;
; AVX-64-LABEL: f16xi32_i64:
; AVX-64: # %bb.0:
-; AVX-64-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4294967296,4294967296,4294967296,4294967296]
+; AVX-64-NEXT: vbroadcastsd {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1]
; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX-64-NEXT: vpaddd %xmm2, %xmm3, %xmm3
; AVX-64-NEXT: vpaddd %xmm2, %xmm1, %xmm1
;
; AVX2-64-LABEL: f16xi32_i64:
; AVX2-64: # %bb.0:
-; AVX2-64-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4294967296,4294967296,4294967296,4294967296]
+; AVX2-64-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1]
; AVX2-64-NEXT: vpaddd %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT: vpaddd %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT: vpand %ymm2, %ymm0, %ymm0
;
; AVX512F-64-LABEL: f16xi32_i64:
; AVX512F-64: # %bb.0:
-; AVX512F-64-NEXT: vpbroadcastq {{.*#+}} zmm1 = [4294967296,4294967296,4294967296,4294967296,4294967296,4294967296,4294967296,4294967296]
+; AVX512F-64-NEXT: vpbroadcastq {{.*#+}} zmm1 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX512F-64-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512F-64-NEXT: vpandd %zmm1, %zmm0, %zmm0
; AVX512F-64-NEXT: retq
define <4 x float> @f4xf32_f64(<4 x float> %a) {
; AVX-LABEL: f4xf32_f64:
; AVX: # %bb.0:
-; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [4575657222482165760,4575657222482165760]
+; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0]
; AVX-NEXT: # xmm1 = mem[0,0]
; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vdivps %xmm0, %xmm1, %xmm0
;
; ALL32-LABEL: f4xf32_f64:
; ALL32: # %bb.0:
-; ALL32-NEXT: vmovddup {{.*#+}} xmm1 = [4575657222482165760,4575657222482165760]
+; ALL32-NEXT: vmovddup {{.*#+}} xmm1 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0]
; ALL32-NEXT: # xmm1 = mem[0,0]
; ALL32-NEXT: vaddps %xmm1, %xmm0, %xmm0
; ALL32-NEXT: vdivps %xmm0, %xmm1, %xmm0
;
; AVX-64-LABEL: f4xf32_f64:
; AVX-64: # %bb.0:
-; AVX-64-NEXT: vmovddup {{.*#+}} xmm1 = [4575657222482165760,4575657222482165760]
+; AVX-64-NEXT: vmovddup {{.*#+}} xmm1 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0]
; AVX-64-NEXT: # xmm1 = mem[0,0]
; AVX-64-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-64-NEXT: vdivps %xmm0, %xmm1, %xmm0
;
; ALL64-LABEL: f4xf32_f64:
; ALL64: # %bb.0:
-; ALL64-NEXT: vmovddup {{.*#+}} xmm1 = [4575657222482165760,4575657222482165760]
+; ALL64-NEXT: vmovddup {{.*#+}} xmm1 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0]
; ALL64-NEXT: # xmm1 = mem[0,0]
; ALL64-NEXT: vaddps %xmm1, %xmm0, %xmm0
; ALL64-NEXT: vdivps %xmm0, %xmm1, %xmm0
define <8 x float> @f8xf32_f64(<8 x float> %a) {
; AVX-LABEL: f8xf32_f64:
; AVX: # %bb.0:
-; AVX-NEXT: vbroadcastsd {{.*#+}} ymm1 = [4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760]
+; AVX-NEXT: vbroadcastsd {{.*#+}} ymm1 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0,2.0E+0,1.0E+0,2.0E+0,1.0E+0]
; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT: vdivps %ymm0, %ymm1, %ymm0
; AVX-NEXT: retl
;
; ALL32-LABEL: f8xf32_f64:
; ALL32: # %bb.0:
-; ALL32-NEXT: vbroadcastsd {{.*#+}} ymm1 = [4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760]
+; ALL32-NEXT: vbroadcastsd {{.*#+}} ymm1 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0,2.0E+0,1.0E+0,2.0E+0,1.0E+0]
; ALL32-NEXT: vaddps %ymm1, %ymm0, %ymm0
; ALL32-NEXT: vdivps %ymm0, %ymm1, %ymm0
; ALL32-NEXT: retl
;
; AVX-64-LABEL: f8xf32_f64:
; AVX-64: # %bb.0:
-; AVX-64-NEXT: vbroadcastsd {{.*#+}} ymm1 = [4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760]
+; AVX-64-NEXT: vbroadcastsd {{.*#+}} ymm1 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0,2.0E+0,1.0E+0,2.0E+0,1.0E+0]
; AVX-64-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX-64-NEXT: vdivps %ymm0, %ymm1, %ymm0
; AVX-64-NEXT: retq
;
; ALL64-LABEL: f8xf32_f64:
; ALL64: # %bb.0:
-; ALL64-NEXT: vbroadcastsd {{.*#+}} ymm1 = [4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760]
+; ALL64-NEXT: vbroadcastsd {{.*#+}} ymm1 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0,2.0E+0,1.0E+0,2.0E+0,1.0E+0]
; ALL64-NEXT: vaddps %ymm1, %ymm0, %ymm0
; ALL64-NEXT: vdivps %ymm0, %ymm1, %ymm0
; ALL64-NEXT: retq
define <16 x float> @f16xf32_f64(<16 x float> %a) {
; AVX-LABEL: f16xf32_f64:
; AVX: # %bb.0:
-; AVX-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760]
+; AVX-NEXT: vbroadcastsd {{.*#+}} ymm2 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0,2.0E+0,1.0E+0,2.0E+0,1.0E+0]
; AVX-NEXT: vaddps %ymm2, %ymm1, %ymm1
; AVX-NEXT: vaddps %ymm2, %ymm0, %ymm0
; AVX-NEXT: vdivps %ymm0, %ymm2, %ymm0
;
; AVX2-LABEL: f16xf32_f64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760]
+; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0,2.0E+0,1.0E+0,2.0E+0,1.0E+0]
; AVX2-NEXT: vaddps %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vaddps %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vdivps %ymm0, %ymm2, %ymm0
;
; AVX512-LABEL: f16xf32_f64:
; AVX512: # %bb.0:
-; AVX512-NEXT: vbroadcastsd {{.*#+}} zmm1 = [4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760]
+; AVX512-NEXT: vbroadcastsd {{.*#+}} zmm1 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0,2.0E+0,1.0E+0,2.0E+0,1.0E+0,2.0E+0,1.0E+0,2.0E+0,1.0E+0,2.0E+0,1.0E+0,2.0E+0,1.0E+0]
; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vdivps %zmm0, %zmm1, %zmm0
; AVX512-NEXT: retl
;
; AVX-64-LABEL: f16xf32_f64:
; AVX-64: # %bb.0:
-; AVX-64-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760]
+; AVX-64-NEXT: vbroadcastsd {{.*#+}} ymm2 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0,2.0E+0,1.0E+0,2.0E+0,1.0E+0]
; AVX-64-NEXT: vaddps %ymm2, %ymm1, %ymm1
; AVX-64-NEXT: vaddps %ymm2, %ymm0, %ymm0
; AVX-64-NEXT: vdivps %ymm0, %ymm2, %ymm0
;
; AVX2-64-LABEL: f16xf32_f64:
; AVX2-64: # %bb.0:
-; AVX2-64-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760]
+; AVX2-64-NEXT: vbroadcastsd {{.*#+}} ymm2 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0,2.0E+0,1.0E+0,2.0E+0,1.0E+0]
; AVX2-64-NEXT: vaddps %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT: vaddps %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT: vdivps %ymm0, %ymm2, %ymm0
;
; AVX512F-64-LABEL: f16xf32_f64:
; AVX512F-64: # %bb.0:
-; AVX512F-64-NEXT: vbroadcastsd {{.*#+}} zmm1 = [4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760]
+; AVX512F-64-NEXT: vbroadcastsd {{.*#+}} zmm1 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0,2.0E+0,1.0E+0,2.0E+0,1.0E+0,2.0E+0,1.0E+0,2.0E+0,1.0E+0,2.0E+0,1.0E+0,2.0E+0,1.0E+0]
; AVX512F-64-NEXT: vaddps %zmm1, %zmm0, %zmm0
; AVX512F-64-NEXT: vdivps %zmm0, %zmm1, %zmm0
; AVX512F-64-NEXT: retq
define <8 x i16> @f8xi16_i32_NaN(<8 x i16> %a) {
; AVX-LABEL: f8xi16_i32_NaN:
; AVX: # %bb.0:
-; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [4290379776,4290379776,4290379776,4290379776]
+; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,65466,0,65466,0,65466,0,65466]
; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: retl
;
; ALL32-LABEL: f8xi16_i32_NaN:
; ALL32: # %bb.0:
-; ALL32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4290379776,4290379776,4290379776,4290379776]
+; ALL32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,65466,0,65466,0,65466,0,65466]
; ALL32-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; ALL32-NEXT: vpand %xmm1, %xmm0, %xmm0
; ALL32-NEXT: retl
;
; AVX-64-LABEL: f8xi16_i32_NaN:
; AVX-64: # %bb.0:
-; AVX-64-NEXT: vbroadcastss {{.*#+}} xmm1 = [4290379776,4290379776,4290379776,4290379776]
+; AVX-64-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,65466,0,65466,0,65466,0,65466]
; AVX-64-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX-64-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-64-NEXT: retq
;
; ALL64-LABEL: f8xi16_i32_NaN:
; ALL64: # %bb.0:
-; ALL64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4290379776,4290379776,4290379776,4290379776]
+; ALL64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,65466,0,65466,0,65466,0,65466]
; ALL64-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; ALL64-NEXT: vpand %xmm1, %xmm0, %xmm0
; ALL64-NEXT: retq
; AVX2-NEXT: pushq %rax
; AVX2-NEXT: callq use@PLT
; AVX2-NEXT: vmovdqu (%rax), %xmm1
-; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [63519,63519,63519,63519,63519,63519,63519,63519]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [31,248,31,248,31,248,31,248,31,248,31,248,31,248,31,248]
; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpavgb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: pushq %rax
; AVX512F-NEXT: callq use@PLT
; AVX512F-NEXT: vmovdqu (%rax), %xmm1
-; AVX512F-NEXT: vpbroadcastw {{.*#+}} xmm2 = [63519,63519,63519,63519,63519,63519,63519,63519]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} xmm2 = [31,248,31,248,31,248,31,248,31,248,31,248,31,248,31,248]
; AVX512F-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpavgb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpandn %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: pushq %rax
; AVX512VL-NEXT: callq use@PLT
; AVX512VL-NEXT: vmovdqu (%rax), %xmm1
-; AVX512VL-NEXT: vpbroadcastw {{.*#+}} xmm2 = [63519,63519,63519,63519,63519,63519,63519,63519]
+; AVX512VL-NEXT: vpbroadcastw {{.*#+}} xmm2 = [31,248,31,248,31,248,31,248,31,248,31,248,31,248,31,248]
; AVX512VL-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpavgb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpandn %xmm1, %xmm0, %xmm0
define i32 @mul_64xi8_zc(<64 x i8> %a, i32 %c) {
; AVXVNNI-LABEL: mul_64xi8_zc:
; AVXVNNI: # %bb.0: # %entry
-; AVXVNNI-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1073873152,1073873152,1073873152,1073873152,1073873152,1073873152,1073873152,1073873152]
+; AVXVNNI-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64]
; AVXVNNI-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVXVNNI-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVXVNNI-NEXT: {vex} vpdpbusd %ymm2, %ymm1, %ymm4
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl (%edx), %edx
; X86-NEXT: andl $15, %edx
-; X86-NEXT: vmovddup {{.*#+}} xmm0 = [7,7]
+; X86-NEXT: vmovddup {{.*#+}} xmm0 = [7,0,7,0]
; X86-NEXT: # xmm0 = mem[0,0]
; X86-NEXT: vmovd %edx, %xmm1
; X86-NEXT: vpand %xmm0, %xmm1, %xmm2
; X86-AVX1-LABEL: test_reduce_v2i64:
; X86-AVX1: ## %bb.0:
; X86-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
; X86-AVX1-NEXT: ## xmm2 = mem[0,0]
; X86-AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm3
; X86-AVX1-NEXT: vxorps %xmm2, %xmm1, %xmm2
; X86-AVX2-LABEL: test_reduce_v2i64:
; X86-AVX2: ## %bb.0:
; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
; X86-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3
; X86-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2
; X86-AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
; X86-AVX1-LABEL: test_reduce_v4i64:
; X86-AVX1: ## %bb.0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
; X86-AVX1-NEXT: ## xmm2 = mem[0,0]
; X86-AVX1-NEXT: vxorps %xmm2, %xmm1, %xmm3
; X86-AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm4
; X86-AVX2-LABEL: test_reduce_v4i64:
; X86-AVX2: ## %bb.0:
; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
; X86-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3
; X86-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm4
; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
;
; X86-AVX1-LABEL: test_reduce_v8i64:
; X86-AVX1: ## %bb.0:
-; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
; X86-AVX1-NEXT: ## xmm2 = mem[0,0]
; X86-AVX1-NEXT: vxorps %xmm2, %xmm1, %xmm3
; X86-AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm4
;
; X86-AVX2-LABEL: test_reduce_v8i64:
; X86-AVX2: ## %bb.0:
-; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648]
; X86-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm3
; X86-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm4
; X86-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3
; X86-AVX1-LABEL: test_reduce_v2i64:
; X86-AVX1: ## %bb.0:
; X86-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
; X86-AVX1-NEXT: ## xmm2 = mem[0,0]
; X86-AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm3
; X86-AVX1-NEXT: vxorps %xmm2, %xmm1, %xmm2
; X86-AVX2-LABEL: test_reduce_v2i64:
; X86-AVX2: ## %bb.0:
; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
; X86-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3
; X86-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2
; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
;
; X86-AVX1-LABEL: test_reduce_v4i64:
; X86-AVX1: ## %bb.0:
-; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [0,2147483648,0,2147483648]
; X86-AVX1-NEXT: ## xmm1 = mem[0,0]
; X86-AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm2
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; X86-AVX2-LABEL: test_reduce_v4i64:
; X86-AVX2: ## %bb.0:
; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
; X86-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3
; X86-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm4
; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
; X86-AVX1-LABEL: test_reduce_v8i64:
; X86-AVX1: ## %bb.0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
; X86-AVX1-NEXT: ## xmm2 = mem[0,0]
; X86-AVX1-NEXT: vxorps %xmm2, %xmm3, %xmm4
; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
;
; X86-AVX2-LABEL: test_reduce_v8i64:
; X86-AVX2: ## %bb.0:
-; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648]
; X86-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3
; X86-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm4
; X86-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3
;
; X86-AVX-LABEL: clamp_sitofp_2i64_2f64:
; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: vmovddup {{.*#+}} xmm1 = [18446744073709551361,18446744073709551361]
+; X86-AVX-NEXT: vmovddup {{.*#+}} xmm1 = [4294967041,4294967295,4294967041,4294967295]
; X86-AVX-NEXT: # xmm1 = mem[0,0]
; X86-AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; X86-AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; X86-AVX-NEXT: vmovddup {{.*#+}} xmm1 = [255,255]
+; X86-AVX-NEXT: vmovddup {{.*#+}} xmm1 = [255,0,255,0]
; X86-AVX-NEXT: # xmm1 = mem[0,0]
; X86-AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; X86-AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX2-LABEL: truncstore_v8i64_v8i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-LABEL: truncstore_v16i32_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm5 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; AVX2-NEXT: vpand %ymm5, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm5, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-LABEL: truncstore_v32i16_v32i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vmovups (%rdi), %ymm0
; AVX2-SLOW-NEXT: vmovups 32(%rdi), %ymm1
; AVX2-SLOW-NEXT: vmovups 64(%rdi), %ymm2
-; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm3 = [21474836482,21474836482,21474836482,21474836482]
+; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm3 = [2,5,2,5,2,5,2,5]
; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm3, %ymm3
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm5 = <0,3,6,1,4,7,u,u>
; AVX2-FAST-ALL-NEXT: vmovups (%rdi), %ymm0
; AVX2-FAST-ALL-NEXT: vmovups 32(%rdi), %ymm1
; AVX2-FAST-ALL-NEXT: vmovups 64(%rdi), %ymm2
-; AVX2-FAST-ALL-NEXT: vbroadcastsd {{.*#+}} ymm3 = [21474836482,21474836482,21474836482,21474836482]
+; AVX2-FAST-ALL-NEXT: vbroadcastsd {{.*#+}} ymm3 = [2,5,2,5,2,5,2,5]
; AVX2-FAST-ALL-NEXT: vpermps %ymm2, %ymm3, %ymm3
; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm5 = <0,3,6,1,4,7,u,u>
; AVX2-FAST-PERLANE-NEXT: vmovups (%rdi), %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovups 32(%rdi), %ymm1
; AVX2-FAST-PERLANE-NEXT: vmovups 64(%rdi), %ymm2
-; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm3 = [21474836482,21474836482,21474836482,21474836482]
+; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm3 = [2,5,2,5,2,5,2,5]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm3, %ymm3
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm5 = <0,3,6,1,4,7,u,u>
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; X86-AVX1-NEXT: vpsllq $63, %xmm0, %xmm0
; X86-AVX1-NEXT: vpsrlq $63, %xmm0, %xmm0
-; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [1,1]
+; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [1,0,1,0]
; X86-AVX1-NEXT: # xmm2 = mem[0,0]
; X86-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
; X86-AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vpmovd2m %zmm0, %k1
; CHECK-NEXT: vmovapd 0, %zmm0
; CHECK-NEXT: vmovapd 64, %zmm1
-; CHECK-NEXT: vbroadcastsd {{.*#+}} zmm2 = [68719476736,68719476736,68719476736,68719476736,68719476736,68719476736,68719476736,68719476736]
+; CHECK-NEXT: vbroadcastsd {{.*#+}} zmm2 = [0,16,0,16,0,16,0,16,0,16,0,16,0,16,0,16]
; CHECK-NEXT: kshiftrw $8, %k1, %k2
; CHECK-NEXT: vorpd %zmm2, %zmm1, %zmm1 {%k2}
; CHECK-NEXT: vorpd %zmm2, %zmm0, %zmm0 {%k1}
; AVX2: # %bb.0:
; AVX2-NEXT: vmovd %edi, %xmm2
; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
-; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [9241421688590303745,9241421688590303745]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm5
; AVX2-NEXT: vpackssdw %xmm5, %xmm3, %xmm3
; AVX2-NEXT: vpacksswb %xmm3, %xmm4, %xmm3
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
; AVX2-NEXT: vpackusdw %ymm2, %ymm1, %ymm1
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-SLOW-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
; AVX2-SLOW-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX2-SLOW-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
+; AVX2-SLOW-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0]
; AVX2-SLOW-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpmaddwd %ymm2, %ymm1, %ymm1
; AVX2-SLOW-NEXT: ret{{[l|q]}}
; AVX2-32-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-32-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
; AVX2-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX2-32-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
+; AVX2-32-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0]
; AVX2-32-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0
; AVX2-32-NEXT: vpmaddwd %ymm2, %ymm1, %ymm1
; AVX2-32-NEXT: retl
; AVX2-64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-64-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
; AVX2-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX2-64-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
+; AVX2-64-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0]
; AVX2-64-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT: vpmaddwd %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT: retq
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-SLOW-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
; AVX2-SLOW-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX2-SLOW-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
+; AVX2-SLOW-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0]
; AVX2-SLOW-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpmaddwd %ymm2, %ymm1, %ymm1
; AVX2-SLOW-NEXT: ret{{[l|q]}}
; AVX2-32-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-32-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
; AVX2-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX2-32-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
+; AVX2-32-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0]
; AVX2-32-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0
; AVX2-32-NEXT: vpmaddwd %ymm2, %ymm1, %ymm1
; AVX2-32-NEXT: retl
; AVX2-64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-64-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
; AVX2-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX2-64-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
+; AVX2-64-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0]
; AVX2-64-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT: vpmaddwd %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT: retq
;
; AVX1-LABEL: uaddo_v4i24:
; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [16777215,16777215,16777215,16777215]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
; AVX1-NEXT: vandps %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1
;
; AVX2-LABEL: uaddo_v4i24:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [16777215,16777215,16777215,16777215]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1
;
; AVX512-LABEL: uaddo_v4i24:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [16777215,16777215,16777215,16777215]
+; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm1
;
; AVX1-LABEL: umulo_v4i24:
; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [16777215,16777215,16777215,16777215]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vandps %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm1[1,1,3,3]
;
; AVX2-LABEL: umulo_v4i24:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [16777215,16777215,16777215,16777215]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
;
; AVX512-LABEL: umulo_v4i24:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [16777215,16777215,16777215,16777215]
+; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
;
; AVX1-LABEL: usubo_v4i24:
; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [16777215,16777215,16777215,16777215]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
; AVX1-NEXT: vandps %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1
;
; AVX2-LABEL: usubo_v4i24:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [16777215,16777215,16777215,16777215]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm1
;
; AVX512-LABEL: usubo_v4i24:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [16777215,16777215,16777215,16777215]
+; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpsubd %xmm1, %xmm0, %xmm1
;
; GFNIAVX1-LABEL: test_bitreverse_v64i8:
; GFNIAVX1: # %bb.0:
-; GFNIAVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
+; GFNIAVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
; GFNIAVX1-NEXT: vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0
; GFNIAVX1-NEXT: vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1
; GFNIAVX1-NEXT: retq
; GFNIAVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; GFNIAVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; GFNIAVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
+; GFNIAVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
; GFNIAVX1-NEXT: vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0
; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; GFNIAVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm4
; GFNIAVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; GFNIAVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; GFNIAVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
+; GFNIAVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
; GFNIAVX1-NEXT: vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0
; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; GFNIAVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm4
; GFNIAVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; GFNIAVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; GFNIAVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
+; GFNIAVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
; GFNIAVX1-NEXT: vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0
; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; GFNIAVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm4
;
; AVX1-LABEL: constant_pblendvb_avx2:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [18374686483949879295,18374686483949879295,18374686483949879295,18374686483949879295]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
; AVX1-NEXT: vandnps %ymm0, %ymm2, %ymm0
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %amt) nounwind {
; AVX512F-LABEL: splatvar_funnnel_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,15]
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,0,0,0,15,0,0,0]
; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm4
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm5
; AVX512F-NEXT: vpsrlw $1, %ymm5, %ymm5
;
; AVX512VL-LABEL: splatvar_funnnel_v32i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,15]
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,0,0,0,15,0,0,0]
; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm4
; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm5
; AVX512VL-NEXT: vpsrlw $1, %ymm5, %ymm5
define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind {
; AVX512F-LABEL: splatvar_funnnel_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,15]
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0]
; AVX512F-NEXT: vpandn %xmm2, %xmm1, %xmm3
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm4
; AVX512F-NEXT: vpsrlw $1, %ymm4, %ymm5
;
; AVX512VL-LABEL: splatvar_funnnel_v32i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,15]
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0]
; AVX512VL-NEXT: vpandn %xmm2, %xmm1, %xmm3
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm4
; AVX512VL-NEXT: vpsrlw $1, %ymm4, %ymm5
define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %amt) nounwind {
; AVX512F-LABEL: splatvar_funnnel_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,15]
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,0,0,0,15,0,0,0]
; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm5
; AVX512F-NEXT: vpsrlw %xmm4, %ymm5, %ymm5
;
; AVX512VL-LABEL: splatvar_funnnel_v32i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,15]
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,0,0,0,15,0,0,0]
; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm5
; AVX512VL-NEXT: vpsrlw %xmm4, %ymm5, %ymm5
define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind {
; AVX512F-LABEL: splatvar_funnnel_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,15]
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0]
; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm4
; AVX512F-NEXT: vpsrlw %xmm3, %ymm4, %ymm5
;
; AVX512VL-LABEL: splatvar_funnnel_v32i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,15]
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0]
; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm4
; AVX512VL-NEXT: vpsrlw %xmm3, %ymm4, %ymm5
; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0],xmm12[1],xmm13[2,3,4,5],xmm12[6],xmm13[7]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u]
; AVX2-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm13 = [21474836482,21474836482,21474836482,21474836482]
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm13 = [2,5,2,5,2,5,2,5]
; AVX2-FAST-NEXT: vpermd %ymm7, %ymm13, %ymm13
; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm13[7]
; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0],xmm13[1],xmm14[2,3,4,5],xmm13[6],xmm14[7]
; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u]
; AVX512F-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm14 = [21474836482,21474836482,21474836482,21474836482]
+; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm14 = [2,5,2,5,2,5,2,5]
; AVX512F-FAST-NEXT: vpermd %ymm2, %ymm14, %ymm14
; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm14[7]
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15]
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm4 = [21474836482,21474836482,21474836482,21474836482]
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm4 = [2,5,2,5,2,5,2,5]
; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vpermd %ymm7, %ymm4, %ymm11
; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7]
; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u]
; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [21474836482,21474836482,21474836482,21474836482]
+; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,5,2,5,2,5,2,5]
; AVX512F-ONLY-FAST-NEXT: vpermd %ymm28, %ymm2, %ymm2
; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm11[0],xmm6[1],xmm11[2,3,4,5],xmm6[6],xmm11[7]
; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u]
; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} ymm11 = [21474836482,21474836482,21474836482,21474836482]
+; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} ymm11 = [2,5,2,5,2,5,2,5]
; AVX512DQ-FAST-NEXT: vpermd %ymm27, %ymm11, %ymm11
; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1,2,3,4,5,6],ymm11[7]
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15]
; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [21474836482,21474836482,21474836482,21474836482]
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,5,2,5,2,5,2,5]
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-FAST-NEXT: vpermd %ymm5, %ymm2, %ymm3
; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15]
; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm4, %xmm4
; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} ymm11 = [21474836482,21474836482,21474836482,21474836482]
+; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} ymm11 = [2,5,2,5,2,5,2,5]
; AVX512F-ONLY-FAST-NEXT: vpermd %ymm18, %ymm11, %ymm12
; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1,2,3,4,5,6],ymm12[7]
; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm4
; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm2, %xmm29
; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} ymm10 = [21474836482,21474836482,21474836482,21474836482]
+; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} ymm10 = [2,5,2,5,2,5,2,5]
; AVX512DQ-FAST-NEXT: vpermd %ymm23, %ymm10, %ymm11
; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1,2,3,4,5,6],ymm11[7]
; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm0
; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm1
; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm2
-; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm3 = [21474836482,21474836482,21474836482,21474836482]
+; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm3 = [2,5,2,5,2,5,2,5]
; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm3, %ymm3
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm5 = <0,3,6,1,4,7,u,u>
; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm0
; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm1
; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm2
-; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm3 = [21474836482,21474836482,21474836482,21474836482]
+; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm3 = [2,5,2,5,2,5,2,5]
; AVX2-FAST-NEXT: vpermps %ymm2, %ymm3, %ymm3
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm5 = <0,3,6,1,4,7,u,u>
; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm1
; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm2
-; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm3 = [21474836482,21474836482,21474836482,21474836482]
+; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm3 = [2,5,2,5,2,5,2,5]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm3, %ymm3
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm5 = <0,3,6,1,4,7,u,u>
; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm3
; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm4
; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %ymm5
-; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm6 = [21474836482,21474836482,21474836482,21474836482]
+; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm6 = [2,5,2,5,2,5,2,5]
; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm6, %ymm7
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7]
; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm9 = <0,3,6,1,4,7,u,u>
; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm3
; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm4
; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm5
-; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm6 = [21474836482,21474836482,21474836482,21474836482]
+; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm6 = [2,5,2,5,2,5,2,5]
; AVX2-FAST-NEXT: vpermps %ymm4, %ymm6, %ymm7
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7]
; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm9 = <0,3,6,1,4,7,u,u>
; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm3
; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm4
; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %ymm5
-; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm6 = [21474836482,21474836482,21474836482,21474836482]
+; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm6 = [2,5,2,5,2,5,2,5]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm6, %ymm7
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7]
; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm9 = <0,3,6,1,4,7,u,u>
; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm2
; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %ymm15
; AVX2-SLOW-NEXT: vmovaps 160(%rdi), %ymm12
-; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm8 = [21474836482,21474836482,21474836482,21474836482]
+; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm8 = [2,5,2,5,2,5,2,5]
; AVX2-SLOW-NEXT: vpermps %ymm12, %ymm8, %ymm0
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm15[0],ymm10[1],ymm15[2,3],ymm10[4],ymm15[5,6],ymm10[7]
; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm11 = <0,3,6,1,4,7,u,u>
; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm7
; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm11
; AVX2-FAST-NEXT: vmovaps 160(%rdi), %ymm12
-; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm8 = [21474836482,21474836482,21474836482,21474836482]
+; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm8 = [2,5,2,5,2,5,2,5]
; AVX2-FAST-NEXT: vpermps %ymm12, %ymm8, %ymm0
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0],ymm9[1],ymm11[2,3],ymm9[4],ymm11[5,6],ymm9[7]
; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm13 = <0,3,6,1,4,7,u,u>
; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm2
; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %ymm15
; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rdi), %ymm12
-; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm8 = [21474836482,21474836482,21474836482,21474836482]
+; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm8 = [2,5,2,5,2,5,2,5]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm12, %ymm8, %ymm0
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm15[0],ymm10[1],ymm15[2,3],ymm10[4],ymm15[5,6],ymm10[7]
; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm11 = <0,3,6,1,4,7,u,u>
; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %ymm12
; AVX2-SLOW-NEXT: vmovaps 160(%rdi), %ymm1
; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm0 = [21474836482,21474836482,21474836482,21474836482]
+; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2,5,2,5,2,5,2,5]
; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm0, %ymm1
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm12[0],ymm2[1],ymm12[2,3],ymm2[4],ymm12[5,6],ymm2[7]
; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm13 = <0,3,6,1,4,7,u,u>
; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm14
; AVX2-FAST-NEXT: vmovaps 160(%rdi), %ymm1
; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm0 = [21474836482,21474836482,21474836482,21474836482]
+; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2,5,2,5,2,5,2,5]
; AVX2-FAST-NEXT: vpermps %ymm1, %ymm0, %ymm1
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0],ymm10[1],ymm14[2,3],ymm10[4],ymm14[5,6],ymm10[7]
; AVX2-FAST-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %ymm12
; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rdi), %ymm1
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm0 = [21474836482,21474836482,21474836482,21474836482]
+; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2,5,2,5,2,5,2,5]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm0, %ymm1
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm12[0],ymm2[1],ymm12[2,3],ymm2[4],ymm12[5,6],ymm2[7]
; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm13 = <0,3,6,1,4,7,u,u>
; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm4
; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm1
; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm2
-; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm3 = [17179869184,17179869184,17179869184,17179869184]
+; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm3 = [0,4,0,4,0,4,0,4]
; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm3, %ymm5
; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm3, %ymm3
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm9[0,1],xmm6[2,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7]
-; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm6 = [21474836481,21474836481,21474836481,21474836481]
+; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm6 = [1,5,1,5,1,5,1,5]
; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm6, %ymm9
; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm6, %ymm6
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm9[6,7]
; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm11, %ymm11
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7]
-; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm10 = [25769803778,25769803778,25769803778,25769803778]
+; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm10 = [2,6,2,6,2,6,2,6]
; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm10, %ymm11
; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm10, %ymm10
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7]
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm7 = xmm7[2],xmm8[2],xmm7[3],xmm8[3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1],xmm4[2,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7]
-; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm7 = [30064771075,30064771075,30064771075,30064771075]
+; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm7 = [3,7,3,7,3,7,3,7]
; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm7, %ymm2
; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm7, %ymm1
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm3
; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm2
-; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm0 = [17179869184,17179869184,17179869184,17179869184]
+; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm0 = [0,4,0,4,0,4,0,4]
; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm0, %ymm1
; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm0, %ymm6
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm0 = [21474836481,21474836481,21474836481,21474836481]
+; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm0 = [1,5,1,5,1,5,1,5]
; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm0, %ymm1
; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm0, %ymm6
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm7 = [25769803778,25769803778,25769803778,25769803778]
+; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm7 = [2,6,2,6,2,6,2,6]
; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm7, %ymm8
; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm7, %ymm9
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7]
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm12[2],xmm13[2],xmm12[3],xmm13[3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
-; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm9 = [30064771075,30064771075,30064771075,30064771075]
+; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm9 = [3,7,3,7,3,7,3,7]
; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm9, %ymm5
; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm9, %ymm4
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm3
; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm2
-; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm0 = [17179869184,17179869184,17179869184,17179869184]
+; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm0 = [0,4,0,4,0,4,0,4]
; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm0, %ymm1
; AVX2-ONLY-NEXT: vmovaps %ymm2, %ymm10
; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm3 = [21474836481,21474836481,21474836481,21474836481]
+; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm3 = [1,5,1,5,1,5,1,5]
; AVX2-ONLY-NEXT: vpermps %ymm10, %ymm3, %ymm0
; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm3, %ymm1
; AVX2-ONLY-NEXT: vmovaps %ymm9, %ymm4
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm13[0,1],xmm11[2,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm0 = [25769803778,25769803778,25769803778,25769803778]
+; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2,6,2,6,2,6,2,6]
; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX2-ONLY-NEXT: vpermps %ymm6, %ymm0, %ymm11
; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm0, %ymm13
; AVX2-ONLY-NEXT: # xmm10 = xmm2[2],mem[2],xmm2[3],mem[3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm10 = [30064771075,30064771075,30064771075,30064771075]
+; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm10 = [3,7,3,7,3,7,3,7]
; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm10, %ymm0
; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm10, %ymm1
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm2
; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm1
-; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm6 = [17179869184,17179869184,17179869184,17179869184]
+; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm6 = [0,4,0,4,0,4,0,4]
; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm6, %ymm0
; AVX2-ONLY-NEXT: vmovaps %ymm1, %ymm5
; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm6 = [21474836481,21474836481,21474836481,21474836481]
+; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm6 = [1,5,1,5,1,5,1,5]
; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm6, %ymm0
; AVX2-ONLY-NEXT: vmovaps %ymm10, %ymm2
; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm10[2,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm0 = [25769803778,25769803778,25769803778,25769803778]
+; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2,6,2,6,2,6,2,6]
; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm0, %ymm1
; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm0, %ymm10
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm0 = [30064771075,30064771075,30064771075,30064771075]
+; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm0 = [3,7,3,7,3,7,3,7]
; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm0, %ymm1
; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX512F-FAST-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm4 = [25769803777,25769803777]
+; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,6,1,6]
; AVX512F-FAST-NEXT: vpermi2d %xmm1, %xmm0, %xmm4
-; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [30064771074,30064771074]
+; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [2,7,2,7]
; AVX512F-FAST-NEXT: vpermi2d %xmm1, %xmm0, %xmm5
; AVX512F-FAST-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11]
; AVX512F-FAST-NEXT: vpbroadcastd 16(%rdi), %ymm1
; AVX512BW-FAST-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512BW-FAST-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm4 = [25769803777,25769803777]
+; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,6,1,6]
; AVX512BW-FAST-NEXT: vpermi2d %xmm1, %xmm0, %xmm4
-; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [30064771074,30064771074]
+; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [2,7,2,7]
; AVX512BW-FAST-NEXT: vpermi2d %xmm1, %xmm0, %xmm5
; AVX512BW-FAST-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11]
; AVX512BW-FAST-NEXT: vpbroadcastd 16(%rdi), %ymm1
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1,2,3,4,5,6],ymm8[7]
; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[2,3,0,1,6,7,4,5]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3,4,5,6,7]
-; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm8 = [21474836480,21474836480,21474836480,21474836480]
+; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm8 = [0,5,0,5,0,5,0,5]
; AVX2-ONLY-NEXT: vpermd %ymm0, %ymm8, %ymm8
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7]
-; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm8 = [25769803777,25769803777,25769803777,25769803777]
+; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm8 = [1,6,1,6,1,6,1,6]
; AVX2-ONLY-NEXT: vpermd %ymm0, %ymm8, %ymm8
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7]
; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[3,0,2,2,7,4,6,6]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7]
; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm4, %ymm1
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7]
-; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm2 = [30064771074,30064771074,30064771074,30064771074]
+; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,7,2,7,2,7,2,7]
; AVX2-ONLY-NEXT: vpermd %ymm0, %ymm2, %ymm0
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX2-ONLY-NEXT: vmovdqa %ymm5, (%rsi)
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3,4,5,6],ymm15[7]
; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,3,0,1,6,7,4,5]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm15[3,4,5,6,7]
-; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm15 = [21474836480,21474836480,21474836480,21474836480]
+; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm15 = [0,5,0,5,0,5,0,5]
; AVX2-ONLY-NEXT: vpermd %ymm0, %ymm15, %ymm7
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm7[6,7]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm4[4,5],ymm6[6,7]
; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm10 = ymm3[12,13,14,15],ymm5[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm5[16,17,18,19,20,21,22,23,24,25,26,27]
; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,3,2,3]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2],ymm15[3,4,5,6,7]
-; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm15 = [25769803777,25769803777,25769803777,25769803777]
+; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm15 = [1,6,1,6,1,6,1,6]
; AVX2-ONLY-NEXT: vpermd %ymm0, %ymm15, %ymm11
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7]
; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <4,1,6,u>
; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm5, %ymm3
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm8[3,4,5,6,7]
-; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm8 = [30064771074,30064771074,30064771074,30064771074]
+; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm8 = [2,7,2,7,2,7,2,7]
; AVX2-ONLY-NEXT: vpermd %ymm0, %ymm8, %ymm0
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm6[4,5],ymm4[6,7]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm2
; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm0 = [21474836480,21474836480,21474836480,21474836480]
+; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,5,0,5,0,5,0,5]
; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm0, %ymm2
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: # ymm8 = mem[12,13,14,15],ymm12[0,1,2,3,4,5,6,7,8,9,10,11],mem[28,29,30,31],ymm12[16,17,18,19,20,21,22,23,24,25,26,27]
; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,3,2,3]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3,4,5,6,7]
-; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm8 = [25769803777,25769803777,25769803777,25769803777]
+; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm8 = [1,6,1,6,1,6,1,6]
; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
; AVX2-ONLY-NEXT: vpermd %ymm11, %ymm8, %ymm10
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7]
; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = <4,1,6,u>
; AVX2-ONLY-NEXT: vpermd %ymm0, %ymm8, %ymm0
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm7[3,4,5,6,7]
-; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm7 = [30064771074,30064771074,30064771074,30064771074]
+; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm7 = [2,7,2,7,2,7,2,7]
; AVX2-ONLY-NEXT: vpermd %ymm11, %ymm7, %ymm3
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm6[4,5],ymm4[6,7]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm3
; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm0 = [21474836480,21474836480,21474836480,21474836480]
+; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,5,0,5,0,5,0,5]
; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm0, %ymm3
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm13 = ymm12[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm12[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27]
; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,3,2,3]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm0[3,4,5,6,7]
-; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm0 = [25769803777,25769803777,25769803777,25769803777]
+; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm0 = [1,6,1,6,1,6,1,6]
; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5],ymm15[6,7]
; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <4,1,6,u>
; AVX2-ONLY-NEXT: vpermd %ymm4, %ymm7, %ymm4
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3,4,5,6,7]
-; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm6 = [30064771074,30064771074,30064771074,30064771074]
+; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm6 = [2,7,2,7,2,7,2,7]
; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm6, %ymm1
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX512F-FAST-LABEL: load_i32_stride6_vf2:
; AVX512F-FAST: # %bb.0:
; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [25769803776,25769803776]
+; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,6,0,6]
; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm1
; AVX512F-FAST-NEXT: vmovdqa 16(%rdi), %xmm2
; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm3
; AVX512F-FAST-NEXT: vpermi2d %xmm2, %xmm1, %xmm0
-; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm4 = [30064771073,30064771073]
+; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,7,1,7]
; AVX512F-FAST-NEXT: vpermi2d %xmm2, %xmm1, %xmm4
-; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [17179869186,17179869186]
+; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [2,4,2,4]
; AVX512F-FAST-NEXT: vpermi2d %xmm3, %xmm1, %xmm2
; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <3,5,u,u>
; AVX512F-FAST-NEXT: vpermi2d %xmm3, %xmm1, %xmm5
; AVX512BW-FAST-LABEL: load_i32_stride6_vf2:
; AVX512BW-FAST: # %bb.0:
; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [25769803776,25769803776]
+; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,6,0,6]
; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %xmm1
; AVX512BW-FAST-NEXT: vmovdqa 16(%rdi), %xmm2
; AVX512BW-FAST-NEXT: vmovdqa 32(%rdi), %xmm3
; AVX512BW-FAST-NEXT: vpermi2d %xmm2, %xmm1, %xmm0
-; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm4 = [30064771073,30064771073]
+; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,7,1,7]
; AVX512BW-FAST-NEXT: vpermi2d %xmm2, %xmm1, %xmm4
-; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [17179869186,17179869186]
+; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [2,4,2,4]
; AVX512BW-FAST-NEXT: vpermi2d %xmm3, %xmm1, %xmm2
; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <3,5,u,u>
; AVX512BW-FAST-NEXT: vpermi2d %xmm3, %xmm1, %xmm5
; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm8[0,2,2,2,4,6,6,6]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm9[3,4,5,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm10 = [8589934596,8589934596,8589934596,8589934596]
+; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm10 = [4,2,4,2,4,2,4,2]
; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm10, %ymm10
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm10[6,7]
; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm10 = <1,7,5,u>
; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm10, %ymm6
; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm8[1,3,2,3,5,7,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm8[3,4,5,6,7]
-; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm8 = [12884901893,12884901893,12884901893,12884901893]
+; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm8 = [5,3,5,3,5,3,5,3]
; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm8, %ymm8
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm8[6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7]
; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm8[0,2,2,2,4,6,6,6]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm9[3,4,5,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm10 = [8589934596,8589934596,8589934596,8589934596]
+; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm10 = [4,2,4,2,4,2,4,2]
; AVX2-FAST-NEXT: vpermps %ymm9, %ymm10, %ymm10
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm10[6,7]
; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm10 = <1,7,5,u>
; AVX2-FAST-NEXT: vpermps %ymm6, %ymm10, %ymm6
; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm8[1,3,2,3,5,7,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm8[3,4,5,6,7]
-; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm8 = [12884901893,12884901893,12884901893,12884901893]
+; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm8 = [5,3,5,3,5,3,5,3]
; AVX2-FAST-NEXT: vpermps %ymm9, %ymm8, %ymm8
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm8[6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7]
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm8[0,2,2,2,4,6,6,6]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm9[3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm10 = [8589934596,8589934596,8589934596,8589934596]
+; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm10 = [4,2,4,2,4,2,4,2]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm10, %ymm10
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm10[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm10 = <1,7,5,u>
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm10, %ymm6
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm8[1,3,2,3,5,7,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm8[3,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm8 = [12884901893,12884901893,12884901893,12884901893]
+; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm8 = [5,3,5,3,5,3,5,3]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm8, %ymm8
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm8[6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7]
; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm12[0,2,2,2,4,6,6,6]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm13[3,4,5,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm0 = [8589934596,8589934596,8589934596,8589934596]
+; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm0 = [4,2,4,2,4,2,4,2]
; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm0, %ymm14
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm14[6,7]
; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm0, %ymm3
; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm12[1,3,2,3,5,7,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7]
-; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm4 = [12884901893,12884901893,12884901893,12884901893]
+; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm4 = [5,3,5,3,5,3,5,3]
; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm4, %ymm12
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm12[6,7]
; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm12[0,2,2,2,4,6,6,6]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm13[3,4,5,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm0 = [8589934596,8589934596,8589934596,8589934596]
+; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm0 = [4,2,4,2,4,2,4,2]
; AVX2-FAST-NEXT: vpermps %ymm13, %ymm0, %ymm14
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm14[6,7]
; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vpermps %ymm4, %ymm0, %ymm3
; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm12[1,3,2,3,5,7,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7]
-; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm4 = [12884901893,12884901893,12884901893,12884901893]
+; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm4 = [5,3,5,3,5,3,5,3]
; AVX2-FAST-NEXT: vpermps %ymm13, %ymm4, %ymm12
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm12[6,7]
; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm12[0,2,2,2,4,6,6,6]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm13[3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm0 = [8589934596,8589934596,8589934596,8589934596]
+; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm0 = [4,2,4,2,4,2,4,2]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm0, %ymm14
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm14[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm0, %ymm3
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm12[1,3,2,3,5,7,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm4 = [12884901893,12884901893,12884901893,12884901893]
+; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm4 = [5,3,5,3,5,3,5,3]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm4, %ymm12
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm12[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm11[0,2,2,2,4,6,6,6]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm4[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm2 = [8589934596,8589934596,8589934596,8589934596]
+; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4,2,4,2,4,2,4,2]
; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm2, %ymm5
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm5[6,7]
; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm11[1,3,2,3,5,7,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm11[3,4,5,6,7]
-; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm11 = [12884901893,12884901893,12884901893,12884901893]
+; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm11 = [5,3,5,3,5,3,5,3]
; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm11, %ymm10
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm10[6,7]
; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm11[0,2,2,2,4,6,6,6]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm4[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm2 = [8589934596,8589934596,8589934596,8589934596]
+; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4,2,4,2,4,2,4,2]
; AVX2-FAST-NEXT: vpermps %ymm10, %ymm2, %ymm5
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm5[6,7]
; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm11 = ymm11[1,3,2,3,5,7,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm11[3,4,5,6,7]
-; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm11 = [12884901893,12884901893,12884901893,12884901893]
+; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm11 = [5,3,5,3,5,3,5,3]
; AVX2-FAST-NEXT: vpermps %ymm10, %ymm11, %ymm10
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm10[6,7]
; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm11[0,2,2,2,4,6,6,6]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm4[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm2 = [8589934596,8589934596,8589934596,8589934596]
+; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4,2,4,2,4,2,4,2]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm2, %ymm5
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm5[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm11[1,3,2,3,5,7,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm11[3,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm11 = [12884901893,12884901893,12884901893,12884901893]
+; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm11 = [5,3,5,3,5,3,5,3]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm11, %ymm10
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm10[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm6[4,5,6,7]
; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm9 = [8589934596,8589934596,8589934596,8589934596]
+; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm9 = [4,2,4,2,4,2,4,2]
; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm9, %ymm1
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm14 = mem[1,3,2,3,5,7,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2],ymm14[3,4,5,6,7]
-; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm0 = [12884901893,12884901893,12884901893,12884901893]
+; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm0 = [5,3,5,3,5,3,5,3]
; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-SLOW-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm6[4,5,6,7]
; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm10 = [8589934596,8589934596,8589934596,8589934596]
+; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm10 = [4,2,4,2,4,2,4,2]
; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm14 = mem[1,3,2,3,5,7,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2],ymm14[3,4,5,6,7]
-; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm0 = [12884901893,12884901893,12884901893,12884901893]
+; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm0 = [5,3,5,3,5,3,5,3]
; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-FAST-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm6[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm9 = [8589934596,8589934596,8589934596,8589934596]
+; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm9 = [4,2,4,2,4,2,4,2]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm9, %ymm1
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[1,3,2,3,5,7,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2],ymm14[3,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm0 = [12884901893,12884901893,12884901893,12884901893]
+; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm0 = [5,3,5,3,5,3,5,3]
; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm1
; AVX512F-FAST-NEXT: vpinsrd $1, 28(%rdi), %xmm0, %xmm2
-; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [17179869185,17179869185]
+; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,4,1,4]
; AVX512F-FAST-NEXT: vpermi2d %xmm1, %xmm0, %xmm3
; AVX512F-FAST-NEXT: vpbroadcastd 8(%rdi), %xmm4
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3]
; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-FAST-NEXT: vmovdqa 32(%rdi), %xmm1
; AVX512BW-FAST-NEXT: vpinsrd $1, 28(%rdi), %xmm0, %xmm2
-; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [17179869185,17179869185]
+; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,4,1,4]
; AVX512BW-FAST-NEXT: vpermi2d %xmm1, %xmm0, %xmm3
; AVX512BW-FAST-NEXT: vpbroadcastd 8(%rdi), %xmm4
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-SLOW-NEXT: vpermd %ymm13, %ymm12, %ymm12
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3]
-; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm12 = [30064771072,30064771072,30064771072,30064771072]
+; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm12 = [0,7,0,7,0,7,0,7]
; AVX2-SLOW-NEXT: vpermd %ymm5, %ymm12, %ymm13
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm4[6,7]
; AVX2-SLOW-NEXT: vpbroadcastd 212(%rdi), %ymm14
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,2,0]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5,6],ymm6[7]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1],ymm9[2,3],ymm10[4,5],ymm9[6,7]
-; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm8 = [25769803781,25769803781,25769803781,25769803781]
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm8 = [5,6,5,6,5,6,5,6]
; AVX2-FAST-NEXT: vpermd %ymm7, %ymm8, %ymm7
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <1,0,7,u,u,u,u,u>
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FAST-NEXT: vpermd %ymm13, %ymm12, %ymm12
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3]
-; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm12 = [30064771072,30064771072,30064771072,30064771072]
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm12 = [0,7,0,7,0,7,0,7]
; AVX2-FAST-NEXT: vpermd %ymm5, %ymm12, %ymm13
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm4[6,7]
; AVX2-FAST-NEXT: vpbroadcastd 212(%rdi), %ymm14
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpermd %ymm13, %ymm12, %ymm12
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3]
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm12 = [30064771072,30064771072,30064771072,30064771072]
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm12 = [0,7,0,7,0,7,0,7]
; AVX2-FAST-PERLANE-NEXT: vpermd %ymm5, %ymm12, %ymm13
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm4[6,7]
; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 212(%rdi), %ymm14
; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm12
; AVX2-SLOW-NEXT: vpermd %ymm10, %ymm5, %ymm10
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0,1],xmm0[2,3]
-; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm0 = [30064771072,30064771072,30064771072,30064771072]
+; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,7,0,7,0,7,0,7]
; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm0, %ymm11
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm2[6,7]
; AVX2-SLOW-NEXT: vpbroadcastd 212(%rdi), %ymm13
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,2,0]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5,6],ymm2[7]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1],ymm11[2,3],ymm3[4,5],ymm11[6,7]
-; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm0 = [25769803781,25769803781,25769803781,25769803781]
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm0 = [5,6,5,6,5,6,5,6]
; AVX2-FAST-NEXT: vpermd %ymm7, %ymm0, %ymm8
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [1,0,7,7,5,4,7,7]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0],ymm12[1],ymm6[2,3,4],ymm12[5],ymm6[6,7]
; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm8
; AVX2-FAST-NEXT: vpermd %ymm4, %ymm3, %ymm4
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,3]
-; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm0 = [30064771072,30064771072,30064771072,30064771072]
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,7,0,7,0,7,0,7]
; AVX2-FAST-NEXT: vpermd %ymm10, %ymm0, %ymm5
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm9[6,7]
; AVX2-FAST-NEXT: vpbroadcastd 212(%rdi), %ymm7
; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm12
; AVX2-FAST-PERLANE-NEXT: vpermd %ymm10, %ymm5, %ymm10
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0,1],xmm0[2,3]
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm0 = [30064771072,30064771072,30064771072,30064771072]
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,7,0,7,0,7,0,7]
; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm0, %ymm11
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm2[6,7]
; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 212(%rdi), %ymm13
; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7]
; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm3, %ymm2
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
-; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm4 = [30064771072,30064771072,30064771072,30064771072]
+; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm4 = [0,7,0,7,0,7,0,7]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm4, %ymm2
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm12[6,7]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5,6],ymm0[7]
; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm6
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm6[2,3],ymm10[4,5],ymm6[6,7]
-; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm0 = [25769803781,25769803781,25769803781,25769803781]
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm0 = [5,6,5,6,5,6,5,6]
; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm3
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,7,7,5,4,7,7]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0],ymm12[1],ymm11[2,3,4],ymm12[5],ymm11[6,7]
; AVX2-FAST-NEXT: # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7]
; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm3
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
-; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm11 = [30064771072,30064771072,30064771072,30064771072]
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm11 = [0,7,0,7,0,7,0,7]
; AVX2-FAST-NEXT: vpermd %ymm7, %ymm11, %ymm3
; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm12
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm14[6,7]
; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm3, %ymm2
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
-; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm4 = [30064771072,30064771072,30064771072,30064771072]
+; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm4 = [0,7,0,7,0,7,0,7]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm4, %ymm2
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm12[6,7]
; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7]
; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm5, %ymm2
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
-; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm9 = [30064771072,30064771072,30064771072,30064771072]
+; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm9 = [0,7,0,7,0,7,0,7]
; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm9, %ymm2
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
; AVX2-SLOW-NEXT: vbroadcastss 212(%rdi), %ymm3
; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3],ymm8[4,5],ymm0[6,7]
-; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [25769803781,25769803781,25769803781,25769803781]
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [5,6,5,6,5,6,5,6]
; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm3
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [1,0,7,7,5,4,7,7]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm13[1],ymm12[2,3,4],ymm13[5],ymm12[6,7]
; AVX2-FAST-NEXT: # ymm2 = ymm2[0,1,2,3],mem[4,5,6,7]
; AVX2-FAST-NEXT: vpermps %ymm2, %ymm5, %ymm2
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
-; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm10 = [30064771072,30064771072,30064771072,30064771072]
+; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm10 = [0,7,0,7,0,7,0,7]
; AVX2-FAST-NEXT: vpermps %ymm3, %ymm10, %ymm2
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
; AVX2-FAST-NEXT: vbroadcastss 212(%rdi), %ymm3
; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm5, %ymm2
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
-; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm9 = [30064771072,30064771072,30064771072,30064771072]
+; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm9 = [0,7,0,7,0,7,0,7]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm9, %ymm2
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
; AVX2-FAST-PERLANE-NEXT: vbroadcastss 212(%rdi), %ymm3
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],xmm6[2,3]
; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm8 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-ONLY-NEXT: vpshufb %ymm8, %ymm1, %ymm9
-; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm6 = [17179869184,17179869184,17179869184,17179869184]
+; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,4,0,4,0,4,0,4]
; AVX2-ONLY-NEXT: vpermd %ymm9, %ymm6, %ymm9
; AVX2-ONLY-NEXT: vpshufb %ymm8, %ymm0, %ymm8
; AVX2-ONLY-NEXT: vpermd %ymm8, %ymm6, %ymm8
; AVX2-ONLY-NEXT: vpshufb %ymm13, %ymm1, %ymm9
; AVX2-ONLY-NEXT: vmovdqa %ymm1, %ymm4
; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm2 = [17179869184,17179869184,17179869184,17179869184]
+; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,4,0,4,0,4,0,4]
; AVX2-ONLY-NEXT: vpermd %ymm9, %ymm2, %ymm9
; AVX2-ONLY-NEXT: vpshufb %ymm13, %ymm0, %ymm11
; AVX2-ONLY-NEXT: vpermd %ymm11, %ymm2, %ymm11
; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3,4,5,6,7]
; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u]
-; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm2 = [21474836480,21474836480,21474836480,21474836480]
+; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,5,0,5,0,5,0,5]
; AVX2-ONLY-NEXT: vpermd %ymm0, %ymm2, %ymm0
; AVX2-ONLY-NEXT: vpblendvb %ymm8, %ymm1, %ymm0, %ymm0
; AVX2-ONLY-NEXT: vmovdqa %ymm6, (%rsi)
; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7]
; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm1
; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u]
-; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [21474836480,21474836480,21474836480,21474836480]
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,5,0,5,0,5,0,5]
; AVX512F-NEXT: vpermd %ymm1, %ymm2, %ymm1
; AVX512F-NEXT: vpternlogq $184, %ymm0, %ymm8, %ymm1
; AVX512F-NEXT: vmovdqa %ymm6, (%rsi)
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm1
; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [21474836480,21474836480,21474836480,21474836480]
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,5,0,5,0,5,0,5]
; AVX512BW-NEXT: vpermd %ymm1, %ymm2, %ymm1
; AVX512BW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k3}
; AVX512BW-NEXT: vmovdqa %ymm4, (%rsi)
; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm10
; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm2 = <u,1,6,11,0,5,10,15,u,u,u,u,u,u,u,u,u,1,6,11,0,5,10,15,u,u,u,u,u,u,u,u>
; AVX2-ONLY-NEXT: vpshufb %ymm2, %ymm10, %ymm10
-; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm0 = [21474836480,21474836480,21474836480,21474836480]
+; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,5,0,5,0,5,0,5]
; AVX2-ONLY-NEXT: vpermd %ymm10, %ymm0, %ymm10
; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm6, %ymm10, %ymm6
; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm2
; AVX512F-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u]
-; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm5 = [21474836480,21474836480,21474836480,21474836480]
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm5 = [0,5,0,5,0,5,0,5]
; AVX512F-NEXT: vpermd %ymm2, %ymm5, %ymm2
; AVX512F-NEXT: vpternlogq $226, %zmm0, %zmm4, %zmm2
; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm0
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm2
; AVX512BW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [21474836480,21474836480,21474836480,21474836480]
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,5,0,5,0,5,0,5]
; AVX512BW-NEXT: vpermd %ymm2, %ymm3, %ymm2
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k5}
; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm2[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2]
; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,u,0,7,14,u,u,u,u,u,u,u,u>
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm5, %xmm5
-; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [1099511627775,1099511627775]
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [255,255,255,255,255,0,0,0,255,255,255,255,255,0,0,0]
; AVX1-ONLY-NEXT: # xmm7 = mem[0,0]
; AVX1-ONLY-NEXT: vpblendvb %xmm7, %xmm4, %xmm5, %xmm4
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[6,13,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-ONLY-NEXT: # ymm10 = mem[0,1,0,1]
; AVX2-ONLY-NEXT: vpermps %ymm6, %ymm10, %ymm10
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3],ymm10[4,5,6,7]
-; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm10 = [17179869184,17179869184,17179869184,17179869184]
+; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm10 = [0,4,0,4,0,4,0,4]
; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm10, %ymm10
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5],ymm9[6,7]
-; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm10 = [25769803778,25769803778,25769803778,25769803778]
+; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm10 = [2,6,2,6,2,6,2,6]
; AVX2-ONLY-NEXT: vpermps %ymm6, %ymm10, %ymm6
; AVX2-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm10 = [1,5,2,6,1,5,2,6]
; AVX2-ONLY-NEXT: # ymm10 = mem[0,1,0,1]
; AVX2-SLOW-NEXT: # ymm9 = mem[0,1,0,1]
; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm9, %ymm4
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3],ymm4[4,5,6,7]
-; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm6 = [17179869184,17179869184,17179869184,17179869184]
+; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm6 = [0,4,0,4,0,4,0,4]
; AVX2-SLOW-NEXT: vpermps %ymm7, %ymm6, %ymm6
; AVX2-SLOW-NEXT: vbroadcastss (%r10), %ymm7
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7]
; AVX2-FAST-NEXT: # ymm5 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpermps %ymm2, %ymm5, %ymm2
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7]
-; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm3 = [17179869184,17179869184,17179869184,17179869184]
+; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm3 = [0,4,0,4,0,4,0,4]
; AVX2-FAST-NEXT: vpermps %ymm6, %ymm3, %ymm3
; AVX2-FAST-NEXT: vbroadcastss (%r10), %ymm5
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0,1,0,1]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm9, %ymm4
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3],ymm4[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm6 = [17179869184,17179869184,17179869184,17179869184]
+; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm6 = [0,4,0,4,0,4,0,4]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm7, %ymm6, %ymm6
; AVX2-FAST-PERLANE-NEXT: vbroadcastss (%r10), %ymm7
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7]
; AVX2-FAST-NEXT: vmovaps 32(%r8), %ymm10
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm10[6,7]
; AVX2-FAST-NEXT: vmovaps 32(%r9), %ymm4
-; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm15 = [25769803781,25769803781,25769803781,25769803781]
+; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm15 = [5,6,5,6,5,6,5,6]
; AVX2-FAST-NEXT: vpermps %ymm4, %ymm15, %ymm15
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0],ymm1[1,2,3,4,5,6],ymm15[7]
; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3]
; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5],mem[6,7]
-; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm11 = [25769803781,25769803781,25769803781,25769803781]
+; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm11 = [5,6,5,6,5,6,5,6]
; AVX2-FAST-NEXT: vpermps 96(%r9), %ymm11, %ymm11
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm15[1,2,3,4,5,6],ymm11[7]
; AVX2-FAST-NEXT: vmovaps 96(%rax), %ymm15
; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm15 = ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[6],ymm0[6],ymm3[7],ymm0[7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm15[4,5,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7]
-; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm7 = [25769803781,25769803781,25769803781,25769803781]
+; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm7 = [5,6,5,6,5,6,5,6]
; AVX2-FAST-NEXT: vpermps 224(%r9), %ymm7, %ymm7
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4,5,6],ymm7[7]
; AVX2-FAST-NEXT: vmovaps 224(%rax), %ymm7
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm8
; AVX2-ONLY-NEXT: vinsertf128 $1, (%r9), %ymm5, %ymm5
; AVX2-ONLY-NEXT: vinsertf128 $1, (%r10), %ymm6, %ymm7
-; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm6 = [17179869184,17179869184,17179869184,17179869184]
+; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm6 = [0,4,0,4,0,4,0,4]
; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm6, %ymm9
; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm6, %ymm6
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm9[6,7]
; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7]
-; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm9 = [21474836481,21474836481,21474836481,21474836481]
+; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm9 = [1,5,1,5,1,5,1,5]
; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm9, %ymm10
; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm9, %ymm9
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7]
; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm11 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm11[2,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
-; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm10 = [25769803778,25769803778,25769803778,25769803778]
+; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm10 = [2,6,2,6,2,6,2,6]
; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm10, %ymm11
; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm10, %ymm10
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7]
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm8[2,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7]
-; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm4 = [30064771075,30064771075,30064771075,30064771075]
+; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm4 = [3,7,3,7,3,7,3,7]
; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm4, %ymm7
; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm4, %ymm4
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm7[6,7]
; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,2,2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,0,2,1,4,4,6,5]
-; AVX1-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm0 = [281474976710655,281474976710655,281474976710655,281474976710655]
+; AVX1-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm0 = [65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0]
; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm0, %ymm2
; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm7
; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm8
; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm15[0,2,2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm14[0],zero,xmm14[1],zero
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm2, %ymm14
-; AVX1-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm2 = [18446744069414649855,18446744069414649855,18446744069414649855,18446744069414649855]
+; AVX1-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535]
; AVX1-ONLY-NEXT: vandps %ymm2, %ymm13, %ymm13
; AVX1-ONLY-NEXT: vandnps %ymm14, %ymm2, %ymm14
; AVX1-ONLY-NEXT: vorps %ymm14, %ymm13, %ymm13
; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,6,6,7]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX1-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm5 = [281474976710655,281474976710655,281474976710655,281474976710655]
+; AVX1-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm5 = [65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0]
; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0
; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm3
; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm4[2,1,3,3,6,5,7,7]
-; AVX1-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm4 = [18446744069414649855,18446744069414649855,18446744069414649855,18446744069414649855]
+; AVX1-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm4 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535]
; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm4, %ymm8
; AVX1-ONLY-NEXT: vandps %ymm4, %ymm7, %ymm7
; AVX1-ONLY-NEXT: vorps %ymm7, %ymm8, %ymm7
; AVX512F-SLOW-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
; AVX512F-SLOW-NEXT: # ymm5 = mem[2,1,3,3,6,5,7,7]
; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm6
-; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} zmm5 = [18446744069414649855,18446744069414649855,18446744069414649855,18446744069414649855,18446744069414649855,18446744069414649855,18446744069414649855,18446744069414649855]
+; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} zmm5 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535]
; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm5, %zmm6
; AVX512F-SLOW-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; AVX512F-SLOW-NEXT: # ymm0 = mem[0,0,2,1,4,4,6,5]
; AVX512F-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
; AVX512F-SLOW-NEXT: # ymm7 = mem[0,2,2,3,4,6,6,7]
; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7
-; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} zmm0 = [281474976710655,281474976710655,281474976710655,281474976710655,281474976710655,281474976710655,281474976710655,281474976710655]
+; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} zmm0 = [65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0]
; AVX512F-SLOW-NEXT: vpandnq %zmm4, %zmm0, %zmm4
; AVX512F-SLOW-NEXT: vpandq %zmm0, %zmm7, %zmm7
; AVX512F-SLOW-NEXT: movw $-21846, %ax # imm = 0xAAAA
; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u>
; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm0
; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm8
-; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} zmm6 = [18446744069414649855,18446744069414649855,18446744069414649855,18446744069414649855,18446744069414649855,18446744069414649855,18446744069414649855,18446744069414649855]
+; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} zmm6 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535]
; AVX512F-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm8 # 64-byte Folded Reload
-; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} zmm0 = [281474976710655,281474976710655,281474976710655,281474976710655,281474976710655,281474976710655,281474976710655,281474976710655]
+; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} zmm0 = [65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0]
; AVX512F-FAST-NEXT: vpandnq %zmm29, %zmm0, %zmm29
; AVX512F-FAST-NEXT: vpandq %zmm0, %zmm17, %zmm17
; AVX512F-FAST-NEXT: movw $-21846, %ax # imm = 0xAAAA
;
; AVX2-LABEL: test_v16i32_v16i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
;
; AVX2-LABEL: test_v32i32_v32i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0
;
; AVX2-LABEL: trunc_v16i32_v16i1:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
;
; AVX2-LABEL: trunc_v32i16_v32i1:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
define <32 x i16> @splatvar_rotate_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512F-LABEL: splatvar_rotate_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,15]
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0]
; AVX512F-NEXT: vpandn %xmm2, %xmm1, %xmm3
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm4
; AVX512F-NEXT: vpsrlw $1, %ymm4, %ymm5
;
; AVX512VL-LABEL: splatvar_rotate_v32i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,15]
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0]
; AVX512VL-NEXT: vpandn %xmm2, %xmm1, %xmm3
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm4
; AVX512VL-NEXT: vpsrlw $1, %ymm4, %ymm5
; AVX1-NEXT: movzwl (%rdi), %eax
; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
-; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [9241421688590303745,9241421688590303745]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: movzwl (%rdi), %eax
; AVX2-NEXT: vmovd %eax, %xmm0
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
-; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [9241421688590303745,9241421688590303745]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9241421688590303745,9241421688590303745]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19]
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
; X86-AVX1-LABEL: var_shift_v4i64:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [0,2147483648,0,2147483648]
; X86-AVX1-NEXT: # xmm3 = mem[0,0]
; X86-AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm4
; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
; X86-AVX1-LABEL: PR52719:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
; X86-AVX1-NEXT: # xmm2 = mem[0,0]
; X86-AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
;
; AVX512VLVBMI-LABEL: shuffle_v16i8_02_20_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
; AVX512VLVBMI: # %bb.0:
-; AVX512VLVBMI-NEXT: vpbroadcastw {{.*#+}} xmm2 = [5122,5122,5122,5122,5122,5122,5122,5122]
+; AVX512VLVBMI-NEXT: vpbroadcastw {{.*#+}} xmm2 = [2,20,2,20,2,20,2,20,2,20,2,20,2,20,2,20]
; AVX512VLVBMI-NEXT: vpermt2b %xmm1, %xmm2, %xmm0
; AVX512VLVBMI-NEXT: retq
;
;
; AVX512VL-LABEL: shuffle_v8i16_048C048C:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [3377734080528384,3377734080528384]
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12]
; AVX512VL-NEXT: vpermt2w %xmm1, %xmm2, %xmm0
; AVX512VL-NEXT: retq
;
define <16 x i16> @shuffle_v16i16_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [65535,0,65535,0,65535,0,65535,0,65535,0,65535,0,65535,0,65535,0]
; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
;
; XOPAVX1-LABEL: shuffle_v16i16_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535]
+; XOPAVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [65535,0,65535,0,65535,0,65535,0,65535,0,65535,0,65535,0,65535,0]
; XOPAVX1-NEXT: vpcmov %ymm2, %ymm1, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
define <16 x i16> @shuffle_v16i16_16_01_18_03_20_05_22_07_24_09_26_11_28_13_30_15(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_16_01_18_03_20_05_22_07_24_09_26_11_28_13_30_15:
; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [65535,0,65535,0,65535,0,65535,0,65535,0,65535,0,65535,0,65535,0]
; AVX1-NEXT: vandnps %ymm0, %ymm2, %ymm0
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
;
; XOPAVX1-LABEL: shuffle_v16i16_16_01_18_03_20_05_22_07_24_09_26_11_28_13_30_15:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535]
+; XOPAVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [65535,0,65535,0,65535,0,65535,0,65535,0,65535,0,65535,0,65535,0]
; XOPAVX1-NEXT: vpcmov %ymm2, %ymm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
define <32 x i8> @load_fold_pblendvb(ptr %px, <32 x i8> %y) {
; AVX1-LABEL: load_fold_pblendvb:
; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm1 = [18374686483949879295,18374686483949879295,18374686483949879295,18374686483949879295]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm1 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
; AVX1-NEXT: vandnps (%rdi), %ymm1, %ymm2
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
;
; XOPAVX1-LABEL: load_fold_pblendvb:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vbroadcastsd {{.*#+}} ymm1 = [18374686483949879295,18374686483949879295,18374686483949879295,18374686483949879295]
+; XOPAVX1-NEXT: vbroadcastsd {{.*#+}} ymm1 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
; XOPAVX1-NEXT: vpcmov %ymm1, (%rdi), %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
define <32 x i8> @load_fold_pblendvb_commute(ptr %px, <32 x i8> %y) {
; AVX1-LABEL: load_fold_pblendvb_commute:
; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm1 = [18374686483949879295,18374686483949879295,18374686483949879295,18374686483949879295]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm1 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
; AVX1-NEXT: vandnps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: vandps (%rdi), %ymm1, %ymm1
; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
; XOPAVX1-LABEL: load_fold_pblendvb_commute:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vmovdqa (%rdi), %ymm1
-; XOPAVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [18374686483949879295,18374686483949879295,18374686483949879295,18374686483949879295]
+; XOPAVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
; XOPAVX1-NEXT: vpcmov %ymm2, %ymm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
; AVX2-LABEL: PR55066:
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [17179869184,17179869184,17179869184,17179869184]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,4,0,4,0,4,0,4]
; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512VLBW-LABEL: PR55066:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [17179869184,17179869184,17179869184,17179869184]
+; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,4,0,4,0,4,0,4]
; AVX512VLBW-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI-LABEL: PR55066:
; AVX512VLVBMI: # %bb.0:
-; AVX512VLVBMI-NEXT: vpbroadcastq {{.*#+}} ymm1 = [2024390091656922112,2024390091656922112,2024390091656922112,2024390091656922112]
+; AVX512VLVBMI-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28]
; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
; AVX512VLVBMI-NEXT: retq
;
; XOPAVX2-LABEL: PR55066:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u]
-; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [17179869184,17179869184,17179869184,17179869184]
+; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,4,0,4,0,4,0,4]
; XOPAVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
; XOPAVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a0, <32 x i8> poison, <32 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
;
; AVX512VL-FAST-ALL-LABEL: shuffle_v8f32_08080808:
; AVX512VL-FAST-ALL: # %bb.0:
-; AVX512VL-FAST-ALL-NEXT: vbroadcastsd {{.*#+}} ymm2 = [34359738368,34359738368,34359738368,34359738368]
+; AVX512VL-FAST-ALL-NEXT: vbroadcastsd {{.*#+}} ymm2 = [0,8,0,8,0,8,0,8]
; AVX512VL-FAST-ALL-NEXT: vpermt2ps %ymm1, %ymm2, %ymm0
; AVX512VL-FAST-ALL-NEXT: retq
;
;
; AVX512VL-FAST-ALL-LABEL: shuffle_v8i32_08080808:
; AVX512VL-FAST-ALL: # %bb.0:
-; AVX512VL-FAST-ALL-NEXT: vpbroadcastq {{.*#+}} ymm2 = [34359738368,34359738368,34359738368,34359738368]
+; AVX512VL-FAST-ALL-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,8,0,8,0,8,0,8]
; AVX512VL-FAST-ALL-NEXT: vpermt2d %ymm1, %ymm2, %ymm0
; AVX512VL-FAST-ALL-NEXT: retq
;
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
; AVX512F-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1]
-; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14,15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14]
; AVX512F-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm2
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1]
-; AVX512DQ-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512DQ-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; AVX512DQ-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14,15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14]
; AVX512DQ-NEXT: vpshufb %ymm3, %ymm2, %ymm2
;
; AVX512F-LABEL: test_mm512_mask_blend_epi8:
; AVX512F: # %bb.0: # %entry
-; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2
; AVX512F-NEXT: vpternlogq $216, %zmm2, %zmm1, %zmm0
; AVX512F-NEXT: ret{{[l|q]}}
; AVX2-NEXT: vpaddq %ymm5, %ymm1, %ymm1
; AVX2-NEXT: vpaddq %ymm6, %ymm2, %ymm2
; AVX2-NEXT: vpaddq %ymm7, %ymm3, %ymm3
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
; AVX2: # %bb.0:
; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
;
; AVX2-LABEL: trunc_add_const_v16i64_v16i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
;
; AVX2-LABEL: trunc_add_const_v16i32_v16i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpsubq %ymm5, %ymm1, %ymm1
; AVX2-NEXT: vpsubq %ymm6, %ymm2, %ymm2
; AVX2-NEXT: vpsubq %ymm7, %ymm3, %ymm3
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
; AVX2: # %bb.0:
; AVX2-NEXT: vpsubd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsubd %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
;
; AVX2-LABEL: trunc_sub_const_v16i64_v16i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
;
; AVX2-LABEL: trunc_sub_const_v16i32_v16i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpmuludq %ymm5, %ymm1, %ymm1
; AVX2-NEXT: vpmuludq %ymm6, %ymm2, %ymm2
; AVX2-NEXT: vpmuludq %ymm7, %ymm3, %ymm3
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
; AVX2: # %bb.0:
; AVX2-NEXT: vpmulld %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpmulld %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
; AVX2: # %bb.0:
; AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm5, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm6, %ymm2, %ymm2
; AVX2-NEXT: vpand %ymm7, %ymm3, %ymm3
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
; AVX2: # %bb.0:
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
;
; AVX2-LABEL: trunc_and_const_v16i64_v16i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
;
; AVX2-LABEL: trunc_and_const_v16i32_v16i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpxor %ymm5, %ymm1, %ymm1
; AVX2-NEXT: vpxor %ymm6, %ymm2, %ymm2
; AVX2-NEXT: vpxor %ymm7, %ymm3, %ymm3
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
;
; AVX2-LABEL: trunc_xor_const_v16i64_v16i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
;
; AVX2-LABEL: trunc_xor_const_v16i32_v16i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm5, %ymm1, %ymm1
; AVX2-NEXT: vpor %ymm6, %ymm2, %ymm2
; AVX2-NEXT: vpor %ymm7, %ymm3, %ymm3
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
; AVX2: # %bb.0:
; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
;
; AVX2-LABEL: trunc_or_const_v16i64_v16i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
;
; AVX2-LABEL: trunc_or_const_v16i32_v16i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
;
; AVX2-LABEL: trunc8i64_8i8:
; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [255,255,255,255]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
;
; AVX2-LABEL: trunc16i32_16i8:
; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
;
; AVX2-LABEL: trunc32i16_32i8:
; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
;
; AVX2-LABEL: trunc2x16i16_32i8:
; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
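The updated mask comments above are simply the original narrow build-vector data laid out little-endian inside the widened broadcast element: a dword splat of 255 prints as [255,0,0,0] per element, 4294967040 (0xFFFFFF00) as [0,255,255,255], and a word splat of 255 as [255,0]. A minimal standalone sketch of that decomposition (illustrative only; splitSplat is a hypothetical helper, not the LLVM APInt-based code in getConstantVector):

// Standalone sketch: split a widened broadcast splat value into the
// original element width, least-significant element first (x86 is
// little-endian), which reproduces the new asm-comment data.
#include <cstdint>
#include <iostream>
#include <vector>

static std::vector<uint64_t> splitSplat(uint64_t Splat, unsigned SplatBits,
                                        unsigned EltBits) {
  std::vector<uint64_t> Elts;
  uint64_t Mask = (EltBits >= 64) ? ~0ULL : ((1ULL << EltBits) - 1);
  for (unsigned Lo = 0; Lo != SplatBits; Lo += EltBits)
    Elts.push_back((Splat >> Lo) & Mask);
  return Elts;
}

int main() {
  // vpbroadcastd of 255 as i8 data -> 255 0 0 0
  for (uint64_t E : splitSplat(255, 32, 8))
    std::cout << E << ' ';
  std::cout << '\n';
  // vpbroadcastd of 4294967040 as i8 data -> 0 255 255 255
  for (uint64_t E : splitSplat(4294967040ULL, 32, 8))
    std::cout << E << ' ';
  std::cout << '\n';
}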
; AVX2-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],xmm6[2,3]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %ymm8, %ymm1, %ymm9
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm6 = [17179869184,17179869184,17179869184,17179869184]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,4,0,4,0,4,0,4]
; AVX2-NEXT: vpermd %ymm9, %ymm6, %ymm9
; AVX2-NEXT: vpshufb %ymm8, %ymm0, %ymm8
; AVX2-NEXT: vpermd %ymm8, %ymm6, %ymm8
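For the permute-index constant above, 17179869184 = 4 * 2^32 = 0x400000000, whose little-endian i32 elements are [0,4]; broadcasting that qword across the ymm register gives the [0,4,0,4,0,4,0,4] index data now shown in the comment, which is much easier to read against the vpermd lane selection than the raw 64-bit value.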
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1
; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm2 = [4294967040,4294967040,4294967040,4294967040,4294967040,4294967040,4294967040,4294967040]
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512F-NEXT: vpbroadcastd %xmm0, %ymm3
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1
; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
-; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} ymm2 = [4294967040,4294967040,4294967040,4294967040,4294967040,4294967040,4294967040,4294967040]
+; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
; AVX512DQ-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512DQ-NEXT: vpbroadcastd %xmm0, %ymm3
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1
; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
-; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360]
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512F-NEXT: vpbroadcastq %xmm0, %ymm3
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1
; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
-; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360]
+; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; AVX512DQ-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512DQ-NEXT: vpbroadcastq %xmm0, %ymm3
; AVX512F-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm0
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294967040,4294967040,4294967040,4294967040,4294967040,4294967040,4294967040,4294967040]
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
; AVX512F-NEXT: vpternlogd $202, (%rdi){1to8}, %ymm0, %ymm1
; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512DQ-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm0
-; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294967040,4294967040,4294967040,4294967040,4294967040,4294967040,4294967040,4294967040]
+; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} ymm1 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
; AVX512DQ-NEXT: vpternlogd $202, (%rdi){1to8}, %ymm0, %ymm1
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512F-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm0
-; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360]
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpternlogq $202, (%rdi){1to4}, %ymm0, %ymm1
; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512DQ-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm0
-; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360]
+; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; AVX512DQ-NEXT: vpternlogq $202, (%rdi){1to4}, %ymm0, %ymm1
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm2 = [4294967040,4294967040,4294967040,4294967040,4294967040,4294967040,4294967040,4294967040]
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vpbroadcastd %xmm0, %ymm3
; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm2
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1
-; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} ymm2 = [4294967040,4294967040,4294967040,4294967040,4294967040,4294967040,4294967040,4294967040]
+; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
; AVX512DQ-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX512DQ-NEXT: vpbroadcastd %xmm0, %ymm3
; AVX512DQ-NEXT: vpandn %ymm3, %ymm2, %ymm2
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1
-; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360]
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vpbroadcastq %xmm0, %ymm3
; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm2
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1
-; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360]
+; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; AVX512DQ-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX512DQ-NEXT: vpbroadcastq %xmm0, %ymm3
; AVX512DQ-NEXT: vpandn %ymm3, %ymm2, %ymm2