Adapt the existing ANY/ZERO_EXTEND_VECTOR_INREG shuffle matching to also recognise SIGN_EXTEND_VECTOR_INREG patterns to handle cases where we're effectively "splatting" all-signbits sources.
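For illustration only (not part of the patch): a case in the spirit of the mask_replication_factor2_vf4 test updated below is a shuffle that replicates each lane of a source whose lanes are known to be all sign bits. The function name here is made up and the exact instruction chosen depends on the subtarget, but with this change the replication shuffle can be matched as SIGN_EXTEND_VECTOR_INREG and, on the AVX512 targets in the checks below, emitted as vpmovsxdq rather than a vpshufd/vpermd sequence:

define <8 x i32> @replicate_signbits(<4 x i1> %mask, <8 x i32> %v) {
  ; Once the <4 x i1> mask is materialised as an all-signbits <4 x i32>,
  ; repeating each lane (mask <0,0,1,1,2,2,3,3>) is an in-place sign extension.
  %rep = shufflevector <4 x i1> %mask, <4 x i1> poison, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
  %sel = select <8 x i1> %rep, <8 x i32> %v, <8 x i32> zeroinitializer
  ret <8 x i32> %sel
}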
}
/// Return true if every element in Mask is the undef sentinel value or equal to
-/// the specified value..
+/// the specified value.
static bool isUndefOrEqual(ArrayRef<int> Mask, int CmpVal) {
return llvm::all_of(Mask, [CmpVal](int M) {
return (M == SM_SentinelUndef) || (M == CmpVal);
});
}
+/// Return true if every element in Mask, beginning from position Pos and ending
+/// in Pos+Size, is the undef sentinel value or equal to the specified value.
+static bool isUndefOrEqualInRange(ArrayRef<int> Mask, int CmpVal, unsigned Pos,
+ unsigned Size) {
+ return llvm::all_of(Mask.slice(Pos, Size),
+ [CmpVal](int M) { return isUndefOrEqual(M, CmpVal); });
+}
+
/// Val is either the undef or zero sentinel value.
static bool isUndefOrZero(int Val) {
return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
}
}
- // Match against a ANY/ZERO_EXTEND_VECTOR_INREG instruction.
+ // Match against an ANY/SIGN/ZERO_EXTEND_VECTOR_INREG instruction.
// TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
(MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
unsigned MaxScale = 64 / MaskEltSize;
+ bool UseSign = V1.getScalarValueSizeInBits() == MaskEltSize &&
+ DAG.ComputeNumSignBits(V1) == MaskEltSize;
for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
bool MatchAny = true;
bool MatchZero = true;
+ bool MatchSign = UseSign;
unsigned NumDstElts = NumMaskElts / Scale;
- for (unsigned i = 0; i != NumDstElts && (MatchAny || MatchZero); ++i) {
+ for (unsigned i = 0;
+ i != NumDstElts && (MatchAny || MatchSign || MatchZero); ++i) {
if (!isUndefOrEqual(Mask[i * Scale], (int)i)) {
- MatchAny = MatchZero = false;
+ MatchAny = MatchSign = MatchZero = false;
break;
}
- MatchAny &= isUndefInRange(Mask, (i * Scale) + 1, Scale - 1);
- MatchZero &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
- }
- if (MatchAny || MatchZero) {
- assert(MatchZero && "Failed to match zext but matched aext?");
+ unsigned Pos = (i * Scale) + 1;
+ unsigned Len = Scale - 1;
+ MatchAny &= isUndefInRange(Mask, Pos, Len);
+ MatchZero &= isUndefOrZeroInRange(Mask, Pos, Len);
+ MatchSign &= isUndefOrEqualInRange(Mask, (int)i, Pos, Len);
+ }
+ if (MatchAny || MatchSign || MatchZero) {
+ assert((MatchSign || MatchZero) &&
+ "Failed to match sext/zext but matched aext?");
unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
- MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType() :
- MVT::getIntegerVT(MaskEltSize);
+ MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType()
+ : MVT::getIntegerVT(MaskEltSize);
SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);
- Shuffle = unsigned(MatchAny ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND);
+ Shuffle = unsigned(
+ MatchAny ? ISD::ANY_EXTEND
+ : (MatchSign ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND));
if (SrcVT.getVectorNumElements() != NumDstElts)
Shuffle = DAG.getOpcode_EXTEND_VECTOR_INREG(Shuffle);
}
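Informally, with Scale = 2 the loop above now also accepts a mask such as <0,0,1,1,2,2,3,3>: each even element equals its destination index i, and each odd element is undef, zero, or (the new case) equal to i again. The repeated-index case only sets MatchSign when UseSign holds, i.e. ComputeNumSignBits proved every element of V1 is all sign bits, in which case duplicating a lane produces the same bits as sign-extending it to twice the width.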
define <2 x i1> @t3_wide(<2 x i64> %X) nounwind {
-; CHECK-SSE-LABEL: t3_wide:
-; CHECK-SSE: # %bb.0:
-; CHECK-SSE-NEXT: movdqa {{.*#+}} xmm1 = [12297829382473034411,12297829382473034411]
-; CHECK-SSE-NEXT: movdqa %xmm0, %xmm2
-; CHECK-SSE-NEXT: pmuludq %xmm1, %xmm2
-; CHECK-SSE-NEXT: movdqa %xmm0, %xmm3
-; CHECK-SSE-NEXT: psrlq $32, %xmm3
-; CHECK-SSE-NEXT: pmuludq %xmm1, %xmm3
-; CHECK-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE-NEXT: paddq %xmm3, %xmm0
-; CHECK-SSE-NEXT: psllq $32, %xmm0
-; CHECK-SSE-NEXT: paddq %xmm2, %xmm0
-; CHECK-SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; CHECK-SSE-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE-NEXT: pand %xmm2, %xmm1
-; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; CHECK-SSE-NEXT: por %xmm1, %xmm0
-; CHECK-SSE-NEXT: pcmpeqd %xmm1, %xmm1
-; CHECK-SSE-NEXT: pxor %xmm0, %xmm1
-; CHECK-SSE-NEXT: movq {{.*#+}} xmm0 = xmm1[0],zero
-; CHECK-SSE-NEXT: retq
+; CHECK-SSE2-LABEL: t3_wide:
+; CHECK-SSE2: # %bb.0:
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [12297829382473034411,12297829382473034411]
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3
+; CHECK-SSE2-NEXT: psrlq $32, %xmm3
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: paddq %xmm3, %xmm0
+; CHECK-SSE2-NEXT: psllq $32, %xmm0
+; CHECK-SSE2-NEXT: paddq %xmm2, %xmm0
+; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
+; CHECK-SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pand %xmm2, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: por %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm1
+; CHECK-SSE2-NEXT: pxor %xmm0, %xmm1
+; CHECK-SSE2-NEXT: movq {{.*#+}} xmm0 = xmm1[0],zero
+; CHECK-SSE2-NEXT: retq
+;
+; CHECK-SSE41-LABEL: t3_wide:
+; CHECK-SSE41: # %bb.0:
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [12297829382473034411,12297829382473034411]
+; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm2
+; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm2
+; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm3
+; CHECK-SSE41-NEXT: psrlq $32, %xmm3
+; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm3
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: paddq %xmm3, %xmm0
+; CHECK-SSE41-NEXT: psllq $32, %xmm0
+; CHECK-SSE41-NEXT: paddq %xmm2, %xmm0
+; CHECK-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmovsxdq %xmm0, %xmm2
+; CHECK-SSE41-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE41-NEXT: pand %xmm2, %xmm1
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT: por %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm1
+; CHECK-SSE41-NEXT: pxor %xmm0, %xmm1
+; CHECK-SSE41-NEXT: movq {{.*#+}} xmm0 = xmm1[0],zero
+; CHECK-SSE41-NEXT: retq
;
; CHECK-AVX1-LABEL: t3_wide:
; CHECK-AVX1: # %bb.0:
; SSE41-NEXT: movdqa %xmm3, %xmm4
; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
+; SSE41-NEXT: pmovsxdq %xmm3, %xmm0
; SSE41-NEXT: pand %xmm4, %xmm0
; SSE41-NEXT: por %xmm3, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
; SSE41-NEXT: movdqa %xmm4, %xmm5
; SSE41-NEXT: pcmpeqd %xmm3, %xmm5
; SSE41-NEXT: pcmpgtd %xmm3, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
+; SSE41-NEXT: pmovsxdq %xmm4, %xmm0
; SSE41-NEXT: pand %xmm5, %xmm0
; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm4
; SSE41-NEXT: pcmpeqd %xmm5, %xmm4
; SSE41-NEXT: pcmpgtd %xmm5, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
+; SSE41-NEXT: pmovsxdq %xmm2, %xmm0
; SSE41-NEXT: pand %xmm4, %xmm0
; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1
; SSE41-NEXT: movdqa %xmm2, %xmm3
; SSE41-NEXT: pcmpeqd %xmm9, %xmm3
; SSE41-NEXT: pcmpgtd %xmm9, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
+; SSE41-NEXT: pmovsxdq %xmm2, %xmm0
; SSE41-NEXT: pand %xmm3, %xmm0
; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1
; SSE41-NEXT: movdqa %xmm3, %xmm4
; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
+; SSE41-NEXT: pmovsxdq %xmm3, %xmm0
; SSE41-NEXT: pand %xmm4, %xmm0
; SSE41-NEXT: por %xmm3, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
; SSE41-NEXT: movdqa %xmm3, %xmm4
; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
+; SSE41-NEXT: pmovsxdq %xmm3, %xmm0
; SSE41-NEXT: pand %xmm4, %xmm0
; SSE41-NEXT: por %xmm3, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
; SSE41-NEXT: movdqa %xmm5, %xmm2
; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
; SSE41-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
+; SSE41-NEXT: pmovsxdq %xmm5, %xmm0
; SSE41-NEXT: pand %xmm2, %xmm0
; SSE41-NEXT: por %xmm5, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1
; SSE41-NEXT: movdqa %xmm9, %xmm2
; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
+; SSE41-NEXT: pmovsxdq %xmm9, %xmm0
; SSE41-NEXT: pand %xmm2, %xmm0
; SSE41-NEXT: por %xmm9, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1
; SSE41-NEXT: movdqa %xmm3, %xmm4
; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
+; SSE41-NEXT: pmovsxdq %xmm3, %xmm0
; SSE41-NEXT: pand %xmm4, %xmm0
; SSE41-NEXT: por %xmm3, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
; SSE41-NEXT: movdqa %xmm4, %xmm5
; SSE41-NEXT: pcmpeqd %xmm3, %xmm5
; SSE41-NEXT: pcmpgtd %xmm3, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
+; SSE41-NEXT: pmovsxdq %xmm4, %xmm0
; SSE41-NEXT: pand %xmm5, %xmm0
; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm4
; SSE41-NEXT: pcmpeqd %xmm5, %xmm4
; SSE41-NEXT: pcmpgtd %xmm5, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
+; SSE41-NEXT: pmovsxdq %xmm2, %xmm0
; SSE41-NEXT: pand %xmm4, %xmm0
; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1
; SSE41-NEXT: movdqa %xmm2, %xmm3
; SSE41-NEXT: pcmpeqd %xmm9, %xmm3
; SSE41-NEXT: pcmpgtd %xmm9, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
+; SSE41-NEXT: pmovsxdq %xmm2, %xmm0
; SSE41-NEXT: pand %xmm3, %xmm0
; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1
; SSE41-NEXT: movdqa %xmm3, %xmm4
; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
+; SSE41-NEXT: pmovsxdq %xmm3, %xmm0
; SSE41-NEXT: pand %xmm4, %xmm0
; SSE41-NEXT: por %xmm3, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
; SSE41-NEXT: movdqa %xmm3, %xmm4
; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
+; SSE41-NEXT: pmovsxdq %xmm3, %xmm0
; SSE41-NEXT: pand %xmm4, %xmm0
; SSE41-NEXT: por %xmm3, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
; SSE41-NEXT: movdqa %xmm5, %xmm2
; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
; SSE41-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
+; SSE41-NEXT: pmovsxdq %xmm5, %xmm0
; SSE41-NEXT: pand %xmm2, %xmm0
; SSE41-NEXT: por %xmm5, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1
; SSE41-NEXT: movdqa %xmm9, %xmm2
; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
+; SSE41-NEXT: pmovsxdq %xmm9, %xmm0
; SSE41-NEXT: pand %xmm2, %xmm0
; SSE41-NEXT: por %xmm9, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1
; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512F-ONLY-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512F-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX512F-ONLY-NEXT: vpmovsxdq %xmm0, %xmm0
; AVX512F-ONLY-NEXT: vptestmd %xmm0, %xmm0, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %xmm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vmovdqa %xmm0, (%rdx)
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: kmovw (%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %xmm0
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX512DQ-NEXT: vpmovsxdq %xmm0, %xmm0
; AVX512DQ-NEXT: vpmovd2m %xmm0, %k1
; AVX512DQ-NEXT: vmovdqa32 (%rsi), %xmm0 {%k1} {z}
; AVX512DQ-NEXT: vmovdqa %xmm0, (%rdx)
; AVX512BW-NEXT: kmovq (%rdi), %k1
; AVX512BW-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512BW-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX512BW-NEXT: vpmovsxdq %xmm0, %xmm0
; AVX512BW-NEXT: vptestmd %xmm0, %xmm0, %k1
; AVX512BW-NEXT: vmovdqa32 (%rsi), %xmm0 {%k1} {z}
; AVX512BW-NEXT: vmovdqa %xmm0, (%rdx)
}
define void @mask_replication_factor2_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
-; AVX512F-SLOW-LABEL: mask_replication_factor2_vf4:
-; AVX512F-SLOW: # %bb.0:
-; AVX512F-SLOW-NEXT: kmovw (%rdi), %k1
-; AVX512F-SLOW-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
-; AVX512F-SLOW-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
-; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
-; AVX512F-SLOW-NEXT: vptestmd %ymm0, %ymm0, %k1
-; AVX512F-SLOW-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z}
-; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rdx)
-; AVX512F-SLOW-NEXT: vzeroupper
-; AVX512F-SLOW-NEXT: retq
-;
-; AVX512F-FAST-LABEL: mask_replication_factor2_vf4:
-; AVX512F-FAST: # %bb.0:
-; AVX512F-FAST-NEXT: kmovw (%rdi), %k1
-; AVX512F-FAST-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
-; AVX512F-FAST-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3]
-; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX512F-FAST-NEXT: vptestmd %ymm0, %ymm0, %k1
-; AVX512F-FAST-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z}
-; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rdx)
-; AVX512F-FAST-NEXT: vzeroupper
-; AVX512F-FAST-NEXT: retq
-;
-; AVX512DQ-SLOW-LABEL: mask_replication_factor2_vf4:
-; AVX512DQ-SLOW: # %bb.0:
-; AVX512DQ-SLOW-NEXT: kmovw (%rdi), %k0
-; AVX512DQ-SLOW-NEXT: vpmovm2d %k0, %ymm0
-; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
-; AVX512DQ-SLOW-NEXT: vpmovd2m %ymm0, %k1
-; AVX512DQ-SLOW-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z}
-; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, (%rdx)
-; AVX512DQ-SLOW-NEXT: vzeroupper
-; AVX512DQ-SLOW-NEXT: retq
-;
-; AVX512DQ-FAST-LABEL: mask_replication_factor2_vf4:
-; AVX512DQ-FAST: # %bb.0:
-; AVX512DQ-FAST-NEXT: kmovw (%rdi), %k0
-; AVX512DQ-FAST-NEXT: vpmovm2d %k0, %ymm0
-; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3]
-; AVX512DQ-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX512DQ-FAST-NEXT: vpmovd2m %ymm0, %k1
-; AVX512DQ-FAST-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z}
-; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rdx)
-; AVX512DQ-FAST-NEXT: vzeroupper
-; AVX512DQ-FAST-NEXT: retq
-;
-; AVX512BW-SLOW-LABEL: mask_replication_factor2_vf4:
-; AVX512BW-SLOW: # %bb.0:
-; AVX512BW-SLOW-NEXT: kmovq (%rdi), %k1
-; AVX512BW-SLOW-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
-; AVX512BW-SLOW-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
-; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
-; AVX512BW-SLOW-NEXT: vptestmd %ymm0, %ymm0, %k1
-; AVX512BW-SLOW-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z}
-; AVX512BW-SLOW-NEXT: vmovdqa %ymm0, (%rdx)
-; AVX512BW-SLOW-NEXT: vzeroupper
-; AVX512BW-SLOW-NEXT: retq
-;
-; AVX512BW-FAST-LABEL: mask_replication_factor2_vf4:
-; AVX512BW-FAST: # %bb.0:
-; AVX512BW-FAST-NEXT: kmovq (%rdi), %k1
-; AVX512BW-FAST-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
-; AVX512BW-FAST-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3]
-; AVX512BW-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX512BW-FAST-NEXT: vptestmd %ymm0, %ymm0, %k1
-; AVX512BW-FAST-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z}
-; AVX512BW-FAST-NEXT: vmovdqa %ymm0, (%rdx)
-; AVX512BW-FAST-NEXT: vzeroupper
-; AVX512BW-FAST-NEXT: retq
+; AVX512F-ONLY-LABEL: mask_replication_factor2_vf4:
+; AVX512F-ONLY: # %bb.0:
+; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
+; AVX512F-ONLY-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512F-ONLY-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512F-ONLY-NEXT: vpmovsxdq %xmm0, %ymm0
+; AVX512F-ONLY-NEXT: vptestmd %ymm0, %ymm0, %k1
+; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z}
+; AVX512F-ONLY-NEXT: vmovdqa %ymm0, (%rdx)
+; AVX512F-ONLY-NEXT: vzeroupper
+; AVX512F-ONLY-NEXT: retq
;
-; AVX512VBMI-SLOW-LABEL: mask_replication_factor2_vf4:
-; AVX512VBMI-SLOW: # %bb.0:
-; AVX512VBMI-SLOW-NEXT: kmovq (%rdi), %k1
-; AVX512VBMI-SLOW-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
-; AVX512VBMI-SLOW-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
-; AVX512VBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX512VBMI-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
-; AVX512VBMI-SLOW-NEXT: vptestmd %ymm0, %ymm0, %k1
-; AVX512VBMI-SLOW-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z}
-; AVX512VBMI-SLOW-NEXT: vmovdqa %ymm0, (%rdx)
-; AVX512VBMI-SLOW-NEXT: vzeroupper
-; AVX512VBMI-SLOW-NEXT: retq
+; AVX512DQ-LABEL: mask_replication_factor2_vf4:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: kmovw (%rdi), %k0
+; AVX512DQ-NEXT: vpmovm2d %k0, %ymm0
+; AVX512DQ-NEXT: vpmovsxdq %xmm0, %ymm0
+; AVX512DQ-NEXT: vpmovd2m %ymm0, %k1
+; AVX512DQ-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z}
+; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx)
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
;
-; AVX512VBMI-FAST-LABEL: mask_replication_factor2_vf4:
-; AVX512VBMI-FAST: # %bb.0:
-; AVX512VBMI-FAST-NEXT: kmovq (%rdi), %k1
-; AVX512VBMI-FAST-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
-; AVX512VBMI-FAST-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
-; AVX512VBMI-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3]
-; AVX512VBMI-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX512VBMI-FAST-NEXT: vptestmd %ymm0, %ymm0, %k1
-; AVX512VBMI-FAST-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z}
-; AVX512VBMI-FAST-NEXT: vmovdqa %ymm0, (%rdx)
-; AVX512VBMI-FAST-NEXT: vzeroupper
-; AVX512VBMI-FAST-NEXT: retq
+; AVX512BW-LABEL: mask_replication_factor2_vf4:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: kmovq (%rdi), %k1
+; AVX512BW-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512BW-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512BW-NEXT: vpmovsxdq %xmm0, %ymm0
+; AVX512BW-NEXT: vptestmd %ymm0, %ymm0, %k1
+; AVX512BW-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z}
+; AVX512BW-NEXT: vmovdqa %ymm0, (%rdx)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
%src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
%src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%tgt.mask = shufflevector <4 x i1> %src.mask, <4 x i1> poison, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
; AVX512F-SLOW-NEXT: kmovw (%rdi), %k1
; AVX512F-SLOW-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512F-SLOW-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
-; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX512F-SLOW-NEXT: vpmovsxdq %xmm0, %xmm0
; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
; AVX512F-SLOW-NEXT: vptestmd %ymm0, %ymm0, %k1
; AVX512F-SLOW-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z}
; AVX512DQ-SLOW: # %bb.0:
; AVX512DQ-SLOW-NEXT: kmovw (%rdi), %k0
; AVX512DQ-SLOW-NEXT: vpmovm2d %k0, %ymm0
-; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX512DQ-SLOW-NEXT: vpmovsxdq %xmm0, %xmm0
; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
; AVX512DQ-SLOW-NEXT: vpmovd2m %ymm0, %k1
; AVX512DQ-SLOW-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z}
; AVX512BW-SLOW-NEXT: kmovq (%rdi), %k1
; AVX512BW-SLOW-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512BW-SLOW-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
-; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX512BW-SLOW-NEXT: vpmovsxdq %xmm0, %xmm0
; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
; AVX512BW-SLOW-NEXT: vptestmd %ymm0, %ymm0, %k1
; AVX512BW-SLOW-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z}
; AVX512VBMI-SLOW-NEXT: kmovq (%rdi), %k1
; AVX512VBMI-SLOW-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512VBMI-SLOW-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
-; AVX512VBMI-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX512VBMI-SLOW-NEXT: vpmovsxdq %xmm0, %xmm0
; AVX512VBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
; AVX512VBMI-SLOW-NEXT: vptestmd %ymm0, %ymm0, %k1
; AVX512VBMI-SLOW-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z}