lowerShuffleWithVPMOV currently only matches shuffle(truncate(x)) patterns, but on VLX targets the truncate usually isn't necessary for the VPMOV node to be worthwhile (as we're only targeting v16i8/v8i16 shuffles, we almost always end up with a PSHUFB node instead). PACKSS/PACKUS are still preferred over VPMOV due to their lower uop count.
Fixes the remaining regression from the changes in rG293899c64b75.
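As a sketch of the kind of pattern this now handles directly (a hypothetical example for illustration, not one of the updated tests below): a half-width even-element shuffle with a zeroable upper half has no ISD::TRUNCATE to match, but on VLX it can still lower to a single VPMOVDW.

  define <8 x i16> @shuffle_as_trunc(<8 x i16> %x) {
    ; Keep even elements 0,2,4,6 and zero the upper half; this is exactly
    ; what 128-bit vpmovdw produces when %x is reinterpreted as v4i32.
    %r = shufflevector <8 x i16> %x, <8 x i16> zeroinitializer,
         <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 8, i32 8, i32 8>
    ret <8 x i16> %r
  }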
unsigned EltSizeInBits = VT.getScalarSizeInBits();
unsigned MaxScale = 64 / EltSizeInBits;
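+ // Try each pow2 truncation scale in turn (source elements up to 64 bits wide).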
for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
+ unsigned SrcEltBits = EltSizeInBits * Scale;
unsigned NumSrcElts = NumElts / Scale;
unsigned UpperElts = NumElts - NumSrcElts;
if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
!Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
continue;
+ // Attempt to find a matching source truncation, but as a fallback VLX
+ // cases can use the VPMOV directly.
SDValue Src = peekThroughBitcasts(V1);
- if (Src.getOpcode() != ISD::TRUNCATE ||
- Src.getScalarValueSizeInBits() != (EltSizeInBits * Scale))
+ if (Src.getOpcode() == ISD::TRUNCATE &&
+ Src.getScalarValueSizeInBits() == SrcEltBits) {
+ Src = Src.getOperand(0);
+ } else if (Subtarget.hasVLX()) {
+ MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
+ MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
+ Src = DAG.getBitcast(SrcVT, Src);
+ // Don't do this if PACKSS/PACKUS could perform it more cheaply.
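+ // (PACKSS performs the truncation when the source has more than
+ // EltSizeInBits sign bits; PACKUS when the upper EltSizeInBits are zero.)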
+ if (Scale == 2 &&
+ ((DAG.ComputeNumSignBits(Src) > EltSizeInBits) ||
+ (DAG.computeKnownBits(Src).countMinLeadingZeros() >= EltSizeInBits)))
+ return SDValue();
+ } else
return SDValue();
- Src = Src.getOperand(0);
// VPMOVWB is only available with avx512bw.
- MVT SrcVT = Src.getSimpleValueType();
- if (SrcVT.getVectorElementType() == MVT::i16 && VT == MVT::v16i8 &&
- !Subtarget.hasBWI())
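+ // (source elements narrower than 32 bits would need VPMOVWB).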
+ if (!Subtarget.hasBWI() && Src.getScalarValueSizeInBits() < 32)
return SDValue();
bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts);
;
; SKX-LABEL: trunc_qw_128:
; SKX: ## %bb.0:
-; SKX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
+; SKX-NEXT: vpmovqw %xmm0, %xmm0
; SKX-NEXT: retq
%x = trunc <2 x i64> %i to <2 x i16>
ret <2 x i16> %x
define <2 x half> @test_u1tofp2(<2 x i1> %arg0) {
; CHECK-LABEL: test_u1tofp2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; CHECK-NEXT: vpmovqw %xmm0, %xmm0
; CHECK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-NEXT: vcvtuw2ph %xmm0, %xmm0
; CHECK-NEXT: retq
; AVX512-LABEL: vf4:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-NEXT: vpmovdw %xmm0, %xmm1
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vmovq %xmm1, (%rsi)
-; AVX512-NEXT: vmovq %xmm0, (%rdx)
+; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpmovdw %xmm0, (%rsi)
+; AVX512-NEXT: vmovq %xmm1, (%rdx)
; AVX512-NEXT: retq
%wide.vec = load <8 x i16>, ptr %in.vec, align 32
; AVX512-LABEL: vf2:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-NEXT: vpmovqw %xmm0, %xmm1
-; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[2,0,2,3,4,5,6,7]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
-; AVX512-NEXT: vmovd %xmm1, (%rsi)
-; AVX512-NEXT: vmovd %xmm2, (%rdx)
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,0,2,3,4,5,6,7]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
+; AVX512-NEXT: vpmovqw %xmm0, (%rsi)
+; AVX512-NEXT: vmovd %xmm1, (%rdx)
; AVX512-NEXT: vmovd %xmm3, (%rcx)
-; AVX512-NEXT: vmovd %xmm0, (%r8)
+; AVX512-NEXT: vmovd %xmm2, (%r8)
; AVX512-NEXT: retq
%wide.vec = load <8 x i16>, ptr %in.vec, align 32
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX-NEXT: retq
;
-; AVX512-LABEL: rot16_trunc:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsrld $11, %xmm0, %xmm1
-; AVX512-NEXT: vpslld $5, %xmm0, %xmm0
-; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX512-NEXT: retq
+; AVX512NOVLX-LABEL: rot16_trunc:
+; AVX512NOVLX: # %bb.0:
+; AVX512NOVLX-NEXT: vpsrld $11, %xmm0, %xmm1
+; AVX512NOVLX-NEXT: vpslld $5, %xmm0, %xmm0
+; AVX512NOVLX-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX512NOVLX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX512NOVLX-NEXT: retq
+;
+; AVX512VLX-LABEL: rot16_trunc:
+; AVX512VLX: # %bb.0:
+; AVX512VLX-NEXT: vpsrld $11, %xmm0, %xmm1
+; AVX512VLX-NEXT: vpslld $5, %xmm0, %xmm0
+; AVX512VLX-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX512VLX-NEXT: vpmovdw %xmm0, %xmm0
+; AVX512VLX-NEXT: retq
;
; XOP-LABEL: rot16_trunc:
; XOP: # %bb.0: