return true;
}
+/// Return true if Val falls within the specified range [Low, Hi).
+static bool isInRange(int Val, int Low, int Hi) {
+ return (Val >= Low && Val < Hi);
+}
+
+/// Return true if the value of any element in Mask falls within the specified
+/// range [Low, Hi).
+static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
+ for (int M : Mask)
+ if (isInRange(M, Low, Hi))
+ return true;
+ return false;
+}
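For reference, a minimal standalone sketch of these helpers (plain std::vector instead of ArrayRef, the names reused purely for illustration) showing the half-open [Low, Hi) semantics:

#include <cassert>
#include <vector>

static bool isInRange(int Val, int Low, int Hi) {
  // Lower bound included, upper bound excluded: [Low, Hi).
  return Val >= Low && Val < Hi;
}

static bool isAnyInRange(const std::vector<int> &Mask, int Low, int Hi) {
  for (int M : Mask)
    if (isInRange(M, Low, Hi))
      return true;
  return false;
}

int main() {
  assert(isInRange(0, 0, 4));            // 0 is inside [0, 4)
  assert(!isInRange(4, 0, 4));           // 4 is not: the upper bound is excluded
  assert(isAnyInRange({-1, 6}, 4, 8));   // 6 falls inside [4, 8)
  assert(!isAnyInRange({-1, 3}, 4, 8));  // no element falls inside [4, 8)
}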
+
/// Return true if Val is undef or if its value falls within the
/// specified range (L, H].
static bool isUndefOrInRange(int Val, int Low, int Hi) {
- return (Val == SM_SentinelUndef) || (Val >= Low && Val < Hi);
+ return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi);
}
/// Return true if every element in Mask is undef or if its value
/// Return true if Val is undef, zero or if its value falls within the
/// specified range (L, H].
static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
- return isUndefOrZero(Val) || (Val >= Low && Val < Hi);
+ return isUndefOrZero(Val) || isInRange(Val, Low, Hi);
}
/// Return true if every element in Mask is undef, zero or if its value
}
/// Return true if every element in Mask, beginning
-/// from position Pos and ending in Pos+Size, falls within the specified
-/// sequential range (Low, Low+Size]. or is undef.
-static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
- unsigned Pos, unsigned Size, int Low) {
- for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
+/// from position Pos and ending at Pos + Size - 1, either matches the
+/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
+static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
+ unsigned Size, int Low, int Step = 1) {
+ for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
if (!isUndefOrEqual(Mask[i], Low))
return false;
return true;
return SDValue();
}
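A minimal standalone sketch of the new Step parameter, assuming -1 stands in for the SM_SentinelUndef sentinel and with std::vector replacing ArrayRef so the snippet compiles on its own:

#include <cassert>
#include <vector>

// -1 stands in for SM_SentinelUndef in this sketch.
static bool isUndefOrEqualSketch(int Val, int CmpVal) {
  return Val == -1 || Val == CmpVal;
}

// Same shape as the patched helper: the expected value advances by Step
// (default 1) for each mask position.
static bool isSequentialOrUndefInRangeSketch(const std::vector<int> &Mask,
                                             unsigned Pos, unsigned Size,
                                             int Low, int Step = 1) {
  for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
    if (!isUndefOrEqualSketch(Mask[i], Low))
      return false;
  return true;
}

int main() {
  // With Step == 2 the low half of <0, 2, 4, 6, ...> matches Low = 0.
  std::vector<int> Mask = {0, 2, 4, 6, 12, 13, 14, 15};
  assert(isSequentialOrUndefInRangeSketch(Mask, 0, 4, /*Low=*/0, /*Step=*/2));
  // Undef (-1) entries are accepted anywhere in the sequence.
  std::vector<int> MaskWithUndef = {0, -1, 4, 6};
  assert(isSequentialOrUndefInRangeSketch(MaskWithUndef, 0, 4, 0, 2));
}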
+static bool matchVectorShuffleAsVPMOV(ArrayRef<int> Mask, bool SwappedOps,
+ int Delta) {
+ int Size = (int)Mask.size();
+ int Split = Size / Delta;
+ int TruncatedVectorStart = SwappedOps ? Size : 0;
+
+ // Match a mask that starts with e.g. <8, 10, 12, 14, ...> or <0, 2, 4, 6, ...>.
+ if (!isSequentialOrUndefInRange(Mask, 0, Split, TruncatedVectorStart, Delta))
+ return false;
+
+ // The rest of the mask should not refer to the truncated vector's elements.
+ if (isAnyInRange(Mask.slice(Split, Size - Split), TruncatedVectorStart,
+ TruncatedVectorStart + Size))
+ return false;
+
+ return true;
+}
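To illustrate what the matcher above accepts, here is a standalone re-implementation of the same logic (again std::vector and -1 for undef, not the LLVM code itself), applied to a v8i16 mask <0,2,4,6,12,13,14,15> whose elements 8-15 refer to the second, all-zeros shuffle operand:

#include <cassert>
#include <vector>

// Standalone sketch of the matching logic above; -1 stands in for undef.
static bool inRangeSketch(int V, int Lo, int Hi) { return V >= Lo && V < Hi; }

static bool matchAsVPMOVSketch(const std::vector<int> &Mask, bool SwappedOps,
                               int Delta) {
  int Size = (int)Mask.size();
  int Split = Size / Delta;
  int TruncatedVectorStart = SwappedOps ? Size : 0;
  // The first Size / Delta elements must pick every Delta-th element of the
  // truncated operand (or be undef).
  for (int i = 0; i != Split; ++i)
    if (Mask[i] != -1 && Mask[i] != TruncatedVectorStart + i * Delta)
      return false;
  // The remaining elements must not read from the truncated operand at all.
  for (int i = Split; i != Size; ++i)
    if (inRangeSketch(Mask[i], TruncatedVectorStart,
                      TruncatedVectorStart + Size))
      return false;
  return true;
}

int main() {
  std::vector<int> Mask = {0, 2, 4, 6, 12, 13, 14, 15};
  assert(matchAsVPMOVSketch(Mask, /*SwappedOps=*/false, /*Delta=*/2));
  // Delta == 4 would require the first quarter to be <0, 4>, so it fails.
  assert(!matchAsVPMOVSketch(Mask, /*SwappedOps=*/false, /*Delta=*/4));
}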
+
+// Try to lower a trunc+vector_shuffle to a single truncating VPMOV*
+// instruction, e.g. vpmovdb or vpmovdw.
+//
+// An example is the following:
+//
+// t0: ch = EntryToken
+// t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
+// t25: v4i32 = truncate t2
+// t41: v8i16 = bitcast t25
+// t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
+// Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
+// t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
+// t18: v2i64 = bitcast t51
+//
+// Without avx512vl, this is lowered to:
+//
+// vpmovqd %zmm0, %ymm0
+// vpshufb {{.*#+}} xmm0 =
+// xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+//
+// But when avx512vl is available, one can just use a single vpmovdw
+// instruction.
+static SDValue lowerVectorShuffleWithVPMOV(const SDLoc &DL, ArrayRef<int> Mask,
+ MVT VT, SDValue V1, SDValue V2,
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ if (VT != MVT::v16i8 && VT != MVT::v8i16)
+ return SDValue();
+
+ if (Mask.size() != VT.getVectorNumElements())
+ return SDValue();
+
+ bool SwappedOps = false;
+
+ if (!ISD::isBuildVectorAllZeros(V2.getNode())) {
+ if (!ISD::isBuildVectorAllZeros(V1.getNode()))
+ return SDValue();
+
+ std::swap(V1, V2);
+ SwappedOps = true;
+ }
+
+ // Look for:
+ //
+ // bitcast (truncate <8 x i32> %vec to <8 x i16>) to <16 x i8>
+ // bitcast (truncate <4 x i64> %vec to <4 x i32>) to <8 x i16>
+ //
+ // and similar ones.
+ if (V1.getOpcode() != ISD::BITCAST)
+ return SDValue();
+ if (V1.getOperand(0).getOpcode() != ISD::TRUNCATE)
+ return SDValue();
+
+ SDValue Src = V1.getOperand(0).getOperand(0);
+ MVT SrcVT = Src.getSimpleValueType();
+
+ // The VPMOV* truncating instructions that operate on 128-bit and 256-bit
+ // vectors are only available with avx512vl.
+ if (!SrcVT.is512BitVector() && !Subtarget.hasVLX())
+ return SDValue();
+
+ // Word-to-byte down-conversion is only available with avx512bw. The case with
+ // a 256-bit output doesn't contain a shuffle and is therefore not handled here.
+ if (SrcVT.getVectorElementType() == MVT::i16 && VT == MVT::v16i8 &&
+ !Subtarget.hasBWI())
+ return SDValue();
+
+ // The first half/quarter of the mask should refer to every second/fourth
+ // element of the truncated and bitcasted vector.
+ if (!matchVectorShuffleAsVPMOV(Mask, SwappedOps, 2) &&
+ !matchVectorShuffleAsVPMOV(Mask, SwappedOps, 4))
+ return SDValue();
+
+ return DAG.getNode(X86ISD::VTRUNC, DL, VT, Src);
+}
+
// X86 has dedicated pack instructions that can handle specific truncation
// operations: PACKSS and PACKUS.
static bool matchVectorShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1,
if (canonicalizeShuffleMaskWithCommute(Mask))
return DAG.getCommutedVectorShuffle(*SVOp);
+ if (SDValue V =
+ lowerVectorShuffleWithVPMOV(DL, Mask, VT, V1, V2, DAG, Subtarget))
+ return V;
+
// For each vector width, delegate to a specialized lowering routine.
if (VT.is128BitVector())
return lower128BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
;
; AVX512VL-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
;
; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0
-; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vpmovdw %ymm0, %xmm0
-; AVX512VBMIVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512VBMIVL-NEXT: vpmovdb %ymm0, %xmm0
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
%truncated.vec = trunc <8 x i32> %vec to <8 x i8>
;
; AVX512VL-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
;
; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0
-; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vpmovdw %ymm0, %xmm0
-; AVX512VBMIVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512VBMIVL-NEXT: vpmovdb %ymm0, %xmm0
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
%truncated = trunc <8 x i32> %vec to <8 x i8>
;
; AVX512VL-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
;
; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0
-; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vpmovdw %ymm0, %xmm0
-; AVX512VBMIVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512VBMIVL-NEXT: vpmovdb %ymm0, %xmm0
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
%truncated = trunc <8 x i32> %vec to <8 x i16>
;
; AVX512VL-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
;
; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0
-; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vpmovdw %ymm0, %xmm0
-; AVX512VBMIVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512VBMIVL-NEXT: vpmovdb %ymm0, %xmm0
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
%truncated = trunc <8 x i32> %vec to <8 x i8>
;
; AVX512VL-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovqd %ymm0, %xmm0
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT: vpmovqw %ymm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
;
; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpmovqd %ymm0, %xmm0
-; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BWVL-NEXT: vpmovqw %ymm0, %xmm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vpmovqd %ymm0, %xmm0
-; AVX512VBMIVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512VBMIVL-NEXT: vpmovqw %ymm0, %xmm0
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
%truncated = trunc <4 x i64> %vec to <4 x i16>
;
; AVX512VL-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovqd %ymm0, %xmm0
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT: vpmovqw %ymm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
;
; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpmovqd %ymm0, %xmm0
-; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BWVL-NEXT: vpmovqw %ymm0, %xmm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vpmovqd %ymm0, %xmm0
-; AVX512VBMIVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512VBMIVL-NEXT: vpmovqw %ymm0, %xmm0
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
%truncated = trunc <4 x i64> %vec to <4 x i16>
;
; AVX512VL-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovqd %ymm0, %xmm0
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT: vpmovqw %ymm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
;
; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpmovqd %ymm0, %xmm0
-; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BWVL-NEXT: vpmovqw %ymm0, %xmm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vpmovqd %ymm0, %xmm0
-; AVX512VBMIVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512VBMIVL-NEXT: vpmovqw %ymm0, %xmm0
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
%truncated = trunc <4 x i64> %vec to <4 x i32>
;
; AVX512VL-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovqd %ymm0, %xmm0
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT: vpmovqw %ymm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
;
; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpmovqd %ymm0, %xmm0
-; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BWVL-NEXT: vpmovqw %ymm0, %xmm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vpmovqd %ymm0, %xmm0
-; AVX512VBMIVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512VBMIVL-NEXT: vpmovqw %ymm0, %xmm0
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
%truncated = trunc <4 x i64> %vec to <4 x i16>
;
; AVX512VL-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovqd %ymm0, %xmm0
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[u],zero
+; AVX512VL-NEXT: vpmovqb %ymm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
;
; AVX512BWVL-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpmovqd %ymm0, %xmm0
-; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[u],zero
+; AVX512BWVL-NEXT: vpmovqb %ymm0, %xmm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vpmovqd %ymm0, %xmm0
-; AVX512VBMIVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[u],zero
+; AVX512VBMIVL-NEXT: vpmovqb %ymm0, %xmm0
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
%truncated = trunc <4 x i64> %vec to <4 x i8>