From 8b53fdd3b659283c3e048668e266945b44470771 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Sun, 3 May 2020 22:35:32 -0700 Subject: [PATCH] [X86] Custom legalize v16i64->v16i8 truncate with avx512. Default legalization will create two v8i64 truncs to v8i32, concat them to v16i32, and then truncate the rest of the way to v16i8. Instead we can truncate directly from v8i64 to v8i8 in the lower half of an xmm. Then concat the two halves using vpunpcklqdq. This is the same number of uops, but the dependency chain through the uops is better since the halves are merged at the end. I had to add SimplifyDemandedBits support for VTRUNC to prevent a regression on vector-trunc-math.ll. combineTruncatedArithmetic no longer gets a chance to shrink the vXi64 mul, so we were producing the v8i64 multiply sequence using multiple PMULUDQs. With the demanded bits fix we are able to prune out the extra ops, leaving just two PMULUDQs, one for each v8i64 half. This is twice the width of the 2 v8i32 PMULLDs we had before, but PMULUDQ is 1 uop and PMULLD is 2. We also save some truncates. It's probably worth using PMULUDQ even when PMULLQ is available, since the latter is 3 uops, but that will require a different change. Differential Revision: https://reviews.llvm.org/D79231 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 26 ++++- llvm/test/CodeGen/X86/vector-trunc-math.ll | 154 ++++++++++++++------------- llvm/test/CodeGen/X86/vector-trunc-packus.ll | 61 ++--------- llvm/test/CodeGen/X86/vector-trunc-ssat.ll | 58 ++-------- llvm/test/CodeGen/X86/vector-trunc-usat.ll | 52 ++------- 5 files changed, 129 insertions(+), 222 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 6d08546..ae279e2 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -1528,6 +1528,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::TRUNCATE, MVT::v8i32, Legal); setOperationAction(ISD::TRUNCATE, MVT::v16i16, Legal); setOperationAction(ISD::TRUNCATE, MVT::v32i8, HasBWI ? Legal : Custom); + setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom); @@ -20414,7 +20415,8 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { if (!TLI.isTypeLegal(InVT)) { if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) && VT.is128BitVector()) { - assert(Subtarget.hasVLX() && "Unexpected subtarget!"); + assert((InVT == MVT::v16i64 || Subtarget.hasVLX()) && + "Unexpected subtarget!"); // The default behavior is to truncate one step, concatenate, and then // truncate the remainder. We'd rather produce two 64-bit results and // concatenate those. @@ -36957,6 +36959,18 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode( unsigned BitWidth = OriginalDemandedBits.getBitWidth(); unsigned Opc = Op.getOpcode(); switch(Opc) { + case X86ISD::VTRUNC: { + KnownBits KnownOp; + SDValue Src = Op.getOperand(0); + MVT SrcVT = Src.getSimpleValueType(); + + // Simplify the input, using demanded bit information. 
+ APInt TruncMask = OriginalDemandedBits.zext(SrcVT.getScalarSizeInBits()); + APInt DemandedElts = OriginalDemandedElts.trunc(SrcVT.getVectorNumElements()); + if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, KnownOp, TLO, Depth + 1)) + return true; + break; + } case X86ISD::PMULDQ: case X86ISD::PMULUDQ: { // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element. @@ -43747,7 +43761,8 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, return combineVectorTruncation(N, DAG, Subtarget); } -static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG) { +static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { EVT VT = N->getValueType(0); SDValue In = N->getOperand(0); SDLoc DL(N); @@ -43757,6 +43772,11 @@ static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG) { if (auto USatVal = detectUSatPattern(In, VT, DAG, DL)) return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + APInt DemandedMask(APInt::getAllOnesValue(VT.getScalarSizeInBits())); + if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI)) + return SDValue(N, 0); + return SDValue(); } @@ -47495,7 +47515,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget); case ISD::FNEG: return combineFneg(N, DAG, DCI, Subtarget); case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget); - case X86ISD::VTRUNC: return combineVTRUNC(N, DAG); + case X86ISD::VTRUNC: return combineVTRUNC(N, DAG, DCI); case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget); case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget); case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget); diff --git a/llvm/test/CodeGen/X86/vector-trunc-math.ll b/llvm/test/CodeGen/X86/vector-trunc-math.ll index 80e33a7a..ecfc5e6 100644 --- a/llvm/test/CodeGen/X86/vector-trunc-math.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-math.ll @@ -300,12 +300,11 @@ define <16 x i8> @trunc_add_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; ; AVX512-LABEL: trunc_add_v16i64_v16i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vpaddq %zmm3, %zmm1, %zmm1 ; AVX512-NEXT: vpaddq %zmm2, %zmm0, %zmm0 -; AVX512-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vpaddq %zmm3, %zmm1, %zmm1 +; AVX512-NEXT: vpmovqb %zmm1, %xmm1 +; AVX512-NEXT: vpmovqb %zmm0, %xmm0 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = add <16 x i64> %a0, %a1 @@ -731,10 +730,9 @@ define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; ; AVX512-LABEL: trunc_add_const_v16i64_v16i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vpmovqb %zmm1, %xmm1 +; AVX512-NEXT: vpmovqb %zmm0, %xmm0 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -1146,12 +1144,11 @@ define <16 x i8> @trunc_sub_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; ; AVX512-LABEL: trunc_sub_v16i64_v16i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vpsubq %zmm3, %zmm1, %zmm1 ; AVX512-NEXT: vpsubq %zmm2, %zmm0, %zmm0 -; AVX512-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512-NEXT: vinserti64x4 
$1, %ymm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vpsubq %zmm3, %zmm1, %zmm1 +; AVX512-NEXT: vpmovqb %zmm1, %xmm1 +; AVX512-NEXT: vpmovqb %zmm0, %xmm0 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = sub <16 x i64> %a0, %a1 @@ -1545,10 +1542,9 @@ define <16 x i8> @trunc_sub_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; ; AVX512-LABEL: trunc_sub_const_v16i64_v16i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vpmovqb %zmm1, %xmm1 +; AVX512-NEXT: vpmovqb %zmm0, %xmm0 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -2079,38 +2075,31 @@ define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; ; AVX512F-LABEL: trunc_mul_v16i64_v16i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmovqd %zmm3, %ymm3 -; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512F-NEXT: vpmulld %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpmovqd %zmm2, %ymm2 -; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512F-NEXT: vpmulld %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vpmuludq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpmuludq %zmm3, %zmm1, %zmm1 +; AVX512F-NEXT: vpmovqb %zmm1, %xmm1 +; AVX512F-NEXT: vpmovqb %zmm0, %xmm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_mul_v16i64_v16i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovqd %zmm3, %ymm3 -; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512BW-NEXT: vpmulld %ymm3, %ymm1, %ymm1 -; AVX512BW-NEXT: vpmovqd %zmm2, %ymm2 -; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512BW-NEXT: vpmulld %ymm2, %ymm0, %ymm0 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: vpmuludq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmuludq %zmm3, %zmm1, %zmm1 +; AVX512BW-NEXT: vpmovqb %zmm1, %xmm1 +; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_mul_v16i64_v16i8: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmullq %zmm3, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmullq %zmm2, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vpmullq %zmm3, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpmovqb %zmm1, %xmm1 +; AVX512DQ-NEXT: vpmovqb %zmm0, %xmm0 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq %1 = mul <16 x i64> %a0, %a1 @@ -2587,16 +2576,35 @@ define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; -; AVX512-LABEL: trunc_mul_const_v16i64_v16i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512-NEXT: vpmulld {{.*}}(%rip), %ymm0, %ymm0 -; AVX512-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512-NEXT: vpmulld {{.*}}(%rip), %ymm1, %ymm1 -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512F-LABEL: trunc_mul_const_v16i64_v16i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: 
vpmuludq {{.*}}(%rip), %zmm0, %zmm0 +; AVX512F-NEXT: vpmuludq {{.*}}(%rip), %zmm1, %zmm1 +; AVX512F-NEXT: vpmovqb %zmm1, %xmm1 +; AVX512F-NEXT: vpmovqb %zmm0, %xmm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: trunc_mul_const_v16i64_v16i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpmuludq {{.*}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vpmuludq {{.*}}(%rip), %zmm1, %zmm1 +; AVX512BW-NEXT: vpmovqb %zmm1, %xmm1 +; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: trunc_mul_const_v16i64_v16i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpmullq {{.*}}(%rip), %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmullq {{.*}}(%rip), %zmm1, %zmm1 +; AVX512DQ-NEXT: vpmovqb %zmm1, %xmm1 +; AVX512DQ-NEXT: vpmovqb %zmm0, %xmm0 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq %1 = mul <16 x i64> %a0, %2 = trunc <16 x i64> %1 to <16 x i8> ret <16 x i8> %2 @@ -3024,12 +3032,11 @@ define <16 x i8> @trunc_and_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; ; AVX512-LABEL: trunc_and_v16i64_v16i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vpandq %zmm3, %zmm1, %zmm1 ; AVX512-NEXT: vpandq %zmm2, %zmm0, %zmm0 -; AVX512-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vpandq %zmm3, %zmm1, %zmm1 +; AVX512-NEXT: vpmovqb %zmm1, %xmm1 +; AVX512-NEXT: vpmovqb %zmm0, %xmm0 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = and <16 x i64> %a0, %a1 @@ -3396,10 +3403,9 @@ define <16 x i8> @trunc_and_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; ; AVX512-LABEL: trunc_and_const_v16i64_v16i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vpmovqb %zmm1, %xmm1 +; AVX512-NEXT: vpmovqb %zmm0, %xmm0 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -3789,12 +3795,11 @@ define <16 x i8> @trunc_xor_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; ; AVX512-LABEL: trunc_xor_v16i64_v16i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vpxorq %zmm3, %zmm1, %zmm1 ; AVX512-NEXT: vpxorq %zmm2, %zmm0, %zmm0 -; AVX512-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vpxorq %zmm3, %zmm1, %zmm1 +; AVX512-NEXT: vpmovqb %zmm1, %xmm1 +; AVX512-NEXT: vpmovqb %zmm0, %xmm0 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = xor <16 x i64> %a0, %a1 @@ -4161,10 +4166,9 @@ define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; ; AVX512-LABEL: trunc_xor_const_v16i64_v16i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vpmovqb %zmm1, %xmm1 +; AVX512-NEXT: vpmovqb %zmm0, %xmm0 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ 
-4554,12 +4558,11 @@ define <16 x i8> @trunc_or_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind ; ; AVX512-LABEL: trunc_or_v16i64_v16i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vporq %zmm3, %zmm1, %zmm1 ; AVX512-NEXT: vporq %zmm2, %zmm0, %zmm0 -; AVX512-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vporq %zmm3, %zmm1, %zmm1 +; AVX512-NEXT: vpmovqb %zmm1, %xmm1 +; AVX512-NEXT: vpmovqb %zmm0, %xmm0 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = or <16 x i64> %a0, %a1 @@ -4926,10 +4929,9 @@ define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; ; AVX512-LABEL: trunc_or_const_v16i64_v16i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vpmovqb %zmm1, %xmm1 +; AVX512-NEXT: vpmovqb %zmm0, %xmm0 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-trunc-packus.ll b/llvm/test/CodeGen/X86/vector-trunc-packus.ll index 05e1638..8a0310b 100644 --- a/llvm/test/CodeGen/X86/vector-trunc-packus.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-packus.ll @@ -5033,57 +5033,16 @@ define <16 x i8> @trunc_packus_v16i64_v16i8(<16 x i64>* %p0) "min-legal-vector-w ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512F-LABEL: trunc_packus_v16i64_v16i8: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm0 = [255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpminsq (%rdi), %zmm0, %zmm1 -; AVX512F-NEXT: vpminsq 64(%rdi), %zmm0, %zmm0 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpmaxsq %zmm2, %zmm0, %zmm0 -; AVX512F-NEXT: vpmaxsq %zmm2, %zmm1, %zmm1 -; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: trunc_packus_v16i64_v16i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vpmaxsq 64(%rdi), %zmm0, %zmm1 -; AVX512VL-NEXT: vpmovusqb %zmm1, %xmm1 -; AVX512VL-NEXT: vpmaxsq (%rdi), %zmm0, %zmm0 -; AVX512VL-NEXT: vpmovusqb %zmm0, %xmm0 -; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: trunc_packus_v16i64_v16i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm0 = [255,255,255,255,255,255,255,255] -; AVX512BW-NEXT: vpminsq (%rdi), %zmm0, %zmm1 -; AVX512BW-NEXT: vpminsq 64(%rdi), %zmm0, %zmm0 -; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512BW-NEXT: vpmaxsq %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmaxsq %zmm2, %zmm1, %zmm1 -; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: trunc_packus_v16i64_v16i8: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpmaxsq 64(%rdi), %zmm0, %zmm1 -; AVX512BWVL-NEXT: vpmovusqb %zmm1, %xmm1 -; AVX512BWVL-NEXT: vpmaxsq (%rdi), %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpmovusqb %zmm0, %xmm0 -; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} 
xmm0 = xmm0[0],xmm1[0] -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: trunc_packus_v16i64_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpmaxsq 64(%rdi), %zmm0, %zmm1 +; AVX512-NEXT: vpmovusqb %zmm1, %xmm1 +; AVX512-NEXT: vpmaxsq (%rdi), %zmm0, %zmm0 +; AVX512-NEXT: vpmovusqb %zmm0, %xmm0 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq ; ; SKX-LABEL: trunc_packus_v16i64_v16i8: ; SKX: # %bb.0: diff --git a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll index 8898d66..34ab8ac 100644 --- a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll @@ -4835,55 +4835,15 @@ define <16 x i8> @trunc_ssat_v16i64_v16i8(<16 x i64>* %p0) "min-legal-vector-wid ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512F-LABEL: trunc_ssat_v16i64_v16i8: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm0 = [127,127,127,127,127,127,127,127] -; AVX512F-NEXT: vpminsq (%rdi), %zmm0, %zmm1 -; AVX512F-NEXT: vpminsq 64(%rdi), %zmm0, %zmm0 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm2 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488] -; AVX512F-NEXT: vpmaxsq %zmm2, %zmm0, %zmm0 -; AVX512F-NEXT: vpmaxsq %zmm2, %zmm1, %zmm1 -; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: trunc_ssat_v16i64_v16i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512VL-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512VL-NEXT: vpmovsqb %zmm1, %xmm1 -; AVX512VL-NEXT: vpmovsqb %zmm0, %xmm0 -; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: trunc_ssat_v16i64_v16i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm0 = [127,127,127,127,127,127,127,127] -; AVX512BW-NEXT: vpminsq (%rdi), %zmm0, %zmm1 -; AVX512BW-NEXT: vpminsq 64(%rdi), %zmm0, %zmm0 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm2 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488] -; AVX512BW-NEXT: vpmaxsq %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmaxsq %zmm2, %zmm1, %zmm1 -; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: trunc_ssat_v16i64_v16i8: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BWVL-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BWVL-NEXT: vpmovsqb %zmm1, %xmm1 -; AVX512BWVL-NEXT: vpmovsqb %zmm0, %xmm0 -; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: trunc_ssat_v16i64_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512-NEXT: vpmovsqb %zmm1, %xmm1 +; AVX512-NEXT: vpmovsqb %zmm0, %xmm0 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq ; ; SKX-LABEL: trunc_ssat_v16i64_v16i8: ; SKX: # %bb.0: diff --git 
a/llvm/test/CodeGen/X86/vector-trunc-usat.ll b/llvm/test/CodeGen/X86/vector-trunc-usat.ll index 02af176..df691c5 100644 --- a/llvm/test/CodeGen/X86/vector-trunc-usat.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-usat.ll @@ -3543,49 +3543,15 @@ define <16 x i8> @trunc_usat_v16i64_v16i8(<16 x i64>* %p0) { ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512F-LABEL: trunc_usat_v16i64_v16i8: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm0 = [255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpminuq 64(%rdi), %zmm0, %zmm1 -; AVX512F-NEXT: vpminuq (%rdi), %zmm0, %zmm0 -; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: trunc_usat_v16i64_v16i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512VL-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512VL-NEXT: vpmovusqb %zmm1, %xmm1 -; AVX512VL-NEXT: vpmovusqb %zmm0, %xmm0 -; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: trunc_usat_v16i64_v16i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm0 = [255,255,255,255,255,255,255,255] -; AVX512BW-NEXT: vpminuq 64(%rdi), %zmm0, %zmm1 -; AVX512BW-NEXT: vpminuq (%rdi), %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: trunc_usat_v16i64_v16i8: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BWVL-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BWVL-NEXT: vpmovusqb %zmm1, %xmm1 -; AVX512BWVL-NEXT: vpmovusqb %zmm0, %xmm0 -; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: trunc_usat_v16i64_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512-NEXT: vpmovusqb %zmm1, %xmm1 +; AVX512-NEXT: vpmovusqb %zmm0, %xmm0 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq ; ; SKX-LABEL: trunc_usat_v16i64_v16i8: ; SKX: # %bb.0: -- 2.7.4
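
For reference, below is a minimal standalone sketch of the codegen change, lifted from the trunc_add_v16i64_v16i8 test in vector-trunc-math.ll above. The IR body is reconstructed from that hunk's context lines (the trunc/ret lines are inferred from the function signature and the matching const test), and the before/after sequences in the comments are taken from the removed and added AVX512 check lines in this patch; it is assumed to be run through llc with an AVX512F-or-later target, as in that test file's existing RUN lines.

; Reconstructed from the trunc_add_v16i64_v16i8 hunk above.
define <16 x i8> @trunc_add_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
  %1 = add <16 x i64> %a0, %a1
  %2 = trunc <16 x i64> %1 to <16 x i8>
  ret <16 x i8> %2
}

; AVX512 codegen before this patch: each v8i64 half is truncated to v8i32,
; the halves are concatenated to v16i32, then truncated to v16i8:
;   vpaddq       %zmm3, %zmm1, %zmm1
;   vpaddq       %zmm2, %zmm0, %zmm0
;   vpmovqd      %zmm0, %ymm0
;   vpmovqd      %zmm1, %ymm1
;   vinserti64x4 $1, %ymm1, %zmm0, %zmm0
;   vpmovdb      %zmm0, %xmm0
;
; AVX512 codegen after this patch: each v8i64 half is truncated directly to
; v8i8 in the low half of an xmm, and the two halves are merged at the end:
;   vpaddq       %zmm2, %zmm0, %zmm0
;   vpaddq       %zmm3, %zmm1, %zmm1
;   vpmovqb      %zmm1, %xmm1
;   vpmovqb      %zmm0, %xmm0
;   vpunpcklqdq  %xmm1, %xmm0, %xmm0    # xmm0 = xmm0[0],xmm1[0]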