From: Craig Topper
Date: Mon, 26 Nov 2018 21:12:39 +0000 (+0000)
Subject: [LegalizeVectorTypes][X86][ARM][AArch64][PowerPC] Don't use SplitVecOp_TruncateHelper...
X-Git-Tag: llvmorg-8.0.0-rc1~3580
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=b955bf382cf3ad82c4e8849e898bdfba40ab8aab;p=platform%2Fupstream%2Fllvm.git

[LegalizeVectorTypes][X86][ARM][AArch64][PowerPC] Don't use SplitVecOp_TruncateHelper for FP_TO_SINT/UINT.

SplitVecOp_TruncateHelper tries to promote the result type while splitting
FP_TO_SINT/UINT. It then concatenates the results and introduces a truncate
to the original result type. But it does this without inserting the
AssertZExt/AssertSExt that the regular result type promotion would insert.
Nor does it turn FP_TO_UINT into FP_TO_SINT the way normal result type
promotion for these operations does. This is bad on X86, which doesn't
support vector FP_TO_UINT until AVX512.

This patch disables the use of SplitVecOp_TruncateHelper for these
operations and just lets normal promotion handle it. I've tweaked a couple
of things in X86ISelLowering to avoid a few obvious regressions there. I
believe all the changes on X86 are improvements. The other targets look
neutral.

Differential Revision: https://reviews.llvm.org/D54906

llvm-svn: 347593
---

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index f34f01c..2809fca 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -1694,13 +1694,6 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) {
   case ISD::VSELECT:
     Res = SplitVecOp_VSELECT(N, OpNo);
     break;
-  case ISD::FP_TO_SINT:
-  case ISD::FP_TO_UINT:
-    if (N->getValueType(0).bitsLT(N->getOperand(0).getValueType()))
-      Res = SplitVecOp_TruncateHelper(N);
-    else
-      Res = SplitVecOp_UnaryOp(N);
-    break;
   case ISD::SINT_TO_FP:
   case ISD::UINT_TO_FP:
     if (N->getValueType(0).bitsLT(N->getOperand(0).getValueType()))
@@ -1708,6 +1701,8 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) {
     else
       Res = SplitVecOp_UnaryOp(N);
     break;
+  case ISD::FP_TO_SINT:
+  case ISD::FP_TO_UINT:
   case ISD::CTTZ:
   case ISD::CTLZ:
   case ISD::CTPOP:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 9351a7c..e31f2a6 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -909,6 +909,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::FP_TO_UINT, MVT::v2i16, Custom);
     setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom);
 
+    // By marking FP_TO_SINT v8i16 as Custom, we trick type legalization into
+    // promoting v8i8 FP_TO_UINT into FP_TO_SINT. When the v8i16 FP_TO_SINT is
+    // split again based on the input type, this will cause an AssertSExt i16
+    // to be emitted instead of an AssertZExt. This will allow packssdw
+    // followed by packuswb to be used to truncate to v8i8. This is necessary
+    // since packusdw isn't available until SSE4.1.
+    setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom);
+
     setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
     setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
@@ -26458,11 +26466,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
     unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U);
     MVT PromoteVT = MVT::getVectorVT(MVT::getIntegerVT(NewEltWidth),
                                      VT.getVectorNumElements());
-    unsigned Opc = N->getOpcode();
-    if (PromoteVT == MVT::v2i32 || PromoteVT == MVT::v4i32)
-      Opc = ISD::FP_TO_SINT;
-
-    SDValue Res = DAG.getNode(Opc, dl, PromoteVT, Src);
+    SDValue Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src);
 
     // Preserve what we know about the size of the original result. Except
     // when the result is v2i32 since we can't widen the assert.
diff --git a/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll b/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll
index b9dbfc7..ba7bdc4 100644
--- a/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll
@@ -2,30 +2,30 @@
 
 define <4 x i16> @fptosi_v4f64_to_v4i16(<4 x double>* %ptr) {
-; CHECK: fptosi_v4f64_to_v4i16
+; CHECK-LABEL: fptosi_v4f64_to_v4i16
 ; CHECK-DAG: fcvtzs v[[LHS:[0-9]+]].2d, v0.2d
 ; CHECK-DAG: fcvtzs v[[RHS:[0-9]+]].2d, v1.2d
-; CHECK-DAG: xtn v[[MID:[0-9]+]].2s, v[[LHS]].2d
-; CHECK-DAG: xtn2 v[[MID]].4s, v[[RHS]].2d
-; CHECK: xtn v0.4h, v[[MID]].4s
+; CHECK-DAG: xtn v[[XTN0:[0-9]+]].2s, v[[LHS]].2d
+; CHECK-DAG: xtn v[[XTN1:[0-9]+]].2s, v[[RHS]].2d
+; CHECK: uzp1 v0.4h, v[[XTN1]].4h, v[[XTN0]].4h
 %tmp1 = load <4 x double>, <4 x double>* %ptr
 %tmp2 = fptosi <4 x double> %tmp1 to <4 x i16>
 ret <4 x i16> %tmp2
 }
 
 define <8 x i8> @fptosi_v4f64_to_v4i8(<8 x double>* %ptr) {
-; CHECK: fptosi_v4f64_to_v4i8
+; CHECK-LABEL: fptosi_v4f64_to_v4i8
 ; CHECK-DAG: fcvtzs v[[CONV0:[0-9]+]].2d, v0.2d
 ; CHECK-DAG: fcvtzs v[[CONV1:[0-9]+]].2d, v1.2d
 ; CHECK-DAG: fcvtzs v[[CONV2:[0-9]+]].2d, v2.2d
 ; CHECK-DAG: fcvtzs v[[CONV3:[0-9]+]].2d, v3.2d
-; CHECK-DAG: xtn v[[NA2:[0-9]+]].2s, v[[CONV2]].2d
-; CHECK-DAG: xtn2 v[[NA2]].4s, v[[CONV3]].2d
-; CHECK-DAG: xtn v[[NA0:[0-9]+]].2s, v[[CONV0]].2d
-; CHECK-DAG: xtn2 v[[NA0]].4s, v[[CONV1]].2d
-; CHECK-DAG: xtn v[[TMP1:[0-9]+]].4h, v[[NA2]].4s
-; CHECK-DAG: xtn2 v[[TMP1]].8h, v[[NA0]].4s
-; CHECK: xtn v0.8b, v[[TMP1]].8h
+; CHECK-DAG: xtn v[[XTN0:[0-9]+]].2s, v[[CONV0]].2d
+; CHECK-DAG: xtn v[[XTN1:[0-9]+]].2s, v[[CONV1]].2d
+; CHECK-DAG: xtn v[[XTN2:[0-9]+]].2s, v[[CONV2]].2d
+; CHECK-DAG: xtn v[[XTN3:[0-9]+]].2s, v[[CONV3]].2d
+; CHECK-DAG: uzp1 v[[UZP0:[0-9]+]].4h, v[[XTN1]].4h, v[[XTN0]].4h
+; CHECK-DAG: uzp1 v[[UZP1:[0-9]+]].4h, v[[XTN3]].4h, v[[XTN2]].4h
+; CHECK: uzp1 v0.8b, v[[UZP1:[0-9]+]].8b, v[[UZP0:[0-9]+]].8b
 %tmp1 = load <8 x double>, <8 x double>* %ptr
 %tmp2 = fptosi <8 x double> %tmp1 to <8 x i8>
 ret <8 x i8> %tmp2
@@ -54,12 +54,12 @@ define <4 x i16> @trunc_v4i64_to_v4i16(<4 x i64>* %ptr) {
 }
 
 define <4 x i16> @fptoui_v4f64_to_v4i16(<4 x double>* %ptr) {
-; CHECK: fptoui_v4f64_to_v4i16
-; CHECK-DAG: fcvtzu v[[LHS:[0-9]+]].2d, v0.2d
-; CHECK-DAG: fcvtzu v[[RHS:[0-9]+]].2d, v1.2d
-; CHECK-DAG: xtn v[[MID:[0-9]+]].2s, v[[LHS]].2d
-; CHECK-DAG: xtn2 v[[MID]].4s, v[[RHS]].2d
-; CHECK: xtn v0.4h, v[[MID]].4s
+; CHECK-LABEL: fptoui_v4f64_to_v4i16
+; CHECK-DAG: fcvtzs v[[LHS:[0-9]+]].2d, v0.2d
+; CHECK-DAG: fcvtzs v[[RHS:[0-9]+]].2d, v1.2d
+; CHECK-DAG: xtn v[[XTN0:[0-9]+]].2s, v[[LHS]].2d
+; CHECK-DAG: xtn v[[XTN1:[0-9]+]].2s, v[[RHS]].2d
+; CHECK: uzp1 v0.4h, v[[XTN1]].4h, v[[XTN0]].4h
 %tmp1 = load <4 x double>, <4 x double>* %ptr
 %tmp2 = fptoui <4 x double> %tmp1 to <4 x i16>
 ret <4 x i16> %tmp2
diff --git a/llvm/test/CodeGen/AArch64/vcvt-oversize.ll b/llvm/test/CodeGen/AArch64/vcvt-oversize.ll
index b6e25cf..823fe44 100644
--- a/llvm/test/CodeGen/AArch64/vcvt-oversize.ll
+++ b/llvm/test/CodeGen/AArch64/vcvt-oversize.ll
@@ -2,14 +2,14 @@
 
 define <8 x i8> @float_to_i8(<8 x float>* %in) {
 ; CHECK-LABEL: float_to_i8:
-; CHECK: ldp q1, q0, [x0]
-; CHECK-DAG: fadd v[[LSB:[0-9]+]].4s, v1.4s, v1.4s
-; CHECK-DAG: fadd v[[MSB:[0-9]+]].4s, v0.4s, v0.4s
-; CHECK-DAG: fcvtzu v[[LSB2:[0-9]+]].4s, v[[LSB]].4s
-; CHECK-DAG: fcvtzu v[[MSB2:[0-9]+]].4s, v[[MSB]].4s
+; CHECK: ldp q0, q1, [x0]
+; CHECK-DAG: fadd v[[LSB:[0-9]+]].4s, v0.4s, v0.4s
+; CHECK-DAG: fadd v[[MSB:[0-9]+]].4s, v1.4s, v1.4s
+; CHECK-DAG: fcvtzs v[[LSB2:[0-9]+]].4s, v[[LSB]].4s
+; CHECK-DAG: fcvtzs v[[MSB2:[0-9]+]].4s, v[[MSB]].4s
 ; CHECK-DAG: xtn v[[TMP:[0-9]+]].4h, v[[LSB]].4s
-; CHECK-DAG: xtn2 v[[TMP]].8h, v[[MSB]].4s
-; CHECK-DAG: xtn v0.8b, v[[TMP]].8h
+; CHECK-DAG: xtn v[[TMP2:[0-9]+]].4h, v[[MSB]].4s
+; CHECK-DAG: uzp1 v0.8b, v[[TMP]].8b, v[[TMP2]].8b
 %l = load <8 x float>, <8 x float>* %in
 %scale = fmul <8 x float> %l,
 %conv = fptoui <8 x float> %scale to <8 x i8>
diff --git a/llvm/test/CodeGen/ARM/vcvt.ll b/llvm/test/CodeGen/ARM/vcvt.ll
index 7052607..f16c8dc 100644
--- a/llvm/test/CodeGen/ARM/vcvt.ll
+++ b/llvm/test/CodeGen/ARM/vcvt.ll
@@ -293,14 +293,14 @@ define <4 x i16> @fix_double_to_i16(<4 x double> %in) {
 ; CHECK-NEXT: vld1.64 {d16, d17}, [r12]
 ; CHECK-NEXT: vmov d19, r2, r3
 ; CHECK-NEXT: vadd.f64 d18, d18, d18
-; CHECK-NEXT: vcvt.u32.f64 s0, d18
+; CHECK-NEXT: vcvt.s32.f64 s0, d18
 ; CHECK-NEXT: vmov r0, s0
 ; CHECK-NEXT: vadd.f64 d20, d16, d16
 ; CHECK-NEXT: vadd.f64 d19, d19, d19
 ; CHECK-NEXT: vadd.f64 d16, d17, d17
-; CHECK-NEXT: vcvt.u32.f64 s2, d20
-; CHECK-NEXT: vcvt.u32.f64 s4, d19
-; CHECK-NEXT: vcvt.u32.f64 s6, d16
+; CHECK-NEXT: vcvt.s32.f64 s2, d20
+; CHECK-NEXT: vcvt.s32.f64 s4, d19
+; CHECK-NEXT: vcvt.s32.f64 s6, d16
 ; CHECK-NEXT: vmov.32 d16[0], r0
 ; CHECK-NEXT: vmov r0, s2
 ; CHECK-NEXT: vmov.32 d17[0], r0
@@ -308,7 +308,7 @@ define <4 x i16> @fix_double_to_i16(<4 x double> %in) {
 ; CHECK-NEXT: vmov.32 d16[1], r0
 ; CHECK-NEXT: vmov r0, s6
 ; CHECK-NEXT: vmov.32 d17[1], r0
-; CHECK-NEXT: vmovn.i32 d16, q8
+; CHECK-NEXT: vuzp.16 d16, d17
 ; CHECK-NEXT: vmov r0, r1, d16
 ; CHECK-NEXT: mov pc, lr
diff --git a/llvm/test/CodeGen/PowerPC/vec_conv_fp64_to_i16_elts.ll b/llvm/test/CodeGen/PowerPC/vec_conv_fp64_to_i16_elts.ll
index b22c6b7..a713b52 100644
--- a/llvm/test/CodeGen/PowerPC/vec_conv_fp64_to_i16_elts.ll
+++ b/llvm/test/CodeGen/PowerPC/vec_conv_fp64_to_i16_elts.ll
@@ -166,19 +166,19 @@ define <8 x i16> @test8elt(<8 x double>* nocapture readonly) local_unnamed_addr
 ; CHECK-P8-NEXT: lxvd2x vs2, r3, r4
 ; CHECK-P8-NEXT: li r4, 48
 ; CHECK-P8-NEXT: lxvd2x vs3, r3, r4
-; CHECK-P8-NEXT: xscvdpuxws f4, f0
+; CHECK-P8-NEXT: xscvdpsxws f4, f0
 ; CHECK-P8-NEXT: xxswapd vs0, vs0
-; CHECK-P8-NEXT: xscvdpuxws f5, f1
+; CHECK-P8-NEXT: xscvdpsxws f5, f1
 ; CHECK-P8-NEXT: xxswapd vs1, vs1
-; CHECK-P8-NEXT: xscvdpuxws f6, f2
+; CHECK-P8-NEXT: xscvdpsxws f6, f2
 ; CHECK-P8-NEXT: xxswapd vs2, vs2
-; CHECK-P8-NEXT: xscvdpuxws f7, f3
+; CHECK-P8-NEXT: xscvdpsxws f7, f3
 ; CHECK-P8-NEXT: xxswapd vs3, vs3
-; CHECK-P8-NEXT: xscvdpuxws f0, f0
-; CHECK-P8-NEXT: xscvdpuxws f1, f1
+; CHECK-P8-NEXT: xscvdpsxws f0, f0
+; CHECK-P8-NEXT: xscvdpsxws f1, f1
 ; CHECK-P8-NEXT: mfvsrwz r3, f4
-; CHECK-P8-NEXT: xscvdpuxws f2, f2
-; CHECK-P8-NEXT: xscvdpuxws f3, f3
+; CHECK-P8-NEXT: xscvdpsxws f2, f2
+; CHECK-P8-NEXT: xscvdpsxws f3, f3
 ; CHECK-P8-NEXT: mfvsrwz r4, f5
 ; CHECK-P8-NEXT: mtvsrd f4, r3
 ; CHECK-P8-NEXT: mfvsrwz r3, f6
@@ -221,14 +221,14 @@ define <8 x i16> @test8elt(<8 x double>* nocapture readonly) local_unnamed_addr
 ; CHECK-P9-NEXT: xxswapd vs5, vs2
 ; CHECK-P9-NEXT: xxswapd vs6, vs1
 ; CHECK-P9-NEXT: xxswapd vs7, vs0
-; CHECK-P9-NEXT: xscvdpuxws f3, f3
-; CHECK-P9-NEXT: xscvdpuxws f2, f2
-; CHECK-P9-NEXT: xscvdpuxws f1, f1
-; CHECK-P9-NEXT: xscvdpuxws f0, f0
-; CHECK-P9-NEXT: xscvdpuxws f4, f4
-; CHECK-P9-NEXT: xscvdpuxws f5, f5
-; CHECK-P9-NEXT: xscvdpuxws f6, f6
-; CHECK-P9-NEXT: xscvdpuxws f7, f7
+; CHECK-P9-NEXT: xscvdpsxws f3, f3
+; CHECK-P9-NEXT: xscvdpsxws f2, f2
+; CHECK-P9-NEXT: xscvdpsxws f1, f1
+; CHECK-P9-NEXT: xscvdpsxws f0, f0
+; CHECK-P9-NEXT: xscvdpsxws f4, f4
+; CHECK-P9-NEXT: xscvdpsxws f5, f5
+; CHECK-P9-NEXT: xscvdpsxws f6, f6
+; CHECK-P9-NEXT: xscvdpsxws f7, f7
 ; CHECK-P9-NEXT: mfvsrwz r3, f3
 ; CHECK-P9-NEXT: mfvsrwz r5, f2
 ; CHECK-P9-NEXT: mfvsrwz r7, f1
@@ -272,14 +272,14 @@ define <8 x i16> @test8elt(<8 x double>* nocapture readonly) local_unnamed_addr
 ; CHECK-BE-NEXT: xxswapd vs5, vs2
 ; CHECK-BE-NEXT: xxswapd vs6, vs1
 ; CHECK-BE-NEXT: xxswapd vs7, vs0
-; CHECK-BE-NEXT: xscvdpuxws f3, f3
-; CHECK-BE-NEXT: xscvdpuxws f2, f2
-; CHECK-BE-NEXT: xscvdpuxws f1, f1
-; CHECK-BE-NEXT: xscvdpuxws f0, f0
-; CHECK-BE-NEXT: xscvdpuxws f4, f4
-; CHECK-BE-NEXT: xscvdpuxws f5, f5
-; CHECK-BE-NEXT: xscvdpuxws f6, f6
-; CHECK-BE-NEXT: xscvdpuxws f7, f7
+; CHECK-BE-NEXT: xscvdpsxws f3, f3
+; CHECK-BE-NEXT: xscvdpsxws f2, f2
+; CHECK-BE-NEXT: xscvdpsxws f1, f1
+; CHECK-BE-NEXT: xscvdpsxws f0, f0
+; CHECK-BE-NEXT: xscvdpsxws f4, f4
+; CHECK-BE-NEXT: xscvdpsxws f5, f5
+; CHECK-BE-NEXT: xscvdpsxws f6, f6
+; CHECK-BE-NEXT: xscvdpsxws f7, f7
 ; CHECK-BE-NEXT: mfvsrwz r3, f3
 ; CHECK-BE-NEXT: mfvsrwz r5, f2
 ; CHECK-BE-NEXT: mfvsrwz r7, f1
@@ -329,60 +329,60 @@ define void @test16elt(<16 x i16>* noalias nocapture sret %agg.result, <16 x dou
 ; CHECK-P8-NEXT: li r6, 48
 ; CHECK-P8-NEXT: lxvd2x vs3, r4, r6
 ; CHECK-P8-NEXT: li r6, 64
-; CHECK-P8-NEXT: xscvdpuxws f4, f0
+; CHECK-P8-NEXT: xscvdpsxws f4, f0
 ; CHECK-P8-NEXT: lxvd2x vs5, r4, r6
 ; CHECK-P8-NEXT: li r6, 80
 ; CHECK-P8-NEXT: xxswapd vs0, vs0
-; CHECK-P8-NEXT: xscvdpuxws f6, f1
+; CHECK-P8-NEXT: xscvdpsxws f6, f1
 ; CHECK-P8-NEXT: lxvd2x vs7, r4, r6
 ; CHECK-P8-NEXT: li r6, 96
 ; CHECK-P8-NEXT: xxswapd vs1, vs1
-; CHECK-P8-NEXT: xscvdpuxws f8, f2
+; CHECK-P8-NEXT: xscvdpsxws f8, f2
 ; CHECK-P8-NEXT: lxvd2x vs9, r4, r6
 ; CHECK-P8-NEXT: li r6, 112
 ; CHECK-P8-NEXT: xxswapd vs2, vs2
-; CHECK-P8-NEXT: xscvdpuxws f10, f3
+; CHECK-P8-NEXT: xscvdpsxws f10, f3
 ; CHECK-P8-NEXT: lxvd2x vs11, r4, r6
 ; CHECK-P8-NEXT: xxswapd vs3, vs3
-; CHECK-P8-NEXT: xscvdpuxws f12, f5
+; CHECK-P8-NEXT: xscvdpsxws f12, f5
 ; CHECK-P8-NEXT: xxswapd vs5, vs5
-; CHECK-P8-NEXT: xscvdpuxws f13, f7
+; CHECK-P8-NEXT: xscvdpsxws f13, f7
 ; CHECK-P8-NEXT: xxswapd vs7, vs7
-; CHECK-P8-NEXT: xscvdpuxws v2, f9
+; CHECK-P8-NEXT: xscvdpsxws v2, f9
 ; CHECK-P8-NEXT: xxswapd vs9, vs9
 ; CHECK-P8-NEXT: mfvsrwz r4, f4
-; CHECK-P8-NEXT: xscvdpuxws v3, f11
+; CHECK-P8-NEXT: xscvdpsxws v3, f11
 ; CHECK-P8-NEXT: xxswapd vs11, vs11
-; CHECK-P8-NEXT: xscvdpuxws f0, f0
+; CHECK-P8-NEXT: xscvdpsxws f0, f0
 ; CHECK-P8-NEXT: mfvsrwz r6, f6
 ; CHECK-P8-NEXT: mtvsrd f4, r4
 ; CHECK-P8-NEXT: mfvsrwz r4, f8
-; CHECK-P8-NEXT: xscvdpuxws f1, f1
+; CHECK-P8-NEXT: xscvdpsxws f1, f1
 ; CHECK-P8-NEXT: xxswapd v4, vs4
-; CHECK-P8-NEXT: xscvdpuxws f2, f2
+; CHECK-P8-NEXT: xscvdpsxws f2, f2
 ; CHECK-P8-NEXT: mtvsrd f6, r6
 ; CHECK-P8-NEXT: mfvsrwz r6, f10
 ; CHECK-P8-NEXT: mtvsrd f8, r4
 ; CHECK-P8-NEXT: xxswapd v5, vs6
 ; CHECK-P8-NEXT: mfvsrwz r4, f12
-; CHECK-P8-NEXT: xscvdpuxws f5, f5
+; CHECK-P8-NEXT: xscvdpsxws f5, f5
 ; CHECK-P8-NEXT: xxswapd v0, vs8
 ; CHECK-P8-NEXT: mtvsrd f10, r6
 ; CHECK-P8-NEXT: mfvsrwz r6, f13
 ; CHECK-P8-NEXT: mtvsrd f12, r4
 ; CHECK-P8-NEXT: xxswapd v1, vs10
 ; CHECK-P8-NEXT: mfvsrwz r4, v2
-; CHECK-P8-NEXT: xscvdpuxws f3, f3
+; CHECK-P8-NEXT: xscvdpsxws f3, f3
 ; CHECK-P8-NEXT: xxswapd v6, vs12
-; CHECK-P8-NEXT: xscvdpuxws f9, f9
+; CHECK-P8-NEXT: xscvdpsxws f9, f9
 ; CHECK-P8-NEXT: mtvsrd f13, r6
 ; CHECK-P8-NEXT: mfvsrwz r6, v3
 ; CHECK-P8-NEXT: mtvsrd v2, r4
 ; CHECK-P8-NEXT: xxswapd v7, vs13
 ; CHECK-P8-NEXT: mfvsrwz r4, f0
-; CHECK-P8-NEXT: xscvdpuxws f7, f7
+; CHECK-P8-NEXT: xscvdpsxws f7, f7
 ; CHECK-P8-NEXT: xxswapd v2, v2
-; CHECK-P8-NEXT: xscvdpuxws f11, f11
+; CHECK-P8-NEXT: xscvdpsxws f11, f11
 ; CHECK-P8-NEXT: mtvsrd v3, r6
 ; CHECK-P8-NEXT: mfvsrwz r6, f1
 ; CHECK-P8-NEXT: mtvsrd f0, r4
@@ -450,22 +450,22 @@ define void @test16elt(<16 x i16>* noalias nocapture sret %agg.result, <16 x dou
 ; CHECK-P9-NEXT: xxswapd vs13, vs3
 ; CHECK-P9-NEXT: xxswapd v2, vs1
 ; CHECK-P9-NEXT: xxswapd v3, vs0
-; CHECK-P9-NEXT: xscvdpuxws f6, f6
-; CHECK-P9-NEXT: xscvdpuxws f5, f5
-; CHECK-P9-NEXT: xscvdpuxws f4, f4
-; CHECK-P9-NEXT: xscvdpuxws f2, f2
-; CHECK-P9-NEXT: xscvdpuxws f7, f7
-; CHECK-P9-NEXT: xscvdpuxws f3, f3
-; CHECK-P9-NEXT: xscvdpuxws f1, f1
-; CHECK-P9-NEXT: xscvdpuxws f0, f0
-; CHECK-P9-NEXT: xscvdpuxws f8, f8
-; CHECK-P9-NEXT: xscvdpuxws f9, f9
-; CHECK-P9-NEXT: xscvdpuxws f10, f10
-; CHECK-P9-NEXT: xscvdpuxws f11, f11
-; CHECK-P9-NEXT: xscvdpuxws f12, f12
-; CHECK-P9-NEXT: xscvdpuxws f13, f13
-; CHECK-P9-NEXT: xscvdpuxws v2, v2
-; CHECK-P9-NEXT: xscvdpuxws v3, v3
+; CHECK-P9-NEXT: xscvdpsxws f6, f6
+; CHECK-P9-NEXT: xscvdpsxws f5, f5
+; CHECK-P9-NEXT: xscvdpsxws f4, f4
+; CHECK-P9-NEXT: xscvdpsxws f2, f2
+; CHECK-P9-NEXT: xscvdpsxws f7, f7
+; CHECK-P9-NEXT: xscvdpsxws f3, f3
+; CHECK-P9-NEXT: xscvdpsxws f1, f1
+; CHECK-P9-NEXT: xscvdpsxws f0, f0
+; CHECK-P9-NEXT: xscvdpsxws f8, f8
+; CHECK-P9-NEXT: xscvdpsxws f9, f9
+; CHECK-P9-NEXT: xscvdpsxws f10, f10
+; CHECK-P9-NEXT: xscvdpsxws f11, f11
+; CHECK-P9-NEXT: xscvdpsxws f12, f12
+; CHECK-P9-NEXT: xscvdpsxws f13, f13
+; CHECK-P9-NEXT: xscvdpsxws v2, v2
+; CHECK-P9-NEXT: xscvdpsxws v3, v3
 ; CHECK-P9-NEXT: mfvsrwz r4, f6
 ; CHECK-P9-NEXT: mfvsrwz r5, f5
 ; CHECK-P9-NEXT: mfvsrwz r6, f4
@@ -562,22 +562,22 @@ define void @test16elt(<16 x i16>* noalias nocapture sret %agg.result, <16 x dou
 ; CHECK-BE-NEXT: xxswapd vs13, vs3
 ; CHECK-BE-NEXT: xxswapd v2, vs1
 ; CHECK-BE-NEXT: xxswapd v3, vs0
-; CHECK-BE-NEXT: xscvdpuxws f6, f6
-; CHECK-BE-NEXT: xscvdpuxws f5, f5
-; CHECK-BE-NEXT: xscvdpuxws f4, f4
-; CHECK-BE-NEXT: xscvdpuxws f2, f2
-; CHECK-BE-NEXT: xscvdpuxws f7, f7
-; CHECK-BE-NEXT: xscvdpuxws f3, f3
-; CHECK-BE-NEXT: xscvdpuxws f1, f1
-; CHECK-BE-NEXT: xscvdpuxws f0, f0
-; CHECK-BE-NEXT: xscvdpuxws f8, f8
-; CHECK-BE-NEXT: xscvdpuxws f9, f9
-; CHECK-BE-NEXT: xscvdpuxws f10, f10
-; CHECK-BE-NEXT: xscvdpuxws f11, f11
-; CHECK-BE-NEXT: xscvdpuxws f12, f12
-; CHECK-BE-NEXT: xscvdpuxws f13, f13
-; CHECK-BE-NEXT: xscvdpuxws v2, v2
-; CHECK-BE-NEXT: xscvdpuxws v3, v3
+; CHECK-BE-NEXT: xscvdpsxws f6, f6
+; CHECK-BE-NEXT: xscvdpsxws f5, f5
+; CHECK-BE-NEXT: xscvdpsxws f4, f4
+; CHECK-BE-NEXT: xscvdpsxws f2, f2
+; CHECK-BE-NEXT: xscvdpsxws f7, f7
+; CHECK-BE-NEXT: xscvdpsxws f3, f3
+; CHECK-BE-NEXT: xscvdpsxws f1, f1
+; CHECK-BE-NEXT: xscvdpsxws f0, f0
+; CHECK-BE-NEXT: xscvdpsxws f8, f8
+; CHECK-BE-NEXT: xscvdpsxws f9, f9
+; CHECK-BE-NEXT: xscvdpsxws f10, f10
+; CHECK-BE-NEXT: xscvdpsxws f11, f11
+; CHECK-BE-NEXT: xscvdpsxws f12, f12
+; CHECK-BE-NEXT: xscvdpsxws f13, f13
+; CHECK-BE-NEXT: xscvdpsxws v2, v2
+; CHECK-BE-NEXT: xscvdpsxws v3, v3
 ; CHECK-BE-NEXT: mfvsrwz r4, f6
 ; CHECK-BE-NEXT: mfvsrwz r5, f5
 ; CHECK-BE-NEXT: mfvsrwz r6, f4
diff --git a/llvm/test/CodeGen/PowerPC/vec_conv_fp64_to_i8_elts.ll b/llvm/test/CodeGen/PowerPC/vec_conv_fp64_to_i8_elts.ll
index 7c4836c..ef7b9c1 100644
--- a/llvm/test/CodeGen/PowerPC/vec_conv_fp64_to_i8_elts.ll
+++ b/llvm/test/CodeGen/PowerPC/vec_conv_fp64_to_i8_elts.ll
@@ -343,60 +343,60 @@ define <16 x i8> @test16elt(<16 x double>* nocapture readonly) local_unnamed_add
 ; CHECK-P8-NEXT: li r4, 48
 ; CHECK-P8-NEXT: lxvd2x vs3, r3, r4
 ; CHECK-P8-NEXT: li r4, 64
-; CHECK-P8-NEXT: xscvdpuxws f4, f0
+; CHECK-P8-NEXT: xscvdpsxws f4, f0
 ; CHECK-P8-NEXT: xxswapd vs0, vs0
 ; CHECK-P8-NEXT: lxvd2x vs5, r3, r4
 ; CHECK-P8-NEXT: li r4, 80
-; CHECK-P8-NEXT: xscvdpuxws f6, f1
+; CHECK-P8-NEXT: xscvdpsxws f6, f1
 ; CHECK-P8-NEXT: xxswapd vs1, vs1
 ; CHECK-P8-NEXT: lxvd2x vs7, r3, r4
 ; CHECK-P8-NEXT: li r4, 96
-; CHECK-P8-NEXT: xscvdpuxws f8, f2
+; CHECK-P8-NEXT: xscvdpsxws f8, f2
 ; CHECK-P8-NEXT: xxswapd vs2, vs2
 ; CHECK-P8-NEXT: lxvd2x vs9, r3, r4
 ; CHECK-P8-NEXT: li r4, 112
-; CHECK-P8-NEXT: xscvdpuxws f10, f3
+; CHECK-P8-NEXT: xscvdpsxws f10, f3
 ; CHECK-P8-NEXT: xxswapd vs3, vs3
 ; CHECK-P8-NEXT: lxvd2x vs11, r3, r4
-; CHECK-P8-NEXT: xscvdpuxws f12, f5
+; CHECK-P8-NEXT: xscvdpsxws f12, f5
 ; CHECK-P8-NEXT: xxswapd vs5, vs5
-; CHECK-P8-NEXT: xscvdpuxws f13, f7
+; CHECK-P8-NEXT: xscvdpsxws f13, f7
 ; CHECK-P8-NEXT: xxswapd vs7, vs7
-; CHECK-P8-NEXT: xscvdpuxws v2, f9
+; CHECK-P8-NEXT: xscvdpsxws v2, f9
 ; CHECK-P8-NEXT: xxswapd vs9, vs9
 ; CHECK-P8-NEXT: mfvsrwz r3, f4
-; CHECK-P8-NEXT: xscvdpuxws v3, f11
+; CHECK-P8-NEXT: xscvdpsxws v3, f11
 ; CHECK-P8-NEXT: xxswapd vs11, vs11
 ; CHECK-P8-NEXT: mfvsrwz r4, f6
-; CHECK-P8-NEXT: xscvdpuxws f0, f0
+; CHECK-P8-NEXT: xscvdpsxws f0, f0
 ; CHECK-P8-NEXT: mtvsrd f4, r3
 ; CHECK-P8-NEXT: mfvsrwz r3, f8
-; CHECK-P8-NEXT: xscvdpuxws f1, f1
+; CHECK-P8-NEXT: xscvdpsxws f1, f1
 ; CHECK-P8-NEXT: xxswapd v4, vs4
 ; CHECK-P8-NEXT: mtvsrd f6, r4
 ; CHECK-P8-NEXT: mfvsrwz r4, f10
-; CHECK-P8-NEXT: xscvdpuxws f2, f2
+; CHECK-P8-NEXT: xscvdpsxws f2, f2
 ; CHECK-P8-NEXT: xxswapd v5, vs6
 ; CHECK-P8-NEXT: mtvsrd f8, r3
 ; CHECK-P8-NEXT: mfvsrwz r3, f12
-; CHECK-P8-NEXT: xscvdpuxws f3, f3
+; CHECK-P8-NEXT: xscvdpsxws f3, f3
 ; CHECK-P8-NEXT: xxswapd v0, vs8
 ; CHECK-P8-NEXT: mtvsrd f10, r4
 ; CHECK-P8-NEXT: mfvsrwz r4, f13
-; CHECK-P8-NEXT: xscvdpuxws f5, f5
+; CHECK-P8-NEXT: xscvdpsxws f5, f5
 ; CHECK-P8-NEXT: xxswapd v1, vs10
 ; CHECK-P8-NEXT: mtvsrd f12, r3
 ; CHECK-P8-NEXT: mfvsrwz r3, v2
-; CHECK-P8-NEXT: xscvdpuxws f7, f7
+; CHECK-P8-NEXT: xscvdpsxws f7, f7
 ; CHECK-P8-NEXT: xxswapd v6, vs12
 ; CHECK-P8-NEXT: mtvsrd f13, r4
 ; CHECK-P8-NEXT: mfvsrwz r4, v3
 ; CHECK-P8-NEXT: mtvsrd v2, r3
 ; CHECK-P8-NEXT: xxswapd v7, vs13
 ; CHECK-P8-NEXT: mfvsrwz r3, f0
-; CHECK-P8-NEXT: xscvdpuxws f9, f9
+; CHECK-P8-NEXT: xscvdpsxws f9, f9
 ; CHECK-P8-NEXT: xxswapd v2, v2
-; CHECK-P8-NEXT: xscvdpuxws f11, f11
+; CHECK-P8-NEXT: xscvdpsxws f11, f11
 ; CHECK-P8-NEXT: mtvsrd v3, r4
 ; CHECK-P8-NEXT: mfvsrwz r4, f1
 ; CHECK-P8-NEXT: mtvsrd f0, r3
@@ -462,22 +462,22 @@ define <16 x i8> @test16elt(<16 x double>* nocapture readonly) local_unnamed_add
 ; CHECK-P9-NEXT: xxswapd vs13, vs6
 ; CHECK-P9-NEXT: xxswapd v2, vs1
 ; CHECK-P9-NEXT: xxswapd v3, vs0
-; CHECK-P9-NEXT: xscvdpuxws f5, f5
-; CHECK-P9-NEXT: xscvdpuxws f4, f4
-; CHECK-P9-NEXT: xscvdpuxws f3, f3
-; CHECK-P9-NEXT: xscvdpuxws f2, f2
-; CHECK-P9-NEXT: xscvdpuxws f7, f7
-; CHECK-P9-NEXT: xscvdpuxws f6, f6
-; CHECK-P9-NEXT: xscvdpuxws f1, f1
-; CHECK-P9-NEXT: xscvdpuxws f0, f0
-; CHECK-P9-NEXT: xscvdpuxws f8, f8
-; CHECK-P9-NEXT: xscvdpuxws f9, f9
-; CHECK-P9-NEXT: xscvdpuxws f10, f10
-; CHECK-P9-NEXT: xscvdpuxws f11, f11
-; CHECK-P9-NEXT: xscvdpuxws f12, f12
-; CHECK-P9-NEXT: xscvdpuxws f13, f13
-; CHECK-P9-NEXT: xscvdpuxws v2, v2
-; CHECK-P9-NEXT: xscvdpuxws v3, v3
+; CHECK-P9-NEXT: xscvdpsxws f5, f5
+; CHECK-P9-NEXT: xscvdpsxws f4, f4
+; CHECK-P9-NEXT: xscvdpsxws f3, f3
+; CHECK-P9-NEXT: xscvdpsxws f2, f2
+; CHECK-P9-NEXT: xscvdpsxws f7, f7
+; CHECK-P9-NEXT: xscvdpsxws f6, f6
+; CHECK-P9-NEXT: xscvdpsxws f1, f1
+; CHECK-P9-NEXT: xscvdpsxws f0, f0
+; CHECK-P9-NEXT: xscvdpsxws f8, f8
+; CHECK-P9-NEXT: xscvdpsxws f9, f9
+; CHECK-P9-NEXT: xscvdpsxws f10, f10
+; CHECK-P9-NEXT: xscvdpsxws f11, f11
+; CHECK-P9-NEXT: xscvdpsxws f12, f12
+; CHECK-P9-NEXT: xscvdpsxws f13, f13
+; CHECK-P9-NEXT: xscvdpsxws v2, v2
+; CHECK-P9-NEXT: xscvdpsxws v3, v3
 ; CHECK-P9-NEXT: mfvsrwz r3, f5
 ; CHECK-P9-NEXT: mfvsrwz r4, f4
 ; CHECK-P9-NEXT: mfvsrwz r5, f3
@@ -571,22 +571,22 @@ define <16 x i8> @test16elt(<16 x double>* nocapture readonly) local_unnamed_add
 ; CHECK-BE-NEXT: xxswapd vs13, vs6
 ; CHECK-BE-NEXT: xxswapd v2, vs1
 ; CHECK-BE-NEXT: xxswapd v3, vs0
-; CHECK-BE-NEXT: xscvdpuxws f5, f5
-; CHECK-BE-NEXT: xscvdpuxws f4, f4
-; CHECK-BE-NEXT: xscvdpuxws f3, f3
-; CHECK-BE-NEXT: xscvdpuxws f2, f2
-; CHECK-BE-NEXT: xscvdpuxws f7, f7
-; CHECK-BE-NEXT: xscvdpuxws f6, f6
-; CHECK-BE-NEXT: xscvdpuxws f1, f1
-; CHECK-BE-NEXT: xscvdpuxws f0, f0
-; CHECK-BE-NEXT: xscvdpuxws f8, f8
-; CHECK-BE-NEXT: xscvdpuxws f9, f9
-; CHECK-BE-NEXT: xscvdpuxws f10, f10
-; CHECK-BE-NEXT: xscvdpuxws f11, f11
-; CHECK-BE-NEXT: xscvdpuxws f12, f12
-; CHECK-BE-NEXT: xscvdpuxws f13, f13
-; CHECK-BE-NEXT: xscvdpuxws v2, v2
-; CHECK-BE-NEXT: xscvdpuxws v3, v3
+; CHECK-BE-NEXT: xscvdpsxws f5, f5
+; CHECK-BE-NEXT: xscvdpsxws f4, f4
+; CHECK-BE-NEXT: xscvdpsxws f3, f3
+; CHECK-BE-NEXT: xscvdpsxws f2, f2
+; CHECK-BE-NEXT: xscvdpsxws f7, f7
+; CHECK-BE-NEXT: xscvdpsxws f6, f6
+; CHECK-BE-NEXT: xscvdpsxws f1, f1
+; CHECK-BE-NEXT: xscvdpsxws f0, f0
+; CHECK-BE-NEXT: xscvdpsxws f8, f8
+; CHECK-BE-NEXT: xscvdpsxws f9, f9
+; CHECK-BE-NEXT: xscvdpsxws f10, f10
+; CHECK-BE-NEXT: xscvdpsxws f11, f11
+; CHECK-BE-NEXT: xscvdpsxws f12, f12
+; CHECK-BE-NEXT: xscvdpsxws f13, f13
+; CHECK-BE-NEXT: xscvdpsxws v2, v2
+; CHECK-BE-NEXT: xscvdpsxws v3, v3
 ; CHECK-BE-NEXT: mfvsrwz r3, f5
 ; CHECK-BE-NEXT: mfvsrwz r4, f4
 ; CHECK-BE-NEXT: mfvsrwz r5, f3
diff --git a/llvm/test/CodeGen/X86/vec_cast2.ll b/llvm/test/CodeGen/X86/vec_cast2.ll
index d746bf5..8e1ffe6 100644
--- a/llvm/test/CodeGen/X86/vec_cast2.ll
+++ b/llvm/test/CodeGen/X86/vec_cast2.ll
@@ -245,7 +245,7 @@ define <8 x i8> @cvt_v8f32_v8u8(<8 x float> %src) {
 ; CHECK-WIDE: ## %bb.0:
 ; CHECK-WIDE-NEXT: vcvttps2dq %ymm0, %ymm0
 ; CHECK-WIDE-NEXT: vextractf128 $1, %ymm0, %xmm1
-; CHECK-WIDE-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; CHECK-WIDE-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
 ; CHECK-WIDE-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
 ; CHECK-WIDE-NEXT: vzeroupper
 ; CHECK-WIDE-NEXT: retl
diff --git a/llvm/test/CodeGen/X86/vec_fp_to_int-widen.ll b/llvm/test/CodeGen/X86/vec_fp_to_int-widen.ll
index 598a138..6c579f5 100644
--- a/llvm/test/CodeGen/X86/vec_fp_to_int-widen.ll
+++ b/llvm/test/CodeGen/X86/vec_fp_to_int-widen.ll
@@ -2444,40 +2444,22 @@ define <2 x i16> @fptoui_2f64_to_2i16(<2 x double> %a) {
 define <8 x i16> @fptosi_8f64_to_8i16(<8 x double> %a) {
 ; SSE-LABEL: fptosi_8f64_to_8i16:
 ; SSE: # %bb.0:
+; SSE-NEXT: cvttpd2dq %xmm3, %xmm3
+; SSE-NEXT: cvttpd2dq %xmm2, %xmm2
+; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
 ; SSE-NEXT: cvttpd2dq %xmm1, %xmm1
 ; SSE-NEXT: cvttpd2dq %xmm0, %xmm0
-; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,1,3,4,5,6,7]
-; SSE-NEXT: cvttpd2dq %xmm3, %xmm0
-; SSE-NEXT: cvttpd2dq %xmm2, %xmm2
-; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,2,0]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,7,5]
-; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: packssdw %xmm2, %xmm0
 ; SSE-NEXT: retq
 ;
-; AVX1-LABEL: fptosi_8f64_to_8i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vcvttpd2dq %ymm1, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vcvttpd2dq %ymm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: fptosi_8f64_to_8i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vcvttpd2dq %ymm0, %xmm0
-; AVX2-NEXT: vcvttpd2dq %ymm1, %xmm1
-; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; VEX-LABEL: fptosi_8f64_to_8i16:
+; VEX: # %bb.0:
+; VEX-NEXT: vcvttpd2dq %ymm1, %xmm1
+; VEX-NEXT: vcvttpd2dq %ymm0, %xmm0
+; VEX-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; VEX-NEXT: vzeroupper
+; VEX-NEXT: retq
 ;
 ; AVX512F-LABEL: fptosi_8f64_to_8i16:
 ; AVX512F: # %bb.0:
@@ -2515,89 +2497,28 @@ define <8 x i16> @fptosi_8f64_to_8i16(<8 x double> %a) {
 define <8 x i16> @fptoui_8f64_to_8i16(<8 x double> %a) {
 ; SSE-LABEL: fptoui_8f64_to_8i16:
 ; SSE: # %bb.0:
-; SSE-NEXT: cvttsd2si %xmm3, %rax
-; SSE-NEXT: movd %eax, %xmm4
-; SSE-NEXT: movhlps {{.*#+}} xmm3 = xmm3[1,1]
-; SSE-NEXT: cvttsd2si %xmm3, %rax
-; SSE-NEXT: movd %eax, %xmm3
-; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; SSE-NEXT: cvttsd2si %xmm2, %rax
-; SSE-NEXT: movd %eax, %xmm3
-; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1]
-; SSE-NEXT: cvttsd2si %xmm2, %rax
-; SSE-NEXT: movd %eax, %xmm2
-; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; SSE-NEXT: cvttsd2si %xmm1, %rax
-; SSE-NEXT: movd %eax, %xmm2
-; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
-; SSE-NEXT: cvttsd2si %xmm1, %rax
-; SSE-NEXT: movd %eax, %xmm1
-; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE-NEXT: cvttsd2si %xmm0, %rax
-; SSE-NEXT: movd %eax, %xmm1
-; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
-; SSE-NEXT: cvttsd2si %xmm0, %rax
-; SSE-NEXT: movd %eax, %xmm0
-; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
-; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: cvttpd2dq %xmm3, %xmm3
+; SSE-NEXT: cvttpd2dq %xmm2, %xmm2
+; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE-NEXT: cvttpd2dq %xmm1, %xmm1
+; SSE-NEXT: cvttpd2dq %xmm0, %xmm0
+; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; SSE-NEXT: retq
 ;
-; AVX1-LABEL: fptoui_8f64_to_8i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovapd {{.*#+}} ymm2 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9]
-; AVX1-NEXT: vcmpltpd %ymm2, %ymm1, %ymm3
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT: vpackssdw %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vsubpd %ymm2, %ymm1, %ymm4
-; AVX1-NEXT: vcvttpd2dq %ymm4, %xmm4
-; AVX1-NEXT: vmovapd {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
-; AVX1-NEXT: vxorpd %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vcvttpd2dq %ymm1, %xmm1
-; AVX1-NEXT: vblendvps %xmm3, %xmm1, %xmm4, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vcmpltpd %ymm2, %ymm0, %ymm4
-; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm6
-; AVX1-NEXT: vpackssdw %xmm6, %xmm4, %xmm4
-; AVX1-NEXT: vsubpd %ymm2, %ymm0, %ymm2
-; AVX1-NEXT: vcvttpd2dq %ymm2, %xmm2
-; AVX1-NEXT: vxorpd %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vcvttpd2dq %ymm0, %xmm0
-; AVX1-NEXT: vblendvps %xmm4, %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: fptoui_8f64_to_8i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9]
-; AVX2-NEXT: vcmpltpd %ymm2, %ymm1, %ymm3
-; AVX2-NEXT: vextractf128 $1, %ymm3, %xmm4
-; AVX2-NEXT: vpackssdw %xmm4, %xmm3, %xmm3
-; AVX2-NEXT: vsubpd %ymm2, %ymm1, %ymm4
-; AVX2-NEXT: vcvttpd2dq %ymm4, %xmm4
-; AVX2-NEXT: vbroadcastss {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT: vxorpd %xmm5, %xmm4, %xmm4
-; AVX2-NEXT: vcvttpd2dq %ymm1, %xmm1
-; AVX2-NEXT: vblendvps %xmm3, %xmm1, %xmm4, %xmm1
-; AVX2-NEXT: vcmpltpd %ymm2, %ymm0, %ymm3
-; AVX2-NEXT: vextractf128 $1, %ymm3, %xmm4
-; AVX2-NEXT: vpackssdw %xmm4, %xmm3, %xmm3
-; AVX2-NEXT: vsubpd %ymm2, %ymm0, %ymm2
-; AVX2-NEXT: vcvttpd2dq %ymm2, %xmm2
-; AVX2-NEXT: vxorpd %xmm5, %xmm2, %xmm2
-; AVX2-NEXT: vcvttpd2dq %ymm0, %xmm0
-; AVX2-NEXT: vblendvps %xmm3, %xmm0, %xmm2, %xmm0
-; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; VEX-LABEL: fptoui_8f64_to_8i16:
+; VEX: # %bb.0:
+; VEX-NEXT: vcvttpd2dq %ymm1, %xmm1
+; VEX-NEXT: vcvttpd2dq %ymm0, %xmm0
+; VEX-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; VEX-NEXT: vzeroupper
+; VEX-NEXT: retq
 ;
 ; AVX512F-LABEL: fptoui_8f64_to_8i16:
 ; AVX512F: # %bb.0:
@@ -2636,17 +2557,12 @@ define <16 x i8> @fptosi_16f32_to_16i8(<16 x float> %a) {
 ; SSE-LABEL: fptosi_16f32_to_16i8:
 ; SSE: # %bb.0:
 ; SSE-NEXT: cvttps2dq %xmm3, %xmm3
-; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255]
-; SSE-NEXT: pand %xmm4, %xmm3
 ; SSE-NEXT: cvttps2dq %xmm2, %xmm2
-; SSE-NEXT: pand %xmm4, %xmm2
-; SSE-NEXT: packuswb %xmm3, %xmm2
+; SSE-NEXT: packssdw %xmm3, %xmm2
 ; SSE-NEXT: cvttps2dq %xmm1, %xmm1
-; SSE-NEXT: pand %xmm4, %xmm1
 ; SSE-NEXT: cvttps2dq %xmm0, %xmm0
-; SSE-NEXT: pand %xmm4, %xmm0
-; SSE-NEXT: packuswb %xmm1, %xmm0
-; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: packssdw %xmm1, %xmm0
+; SSE-NEXT: packsswb %xmm2, %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX1-LABEL: fptosi_16f32_to_16i8:
@@ -2654,13 +2570,10 @@ define <16 x i8> @fptosi_16f32_to_16i8(<16 x float> %a) {
 ; AVX1-NEXT: vcvttps2dq %ymm1, %ymm1
 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
 ; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT: vcvttps2dq %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpackssdw %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
 ;
@@ -2669,13 +2582,10 @@ define <16 x i8> @fptosi_16f32_to_16i8(<16 x float> %a) {
 ; AVX2-NEXT: vcvttps2dq %ymm1, %ymm1
 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
 ; AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT: vcvttps2dq %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX2-NEXT: vpackssdw %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
@@ -2693,16 +2603,11 @@ define <16 x i8> @fptoui_16f32_to_16i8(<16 x float> %a) {
 ; SSE-LABEL: fptoui_16f32_to_16i8:
 ; SSE: # %bb.0:
 ; SSE-NEXT: cvttps2dq %xmm3, %xmm3
-; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255]
-; SSE-NEXT: pand %xmm4, %xmm3
 ; SSE-NEXT: cvttps2dq %xmm2, %xmm2
-; SSE-NEXT: pand %xmm4, %xmm2
-; SSE-NEXT: packuswb %xmm3, %xmm2
+; SSE-NEXT: packssdw %xmm3, %xmm2
 ; SSE-NEXT: cvttps2dq %xmm1, %xmm1
-; SSE-NEXT: pand %xmm4, %xmm1
 ; SSE-NEXT: cvttps2dq %xmm0, %xmm0
-; SSE-NEXT: pand %xmm4, %xmm0
-; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: packssdw %xmm1, %xmm0
 ; SSE-NEXT: packuswb %xmm2, %xmm0
 ; SSE-NEXT: retq
 ;
@@ -2710,13 +2615,10 @@ define <16 x i8> @fptoui_16f32_to_16i8(<16 x float> %a) {
 ; AVX1: # %bb.0:
 ; AVX1-NEXT: vcvttps2dq %ymm1, %ymm1
 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT: vcvttps2dq %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpackssdw %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
@@ -2725,13 +2627,10 @@ define <16 x i8> @fptoui_16f32_to_16i8(<16 x float> %a) {
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vcvttps2dq %ymm1, %ymm1
 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT: vcvttps2dq %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX2-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vec_fp_to_int.ll b/llvm/test/CodeGen/X86/vec_fp_to_int.ll
index c0db65e..6ef7f20 100644
--- a/llvm/test/CodeGen/X86/vec_fp_to_int.ll
+++ b/llvm/test/CodeGen/X86/vec_fp_to_int.ll
@@ -2726,40 +2726,22 @@ define <2 x i16> @fptoui_2f64_to_2i16(<2 x double> %a) {
 define <8 x i16> @fptosi_8f64_to_8i16(<8 x double> %a) {
 ; SSE-LABEL: fptosi_8f64_to_8i16:
 ; SSE: # %bb.0:
+; SSE-NEXT: cvttpd2dq %xmm3, %xmm3
+; SSE-NEXT: cvttpd2dq %xmm2, %xmm2
+; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
 ; SSE-NEXT: cvttpd2dq %xmm1, %xmm1
 ; SSE-NEXT: cvttpd2dq %xmm0, %xmm0
-; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,1,3,4,5,6,7]
-; SSE-NEXT: cvttpd2dq %xmm3, %xmm0
-; SSE-NEXT: cvttpd2dq %xmm2, %xmm2
-; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,2,0]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,7,5]
-; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: packssdw %xmm2, %xmm0
 ; SSE-NEXT: retq
 ;
-; AVX1-LABEL: fptosi_8f64_to_8i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vcvttpd2dq %ymm1, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vcvttpd2dq %ymm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: fptosi_8f64_to_8i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vcvttpd2dq %ymm0, %xmm0
-; AVX2-NEXT: vcvttpd2dq %ymm1, %xmm1
-; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; VEX-LABEL: fptosi_8f64_to_8i16:
+; VEX: # %bb.0:
+; VEX-NEXT: vcvttpd2dq %ymm1, %xmm1
+; VEX-NEXT: vcvttpd2dq %ymm0, %xmm0
+; VEX-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; VEX-NEXT: vzeroupper
+; VEX-NEXT: retq
 ;
 ; AVX512F-LABEL: fptosi_8f64_to_8i16:
 ; AVX512F: # %bb.0:
@@ -2797,146 +2779,28 @@ define <8 x i16> @fptosi_8f64_to_8i16(<8 x double> %a) {
 define <8 x i16> @fptoui_8f64_to_8i16(<8 x double> %a) {
 ; SSE-LABEL: fptoui_8f64_to_8i16:
 ; SSE: # %bb.0:
-; SSE-NEXT: movsd {{.*#+}} xmm4 = mem[0],zero
-; SSE-NEXT: movapd %xmm1, %xmm5
-; SSE-NEXT: subsd %xmm4, %xmm5
-; SSE-NEXT: cvttsd2si %xmm5, %rcx
-; SSE-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
-; SSE-NEXT: xorq %rax, %rcx
-; SSE-NEXT: cvttsd2si %xmm1, %rdx
-; SSE-NEXT: ucomisd %xmm4, %xmm1
-; SSE-NEXT: cmovaeq %rcx, %rdx
-; SSE-NEXT: movq %rdx, %xmm5
-; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
-; SSE-NEXT: movapd %xmm1, %xmm6
-; SSE-NEXT: subsd %xmm4, %xmm6
-; SSE-NEXT: cvttsd2si %xmm6, %rcx
-; SSE-NEXT: xorq %rax, %rcx
-; SSE-NEXT: cvttsd2si %xmm1, %rdx
-; SSE-NEXT: ucomisd %xmm4, %xmm1
-; SSE-NEXT: cmovaeq %rcx, %rdx
-; SSE-NEXT: movq %rdx, %xmm1
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm1[0]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm1[0,2,2,3,4,5,6,7]
-; SSE-NEXT: movapd %xmm0, %xmm1
-; SSE-NEXT: subsd %xmm4, %xmm1
-; SSE-NEXT: cvttsd2si %xmm1, %rcx
-; SSE-NEXT: xorq %rax, %rcx
-; SSE-NEXT: cvttsd2si %xmm0, %rdx
-; SSE-NEXT: ucomisd %xmm4, %xmm0
-; SSE-NEXT: cmovaeq %rcx, %rdx
-; SSE-NEXT: movq %rdx, %xmm1
-; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
-; SSE-NEXT: movapd %xmm0, %xmm6
-; SSE-NEXT: subsd %xmm4, %xmm6
-; SSE-NEXT: cvttsd2si %xmm6, %rcx
-; SSE-NEXT: xorq %rax, %rcx
-; SSE-NEXT: cvttsd2si %xmm0, %rdx
-; SSE-NEXT: ucomisd %xmm4, %xmm0
-; SSE-NEXT: cmovaeq %rcx, %rdx
-; SSE-NEXT: movq %rdx, %xmm0
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
-; SSE-NEXT: movapd %xmm3, %xmm0
-; SSE-NEXT: subsd %xmm4, %xmm0
-; SSE-NEXT: cvttsd2si %xmm0, %rcx
-; SSE-NEXT: xorq %rax, %rcx
-; SSE-NEXT: cvttsd2si %xmm3, %rdx
-; SSE-NEXT: ucomisd %xmm4, %xmm3
-; SSE-NEXT: cmovaeq %rcx, %rdx
-; SSE-NEXT: movq %rdx, %xmm0
-; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
-; SSE-NEXT: movapd %xmm3, %xmm5
-; SSE-NEXT: subsd %xmm4, %xmm5
-; SSE-NEXT: cvttsd2si %xmm5, %rcx
-; SSE-NEXT: xorq %rax, %rcx
-; SSE-NEXT: cvttsd2si %xmm3, %rdx
-; SSE-NEXT: ucomisd %xmm4, %xmm3
-; SSE-NEXT: cmovaeq %rcx, %rdx
-; SSE-NEXT: movq %rdx, %xmm3
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
+; SSE-NEXT: cvttpd2dq %xmm3, %xmm3
+; SSE-NEXT: cvttpd2dq %xmm2, %xmm2
+; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE-NEXT: cvttpd2dq %xmm1, %xmm1
+; SSE-NEXT: cvttpd2dq %xmm0, %xmm0
+; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: movapd %xmm2, %xmm0
-; SSE-NEXT: subsd %xmm4, %xmm0
-; SSE-NEXT: cvttsd2si %xmm0, %rcx
-; SSE-NEXT: xorq %rax, %rcx
-; SSE-NEXT: cvttsd2si %xmm2, %rdx
-; SSE-NEXT: ucomisd %xmm4, %xmm2
-; SSE-NEXT: cmovaeq %rcx, %rdx
-; SSE-NEXT: movq %rdx, %xmm0
-; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
-; SSE-NEXT: movapd %xmm2, %xmm5
-; SSE-NEXT: subsd %xmm4, %xmm5
-; SSE-NEXT: cvttsd2si %xmm5, %rcx
-; SSE-NEXT: xorq %rax, %rcx
-; SSE-NEXT: cvttsd2si %xmm2, %rax
-; SSE-NEXT: ucomisd %xmm4, %xmm2
-; SSE-NEXT: cmovaeq %rcx, %rax
-; SSE-NEXT: movq %rax, %xmm2
 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; SSE-NEXT: retq
 ;
-; AVX1-LABEL: fptoui_8f64_to_8i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovapd {{.*#+}} ymm2 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9]
-; AVX1-NEXT: vcmpltpd %ymm2, %ymm1, %ymm3
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT: vpackssdw %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vsubpd %ymm2, %ymm1, %ymm4
-; AVX1-NEXT: vcvttpd2dq %ymm4, %xmm4
-; AVX1-NEXT: vmovapd {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
-; AVX1-NEXT: vxorpd %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vcvttpd2dq %ymm1, %xmm1
-; AVX1-NEXT: vblendvps %xmm3, %xmm1, %xmm4, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vcmpltpd %ymm2, %ymm0, %ymm4
-; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm6
-; AVX1-NEXT: vpackssdw %xmm6, %xmm4, %xmm4
-; AVX1-NEXT: vsubpd %ymm2, %ymm0, %ymm2
-; AVX1-NEXT: vcvttpd2dq %ymm2, %xmm2
-; AVX1-NEXT: vxorpd %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vcvttpd2dq %ymm0, %xmm0
-; AVX1-NEXT: vblendvps %xmm4, %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: fptoui_8f64_to_8i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9]
-; AVX2-NEXT: vcmpltpd %ymm2, %ymm1, %ymm3
-; AVX2-NEXT: vextractf128 $1, %ymm3, %xmm4
-; AVX2-NEXT: vpackssdw %xmm4, %xmm3, %xmm3
-; AVX2-NEXT: vsubpd %ymm2, %ymm1, %ymm4
-; AVX2-NEXT: vcvttpd2dq %ymm4, %xmm4
-; AVX2-NEXT: vbroadcastss {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT: vxorpd %xmm5, %xmm4, %xmm4
-; AVX2-NEXT: vcvttpd2dq %ymm1, %xmm1
-; AVX2-NEXT: vblendvps %xmm3, %xmm1, %xmm4, %xmm1
-; AVX2-NEXT: vcmpltpd %ymm2, %ymm0, %ymm3
-; AVX2-NEXT: vextractf128 $1, %ymm3, %xmm4
-; AVX2-NEXT: vpackssdw %xmm4, %xmm3, %xmm3
-; AVX2-NEXT: vsubpd %ymm2, %ymm0, %ymm2
-; AVX2-NEXT: vcvttpd2dq %ymm2, %xmm2
-; AVX2-NEXT: vxorpd %xmm5, %xmm2, %xmm2
-; AVX2-NEXT: vcvttpd2dq %ymm0, %xmm0
-; AVX2-NEXT: vblendvps %xmm3, %xmm0, %xmm2, %xmm0
-; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; VEX-LABEL: fptoui_8f64_to_8i16:
+; VEX: # %bb.0:
+; VEX-NEXT: vcvttpd2dq %ymm1, %xmm1
+; VEX-NEXT: vcvttpd2dq %ymm0, %xmm0
+; VEX-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; VEX-NEXT: vzeroupper
+; VEX-NEXT: retq
 ;
 ; AVX512F-LABEL: fptoui_8f64_to_8i16:
 ; AVX512F: # %bb.0:
@@ -2975,17 +2839,12 @@ define <16 x i8> @fptosi_16f32_to_16i8(<16 x float> %a) {
 ; SSE-LABEL: fptosi_16f32_to_16i8:
 ; SSE: # %bb.0:
 ; SSE-NEXT: cvttps2dq %xmm3, %xmm3
-; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255]
-; SSE-NEXT: pand %xmm4, %xmm3
 ; SSE-NEXT: cvttps2dq %xmm2, %xmm2
-; SSE-NEXT: pand %xmm4, %xmm2
-; SSE-NEXT: packuswb %xmm3, %xmm2
+; SSE-NEXT: packssdw %xmm3, %xmm2
 ; SSE-NEXT: cvttps2dq %xmm1, %xmm1
-; SSE-NEXT: pand %xmm4, %xmm1
 ; SSE-NEXT: cvttps2dq %xmm0, %xmm0
-; SSE-NEXT: pand %xmm4, %xmm0
-; SSE-NEXT: packuswb %xmm1, %xmm0
-; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: packssdw %xmm1, %xmm0
+; SSE-NEXT: packsswb %xmm2, %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX1-LABEL: fptosi_16f32_to_16i8:
@@ -2993,13 +2852,10 @@ define <16 x i8> @fptosi_16f32_to_16i8(<16 x float> %a) {
 ; AVX1-NEXT: vcvttps2dq %ymm1, %ymm1
 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
 ; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT: vcvttps2dq %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpackssdw %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
 ;
@@ -3008,13 +2864,10 @@ define <16 x i8> @fptosi_16f32_to_16i8(<16 x float> %a) {
 ; AVX2-NEXT: vcvttps2dq %ymm1, %ymm1
 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
 ; AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT: vcvttps2dq %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX2-NEXT: vpackssdw %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
@@ -3032,16 +2885,11 @@ define <16 x i8> @fptoui_16f32_to_16i8(<16 x float> %a) {
 ; SSE-LABEL: fptoui_16f32_to_16i8:
 ; SSE: # %bb.0:
 ; SSE-NEXT: cvttps2dq %xmm3, %xmm3
-; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255]
-; SSE-NEXT: pand %xmm4, %xmm3
 ; SSE-NEXT: cvttps2dq %xmm2, %xmm2
-; SSE-NEXT: pand %xmm4, %xmm2
-; SSE-NEXT: packuswb %xmm3, %xmm2
+; SSE-NEXT: packssdw %xmm3, %xmm2
 ; SSE-NEXT: cvttps2dq %xmm1, %xmm1
-; SSE-NEXT: pand %xmm4, %xmm1
 ; SSE-NEXT: cvttps2dq %xmm0, %xmm0
-; SSE-NEXT: pand %xmm4, %xmm0
-; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: packssdw %xmm1, %xmm0
 ; SSE-NEXT: packuswb %xmm2, %xmm0
 ; SSE-NEXT: retq
 ;
@@ -3050,12 +2898,9 @@ define <16 x i8> @fptoui_16f32_to_16i8(<16 x float> %a) {
 ; AVX1-NEXT: vcvttps2dq %ymm1, %ymm1
 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT: vcvttps2dq %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
@@ -3065,12 +2910,9 @@ define <16 x i8> @fptoui_16f32_to_16i8(<16 x float> %a) {
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vcvttps2dq %ymm1, %ymm1
 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
 ; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT: vcvttps2dq %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX2-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
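
For illustration, here is a minimal standalone sketch of the X86 improvement described in the message above. This is a hypothetical reduced test, not part of the commit: the RUN line and triple are assumed, while the expected instructions are lifted verbatim from the VEX check lines in the vec_fp_to_int.ll hunk. With this patch, an <8 x double> to <8 x i16> fptoui goes through ordinary result-type promotion (FP_TO_UINT promoted to a v8i32 FP_TO_SINT, split into two cvttpd2dq halves, then narrowed with one vpackusdw, which the AssertZExt from promotion makes safe) instead of being scalarized through cvttsd2si with compare/cmov sequences.

; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s

; Hypothetical reduction; expected lines taken from the VEX checks above.
define <8 x i16> @fptoui_8f64_to_8i16(<8 x double> %a) {
; CHECK-LABEL: fptoui_8f64_to_8i16:
; CHECK: vcvttpd2dq %ymm1, %xmm1
; CHECK-NEXT: vcvttpd2dq %ymm0, %xmm0
; CHECK-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
  %cvt = fptoui <8 x double> %a to <8 x i16>
  ret <8 x i16> %cvt
}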
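Similarly, a sketch of the pre-SSE4.1 narrowing strategy that the new X86ISelLowering comment relies on (again a hypothetical standalone test; the RUN line is assumed, and the expected instructions are taken from the SSE check lines of the fptoui_16f32_to_16i8 hunks above). Because promotion now emits the appropriate assert nodes, the converted v4i32 values can be narrowed with signed packssdw and a final packuswb, with no packusdw (SSE4.1-only) and none of the pand masking the old lowering needed.

; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s

; Hypothetical reduction; expected lines taken from the SSE checks above.
define <16 x i8> @fptoui_16f32_to_16i8(<16 x float> %a) {
; CHECK-LABEL: fptoui_16f32_to_16i8:
; CHECK: cvttps2dq %xmm3, %xmm3
; CHECK-NEXT: cvttps2dq %xmm2, %xmm2
; CHECK-NEXT: packssdw %xmm3, %xmm2
; CHECK-NEXT: cvttps2dq %xmm1, %xmm1
; CHECK-NEXT: cvttps2dq %xmm0, %xmm0
; CHECK-NEXT: packssdw %xmm1, %xmm0
; CHECK-NEXT: packuswb %xmm2, %xmm0
; CHECK-NEXT: retq
  %cvt = fptoui <16 x float> %a to <16 x i8>
  ret <16 x i8> %cvt
}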