From a5d266b9cfc6b5cda1925636a14de13cf46a4743 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 26 Dec 2019 17:27:44 -0800 Subject: [PATCH] [X86] Add custom legalization for strict_uint_to_fp v2i32->v2f32. I believe the algorithm we use for non-strict is exception safe for strict. The fsub won't generate any exceptions. After it we will have an exact version of the i32 integer in a double. Then we just round it to f32. That rounding will generate a precision exception if it can't be represented exactly. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 23 +++-- llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll | 107 ++++----------------- .../X86/vector-constrained-fp-intrinsics.ll | 39 +++----- 3 files changed, 47 insertions(+), 122 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 94e4339..2359464 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -1000,8 +1000,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2i32, Custom); // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion. - // FIXME: Does this apply to STRICT_UINT_TO_FP? setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom); + setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2f32, Custom); setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v2f32, Custom); @@ -1857,8 +1857,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom); setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2f32, Custom); assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) && + isOperationCustom(ISD::STRICT_UINT_TO_FP, MVT::v2f32) && "Unexpected operation action!"); - setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2f32, Custom); // v2i64 FP_TO_S/UINT(v2f32) custom conversion. setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom); @@ -28926,8 +28926,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, } return; } - // FIXME: Is this safe for strict fp? - if (SrcVT != MVT::v2i32 || IsSigned || IsStrict) + if (SrcVT != MVT::v2i32 || IsSigned) return; assert(Subtarget.hasSSE2() && "Requires at least SSE2!"); SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src); @@ -28936,9 +28935,19 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn, DAG.getBitcast(MVT::v2i64, VBias)); Or = DAG.getBitcast(MVT::v2f64, Or); - // TODO: Are there any fast-math-flags to propagate here? - SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias); - Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub)); + if (IsStrict) { + SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other}, + {N->getOperand(0), Or, VBias}); + SDValue Res = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, + {MVT::v4f32, MVT::Other}, + {Sub.getValue(1), Sub}); + Results.push_back(Res); + Results.push_back(Res.getValue(1)); + } else { + // TODO: Are there any fast-math-flags to propagate here? + SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias); + Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub)); + } return; } case ISD::STRICT_FP_ROUND: diff --git a/llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll b/llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll index ad5b285..daf4bb5 100644 --- a/llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll +++ b/llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll @@ -63,97 +63,24 @@ define <2 x float> @sitofp_v2i32_v2f32(<2 x i32> %x) #0 { } define <2 x float> @uitofp_v2i32_v2f32(<2 x i32> %x) #0 { -; SSE-32-LABEL: uitofp_v2i32_v2f32: -; SSE-32: # %bb.0: -; SSE-32-NEXT: xorps %xmm2, %xmm2 -; SSE-32-NEXT: xorps %xmm1, %xmm1 -; SSE-32-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-32-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero -; SSE-32-NEXT: orps %xmm3, %xmm1 -; SSE-32-NEXT: subsd %xmm3, %xmm1 -; SSE-32-NEXT: cvtsd2ss %xmm1, %xmm1 -; SSE-32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSE-32-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSE-32-NEXT: orps %xmm3, %xmm2 -; SSE-32-NEXT: subsd %xmm3, %xmm2 -; SSE-32-NEXT: xorps %xmm0, %xmm0 -; SSE-32-NEXT: cvtsd2ss %xmm2, %xmm0 -; SSE-32-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-32-NEXT: movaps %xmm1, %xmm0 -; SSE-32-NEXT: retl -; -; SSE-64-LABEL: uitofp_v2i32_v2f32: -; SSE-64: # %bb.0: -; SSE-64-NEXT: movd %xmm0, %eax -; SSE-64-NEXT: cvtsi2ss %rax, %xmm1 -; SSE-64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSE-64-NEXT: movd %xmm0, %eax -; SSE-64-NEXT: xorps %xmm0, %xmm0 -; SSE-64-NEXT: cvtsi2ss %rax, %xmm0 -; SSE-64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-64-NEXT: movaps %xmm1, %xmm0 -; SSE-64-NEXT: retq -; -; AVX1-32-LABEL: uitofp_v2i32_v2f32: -; AVX1-32: # %bb.0: -; AVX1-32-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX1-32-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0],xmm1[1,2,3] -; AVX1-32-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero -; AVX1-32-NEXT: vorps %xmm3, %xmm2, %xmm2 -; AVX1-32-NEXT: vsubsd %xmm3, %xmm2, %xmm2 -; AVX1-32-NEXT: vcvtsd2ss %xmm2, %xmm2, %xmm2 -; AVX1-32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,3] -; AVX1-32-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] -; AVX1-32-NEXT: vorps %xmm3, %xmm0, %xmm0 -; AVX1-32-NEXT: vsubsd %xmm3, %xmm0, %xmm0 -; AVX1-32-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 -; AVX1-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[2,3] -; AVX1-32-NEXT: retl -; -; AVX1-64-LABEL: uitofp_v2i32_v2f32: -; AVX1-64: # %bb.0: -; AVX1-64-NEXT: vextractps $1, %xmm0, %eax -; AVX1-64-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 -; AVX1-64-NEXT: vmovd %xmm0, %eax -; AVX1-64-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 -; AVX1-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] -; AVX1-64-NEXT: retq -; -; AVX512F-LABEL: uitofp_v2i32_v2f32: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vextractps $1, %xmm0, %eax -; AVX512F-NEXT: vcvtusi2ss %eax, %xmm1, %xmm1 -; AVX512F-NEXT: vmovd %xmm0, %eax -; AVX512F-NEXT: vcvtusi2ss %eax, %xmm2, %xmm0 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] -; AVX512F-NEXT: ret{{[l|q]}} -; -; AVX512VL-LABEL: uitofp_v2i32_v2f32: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextractps $1, %xmm0, %eax -; AVX512VL-NEXT: vcvtusi2ss %eax, %xmm1, %xmm1 -; AVX512VL-NEXT: vmovd %xmm0, %eax -; AVX512VL-NEXT: vcvtusi2ss %eax, %xmm2, %xmm0 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] -; AVX512VL-NEXT: ret{{[l|q]}} -; -; AVX512DQ-LABEL: uitofp_v2i32_v2f32: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vextractps $1, %xmm0, %eax -; AVX512DQ-NEXT: vcvtusi2ss %eax, %xmm1, %xmm1 -; AVX512DQ-NEXT: vmovd %xmm0, %eax -; AVX512DQ-NEXT: vcvtusi2ss %eax, %xmm2, %xmm0 -; AVX512DQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] -; AVX512DQ-NEXT: ret{{[l|q]}} +; SSE-LABEL: uitofp_v2i32_v2f32: +; SSE: # %bb.0: +; SSE-NEXT: xorpd %xmm1, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movapd {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15] +; SSE-NEXT: orpd %xmm1, %xmm0 +; SSE-NEXT: subpd %xmm1, %xmm0 +; SSE-NEXT: cvtpd2ps %xmm0, %xmm0 +; SSE-NEXT: ret{{[l|q]}} ; -; AVX512DQVL-LABEL: uitofp_v2i32_v2f32: -; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vextractps $1, %xmm0, %eax -; AVX512DQVL-NEXT: vcvtusi2ss %eax, %xmm1, %xmm1 -; AVX512DQVL-NEXT: vmovd %xmm0, %eax -; AVX512DQVL-NEXT: vcvtusi2ss %eax, %xmm2, %xmm0 -; AVX512DQVL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] -; AVX512DQVL-NEXT: ret{{[l|q]}} +; AVX-LABEL: uitofp_v2i32_v2f32: +; AVX: # %bb.0: +; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15] +; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vsubpd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vcvtpd2ps %xmm0, %xmm0 +; AVX-NEXT: ret{{[l|q]}} %result = call <2 x float> @llvm.experimental.constrained.uitofp.v2f32.v2i32(<2 x i32> %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 diff --git a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll index fc92546..e131f61 100644 --- a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll +++ b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll @@ -6836,33 +6836,22 @@ entry: define <2 x float> @constrained_vector_uitofp_v2f32_v2i32(<2 x i32> %x) #0 { ; CHECK-LABEL: constrained_vector_uitofp_v2f32_v2i32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movd %xmm0, %eax -; CHECK-NEXT: cvtsi2ss %rax, %xmm1 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; CHECK-NEXT: movd %xmm0, %eax -; CHECK-NEXT: xorps %xmm0, %xmm0 -; CHECK-NEXT: cvtsi2ss %rax, %xmm0 -; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: xorpd %xmm1, %xmm1 +; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-NEXT: movapd {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15] +; CHECK-NEXT: orpd %xmm1, %xmm0 +; CHECK-NEXT: subpd %xmm1, %xmm0 +; CHECK-NEXT: cvtpd2ps %xmm0, %xmm0 ; CHECK-NEXT: retq ; -; AVX1-LABEL: constrained_vector_uitofp_v2f32_v2i32: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vextractps $1, %xmm0, %eax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] -; AVX1-NEXT: retq -; -; AVX512-LABEL: constrained_vector_uitofp_v2f32_v2i32: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vextractps $1, %xmm0, %eax -; AVX512-NEXT: vcvtusi2ss %eax, %xmm1, %xmm1 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: vcvtusi2ss %eax, %xmm2, %xmm0 -; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] -; AVX512-NEXT: retq +; AVX-LABEL: constrained_vector_uitofp_v2f32_v2i32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15] +; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vsubpd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vcvtpd2ps %xmm0, %xmm0 +; AVX-NEXT: retq entry: %result = call <2 x float> @llvm.experimental.constrained.uitofp.v2f32.v2i32(<2 x i32> %x, -- 2.7.4