D53794 introduced code to perform the FP_TO_UINT expansion via FP_TO_SINT in a way that would never expose floating-point exceptions in the intermediate steps. Unfortunately, I just noticed there is still a way this can happen. As discussed in D53794, the compiler now generates this sequence:
// Sel = Src < 0x8000000000000000
// Val = select Sel, Src, Src - 0x8000000000000000
// Ofs = select Sel, 0, 0x8000000000000000
// Result = fp_to_sint(Val) ^ Ofs
The problem is with the Src - 0x8000000000000000 expression. As I mentioned in the original review, that expression can never overflow or underflow if the original value is in range for FP_TO_UINT. But I missed that we can get an Inexact exception in the case where Src is a very small positive value. (In this case the result of the sub is ignored, but that doesn't help.)
Instead, I'd suggest to use the following sequence:
// Sel = Src < 0x8000000000000000
// FltOfs = select Sel, 0, 0x8000000000000000
// IntOfs = select Sel, 0, 0x8000000000000000
// Result = fp_to_sint(Src - FltOfs) ^ IntOfs
In the case where the value is already in range of FP_TO_SINT, we now simply compute Val - 0, which now definitely cannot trap (unless Val is a NaN in which case we'd want to trap anyway).
In the case where the value is not in range of FP_TO_SINT, but still in range of FP_TO_UINT, the sub can never be inexact, as Val is between 2^(n-1) and (2^n)-1, i.e. always has the 2^(n-1) bit set, and the sub is always simply clearing that bit.
There is a slight complication in the case where Val is a constant, so we know at compile time whether Sel is true or false. In that scenario, the old code would automatically optimize the sub away, while this no longer happens with the new code. Instead, I've added extra code to check for this case and then just fall back to FP_TO_SINT directly. (This seems to catch even slightly more cases.)
Original version of the patch by Ulrich Weigand. X86 changes added by Craig Topper.
Differential Revision: https://reviews.llvm.org/D67105
// Expand based on maximum range of FP_TO_SINT, if the value exceeds the
// signmask then offset (the result of which should be fully representable).
// Sel = Src < 0x8000000000000000
- // Val = select Sel, Src, Src - 0x8000000000000000
- // Ofs = select Sel, 0, 0x8000000000000000
- // Result = fp_to_sint(Val) ^ Ofs
+ // FltOfs = select Sel, 0, 0x8000000000000000
+ // IntOfs = select Sel, 0, 0x8000000000000000
+ // Result = fp_to_sint(Src - FltOfs) ^ IntOfs
// TODO: Should any fast-math-flags be set for the FSUB?
- SDValue SrcBiased;
- if (Node->isStrictFPOpcode())
- SrcBiased = DAG.getNode(ISD::STRICT_FSUB, dl, { SrcVT, MVT::Other },
- { Node->getOperand(0), Src, Cst });
- else
- SrcBiased = DAG.getNode(ISD::FSUB, dl, SrcVT, Src, Cst);
- SDValue Val = DAG.getSelect(dl, SrcVT, Sel, Src, SrcBiased);
- SDValue Ofs = DAG.getSelect(dl, DstVT, Sel, DAG.getConstant(0, dl, DstVT),
- DAG.getConstant(SignMask, dl, DstVT));
+ SDValue FltOfs = DAG.getSelect(dl, SrcVT, Sel,
+ DAG.getConstantFP(0.0, dl, SrcVT), Cst);
+ SDValue IntOfs = DAG.getSelect(dl, DstVT, Sel,
+ DAG.getConstant(0, dl, DstVT),
+ DAG.getConstant(SignMask, dl, DstVT));
SDValue SInt;
if (Node->isStrictFPOpcode()) {
+ SDValue Val = DAG.getNode(ISD::STRICT_FSUB, dl, { SrcVT, MVT::Other },
+ { Node->getOperand(0), Src, FltOfs });
SInt = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, { DstVT, MVT::Other },
- { SrcBiased.getValue(1), Val });
+ { Val.getValue(1), Val });
Chain = SInt.getValue(1);
- } else
+ } else {
+ SDValue Val = DAG.getNode(ISD::FSUB, dl, SrcVT, Src, FltOfs);
SInt = DAG.getNode(ISD::FP_TO_SINT, dl, DstVT, Val);
- Result = DAG.getNode(ISD::XOR, dl, DstVT, SInt, Ofs);
+ }
+ Result = DAG.getNode(ISD::XOR, dl, DstVT, SInt, IntOfs);
} else {
// Expand based on maximum range of FP_TO_SINT:
// True = fp_to_sint(Src)
// of a signed i64. Let Thresh be the FP equivalent of
// 0x8000000000000000ULL.
//
- // Adjust i32 = (Value < Thresh) ? 0 : 0x80000000;
- // FistSrc = (Value < Thresh) ? Value : (Value - Thresh);
+ // Adjust = (Value < Thresh) ? 0 : 0x80000000;
+ // FltOfs = (Value < Thresh) ? 0 : 0x80000000;
+ // FistSrc = (Value - FltOfs);
// Fist-to-mem64 FistSrc
// Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
// to XOR'ing the high 32 bits with Adjust.
// Being a power of 2, Thresh is exactly representable in all FP formats.
// For X87 we'd like to use the smallest FP type for this constant, but
// for DAG type consistency we have to match the FP operand type.
- // FIXME: This code generates a spurious inexact exception for 1.0.
APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
DAG.getConstant(0, DL, MVT::i64),
DAG.getConstant(APInt::getSignMask(64),
DL, MVT::i64));
- SDValue Sub;
+ SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp,
+ DAG.getConstantFP(0.0, DL, TheVT),
+ ThreshVal);
+
if (IsStrict) {
- Sub = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other},
- { Chain, Value, ThreshVal });
- Chain = Sub.getValue(1);
+ Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other},
+ { Chain, Value, FltOfs });
+ Chain = Value.getValue(1);
} else
- Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal);
-
- Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(),
- *DAG.getContext(), TheVT),
- Value, ThreshVal, ISD::SETLT);
- Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub);
+ Value = DAG.getNode(ISD::FSUB, DL, TheVT, Value, FltOfs);
}
MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
; CHECK-NEXT: larl %r1, .LCPI0_0
; CHECK-NEXT: le %f1, 0(%r1)
; CHECK-NEXT: cebr %f0, %f1
-; CHECK-NEXT: lhi %r0, 0
-; CHECK-NEXT: jl .LBB0_2
+; CHECK-NEXT: jnl .LBB0_2
; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: sebr %f0, %f1
-; CHECK-NEXT: llilh %r0, 32768
+; CHECK-NEXT: lhi %r0, 0
+; CHECK-NEXT: lzer %f1
+; CHECK-NEXT: j .LBB0_3
; CHECK-NEXT: .LBB0_2:
+; CHECK-NEXT: llilh %r0, 32768
+; CHECK-NEXT: .LBB0_3:
+; CHECK-NEXT: sebr %f0, %f1
; CHECK-NEXT: cfebr %r2, 5, %f0
; CHECK-NEXT: xr %r2, %r0
; CHECK-NEXT: br %r14
; CHECK-NEXT: larl %r1, .LCPI1_0
; CHECK-NEXT: ldeb %f1, 0(%r1)
; CHECK-NEXT: cdbr %f0, %f1
-; CHECK-NEXT: lhi %r0, 0
-; CHECK-NEXT: jl .LBB1_2
+; CHECK-NEXT: jnl .LBB1_2
; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: sdbr %f0, %f1
-; CHECK-NEXT: llilh %r0, 32768
+; CHECK-NEXT: lhi %r0, 0
+; CHECK-NEXT: lzdr %f1
+; CHECK-NEXT: j .LBB1_3
; CHECK-NEXT: .LBB1_2:
+; CHECK-NEXT: llilh %r0, 32768
+; CHECK-NEXT: .LBB1_3:
+; CHECK-NEXT: sdbr %f0, %f1
; CHECK-NEXT: cfdbr %r2, 5, %f0
; CHECK-NEXT: xr %r2, %r0
; CHECK-NEXT: br %r14
; CHECK-NEXT: larl %r1, .LCPI2_0
; CHECK-NEXT: lxeb %f1, 0(%r1)
; CHECK-NEXT: cxbr %f0, %f1
-; CHECK-NEXT: lhi %r0, 0
-; CHECK-NEXT: jl .LBB2_2
+; CHECK-NEXT: jnl .LBB2_2
; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: sxbr %f0, %f1
-; CHECK-NEXT: llilh %r0, 32768
+; CHECK-NEXT: lhi %r0, 0
+; CHECK-NEXT: lzxr %f1
+; CHECK-NEXT: j .LBB2_3
; CHECK-NEXT: .LBB2_2:
+; CHECK-NEXT: llilh %r0, 32768
+; CHECK-NEXT: .LBB2_3:
+; CHECK-NEXT: sxbr %f0, %f1
; CHECK-NEXT: cfxbr %r2, 5, %f0
; CHECK-NEXT: xr %r2, %r0
; CHECK-NEXT: br %r14
; CHECK-NEXT: larl %r1, .LCPI0_0
; CHECK-NEXT: le %f1, 0(%r1)
; CHECK-NEXT: cebr %f0, %f1
-; CHECK-NEXT: lghi %r0, 0
-; CHECK-NEXT: jl .LBB0_2
+; CHECK-NEXT: jnl .LBB0_2
; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: sebr %f0, %f1
-; CHECK-NEXT: llihh %r0, 32768
+; CHECK-NEXT: lghi %r0, 0
+; CHECK-NEXT: lzer %f1
+; CHECK-NEXT: j .LBB0_3
; CHECK-NEXT: .LBB0_2:
+; CHECK-NEXT: llihh %r0, 32768
+; CHECK-NEXT: .LBB0_3:
+; CHECK-NEXT: sebr %f0, %f1
; CHECK-NEXT: cgebr %r2, 5, %f0
; CHECK-NEXT: xgr %r2, %r0
; CHECK-NEXT: br %r14
; CHECK-NEXT: larl %r1, .LCPI1_0
; CHECK-NEXT: ldeb %f1, 0(%r1)
; CHECK-NEXT: cdbr %f0, %f1
-; CHECK-NEXT: lghi %r0, 0
-; CHECK-NEXT: jl .LBB1_2
+; CHECK-NEXT: jnl .LBB1_2
; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: sdbr %f0, %f1
-; CHECK-NEXT: llihh %r0, 32768
+; CHECK-NEXT: lghi %r0, 0
+; CHECK-NEXT: lzdr %f1
+; CHECK-NEXT: j .LBB1_3
; CHECK-NEXT: .LBB1_2:
+; CHECK-NEXT: llihh %r0, 32768
+; CHECK-NEXT: .LBB1_3:
+; CHECK-NEXT: sdbr %f0, %f1
; CHECK-NEXT: cgdbr %r2, 5, %f0
; CHECK-NEXT: xgr %r2, %r0
; CHECK-NEXT: br %r14
; CHECK-NEXT: larl %r1, .LCPI2_0
; CHECK-NEXT: lxeb %f1, 0(%r1)
; CHECK-NEXT: cxbr %f0, %f1
-; CHECK-NEXT: lghi %r0, 0
-; CHECK-NEXT: jl .LBB2_2
+; CHECK-NEXT: jnl .LBB2_2
; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: sxbr %f0, %f1
-; CHECK-NEXT: llihh %r0, 32768
+; CHECK-NEXT: lghi %r0, 0
+; CHECK-NEXT: lzxr %f1
+; CHECK-NEXT: j .LBB2_3
; CHECK-NEXT: .LBB2_2:
+; CHECK-NEXT: llihh %r0, 32768
+; CHECK-NEXT: .LBB2_3:
+; CHECK-NEXT: sxbr %f0, %f1
; CHECK-NEXT: cgxbr %r2, 5, %f0
; CHECK-NEXT: xgr %r2, %r0
; CHECK-NEXT: br %r14
; X86-NEXT: subl $16, %esp
; X86-NEXT: fldt 8(%ebp)
; X86-NEXT: flds {{\.LCPI.*}}
-; X86-NEXT: fld %st(1)
-; X86-NEXT: fsub %st(1), %st
-; X86-NEXT: fxch %st(1)
-; X86-NEXT: fucomp %st(2)
+; X86-NEXT: fucom %st(1)
; X86-NEXT: fnstsw %ax
+; X86-NEXT: xorl %edx, %edx
; X86-NEXT: # kill: def $ah killed $ah killed $ax
; X86-NEXT: sahf
+; X86-NEXT: setbe %al
+; X86-NEXT: fldz
; X86-NEXT: ja .LBB10_2
; X86-NEXT: # %bb.1:
-; X86-NEXT: fstp %st(1)
+; X86-NEXT: fstp %st(0)
; X86-NEXT: fldz
+; X86-NEXT: fxch %st(1)
; X86-NEXT: .LBB10_2:
-; X86-NEXT: fstp %st(0)
-; X86-NEXT: setbe %al
+; X86-NEXT: fstp %st(1)
+; X86-NEXT: fsubrp %st, %st(1)
; X86-NEXT: fnstcw {{[0-9]+}}(%esp)
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: orl $3072, %ecx # imm = 0xC00
; X86-NEXT: fldcw {{[0-9]+}}(%esp)
; X86-NEXT: fistpll {{[0-9]+}}(%esp)
; X86-NEXT: fldcw {{[0-9]+}}(%esp)
-; X86-NEXT: movzbl %al, %edx
+; X86-NEXT: movb %al, %dl
; X86-NEXT: shll $31, %edx
; X86-NEXT: xorl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X64-X87: # %bb.0:
; X64-X87-NEXT: fldt {{[0-9]+}}(%rsp)
; X64-X87-NEXT: flds {{.*}}(%rip)
-; X64-X87-NEXT: fld %st(1)
-; X64-X87-NEXT: fsub %st(1), %st
; X64-X87-NEXT: xorl %eax, %eax
+; X64-X87-NEXT: fucomi %st(1), %st
+; X64-X87-NEXT: setbe %al
+; X64-X87-NEXT: fldz
; X64-X87-NEXT: fxch %st(1)
-; X64-X87-NEXT: fucompi %st(2), %st
; X64-X87-NEXT: fcmovnbe %st(1), %st
; X64-X87-NEXT: fstp %st(1)
-; X64-X87-NEXT: setbe %al
+; X64-X87-NEXT: fsubrp %st, %st(1)
; X64-X87-NEXT: fnstcw -{{[0-9]+}}(%rsp)
; X64-X87-NEXT: movzwl -{{[0-9]+}}(%rsp), %ecx
; X64-X87-NEXT: orl $3072, %ecx # imm = 0xC00
; X64-SSSE3: # %bb.0:
; X64-SSSE3-NEXT: fldt {{[0-9]+}}(%rsp)
; X64-SSSE3-NEXT: flds {{.*}}(%rip)
-; X64-SSSE3-NEXT: fld %st(1)
-; X64-SSSE3-NEXT: fsub %st(1), %st
; X64-SSSE3-NEXT: xorl %eax, %eax
+; X64-SSSE3-NEXT: fucomi %st(1), %st
+; X64-SSSE3-NEXT: fldz
; X64-SSSE3-NEXT: fxch %st(1)
-; X64-SSSE3-NEXT: fucompi %st(2), %st
; X64-SSSE3-NEXT: fcmovnbe %st(1), %st
; X64-SSSE3-NEXT: fstp %st(1)
+; X64-SSSE3-NEXT: fsubrp %st, %st(1)
; X64-SSSE3-NEXT: fisttpll -{{[0-9]+}}(%rsp)
; X64-SSSE3-NEXT: setbe %al
; X64-SSSE3-NEXT: shlq $63, %rax
; X86-NEXT: movl 8(%ebp), %eax
; X86-NEXT: fldt (%eax)
; X86-NEXT: flds {{\.LCPI.*}}
-; X86-NEXT: fld %st(1)
-; X86-NEXT: fsub %st(1), %st
-; X86-NEXT: fxch %st(1)
-; X86-NEXT: fucomp %st(2)
+; X86-NEXT: fucom %st(1)
; X86-NEXT: fnstsw %ax
+; X86-NEXT: xorl %edx, %edx
; X86-NEXT: # kill: def $ah killed $ah killed $ax
; X86-NEXT: sahf
+; X86-NEXT: setbe %al
+; X86-NEXT: fldz
; X86-NEXT: ja .LBB11_2
; X86-NEXT: # %bb.1:
-; X86-NEXT: fstp %st(1)
+; X86-NEXT: fstp %st(0)
; X86-NEXT: fldz
+; X86-NEXT: fxch %st(1)
; X86-NEXT: .LBB11_2:
-; X86-NEXT: fstp %st(0)
-; X86-NEXT: setbe %al
+; X86-NEXT: fstp %st(1)
+; X86-NEXT: fsubrp %st, %st(1)
; X86-NEXT: fnstcw {{[0-9]+}}(%esp)
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: orl $3072, %ecx # imm = 0xC00
; X86-NEXT: fldcw {{[0-9]+}}(%esp)
; X86-NEXT: fistpll {{[0-9]+}}(%esp)
; X86-NEXT: fldcw {{[0-9]+}}(%esp)
-; X86-NEXT: movzbl %al, %edx
+; X86-NEXT: movb %al, %dl
; X86-NEXT: shll $31, %edx
; X86-NEXT: xorl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X64-X87: # %bb.0:
; X64-X87-NEXT: fldt (%rdi)
; X64-X87-NEXT: flds {{.*}}(%rip)
-; X64-X87-NEXT: fld %st(1)
-; X64-X87-NEXT: fsub %st(1), %st
; X64-X87-NEXT: xorl %eax, %eax
+; X64-X87-NEXT: fucomi %st(1), %st
+; X64-X87-NEXT: setbe %al
+; X64-X87-NEXT: fldz
; X64-X87-NEXT: fxch %st(1)
-; X64-X87-NEXT: fucompi %st(2), %st
; X64-X87-NEXT: fcmovnbe %st(1), %st
; X64-X87-NEXT: fstp %st(1)
-; X64-X87-NEXT: setbe %al
+; X64-X87-NEXT: fsubrp %st, %st(1)
; X64-X87-NEXT: fnstcw -{{[0-9]+}}(%rsp)
; X64-X87-NEXT: movzwl -{{[0-9]+}}(%rsp), %ecx
; X64-X87-NEXT: orl $3072, %ecx # imm = 0xC00
; X64-SSSE3: # %bb.0:
; X64-SSSE3-NEXT: fldt (%rdi)
; X64-SSSE3-NEXT: flds {{.*}}(%rip)
-; X64-SSSE3-NEXT: fld %st(1)
-; X64-SSSE3-NEXT: fsub %st(1), %st
; X64-SSSE3-NEXT: xorl %eax, %eax
+; X64-SSSE3-NEXT: fucomi %st(1), %st
+; X64-SSSE3-NEXT: fldz
; X64-SSSE3-NEXT: fxch %st(1)
-; X64-SSSE3-NEXT: fucompi %st(2), %st
; X64-SSSE3-NEXT: fcmovnbe %st(1), %st
; X64-SSSE3-NEXT: fstp %st(1)
+; X64-SSSE3-NEXT: fsubrp %st, %st(1)
; X64-SSSE3-NEXT: fisttpll -{{[0-9]+}}(%rsp)
; X64-SSSE3-NEXT: setbe %al
; X64-SSSE3-NEXT: shlq $63, %rax
; X86-SSE: # %bb.0: # %entry
; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X86-SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; X86-SSE-NEXT: xorl %ecx, %ecx
+; X86-SSE-NEXT: ucomisd %xmm0, %xmm1
+; X86-SSE-NEXT: setbe %cl
+; X86-SSE-NEXT: shll $31, %ecx
; X86-SSE-NEXT: movapd %xmm0, %xmm2
; X86-SSE-NEXT: cmpltsd %xmm1, %xmm2
-; X86-SSE-NEXT: movapd %xmm2, %xmm3
-; X86-SSE-NEXT: andpd %xmm0, %xmm2
-; X86-SSE-NEXT: xorl %eax, %eax
-; X86-SSE-NEXT: ucomisd %xmm0, %xmm1
-; X86-SSE-NEXT: subsd %xmm1, %xmm0
-; X86-SSE-NEXT: andnpd %xmm0, %xmm3
-; X86-SSE-NEXT: orpd %xmm3, %xmm2
-; X86-SSE-NEXT: cvttsd2si %xmm2, %ecx
-; X86-SSE-NEXT: setbe %al
-; X86-SSE-NEXT: shll $31, %eax
+; X86-SSE-NEXT: andnpd %xmm1, %xmm2
+; X86-SSE-NEXT: subsd %xmm2, %xmm0
+; X86-SSE-NEXT: cvttsd2si %xmm0, %eax
; X86-SSE-NEXT: xorl %ecx, %eax
; X86-SSE-NEXT: retl
;
; X87-NEXT: .cfi_def_cfa_offset 24
; X87-NEXT: fldl {{[0-9]+}}(%esp)
; X87-NEXT: flds {{\.LCPI.*}}
-; X87-NEXT: fld %st(1)
-; X87-NEXT: fsub %st(1), %st
; X87-NEXT: xorl %edx, %edx
+; X87-NEXT: fucomi %st(1), %st
+; X87-NEXT: setbe %dl
+; X87-NEXT: fldz
; X87-NEXT: fxch %st(1)
-; X87-NEXT: fucompi %st(2), %st
; X87-NEXT: fcmovnbe %st(1), %st
; X87-NEXT: fstp %st(1)
-; X87-NEXT: setbe %dl
+; X87-NEXT: fsubrp %st, %st(1)
; X87-NEXT: fnstcw {{[0-9]+}}(%esp)
; X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X87-NEXT: orl $3072, %eax # imm = 0xC00
; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X86-SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; X86-SSE-NEXT: movapd %xmm0, %xmm2
-; X86-SSE-NEXT: subsd %xmm1, %xmm2
+; X86-SSE-NEXT: cmpltsd %xmm1, %xmm2
+; X86-SSE-NEXT: andnpd %xmm1, %xmm2
; X86-SSE-NEXT: movapd %xmm0, %xmm3
-; X86-SSE-NEXT: cmpltsd %xmm1, %xmm3
-; X86-SSE-NEXT: movapd %xmm3, %xmm4
-; X86-SSE-NEXT: andnpd %xmm2, %xmm4
-; X86-SSE-NEXT: andpd %xmm0, %xmm3
-; X86-SSE-NEXT: orpd %xmm4, %xmm3
-; X86-SSE-NEXT: movlpd %xmm3, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT: subsd %xmm2, %xmm3
+; X86-SSE-NEXT: movsd %xmm3, {{[0-9]+}}(%esp)
; X86-SSE-NEXT: fldl {{[0-9]+}}(%esp)
; X86-SSE-NEXT: fnstcw {{[0-9]+}}(%esp)
; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; SSE-LABEL: f20u64:
; SSE: # %bb.0: # %entry
; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; SSE-NEXT: xorl %ecx, %ecx
+; SSE-NEXT: ucomisd %xmm1, %xmm0
+; SSE-NEXT: setae %cl
+; SSE-NEXT: shlq $63, %rcx
; SSE-NEXT: movapd %xmm0, %xmm2
; SSE-NEXT: cmpltsd %xmm1, %xmm2
-; SSE-NEXT: movapd %xmm2, %xmm3
-; SSE-NEXT: andpd %xmm0, %xmm2
-; SSE-NEXT: xorl %eax, %eax
-; SSE-NEXT: ucomisd %xmm1, %xmm0
-; SSE-NEXT: subsd %xmm1, %xmm0
-; SSE-NEXT: andnpd %xmm0, %xmm3
-; SSE-NEXT: orpd %xmm3, %xmm2
-; SSE-NEXT: cvttsd2si %xmm2, %rcx
-; SSE-NEXT: setae %al
-; SSE-NEXT: shlq $63, %rax
+; SSE-NEXT: andnpd %xmm1, %xmm2
+; SSE-NEXT: subsd %xmm2, %xmm0
+; SSE-NEXT: cvttsd2si %xmm0, %rax
; SSE-NEXT: xorq %rcx, %rax
; SSE-NEXT: retq
;
; AVX1-LABEL: f20u64:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX1-NEXT: vcmpltsd %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vsubsd %xmm1, %xmm0, %xmm3
-; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm3, %xmm2
-; AVX1-NEXT: vcvttsd2si %xmm2, %rcx
-; AVX1-NEXT: xorl %eax, %eax
+; AVX1-NEXT: xorl %ecx, %ecx
; AVX1-NEXT: vucomisd %xmm1, %xmm0
-; AVX1-NEXT: setae %al
-; AVX1-NEXT: shlq $63, %rax
+; AVX1-NEXT: setae %cl
+; AVX1-NEXT: shlq $63, %rcx
+; AVX1-NEXT: vcmpltsd %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vandnpd %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vsubsd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vcvttsd2si %xmm0, %rax
; AVX1-NEXT: xorq %rcx, %rax
; AVX1-NEXT: retq
;
; AVX512F_32_WIN-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX512F_32_WIN-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX512F_32_WIN-NEXT: vcmpltss %xmm1, %xmm0, %k1
-; AVX512F_32_WIN-NEXT: vsubss %xmm1, %xmm0, %xmm2
-; AVX512F_32_WIN-NEXT: vmovss %xmm0, %xmm2, %xmm2 {%k1}
-; AVX512F_32_WIN-NEXT: vmovss %xmm2, (%esp)
-; AVX512F_32_WIN-NEXT: flds (%esp)
-; AVX512F_32_WIN-NEXT: fisttpll (%esp)
+; AVX512F_32_WIN-NEXT: vxorps %xmm2, %xmm2, %xmm2
; AVX512F_32_WIN-NEXT: xorl %edx, %edx
; AVX512F_32_WIN-NEXT: vucomiss %xmm0, %xmm1
+; AVX512F_32_WIN-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1}
+; AVX512F_32_WIN-NEXT: vsubss %xmm1, %xmm0, %xmm0
+; AVX512F_32_WIN-NEXT: vmovss %xmm0, (%esp)
+; AVX512F_32_WIN-NEXT: flds (%esp)
+; AVX512F_32_WIN-NEXT: fisttpll (%esp)
; AVX512F_32_WIN-NEXT: setbe %dl
; AVX512F_32_WIN-NEXT: shll $31, %edx
; AVX512F_32_WIN-NEXT: xorl {{[0-9]+}}(%esp), %edx
; AVX512F_32_LIN-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX512F_32_LIN-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX512F_32_LIN-NEXT: vcmpltss %xmm1, %xmm0, %k1
-; AVX512F_32_LIN-NEXT: vsubss %xmm1, %xmm0, %xmm2
-; AVX512F_32_LIN-NEXT: vmovss %xmm0, %xmm2, %xmm2 {%k1}
-; AVX512F_32_LIN-NEXT: vmovss %xmm2, (%esp)
-; AVX512F_32_LIN-NEXT: flds (%esp)
-; AVX512F_32_LIN-NEXT: fisttpll (%esp)
+; AVX512F_32_LIN-NEXT: vxorps %xmm2, %xmm2, %xmm2
; AVX512F_32_LIN-NEXT: xorl %edx, %edx
; AVX512F_32_LIN-NEXT: vucomiss %xmm0, %xmm1
+; AVX512F_32_LIN-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1}
+; AVX512F_32_LIN-NEXT: vsubss %xmm1, %xmm0, %xmm0
+; AVX512F_32_LIN-NEXT: vmovss %xmm0, (%esp)
+; AVX512F_32_LIN-NEXT: flds (%esp)
+; AVX512F_32_LIN-NEXT: fisttpll (%esp)
; AVX512F_32_LIN-NEXT: setbe %dl
; AVX512F_32_LIN-NEXT: shll $31, %edx
; AVX512F_32_LIN-NEXT: xorl {{[0-9]+}}(%esp), %edx
; SSE3_32_WIN-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE3_32_WIN-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE3_32_WIN-NEXT: movaps %xmm0, %xmm2
-; SSE3_32_WIN-NEXT: cmpltss %xmm1, %xmm2
-; SSE3_32_WIN-NEXT: movaps %xmm2, %xmm3
-; SSE3_32_WIN-NEXT: andps %xmm0, %xmm2
; SSE3_32_WIN-NEXT: xorl %edx, %edx
; SSE3_32_WIN-NEXT: ucomiss %xmm0, %xmm1
-; SSE3_32_WIN-NEXT: subss %xmm1, %xmm0
-; SSE3_32_WIN-NEXT: andnps %xmm0, %xmm3
-; SSE3_32_WIN-NEXT: orps %xmm3, %xmm2
+; SSE3_32_WIN-NEXT: cmpltss %xmm1, %xmm0
+; SSE3_32_WIN-NEXT: andnps %xmm1, %xmm0
+; SSE3_32_WIN-NEXT: subss %xmm0, %xmm2
; SSE3_32_WIN-NEXT: movss %xmm2, (%esp)
; SSE3_32_WIN-NEXT: flds (%esp)
; SSE3_32_WIN-NEXT: fisttpll (%esp)
; SSE3_32_LIN-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE3_32_LIN-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE3_32_LIN-NEXT: movaps %xmm0, %xmm2
-; SSE3_32_LIN-NEXT: cmpltss %xmm1, %xmm2
-; SSE3_32_LIN-NEXT: movaps %xmm2, %xmm3
-; SSE3_32_LIN-NEXT: andps %xmm0, %xmm2
; SSE3_32_LIN-NEXT: xorl %edx, %edx
; SSE3_32_LIN-NEXT: ucomiss %xmm0, %xmm1
-; SSE3_32_LIN-NEXT: subss %xmm1, %xmm0
-; SSE3_32_LIN-NEXT: andnps %xmm0, %xmm3
-; SSE3_32_LIN-NEXT: orps %xmm3, %xmm2
+; SSE3_32_LIN-NEXT: cmpltss %xmm1, %xmm0
+; SSE3_32_LIN-NEXT: andnps %xmm1, %xmm0
+; SSE3_32_LIN-NEXT: subss %xmm0, %xmm2
; SSE3_32_LIN-NEXT: movss %xmm2, (%esp)
; SSE3_32_LIN-NEXT: flds (%esp)
; SSE3_32_LIN-NEXT: fisttpll (%esp)
; SSE2_32_WIN-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2_32_WIN-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2_32_WIN-NEXT: movaps %xmm0, %xmm2
-; SSE2_32_WIN-NEXT: subss %xmm1, %xmm2
+; SSE2_32_WIN-NEXT: cmpltss %xmm1, %xmm2
+; SSE2_32_WIN-NEXT: andnps %xmm1, %xmm2
; SSE2_32_WIN-NEXT: movaps %xmm0, %xmm3
-; SSE2_32_WIN-NEXT: cmpltss %xmm1, %xmm3
-; SSE2_32_WIN-NEXT: movaps %xmm3, %xmm4
-; SSE2_32_WIN-NEXT: andnps %xmm2, %xmm4
-; SSE2_32_WIN-NEXT: andps %xmm0, %xmm3
-; SSE2_32_WIN-NEXT: orps %xmm4, %xmm3
+; SSE2_32_WIN-NEXT: subss %xmm2, %xmm3
; SSE2_32_WIN-NEXT: movss %xmm3, {{[0-9]+}}(%esp)
; SSE2_32_WIN-NEXT: flds {{[0-9]+}}(%esp)
; SSE2_32_WIN-NEXT: fnstcw {{[0-9]+}}(%esp)
; SSE2_32_LIN-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2_32_LIN-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2_32_LIN-NEXT: movaps %xmm0, %xmm2
-; SSE2_32_LIN-NEXT: subss %xmm1, %xmm2
+; SSE2_32_LIN-NEXT: cmpltss %xmm1, %xmm2
+; SSE2_32_LIN-NEXT: andnps %xmm1, %xmm2
; SSE2_32_LIN-NEXT: movaps %xmm0, %xmm3
-; SSE2_32_LIN-NEXT: cmpltss %xmm1, %xmm3
-; SSE2_32_LIN-NEXT: movaps %xmm3, %xmm4
-; SSE2_32_LIN-NEXT: andnps %xmm2, %xmm4
-; SSE2_32_LIN-NEXT: andps %xmm0, %xmm3
-; SSE2_32_LIN-NEXT: orps %xmm4, %xmm3
+; SSE2_32_LIN-NEXT: subss %xmm2, %xmm3
; SSE2_32_LIN-NEXT: movss %xmm3, {{[0-9]+}}(%esp)
; SSE2_32_LIN-NEXT: flds {{[0-9]+}}(%esp)
; SSE2_32_LIN-NEXT: fnstcw {{[0-9]+}}(%esp)
; X87_WIN-NEXT: subl $16, %esp
; X87_WIN-NEXT: flds 8(%ebp)
; X87_WIN-NEXT: flds __real@5f000000
-; X87_WIN-NEXT: fld %st(1)
-; X87_WIN-NEXT: fsub %st(1), %st
-; X87_WIN-NEXT: fxch %st(1)
-; X87_WIN-NEXT: fucomp %st(2)
+; X87_WIN-NEXT: fucom %st(1)
; X87_WIN-NEXT: fnstsw %ax
+; X87_WIN-NEXT: xorl %edx, %edx
; X87_WIN-NEXT: # kill: def $ah killed $ah killed $ax
; X87_WIN-NEXT: sahf
+; X87_WIN-NEXT: setbe %al
+; X87_WIN-NEXT: fldz
; X87_WIN-NEXT: ja LBB0_2
; X87_WIN-NEXT: # %bb.1:
-; X87_WIN-NEXT: fstp %st(1)
+; X87_WIN-NEXT: fstp %st(0)
; X87_WIN-NEXT: fldz
+; X87_WIN-NEXT: fxch %st(1)
; X87_WIN-NEXT: LBB0_2:
-; X87_WIN-NEXT: fstp %st(0)
-; X87_WIN-NEXT: setbe %al
+; X87_WIN-NEXT: fstp %st(1)
+; X87_WIN-NEXT: fsubrp %st, %st(1)
; X87_WIN-NEXT: fnstcw {{[0-9]+}}(%esp)
; X87_WIN-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
; X87_WIN-NEXT: orl $3072, %ecx # imm = 0xC00
; X87_WIN-NEXT: fldcw {{[0-9]+}}(%esp)
; X87_WIN-NEXT: fistpll {{[0-9]+}}(%esp)
; X87_WIN-NEXT: fldcw {{[0-9]+}}(%esp)
-; X87_WIN-NEXT: movzbl %al, %edx
+; X87_WIN-NEXT: movb %al, %dl
; X87_WIN-NEXT: shll $31, %edx
; X87_WIN-NEXT: xorl {{[0-9]+}}(%esp), %edx
; X87_WIN-NEXT: movl {{[0-9]+}}(%esp), %eax
; X87_LIN-NEXT: subl $20, %esp
; X87_LIN-NEXT: flds {{[0-9]+}}(%esp)
; X87_LIN-NEXT: flds {{\.LCPI.*}}
-; X87_LIN-NEXT: fld %st(1)
-; X87_LIN-NEXT: fsub %st(1), %st
-; X87_LIN-NEXT: fxch %st(1)
-; X87_LIN-NEXT: fucomp %st(2)
+; X87_LIN-NEXT: fucom %st(1)
; X87_LIN-NEXT: fnstsw %ax
+; X87_LIN-NEXT: xorl %edx, %edx
; X87_LIN-NEXT: # kill: def $ah killed $ah killed $ax
; X87_LIN-NEXT: sahf
+; X87_LIN-NEXT: setbe %al
+; X87_LIN-NEXT: fldz
; X87_LIN-NEXT: ja .LBB0_2
; X87_LIN-NEXT: # %bb.1:
-; X87_LIN-NEXT: fstp %st(1)
+; X87_LIN-NEXT: fstp %st(0)
; X87_LIN-NEXT: fldz
+; X87_LIN-NEXT: fxch %st(1)
; X87_LIN-NEXT: .LBB0_2:
-; X87_LIN-NEXT: fstp %st(0)
-; X87_LIN-NEXT: setbe %al
+; X87_LIN-NEXT: fstp %st(1)
+; X87_LIN-NEXT: fsubrp %st, %st(1)
; X87_LIN-NEXT: fnstcw {{[0-9]+}}(%esp)
; X87_LIN-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
; X87_LIN-NEXT: orl $3072, %ecx # imm = 0xC00
; X87_LIN-NEXT: fldcw {{[0-9]+}}(%esp)
; X87_LIN-NEXT: fistpll {{[0-9]+}}(%esp)
; X87_LIN-NEXT: fldcw {{[0-9]+}}(%esp)
-; X87_LIN-NEXT: movzbl %al, %edx
+; X87_LIN-NEXT: movb %al, %dl
; X87_LIN-NEXT: shll $31, %edx
; X87_LIN-NEXT: xorl {{[0-9]+}}(%esp), %edx
; X87_LIN-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F_32_WIN-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX512F_32_WIN-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX512F_32_WIN-NEXT: vcmpltsd %xmm1, %xmm0, %k1
-; AVX512F_32_WIN-NEXT: vsubsd %xmm1, %xmm0, %xmm2
-; AVX512F_32_WIN-NEXT: vmovsd %xmm0, %xmm2, %xmm2 {%k1}
-; AVX512F_32_WIN-NEXT: vmovsd %xmm2, (%esp)
-; AVX512F_32_WIN-NEXT: fldl (%esp)
-; AVX512F_32_WIN-NEXT: fisttpll (%esp)
+; AVX512F_32_WIN-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; AVX512F_32_WIN-NEXT: xorl %edx, %edx
; AVX512F_32_WIN-NEXT: vucomisd %xmm0, %xmm1
+; AVX512F_32_WIN-NEXT: vmovsd %xmm2, %xmm1, %xmm1 {%k1}
+; AVX512F_32_WIN-NEXT: vsubsd %xmm1, %xmm0, %xmm0
+; AVX512F_32_WIN-NEXT: vmovsd %xmm0, (%esp)
+; AVX512F_32_WIN-NEXT: fldl (%esp)
+; AVX512F_32_WIN-NEXT: fisttpll (%esp)
; AVX512F_32_WIN-NEXT: setbe %dl
; AVX512F_32_WIN-NEXT: shll $31, %edx
; AVX512F_32_WIN-NEXT: xorl {{[0-9]+}}(%esp), %edx
; AVX512F_32_LIN-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX512F_32_LIN-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX512F_32_LIN-NEXT: vcmpltsd %xmm1, %xmm0, %k1
-; AVX512F_32_LIN-NEXT: vsubsd %xmm1, %xmm0, %xmm2
-; AVX512F_32_LIN-NEXT: vmovsd %xmm0, %xmm2, %xmm2 {%k1}
-; AVX512F_32_LIN-NEXT: vmovsd %xmm2, (%esp)
-; AVX512F_32_LIN-NEXT: fldl (%esp)
-; AVX512F_32_LIN-NEXT: fisttpll (%esp)
+; AVX512F_32_LIN-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; AVX512F_32_LIN-NEXT: xorl %edx, %edx
; AVX512F_32_LIN-NEXT: vucomisd %xmm0, %xmm1
+; AVX512F_32_LIN-NEXT: vmovsd %xmm2, %xmm1, %xmm1 {%k1}
+; AVX512F_32_LIN-NEXT: vsubsd %xmm1, %xmm0, %xmm0
+; AVX512F_32_LIN-NEXT: vmovsd %xmm0, (%esp)
+; AVX512F_32_LIN-NEXT: fldl (%esp)
+; AVX512F_32_LIN-NEXT: fisttpll (%esp)
; AVX512F_32_LIN-NEXT: setbe %dl
; AVX512F_32_LIN-NEXT: shll $31, %edx
; AVX512F_32_LIN-NEXT: xorl {{[0-9]+}}(%esp), %edx
; SSE3_32_WIN-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; SSE3_32_WIN-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; SSE3_32_WIN-NEXT: movapd %xmm0, %xmm2
-; SSE3_32_WIN-NEXT: cmpltsd %xmm1, %xmm2
-; SSE3_32_WIN-NEXT: movapd %xmm2, %xmm3
-; SSE3_32_WIN-NEXT: andpd %xmm0, %xmm2
; SSE3_32_WIN-NEXT: xorl %edx, %edx
; SSE3_32_WIN-NEXT: ucomisd %xmm0, %xmm1
-; SSE3_32_WIN-NEXT: subsd %xmm1, %xmm0
-; SSE3_32_WIN-NEXT: andnpd %xmm0, %xmm3
-; SSE3_32_WIN-NEXT: orpd %xmm3, %xmm2
-; SSE3_32_WIN-NEXT: movlpd %xmm2, (%esp)
+; SSE3_32_WIN-NEXT: cmpltsd %xmm1, %xmm0
+; SSE3_32_WIN-NEXT: andnpd %xmm1, %xmm0
+; SSE3_32_WIN-NEXT: subsd %xmm0, %xmm2
+; SSE3_32_WIN-NEXT: movsd %xmm2, (%esp)
; SSE3_32_WIN-NEXT: fldl (%esp)
; SSE3_32_WIN-NEXT: fisttpll (%esp)
; SSE3_32_WIN-NEXT: setbe %dl
; SSE3_32_LIN-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; SSE3_32_LIN-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; SSE3_32_LIN-NEXT: movapd %xmm0, %xmm2
-; SSE3_32_LIN-NEXT: cmpltsd %xmm1, %xmm2
-; SSE3_32_LIN-NEXT: movapd %xmm2, %xmm3
-; SSE3_32_LIN-NEXT: andpd %xmm0, %xmm2
; SSE3_32_LIN-NEXT: xorl %edx, %edx
; SSE3_32_LIN-NEXT: ucomisd %xmm0, %xmm1
-; SSE3_32_LIN-NEXT: subsd %xmm1, %xmm0
-; SSE3_32_LIN-NEXT: andnpd %xmm0, %xmm3
-; SSE3_32_LIN-NEXT: orpd %xmm3, %xmm2
-; SSE3_32_LIN-NEXT: movlpd %xmm2, (%esp)
+; SSE3_32_LIN-NEXT: cmpltsd %xmm1, %xmm0
+; SSE3_32_LIN-NEXT: andnpd %xmm1, %xmm0
+; SSE3_32_LIN-NEXT: subsd %xmm0, %xmm2
+; SSE3_32_LIN-NEXT: movsd %xmm2, (%esp)
; SSE3_32_LIN-NEXT: fldl (%esp)
; SSE3_32_LIN-NEXT: fisttpll (%esp)
; SSE3_32_LIN-NEXT: setbe %dl
; SSE2_32_WIN-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; SSE2_32_WIN-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; SSE2_32_WIN-NEXT: movapd %xmm0, %xmm2
-; SSE2_32_WIN-NEXT: subsd %xmm1, %xmm2
+; SSE2_32_WIN-NEXT: cmpltsd %xmm1, %xmm2
+; SSE2_32_WIN-NEXT: andnpd %xmm1, %xmm2
; SSE2_32_WIN-NEXT: movapd %xmm0, %xmm3
-; SSE2_32_WIN-NEXT: cmpltsd %xmm1, %xmm3
-; SSE2_32_WIN-NEXT: movapd %xmm3, %xmm4
-; SSE2_32_WIN-NEXT: andnpd %xmm2, %xmm4
-; SSE2_32_WIN-NEXT: andpd %xmm0, %xmm3
-; SSE2_32_WIN-NEXT: orpd %xmm4, %xmm3
-; SSE2_32_WIN-NEXT: movlpd %xmm3, {{[0-9]+}}(%esp)
+; SSE2_32_WIN-NEXT: subsd %xmm2, %xmm3
+; SSE2_32_WIN-NEXT: movsd %xmm3, {{[0-9]+}}(%esp)
; SSE2_32_WIN-NEXT: fldl {{[0-9]+}}(%esp)
; SSE2_32_WIN-NEXT: fnstcw {{[0-9]+}}(%esp)
; SSE2_32_WIN-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; SSE2_32_LIN-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; SSE2_32_LIN-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; SSE2_32_LIN-NEXT: movapd %xmm0, %xmm2
-; SSE2_32_LIN-NEXT: subsd %xmm1, %xmm2
+; SSE2_32_LIN-NEXT: cmpltsd %xmm1, %xmm2
+; SSE2_32_LIN-NEXT: andnpd %xmm1, %xmm2
; SSE2_32_LIN-NEXT: movapd %xmm0, %xmm3
-; SSE2_32_LIN-NEXT: cmpltsd %xmm1, %xmm3
-; SSE2_32_LIN-NEXT: movapd %xmm3, %xmm4
-; SSE2_32_LIN-NEXT: andnpd %xmm2, %xmm4
-; SSE2_32_LIN-NEXT: andpd %xmm0, %xmm3
-; SSE2_32_LIN-NEXT: orpd %xmm4, %xmm3
-; SSE2_32_LIN-NEXT: movlpd %xmm3, {{[0-9]+}}(%esp)
+; SSE2_32_LIN-NEXT: subsd %xmm2, %xmm3
+; SSE2_32_LIN-NEXT: movsd %xmm3, {{[0-9]+}}(%esp)
; SSE2_32_LIN-NEXT: fldl {{[0-9]+}}(%esp)
; SSE2_32_LIN-NEXT: fnstcw {{[0-9]+}}(%esp)
; SSE2_32_LIN-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X87_WIN-NEXT: subl $16, %esp
; X87_WIN-NEXT: fldl 8(%ebp)
; X87_WIN-NEXT: flds __real@5f000000
-; X87_WIN-NEXT: fld %st(1)
-; X87_WIN-NEXT: fsub %st(1), %st
-; X87_WIN-NEXT: fxch %st(1)
-; X87_WIN-NEXT: fucomp %st(2)
+; X87_WIN-NEXT: fucom %st(1)
; X87_WIN-NEXT: fnstsw %ax
+; X87_WIN-NEXT: xorl %edx, %edx
; X87_WIN-NEXT: # kill: def $ah killed $ah killed $ax
; X87_WIN-NEXT: sahf
+; X87_WIN-NEXT: setbe %al
+; X87_WIN-NEXT: fldz
; X87_WIN-NEXT: ja LBB2_2
; X87_WIN-NEXT: # %bb.1:
-; X87_WIN-NEXT: fstp %st(1)
+; X87_WIN-NEXT: fstp %st(0)
; X87_WIN-NEXT: fldz
+; X87_WIN-NEXT: fxch %st(1)
; X87_WIN-NEXT: LBB2_2:
-; X87_WIN-NEXT: fstp %st(0)
-; X87_WIN-NEXT: setbe %al
+; X87_WIN-NEXT: fstp %st(1)
+; X87_WIN-NEXT: fsubrp %st, %st(1)
; X87_WIN-NEXT: fnstcw {{[0-9]+}}(%esp)
; X87_WIN-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
; X87_WIN-NEXT: orl $3072, %ecx # imm = 0xC00
; X87_WIN-NEXT: fldcw {{[0-9]+}}(%esp)
; X87_WIN-NEXT: fistpll {{[0-9]+}}(%esp)
; X87_WIN-NEXT: fldcw {{[0-9]+}}(%esp)
-; X87_WIN-NEXT: movzbl %al, %edx
+; X87_WIN-NEXT: movb %al, %dl
; X87_WIN-NEXT: shll $31, %edx
; X87_WIN-NEXT: xorl {{[0-9]+}}(%esp), %edx
; X87_WIN-NEXT: movl {{[0-9]+}}(%esp), %eax
; X87_LIN-NEXT: subl $20, %esp
; X87_LIN-NEXT: fldl {{[0-9]+}}(%esp)
; X87_LIN-NEXT: flds {{\.LCPI.*}}
-; X87_LIN-NEXT: fld %st(1)
-; X87_LIN-NEXT: fsub %st(1), %st
-; X87_LIN-NEXT: fxch %st(1)
-; X87_LIN-NEXT: fucomp %st(2)
+; X87_LIN-NEXT: fucom %st(1)
; X87_LIN-NEXT: fnstsw %ax
+; X87_LIN-NEXT: xorl %edx, %edx
; X87_LIN-NEXT: # kill: def $ah killed $ah killed $ax
; X87_LIN-NEXT: sahf
+; X87_LIN-NEXT: setbe %al
+; X87_LIN-NEXT: fldz
; X87_LIN-NEXT: ja .LBB2_2
; X87_LIN-NEXT: # %bb.1:
-; X87_LIN-NEXT: fstp %st(1)
+; X87_LIN-NEXT: fstp %st(0)
; X87_LIN-NEXT: fldz
+; X87_LIN-NEXT: fxch %st(1)
; X87_LIN-NEXT: .LBB2_2:
-; X87_LIN-NEXT: fstp %st(0)
-; X87_LIN-NEXT: setbe %al
+; X87_LIN-NEXT: fstp %st(1)
+; X87_LIN-NEXT: fsubrp %st, %st(1)
; X87_LIN-NEXT: fnstcw {{[0-9]+}}(%esp)
; X87_LIN-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
; X87_LIN-NEXT: orl $3072, %ecx # imm = 0xC00
; X87_LIN-NEXT: fldcw {{[0-9]+}}(%esp)
; X87_LIN-NEXT: fistpll {{[0-9]+}}(%esp)
; X87_LIN-NEXT: fldcw {{[0-9]+}}(%esp)
-; X87_LIN-NEXT: movzbl %al, %edx
+; X87_LIN-NEXT: movb %al, %dl
; X87_LIN-NEXT: shll $31, %edx
; X87_LIN-NEXT: xorl {{[0-9]+}}(%esp), %edx
; X87_LIN-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512_32_WIN-NEXT: subl $8, %esp
; AVX512_32_WIN-NEXT: fldt 8(%ebp)
; AVX512_32_WIN-NEXT: flds __real@5f000000
-; AVX512_32_WIN-NEXT: fld %st(1)
-; AVX512_32_WIN-NEXT: fsub %st(1), %st
; AVX512_32_WIN-NEXT: xorl %edx, %edx
+; AVX512_32_WIN-NEXT: fucomi %st(1), %st
+; AVX512_32_WIN-NEXT: fldz
; AVX512_32_WIN-NEXT: fxch %st(1)
-; AVX512_32_WIN-NEXT: fucompi %st(2), %st
; AVX512_32_WIN-NEXT: fcmovnbe %st(1), %st
; AVX512_32_WIN-NEXT: fstp %st(1)
+; AVX512_32_WIN-NEXT: fsubrp %st, %st(1)
; AVX512_32_WIN-NEXT: fisttpll (%esp)
; AVX512_32_WIN-NEXT: setbe %dl
; AVX512_32_WIN-NEXT: shll $31, %edx
; AVX512_32_LIN-NEXT: subl $12, %esp
; AVX512_32_LIN-NEXT: fldt {{[0-9]+}}(%esp)
; AVX512_32_LIN-NEXT: flds {{\.LCPI.*}}
-; AVX512_32_LIN-NEXT: fld %st(1)
-; AVX512_32_LIN-NEXT: fsub %st(1), %st
; AVX512_32_LIN-NEXT: xorl %edx, %edx
+; AVX512_32_LIN-NEXT: fucomi %st(1), %st
+; AVX512_32_LIN-NEXT: fldz
; AVX512_32_LIN-NEXT: fxch %st(1)
-; AVX512_32_LIN-NEXT: fucompi %st(2), %st
; AVX512_32_LIN-NEXT: fcmovnbe %st(1), %st
; AVX512_32_LIN-NEXT: fstp %st(1)
+; AVX512_32_LIN-NEXT: fsubrp %st, %st(1)
; AVX512_32_LIN-NEXT: fisttpll (%esp)
; AVX512_32_LIN-NEXT: setbe %dl
; AVX512_32_LIN-NEXT: shll $31, %edx
; AVX512_64_WIN-NEXT: pushq %rax
; AVX512_64_WIN-NEXT: fldt (%rcx)
; AVX512_64_WIN-NEXT: flds __real@{{.*}}(%rip)
-; AVX512_64_WIN-NEXT: fld %st(1)
-; AVX512_64_WIN-NEXT: fsub %st(1), %st
; AVX512_64_WIN-NEXT: xorl %eax, %eax
+; AVX512_64_WIN-NEXT: fucomi %st(1), %st
+; AVX512_64_WIN-NEXT: fldz
; AVX512_64_WIN-NEXT: fxch %st(1)
-; AVX512_64_WIN-NEXT: fucompi %st(2), %st
; AVX512_64_WIN-NEXT: fcmovnbe %st(1), %st
; AVX512_64_WIN-NEXT: fstp %st(1)
+; AVX512_64_WIN-NEXT: fsubrp %st, %st(1)
; AVX512_64_WIN-NEXT: fisttpll (%rsp)
; AVX512_64_WIN-NEXT: setbe %al
; AVX512_64_WIN-NEXT: shlq $63, %rax
; AVX512_64_LIN: # %bb.0:
; AVX512_64_LIN-NEXT: fldt {{[0-9]+}}(%rsp)
; AVX512_64_LIN-NEXT: flds {{.*}}(%rip)
-; AVX512_64_LIN-NEXT: fld %st(1)
-; AVX512_64_LIN-NEXT: fsub %st(1), %st
; AVX512_64_LIN-NEXT: xorl %eax, %eax
+; AVX512_64_LIN-NEXT: fucomi %st(1), %st
+; AVX512_64_LIN-NEXT: fldz
; AVX512_64_LIN-NEXT: fxch %st(1)
-; AVX512_64_LIN-NEXT: fucompi %st(2), %st
; AVX512_64_LIN-NEXT: fcmovnbe %st(1), %st
; AVX512_64_LIN-NEXT: fstp %st(1)
+; AVX512_64_LIN-NEXT: fsubrp %st, %st(1)
; AVX512_64_LIN-NEXT: fisttpll -{{[0-9]+}}(%rsp)
; AVX512_64_LIN-NEXT: setbe %al
; AVX512_64_LIN-NEXT: shlq $63, %rax
; SSE3_32_WIN-NEXT: subl $8, %esp
; SSE3_32_WIN-NEXT: fldt 8(%ebp)
; SSE3_32_WIN-NEXT: flds __real@5f000000
-; SSE3_32_WIN-NEXT: fld %st(1)
-; SSE3_32_WIN-NEXT: fsub %st(1), %st
; SSE3_32_WIN-NEXT: xorl %edx, %edx
+; SSE3_32_WIN-NEXT: fucomi %st(1), %st
+; SSE3_32_WIN-NEXT: fldz
; SSE3_32_WIN-NEXT: fxch %st(1)
-; SSE3_32_WIN-NEXT: fucompi %st(2), %st
; SSE3_32_WIN-NEXT: fcmovnbe %st(1), %st
; SSE3_32_WIN-NEXT: fstp %st(1)
+; SSE3_32_WIN-NEXT: fsubrp %st, %st(1)
; SSE3_32_WIN-NEXT: fisttpll (%esp)
; SSE3_32_WIN-NEXT: setbe %dl
; SSE3_32_WIN-NEXT: shll $31, %edx
; SSE3_32_LIN-NEXT: subl $12, %esp
; SSE3_32_LIN-NEXT: fldt {{[0-9]+}}(%esp)
; SSE3_32_LIN-NEXT: flds {{\.LCPI.*}}
-; SSE3_32_LIN-NEXT: fld %st(1)
-; SSE3_32_LIN-NEXT: fsub %st(1), %st
; SSE3_32_LIN-NEXT: xorl %edx, %edx
+; SSE3_32_LIN-NEXT: fucomi %st(1), %st
+; SSE3_32_LIN-NEXT: fldz
; SSE3_32_LIN-NEXT: fxch %st(1)
-; SSE3_32_LIN-NEXT: fucompi %st(2), %st
; SSE3_32_LIN-NEXT: fcmovnbe %st(1), %st
; SSE3_32_LIN-NEXT: fstp %st(1)
+; SSE3_32_LIN-NEXT: fsubrp %st, %st(1)
; SSE3_32_LIN-NEXT: fisttpll (%esp)
; SSE3_32_LIN-NEXT: setbe %dl
; SSE3_32_LIN-NEXT: shll $31, %edx
; SSE3_64_WIN-NEXT: pushq %rax
; SSE3_64_WIN-NEXT: fldt (%rcx)
; SSE3_64_WIN-NEXT: flds __real@{{.*}}(%rip)
-; SSE3_64_WIN-NEXT: fld %st(1)
-; SSE3_64_WIN-NEXT: fsub %st(1), %st
; SSE3_64_WIN-NEXT: xorl %eax, %eax
+; SSE3_64_WIN-NEXT: fucomi %st(1), %st
+; SSE3_64_WIN-NEXT: fldz
; SSE3_64_WIN-NEXT: fxch %st(1)
-; SSE3_64_WIN-NEXT: fucompi %st(2), %st
; SSE3_64_WIN-NEXT: fcmovnbe %st(1), %st
; SSE3_64_WIN-NEXT: fstp %st(1)
+; SSE3_64_WIN-NEXT: fsubrp %st, %st(1)
; SSE3_64_WIN-NEXT: fisttpll (%rsp)
; SSE3_64_WIN-NEXT: setbe %al
; SSE3_64_WIN-NEXT: shlq $63, %rax
; SSE3_64_LIN: # %bb.0:
; SSE3_64_LIN-NEXT: fldt {{[0-9]+}}(%rsp)
; SSE3_64_LIN-NEXT: flds {{.*}}(%rip)
-; SSE3_64_LIN-NEXT: fld %st(1)
-; SSE3_64_LIN-NEXT: fsub %st(1), %st
; SSE3_64_LIN-NEXT: xorl %eax, %eax
+; SSE3_64_LIN-NEXT: fucomi %st(1), %st
+; SSE3_64_LIN-NEXT: fldz
; SSE3_64_LIN-NEXT: fxch %st(1)
-; SSE3_64_LIN-NEXT: fucompi %st(2), %st
; SSE3_64_LIN-NEXT: fcmovnbe %st(1), %st
; SSE3_64_LIN-NEXT: fstp %st(1)
+; SSE3_64_LIN-NEXT: fsubrp %st, %st(1)
; SSE3_64_LIN-NEXT: fisttpll -{{[0-9]+}}(%rsp)
; SSE3_64_LIN-NEXT: setbe %al
; SSE3_64_LIN-NEXT: shlq $63, %rax
; SSE2_32_WIN-NEXT: subl $16, %esp
; SSE2_32_WIN-NEXT: fldt 8(%ebp)
; SSE2_32_WIN-NEXT: flds __real@5f000000
-; SSE2_32_WIN-NEXT: fld %st(1)
-; SSE2_32_WIN-NEXT: fsub %st(1), %st
; SSE2_32_WIN-NEXT: xorl %edx, %edx
+; SSE2_32_WIN-NEXT: fucomi %st(1), %st
+; SSE2_32_WIN-NEXT: setbe %dl
+; SSE2_32_WIN-NEXT: fldz
; SSE2_32_WIN-NEXT: fxch %st(1)
-; SSE2_32_WIN-NEXT: fucompi %st(2), %st
; SSE2_32_WIN-NEXT: fcmovnbe %st(1), %st
; SSE2_32_WIN-NEXT: fstp %st(1)
-; SSE2_32_WIN-NEXT: setbe %dl
+; SSE2_32_WIN-NEXT: fsubrp %st, %st(1)
; SSE2_32_WIN-NEXT: fnstcw {{[0-9]+}}(%esp)
; SSE2_32_WIN-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; SSE2_32_WIN-NEXT: orl $3072, %eax # imm = 0xC00
; SSE2_32_LIN-NEXT: subl $20, %esp
; SSE2_32_LIN-NEXT: fldt {{[0-9]+}}(%esp)
; SSE2_32_LIN-NEXT: flds {{\.LCPI.*}}
-; SSE2_32_LIN-NEXT: fld %st(1)
-; SSE2_32_LIN-NEXT: fsub %st(1), %st
; SSE2_32_LIN-NEXT: xorl %edx, %edx
+; SSE2_32_LIN-NEXT: fucomi %st(1), %st
+; SSE2_32_LIN-NEXT: setbe %dl
+; SSE2_32_LIN-NEXT: fldz
; SSE2_32_LIN-NEXT: fxch %st(1)
-; SSE2_32_LIN-NEXT: fucompi %st(2), %st
; SSE2_32_LIN-NEXT: fcmovnbe %st(1), %st
; SSE2_32_LIN-NEXT: fstp %st(1)
-; SSE2_32_LIN-NEXT: setbe %dl
+; SSE2_32_LIN-NEXT: fsubrp %st, %st(1)
; SSE2_32_LIN-NEXT: fnstcw {{[0-9]+}}(%esp)
; SSE2_32_LIN-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; SSE2_32_LIN-NEXT: orl $3072, %eax # imm = 0xC00
; SSE2_64_WIN-NEXT: subq $16, %rsp
; SSE2_64_WIN-NEXT: fldt (%rcx)
; SSE2_64_WIN-NEXT: flds __real@{{.*}}(%rip)
-; SSE2_64_WIN-NEXT: fld %st(1)
-; SSE2_64_WIN-NEXT: fsub %st(1), %st
; SSE2_64_WIN-NEXT: xorl %eax, %eax
+; SSE2_64_WIN-NEXT: fucomi %st(1), %st
+; SSE2_64_WIN-NEXT: setbe %al
+; SSE2_64_WIN-NEXT: fldz
; SSE2_64_WIN-NEXT: fxch %st(1)
-; SSE2_64_WIN-NEXT: fucompi %st(2), %st
; SSE2_64_WIN-NEXT: fcmovnbe %st(1), %st
; SSE2_64_WIN-NEXT: fstp %st(1)
-; SSE2_64_WIN-NEXT: setbe %al
+; SSE2_64_WIN-NEXT: fsubrp %st, %st(1)
; SSE2_64_WIN-NEXT: fnstcw {{[0-9]+}}(%rsp)
; SSE2_64_WIN-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx
; SSE2_64_WIN-NEXT: orl $3072, %ecx # imm = 0xC00
; SSE2_64_LIN: # %bb.0:
; SSE2_64_LIN-NEXT: fldt {{[0-9]+}}(%rsp)
; SSE2_64_LIN-NEXT: flds {{.*}}(%rip)
-; SSE2_64_LIN-NEXT: fld %st(1)
-; SSE2_64_LIN-NEXT: fsub %st(1), %st
; SSE2_64_LIN-NEXT: xorl %eax, %eax
+; SSE2_64_LIN-NEXT: fucomi %st(1), %st
+; SSE2_64_LIN-NEXT: setbe %al
+; SSE2_64_LIN-NEXT: fldz
; SSE2_64_LIN-NEXT: fxch %st(1)
-; SSE2_64_LIN-NEXT: fucompi %st(2), %st
; SSE2_64_LIN-NEXT: fcmovnbe %st(1), %st
; SSE2_64_LIN-NEXT: fstp %st(1)
-; SSE2_64_LIN-NEXT: setbe %al
+; SSE2_64_LIN-NEXT: fsubrp %st, %st(1)
; SSE2_64_LIN-NEXT: fnstcw -{{[0-9]+}}(%rsp)
; SSE2_64_LIN-NEXT: movzwl -{{[0-9]+}}(%rsp), %ecx
; SSE2_64_LIN-NEXT: orl $3072, %ecx # imm = 0xC00
; X87_WIN-NEXT: subl $16, %esp
; X87_WIN-NEXT: fldt 8(%ebp)
; X87_WIN-NEXT: flds __real@5f000000
-; X87_WIN-NEXT: fld %st(1)
-; X87_WIN-NEXT: fsub %st(1), %st
-; X87_WIN-NEXT: fxch %st(1)
-; X87_WIN-NEXT: fucomp %st(2)
+; X87_WIN-NEXT: fucom %st(1)
; X87_WIN-NEXT: fnstsw %ax
+; X87_WIN-NEXT: xorl %edx, %edx
; X87_WIN-NEXT: # kill: def $ah killed $ah killed $ax
; X87_WIN-NEXT: sahf
+; X87_WIN-NEXT: setbe %al
+; X87_WIN-NEXT: fldz
; X87_WIN-NEXT: ja LBB4_2
; X87_WIN-NEXT: # %bb.1:
-; X87_WIN-NEXT: fstp %st(1)
+; X87_WIN-NEXT: fstp %st(0)
; X87_WIN-NEXT: fldz
+; X87_WIN-NEXT: fxch %st(1)
; X87_WIN-NEXT: LBB4_2:
-; X87_WIN-NEXT: fstp %st(0)
-; X87_WIN-NEXT: setbe %al
+; X87_WIN-NEXT: fstp %st(1)
+; X87_WIN-NEXT: fsubrp %st, %st(1)
; X87_WIN-NEXT: fnstcw {{[0-9]+}}(%esp)
; X87_WIN-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
; X87_WIN-NEXT: orl $3072, %ecx # imm = 0xC00
; X87_WIN-NEXT: fldcw {{[0-9]+}}(%esp)
; X87_WIN-NEXT: fistpll {{[0-9]+}}(%esp)
; X87_WIN-NEXT: fldcw {{[0-9]+}}(%esp)
-; X87_WIN-NEXT: movzbl %al, %edx
+; X87_WIN-NEXT: movb %al, %dl
; X87_WIN-NEXT: shll $31, %edx
; X87_WIN-NEXT: xorl {{[0-9]+}}(%esp), %edx
; X87_WIN-NEXT: movl {{[0-9]+}}(%esp), %eax
; X87_LIN-NEXT: subl $20, %esp
; X87_LIN-NEXT: fldt {{[0-9]+}}(%esp)
; X87_LIN-NEXT: flds {{\.LCPI.*}}
-; X87_LIN-NEXT: fld %st(1)
-; X87_LIN-NEXT: fsub %st(1), %st
-; X87_LIN-NEXT: fxch %st(1)
-; X87_LIN-NEXT: fucomp %st(2)
+; X87_LIN-NEXT: fucom %st(1)
; X87_LIN-NEXT: fnstsw %ax
+; X87_LIN-NEXT: xorl %edx, %edx
; X87_LIN-NEXT: # kill: def $ah killed $ah killed $ax
; X87_LIN-NEXT: sahf
+; X87_LIN-NEXT: setbe %al
+; X87_LIN-NEXT: fldz
; X87_LIN-NEXT: ja .LBB4_2
; X87_LIN-NEXT: # %bb.1:
-; X87_LIN-NEXT: fstp %st(1)
+; X87_LIN-NEXT: fstp %st(0)
; X87_LIN-NEXT: fldz
+; X87_LIN-NEXT: fxch %st(1)
; X87_LIN-NEXT: .LBB4_2:
-; X87_LIN-NEXT: fstp %st(0)
-; X87_LIN-NEXT: setbe %al
+; X87_LIN-NEXT: fstp %st(1)
+; X87_LIN-NEXT: fsubrp %st, %st(1)
; X87_LIN-NEXT: fnstcw {{[0-9]+}}(%esp)
; X87_LIN-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
; X87_LIN-NEXT: orl $3072, %ecx # imm = 0xC00
; X87_LIN-NEXT: fldcw {{[0-9]+}}(%esp)
; X87_LIN-NEXT: fistpll {{[0-9]+}}(%esp)
; X87_LIN-NEXT: fldcw {{[0-9]+}}(%esp)
-; X87_LIN-NEXT: movzbl %al, %edx
+; X87_LIN-NEXT: movb %al, %dl
; X87_LIN-NEXT: shll $31, %edx
; X87_LIN-NEXT: xorl {{[0-9]+}}(%esp), %edx
; X87_LIN-NEXT: movl {{[0-9]+}}(%esp), %eax
define <1 x i64> @constrained_vector_fptoui_v1i64_v1f32() #0 {
; CHECK-LABEL: constrained_vector_fptoui_v1i64_v1f32:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: cvttss2si {{.*}}(%rip), %rax
+; CHECK-NEXT: xorps %xmm0, %xmm0
+; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: subss %xmm0, %xmm1
+; CHECK-NEXT: cvttss2si %xmm1, %rax
; CHECK-NEXT: retq
;
; AVX1-LABEL: constrained_vector_fptoui_v1i64_v1f32:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vcvttss2si {{.*}}(%rip), %rax
+; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX1-NEXT: vsubss %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vcvttss2si %xmm0, %rax
; AVX1-NEXT: retq
;
; AVX512-LABEL: constrained_vector_fptoui_v1i64_v1f32:
define <2 x i64> @constrained_vector_fptoui_v2i64_v2f32() #0 {
; CHECK-LABEL: constrained_vector_fptoui_v2i64_v2f32:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: cvttss2si {{.*}}(%rip), %rax
+; CHECK-NEXT: xorps %xmm0, %xmm0
+; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: subss %xmm0, %xmm1
+; CHECK-NEXT: cvttss2si %xmm1, %rax
; CHECK-NEXT: movq %rax, %xmm1
-; CHECK-NEXT: cvttss2si {{.*}}(%rip), %rax
+; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; CHECK-NEXT: subss %xmm0, %xmm2
+; CHECK-NEXT: cvttss2si %xmm2, %rax
; CHECK-NEXT: movq %rax, %xmm0
; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-NEXT: retq
;
; AVX1-LABEL: constrained_vector_fptoui_v2i64_v2f32:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vcvttss2si {{.*}}(%rip), %rax
-; AVX1-NEXT: vmovq %rax, %xmm0
-; AVX1-NEXT: vcvttss2si {{.*}}(%rip), %rax
+; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX1-NEXT: vsubss %xmm0, %xmm1, %xmm1
+; AVX1-NEXT: vcvttss2si %xmm1, %rax
; AVX1-NEXT: vmovq %rax, %xmm1
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; AVX1-NEXT: vsubss %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vcvttss2si %xmm0, %rax
+; AVX1-NEXT: vmovq %rax, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: retq
;
; AVX512-LABEL: constrained_vector_fptoui_v2i64_v2f32:
define <3 x i64> @constrained_vector_fptoui_v3i64_v3f32() #0 {
; CHECK-LABEL: constrained_vector_fptoui_v3i64_v3f32:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: cvttss2si {{.*}}(%rip), %rax
-; CHECK-NEXT: cvttss2si {{.*}}(%rip), %rdx
-; CHECK-NEXT: cvttss2si {{.*}}(%rip), %rcx
+; CHECK-NEXT: xorps %xmm0, %xmm0
+; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: subss %xmm0, %xmm1
+; CHECK-NEXT: cvttss2si %xmm1, %rax
+; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: subss %xmm0, %xmm1
+; CHECK-NEXT: cvttss2si %xmm1, %rdx
+; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: subss %xmm0, %xmm1
+; CHECK-NEXT: cvttss2si %xmm1, %rcx
; CHECK-NEXT: retq
;
; AVX1-LABEL: constrained_vector_fptoui_v3i64_v3f32:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vcvttss2si {{.*}}(%rip), %rax
-; AVX1-NEXT: vmovq %rax, %xmm0
-; AVX1-NEXT: vcvttss2si {{.*}}(%rip), %rax
+; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX1-NEXT: vsubss %xmm0, %xmm1, %xmm1
+; AVX1-NEXT: vcvttss2si %xmm1, %rax
; AVX1-NEXT: vmovq %rax, %xmm1
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX1-NEXT: vcvttss2si {{.*}}(%rip), %rax
-; AVX1-NEXT: vmovq %rax, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; AVX1-NEXT: vsubss %xmm0, %xmm2, %xmm2
+; AVX1-NEXT: vcvttss2si %xmm2, %rax
+; AVX1-NEXT: vmovq %rax, %xmm2
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX1-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; AVX1-NEXT: vsubss %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vcvttss2si %xmm0, %rax
+; AVX1-NEXT: vmovq %rax, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: constrained_vector_fptoui_v3i64_v3f32:
define <4 x i64> @constrained_vector_fptoui_v4i64_v4f32() #0 {
; CHECK-LABEL: constrained_vector_fptoui_v4i64_v4f32:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: cvttss2si {{.*}}(%rip), %rax
-; CHECK-NEXT: movq %rax, %xmm1
-; CHECK-NEXT: cvttss2si {{.*}}(%rip), %rax
+; CHECK-NEXT: xorps %xmm1, %xmm1
+; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: subss %xmm1, %xmm0
+; CHECK-NEXT: cvttss2si %xmm0, %rax
+; CHECK-NEXT: movq %rax, %xmm2
+; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: subss %xmm1, %xmm0
+; CHECK-NEXT: cvttss2si %xmm0, %rax
; CHECK-NEXT: movq %rax, %xmm0
-; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; CHECK-NEXT: cvttss2si {{.*}}(%rip), %rax
+; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; CHECK-NEXT: subss %xmm1, %xmm2
+; CHECK-NEXT: cvttss2si %xmm2, %rax
; CHECK-NEXT: movq %rax, %xmm2
-; CHECK-NEXT: cvttss2si {{.*}}(%rip), %rax
+; CHECK-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; CHECK-NEXT: subss %xmm1, %xmm3
+; CHECK-NEXT: cvttss2si %xmm3, %rax
; CHECK-NEXT: movq %rax, %xmm1
; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; CHECK-NEXT: retq
;
; AVX1-LABEL: constrained_vector_fptoui_v4i64_v4f32:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vcvttss2si {{.*}}(%rip), %rax
-; AVX1-NEXT: vmovq %rax, %xmm0
-; AVX1-NEXT: vcvttss2si {{.*}}(%rip), %rax
-; AVX1-NEXT: vmovq %rax, %xmm1
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX1-NEXT: vcvttss2si {{.*}}(%rip), %rax
+; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX1-NEXT: vsubss %xmm0, %xmm1, %xmm1
+; AVX1-NEXT: vcvttss2si %xmm1, %rax
; AVX1-NEXT: vmovq %rax, %xmm1
-; AVX1-NEXT: vcvttss2si {{.*}}(%rip), %rax
+; AVX1-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; AVX1-NEXT: vsubss %xmm0, %xmm2, %xmm2
+; AVX1-NEXT: vcvttss2si %xmm2, %rax
; AVX1-NEXT: vmovq %rax, %xmm2
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; AVX1-NEXT: vsubss %xmm0, %xmm2, %xmm2
+; AVX1-NEXT: vcvttss2si %xmm2, %rax
+; AVX1-NEXT: vmovq %rax, %xmm2
+; AVX1-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; AVX1-NEXT: vsubss %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vcvttss2si %xmm0, %rax
+; AVX1-NEXT: vmovq %rax, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: constrained_vector_fptoui_v4i64_v4f32:
define <1 x i64> @constrained_vector_fptoui_v1i64_v1f64() #0 {
; CHECK-LABEL: constrained_vector_fptoui_v1i64_v1f64:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %rax
+; CHECK-NEXT: xorpd %xmm0, %xmm0
+; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT: subsd %xmm0, %xmm1
+; CHECK-NEXT: cvttsd2si %xmm1, %rax
; CHECK-NEXT: retq
;
; AVX1-LABEL: constrained_vector_fptoui_v1i64_v1f64:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vcvttsd2si {{.*}}(%rip), %rax
+; AVX1-NEXT: vxorpd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX1-NEXT: vsubsd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vcvttsd2si %xmm0, %rax
; AVX1-NEXT: retq
;
; AVX512-LABEL: constrained_vector_fptoui_v1i64_v1f64:
define <2 x i64> @constrained_vector_fptoui_v2i64_v2f64() #0 {
; CHECK-LABEL: constrained_vector_fptoui_v2i64_v2f64:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %rax
+; CHECK-NEXT: xorpd %xmm0, %xmm0
+; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT: subsd %xmm0, %xmm1
+; CHECK-NEXT: cvttsd2si %xmm1, %rax
; CHECK-NEXT: movq %rax, %xmm1
-; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %rax
+; CHECK-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
+; CHECK-NEXT: subsd %xmm0, %xmm2
+; CHECK-NEXT: cvttsd2si %xmm2, %rax
; CHECK-NEXT: movq %rax, %xmm0
; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-NEXT: retq
;
; AVX1-LABEL: constrained_vector_fptoui_v2i64_v2f64:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vcvttsd2si {{.*}}(%rip), %rax
-; AVX1-NEXT: vmovq %rax, %xmm0
-; AVX1-NEXT: vcvttsd2si {{.*}}(%rip), %rax
+; AVX1-NEXT: vxorpd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX1-NEXT: vsubsd %xmm0, %xmm1, %xmm1
+; AVX1-NEXT: vcvttsd2si %xmm1, %rax
; AVX1-NEXT: vmovq %rax, %xmm1
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
+; AVX1-NEXT: vsubsd %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vcvttsd2si %xmm0, %rax
+; AVX1-NEXT: vmovq %rax, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: retq
;
; AVX512-LABEL: constrained_vector_fptoui_v2i64_v2f64:
define <3 x i64> @constrained_vector_fptoui_v3i64_v3f64() #0 {
; CHECK-LABEL: constrained_vector_fptoui_v3i64_v3f64:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %rax
-; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %rdx
-; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %rcx
+; CHECK-NEXT: xorpd %xmm0, %xmm0
+; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT: subsd %xmm0, %xmm1
+; CHECK-NEXT: cvttsd2si %xmm1, %rax
+; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT: subsd %xmm0, %xmm1
+; CHECK-NEXT: cvttsd2si %xmm1, %rdx
+; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT: subsd %xmm0, %xmm1
+; CHECK-NEXT: cvttsd2si %xmm1, %rcx
; CHECK-NEXT: retq
;
; AVX1-LABEL: constrained_vector_fptoui_v3i64_v3f64:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vcvttsd2si {{.*}}(%rip), %rax
-; AVX1-NEXT: vmovq %rax, %xmm0
-; AVX1-NEXT: vcvttsd2si {{.*}}(%rip), %rax
-; AVX1-NEXT: vmovq %rax, %xmm1
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX1-NEXT: vcvttsd2si {{.*}}(%rip), %rax
+; AVX1-NEXT: vxorpd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX1-NEXT: vsubsd %xmm0, %xmm1, %xmm1
+; AVX1-NEXT: vcvttsd2si %xmm1, %rax
; AVX1-NEXT: vmovq %rax, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
+; AVX1-NEXT: vsubsd %xmm0, %xmm2, %xmm2
+; AVX1-NEXT: vcvttsd2si %xmm2, %rax
+; AVX1-NEXT: vmovq %rax, %xmm2
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX1-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
+; AVX1-NEXT: vsubsd %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vcvttsd2si %xmm0, %rax
+; AVX1-NEXT: vmovq %rax, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: constrained_vector_fptoui_v3i64_v3f64:
define <4 x i64> @constrained_vector_fptoui_v4i64_v4f64() #0 {
; CHECK-LABEL: constrained_vector_fptoui_v4i64_v4f64:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %rax
-; CHECK-NEXT: movq %rax, %xmm1
-; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %rax
+; CHECK-NEXT: xorpd %xmm1, %xmm1
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: subsd %xmm1, %xmm0
+; CHECK-NEXT: cvttsd2si %xmm0, %rax
+; CHECK-NEXT: movq %rax, %xmm2
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: subsd %xmm1, %xmm0
+; CHECK-NEXT: cvttsd2si %xmm0, %rax
; CHECK-NEXT: movq %rax, %xmm0
-; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %rax
+; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; CHECK-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
+; CHECK-NEXT: subsd %xmm1, %xmm2
+; CHECK-NEXT: cvttsd2si %xmm2, %rax
; CHECK-NEXT: movq %rax, %xmm2
-; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %rax
+; CHECK-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero
+; CHECK-NEXT: subsd %xmm1, %xmm3
+; CHECK-NEXT: cvttsd2si %xmm3, %rax
; CHECK-NEXT: movq %rax, %xmm1
; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; CHECK-NEXT: retq
;
; AVX1-LABEL: constrained_vector_fptoui_v4i64_v4f64:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vcvttsd2si {{.*}}(%rip), %rax
-; AVX1-NEXT: vmovq %rax, %xmm0
-; AVX1-NEXT: vcvttsd2si {{.*}}(%rip), %rax
-; AVX1-NEXT: vmovq %rax, %xmm1
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX1-NEXT: vcvttsd2si {{.*}}(%rip), %rax
+; AVX1-NEXT: vxorpd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX1-NEXT: vsubsd %xmm0, %xmm1, %xmm1
+; AVX1-NEXT: vcvttsd2si %xmm1, %rax
; AVX1-NEXT: vmovq %rax, %xmm1
-; AVX1-NEXT: vcvttsd2si {{.*}}(%rip), %rax
+; AVX1-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
+; AVX1-NEXT: vsubsd %xmm0, %xmm2, %xmm2
+; AVX1-NEXT: vcvttsd2si %xmm2, %rax
; AVX1-NEXT: vmovq %rax, %xmm2
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
+; AVX1-NEXT: vsubsd %xmm0, %xmm2, %xmm2
+; AVX1-NEXT: vcvttsd2si %xmm2, %rax
+; AVX1-NEXT: vmovq %rax, %xmm2
+; AVX1-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
+; AVX1-NEXT: vsubsd %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vcvttsd2si %xmm0, %rax
+; AVX1-NEXT: vmovq %rax, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: constrained_vector_fptoui_v4i64_v4f64: