setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
// Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
- setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v16i8, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v16i16, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::v16i1, Custom);
return Sub;
}
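
For orientation (not part of the diff): marking a (node, type) pair Custom makes SelectionDAG legalization hand those nodes back to the target through X86TargetLowering::LowerOperation, which is how the new MVT::v2i32 entry reaches the lowering added below. A minimal sketch of that dispatch, assuming the usual shape of LowerOperation in this file:

    // Sketch only (heavily abridged): how a Custom-marked UINT_TO_FP node
    // reaches the lowering hook whose switch is patched further down.
    SDValue X86TargetLowering::LowerOperation(SDValue Op,
                                              SelectionDAG &DAG) const {
      switch (Op.getOpcode()) {
      default: llvm_unreachable("Should not custom lower this!");
      // ... many other opcodes elided ...
      case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
      }
    }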
+static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
+                                     const X86Subtarget &Subtarget,
+                                     const SDLoc &DL) {
+  if (Op.getSimpleValueType() != MVT::v2f64)
+    return SDValue();
+
+  SDValue N0 = Op.getOperand(0);
+  assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
+
+  // Legalize to the v4i32 type that the conversion nodes expect.
+  N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
+                   DAG.getUNDEF(MVT::v2i32));
+
+  if (Subtarget.hasAVX512())
+    return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
+
+  // Same implementation as VectorLegalizer::ExpandUINT_TO_FLOAT, but
+  // converting v2i32 to v2f64 with X86ISD::CVTSI2P instead of the default
+  // expansion.
+  SDValue HalfWord = DAG.getConstant(16, DL, MVT::v4i32);
+  SDValue HalfWordMask = DAG.getConstant(0x0000FFFF, DL, MVT::v4i32);
+
+  // Two to the power of half-word-size.
+  SDValue TWOHW = DAG.getConstantFP(1 << 16, DL, MVT::v2f64);
+
+  // Shift down to isolate the upper half-words (HI); mask to keep the
+  // lower half-words (LO).
+  SDValue HI = DAG.getNode(ISD::SRL, DL, MVT::v4i32, N0, HalfWord);
+  SDValue LO = DAG.getNode(ISD::AND, DL, MVT::v4i32, N0, HalfWordMask);
+
+  SDValue fHI = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, HI);
+  fHI = DAG.getNode(ISD::FMUL, DL, MVT::v2f64, fHI, TWOHW);
+  SDValue fLO = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, LO);
+
+  // Add the two halves.
+  return DAG.getNode(ISD::FADD, DL, MVT::v2f64, fHI, fLO);
+}
+
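
As a sanity check on the arithmetic (a standalone model, not code from the patch; uitofp_via_halves is a made-up name): each 32-bit lane is split into 16-bit halves, each at most 0xFFFF, so both are non-negative under a signed 32-bit interpretation and the signed conversion that X86ISD::CVTSI2P provides is exact; hi * 2^16 + lo then reconstructs the unsigned value:

    // Scalar model of the half-word expansion above. Illustrative only.
    #include <cassert>
    #include <cstdint>

    double uitofp_via_halves(uint32_t u) {
      int32_t hi = static_cast<int32_t>(u >> 16);     // ISD::SRL by HalfWord
      int32_t lo = static_cast<int32_t>(u & 0xFFFF);  // ISD::AND with HalfWordMask
      return static_cast<double>(hi) * 65536.0        // CVTSI2P, then FMUL by TWOHW
           + static_cast<double>(lo);                 // CVTSI2P, then FADD
    }

    int main() {
      // A plain signed conversion would turn 0x80000000 into -2^31;
      // the split form yields the correct +2^31.
      assert(uitofp_via_halves(0x80000000u) == 2147483648.0);
      assert(uitofp_via_halves(0xFFFFFFFFu) == 4294967295.0);
      return 0;
    }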
static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  // The algorithm is the following:
  switch (SrcVT.SimpleTy) {
  default:
    llvm_unreachable("Custom UINT_TO_FP is not supported!");
-  case MVT::v2i32: {
-    if (VT == MVT::v2f64)
-      return DAG.getNode(X86ISD::CVTUI2P, dl, VT,
-                         DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, N0,
-                                     DAG.getUNDEF(SrcVT)));
-    return SDValue();
-  }
+  case MVT::v2i32:
+    return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
  case MVT::v4i8:
  case MVT::v4i16:
  case MVT::v8i8:
    return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
                       DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0));
  case MVT::v4i32:
  case MVT::v8i32:
    return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
  }
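
The vec_int_to_fp.ll diffs below pin down the new instruction sequences. For the SSE and VEX cases the checked pattern is the half-word expansion: pand (or vpblendw against zero) keeps the low half-words, psrld $16 extracts the high ones, cvtdq2pd performs the signed conversions, and the mulpd constant 6.553600e+04 is TWOHW = 2^16. An intrinsics model of the SSE sequence (an illustration with a hypothetical function name, not code from the patch):

    // Converts the low two unsigned 32-bit lanes of v to double, mirroring
    // the SSE CHECK lines below.
    #include <emmintrin.h>

    __m128d uitofp_lo2_u32_to_f64(__m128i v) {
      __m128i lo  = _mm_and_si128(v, _mm_set1_epi32(0xFFFF)); // pand (HalfWordMask)
      __m128d flo = _mm_cvtepi32_pd(lo);                      // cvtdq2pd
      __m128i hi  = _mm_srli_epi32(v, 16);                    // psrld $16
      __m128d fhi = _mm_cvtepi32_pd(hi);                      // cvtdq2pd
      fhi = _mm_mul_pd(fhi, _mm_set1_pd(65536.0));            // mulpd (TWOHW)
      return _mm_add_pd(fhi, flo);                            // addpd
    }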
define <2 x double> @uitofp_2i32_to_2f64(<4 x i32> %a) {
; SSE-LABEL: uitofp_2i32_to_2f64:
; SSE: # BB#0:
-; SSE-NEXT: pxor %xmm1, %xmm1
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: movapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
-; SSE-NEXT: subpd %xmm3, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
-; SSE-NEXT: addpd %xmm4, %xmm0
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE-NEXT: subpd %xmm3, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
-; SSE-NEXT: addpd %xmm2, %xmm1
-; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0]
+; SSE-NEXT: pand %xmm0, %xmm1
+; SSE-NEXT: cvtdq2pd %xmm1, %xmm1
+; SSE-NEXT: psrld $16, %xmm0
+; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
+; SSE-NEXT: mulpd {{.*}}(%rip), %xmm0
+; SSE-NEXT: addpd %xmm1, %xmm0
; SSE-NEXT: retq
;
; VEX-LABEL: uitofp_2i32_to_2f64:
; VEX: # BB#0:
-; VEX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; VEX-NEXT: vmovdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
-; VEX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; VEX-NEXT: vmovapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
-; VEX-NEXT: vsubpd %xmm3, %xmm2, %xmm2
-; VEX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; VEX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; VEX-NEXT: vsubpd %xmm3, %xmm0, %xmm0
-; VEX-NEXT: vhaddpd %xmm0, %xmm2, %xmm0
+; VEX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; VEX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; VEX-NEXT: vcvtdq2pd %xmm1, %xmm1
+; VEX-NEXT: vpsrld $16, %xmm0, %xmm0
+; VEX-NEXT: vcvtdq2pd %xmm0, %xmm0
+; VEX-NEXT: vmulpd {{.*}}(%rip), %xmm0, %xmm0
+; VEX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; VEX-NEXT: retq
;
; AVX512F-LABEL: uitofp_2i32_to_2f64:
define <2 x double> @uitofp_4i32_to_2f64(<4 x i32> %a) {
; SSE-LABEL: uitofp_4i32_to_2f64:
; SSE: # BB#0:
-; SSE-NEXT: pxor %xmm1, %xmm1
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: movapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
-; SSE-NEXT: subpd %xmm3, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
-; SSE-NEXT: addpd %xmm4, %xmm0
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE-NEXT: subpd %xmm3, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
-; SSE-NEXT: addpd %xmm2, %xmm1
-; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0]
+; SSE-NEXT: pand %xmm0, %xmm1
+; SSE-NEXT: cvtdq2pd %xmm1, %xmm1
+; SSE-NEXT: psrld $16, %xmm0
+; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
+; SSE-NEXT: mulpd {{.*}}(%rip), %xmm0
+; SSE-NEXT: addpd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: uitofp_4i32_to_2f64:
define <4 x double> @uitofp_4i32_to_4f64(<4 x i32> %a) {
; SSE-LABEL: uitofp_4i32_to_4f64:
; SSE: # BB#0:
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: pxor %xmm1, %xmm1
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: movdqa {{.*#+}} xmm3 = [1127219200,1160773632,0,0]
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: psrld $16, %xmm1
+; SSE-NEXT: cvtdq2pd %xmm1, %xmm1
+; SSE-NEXT: movapd {{.*#+}} xmm2 = [6.553600e+04,6.553600e+04]
+; SSE-NEXT: mulpd %xmm2, %xmm1
+; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,0,65535,0,65535,0]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; SSE-NEXT: movapd {{.*#+}} xmm5 = [4.503600e+15,1.934281e+25]
-; SSE-NEXT: subpd %xmm5, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[2,3,0,1]
-; SSE-NEXT: addpd %xmm6, %xmm0
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; SSE-NEXT: subpd %xmm5, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[2,3,0,1]
-; SSE-NEXT: addpd %xmm4, %xmm6
-; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm6[0]
-; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; SSE-NEXT: subpd %xmm5, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
-; SSE-NEXT: addpd %xmm2, %xmm1
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; SSE-NEXT: subpd %xmm5, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,3,0,1]
-; SSE-NEXT: addpd %xmm4, %xmm2
-; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE-NEXT: pand %xmm3, %xmm0
+; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
+; SSE-NEXT: addpd %xmm1, %xmm0
+; SSE-NEXT: movdqa %xmm4, %xmm1
+; SSE-NEXT: psrld $16, %xmm1
+; SSE-NEXT: cvtdq2pd %xmm1, %xmm5
+; SSE-NEXT: mulpd %xmm2, %xmm5
+; SSE-NEXT: pand %xmm3, %xmm4
+; SSE-NEXT: cvtdq2pd %xmm4, %xmm1
+; SSE-NEXT: addpd %xmm5, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: uitofp_4i32_to_4f64:
define <2 x double> @uitofp_load_2i32_to_2f64(<2 x i32> *%a) {
; SSE-LABEL: uitofp_load_2i32_to_2f64:
; SSE: # BB#0:
-; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; SSE-NEXT: pxor %xmm0, %xmm0
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE-NEXT: movapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25]
-; SSE-NEXT: subpd %xmm4, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0]
+; SSE-NEXT: pand %xmm0, %xmm1
+; SSE-NEXT: cvtdq2pd %xmm1, %xmm1
+; SSE-NEXT: psrld $16, %xmm0
+; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
+; SSE-NEXT: mulpd {{.*}}(%rip), %xmm0
; SSE-NEXT: addpd %xmm1, %xmm0
-; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; SSE-NEXT: subpd %xmm4, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
-; SSE-NEXT: addpd %xmm3, %xmm1
-; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: retq
;
; VEX-LABEL: uitofp_load_2i32_to_2f64:
; VEX: # BB#0:
-; VEX-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
-; VEX-NEXT: vmovdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
-; VEX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; VEX-NEXT: vmovapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
-; VEX-NEXT: vsubpd %xmm3, %xmm2, %xmm2
-; VEX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; VEX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; VEX-NEXT: vsubpd %xmm3, %xmm0, %xmm0
-; VEX-NEXT: vhaddpd %xmm0, %xmm2, %xmm0
+; VEX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; VEX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; VEX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; VEX-NEXT: vcvtdq2pd %xmm1, %xmm1
+; VEX-NEXT: vpsrld $16, %xmm0, %xmm0
+; VEX-NEXT: vcvtdq2pd %xmm0, %xmm0
+; VEX-NEXT: vmulpd {{.*}}(%rip), %xmm0, %xmm0
+; VEX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; VEX-NEXT: retq
;
; AVX512F-LABEL: uitofp_load_2i32_to_2f64:
define <4 x double> @uitofp_load_4i32_to_4f64(<4 x i32> *%a) {
; SSE-LABEL: uitofp_load_4i32_to_4f64:
; SSE: # BB#0:
-; SSE-NEXT: movdqa (%rdi), %xmm2
-; SSE-NEXT: pxor %xmm1, %xmm1
-; SSE-NEXT: movdqa %xmm2, %xmm3
-; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; SSE-NEXT: movdqa {{.*#+}} xmm4 = [1127219200,1160773632,0,0]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,3,0,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; SSE-NEXT: movapd {{.*#+}} xmm6 = [4.503600e+15,1.934281e+25]
-; SSE-NEXT: subpd %xmm6, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
-; SSE-NEXT: addpd %xmm3, %xmm0
-; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; SSE-NEXT: subpd %xmm6, %xmm5
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,3,0,1]
-; SSE-NEXT: addpd %xmm5, %xmm3
-; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0]
-; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
-; SSE-NEXT: subpd %xmm6, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
-; SSE-NEXT: addpd %xmm2, %xmm1
-; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; SSE-NEXT: subpd %xmm6, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
-; SSE-NEXT: addpd %xmm3, %xmm2
-; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE-NEXT: movdqa (%rdi), %xmm0
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: psrld $16, %xmm1
+; SSE-NEXT: cvtdq2pd %xmm1, %xmm1
+; SSE-NEXT: movapd {{.*#+}} xmm2 = [6.553600e+04,6.553600e+04]
+; SSE-NEXT: mulpd %xmm2, %xmm1
+; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,0,65535,0,65535,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
+; SSE-NEXT: pand %xmm3, %xmm0
+; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
+; SSE-NEXT: addpd %xmm1, %xmm0
+; SSE-NEXT: movdqa %xmm4, %xmm1
+; SSE-NEXT: psrld $16, %xmm1
+; SSE-NEXT: cvtdq2pd %xmm1, %xmm5
+; SSE-NEXT: mulpd %xmm2, %xmm5
+; SSE-NEXT: pand %xmm3, %xmm4
+; SSE-NEXT: cvtdq2pd %xmm4, %xmm1
+; SSE-NEXT: addpd %xmm5, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: uitofp_load_4i32_to_4f64: