From f84083b4dbb1ddb6d2783400f11121f490cdb5a8 Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Fri, 8 Mar 2019 21:54:41 +0000 Subject: [PATCH] [x86] scalarize extract element 0 of FP cmp An extension of D58282 noted in PR39665: https://bugs.llvm.org/show_bug.cgi?id=39665 This doesn't answer the request to use movmsk, but that's an independent problem. We need this and probably still need scalarization of FP selects because we can't do that as a target-independent transform (although it seems likely that targets besides x86 should have this transform). llvm-svn: 355741 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 16 ++ llvm/test/CodeGen/X86/extractelement-fp.ll | 10 +- llvm/test/CodeGen/X86/vec_floor.ll | 400 ++++++++--------------------- 3 files changed, 132 insertions(+), 294 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index b744600..863b93e 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -34298,6 +34298,22 @@ static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG) { if (!Vec.hasOneUse() || !isNullConstant(Index) || VecVT.getScalarType() != VT) return SDValue(); + // Vector FP compares don't fit the pattern of FP math ops (propagate, not + // extract, the condition code), so deal with those as a special-case. + if (Vec.getOpcode() == ISD::SETCC) { + EVT OpVT = Vec.getOperand(0).getValueType().getScalarType(); + if (OpVT != MVT::f32 && OpVT != MVT::f64) + return SDValue(); + + // extract (setcc X, Y, CC), 0 --> setcc (extract X, 0), (extract Y, 0), CC + SDLoc DL(ExtElt); + SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT, + Vec.getOperand(0), Index); + SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT, + Vec.getOperand(1), Index); + return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1, Vec.getOperand(2)); + } + if (VT != MVT::f32 && VT != MVT::f64) return SDValue(); diff --git a/llvm/test/CodeGen/X86/extractelement-fp.ll b/llvm/test/CodeGen/X86/extractelement-fp.ll index c669a87..300506a 100644 --- a/llvm/test/CodeGen/X86/extractelement-fp.ll +++ b/llvm/test/CodeGen/X86/extractelement-fp.ll @@ -132,9 +132,8 @@ define double @frem_v4f64(<4 x double> %x, <4 x double> %y) nounwind { define i1 @fcmp_v4f32(<4 x float> %x, <4 x float> %y) nounwind { ; CHECK-LABEL: fcmp_v4f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vcmpltps %xmm0, %xmm1, %xmm0 -; CHECK-NEXT: vpextrb $0, %xmm0, %eax -; CHECK-NEXT: # kill: def $al killed $al killed $eax +; CHECK-NEXT: vucomiss %xmm1, %xmm0 +; CHECK-NEXT: seta %al ; CHECK-NEXT: retq %v = fcmp ogt <4 x float> %x, %y %r = extractelement <4 x i1> %v, i32 0 @@ -144,9 +143,8 @@ define i1 @fcmp_v4f32(<4 x float> %x, <4 x float> %y) nounwind { define i1 @fcmp_v4f64(<4 x double> %x, <4 x double> %y) nounwind { ; CHECK-LABEL: fcmp_v4f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vcmpnlepd %ymm1, %ymm0, %ymm0 -; CHECK-NEXT: vpextrb $0, %xmm0, %eax -; CHECK-NEXT: # kill: def $al killed $al killed $eax +; CHECK-NEXT: vucomisd %xmm0, %xmm1 +; CHECK-NEXT: setb %al ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %v = fcmp ugt <4 x double> %x, %y diff --git a/llvm/test/CodeGen/X86/vec_floor.ll b/llvm/test/CodeGen/X86/vec_floor.ll index ef499af..2dfc554 100644 --- a/llvm/test/CodeGen/X86/vec_floor.ll +++ b/llvm/test/CodeGen/X86/vec_floor.ll @@ -1665,47 +1665,28 @@ define <2 x double> @floor_maskz_sd_trunc(<2 x double> %x, <2 x double> %y, i16 define <4 x float> @floor_mask_ss_mask8(<4 x float> %x, <4 x float> %y, <4 x float> %w) nounwind { ; SSE41-LABEL: floor_mask_ss_mask8: ; SSE41: ## %bb.0: -; SSE41-NEXT: movaps %xmm0, %xmm3 -; SSE41-NEXT: cmpeqps %xmm1, %xmm3 -; SSE41-NEXT: pextrb $0, %xmm3, %eax -; SSE41-NEXT: testb $1, %al -; SSE41-NEXT: je LBB60_2 -; SSE41-NEXT: ## %bb.1: -; SSE41-NEXT: xorps %xmm2, %xmm2 -; SSE41-NEXT: roundss $9, %xmm0, %xmm2 -; SSE41-NEXT: LBB60_2: -; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] -; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: roundss $9, %xmm0, %xmm3 +; SSE41-NEXT: cmpeqss %xmm1, %xmm0 +; SSE41-NEXT: andps %xmm0, %xmm3 +; SSE41-NEXT: andnps %xmm2, %xmm0 +; SSE41-NEXT: orps %xmm3, %xmm0 +; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; SSE41-NEXT: retq ; ; AVX-LABEL: floor_mask_ss_mask8: ; AVX: ## %bb.0: -; AVX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm3 -; AVX-NEXT: vpextrb $0, %xmm3, %eax -; AVX-NEXT: testb $1, %al -; AVX-NEXT: je LBB60_2 -; AVX-NEXT: ## %bb.1: -; AVX-NEXT: vroundss $9, %xmm0, %xmm0, %xmm2 -; AVX-NEXT: LBB60_2: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3] +; AVX-NEXT: vroundss $9, %xmm0, %xmm0, %xmm3 +; AVX-NEXT: vcmpeqss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vblendvps %xmm0, %xmm3, %xmm2, %xmm0 +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX-NEXT: retq ; -; AVX512F-LABEL: floor_mask_ss_mask8: -; AVX512F: ## %bb.0: -; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 -; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vcmpeqps %zmm1, %zmm0, %k1 -; AVX512F-NEXT: vrndscaless $1, %xmm0, %xmm1, %xmm2 {%k1} -; AVX512F-NEXT: vmovaps %xmm2, %xmm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: floor_mask_ss_mask8: -; AVX512VL: ## %bb.0: -; AVX512VL-NEXT: vcmpeqps %xmm1, %xmm0, %k1 -; AVX512VL-NEXT: vrndscaless $1, %xmm0, %xmm1, %xmm2 {%k1} -; AVX512VL-NEXT: vmovaps %xmm2, %xmm0 -; AVX512VL-NEXT: retq +; AVX512-LABEL: floor_mask_ss_mask8: +; AVX512: ## %bb.0: +; AVX512-NEXT: vcmpeqss %xmm1, %xmm0, %k1 +; AVX512-NEXT: vrndscaless $1, %xmm0, %xmm1, %xmm2 {%k1} +; AVX512-NEXT: vmovaps %xmm2, %xmm0 +; AVX512-NEXT: retq %mask1 = fcmp oeq <4 x float> %x, %y %mask = extractelement <4 x i1> %mask1, i64 0 %s = extractelement <4 x float> %x, i64 0 @@ -1719,50 +1700,25 @@ define <4 x float> @floor_mask_ss_mask8(<4 x float> %x, <4 x float> %y, <4 x flo define <4 x float> @floor_maskz_ss_mask8(<4 x float> %x, <4 x float> %y) nounwind { ; SSE41-LABEL: floor_maskz_ss_mask8: ; SSE41: ## %bb.0: -; SSE41-NEXT: movaps %xmm0, %xmm2 -; SSE41-NEXT: cmpeqps %xmm1, %xmm2 -; SSE41-NEXT: pextrb $0, %xmm2, %eax -; SSE41-NEXT: testb $1, %al -; SSE41-NEXT: jne LBB61_1 -; SSE41-NEXT: ## %bb.2: -; SSE41-NEXT: xorps %xmm0, %xmm0 -; SSE41-NEXT: jmp LBB61_3 -; SSE41-NEXT: LBB61_1: -; SSE41-NEXT: roundss $9, %xmm0, %xmm0 -; SSE41-NEXT: LBB61_3: -; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: roundss $9, %xmm0, %xmm2 +; SSE41-NEXT: cmpeqss %xmm1, %xmm0 +; SSE41-NEXT: andps %xmm2, %xmm0 +; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; SSE41-NEXT: retq ; ; AVX-LABEL: floor_maskz_ss_mask8: ; AVX: ## %bb.0: -; AVX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vpextrb $0, %xmm2, %eax -; AVX-NEXT: testb $1, %al -; AVX-NEXT: jne LBB61_1 -; AVX-NEXT: ## %bb.2: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] -; AVX-NEXT: retq -; AVX-NEXT: LBB61_1: -; AVX-NEXT: vroundss $9, %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vroundss $9, %xmm0, %xmm0, %xmm2 +; AVX-NEXT: vcmpeqss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vandps %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX-NEXT: retq ; -; AVX512F-LABEL: floor_maskz_ss_mask8: -; AVX512F: ## %bb.0: -; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 -; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vcmpeqps %zmm1, %zmm0, %k1 -; AVX512F-NEXT: vrndscaless $1, %xmm0, %xmm1, %xmm0 {%k1} {z} -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: floor_maskz_ss_mask8: -; AVX512VL: ## %bb.0: -; AVX512VL-NEXT: vcmpeqps %xmm1, %xmm0, %k1 -; AVX512VL-NEXT: vrndscaless $1, %xmm0, %xmm1, %xmm0 {%k1} {z} -; AVX512VL-NEXT: retq +; AVX512-LABEL: floor_maskz_ss_mask8: +; AVX512: ## %bb.0: +; AVX512-NEXT: vcmpeqss %xmm1, %xmm0, %k1 +; AVX512-NEXT: vrndscaless $1, %xmm0, %xmm1, %xmm0 {%k1} {z} +; AVX512-NEXT: retq %mask1 = fcmp oeq <4 x float> %x, %y %mask = extractelement <4 x i1> %mask1, i64 0 %s = extractelement <4 x float> %x, i64 0 @@ -1775,47 +1731,28 @@ define <4 x float> @floor_maskz_ss_mask8(<4 x float> %x, <4 x float> %y) nounwin define <2 x double> @floor_mask_sd_mask8(<2 x double> %x, <2 x double> %y, <2 x double> %w) nounwind { ; SSE41-LABEL: floor_mask_sd_mask8: ; SSE41: ## %bb.0: -; SSE41-NEXT: movapd %xmm0, %xmm3 -; SSE41-NEXT: cmpeqpd %xmm1, %xmm3 -; SSE41-NEXT: pextrb $0, %xmm3, %eax -; SSE41-NEXT: testb $1, %al -; SSE41-NEXT: je LBB62_2 -; SSE41-NEXT: ## %bb.1: -; SSE41-NEXT: xorps %xmm2, %xmm2 -; SSE41-NEXT: roundsd $9, %xmm0, %xmm2 -; SSE41-NEXT: LBB62_2: -; SSE41-NEXT: blendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1] -; SSE41-NEXT: movapd %xmm1, %xmm0 +; SSE41-NEXT: roundsd $9, %xmm0, %xmm3 +; SSE41-NEXT: cmpeqsd %xmm1, %xmm0 +; SSE41-NEXT: andpd %xmm0, %xmm3 +; SSE41-NEXT: andnpd %xmm2, %xmm0 +; SSE41-NEXT: orpd %xmm3, %xmm0 +; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] ; SSE41-NEXT: retq ; ; AVX-LABEL: floor_mask_sd_mask8: ; AVX: ## %bb.0: -; AVX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm3 -; AVX-NEXT: vpextrb $0, %xmm3, %eax -; AVX-NEXT: testb $1, %al -; AVX-NEXT: je LBB62_2 -; AVX-NEXT: ## %bb.1: -; AVX-NEXT: vroundsd $9, %xmm0, %xmm0, %xmm2 -; AVX-NEXT: LBB62_2: -; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm2[0],xmm1[1] +; AVX-NEXT: vroundsd $9, %xmm0, %xmm0, %xmm3 +; AVX-NEXT: vcmpeqsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vblendvpd %xmm0, %xmm3, %xmm2, %xmm0 +; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] ; AVX-NEXT: retq ; -; AVX512F-LABEL: floor_mask_sd_mask8: -; AVX512F: ## %bb.0: -; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 -; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vcmpeqpd %zmm1, %zmm0, %k1 -; AVX512F-NEXT: vrndscalesd $1, %xmm0, %xmm1, %xmm2 {%k1} -; AVX512F-NEXT: vmovapd %xmm2, %xmm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: floor_mask_sd_mask8: -; AVX512VL: ## %bb.0: -; AVX512VL-NEXT: vcmpeqpd %xmm1, %xmm0, %k1 -; AVX512VL-NEXT: vrndscalesd $1, %xmm0, %xmm1, %xmm2 {%k1} -; AVX512VL-NEXT: vmovapd %xmm2, %xmm0 -; AVX512VL-NEXT: retq +; AVX512-LABEL: floor_mask_sd_mask8: +; AVX512: ## %bb.0: +; AVX512-NEXT: vcmpeqsd %xmm1, %xmm0, %k1 +; AVX512-NEXT: vrndscalesd $1, %xmm0, %xmm1, %xmm2 {%k1} +; AVX512-NEXT: vmovapd %xmm2, %xmm0 +; AVX512-NEXT: retq %mask1 = fcmp oeq <2 x double> %x, %y %mask = extractelement <2 x i1> %mask1, i64 0 %s = extractelement <2 x double> %x, i64 0 @@ -1829,50 +1766,25 @@ define <2 x double> @floor_mask_sd_mask8(<2 x double> %x, <2 x double> %y, <2 x define <2 x double> @floor_maskz_sd_mask8(<2 x double> %x, <2 x double> %y) nounwind { ; SSE41-LABEL: floor_maskz_sd_mask8: ; SSE41: ## %bb.0: -; SSE41-NEXT: movapd %xmm0, %xmm2 -; SSE41-NEXT: cmpeqpd %xmm1, %xmm2 -; SSE41-NEXT: pextrb $0, %xmm2, %eax -; SSE41-NEXT: testb $1, %al -; SSE41-NEXT: jne LBB63_1 -; SSE41-NEXT: ## %bb.2: -; SSE41-NEXT: xorpd %xmm0, %xmm0 -; SSE41-NEXT: jmp LBB63_3 -; SSE41-NEXT: LBB63_1: -; SSE41-NEXT: roundsd $9, %xmm0, %xmm0 -; SSE41-NEXT: LBB63_3: -; SSE41-NEXT: blendpd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE41-NEXT: movapd %xmm1, %xmm0 +; SSE41-NEXT: roundsd $9, %xmm0, %xmm2 +; SSE41-NEXT: cmpeqsd %xmm1, %xmm0 +; SSE41-NEXT: andpd %xmm2, %xmm0 +; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] ; SSE41-NEXT: retq ; ; AVX-LABEL: floor_maskz_sd_mask8: ; AVX: ## %bb.0: -; AVX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vpextrb $0, %xmm2, %eax -; AVX-NEXT: testb $1, %al -; AVX-NEXT: jne LBB63_1 -; AVX-NEXT: ## %bb.2: -; AVX-NEXT: vxorpd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] -; AVX-NEXT: retq -; AVX-NEXT: LBB63_1: -; AVX-NEXT: vroundsd $9, %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vroundsd $9, %xmm0, %xmm0, %xmm2 +; AVX-NEXT: vcmpeqsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vandpd %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] ; AVX-NEXT: retq ; -; AVX512F-LABEL: floor_maskz_sd_mask8: -; AVX512F: ## %bb.0: -; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 -; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vcmpeqpd %zmm1, %zmm0, %k1 -; AVX512F-NEXT: vrndscalesd $1, %xmm0, %xmm1, %xmm0 {%k1} {z} -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: floor_maskz_sd_mask8: -; AVX512VL: ## %bb.0: -; AVX512VL-NEXT: vcmpeqpd %xmm1, %xmm0, %k1 -; AVX512VL-NEXT: vrndscalesd $1, %xmm0, %xmm1, %xmm0 {%k1} {z} -; AVX512VL-NEXT: retq +; AVX512-LABEL: floor_maskz_sd_mask8: +; AVX512: ## %bb.0: +; AVX512-NEXT: vcmpeqsd %xmm1, %xmm0, %k1 +; AVX512-NEXT: vrndscalesd $1, %xmm0, %xmm1, %xmm0 {%k1} {z} +; AVX512-NEXT: retq %mask1 = fcmp oeq <2 x double> %x, %y %mask = extractelement <2 x i1> %mask1, i64 0 %s = extractelement <2 x double> %x, i64 0 @@ -2729,47 +2641,28 @@ define <2 x double> @ceil_maskz_sd_trunc(<2 x double> %x, <2 x double> %y, i16 % define <4 x float> @ceil_mask_ss_mask8(<4 x float> %x, <4 x float> %y, <4 x float> %w) nounwind { ; SSE41-LABEL: ceil_mask_ss_mask8: ; SSE41: ## %bb.0: -; SSE41-NEXT: movaps %xmm0, %xmm3 -; SSE41-NEXT: cmpeqps %xmm1, %xmm3 -; SSE41-NEXT: pextrb $0, %xmm3, %eax -; SSE41-NEXT: testb $1, %al -; SSE41-NEXT: je LBB86_2 -; SSE41-NEXT: ## %bb.1: -; SSE41-NEXT: xorps %xmm2, %xmm2 -; SSE41-NEXT: roundss $10, %xmm0, %xmm2 -; SSE41-NEXT: LBB86_2: -; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] -; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: roundss $10, %xmm0, %xmm3 +; SSE41-NEXT: cmpeqss %xmm1, %xmm0 +; SSE41-NEXT: andps %xmm0, %xmm3 +; SSE41-NEXT: andnps %xmm2, %xmm0 +; SSE41-NEXT: orps %xmm3, %xmm0 +; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; SSE41-NEXT: retq ; ; AVX-LABEL: ceil_mask_ss_mask8: ; AVX: ## %bb.0: -; AVX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm3 -; AVX-NEXT: vpextrb $0, %xmm3, %eax -; AVX-NEXT: testb $1, %al -; AVX-NEXT: je LBB86_2 -; AVX-NEXT: ## %bb.1: -; AVX-NEXT: vroundss $10, %xmm0, %xmm0, %xmm2 -; AVX-NEXT: LBB86_2: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3] +; AVX-NEXT: vroundss $10, %xmm0, %xmm0, %xmm3 +; AVX-NEXT: vcmpeqss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vblendvps %xmm0, %xmm3, %xmm2, %xmm0 +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX-NEXT: retq ; -; AVX512F-LABEL: ceil_mask_ss_mask8: -; AVX512F: ## %bb.0: -; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 -; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vcmpeqps %zmm1, %zmm0, %k1 -; AVX512F-NEXT: vrndscaless $2, %xmm0, %xmm1, %xmm2 {%k1} -; AVX512F-NEXT: vmovaps %xmm2, %xmm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: ceil_mask_ss_mask8: -; AVX512VL: ## %bb.0: -; AVX512VL-NEXT: vcmpeqps %xmm1, %xmm0, %k1 -; AVX512VL-NEXT: vrndscaless $2, %xmm0, %xmm1, %xmm2 {%k1} -; AVX512VL-NEXT: vmovaps %xmm2, %xmm0 -; AVX512VL-NEXT: retq +; AVX512-LABEL: ceil_mask_ss_mask8: +; AVX512: ## %bb.0: +; AVX512-NEXT: vcmpeqss %xmm1, %xmm0, %k1 +; AVX512-NEXT: vrndscaless $2, %xmm0, %xmm1, %xmm2 {%k1} +; AVX512-NEXT: vmovaps %xmm2, %xmm0 +; AVX512-NEXT: retq %mask1 = fcmp oeq <4 x float> %x, %y %mask = extractelement <4 x i1> %mask1, i64 0 %s = extractelement <4 x float> %x, i64 0 @@ -2783,50 +2676,25 @@ define <4 x float> @ceil_mask_ss_mask8(<4 x float> %x, <4 x float> %y, <4 x floa define <4 x float> @ceil_maskz_ss_mask8(<4 x float> %x, <4 x float> %y) nounwind { ; SSE41-LABEL: ceil_maskz_ss_mask8: ; SSE41: ## %bb.0: -; SSE41-NEXT: movaps %xmm0, %xmm2 -; SSE41-NEXT: cmpeqps %xmm1, %xmm2 -; SSE41-NEXT: pextrb $0, %xmm2, %eax -; SSE41-NEXT: testb $1, %al -; SSE41-NEXT: jne LBB87_1 -; SSE41-NEXT: ## %bb.2: -; SSE41-NEXT: xorps %xmm0, %xmm0 -; SSE41-NEXT: jmp LBB87_3 -; SSE41-NEXT: LBB87_1: -; SSE41-NEXT: roundss $10, %xmm0, %xmm0 -; SSE41-NEXT: LBB87_3: -; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: roundss $10, %xmm0, %xmm2 +; SSE41-NEXT: cmpeqss %xmm1, %xmm0 +; SSE41-NEXT: andps %xmm2, %xmm0 +; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; SSE41-NEXT: retq ; ; AVX-LABEL: ceil_maskz_ss_mask8: ; AVX: ## %bb.0: -; AVX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vpextrb $0, %xmm2, %eax -; AVX-NEXT: testb $1, %al -; AVX-NEXT: jne LBB87_1 -; AVX-NEXT: ## %bb.2: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] -; AVX-NEXT: retq -; AVX-NEXT: LBB87_1: -; AVX-NEXT: vroundss $10, %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vroundss $10, %xmm0, %xmm0, %xmm2 +; AVX-NEXT: vcmpeqss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vandps %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX-NEXT: retq ; -; AVX512F-LABEL: ceil_maskz_ss_mask8: -; AVX512F: ## %bb.0: -; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 -; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vcmpeqps %zmm1, %zmm0, %k1 -; AVX512F-NEXT: vrndscaless $2, %xmm0, %xmm1, %xmm0 {%k1} {z} -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: ceil_maskz_ss_mask8: -; AVX512VL: ## %bb.0: -; AVX512VL-NEXT: vcmpeqps %xmm1, %xmm0, %k1 -; AVX512VL-NEXT: vrndscaless $2, %xmm0, %xmm1, %xmm0 {%k1} {z} -; AVX512VL-NEXT: retq +; AVX512-LABEL: ceil_maskz_ss_mask8: +; AVX512: ## %bb.0: +; AVX512-NEXT: vcmpeqss %xmm1, %xmm0, %k1 +; AVX512-NEXT: vrndscaless $2, %xmm0, %xmm1, %xmm0 {%k1} {z} +; AVX512-NEXT: retq %mask1 = fcmp oeq <4 x float> %x, %y %mask = extractelement <4 x i1> %mask1, i64 0 %s = extractelement <4 x float> %x, i64 0 @@ -2839,47 +2707,28 @@ define <4 x float> @ceil_maskz_ss_mask8(<4 x float> %x, <4 x float> %y) nounwind define <2 x double> @ceil_mask_sd_mask8(<2 x double> %x, <2 x double> %y, <2 x double> %w) nounwind { ; SSE41-LABEL: ceil_mask_sd_mask8: ; SSE41: ## %bb.0: -; SSE41-NEXT: movapd %xmm0, %xmm3 -; SSE41-NEXT: cmpeqpd %xmm1, %xmm3 -; SSE41-NEXT: pextrb $0, %xmm3, %eax -; SSE41-NEXT: testb $1, %al -; SSE41-NEXT: je LBB88_2 -; SSE41-NEXT: ## %bb.1: -; SSE41-NEXT: xorps %xmm2, %xmm2 -; SSE41-NEXT: roundsd $10, %xmm0, %xmm2 -; SSE41-NEXT: LBB88_2: -; SSE41-NEXT: blendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1] -; SSE41-NEXT: movapd %xmm1, %xmm0 +; SSE41-NEXT: roundsd $10, %xmm0, %xmm3 +; SSE41-NEXT: cmpeqsd %xmm1, %xmm0 +; SSE41-NEXT: andpd %xmm0, %xmm3 +; SSE41-NEXT: andnpd %xmm2, %xmm0 +; SSE41-NEXT: orpd %xmm3, %xmm0 +; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] ; SSE41-NEXT: retq ; ; AVX-LABEL: ceil_mask_sd_mask8: ; AVX: ## %bb.0: -; AVX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm3 -; AVX-NEXT: vpextrb $0, %xmm3, %eax -; AVX-NEXT: testb $1, %al -; AVX-NEXT: je LBB88_2 -; AVX-NEXT: ## %bb.1: -; AVX-NEXT: vroundsd $10, %xmm0, %xmm0, %xmm2 -; AVX-NEXT: LBB88_2: -; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm2[0],xmm1[1] +; AVX-NEXT: vroundsd $10, %xmm0, %xmm0, %xmm3 +; AVX-NEXT: vcmpeqsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vblendvpd %xmm0, %xmm3, %xmm2, %xmm0 +; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] ; AVX-NEXT: retq ; -; AVX512F-LABEL: ceil_mask_sd_mask8: -; AVX512F: ## %bb.0: -; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 -; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vcmpeqpd %zmm1, %zmm0, %k1 -; AVX512F-NEXT: vrndscalesd $2, %xmm0, %xmm1, %xmm2 {%k1} -; AVX512F-NEXT: vmovapd %xmm2, %xmm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: ceil_mask_sd_mask8: -; AVX512VL: ## %bb.0: -; AVX512VL-NEXT: vcmpeqpd %xmm1, %xmm0, %k1 -; AVX512VL-NEXT: vrndscalesd $2, %xmm0, %xmm1, %xmm2 {%k1} -; AVX512VL-NEXT: vmovapd %xmm2, %xmm0 -; AVX512VL-NEXT: retq +; AVX512-LABEL: ceil_mask_sd_mask8: +; AVX512: ## %bb.0: +; AVX512-NEXT: vcmpeqsd %xmm1, %xmm0, %k1 +; AVX512-NEXT: vrndscalesd $2, %xmm0, %xmm1, %xmm2 {%k1} +; AVX512-NEXT: vmovapd %xmm2, %xmm0 +; AVX512-NEXT: retq %mask1 = fcmp oeq <2 x double> %x, %y %mask = extractelement <2 x i1> %mask1, i64 0 %s = extractelement <2 x double> %x, i64 0 @@ -2893,50 +2742,25 @@ define <2 x double> @ceil_mask_sd_mask8(<2 x double> %x, <2 x double> %y, <2 x d define <2 x double> @ceil_maskz_sd_mask8(<2 x double> %x, <2 x double> %y) nounwind { ; SSE41-LABEL: ceil_maskz_sd_mask8: ; SSE41: ## %bb.0: -; SSE41-NEXT: movapd %xmm0, %xmm2 -; SSE41-NEXT: cmpeqpd %xmm1, %xmm2 -; SSE41-NEXT: pextrb $0, %xmm2, %eax -; SSE41-NEXT: testb $1, %al -; SSE41-NEXT: jne LBB89_1 -; SSE41-NEXT: ## %bb.2: -; SSE41-NEXT: xorpd %xmm0, %xmm0 -; SSE41-NEXT: jmp LBB89_3 -; SSE41-NEXT: LBB89_1: -; SSE41-NEXT: roundsd $10, %xmm0, %xmm0 -; SSE41-NEXT: LBB89_3: -; SSE41-NEXT: blendpd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE41-NEXT: movapd %xmm1, %xmm0 +; SSE41-NEXT: roundsd $10, %xmm0, %xmm2 +; SSE41-NEXT: cmpeqsd %xmm1, %xmm0 +; SSE41-NEXT: andpd %xmm2, %xmm0 +; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] ; SSE41-NEXT: retq ; ; AVX-LABEL: ceil_maskz_sd_mask8: ; AVX: ## %bb.0: -; AVX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vpextrb $0, %xmm2, %eax -; AVX-NEXT: testb $1, %al -; AVX-NEXT: jne LBB89_1 -; AVX-NEXT: ## %bb.2: -; AVX-NEXT: vxorpd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] -; AVX-NEXT: retq -; AVX-NEXT: LBB89_1: -; AVX-NEXT: vroundsd $10, %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vroundsd $10, %xmm0, %xmm0, %xmm2 +; AVX-NEXT: vcmpeqsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vandpd %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] ; AVX-NEXT: retq ; -; AVX512F-LABEL: ceil_maskz_sd_mask8: -; AVX512F: ## %bb.0: -; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 -; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vcmpeqpd %zmm1, %zmm0, %k1 -; AVX512F-NEXT: vrndscalesd $2, %xmm0, %xmm1, %xmm0 {%k1} {z} -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: ceil_maskz_sd_mask8: -; AVX512VL: ## %bb.0: -; AVX512VL-NEXT: vcmpeqpd %xmm1, %xmm0, %k1 -; AVX512VL-NEXT: vrndscalesd $2, %xmm0, %xmm1, %xmm0 {%k1} {z} -; AVX512VL-NEXT: retq +; AVX512-LABEL: ceil_maskz_sd_mask8: +; AVX512: ## %bb.0: +; AVX512-NEXT: vcmpeqsd %xmm1, %xmm0, %k1 +; AVX512-NEXT: vrndscalesd $2, %xmm0, %xmm1, %xmm0 {%k1} {z} +; AVX512-NEXT: retq %mask1 = fcmp oeq <2 x double> %x, %y %mask = extractelement <2 x i1> %mask1, i64 0 %s = extractelement <2 x double> %x, i64 0 -- 2.7.4