From: Simon Pilgrim Date: Wed, 23 Nov 2016 22:35:06 +0000 (+0000) Subject: [X86][SSE] Add awareness of (v)cvtpd2dq and vcvtpd2udq implicit zeroing of upper... X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=3ce6a545c7be2603bac2fa5c94e7f8eb02d123e0;p=platform%2Fupstream%2Fllvm.git [X86][SSE] Add awareness of (v)cvtpd2dq and vcvtpd2udq implicit zeroing of upper 64-bits of xmm result We've already added the equivalent for (v)cvttpd2dq (rL284459) and vcvttpd2udq llvm-svn: 287835 --- diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index a3e57fa..7d14154 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -6546,13 +6546,20 @@ def : Pat<(v2f64 (X86cvtudq2pd (v4i32 VR128X:$src1))), } let Predicates = [HasAVX512, HasVLX] in { - let AddedComplexity = 15 in - def : Pat<(X86vzmovl (v2i64 (bitconvert - (v4i32 (X86cvttpd2dq (v2f64 VR128X:$src)))))), - (VCVTTPD2DQZ128rr VR128:$src)>; - def : Pat<(v4i32 (bitconvert (X86vzmovl (v2i64 (bitconvert - (v4i32 (X86cvttpd2udq (v2f64 VR128X:$src)))))))), - (VCVTTPD2UDQZ128rr VR128:$src)>; + let AddedComplexity = 15 in { + def : Pat<(X86vzmovl (v2i64 (bitconvert + (v4i32 (X86cvtp2Int (v2f64 VR128X:$src)))))), + (VCVTPD2DQZ128rr VR128:$src)>; + def : Pat<(v4i32 (bitconvert (X86vzmovl (v2i64 (bitconvert + (v4i32 (X86cvtp2UInt (v2f64 VR128X:$src)))))))), + (VCVTPD2UDQZ128rr VR128:$src)>; + def : Pat<(X86vzmovl (v2i64 (bitconvert + (v4i32 (X86cvttpd2dq (v2f64 VR128X:$src)))))), + (VCVTTPD2DQZ128rr VR128:$src)>; + def : Pat<(v4i32 (bitconvert (X86vzmovl (v2i64 (bitconvert + (v4i32 (X86cvttpd2udq (v2f64 VR128X:$src)))))))), + (VCVTTPD2UDQZ128rr VR128:$src)>; + } } let Predicates = [HasAVX512] in { diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td index ba10886..818e552 100644 --- a/llvm/lib/Target/X86/X86InstrSSE.td +++ b/llvm/lib/Target/X86/X86InstrSSE.td @@ -2083,10 +2083,14 @@ def : InstAlias<"vcvttpd2dqy\t{$src, $dst|$dst, $src}", (VCVTTPD2DQYrm VR128:$dst, f256mem:$src), 0>; let Predicates = [HasAVX, NoVLX] in { - let AddedComplexity = 15 in - def : Pat<(X86vzmovl (v2i64 (bitconvert - (v4i32 (X86cvttpd2dq (v2f64 VR128:$src)))))), - (VCVTTPD2DQrr VR128:$src)>; + let AddedComplexity = 15 in { + def : Pat<(X86vzmovl (v2i64 (bitconvert + (v4i32 (X86cvtp2Int (v2f64 VR128:$src)))))), + (VCVTPD2DQrr VR128:$src)>; + def : Pat<(X86vzmovl (v2i64 (bitconvert + (v4i32 (X86cvttpd2dq (v2f64 VR128:$src)))))), + (VCVTTPD2DQrr VR128:$src)>; + } } // Predicates = [HasAVX] def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), @@ -2101,10 +2105,14 @@ def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src), IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtF2ILd]>; let Predicates = [UseSSE2] in { - let AddedComplexity = 15 in - def : Pat<(X86vzmovl (v2i64 (bitconvert - (v4i32 (X86cvttpd2dq (v2f64 VR128:$src)))))), - (CVTTPD2DQrr VR128:$src)>; + let AddedComplexity = 15 in { + def : Pat<(X86vzmovl (v2i64 (bitconvert + (v4i32 (X86cvtp2Int (v2f64 VR128:$src)))))), + (CVTPD2DQrr VR128:$src)>; + def : Pat<(X86vzmovl (v2i64 (bitconvert + (v4i32 (X86cvttpd2dq (v2f64 VR128:$src)))))), + (CVTTPD2DQrr VR128:$src)>; + } } // Predicates = [UseSSE2] // Convert packed single to packed double diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll index 8991ac9..0fc649f 100644 --- a/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll @@ -3055,8 +3055,6 @@ define <4 x i32>@test_int_x86_avx512_mask_cvt_pd2dq_128_zext(<2 x double> %x0, < ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vcvtpd2dq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xff,0x09,0xe6,0xc8] ; CHECK-NEXT: vcvtpd2dq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xff,0x08,0xe6,0xc0] -; CHECK-NEXT: vmovq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfe,0x08,0x7e,0xc0] -; CHECK-NEXT: ## xmm0 = xmm0[0],zero ; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) @@ -3151,8 +3149,6 @@ define <4 x i32>@test_int_x86_avx512_mask_cvt_pd2udq_128_zext(<2 x double> %x0, ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vcvtpd2udq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfc,0x09,0x79,0xc8] ; CHECK-NEXT: vcvtpd2udq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfc,0x08,0x79,0xc0] -; CHECK-NEXT: vmovq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfe,0x08,0x7e,0xc0] -; CHECK-NEXT: ## xmm0 = xmm0[0],zero ; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) diff --git a/llvm/test/CodeGen/X86/sse2-intrinsics-x86.ll b/llvm/test/CodeGen/X86/sse2-intrinsics-x86.ll index 13f4d11..1a78729 100644 --- a/llvm/test/CodeGen/X86/sse2-intrinsics-x86.ll +++ b/llvm/test/CodeGen/X86/sse2-intrinsics-x86.ll @@ -255,22 +255,16 @@ define <2 x i64> @test_mm_cvtpd_epi32_zext(<2 x double> %a0) nounwind { ; SSE-LABEL: test_mm_cvtpd_epi32_zext: ; SSE: ## BB#0: ; SSE-NEXT: cvtpd2dq %xmm0, %xmm0 ## encoding: [0xf2,0x0f,0xe6,0xc0] -; SSE-NEXT: movq %xmm0, %xmm0 ## encoding: [0xf3,0x0f,0x7e,0xc0] -; SSE-NEXT: ## xmm0 = xmm0[0],zero ; SSE-NEXT: retl ## encoding: [0xc3] ; ; AVX2-LABEL: test_mm_cvtpd_epi32_zext: ; AVX2: ## BB#0: ; AVX2-NEXT: vcvtpd2dq %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0xe6,0xc0] -; AVX2-NEXT: vmovq %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x7e,0xc0] -; AVX2-NEXT: ## xmm0 = xmm0[0],zero ; AVX2-NEXT: retl ## encoding: [0xc3] ; ; SKX-LABEL: test_mm_cvtpd_epi32_zext: ; SKX: ## BB#0: ; SKX-NEXT: vcvtpd2dq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xff,0x08,0xe6,0xc0] -; SKX-NEXT: vmovq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfe,0x08,0x7e,0xc0] -; SKX-NEXT: ## xmm0 = xmm0[0],zero ; SKX-NEXT: retl ## encoding: [0xc3] %cvt = call <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double> %a0) %res = shufflevector <4 x i32> %cvt, <4 x i32> zeroinitializer, <4 x i32>