From: Craig Topper
Date: Sun, 15 Jul 2018 18:51:08 +0000 (+0000)
Subject: [X86] Use 128-bit blends instead of vmovss/vmovsd for 512-bit vzmovl patterns to match...
X-Git-Tag: llvmorg-7.0.0-rc1~1389
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=ec0038398a3eaac363429d8a76e62d0c406008e4;p=platform%2Fupstream%2Fllvm.git

[X86] Use 128-bit blends instead of vmovss/vmovsd for 512-bit vzmovl patterns to match AVX.

llvm-svn: 337135
---

diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index bfc26258ce4b..c778b48f319e 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -4348,9 +4348,7 @@ let Predicates = [HasAVX512, OptForSize] in {
   def : Pat<(v4i64 (X86vzmovl (v4i64 VR256X:$src))),
             (SUBREG_TO_REG (i32 0),
              (VMOVSDZrr (v2i64 (AVX512_128_SET0)),
                         (EXTRACT_SUBREG (v4i64 VR256X:$src), sub_xmm)), sub_xmm)>;
-}
-let Predicates = [HasAVX512] in {
   def : Pat<(v16f32 (X86vzmovl (v16f32 VR512:$src))),
             (SUBREG_TO_REG (i32 0),
              (VMOVSSZrr (v4f32 (AVX512_128_SET0)),
@@ -4360,6 +4358,45 @@ let Predicates = [HasAVX512] in {
              (VMOVSSZrr (v4i32 (AVX512_128_SET0)),
                         (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm)), sub_xmm)>;
+  def : Pat<(v8f64 (X86vzmovl (v8f64 VR512:$src))),
+            (SUBREG_TO_REG (i32 0),
+             (VMOVSDZrr (v2f64 (AVX512_128_SET0)),
+              (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm)), sub_xmm)>;
+
+  def : Pat<(v8i64 (X86vzmovl (v8i64 VR512:$src))),
+            (SUBREG_TO_REG (i32 0), (VMOVSDZrr (v2i64 (AVX512_128_SET0)),
+              (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm)), sub_xmm)>;
+
+}
+
+// Use 128-bit blends for OptForSpeed since BLENDs have better throughput than
+// VMOVSS/SD. Unfortunately, loses the ability to use XMM16-31.
+let Predicates = [HasAVX512, OptForSpeed] in {
+  def : Pat<(v16f32 (X86vzmovl (v16f32 VR512:$src))),
+            (SUBREG_TO_REG (i32 0),
+             (VBLENDPSrri (v4f32 (V_SET0)),
+                          (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm),
+                          (i8 1)), sub_xmm)>;
+  def : Pat<(v16i32 (X86vzmovl (v16i32 VR512:$src))),
+            (SUBREG_TO_REG (i32 0),
+             (VPBLENDWrri (v4i32 (V_SET0)),
+                          (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm),
+                          (i8 3)), sub_xmm)>;
+
+  def : Pat<(v8f64 (X86vzmovl (v8f64 VR512:$src))),
+            (SUBREG_TO_REG (i32 0),
+             (VBLENDPDrri (v2f64 (V_SET0)),
+                          (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm),
+                          (i8 1)), sub_xmm)>;
+  def : Pat<(v8i64 (X86vzmovl (v8i64 VR512:$src))),
+            (SUBREG_TO_REG (i32 0),
+             (VPBLENDWrri (v2i64 (V_SET0)),
+                          (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm),
+                          (i8 0xf)), sub_xmm)>;
+}
+
+let Predicates = [HasAVX512] in {
+
   // MOVSSrm zeros the high parts of the register; represent this
   // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
   def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
@@ -4416,16 +4453,6 @@ let Predicates = [HasAVX512] in {
              (v2i64 (scalar_to_vector (loadi64 addr:$src))),
              (iPTR 0)))),
             (SUBREG_TO_REG (i64 0), (VMOVQI2PQIZrm addr:$src), sub_xmm)>;
-  // Move low f64 and clear high bits.
-  def : Pat<(v8f64 (X86vzmovl (v8f64 VR512:$src))),
-            (SUBREG_TO_REG (i32 0),
-             (VMOVSDZrr (v2f64 (AVX512_128_SET0)),
-              (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm)), sub_xmm)>;
-
-  def : Pat<(v8i64 (X86vzmovl (v8i64 VR512:$src))),
-            (SUBREG_TO_REG (i32 0), (VMOVSDZrr (v2i64 (AVX512_128_SET0)),
-              (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm)), sub_xmm)>;
-
   // Extract and store.
   def : Pat<(store (f32 (extractelt (v4f32 VR128X:$src), (iPTR 0))), addr:$dst),
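Note (illustrative, not part of the commit): X86vzmovl is the node that a "keep lane 0, zero everything else" shufflevector lowers to. A minimal IR reproducer for the v16f32 pattern added above, with the codegen the updated tests below expect (the function name is hypothetical; assumes an AVX512F-enabled target built at default, i.e. speed-optimized, settings):

  define <16 x float> @zmovl_v16f32(<16 x float> %a) {
    ; Expected lowering with this patch, per the updated tests:
    ;   vxorps   %xmm1, %xmm1, %xmm1
    ;   vblendps $1, %xmm0, %xmm1, %xmm0   # xmm0 = xmm0[0],xmm1[1,2,3]
    %r = shufflevector <16 x float> %a, <16 x float> zeroinitializer,
         <16 x i32> <i32 0, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22,
                     i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
    ret <16 x float> %r
  }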
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll
index 071a6d227057..a720f66a59a7 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll
@@ -388,7 +388,7 @@ define <16 x i32> @shuffle_v16i32_0zzzzzzzzzzzzzzz(<16 x i32> %a) {
 ; ALL-LABEL: shuffle_v16i32_0zzzzzzzzzzzzzzz:
 ; ALL:       # %bb.0:
 ; ALL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; ALL-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; ALL-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <16 x i32> %a, <16 x i32> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
   ret <16 x i32> %shuffle
@@ -398,7 +398,7 @@ define <16 x float> @shuffle_v16f32_0zzzzzzzzzzzzzzz(<16 x float> %a) {
 ; ALL-LABEL: shuffle_v16f32_0zzzzzzzzzzzzzzz:
 ; ALL:       # %bb.0:
 ; ALL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; ALL-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; ALL-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <16 x float> %a, <16 x float> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
   ret <16 x float> %shuffle
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
index 6e3dfb1f1927..97b89a0be37b 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
@@ -1969,8 +1969,8 @@ define <8 x double> @shuffle_v8f64_uuu2301(<8 x double> %a0, <8 x double> %a1) {
 define <8 x i64> @shuffle_v8i64_0zzzzzzz(<8 x i64> %a) {
 ; ALL-LABEL: shuffle_v8i64_0zzzzzzz:
 ; ALL:       # %bb.0:
-; ALL-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
-; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; ALL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; ALL-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; ALL-NEXT:    ret{{[l|q]}}
   %shuffle = shufflevector <8 x i64> %a, <8 x i64> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   ret <8 x i64> %shuffle
@@ -1979,8 +1979,8 @@ define <8 x i64> @shuffle_v8i64_0zzzzzzz(<8 x i64> %a) {
 define <8 x double> @shuffle_v8f64_0zzzzzzz(<8 x double> %a) {
 ; ALL-LABEL: shuffle_v8f64_0zzzzzzz:
 ; ALL:       # %bb.0:
-; ALL-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
-; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; ALL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; ALL-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; ALL-NEXT:    ret{{[l|q]}}
   %shuffle = shufflevector <8 x double> %a, <8 x double> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   ret <8 x double> %shuffle
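Note (illustrative, not part of the commit): the 64-bit-element analogue. Although the new isel patterns use VBLENDPDrri/VPBLENDWrri, the updated v8 tests above show the final output settling on a vblendps that covers the low two 32-bit lanes. A minimal IR reproducer (hypothetical function name, same target assumptions as above):

  define <8 x double> @zmovl_v8f64(<8 x double> %a) {
    ; Expected lowering with this patch, per the updated v8f64 test:
    ;   vxorps   %xmm1, %xmm1, %xmm1
    ;   vblendps $3, %xmm0, %xmm1, %xmm0   # xmm0 = xmm0[0,1],xmm1[2,3]
    %r = shufflevector <8 x double> %a, <8 x double> zeroinitializer,
         <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
    ret <8 x double> %r
  }

On the rationale in the comment above: register-to-register VMOVSS/VMOVSD is a shuffle-domain operation restricted to a single execution port on recent Intel cores, while 128-bit BLENDs can issue on several ports, hence the better throughput; the stated trade-off is that these VEX-encoded blends cannot address XMM16-31.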