From dd09a8f320d8dc0e5e57e68cc4e3d6dbb15ed4a3 Mon Sep 17 00:00:00 2001
From: Robert Khasanov
Date: Tue, 28 Oct 2014 12:28:51 +0000
Subject: [PATCH] [AVX512] Bring back vector-shuffle lowering support through
 broadcasts
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

After the commit at rev 219046, 512-bit broadcast lowering became
non-optimal. Most of the broadcast and embedded-broadcast tests changed
and no longer produce efficient code. The example below is from that
commit's changes (it is the first test in
test/CodeGen/X86/avx512-vbroadcast.ll):

define <16 x i32> @_inreg16xi32(i32 %a) {
; CHECK-LABEL: _inreg16xi32:
; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpbroadcastd %edi, %zmm0
+; CHECK-NEXT:    vmovd %edi, %xmm0
+; CHECK-NEXT:    vpbroadcastd %xmm0, %ymm0
+; CHECK-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %b = insertelement <16 x i32> undef, i32 %a, i32 0
  %c = shufflevector <16 x i32> %b, <16 x i32> undef, <16 x i32> zeroinitializer
  ret <16 x i32> %c
}

Here, a 256-bit broadcast followed by an insert was generated instead of
a single 512-bit broadcast.

In this patch:
1) I added vector-shuffle lowering through broadcasts.
2) Removed asserts and branches like the one below, because the
   restriction they encode is incorrect:
   assert(Subtarget->hasDQI() && "We can only lower v8i64 with AVX-512-DQI");
3) Fixed the lowering tests.

llvm-svn: 220774
---
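Note for reviewers: lowerVectorShuffleAsBroadcast succeeds only when every
defined lane of the shuffle mask selects the same element, which is the
pattern all of the updated tests share. The sketch below illustrates that
splat check. It is a simplification rather than the in-tree code: the real
helper also handles broadcasting directly from memory, which is presumably
what enables the embedded-broadcast operands such as (%rdi){1to8} in the
updated tests, and the function name isSplatMask is invented for this note.

#include "llvm/ADT/ArrayRef.h"

// Illustration only: the conceptual splat test behind
// lowerVectorShuffleAsBroadcast, not the in-tree implementation.
static bool isSplatMask(llvm::ArrayRef<int> Mask) {
  int SplatIdx = -1;
  for (int M : Mask) {
    if (M < 0)
      continue;             // Undef lane: compatible with any splat.
    if (SplatIdx < 0)
      SplatIdx = M;         // First defined lane picks the element.
    else if (M != SplatIdx)
      return false;         // Lanes disagree, so this is not a broadcast.
  }
  return SplatIdx >= 0;     // Every defined lane selects one element.
}

Because lower512BitVectorShuffle now runs this check before the element-type
dispatch, splat shuffles never reach splitAndLowerVectorShuffle, which is why
the vpbroadcast/vinserti64x4 pairs in the tests collapse to single 512-bit
broadcasts.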
 llvm/lib/Target/X86/X86ISelLowering.cpp        | 15 +++++++--------
 llvm/lib/Target/X86/X86InstrAVX512.td          | 10 ++++++++++
 llvm/test/CodeGen/X86/avx512-arith.ll          |  5 +----
 llvm/test/CodeGen/X86/avx512-vbroadcast.ll     | 22 ++++++----------------
 llvm/test/CodeGen/X86/avx512-vec-cmp.ll        | 24 ++++++------------------
 llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll |  6 ++----
 6 files changed, 32 insertions(+), 50 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 16a21a9..656c1de 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -10231,7 +10231,6 @@ static SDValue lowerV8I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   ArrayRef<int> Mask = SVOp->getMask();
   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
-  assert(Subtarget->hasDQI() && "We can only lower v8i64 with AVX-512-DQI");
 
   // FIXME: Implement direct support for this type!
   return splitAndLowerVectorShuffle(DL, MVT::v8i64, V1, V2, Mask, DAG);
@@ -10247,7 +10246,6 @@ static SDValue lowerV16I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   ArrayRef<int> Mask = SVOp->getMask();
   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
-  assert(Subtarget->hasDQI() && "We can only lower v16i32 with AVX-512-DQI!");
 
   // FIXME: Implement direct support for this type!
   return splitAndLowerVectorShuffle(DL, MVT::v16i32, V1, V2, Mask, DAG);
@@ -10299,6 +10297,11 @@ static SDValue lower512BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   assert(Subtarget->hasAVX512() &&
          "Cannot lower 512-bit vectors w/ basic ISA!");
 
+  // Check for being able to broadcast a single element.
+  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(VT.SimpleTy, DL, V1,
+                                                        Mask, Subtarget, DAG))
+    return Broadcast;
+
   // Dispatch to each element type for lowering. If we don't have supprot for
   // specific element type shuffles at 512 bits, immediately split them and
   // lower them. Each lowering routine of a given type is allowed to assume that
@@ -10309,13 +10312,9 @@
   case MVT::v16f32:
     return lowerV16F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
   case MVT::v8i64:
-    if (Subtarget->hasDQI())
-      return lowerV8I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
-    break;
+    return lowerV8I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
   case MVT::v16i32:
-    if (Subtarget->hasDQI())
-      return lowerV16I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
-    break;
+    return lowerV16I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
   case MVT::v32i16:
     if (Subtarget->hasBWI())
       return lowerV32I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index 1d1abcf..3dbc3d2 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -711,6 +711,16 @@ def : Pat<(v16f32 (X86VBroadcast (v4f32 VR128X:$src))),
 def : Pat<(v8f64 (X86VBroadcast (v2f64 VR128X:$src))),
           (VBROADCASTSDZrr VR128X:$src)>;
 
+def : Pat<(v16f32 (X86VBroadcast (v16f32 VR512:$src))),
+          (VBROADCASTSSZrr (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm))>;
+def : Pat<(v8f64 (X86VBroadcast (v8f64 VR512:$src))),
+          (VBROADCASTSDZrr (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm))>;
+
+def : Pat<(v16i32 (X86VBroadcast (v16i32 VR512:$src))),
+          (VPBROADCASTDZrr (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm))>;
+def : Pat<(v8i64 (X86VBroadcast (v8i64 VR512:$src))),
+          (VPBROADCASTQZrr (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm))>;
+
 def : Pat<(v16f32 (int_x86_avx512_vbroadcast_ss_ps_512 (v4f32 VR128X:$src))),
           (VBROADCASTSSZrr VR128X:$src)>;
 def : Pat<(v8f64 (int_x86_avx512_vbroadcast_sd_pd_512 (v2f64 VR128X:$src))),
diff --git a/llvm/test/CodeGen/X86/avx512-arith.ll b/llvm/test/CodeGen/X86/avx512-arith.ll
index e161b5c..c43da9c 100644
--- a/llvm/test/CodeGen/X86/avx512-arith.ll
+++ b/llvm/test/CodeGen/X86/avx512-arith.ll
@@ -453,10 +453,7 @@ entry:
 define <8 x i64> @andqbrst(<8 x i64> %p1, i64* %ap) {
 ; CHECK-LABEL: andqbrst:
 ; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vmovq (%rdi), %xmm1
-; CHECK-NEXT:    vpbroadcastq %xmm1, %ymm1
-; CHECK-NEXT:    vinserti64x4 $1, %ymm1, %zmm1, %zmm1
-; CHECK-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    vpandq (%rdi){1to8}, %zmm0, %zmm0
 ; CHECK-NEXT:    retq
 entry:
   %a = load i64* %ap, align 8
diff --git a/llvm/test/CodeGen/X86/avx512-vbroadcast.ll b/llvm/test/CodeGen/X86/avx512-vbroadcast.ll
index 99def06..0b0e0fc 100644
--- a/llvm/test/CodeGen/X86/avx512-vbroadcast.ll
+++ b/llvm/test/CodeGen/X86/avx512-vbroadcast.ll
@@ -3,9 +3,7 @@
 define <16 x i32> @_inreg16xi32(i32 %a) {
 ; CHECK-LABEL: _inreg16xi32:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vmovd %edi, %xmm0
-; CHECK-NEXT:    vpbroadcastd %xmm0, %ymm0
-; CHECK-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; CHECK-NEXT:    vpbroadcastd %edi, %zmm0
 ; CHECK-NEXT:    retq
   %b = insertelement <16 x i32> undef, i32 %a, i32 0
   %c = shufflevector <16 x i32> %b, <16 x i32> undef, <16 x i32> zeroinitializer
@@ -15,9 +13,7 @@ define <16 x i32> @_inreg16xi32(i32 %a) {
 define <8 x i64> @_inreg8xi64(i64 %a) {
 ; CHECK-LABEL: _inreg8xi64:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vmovq %rdi, %xmm0
-; CHECK-NEXT:    vpbroadcastq %xmm0, %ymm0
-; CHECK-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; CHECK-NEXT:    vpbroadcastq %rdi, %zmm0
 ; CHECK-NEXT:    retq
   %b = insertelement <8 x i64> undef, i64 %a, i32 0
   %c = shufflevector <8 x i64> %b, <8 x i64> undef, <8 x i32> zeroinitializer
@@ -27,9 +23,7 @@ define <8 x i64> @_inreg8xi64(i64 %a) {
 define <16 x float> @_inreg16xfloat(float %a) {
 ; CHECK-LABEL: _inreg16xfloat:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    ## kill: XMM0<def> XMM0<kill> ZMM0<def>
-; CHECK-NEXT:    vbroadcastss %xmm0, %ymm0
-; CHECK-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; CHECK-NEXT:    vbroadcastss %xmm0, %zmm0
 ; CHECK-NEXT:    retq
   %b = insertelement <16 x float> undef, float %a, i32 0
   %c = shufflevector <16 x float> %b, <16 x float> undef, <16 x i32> zeroinitializer
@@ -39,9 +33,7 @@ define <16 x float> @_inreg16xfloat(float %a) {
 define <8 x double> @_inreg8xdouble(double %a) {
 ; CHECK-LABEL: _inreg8xdouble:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    ## kill: XMM0<def> XMM0<kill> ZMM0<def>
-; CHECK-NEXT:    vbroadcastsd %xmm0, %ymm0
-; CHECK-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; CHECK-NEXT:    vbroadcastsd %xmm0, %zmm0
 ; CHECK-NEXT:    retq
   %b = insertelement <8 x double> undef, double %a, i32 0
   %c = shufflevector <8 x double> %b, <8 x double> undef, <8 x i32> zeroinitializer
@@ -51,8 +43,7 @@ define <8 x double> @_inreg8xdouble(double %a) {
 define <16 x i32> @_xmm16xi32(<16 x i32> %a) {
 ; CHECK-LABEL: _xmm16xi32:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpbroadcastd %xmm0, %ymm0
-; CHECK-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; CHECK-NEXT:    vpbroadcastd %xmm0, %zmm0
 ; CHECK-NEXT:    retq
   %b = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32> zeroinitializer
   ret <16 x i32> %b
@@ -61,8 +52,7 @@ define <16 x i32> @_xmm16xi32(<16 x i32> %a) {
 define <16 x float> @_xmm16xfloat(<16 x float> %a) {
 ; CHECK-LABEL: _xmm16xfloat:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vbroadcastss %xmm0, %ymm0
-; CHECK-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; CHECK-NEXT:    vbroadcastss %xmm0, %zmm0
 ; CHECK-NEXT:    retq
   %b = shufflevector <16 x float> %a, <16 x float> undef, <16 x i32> zeroinitializer
   ret <16 x float> %b
diff --git a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll
index 8ff0263..c71e60e 100644
--- a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll
+++ b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll
@@ -312,10 +312,7 @@ define <16 x i32> @test23(<16 x i32> %x, <16 x i32>* %y.ptr, <16 x i32> %x1, <16
 define <8 x i64> @test24(<8 x i64> %x, <8 x i64> %x1, i64* %yb.ptr) nounwind {
 ; CHECK-LABEL: test24:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vmovq (%rdi), %xmm2
-; CHECK-NEXT:    vpbroadcastq %xmm2, %ymm2
-; CHECK-NEXT:    vinserti64x4 $1, %ymm2, %zmm2, %zmm2
-; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm0, %k1
+; CHECK-NEXT:    vpcmpeqq (%rdi){1to8}, %zmm0, %k1
 ; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -330,10 +327,7 @@ define <8 x i64> @test24(<8 x i64> %x, <8 x i64> %x1, i64* %yb.ptr) nounwind {
 define <16 x i32> @test25(<16 x i32> %x, i32* %yb.ptr, <16 x i32> %x1) nounwind {
 ; CHECK-LABEL: test25:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vmovd (%rdi), %xmm2
-; CHECK-NEXT:    vpbroadcastd %xmm2, %ymm2
-; CHECK-NEXT:    vinserti64x4 $1, %ymm2, %zmm2, %zmm2
-; CHECK-NEXT:    vpcmpled %zmm2, %zmm0, %k1
+; CHECK-NEXT:    vpcmpled (%rdi){1to16}, %zmm0, %k1
 ; CHECK-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -348,11 +342,8 @@ define <16 x i32> @test25(<16 x i32> %x, i32* %yb.ptr, <16 x i32> %x1) nounwind
 define <16 x i32> @test26(<16 x i32> %x, i32* %yb.ptr, <16 x i32> %x1, <16 x i32> %y1) nounwind {
 ; CHECK-LABEL: test26:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vmovd (%rdi), %xmm3
-; CHECK-NEXT:    vpbroadcastd %xmm3, %ymm3
-; CHECK-NEXT:    vinserti64x4 $1, %ymm3, %zmm3, %zmm3
-; CHECK-NEXT:    vpcmpgtd %zmm3, %zmm0, %k1
-; CHECK-NEXT:    vpcmpled %zmm1, %zmm2, %k1 {%k1}
+; CHECK-NEXT:    vpcmpled %zmm1, %zmm2, %k1
+; CHECK-NEXT:    vpcmpgtd (%rdi){1to16}, %zmm0, %k1 {%k1}
 ; CHECK-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -369,11 +360,8 @@ define <16 x i32> @test26(<16 x i32> %x, i32* %yb.ptr, <16 x i32> %x1, <16 x i32
 define <8 x i64> @test27(<8 x i64> %x, i64* %yb.ptr, <8 x i64> %x1, <8 x i64> %y1) nounwind {
 ; CHECK-LABEL: test27:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vmovq (%rdi), %xmm3
-; CHECK-NEXT:    vpbroadcastq %xmm3, %ymm3
-; CHECK-NEXT:    vinserti64x4 $1, %ymm3, %zmm3, %zmm3
-; CHECK-NEXT:    vpcmpleq %zmm3, %zmm0, %k1
-; CHECK-NEXT:    vpcmpleq %zmm1, %zmm2, %k1 {%k1}
+; CHECK-NEXT:    vpcmpleq %zmm1, %zmm2, %k1
+; CHECK-NEXT:    vpcmpleq (%rdi){1to8}, %zmm0, %k1 {%k1}
 ; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
index d9a69c7..8f87c7c 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
@@ -6,8 +6,7 @@ target triple = "x86_64-unknown-unknown"
 define <8 x double> @shuffle_v8f64_00000000(<8 x double> %a, <8 x double> %b) {
 ; ALL-LABEL: shuffle_v8f64_00000000:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vbroadcastsd %xmm0, %ymm0
-; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; ALL-NEXT:    vbroadcastsd %xmm0, %zmm0
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
   ret <8 x double> %shuffle
@@ -729,8 +728,7 @@ define <8 x double> @shuffle_v8f64_f511235a(<8 x double> %a, <8 x double> %b) {
 define <8 x i64> @shuffle_v8i64_00000000(<8 x i64> %a, <8 x i64> %b) {
 ; ALL-LABEL: shuffle_v8i64_00000000:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vpbroadcastq %xmm0, %ymm0
-; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; ALL-NEXT:    vpbroadcastq %xmm0, %zmm0
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
   ret <8 x i64> %shuffle
-- 
2.7.4