From 28b41237e6b296bf777d2f0c13c48031525fcdc4 Mon Sep 17 00:00:00 2001
From: David Green <david.green@arm.com>
Date: Sun, 10 Jul 2022 17:24:37 +0100
Subject: [PATCH] [InterleavedAccessPass] Handle multi-use binop shuffles

D89489 added some logic to the interleaved access pass to attempt to
undo the folding of shuffles into binops that instcombine performs. If
early-cse is run too, the binops may be commoned into a single operation
with multiple shuffle uses. It is still profitable to reverse the
transform, though, so long as all the uses are shuffles.

Differential Revision: https://reviews.llvm.org/D129419
---
 llvm/lib/CodeGen/InterleavedAccessPass.cpp |  9 +++--
 llvm/test/CodeGen/AArch64/vldn_shuffle.ll  | 63 +++++++-----------------
 llvm/test/CodeGen/Thumb2/mve-vldshuffle.ll | 44 ++++-----------------
 3 files changed, 27 insertions(+), 89 deletions(-)

diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index b3f38a3..55f3ad7 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -310,10 +310,11 @@ bool InterleavedAccess::lowerInterleavedLoad(
       Extracts.push_back(Extract);
       continue;
     }
-    auto *BI = dyn_cast<BinaryOperator>(User);
-    if (BI && BI->hasOneUse()) {
-      if (auto *SVI = dyn_cast<ShuffleVectorInst>(*BI->user_begin())) {
-        BinOpShuffles.insert(SVI);
+    if (auto *BI = dyn_cast<BinaryOperator>(User)) {
+      if (all_of(BI->users(),
+                 [](auto *U) { return isa<ShuffleVectorInst>(U); })) {
+        for (auto *SVI : BI->users())
+          BinOpShuffles.insert(cast<ShuffleVectorInst>(SVI));
         continue;
       }
     }
diff --git a/llvm/test/CodeGen/AArch64/vldn_shuffle.ll b/llvm/test/CodeGen/AArch64/vldn_shuffle.ll
index 7758781..d72dcd5 100644
--- a/llvm/test/CodeGen/AArch64/vldn_shuffle.ll
+++ b/llvm/test/CodeGen/AArch64/vldn_shuffle.ll
@@ -188,13 +188,10 @@ define void @vld2_multiuse(float* nocapture readonly %pSrc, float* noalias nocap
 ; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:  .LBB4_1: // %vector.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldp q1, q0, [x0], #32
-; CHECK-NEXT:    fmul v1.4s, v1.4s, v1.4s
-; CHECK-NEXT:    fmul v0.4s, v0.4s, v0.4s
-; CHECK-NEXT:    uzp1 v2.4s, v1.4s, v0.4s
-; CHECK-NEXT:    uzp2 v0.4s, v1.4s, v0.4s
-; CHECK-NEXT:    fadd v0.4s, v0.4s, v2.4s
-; CHECK-NEXT:    str q0, [x1, x8]
+; CHECK-NEXT:    ld2 { v0.4s, v1.4s }, [x0], #32
+; CHECK-NEXT:    fmul v2.4s, v0.4s, v0.4s
+; CHECK-NEXT:    fmla v2.4s, v1.4s, v1.4s
+; CHECK-NEXT:    str q2, [x1, x8]
 ; CHECK-NEXT:    add x8, x8, #16
 ; CHECK-NEXT:    cmp x8, #1, lsl #12 // =4096
 ; CHECK-NEXT:    b.ne .LBB4_1
@@ -230,25 +227,11 @@ define void @vld3_multiuse(float* nocapture readonly %pSrc, float* noalias nocap
 ; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:  .LBB5_1: // %vector.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldp q0, q1, [x0]
-; CHECK-NEXT:    fmul v0.4s, v0.4s, v0.4s
-; CHECK-NEXT:    fmul v1.4s, v1.4s, v1.4s
-; CHECK-NEXT:    ldr q3, [x0, #32]
-; CHECK-NEXT:    add x0, x0, #48
-; CHECK-NEXT:    mov v2.16b, v0.16b
-; CHECK-NEXT:    mov v2.s[1], v0.s[3]
-; CHECK-NEXT:    rev64 v4.4s, v1.4s
-; CHECK-NEXT:    fmul v3.4s, v3.4s, v3.4s
-; CHECK-NEXT:    mov v2.s[2], v1.s[2]
-; CHECK-NEXT:    mov v4.s[0], v0.s[1]
-; CHECK-NEXT:    mov v1.s[0], v0.s[2]
-; CHECK-NEXT:    mov v2.s[3], v3.s[1]
-; CHECK-NEXT:    mov v4.s[3], v3.s[2]
-; CHECK-NEXT:    mov v1.s[2], v3.s[0]
-; CHECK-NEXT:    fadd v0.4s, v4.4s, v2.4s
-; CHECK-NEXT:    mov v1.s[3], v3.s[3]
-; CHECK-NEXT:    fadd v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    str q0, [x1, x8]
+; CHECK-NEXT:    ld3 { v0.4s, v1.4s, v2.4s }, [x0], #48
+; CHECK-NEXT:    fmul v3.4s, v0.4s, v0.4s
+; CHECK-NEXT:    fmla v3.4s, v1.4s, v1.4s
+; CHECK-NEXT:    fmla v3.4s, v2.4s, v2.4s
+; CHECK-NEXT:    str q3, [x1, x8]
 ; CHECK-NEXT:    add x8, x8, #16
 ; CHECK-NEXT:    cmp x8, #1, lsl #12 // =4096
 ; CHECK-NEXT:    b.ne .LBB5_1
@@ -286,31 +269,15 @@ define void @vld4_multiuse(float* nocapture readonly %pSrc, float* noalias nocap
 ; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:  .LBB6_1: // %vector.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldp q1, q0, [x0, #32]
+; CHECK-NEXT:    ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0], #64
+; CHECK-NEXT:    fmul v4.4s, v0.4s, v0.4s
 ; CHECK-NEXT:    add x9, x1, x8
 ; CHECK-NEXT:    add x8, x8, #32
 ; CHECK-NEXT:    cmp x8, #2, lsl #12 // =8192
-; CHECK-NEXT:    fmul v1.4s, v1.4s, v1.4s
-; CHECK-NEXT:    ldp q3, q2, [x0], #64
-; CHECK-NEXT:    fmul v0.4s, v0.4s, v0.4s
-; CHECK-NEXT:    fmul v3.4s, v3.4s, v3.4s
-; CHECK-NEXT:    fmul v2.4s, v2.4s, v2.4s
-; CHECK-NEXT:    zip1 v4.4s, v1.4s, v0.4s
-; CHECK-NEXT:    zip2 v5.4s, v1.4s, v0.4s
-; CHECK-NEXT:    uzp2 v16.4s, v3.4s, v2.4s
-; CHECK-NEXT:    ext v6.16b, v1.16b, v4.16b, #8
-; CHECK-NEXT:    trn2 v7.4s, v3.4s, v2.4s
-; CHECK-NEXT:    mov v1.s[3], v0.s[2]
-; CHECK-NEXT:    zip1 v0.4s, v3.4s, v2.4s
-; CHECK-NEXT:    zip2 v2.4s, v3.4s, v2.4s
-; CHECK-NEXT:    uzp2 v3.4s, v16.4s, v3.4s
-; CHECK-NEXT:    mov v7.d[1], v4.d[1]
-; CHECK-NEXT:    mov v0.d[1], v6.d[1]
-; CHECK-NEXT:    mov v2.d[1], v1.d[1]
-; CHECK-NEXT:    mov v3.d[1], v5.d[1]
-; CHECK-NEXT:    fadd v0.4s, v7.4s, v0.4s
-; CHECK-NEXT:    fadd v1.4s, v3.4s, v2.4s
-; CHECK-NEXT:    st2 { v0.4s, v1.4s }, [x9]
+; CHECK-NEXT:    fmla v4.4s, v1.4s, v1.4s
+; CHECK-NEXT:    fmul v5.4s, v2.4s, v2.4s
+; CHECK-NEXT:    fmla v5.4s, v3.4s, v3.4s
+; CHECK-NEXT:    st2 { v4.4s, v5.4s }, [x9]
 ; CHECK-NEXT:    b.ne .LBB6_1
 ; CHECK-NEXT:  // %bb.2: // %while.end
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/Thumb2/mve-vldshuffle.ll b/llvm/test/CodeGen/Thumb2/mve-vldshuffle.ll
index 770c056..3dcdca3 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vldshuffle.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vldshuffle.ll
@@ -286,30 +286,10 @@ define void @arm_cmplx_mag_squared_f16_cse(half* nocapture readonly %pSrc, half*
 ; CHECK-NEXT:    and r5, r2, #7
 ; CHECK-NEXT:  .LBB2_4: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldrh.u16 q0, [r0], #32
+; CHECK-NEXT:    vld20.16 {q0, q1}, [r0]
+; CHECK-NEXT:    vld21.16 {q0, q1}, [r0]!
 ; CHECK-NEXT:    vmul.f16 q0, q0, q0
-; CHECK-NEXT:    vldrh.u16 q2, [r0, #-16]
-; CHECK-NEXT:    vmovx.f16 s4, s0
-; CHECK-NEXT:    vmovx.f16 s6, s1
-; CHECK-NEXT:    vmul.f16 q2, q2, q2
-; CHECK-NEXT:    vmovx.f16 s5, s2
-; CHECK-NEXT:    vins.f16 s4, s6
-; CHECK-NEXT:    vmovx.f16 s6, s3
-; CHECK-NEXT:    vins.f16 s5, s6
-; CHECK-NEXT:    vmovx.f16 s6, s8
-; CHECK-NEXT:    vmovx.f16 s12, s9
-; CHECK-NEXT:    vmovx.f16 s7, s10
-; CHECK-NEXT:    vins.f16 s6, s12
-; CHECK-NEXT:    vmovx.f16 s12, s11
-; CHECK-NEXT:    vins.f16 s2, s3
-; CHECK-NEXT:    vins.f16 s10, s11
-; CHECK-NEXT:    vins.f16 s8, s9
-; CHECK-NEXT:    vins.f16 s0, s1
-; CHECK-NEXT:    vmov.f32 s1, s2
-; CHECK-NEXT:    vins.f16 s7, s12
-; CHECK-NEXT:    vmov.f32 s2, s8
-; CHECK-NEXT:    vmov.f32 s3, s10
-; CHECK-NEXT:    vadd.f16 q0, q1, q0
+; CHECK-NEXT:    vfma.f16 q0, q1, q1
 ; CHECK-NEXT:    vstrb.8 q0, [r1], #16
 ; CHECK-NEXT:    le lr, .LBB2_4
 ; CHECK-NEXT:  @ %bb.5: @ %middle.block
@@ -413,8 +393,7 @@ define void @arm_cmplx_mag_squared_f32_cse(float* nocapture readonly %pSrc, floa
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r4, r5, r7, lr}
 ; CHECK-NEXT:    push {r4, r5, r7, lr}
-; CHECK-NEXT:    cmp r2, #0
-; CHECK-NEXT:    beq .LBB3_8
+; CHECK-NEXT:    cbz r2, .LBB3_8
 ; CHECK-NEXT:    @ %bb.1: @ %while.body.preheader
 ; CHECK-NEXT:    cmp r2, #4
 ; CHECK-NEXT:    blo .LBB3_9
@@ -435,19 +414,10 @@ define void @arm_cmplx_mag_squared_f32_cse(float* nocapture readonly %pSrc, floa
 ; CHECK-NEXT:    and r5, r2, #3
 ; CHECK-NEXT:  .LBB3_4: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
-; CHECK-NEXT:    vldrw.u32 q1, [r0], #32
+; CHECK-NEXT:    vld20.32 {q0, q1}, [r0]
+; CHECK-NEXT:    vld21.32 {q0, q1}, [r0]!
 ; CHECK-NEXT:    vmul.f32 q0, q0, q0
-; CHECK-NEXT:    vmul.f32 q1, q1, q1
-; CHECK-NEXT:    vmov.f32 s8, s4
-; CHECK-NEXT:    vmov.f32 s9, s6
-; CHECK-NEXT:    vmov.f32 s4, s5
-; CHECK-NEXT:    vmov.f32 s5, s7
-; CHECK-NEXT:    vmov.f32 s10, s0
-; CHECK-NEXT:    vmov.f32 s11, s2
-; CHECK-NEXT:    vmov.f32 s6, s1
-; CHECK-NEXT:    vmov.f32 s7, s3
-; CHECK-NEXT:    vadd.f32 q0, q1, q2
+; CHECK-NEXT:    vfma.f32 q0, q1, q1
 ; CHECK-NEXT:    vstrb.8 q0, [r1], #16
 ; CHECK-NEXT:    le lr, .LBB3_4
 ; CHECK-NEXT:  @ %bb.5: @ %middle.block
-- 
2.7.4
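
Illustration (not part of the commit; function name and signature are hypothetical, modelled on the vld2_multiuse test above): a rough sketch of the IR shape this change targets. After early-cse commons the two squarings into a single fmul, that one binop feeds both deinterleaving shuffles. Previously the binop was matched only via hasOneUse(); now any binop whose uses are all shuffles qualifies, so the wide load can still become an ld2/vld2.

define void @sketch(<8 x float>* %pSrc, <4 x float>* %pDst) {
entry:
  ; One wide load of four interleaved (real, imag) float pairs.
  %wide = load <8 x float>, <8 x float>* %pSrc, align 4
  ; early-cse has commoned the two squarings into a single binop...
  %sq = fmul <8 x float> %wide, %wide
  ; ...so the binop now has two users, both deinterleaving shuffles.
  %even = shufflevector <8 x float> %sq, <8 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %odd = shufflevector <8 x float> %sq, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  ; re*re + im*im, stored contiguously.
  %mag = fadd <4 x float> %even, %odd
  store <4 x float> %mag, <4 x float>* %pDst, align 4
  ret void
}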