From f634096f6b7b5d0a795d5eb148e9f633ad980a72 Mon Sep 17 00:00:00 2001
From: David Green <david.green@arm.com>
Date: Tue, 7 Feb 2023 14:12:51 +0000
Subject: [PATCH] [ARM] Perform lane interleaving from reductions.

We have a pass for MVE that performs lane interleaving to make use of
top/bottom instructions, adding shuffles before extends and after
truncates. This patch extends it to also start from add reductions,
where the order of lanes does not matter, so the shuffle is not
needed. We need to be careful not to break the form of existing
reductions, but otherwise this can save some instructions and awkward
extends.
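As a sketch of the kind of extend/op/reduce chain this now handles
without a shuffle (modelled on the reduce_v16i16_shift_mul test below;
the IR here is illustrative rather than taken verbatim from the patch):

  define i16 @example(<16 x i8> %s0, <16 x i8> %s1) {
    %s0s = zext <16 x i8> %s0 to <16 x i16>
    %s1s = zext <16 x i8> %s1 to <16 x i16>
    %m = mul <16 x i16> %s0s, %s1s
    %s = ashr <16 x i16> %m, <i16 14, i16 14, i16 14, i16 14, i16 14, i16 14, i16 14, i16 14, i16 14, i16 14, i16 14, i16 14, i16 14, i16 14, i16 14, i16 14>
    %r = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s)
    ret i16 %r
  }
  declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)

Because vector.reduce.add sums all lanes, the extends and mul can be
lowered as vmullb.u8/vmullt.u8 on the bottom/top lanes with no shuffle
to restore lane order, and both halves feed vaddv.u16/vaddva.u16
directly. Reductions whose only ops are mul/select/icmp are left
alone, as a plain vmul feeding vaddv is already served by MVE's
vmlav-style multiply-accumulate reductions.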
Differential Revision: https://reviews.llvm.org/D143396
---
 llvm/lib/Target/ARM/MVELaneInterleavingPass.cpp    | 52 ++++++++++++++++---
 .../CodeGen/Thumb2/mve-laneinterleaving-reduct.ll  | 60 ++++++++--------
 2 files changed, 64 insertions(+), 48 deletions(-)

diff --git a/llvm/lib/Target/ARM/MVELaneInterleavingPass.cpp b/llvm/lib/Target/ARM/MVELaneInterleavingPass.cpp
index 34f9ea1..5ac79cb 100644
--- a/llvm/lib/Target/ARM/MVELaneInterleavingPass.cpp
+++ b/llvm/lib/Target/ARM/MVELaneInterleavingPass.cpp
@@ -154,7 +154,6 @@ static bool isProfitableToInterleave(SmallSetVector<Instruction *, 4> &Exts,
 static bool tryInterleave(Instruction *Start,
                           SmallPtrSetImpl<Instruction *> &Visited) {
   LLVM_DEBUG(dbgs() << "tryInterleave from " << *Start << "\n");
-  auto *VT = cast<FixedVectorType>(Start->getType());
 
   if (!isa<Instruction>(Start->getOperand(0)))
     return false;
@@ -165,6 +164,7 @@ static bool tryInterleave(Instruction *Start,
   Worklist.push_back(cast<Instruction>(Start->getOperand(0)));
 
   SmallSetVector<Instruction *, 4> Truncs;
+  SmallSetVector<Instruction *, 4> Reducts;
   SmallSetVector<Instruction *, 4> Exts;
   SmallSetVector<Use *, 4> OtherLeafs;
   SmallSetVector<Instruction *, 4> Ops;
@@ -198,6 +198,13 @@ static bool tryInterleave(Instruction *Start,
       if (!II)
         return false;
 
+      if (II->getIntrinsicID() == Intrinsic::vector_reduce_add) {
+        if (!Reducts.insert(I))
+          continue;
+        Visited.insert(I);
+        break;
+      }
+
       switch (II->getIntrinsicID()) {
       case Intrinsic::abs:
       case Intrinsic::smin:
@@ -267,21 +274,32 @@ static bool tryInterleave(Instruction *Start,
     return false;
 
   LLVM_DEBUG({
-    dbgs() << "Found group:\n  Exts:";
+    dbgs() << "Found group:\n  Exts:\n";
     for (auto *I : Exts)
       dbgs() << "  " << *I << "\n";
-    dbgs() << "  Ops:";
+    dbgs() << "  Ops:\n";
     for (auto *I : Ops)
       dbgs() << "  " << *I << "\n";
-    dbgs() << "  OtherLeafs:";
+    dbgs() << "  OtherLeafs:\n";
     for (auto *I : OtherLeafs)
       dbgs() << "  " << *I->get() << " of " << *I->getUser() << "\n";
-    dbgs() << "Truncs:";
+    dbgs() << "  Truncs:\n";
     for (auto *I : Truncs)
       dbgs() << "  " << *I << "\n";
+    dbgs() << "  Reducts:\n";
+    for (auto *I : Reducts)
+      dbgs() << "  " << *I << "\n";
   });
 
-  assert(!Truncs.empty() && "Expected some truncs");
+  assert((!Truncs.empty() || !Reducts.empty()) &&
+         "Expected some truncs or reductions");
+  if (Truncs.empty() && Exts.empty())
+    return false;
+
+  auto *VT = !Truncs.empty()
+                 ? cast<FixedVectorType>(Truncs[0]->getType())
+                 : cast<FixedVectorType>(Exts[0]->getOperand(0)->getType());
+  LLVM_DEBUG(dbgs() << "Using VT:" << *VT << "\n");
 
   // Check types
   unsigned NumElts = VT->getNumElements();
@@ -311,6 +329,14 @@ static bool tryInterleave(Instruction *Start,
   // Check that it looks beneficial
   if (!isProfitableToInterleave(Exts, Truncs))
     return false;
+  if (!Reducts.empty() && (Ops.empty() || all_of(Ops, [](Instruction *I) {
+        return I->getOpcode() == Instruction::Mul ||
+               I->getOpcode() == Instruction::Select ||
+               I->getOpcode() == Instruction::ICmp;
+      }))) {
+    LLVM_DEBUG(dbgs() << "Reduction does not look profitable\n");
+    return false;
+  }
 
   // Create new shuffles around the extends / truncs / other leaves.
   IRBuilder<> Builder(Start);
@@ -367,6 +393,14 @@ static bool tryInterleave(Instruction *Start,
   return true;
 }
 
+// Add reductions are fairly common and associative, meaning we can start the
+// interleaving from them and don't need to emit a shuffle.
+static bool isAddReduction(Instruction &I) {
+  if (auto *II = dyn_cast<IntrinsicInst>(&I))
+    return II->getIntrinsicID() == Intrinsic::vector_reduce_add;
+  return false;
+}
+
 bool MVELaneInterleaving::runOnFunction(Function &F) {
   if (!EnableInterleave)
     return false;
@@ -380,8 +414,10 @@ bool MVELaneInterleaving::runOnFunction(Function &F) {
 
   SmallPtrSet<Instruction *, 16> Visited;
   for (Instruction &I : reverse(instructions(F))) {
-    if (I.getType()->isVectorTy() &&
-        (isa<TruncInst>(I) || isa<FPToSIInst>(I)) && !Visited.count(&I))
+    if (((I.getType()->isVectorTy() &&
+          (isa<TruncInst>(I) || isa<FPToSIInst>(I))) ||
+         isAddReduction(I)) &&
+        !Visited.count(&I))
       Changed |= tryInterleave(&I, Visited);
   }
 
diff --git a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-reduct.ll b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-reduct.ll
index b286c42..d13a345 100644
--- a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-reduct.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-reduct.ll
@@ -4,23 +4,12 @@
 define arm_aapcs_vfpcc i16 @reduce_v16i16_shift_mul(<16 x i8> %s0, <16 x i8> %s1) {
 ; CHECK-LABEL: reduce_v16i16_shift_mul:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .pad #32
-; CHECK-NEXT:    sub sp, #32
-; CHECK-NEXT:    add r0, sp, #16
-; CHECK-NEXT:    mov r1, sp
-; CHECK-NEXT:    vstrw.32 q1, [r0]
-; CHECK-NEXT:    vstrw.32 q0, [r1]
-; CHECK-NEXT:    vldrb.u16 q0, [r0, #8]
-; CHECK-NEXT:    vldrb.u16 q1, [r1, #8]
-; CHECK-NEXT:    vldrb.u16 q2, [r1]
-; CHECK-NEXT:    vmul.i16 q0, q1, q0
-; CHECK-NEXT:    vldrb.u16 q1, [r0]
+; CHECK-NEXT:    vmullt.u8 q2, q0, q1
+; CHECK-NEXT:    vmullb.u8 q0, q0, q1
+; CHECK-NEXT:    vshr.s16 q2, q2, #14
 ; CHECK-NEXT:    vshr.s16 q0, q0, #14
-; CHECK-NEXT:    vmul.i16 q1, q2, q1
-; CHECK-NEXT:    vaddv.u16 r0, q0
-; CHECK-NEXT:    vshr.s16 q1, q1, #14
-; CHECK-NEXT:    vaddva.u16 r0, q1
-; CHECK-NEXT:    add sp, #32
+; CHECK-NEXT:    vaddv.u16 r0, q2
+; CHECK-NEXT:    vaddva.u16 r0, q0
 ; CHECK-NEXT:    bx lr
 entry:
   %s0s = zext <16 x i8> %s0 to <16 x i16>
@@ -50,23 +39,16 @@
 define arm_aapcs_vfpcc i16 @reduce_v16i16_shift_sub(<16 x i8> %s0, <16 x i8> %s1) {
 ; CHECK-LABEL: reduce_v16i16_shift_sub:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .pad #32
-; CHECK-NEXT:    sub sp, #32
-; CHECK-NEXT:    add r0, sp, #16
-; CHECK-NEXT:    mov r1, sp
-; CHECK-NEXT:    vstrw.32 q1, [r0]
-; CHECK-NEXT:    vstrw.32 q0, [r1]
-; CHECK-NEXT:    vldrb.u16 q0, [r0, #8]
-; CHECK-NEXT:    vldrb.u16 q1, [r1, #8]
-; CHECK-NEXT:    vldrb.u16 q2, [r1]
-; CHECK-NEXT:    vsub.i16 q0, q1, q0
-; CHECK-NEXT:    vldrb.u16 q1, [r0]
+; CHECK-NEXT:    vmovlt.u8 q2, q1
+; CHECK-NEXT:    vmovlt.u8 q3, q0
+; CHECK-NEXT:    vsub.i16 q2, q3, q2
+; CHECK-NEXT:    vmovlb.u8 q1, q1
+; CHECK-NEXT:    vmovlb.u8 q0, q0
+; CHECK-NEXT:    vshr.s16 q2, q2, #14
+; CHECK-NEXT:    vsub.i16 q0, q0, q1
+; CHECK-NEXT:    vaddv.u16 r0, q2
 ; CHECK-NEXT:    vshr.s16 q0, q0, #14
-; CHECK-NEXT:    vsub.i16 q1, q2, q1
-; CHECK-NEXT:    vaddv.u16 r0, q0
-; CHECK-NEXT:    vshr.s16 q1, q1, #14
-; CHECK-NEXT:    vaddva.u16 r0, q1
-; CHECK-NEXT:    add sp, #32
+; CHECK-NEXT:    vaddva.u16 r0, q0
 ; CHECK-NEXT:    bx lr
 entry:
   %s0s = zext <16 x i8> %s0 to <16 x i16>
@@ -190,17 +172,15 @@ define void @correlate(ptr nocapture noundef readonly %ID, ptr nocapture noundef
 ; CHECK-NEXT:  .LBB4_8: @ %vector.body
 ; CHECK-NEXT:    @ Parent Loop BB4_4 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    vldrh.s32 q2, [r5], #16
-; CHECK-NEXT:    vldrh.s32 q1, [r4], #16
+; CHECK-NEXT:    vldrh.u16 q1, [r4], #16
+; CHECK-NEXT:    vldrh.u16 q2, [r5], #16
 ; CHECK-NEXT:    rsb.w r1, r12, #0
-; CHECK-NEXT:    vmul.i32 q1, q2, q1
-; CHECK-NEXT:    vldrh.s32 q2, [r4, #-8]
-; CHECK-NEXT:    vldrh.s32 q3, [r5, #-8]
+; CHECK-NEXT:    vmullb.s16 q3, q2, q1
+; CHECK-NEXT:    vmullt.s16 q1, q2, q1
+; CHECK-NEXT:    vshl.s32 q3, r1
 ; CHECK-NEXT:    vshl.s32 q1, r1
+; CHECK-NEXT:    vaddva.u32 r6, q3
 ; CHECK-NEXT:    vaddva.u32 r6, q1
-; CHECK-NEXT:    vmul.i32 q2, q3, q2
-; CHECK-NEXT:    vshl.s32 q2, r1
-; CHECK-NEXT:    vaddva.u32 r6, q2
 ; CHECK-NEXT:    le lr, .LBB4_8
 ; CHECK-NEXT:  @ %bb.9: @ %middle.block
 ; CHECK-NEXT:    @ in Loop: Header=BB4_4 Depth=1
-- 
2.7.4