From 1bcb6a3da231ee3bcf8513880599b5d054f590a4 Mon Sep 17 00:00:00 2001 From: Guozhi Wei Date: Wed, 21 Jun 2023 18:54:31 +0000 Subject: [PATCH] [MBP] Enable duplicating return block to remove jump to return Sometimes LLVM generates a branch to a return instruction, as in PR63227. This is because in the function MachineBlockPlacement::canTailDuplicateUnplacedPreds we avoid duplicating a BB into another already-placed BB to prevent destroying the computed layout. But if the successor BB is a return block, duplicating it will only reduce taken branches without hurting any other branches. Differential Revision: https://reviews.llvm.org/D153093 --- llvm/lib/CodeGen/MachineBlockPlacement.cpp | 2 +- .../CodeGen/AArch64/aarch64-matrix-umull-smull.ll | 3 +- llvm/test/CodeGen/Thumb/branch-to-return.ll | 71 ++++++++++++++++++++++ .../Thumb2/LowOverheadLoops/mve-float-loops.ll | 6 +- llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll | 53 +++++++++------- 5 files changed, 109 insertions(+), 26 deletions(-) create mode 100644 llvm/test/CodeGen/Thumb/branch-to-return.ll diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp index eec6022..912e9ec 100644 --- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp +++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp @@ -1159,7 +1159,7 @@ bool MachineBlockPlacement::canTailDuplicateUnplacedPreds( // tail-duplicated into. // Skip any blocks that are already placed or not in this loop. 
if (Pred == BB || (BlockFilter && !BlockFilter->count(Pred)) - || BlockToChain[Pred] == &Chain) + || (BlockToChain[Pred] == &Chain && !Succ->succ_empty())) continue; if (!TailDup.canTailDuplicate(Succ, Pred)) { if (Successors.size() > 1 && hasSameSuccessors(*Pred, Successors)) diff --git a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll index 8dd4da1..e4c776b 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll @@ -425,7 +425,8 @@ define i16 @red_mla_dup_ext_u8_s8_s16(i8* noalias nocapture noundef readonly %A, ; CHECK-NEXT: b .LBB5_7 ; CHECK-NEXT: .LBB5_3: ; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: b .LBB5_9 +; CHECK-NEXT: mov w0, w8 +; CHECK-NEXT: ret ; CHECK-NEXT: .LBB5_4: // %vector.ph ; CHECK-NEXT: and x11, x10, #0xfffffff0 ; CHECK-NEXT: add x8, x0, #8 diff --git a/llvm/test/CodeGen/Thumb/branch-to-return.ll b/llvm/test/CodeGen/Thumb/branch-to-return.ll new file mode 100644 index 0000000..5bfccc0 --- /dev/null +++ b/llvm/test/CodeGen/Thumb/branch-to-return.ll @@ -0,0 +1,71 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv7 %s -o - | FileCheck %s + +; Test the branch to return in BB4 is converted to return. 
+ +define i32 @foo(i32* %x, i32 %n) { +; CHECK-LABEL: foo: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r1, #1 +; CHECK-NEXT: blt .LBB0_4 +; CHECK-NEXT: @ %bb.1: @ %for.body.preheader +; CHECK-NEXT: bic r3, r1, #3 +; CHECK-NEXT: mov r12, r0 +; CHECK-NEXT: cmp r1, #4 +; CHECK-NEXT: bhs .LBB0_3 +; CHECK-NEXT: @ %bb.2: +; CHECK-NEXT: movs r0, #0 +; CHECK-NEXT: b .LBB0_6 +; CHECK-NEXT: .LBB0_3: @ %middle.block +; CHECK-NEXT: cmp r1, r3 +; CHECK-NEXT: bne .LBB0_5 +; CHECK-NEXT: .LBB0_4: +; CHECK-NEXT: movs r0, #0 +; CHECK-NEXT: bx lr +; CHECK-NEXT: .LBB0_5: +; CHECK-NEXT: ldr.w r0, [r12] +; CHECK-NEXT: .LBB0_6: @ %for.body.preheader1 +; CHECK-NEXT: subs r3, r1, r3 +; CHECK-NEXT: mvn r2, #12 +; CHECK-NEXT: and.w r1, r2, r1, lsl #2 +; CHECK-NEXT: add r1, r12 +; CHECK-NEXT: .LBB0_7: @ %for.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ldr r2, [r1], #4 +; CHECK-NEXT: subs r3, #1 +; CHECK-NEXT: add r0, r2 +; CHECK-NEXT: bne .LBB0_7 +; CHECK-NEXT: @ %bb.8: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %n.vec = and i32 %n, -4 + %cmp6 = icmp sgt i32 %n, 0 + br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %min.iters.check = icmp ult i32 %n, 4 + br i1 %min.iters.check, label %for.body.preheader1, label %middle.block + +middle.block: + %x3 = load i32, i32* %x, align 4 + %cmp.n = icmp eq i32 %n.vec, %n + br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1 + +for.body.preheader1: ; preds = %middle.block, %for.body.preheader + %r.07.ph = phi i32 [ 0, %for.body.preheader ], [ %x3, %middle.block ] + br label %for.body + +for.body: ; preds = %for.body.preheader1, %for.body + %i.08 = phi i32 [ %inc, %for.body ], [ %n.vec, %for.body.preheader1 ] + %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ] + %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08 + %v5 = load i32, i32* %arrayidx, align 4 + %add = add nsw i32 %v5, %r.07 + %inc = add nuw nsw i32 %i.08, 1 + 
%exitcond = icmp eq i32 %inc, %n + br i1 %exitcond, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %middle.block, %entry + %r.0.lcssa = phi i32 [ 0, %entry ], [ 0, %middle.block ], [ %add, %for.body ] + ret i32 %r.0.lcssa +} diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll index 23eb590..cc6d092 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll @@ -1417,7 +1417,7 @@ define arm_aapcs_vfpcc float @half_half_mac(ptr nocapture readonly %a, ptr nocap ; CHECK-NEXT: b .LBB9_6 ; CHECK-NEXT: .LBB9_3: ; CHECK-NEXT: vldr s0, .LCPI9_0 -; CHECK-NEXT: b .LBB9_9 +; CHECK-NEXT: pop {r4, r5, r7, pc} ; CHECK-NEXT: .LBB9_4: @ %for.body.preheader.new ; CHECK-NEXT: bic r2, r2, #3 ; CHECK-NEXT: movs r3, #1 @@ -1572,7 +1572,7 @@ define arm_aapcs_vfpcc float @half_half_acc(ptr nocapture readonly %a, ptr nocap ; CHECK-NEXT: b .LBB10_6 ; CHECK-NEXT: .LBB10_3: ; CHECK-NEXT: vldr s0, .LCPI10_0 -; CHECK-NEXT: b .LBB10_9 +; CHECK-NEXT: pop {r4, r5, r7, pc} ; CHECK-NEXT: .LBB10_4: @ %for.body.preheader.new ; CHECK-NEXT: bic r2, r2, #3 ; CHECK-NEXT: movs r3, #1 @@ -1727,7 +1727,7 @@ define arm_aapcs_vfpcc float @half_short_mac(ptr nocapture readonly %a, ptr noca ; CHECK-NEXT: b .LBB11_6 ; CHECK-NEXT: .LBB11_3: ; CHECK-NEXT: vldr s0, .LCPI11_0 -; CHECK-NEXT: b .LBB11_9 +; CHECK-NEXT: pop {r4, r5, r6, pc} ; CHECK-NEXT: .LBB11_4: @ %for.body.preheader.new ; CHECK-NEXT: bic r2, r2, #3 ; CHECK-NEXT: movs r3, #1 diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll index f5adcf0..6ab1a93 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll @@ -18,7 +18,7 @@ define i32 @add_i32(i32* nocapture readonly %x, i32 %n) { ; CHECK-NEXT: b .LBB0_7 ; CHECK-NEXT: .LBB0_3: ; CHECK-NEXT: 
movs r0, #0 -; CHECK-NEXT: b .LBB0_9 +; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .LBB0_4: @ %vector.ph ; CHECK-NEXT: bic r3, r1, #3 ; CHECK-NEXT: movs r2, #1 @@ -43,7 +43,7 @@ define i32 @add_i32(i32* nocapture readonly %x, i32 %n) { ; CHECK-NEXT: ldr r1, [r2], #4 ; CHECK-NEXT: add r0, r1 ; CHECK-NEXT: le lr, .LBB0_8 -; CHECK-NEXT: .LBB0_9: @ %for.cond.cleanup +; CHECK-NEXT: @ %bb.9: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} entry: %cmp6 = icmp sgt i32 %n, 0 @@ -201,7 +201,8 @@ define i32 @and_i32(i32* nocapture readonly %x, i32 %n) { ; CHECK-NEXT: b .LBB2_7 ; CHECK-NEXT: .LBB2_3: ; CHECK-NEXT: mov.w r2, #-1 -; CHECK-NEXT: b .LBB2_9 +; CHECK-NEXT: mov r0, r2 +; CHECK-NEXT: pop {r4, pc} ; CHECK-NEXT: .LBB2_4: @ %vector.ph ; CHECK-NEXT: bic r3, r1, #3 ; CHECK-NEXT: movs r2, #1 @@ -297,7 +298,8 @@ define i32 @or_i32(i32* nocapture readonly %x, i32 %n) { ; CHECK-NEXT: b .LBB3_7 ; CHECK-NEXT: .LBB3_3: ; CHECK-NEXT: movs r2, #0 -; CHECK-NEXT: b .LBB3_9 +; CHECK-NEXT: mov r0, r2 +; CHECK-NEXT: pop {r4, pc} ; CHECK-NEXT: .LBB3_4: @ %vector.ph ; CHECK-NEXT: bic r3, r1, #3 ; CHECK-NEXT: movs r2, #1 @@ -393,7 +395,8 @@ define i32 @xor_i32(i32* nocapture readonly %x, i32 %n) { ; CHECK-NEXT: b .LBB4_7 ; CHECK-NEXT: .LBB4_3: ; CHECK-NEXT: movs r2, #0 -; CHECK-NEXT: b .LBB4_9 +; CHECK-NEXT: mov r0, r2 +; CHECK-NEXT: pop {r4, pc} ; CHECK-NEXT: .LBB4_4: @ %vector.ph ; CHECK-NEXT: bic r3, r1, #3 ; CHECK-NEXT: movs r2, #1 @@ -489,7 +492,8 @@ define float @fadd_f32(float* nocapture readonly %x, i32 %n) { ; CHECK-NEXT: b .LBB5_7 ; CHECK-NEXT: .LBB5_3: ; CHECK-NEXT: vldr s0, .LCPI5_0 -; CHECK-NEXT: b .LBB5_9 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .LBB5_4: @ %vector.ph ; CHECK-NEXT: bic r2, r1, #3 ; CHECK-NEXT: movs r3, #1 @@ -587,7 +591,8 @@ define float @fmul_f32(float* nocapture readonly %x, i32 %n) { ; CHECK-NEXT: b .LBB6_7 ; CHECK-NEXT: .LBB6_3: ; CHECK-NEXT: vmov.f32 s0, #1.000000e+00 -; CHECK-NEXT: b .LBB6_9 +; CHECK-NEXT: vmov r0, s0 +; 
CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .LBB6_4: @ %vector.ph ; CHECK-NEXT: bic r2, r1, #3 ; CHECK-NEXT: movs r3, #1 @@ -681,7 +686,8 @@ define i32 @smin_i32(i32* nocapture readonly %x, i32 %n) { ; CHECK-NEXT: b .LBB7_7 ; CHECK-NEXT: .LBB7_3: ; CHECK-NEXT: mvn r2, #-2147483648 -; CHECK-NEXT: b .LBB7_9 +; CHECK-NEXT: mov r0, r2 +; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .LBB7_4: @ %vector.ph ; CHECK-NEXT: bic r3, r1, #3 ; CHECK-NEXT: movs r2, #1 @@ -778,7 +784,7 @@ define i32 @smin_i32_inloop(i32* nocapture readonly %x, i32 %n) { ; CHECK-NEXT: b .LBB8_7 ; CHECK-NEXT: .LBB8_3: ; CHECK-NEXT: mvn r0, #-2147483648 -; CHECK-NEXT: b .LBB8_9 +; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .LBB8_4: @ %vector.ph ; CHECK-NEXT: bic r3, r1, #3 ; CHECK-NEXT: movs r2, #1 @@ -804,7 +810,7 @@ define i32 @smin_i32_inloop(i32* nocapture readonly %x, i32 %n) { ; CHECK-NEXT: cmp r0, r1 ; CHECK-NEXT: csel r0, r0, r1, lt ; CHECK-NEXT: le lr, .LBB8_8 -; CHECK-NEXT: .LBB8_9: @ %for.cond.cleanup +; CHECK-NEXT: @ %bb.9: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} entry: %cmp6 = icmp sgt i32 %n, 0 @@ -873,7 +879,8 @@ define i32 @smax_i32(i32* nocapture readonly %x, i32 %n) { ; CHECK-NEXT: b .LBB9_7 ; CHECK-NEXT: .LBB9_3: ; CHECK-NEXT: mov.w r2, #-2147483648 -; CHECK-NEXT: b .LBB9_9 +; CHECK-NEXT: mov r0, r2 +; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .LBB9_4: @ %vector.ph ; CHECK-NEXT: bic r3, r1, #3 ; CHECK-NEXT: movs r2, #1 @@ -970,7 +977,7 @@ define i32 @smax_i32_inloop(i32* nocapture readonly %x, i32 %n) { ; CHECK-NEXT: b .LBB10_7 ; CHECK-NEXT: .LBB10_3: ; CHECK-NEXT: mov.w r0, #-2147483648 -; CHECK-NEXT: b .LBB10_9 +; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .LBB10_4: @ %vector.ph ; CHECK-NEXT: bic r3, r1, #3 ; CHECK-NEXT: movs r2, #1 @@ -996,7 +1003,7 @@ define i32 @smax_i32_inloop(i32* nocapture readonly %x, i32 %n) { ; CHECK-NEXT: cmp r0, r1 ; CHECK-NEXT: csel r0, r0, r1, gt ; CHECK-NEXT: le lr, .LBB10_8 -; CHECK-NEXT: .LBB10_9: @ %for.cond.cleanup +; CHECK-NEXT: @ %bb.9: @ %for.cond.cleanup ; 
CHECK-NEXT: pop {r7, pc} entry: %cmp6 = icmp sgt i32 %n, 0 @@ -1065,7 +1072,8 @@ define i32 @umin_i32(i32* nocapture readonly %x, i32 %n) { ; CHECK-NEXT: b .LBB11_7 ; CHECK-NEXT: .LBB11_3: ; CHECK-NEXT: mov.w r2, #-1 -; CHECK-NEXT: b .LBB11_9 +; CHECK-NEXT: mov r0, r2 +; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .LBB11_4: @ %vector.ph ; CHECK-NEXT: bic r3, r1, #3 ; CHECK-NEXT: movs r2, #1 @@ -1162,7 +1170,7 @@ define i32 @umin_i32_inloop(i32* nocapture readonly %x, i32 %n) { ; CHECK-NEXT: b .LBB12_7 ; CHECK-NEXT: .LBB12_3: ; CHECK-NEXT: mov.w r0, #-1 -; CHECK-NEXT: b .LBB12_9 +; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .LBB12_4: @ %vector.ph ; CHECK-NEXT: bic r3, r1, #3 ; CHECK-NEXT: movs r2, #1 @@ -1188,7 +1196,7 @@ define i32 @umin_i32_inloop(i32* nocapture readonly %x, i32 %n) { ; CHECK-NEXT: cmp r0, r1 ; CHECK-NEXT: csel r0, r0, r1, hi ; CHECK-NEXT: le lr, .LBB12_8 -; CHECK-NEXT: .LBB12_9: @ %for.cond.cleanup +; CHECK-NEXT: @ %bb.9: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} entry: %cmp6 = icmp sgt i32 %n, 0 @@ -1257,7 +1265,8 @@ define i32 @umax_i32(i32* nocapture readonly %x, i32 %n) { ; CHECK-NEXT: b .LBB13_7 ; CHECK-NEXT: .LBB13_3: ; CHECK-NEXT: movs r2, #0 -; CHECK-NEXT: b .LBB13_9 +; CHECK-NEXT: mov r0, r2 +; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .LBB13_4: @ %vector.ph ; CHECK-NEXT: bic r3, r1, #3 ; CHECK-NEXT: movs r2, #1 @@ -1354,7 +1363,7 @@ define i32 @umax_i32_inloop(i32* nocapture readonly %x, i32 %n) { ; CHECK-NEXT: b .LBB14_7 ; CHECK-NEXT: .LBB14_3: ; CHECK-NEXT: movs r0, #0 -; CHECK-NEXT: b .LBB14_9 +; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .LBB14_4: @ %vector.ph ; CHECK-NEXT: bic r3, r1, #3 ; CHECK-NEXT: movs r2, #1 @@ -1380,7 +1389,7 @@ define i32 @umax_i32_inloop(i32* nocapture readonly %x, i32 %n) { ; CHECK-NEXT: cmp r0, r1 ; CHECK-NEXT: csel r0, r0, r1, hi ; CHECK-NEXT: le lr, .LBB14_8 -; CHECK-NEXT: .LBB14_9: @ %for.cond.cleanup +; CHECK-NEXT: @ %bb.9: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} entry: %cmp6 = icmp sgt i32 %n, 0 @@ 
-1449,7 +1458,8 @@ define float @fmin_f32(float* nocapture readonly %x, i32 %n) { ; CHECK-NEXT: b .LBB15_7 ; CHECK-NEXT: .LBB15_3: ; CHECK-NEXT: vldr s0, .LCPI15_0 -; CHECK-NEXT: b .LBB15_9 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .LBB15_4: @ %vector.ph ; CHECK-NEXT: bic r2, r1, #3 ; CHECK-NEXT: movs r3, #1 @@ -1552,7 +1562,8 @@ define float @fmax_f32(float* nocapture readonly %x, i32 %n) { ; CHECK-NEXT: b .LBB16_7 ; CHECK-NEXT: .LBB16_3: ; CHECK-NEXT: vldr s0, .LCPI16_0 -; CHECK-NEXT: b .LBB16_9 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .LBB16_4: @ %vector.ph ; CHECK-NEXT: bic r2, r1, #3 ; CHECK-NEXT: movs r3, #1 -- 2.7.4