From e44f4a8a54141d5f527ed8ee05362cc98031d723 Mon Sep 17 00:00:00 2001 From: Whitney Tsang Date: Thu, 30 Jan 2020 03:57:50 +0000 Subject: [PATCH] [LoopFusion] Move instructions from FC1.GuardBlock to FC0.GuardBlock and from FC0.ExitBlock to FC1.ExitBlock when proven safe. Summary: Currently LoopFusion give up when the second loop nest guard block or the first loop nest exit block is not empty. For example: if (0 < N) { for (int i = 0; i < N; ++i) {} x+=1; } y+=1; if (0 < N) { for (int i = 0; i < N; ++i) {} } The above example should be safe to fuse. This PR moves instructions in FC1 guard block (e.g. y+=1;) to FC0 guard block, or instructions in FC0 exit block (e.g. x+=1;) to FC1 exit block, which then LoopFusion is able to fuse them. Reviewer: kbarton, jdoerfert, Meinersbur, dmgreen, fhahn, hfinkel, bmahjour, etiotto Reviewed By: jdoerfert Subscribers: hiraditya, llvm-commits Tag: LLVM Differential Revision: https://reviews.llvm.org/D73641 --- llvm/lib/Transforms/Scalar/LoopFuse.cpp | 87 +++++++--------- .../Transforms/LoopFusion/diagnostics_missed.ll | 93 +++++++++++++++++ llvm/test/Transforms/LoopFusion/guarded.ll | 113 +++++++++++++++++++++ 3 files changed, 241 insertions(+), 52 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/LoopFuse.cpp b/llvm/lib/Transforms/Scalar/LoopFuse.cpp index 24597e2..8d591d7 100644 --- a/llvm/lib/Transforms/Scalar/LoopFuse.cpp +++ b/llvm/lib/Transforms/Scalar/LoopFuse.cpp @@ -91,8 +91,10 @@ STATISTIC( "Loop has a non-empty preheader with instructions that cannot be moved"); STATISTIC(FusionNotBeneficial, "Fusion is not beneficial"); STATISTIC(NonIdenticalGuards, "Candidates have different guards"); -STATISTIC(NonEmptyExitBlock, "Candidate has a non-empty exit block"); -STATISTIC(NonEmptyGuardBlock, "Candidate has a non-empty guard block"); +STATISTIC(NonEmptyExitBlock, "Candidate has a non-empty exit block with " + "instructions that cannot be moved"); +STATISTIC(NonEmptyGuardBlock, "Candidate has a non-empty guard block with " + "instructions that cannot be moved"); STATISTIC(NotRotated, "Candidate is not rotated"); enum FusionDependenceAnalysisChoice { @@ -750,25 +752,30 @@ private: continue; } - // The following two checks look for empty blocks in FC0 and FC1. If - // any of these blocks are non-empty, we do not fuse. This is done - // because we currently do not have the safety checks to determine if - // it is safe to move the blocks past other blocks in the loop. Once - // these checks are added, these conditions can be relaxed. - if (FC0->GuardBranch && !isEmptyExitBlock(*FC0)) { - LLVM_DEBUG(dbgs() << "Fusion candidate does not have empty exit " - "block. Not fusing.\n"); - reportLoopFusion(*FC0, *FC1, - NonEmptyExitBlock); - continue; - } + if (FC0->GuardBranch) { + assert(FC1->GuardBranch && "Expecting valid FC1 guard branch"); + + if (!isSafeToMoveBefore(*FC0->ExitBlock, + *FC1->ExitBlock->getFirstNonPHIOrDbg(), DT, + PDT, DI)) { + LLVM_DEBUG(dbgs() << "Fusion candidate contains unsafe " + "instructions in exit block. Not fusing.\n"); + reportLoopFusion(*FC0, *FC1, + NonEmptyExitBlock); + continue; + } - if (FC1->GuardBranch && !isEmptyGuardBlock(*FC1)) { - LLVM_DEBUG(dbgs() << "Fusion candidate does not have empty guard " - "block. Not fusing.\n"); - reportLoopFusion(*FC0, *FC1, - NonEmptyGuardBlock); - continue; + if (!isSafeToMoveBefore( + *FC1->GuardBranch->getParent(), + *FC0->GuardBranch->getParent()->getTerminator(), DT, PDT, + DI)) { + LLVM_DEBUG(dbgs() + << "Fusion candidate contains unsafe " + "instructions in guard block. Not fusing.\n"); + reportLoopFusion(*FC0, *FC1, + NonEmptyGuardBlock); + continue; + } } // Check the dependencies across the loops and do not fuse if it would @@ -1079,38 +1086,6 @@ private: return (FC1.GuardBranch->getSuccessor(1) == FC1.Preheader); } - /// Check that the guard for \p FC *only* contains the cmp/branch for the - /// guard. - /// Once we are able to handle intervening code, any code in the guard block - /// for FC1 will need to be treated as intervening code and checked whether - /// it can safely move around the loops. - bool isEmptyGuardBlock(const FusionCandidate &FC) const { - assert(FC.GuardBranch && "Expecting a fusion candidate with guard branch."); - if (auto *CmpInst = dyn_cast(FC.GuardBranch->getCondition())) { - auto *GuardBlock = FC.GuardBranch->getParent(); - // If the generation of the cmp value is in GuardBlock, then the size of - // the guard block should be 2 (cmp + branch). If the generation of the - // cmp value is in a different block, then the size of the guard block - // should only be 1. - if (CmpInst->getParent() == GuardBlock) - return GuardBlock->size() == 2; - else - return GuardBlock->size() == 1; - } - - return false; - } - - bool isEmptyPreheader(const FusionCandidate &FC) const { - assert(FC.Preheader && "Expecting a valid preheader"); - return FC.Preheader->size() == 1; - } - - bool isEmptyExitBlock(const FusionCandidate &FC) const { - assert(FC.ExitBlock && "Expecting a valid exit block"); - return FC.ExitBlock->size() == 1; - } - /// Simplify the condition of the latch branch of \p FC to true, when both of /// its successors are the same. void simplifyLatchBranch(const FusionCandidate &FC) const { @@ -1390,6 +1365,14 @@ private: BasicBlock *FC0NonLoopBlock = FC0.getNonLoopBlock(); BasicBlock *FC1NonLoopBlock = FC1.getNonLoopBlock(); + // Move instructions from the exit block of FC0 to the beginning of the exit + // block of FC1. + moveInstructionsToTheBeginning(*FC0.ExitBlock, *FC1.ExitBlock, DT, PDT, DI); + + // Move instructions from the guard block of FC1 to the end of the guard + // block of FC0. + moveInstructionsToTheEnd(*FC1GuardBlock, *FC0GuardBlock, DT, PDT, DI); + assert(FC0NonLoopBlock == FC1GuardBlock && "Loops are not adjacent"); SmallVector TreeUpdates; diff --git a/llvm/test/Transforms/LoopFusion/diagnostics_missed.ll b/llvm/test/Transforms/LoopFusion/diagnostics_missed.ll index fe88f3d..7bfc0e9 100644 --- a/llvm/test/Transforms/LoopFusion/diagnostics_missed.ll +++ b/llvm/test/Transforms/LoopFusion/diagnostics_missed.ll @@ -213,6 +213,93 @@ for.second: for.end: ret void } + +; CHECK: remark: diagnostics_missed.c:67:3: [unsafe_exitblock]: for.first.preheader and for.second.preheader: Candidate has a non-empty exit block with instructions that cannot be moved +define void @unsafe_exitblock(i32* noalias %A, i32* noalias %B, i64 %N) { +for.first.guard: + %cmp3 = icmp slt i64 0, %N + br i1 %cmp3, label %for.first.preheader, label %for.second.guard + +for.first.preheader: + br label %for.first, !dbg !83 + +for.first: + %i.04 = phi i64 [ %inc, %for.first ], [ 0, %for.first.preheader ] + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %i.04 + store i32 0, i32* %arrayidx, align 4 + %inc = add nsw i64 %i.04, 1 + %cmp = icmp slt i64 %inc, %N + br i1 %cmp, label %for.first, label %for.first.exit + +for.first.exit: + call void @bar() + br label %for.second.guard + +for.second.guard: + %cmp21 = icmp slt i64 0, %N + br i1 %cmp21, label %for.second.preheader, label %for.end + +for.second.preheader: + br label %for.second + +for.second: + %j.02 = phi i64 [ %inc6, %for.second ], [ 0, %for.second.preheader ] + %arrayidx4 = getelementptr inbounds i32, i32* %B, i64 %j.02 + store i32 0, i32* %arrayidx4, align 4 + %inc6 = add nsw i64 %j.02, 1 + %cmp2 = icmp slt i64 %inc6, %N + br i1 %cmp2, label %for.second, label %for.second.exit + +for.second.exit: + br label %for.end + +for.end: + ret void +} + +; CHECK: remark: diagnostics_missed.c:72:3: [unsafe_guardblock]: for.first.preheader and for.second.preheader: Candidate has a non-empty guard block with instructions that cannot be moved +define void @unsafe_guardblock(i32* noalias %A, i32* noalias %B, i64 %N) { +for.first.guard: + %cmp3 = icmp slt i64 0, %N + br i1 %cmp3, label %for.first.preheader, label %for.second.guard + +for.first.preheader: + br label %for.first, !dbg !86 + +for.first: + %i.04 = phi i64 [ %inc, %for.first ], [ 0, %for.first.preheader ] + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %i.04 + store i32 0, i32* %arrayidx, align 4 + %inc = add nsw i64 %i.04, 1 + %cmp = icmp slt i64 %inc, %N + br i1 %cmp, label %for.first, label %for.first.exit + +for.first.exit: + br label %for.second.guard + +for.second.guard: + call void @bar() + %cmp21 = icmp slt i64 0, %N + br i1 %cmp21, label %for.second.preheader, label %for.end + +for.second.preheader: + br label %for.second + +for.second: + %j.02 = phi i64 [ %inc6, %for.second ], [ 0, %for.second.preheader ] + %arrayidx4 = getelementptr inbounds i32, i32* %B, i64 %j.02 + store i32 0, i32* %arrayidx4, align 4 + %inc6 = add nsw i64 %j.02, 1 + %cmp2 = icmp slt i64 %inc6, %N + br i1 %cmp2, label %for.second, label %for.second.exit + +for.second.exit: + br label %for.end + +for.end: + ret void +} + declare void @bar() attributes #0 = { nounwind readnone speculatable willreturn } @@ -301,3 +388,9 @@ attributes #0 = { nounwind readnone speculatable willreturn } !78 = !{} !79 = distinct !DILexicalBlock(scope: !77, file: !3, line: 3, column: 5) !80 = !DILocation(line: 62, column: 3, scope: !79) +!81 = distinct !DISubprogram(name: "unsafe_exitblock", scope: !3, file: !3, line: 65, type: !15, scopeLine: 60, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !2, retainedNodes: !78) +!82 = distinct !DILexicalBlock(scope: !81, file: !3, line: 3, column: 5) +!83 = !DILocation(line: 67, column: 3, scope: !82) +!84 = distinct !DISubprogram(name: "unsafe_guardblock", scope: !3, file: !3, line: 70, type: !15, scopeLine: 60, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !2, retainedNodes: !78) +!85 = distinct !DILexicalBlock(scope: !84, file: !3, line: 3, column: 5) +!86 = !DILocation(line: 72, column: 3, scope: !85) diff --git a/llvm/test/Transforms/LoopFusion/guarded.ll b/llvm/test/Transforms/LoopFusion/guarded.ll index 9d64a39..e3fc58c 100644 --- a/llvm/test/Transforms/LoopFusion/guarded.ll +++ b/llvm/test/Transforms/LoopFusion/guarded.ll @@ -119,3 +119,116 @@ for.second.exit: for.end: ret void } + +; Test that `%add` is moved in for.second.exit, and the two loops for.first +; and for.second are fused. + +; CHECK: void @moveinsts_exitblock +; CHECK-LABEL: for.first.guard: +; CHECK: br i1 %cmp.guard, label %for.first.preheader, label %for.end +; CHECK-LABEL: for.first.preheader: +; CHECK-NEXT: br label %for.first +; CHECK-LABEL: for.first: +; CHECK: br i1 %cmp.j, label %for.first, label %for.second.exit +; CHECK-LABEL: for.second.exit: +; CHECK-NEXT: %add = add nsw i32 %x, 1 +; CHECK-NEXT: br label %for.end +; CHECK-LABEL: for.end: +; CHECK-NEXT: ret void +define void @moveinsts_exitblock(i32* noalias %A, i32* noalias %B, i64 %N, i32 %x) { +for.first.guard: + %cmp.guard = icmp slt i64 0, %N + br i1 %cmp.guard, label %for.first.preheader, label %for.second.guard + +for.first.preheader: + br label %for.first + +for.first: + %i.04 = phi i64 [ %inc, %for.first ], [ 0, %for.first.preheader ] + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %i.04 + store i32 0, i32* %arrayidx, align 4 + %inc = add nsw i64 %i.04, 1 + %cmp = icmp slt i64 %inc, %N + br i1 %cmp, label %for.first, label %for.first.exit + +for.first.exit: + %add = add nsw i32 %x, 1 + br label %for.second.guard + +for.second.guard: + br i1 %cmp.guard, label %for.second.preheader, label %for.end + +for.second.preheader: + br label %for.second + +for.second: + %j.02 = phi i64 [ %inc6, %for.second ], [ 0, %for.second.preheader ] + %arrayidx4 = getelementptr inbounds i32, i32* %B, i64 %j.02 + store i32 0, i32* %arrayidx4, align 4 + %inc6 = add nsw i64 %j.02, 1 + %cmp.j = icmp slt i64 %inc6, %N + br i1 %cmp.j, label %for.second, label %for.second.exit + +for.second.exit: + br label %for.end + +for.end: + ret void +} + +; Test that `%add` is moved in for.first.guard, and the two loops for.first +; and for.second are fused. + +; CHECK: void @moveinsts_guardblock +; CHECK-LABEL: for.first.guard: +; CHECK-NEXT: %cmp.guard = icmp slt i64 0, %N +; CHECK-NEXT: %add = add nsw i32 %x, 1 +; CHECK: br i1 %cmp.guard, label %for.first.preheader, label %for.end +; CHECK-LABEL: for.first.preheader: +; CHECK-NEXT: br label %for.first +; CHECK-LABEL: for.first: +; CHECK: br i1 %cmp.j, label %for.first, label %for.second.exit +; CHECK-LABEL: for.second.exit: +; CHECK-NEXT: br label %for.end +; CHECK-LABEL: for.end: +; CHECK-NEXT: ret void +define void @moveinsts_guardblock(i32* noalias %A, i32* noalias %B, i64 %N, i32 %x) { +for.first.guard: + %cmp.guard = icmp slt i64 0, %N + br i1 %cmp.guard, label %for.first.preheader, label %for.second.guard + +for.first.preheader: + br label %for.first + +for.first: + %i.04 = phi i64 [ %inc, %for.first ], [ 0, %for.first.preheader ] + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %i.04 + store i32 0, i32* %arrayidx, align 4 + %inc = add nsw i64 %i.04, 1 + %cmp = icmp slt i64 %inc, %N + br i1 %cmp, label %for.first, label %for.first.exit + +for.first.exit: + br label %for.second.guard + +for.second.guard: + %add = add nsw i32 %x, 1 + br i1 %cmp.guard, label %for.second.preheader, label %for.end + +for.second.preheader: + br label %for.second + +for.second: + %j.02 = phi i64 [ %inc6, %for.second ], [ 0, %for.second.preheader ] + %arrayidx4 = getelementptr inbounds i32, i32* %B, i64 %j.02 + store i32 0, i32* %arrayidx4, align 4 + %inc6 = add nsw i64 %j.02, 1 + %cmp.j = icmp slt i64 %inc6, %N + br i1 %cmp.j, label %for.second, label %for.second.exit + +for.second.exit: + br label %for.end + +for.end: + ret void +} -- 2.7.4