From 222aeb4d51a46c5a81c9e4ccb16d1d19dd21ec95 Mon Sep 17 00:00:00 2001 From: David Green Date: Mon, 31 May 2021 10:22:37 +0100 Subject: [PATCH] [DSE] Remove stores in the same loop iteration DSE will currently only remove stores in the same block unless they can be guaranteed to be loop invariant. This expands that to any stores that are in the same Loop, at the same loop level. This should still account for where AA/MSSA will not handle aliasing between loops, but allow the dead stores to be removed where they overlap in the same loop iteration. It requires adding loop info to DSE, but that looks fairly harmless. The test case this helps is from code like this, which can come up in certain matrix operations: for(i=..) dst[i] = 0; for(j=..) dst[i] += src[i*n+j]; After LICM, this becomes: for(i=..) dst[i] = 0; sum = 0; for(j=..) sum += src[i*n+j]; dst[i] = sum; The first store is dead, and with this patch is now removed. Differntial Revision: https://reviews.llvm.org/D100464 --- .../lib/Transforms/Scalar/DeadStoreElimination.cpp | 78 ++++++++++++++++------ llvm/test/CodeGen/AMDGPU/opt-pipeline.ll | 4 +- llvm/test/Other/opt-O2-pipeline.ll | 2 +- llvm/test/Other/opt-O3-pipeline-enable-matrix.ll | 2 +- llvm/test/Other/opt-O3-pipeline.ll | 2 +- llvm/test/Other/opt-Os-pipeline.ll | 2 +- .../DeadStoreElimination/multiblock-loops.ll | 12 +++- 7 files changed, 74 insertions(+), 28 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp index 94f3e0d..e6b9c2a 100644 --- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -40,10 +40,12 @@ #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/CaptureTracking.h" #include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/MemoryLocation.h" #include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/MemorySSAUpdater.h" +#include "llvm/Analysis/MustExecute.h" #include "llvm/Analysis/PostDominators.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" @@ -853,6 +855,11 @@ struct DSEState { PostDominatorTree &PDT; const TargetLibraryInfo &TLI; const DataLayout &DL; + const LoopInfo &LI; + + // Whether the function contains any irreducible control flow, useful for + // being accurately able to detect loops. + bool ContainsIrreducibleLoops; // All MemoryDefs that potentially could kill other MemDefs. SmallVector MemDefs; @@ -876,14 +883,15 @@ struct DSEState { DenseMap IOLs; DSEState(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, DominatorTree &DT, - PostDominatorTree &PDT, const TargetLibraryInfo &TLI) + PostDominatorTree &PDT, const TargetLibraryInfo &TLI, + const LoopInfo &LI) : F(F), AA(AA), BatchAA(AA), MSSA(MSSA), DT(DT), PDT(PDT), TLI(TLI), - DL(F.getParent()->getDataLayout()) {} + DL(F.getParent()->getDataLayout()), LI(LI) {} static DSEState get(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, DominatorTree &DT, PostDominatorTree &PDT, - const TargetLibraryInfo &TLI) { - DSEState State(F, AA, MSSA, DT, PDT, TLI); + const TargetLibraryInfo &TLI, const LoopInfo &LI) { + DSEState State(F, AA, MSSA, DT, PDT, TLI, LI); // Collect blocks with throwing instructions not modeled in MemorySSA and // alloc-like objects. unsigned PO = 0; @@ -911,6 +919,9 @@ struct DSEState { State.InvisibleToCallerAfterRet.insert({&AI, true}); } + // Collect whether there is any irreducible control flow in the function. + State.ContainsIrreducibleLoops = mayContainIrreducibleControl(F, &LI); + return State; } @@ -925,6 +936,12 @@ struct DSEState { isOverwrite(const Instruction *LaterI, const Instruction *EarlierI, const MemoryLocation &Later, const MemoryLocation &Earlier, int64_t &EarlierOff, int64_t &LaterOff) { + // AliasAnalysis does not always account for loops. Limit overwrite checks + // to dependencies for which we can guarantee they are independant of any + // loops they are in. + if (!isGuaranteedLoopIndependent(EarlierI, LaterI, Earlier)) + return OW_Unknown; + // FIXME: Vet that this works for size upper-bounds. Seems unlikely that we'll // get imprecise values here, though (except for unknown sizes). if (!Later.Size.isPrecise() || !Earlier.Size.isPrecise()) { @@ -1250,11 +1267,27 @@ struct DSEState { return isRefSet(BatchAA.getModRefInfo(UseInst, DefLoc)); } - /// Returns true if \p Ptr is guaranteed to be loop invariant for any possible - /// loop. In particular, this guarantees that it only references a single - /// MemoryLocation during execution of the containing function. - bool IsGuaranteedLoopInvariant(Value *Ptr) { - auto IsGuaranteedLoopInvariantBase = [this](Value *Ptr) { + /// Returns true if a dependency between \p Current and \p KillingDef is + /// guaranteed to be loop invariant for the loops that they are in. Either + /// because they are known to be in the same block, in the same loop level or + /// by guaranteeing that \p CurrentLoc only references a single MemoryLocation + /// during execution of the containing function. + bool isGuaranteedLoopIndependent(const Instruction *Current, + const Instruction *KillingDef, + const MemoryLocation &CurrentLoc) { + // If the dependency is within the same block or loop level (being careful of + // irreducible loops), we know that AA will return a valid result for the + // memory dependency. (Both at the function level, outside of any loop, + // would also be valid but we currently disable that to limit compile time). + if (Current->getParent() == KillingDef->getParent()) + return true; + const Loop *CurrentLI = LI.getLoopFor(Current->getParent()); + if (!ContainsIrreducibleLoops && CurrentLI && + CurrentLI == LI.getLoopFor(KillingDef->getParent())) + return true; + + // Otherwise check the memory location is invariant to any loops. + auto IsGuaranteedLoopInvariantBase = [this](const Value *Ptr) { Ptr = Ptr->stripPointerCasts(); if (auto *I = dyn_cast(Ptr)) { if (isa(Ptr)) @@ -1268,7 +1301,7 @@ struct DSEState { return true; }; - Ptr = Ptr->stripPointerCasts(); + const Value *Ptr = CurrentLoc.Ptr->stripPointerCasts(); if (auto *I = dyn_cast(Ptr)) { if (I->getParent()->isEntryBlock()) return true; @@ -1398,9 +1431,9 @@ struct DSEState { // AliasAnalysis does not account for loops. Limit elimination to // candidates for which we can guarantee they always store to the same - // memory location and not multiple locations in a loop. - if (Current->getBlock() != KillingDef->getBlock() && - !IsGuaranteedLoopInvariant(const_cast(CurrentLoc->Ptr))) { + // memory location and not located in different loops. + if (!isGuaranteedLoopIndependent(CurrentI, KillingI, *CurrentLoc)) { + LLVM_DEBUG(dbgs() << " ... not guaranteed loop independent\n"); StepAgain = true; Current = CurrentDef->getDefiningAccess(); WalkerStepLimit -= 1; @@ -1836,12 +1869,13 @@ struct DSEState { } }; -bool eliminateDeadStores(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, - DominatorTree &DT, PostDominatorTree &PDT, - const TargetLibraryInfo &TLI) { +static bool eliminateDeadStores(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, + DominatorTree &DT, PostDominatorTree &PDT, + const TargetLibraryInfo &TLI, + const LoopInfo &LI) { bool MadeChange = false; - DSEState State = DSEState::get(F, AA, MSSA, DT, PDT, TLI); + DSEState State = DSEState::get(F, AA, MSSA, DT, PDT, TLI, LI); // For each store: for (unsigned I = 0; I < State.MemDefs.size(); I++) { MemoryDef *KillingDef = State.MemDefs[I]; @@ -2014,8 +2048,9 @@ PreservedAnalyses DSEPass::run(Function &F, FunctionAnalysisManager &AM) { DominatorTree &DT = AM.getResult(F); MemorySSA &MSSA = AM.getResult(F).getMSSA(); PostDominatorTree &PDT = AM.getResult(F); + LoopInfo &LI = AM.getResult(F); - bool Changed = eliminateDeadStores(F, AA, MSSA, DT, PDT, TLI); + bool Changed = eliminateDeadStores(F, AA, MSSA, DT, PDT, TLI, LI); #ifdef LLVM_ENABLE_STATS if (AreStatisticsEnabled()) @@ -2029,6 +2064,7 @@ PreservedAnalyses DSEPass::run(Function &F, FunctionAnalysisManager &AM) { PreservedAnalyses PA; PA.preserveSet(); PA.preserve(); + PA.preserve(); return PA; } @@ -2054,8 +2090,9 @@ public: MemorySSA &MSSA = getAnalysis().getMSSA(); PostDominatorTree &PDT = getAnalysis().getPostDomTree(); + LoopInfo &LI = getAnalysis().getLoopInfo(); - bool Changed = eliminateDeadStores(F, AA, MSSA, DT, PDT, TLI); + bool Changed = eliminateDeadStores(F, AA, MSSA, DT, PDT, TLI, LI); #ifdef LLVM_ENABLE_STATS if (AreStatisticsEnabled()) @@ -2077,6 +2114,8 @@ public: AU.addRequired(); AU.addPreserved(); AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); } }; @@ -2093,6 +2132,7 @@ INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass) INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_END(DSELegacyPass, "dse", "Dead Store Elimination", false, false) diff --git a/llvm/test/CodeGen/AMDGPU/opt-pipeline.ll b/llvm/test/CodeGen/AMDGPU/opt-pipeline.ll index bce7bfc..c188981 100644 --- a/llvm/test/CodeGen/AMDGPU/opt-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/opt-pipeline.ll @@ -515,8 +515,8 @@ ; GCN-O2-NEXT: Function Alias Analysis Results ; GCN-O2-NEXT: Memory SSA ; GCN-O2-NEXT: MemCpy Optimization -; GCN-O2-NEXT: Dead Store Elimination ; GCN-O2-NEXT: Natural Loop Information +; GCN-O2-NEXT: Dead Store Elimination ; GCN-O2-NEXT: Canonicalize natural loops ; GCN-O2-NEXT: LCSSA Verifier ; GCN-O2-NEXT: Loop-Closed SSA Form Pass @@ -874,8 +874,8 @@ ; GCN-O3-NEXT: Function Alias Analysis Results ; GCN-O3-NEXT: Memory SSA ; GCN-O3-NEXT: MemCpy Optimization -; GCN-O3-NEXT: Dead Store Elimination ; GCN-O3-NEXT: Natural Loop Information +; GCN-O3-NEXT: Dead Store Elimination ; GCN-O3-NEXT: Canonicalize natural loops ; GCN-O3-NEXT: LCSSA Verifier ; GCN-O3-NEXT: Loop-Closed SSA Form Pass diff --git a/llvm/test/Other/opt-O2-pipeline.ll b/llvm/test/Other/opt-O2-pipeline.ll index 36370d6..542a798 100644 --- a/llvm/test/Other/opt-O2-pipeline.ll +++ b/llvm/test/Other/opt-O2-pipeline.ll @@ -162,8 +162,8 @@ ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Memory SSA ; CHECK-NEXT: MemCpy Optimization -; CHECK-NEXT: Dead Store Elimination ; CHECK-NEXT: Natural Loop Information +; CHECK-NEXT: Dead Store Elimination ; CHECK-NEXT: Canonicalize natural loops ; CHECK-NEXT: LCSSA Verifier ; CHECK-NEXT: Loop-Closed SSA Form Pass diff --git a/llvm/test/Other/opt-O3-pipeline-enable-matrix.ll b/llvm/test/Other/opt-O3-pipeline-enable-matrix.ll index c39361d..4c5e126 100644 --- a/llvm/test/Other/opt-O3-pipeline-enable-matrix.ll +++ b/llvm/test/Other/opt-O3-pipeline-enable-matrix.ll @@ -167,8 +167,8 @@ ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Memory SSA ; CHECK-NEXT: MemCpy Optimization -; CHECK-NEXT: Dead Store Elimination ; CHECK-NEXT: Natural Loop Information +; CHECK-NEXT: Dead Store Elimination ; CHECK-NEXT: Canonicalize natural loops ; CHECK-NEXT: LCSSA Verifier ; CHECK-NEXT: Loop-Closed SSA Form Pass diff --git a/llvm/test/Other/opt-O3-pipeline.ll b/llvm/test/Other/opt-O3-pipeline.ll index af77e09..97b11fe 100644 --- a/llvm/test/Other/opt-O3-pipeline.ll +++ b/llvm/test/Other/opt-O3-pipeline.ll @@ -167,8 +167,8 @@ ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Memory SSA ; CHECK-NEXT: MemCpy Optimization -; CHECK-NEXT: Dead Store Elimination ; CHECK-NEXT: Natural Loop Information +; CHECK-NEXT: Dead Store Elimination ; CHECK-NEXT: Canonicalize natural loops ; CHECK-NEXT: LCSSA Verifier ; CHECK-NEXT: Loop-Closed SSA Form Pass diff --git a/llvm/test/Other/opt-Os-pipeline.ll b/llvm/test/Other/opt-Os-pipeline.ll index e24d3d3..b65e550 100644 --- a/llvm/test/Other/opt-Os-pipeline.ll +++ b/llvm/test/Other/opt-Os-pipeline.ll @@ -148,8 +148,8 @@ ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Memory SSA ; CHECK-NEXT: MemCpy Optimization -; CHECK-NEXT: Dead Store Elimination ; CHECK-NEXT: Natural Loop Information +; CHECK-NEXT: Dead Store Elimination ; CHECK-NEXT: Canonicalize natural loops ; CHECK-NEXT: LCSSA Verifier ; CHECK-NEXT: Loop-Closed SSA Form Pass diff --git a/llvm/test/Transforms/DeadStoreElimination/multiblock-loops.ll b/llvm/test/Transforms/DeadStoreElimination/multiblock-loops.ll index 6910209..b187808 100644 --- a/llvm/test/Transforms/DeadStoreElimination/multiblock-loops.ll +++ b/llvm/test/Transforms/DeadStoreElimination/multiblock-loops.ll @@ -111,7 +111,6 @@ define void @test_loop(i32 %N, i32* noalias nocapture readonly %A, i32* noalias ; CHECK: for.body4.lr.ph: ; CHECK-NEXT: [[I_028:%.*]] = phi i32 [ [[INC11:%.*]], [[FOR_COND_CLEANUP3:%.*]] ], [ 0, [[FOR_BODY4_LR_PH_PREHEADER]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[I_028]] -; CHECK-NEXT: store i32 0, i32* [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[I_028]], [[N]] ; CHECK-NEXT: br label [[FOR_BODY4:%.*]] ; CHECK: for.body4: @@ -626,10 +625,13 @@ exit: define i16 @partial_override_overloop(i1 %c, i32 %i) { ; CHECK-LABEL: @partial_override_overloop( ; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FIRST:%.*]] +; CHECK: first: ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i16], [10 x i16]* @x, i16 0, i32 [[I:%.*]] +; CHECK-NEXT: store i16 1, i16* [[ARRAYIDX]], align 1 ; CHECK-NEXT: br label [[DO_BODY:%.*]] ; CHECK: do.body: -; CHECK-NEXT: [[I_0:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[DO_BODY]] ] +; CHECK-NEXT: [[I_0:%.*]] = phi i16 [ 0, [[FIRST]] ], [ [[INC:%.*]], [[DO_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [10 x i16], [10 x i16]* @x, i16 0, i16 [[I_0]] ; CHECK-NEXT: store i16 2, i16* [[ARRAYIDX2]], align 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i16 [[I_0]], 4 @@ -643,12 +645,16 @@ define i16 @partial_override_overloop(i1 %c, i32 %i) { ; CHECK-NEXT: ret i16 0 ; entry: + ; Branch to first so MemoryLoc is not in the entry block. + br label %first + +first: %arrayidx = getelementptr inbounds [10 x i16], [10 x i16]* @x, i16 0, i32 %i store i16 1, i16* %arrayidx, align 1 br label %do.body do.body: - %i.0 = phi i16 [ 0, %entry ], [ %inc, %do.body ] + %i.0 = phi i16 [ 0, %first ], [ %inc, %do.body ] %arrayidx2 = getelementptr inbounds [10 x i16], [10 x i16]* @x, i16 0, i16 %i.0 store i16 2, i16* %arrayidx2, align 1 %exitcond = icmp eq i16 %i.0, 4 -- 2.7.4