From 67671024c8cbc3d56c173073113b158b2934487b Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 10 Jun 2020 10:23:37 +0100 Subject: [PATCH] [DSE,MSSA] Relax post-dom restriction for objs visible after return. This patch relaxes the post-dominance requirement for accesses to objects visible after the function returns. Instead of requiring the killing def to post-dominate the access to eliminate, the set of 'killing blocks' (= blocks that completely overwrite the original access) is collected. If all paths from the access to eliminate and an exit block go through a killing block, the access can be removed. To check this property, we first get the common post-dominator block for the killing blocks. If this block does not post-dominate the access block, there may be a path from DomAccess to an exit block not involving any killing block. Otherwise we have to check if there is a path from the DomAccess to the common post-dominator, that does not contain a killing block. If there is no such path, we can remove DomAccess. For this check, we start at the common post-dominator and then traverse the CFG backwards. Paths are terminated when we hit a killing block or a block that is not executed between DomAccess and a killing block according to the post-order numbering (if the post order number of a block is greater than the one of DomAccess, the block cannot be in in a path starting at DomAccess). This gives the following improvements on the total number of stores after DSE for MultiSource, SPEC2K, SPEC2006: Tests: 237 Same hash: 206 (filtered out) Remaining: 31 Metric: dse.NumRemainingStores Program base new100 diff test-suite...CFP2000/188.ammp/188.ammp.test 3624.00 3544.00 -2.2% test-suite...ch/g721/g721encode/encode.test 128.00 126.00 -1.6% test-suite.../Benchmarks/Olden/mst/mst.test 73.00 72.00 -1.4% test-suite...CFP2006/433.milc/433.milc.test 3202.00 3163.00 -1.2% test-suite...000/186.crafty/186.crafty.test 5062.00 5010.00 -1.0% test-suite...-typeset/consumer-typeset.test 40460.00 40248.00 -0.5% test-suite...Source/Benchmarks/sim/sim.test 642.00 639.00 -0.5% test-suite...nchmarks/McCat/09-vor/vor.test 642.00 644.00 0.3% test-suite...lications/sqlite3/sqlite3.test 35664.00 35563.00 -0.3% test-suite...T2000/300.twolf/300.twolf.test 7202.00 7184.00 -0.2% test-suite...lications/ClamAV/clamscan.test 19475.00 19444.00 -0.2% test-suite...INT2000/164.gzip/164.gzip.test 2199.00 2196.00 -0.1% test-suite...peg2/mpeg2dec/mpeg2decode.test 2380.00 2378.00 -0.1% test-suite.../Benchmarks/Bullet/bullet.test 39335.00 39309.00 -0.1% test-suite...:: External/Povray/povray.test 36951.00 36927.00 -0.1% test-suite...marks/7zip/7zip-benchmark.test 67396.00 67356.00 -0.1% test-suite...6/464.h264ref/464.h264ref.test 31497.00 31481.00 -0.1% test-suite...006/453.povray/453.povray.test 51441.00 51416.00 -0.0% test-suite...T2006/401.bzip2/401.bzip2.test 4450.00 4448.00 -0.0% test-suite...Applications/kimwitu++/kc.test 23481.00 23471.00 -0.0% test-suite...chmarks/MallocBench/gs/gs.test 6286.00 6284.00 -0.0% test-suite.../CINT2000/254.gap/254.gap.test 13719.00 13715.00 -0.0% test-suite.../Applications/SPASS/SPASS.test 30345.00 30338.00 -0.0% test-suite...006/450.soplex/450.soplex.test 15018.00 15016.00 -0.0% test-suite...ications/JM/lencod/lencod.test 27780.00 27777.00 -0.0% test-suite.../CINT2006/403.gcc/403.gcc.test 105285.00 105276.00 -0.0% There might be potential to pre-compute some of the information of which blocks are on the path to an exit for each block, but the overall benefit might be comparatively small. On the set of benchmarks, 15738 times out of 20322 we reach the CFG check, the CFG check is successful. The total number of iterations in the CFG check is 187810, so on average we need less than 10 steps in the check loop. Bumping the threshold in the loop from 50 to 150 gives a few small improvements, but I don't think they warrant such a big bump at the moment. This is all pending further tuning in the future. Reviewers: dmgreen, bryant, asbirlea, Tyker, efriedma, george.burgess.iv Reviewed By: george.burgess.iv Differential Revision: https://reviews.llvm.org/D78932 --- .../lib/Transforms/Scalar/DeadStoreElimination.cpp | 120 ++++++++++++++++++--- .../MSSA/multiblock-memintrinsics.ll | 3 +- .../MSSA/multiblock-multipath.ll | 21 ++-- .../DeadStoreElimination/MSSA/multiblock-simple.ll | 6 +- 4 files changed, 121 insertions(+), 29 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp index daf17e4..7c40fc5 100644 --- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -83,6 +83,9 @@ STATISTIC(NumFastOther, "Number of other instrs removed"); STATISTIC(NumCompletePartials, "Number of stores dead by later partials"); STATISTIC(NumModifiedStores, "Number of stores modified"); STATISTIC(NumNoopStores, "Number of noop stores deleted"); +STATISTIC(NumCFGChecks, "Number of stores modified"); +STATISTIC(NumCFGTries, "Number of stores modified"); +STATISTIC(NumCFGSuccess, "Number of stores modified"); DEBUG_COUNTER(MemorySSACounter, "dse-memoryssa", "Controls which MemoryDefs are eliminated."); @@ -111,6 +114,11 @@ static cl::opt MemorySSADefsPerBlockLimit( cl::desc("The number of MemoryDefs we consider as candidates to eliminated " "other stores per basic block (default = 5000)")); +static cl::opt MemorySSAPathCheckLimit( + "dse-memoryssa-path-check-limit", cl::init(50), cl::Hidden, + cl::desc("The maximum number of blocks to check when trying to prove that " + "all paths to an exit go through a killing block (default = 50)")); + //===----------------------------------------------------------------------===// // Helper functions //===----------------------------------------------------------------------===// @@ -1591,15 +1599,15 @@ struct DSEState { if (CB->onlyAccessesInaccessibleMemory()) return false; - ModRefInfo MR = AA.getModRefInfo(UseInst, DefLoc); - // If necessary, perform additional analysis. - if (isModSet(MR) && isa(UseInst)) - MR = AA.callCapturesBefore(UseInst, DefLoc, &DT); + int64_t InstWriteOffset, DepWriteOffset; + auto CC = getLocForWriteEx(UseInst); + InstOverlapIntervalsTy IOL; + + const DataLayout &DL = F.getParent()->getDataLayout(); - Optional UseLoc = getLocForWriteEx(UseInst); - return isModSet(MR) && isMustSet(MR) && - (UseLoc->Size.hasValue() && DefLoc.Size.hasValue() && - UseLoc->Size.getValue() >= DefLoc.Size.getValue()); + return CC && + isOverwrite(DefLoc, *CC, DL, TLI, DepWriteOffset, InstWriteOffset, + UseInst, IOL, AA, &F) == OW_Complete; } /// Returns true if \p Use may read from \p DefLoc. @@ -1653,19 +1661,20 @@ struct DSEState { if (isa(DomAccess)) break; - // Check if we can skip DomDef for DSE. For accesses to objects that are - // accessible after the function returns, KillingDef must execute whenever - // DomDef executes and use post-dominance to ensure that. + // Check if we can skip DomDef for DSE. MemoryDef *DomDef = dyn_cast(DomAccess); - if ((DomDef && canSkipDef(DomDef, DefVisibleToCallerBeforeRet)) || - (DefVisibleToCallerAfterRet && - !PDT.dominates(KillingDef->getBlock(), DomDef->getBlock()))) { + if (DomDef && canSkipDef(DomDef, DefVisibleToCallerBeforeRet)) { StepAgain = true; Current = DomDef->getDefiningAccess(); } } while (StepAgain); + // Accesses to objects accessible after the function returns can only be + // eliminated if the access is killed along all paths to the exit. Collect + // the blocks with killing (=completely overwriting MemoryDefs) and check if + // they cover all paths from DomAccess to any function exit. + SmallPtrSet KillingBlocks = {KillingDef->getBlock()}; LLVM_DEBUG({ dbgs() << " Checking for reads of " << *DomAccess; if (isa(DomAccess)) @@ -1733,11 +1742,92 @@ struct DSEState { // 3 = Def(1) ; <---- Current (3, 2) = NoAlias, (3,1) = MayAlias, // stores [0,1] if (MemoryDef *UseDef = dyn_cast(UseAccess)) { - if (!isCompleteOverwrite(DefLoc, UseInst)) + if (isCompleteOverwrite(DefLoc, UseInst)) { + if (DefVisibleToCallerAfterRet && UseAccess != DomAccess) { + BasicBlock *MaybeKillingBlock = UseInst->getParent(); + if (PostOrderNumbers.find(MaybeKillingBlock)->second < + PostOrderNumbers.find(DomAccess->getBlock())->second) { + + LLVM_DEBUG(dbgs() << " ... found killing block " + << MaybeKillingBlock->getName() << "\n"); + KillingBlocks.insert(MaybeKillingBlock); + } + } + } else PushMemUses(UseDef); } } + // For accesses to locations visible after the function returns, make sure + // that the location is killed (=overwritten) along all paths from DomAccess + // to the exit. + if (DefVisibleToCallerAfterRet) { + assert(!KillingBlocks.empty() && + "Expected at least a single killing block"); + // Find the common post-dominator of all killing blocks. + BasicBlock *CommonPred = *KillingBlocks.begin(); + for (auto I = std::next(KillingBlocks.begin()), E = KillingBlocks.end(); + I != E; I++) { + if (!CommonPred) + break; + CommonPred = PDT.findNearestCommonDominator(CommonPred, *I); + } + + // If CommonPred is in the set of killing blocks, just check if it + // post-dominates DomAccess. + if (KillingBlocks.count(CommonPred)) { + if (PDT.dominates(CommonPred, DomAccess->getBlock())) + return {DomAccess}; + return None; + } + + // If the common post-dominator does not post-dominate DomAccess, there + // is a path from DomAccess to an exit not going through a killing block. + if (PDT.dominates(CommonPred, DomAccess->getBlock())) { + SetVector WorkList; + + // DomAccess's post-order number provides an upper bound of the blocks + // on a path starting at DomAccess. + unsigned UpperBound = + PostOrderNumbers.find(DomAccess->getBlock())->second; + + // If CommonPred is null, there are multiple exits from the function. + // They all have to be added to the worklist. + if (CommonPred) + WorkList.insert(CommonPred); + else + for (BasicBlock *R : PDT.getRoots()) { + if (!DT.isReachableFromEntry(R)) + continue; + WorkList.insert(R); + } + + NumCFGTries++; + // Check if all paths starting from an exit node go through one of the + // killing blocks before reaching DomAccess. + for (unsigned I = 0; I < WorkList.size(); I++) { + NumCFGChecks++; + BasicBlock *Current = WorkList[I]; + if (KillingBlocks.count(Current)) + continue; + if (Current == DomAccess->getBlock()) + return None; + unsigned CPO = PostOrderNumbers.find(Current)->second; + // Current block is not on a path starting at DomAccess. + if (CPO > UpperBound) + continue; + for (BasicBlock *Pred : predecessors(Current)) + WorkList.insert(Pred); + + if (WorkList.size() >= MemorySSAPathCheckLimit) + return None; + } + NumCFGSuccess++; + return {DomAccess}; + } + return None; + } + // No aliasing MemoryUses of DomAccess found, DomAccess is potentially dead. return {DomAccess}; } diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-memintrinsics.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-memintrinsics.ll index 4500c31..bb7c556 100644 --- a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-memintrinsics.ll +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-memintrinsics.ll @@ -46,7 +46,8 @@ define void @accessible_after_return_2(i32* noalias %P, i1 %c) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ARRAYIDX0:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 ; CHECK-NEXT: [[P3:%.*]] = bitcast i32* [[ARRAYIDX0]] to i8* -; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 4 [[P3]], i8 0, i64 28, i1 false) +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[P3]], i64 4 +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 4 [[TMP0]], i8 0, i64 24, i1 false) ; CHECK-NEXT: br i1 [[C:%.*]], label [[BB1:%.*]], label [[BB2:%.*]] ; CHECK: bb1: ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 1 diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-multipath.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-multipath.ll index 78f97d9..7efd6ef 100644 --- a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-multipath.ll +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-multipath.ll @@ -9,10 +9,9 @@ declare void @use(i32 *) define void @accessible_after_return_1(i32* noalias %P, i1 %c1) { ; CHECK-LABEL: @accessible_after_return_1( -; CHECK-NEXT: store i32 1, i32* [[P:%.*]], align 4 ; CHECK-NEXT: br i1 [[C1:%.*]], label [[BB1:%.*]], label [[BB2:%.*]] ; CHECK: bb1: -; CHECK-NEXT: store i32 0, i32* [[P]], align 4 +; CHECK-NEXT: store i32 0, i32* [[P:%.*]], align 4 ; CHECK-NEXT: br label [[BB5:%.*]] ; CHECK: bb2: ; CHECK-NEXT: store i32 3, i32* [[P]], align 4 @@ -38,10 +37,9 @@ bb5: define void @accessible_after_return_2(i32* noalias %P, i1 %c.1, i1 %c.2) { ; CHECK-LABEL: @accessible_after_return_2( -; CHECK-NEXT: store i32 1, i32* [[P:%.*]], align 4 ; CHECK-NEXT: br i1 [[C_1:%.*]], label [[BB1:%.*]], label [[BB2:%.*]] ; CHECK: bb1: -; CHECK-NEXT: store i32 0, i32* [[P]], align 4 +; CHECK-NEXT: store i32 0, i32* [[P:%.*]], align 4 ; CHECK-NEXT: br label [[BB5:%.*]] ; CHECK: bb2: ; CHECK-NEXT: br i1 [[C_2:%.*]], label [[BB3:%.*]], label [[BB4:%.*]] @@ -77,6 +75,8 @@ bb5: ret void } +; Cannot remove store in entry block because it is not overwritten on path +; entry->bb2->bb5. define void @accessible_after_return_3(i32* noalias %P, i1 %c1) { ; CHECK-LABEL: @accessible_after_return_3( ; CHECK-NEXT: store i32 1, i32* [[P:%.*]], align 4 @@ -105,6 +105,8 @@ bb5: ret void } +; Cannot remove store in entry block because it is not overwritten on path +; entry->bb2->bb5. define void @accessible_after_return_4(i32* noalias %P, i1 %c1) { ; CHECK-LABEL: @accessible_after_return_4( ; CHECK-NEXT: store i32 1, i32* [[P:%.*]], align 4 @@ -179,12 +181,11 @@ bb5: define void @accessible_after_return6(i32* %P, i1 %c.1, i1 %c.2) { ; CHECK-LABEL: @accessible_after_return6( ; CHECK-NEXT: entry: -; CHECK-NEXT: store i32 0, i32* [[P:%.*]], align 4 ; CHECK-NEXT: br i1 [[C_1:%.*]], label [[BB1:%.*]], label [[BB2:%.*]] ; CHECK: bb1: ; CHECK-NEXT: br i1 [[C_2:%.*]], label [[BB3:%.*]], label [[BB4:%.*]] ; CHECK: bb2: -; CHECK-NEXT: store i32 1, i32* [[P]], align 4 +; CHECK-NEXT: store i32 1, i32* [[P:%.*]], align 4 ; CHECK-NEXT: ret void ; CHECK: bb3: ; CHECK-NEXT: store i32 2, i32* [[P]], align 4 @@ -220,10 +221,9 @@ define void @accessible_after_return7(i32* %P, i1 %c.1, i1 %c.2) { ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 [[C_1:%.*]], label [[BB1:%.*]], label [[BB2:%.*]] ; CHECK: bb1: -; CHECK-NEXT: store i32 0, i32* [[P:%.*]], align 4 ; CHECK-NEXT: br i1 [[C_2:%.*]], label [[BB3:%.*]], label [[BB4:%.*]] ; CHECK: bb3: -; CHECK-NEXT: store i32 2, i32* [[P]], align 4 +; CHECK-NEXT: store i32 2, i32* [[P:%.*]], align 4 ; CHECK-NEXT: br label [[BB5:%.*]] ; CHECK: bb4: ; CHECK-NEXT: store i32 1, i32* [[P]], align 4 @@ -257,7 +257,7 @@ bb5: ; Cannot remove store in entry block, because it is overwritten along each path to -; the exit (entry->bb1->bb5->bb5). +; the exit (entry->bb1->bb4->bb5). define void @accessible_after_return8(i32* %P, i1 %c.1, i1 %c.2) { ; CHECK-LABEL: @accessible_after_return8( ; CHECK-NEXT: entry: @@ -353,6 +353,9 @@ for.end: ret void } +; Cannot remove store in entry block because it is not overwritten on path +; entry->bb2->bb4. Also make sure we deal with dead exit blocks without +; crashing. define void @accessible_after_return10_dead_block(i32* %P, i1 %c.1, i1 %c.2) { ; CHECK-LABEL: @accessible_after_return10_dead_block( ; CHECK-NEXT: entry: diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-simple.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-simple.ll index 09abd72..718e55c 100644 --- a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-simple.ll +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-simple.ll @@ -199,10 +199,9 @@ bb3: define void @test12(i32* %P) { ; CHECK-LABEL: @test12( -; CHECK-NEXT: store i32 0, i32* [[P:%.*]], align 4 ; CHECK-NEXT: br i1 true, label [[BB1:%.*]], label [[BB2:%.*]] ; CHECK: bb1: -; CHECK-NEXT: store i32 1, i32* [[P]], align 4 +; CHECK-NEXT: store i32 1, i32* [[P:%.*]], align 4 ; CHECK-NEXT: br label [[BB3:%.*]] ; CHECK: bb2: ; CHECK-NEXT: store i32 1, i32* [[P]], align 4 @@ -225,10 +224,9 @@ bb3: define void @test13(i32* %P) { ; CHECK-LABEL: @test13( -; CHECK-NEXT: store i32 0, i32* [[P:%.*]], align 4 ; CHECK-NEXT: br i1 true, label [[BB1:%.*]], label [[BB2:%.*]] ; CHECK: bb1: -; CHECK-NEXT: store i32 1, i32* [[P]], align 4 +; CHECK-NEXT: store i32 1, i32* [[P:%.*]], align 4 ; CHECK-NEXT: br label [[BB3:%.*]] ; CHECK: bb2: ; CHECK-NEXT: store i32 1, i32* [[P]], align 4 -- 2.7.4