From 5573abceab5eb9c6964b7249eae21f80a88ebf2e Mon Sep 17 00:00:00 2001
From: Evgeniy Brevnov
Date: Thu, 16 Jan 2020 15:17:43 +0700
Subject: [PATCH] [DependenceAnalysis] Dependencies for loads marked with
 "invariant.load" should not be shared with general accesses (PR42151).

Summary:
This is the second attempt to fix the problem with incorrect dependencies
reported in the presence of invariant loads. The initial fix
(https://reviews.llvm.org/D64405) was reverted due to a regression reported
in https://reviews.llvm.org/D70516.

The original fix changed the caching behavior for invariant loads: such loads
are not put into the second level cache (NonLocalDepInfo). The problem with
that fix is that the first level cache (CachedNonLocalPointerInfo) still works
as if invariant loads were in the second level cache. The solution is, in
addition to not putting dependence results into the second level cache, to
avoid putting info about invariant loads into the first level cache as well.
An illustrative IR sketch of the problematic pattern is included after the
patch.

Reviewers: jdoerfert, reames, hfinkel, efriedma

Reviewed By: jdoerfert

Subscribers: DaniilSuchkov, hiraditya, bmahjour, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D73027
---
 llvm/lib/Analysis/MemoryDependenceAnalysis.cpp     |  80 +++++++---
 .../MemoryDependenceAnalysis/InvariantLoad.ll      | 173 +++++++++++++++++++++
 2 files changed, 229 insertions(+), 24 deletions(-)
 create mode 100644 llvm/test/Analysis/MemoryDependenceAnalysis/InvariantLoad.ll

diff --git a/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp b/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp
index 450595c..ce9944a 100644
--- a/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp
+++ b/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp
@@ -902,6 +902,11 @@ MemDepResult MemoryDependenceResults::GetNonLocalInfoForBlock(
     Instruction *QueryInst, const MemoryLocation &Loc, bool isLoad,
     BasicBlock *BB, NonLocalDepInfo *Cache, unsigned NumSortedEntries) {
 
+  bool isInvariantLoad = false;
+
+  if (LoadInst *LI = dyn_cast_or_null<LoadInst>(QueryInst))
+    isInvariantLoad = LI->getMetadata(LLVMContext::MD_invariant_load);
+
   // Do a binary search to see if we already have an entry for this block in
   // the cache set. If so, find it.
   NonLocalDepInfo::iterator Entry = std::upper_bound(
@@ -913,6 +918,13 @@ MemDepResult MemoryDependenceResults::GetNonLocalInfoForBlock(
   if (Entry != Cache->begin() + NumSortedEntries && Entry->getBB() == BB)
     ExistingResult = &*Entry;
 
+  // Use the cached result for an invariant load only if there is no
+  // dependency for the non-invariant load; in that case the invariant load
+  // cannot have any dependency either.
+  if (ExistingResult && isInvariantLoad &&
+      !ExistingResult->getResult().isNonFuncLocal())
+    ExistingResult = nullptr;
+
   // If we have a cached entry, and it is non-dirty, use it as the value for
   // this dependency.
   if (ExistingResult && !ExistingResult->getResult().isDirty()) {
@@ -941,6 +953,10 @@ MemDepResult MemoryDependenceResults::GetNonLocalInfoForBlock(
   MemDepResult Dep =
       getPointerDependencyFrom(Loc, isLoad, ScanPos, BB, QueryInst);
 
+  // Don't cache results for invariant loads.
+  if (isInvariantLoad)
+    return Dep;
+
   // If we had a dirty entry for the block, update it. Otherwise, just add
   // a new entry.
   if (ExistingResult)
@@ -1029,6 +1045,10 @@ bool MemoryDependenceResults::getNonLocalPointerDepFromBB(
   InitialNLPI.Size = Loc.Size;
   InitialNLPI.AATags = Loc.AATags;
 
+  bool isInvariantLoad = false;
+  if (LoadInst *LI = dyn_cast_or_null<LoadInst>(QueryInst))
+    isInvariantLoad = LI->getMetadata(LLVMContext::MD_invariant_load);
+
   // Get the NLPI for CacheKey, inserting one into the map if it doesn't
   // already have one.
   std::pair<CachedNonLocalPointerInfo::iterator, bool> Pair =
@@ -1037,7 +1057,8 @@ bool MemoryDependenceResults::getNonLocalPointerDepFromBB(
 
   // If we already have a cache entry for this CacheKey, we may need to do some
   // work to reconcile the cache entry and the current query.
-  if (!Pair.second) {
+  // Invariant loads don't participate in caching. Thus no need to reconcile.
+  if (!isInvariantLoad && !Pair.second) {
     if (CacheInfo->Size != Loc.Size) {
       bool ThrowOutEverything;
       if (CacheInfo->Size.hasValue() && Loc.Size.hasValue()) {
@@ -1093,7 +1114,10 @@ bool MemoryDependenceResults::getNonLocalPointerDepFromBB(
 
   // If we have valid cached information for exactly the block we are
   // investigating, just return it with no recomputation.
-  if (CacheInfo->Pair == BBSkipFirstBlockPair(StartBB, SkipFirstBlock)) {
+  // Don't use cached information for invariant loads since it is only valid
+  // for non-invariant loads.
+  if (!isInvariantLoad &&
+      CacheInfo->Pair == BBSkipFirstBlockPair(StartBB, SkipFirstBlock)) {
     // We have a fully cached result for this query then we can just return the
     // cached results and populate the visited set. However, we have to verify
     // that we don't already have conflicting results for these blocks. Check
@@ -1129,14 +1153,18 @@ bool MemoryDependenceResults::getNonLocalPointerDepFromBB(
     return true;
   }
 
-  // Otherwise, either this is a new block, a block with an invalid cache
-  // pointer or one that we're about to invalidate by putting more info into it
-  // than its valid cache info. If empty, the result will be valid cache info,
-  // otherwise it isn't.
-  if (Cache->empty())
-    CacheInfo->Pair = BBSkipFirstBlockPair(StartBB, SkipFirstBlock);
-  else
-    CacheInfo->Pair = BBSkipFirstBlockPair();
+  // Invariant loads don't affect the cache in any way, so there is no need to
+  // update CacheInfo either.
+  if (!isInvariantLoad) {
+    // Otherwise, either this is a new block, a block with an invalid cache
+    // pointer or one that we're about to invalidate by putting more info into
+    // it than its valid cache info. If empty, the result will be valid cache
+    // info, otherwise it isn't.
+    if (Cache->empty())
+      CacheInfo->Pair = BBSkipFirstBlockPair(StartBB, SkipFirstBlock);
+    else
+      CacheInfo->Pair = BBSkipFirstBlockPair();
+  }
 
   SmallVector<BasicBlock *, 32> Worklist;
   Worklist.push_back(StartBB);
@@ -1377,22 +1405,26 @@ bool MemoryDependenceResults::getNonLocalPointerDepFromBB(
     if (SkipFirstBlock)
       return false;
 
-    bool foundBlock = false;
-    for (NonLocalDepEntry &I : llvm::reverse(*Cache)) {
-      if (I.getBB() != BB)
-        continue;
+    // Results of invariant loads are not cached, so there is no need to
+    // update the cached information.
+    if (!isInvariantLoad) {
+      bool foundBlock = false;
+      for (NonLocalDepEntry &I : llvm::reverse(*Cache)) {
+        if (I.getBB() != BB)
+          continue;
 
-      assert((GotWorklistLimit || I.getResult().isNonLocal() ||
-              !DT.isReachableFromEntry(BB)) &&
-             "Should only be here with transparent block");
-      foundBlock = true;
-      I.setResult(MemDepResult::getUnknown());
-      Result.push_back(
-          NonLocalDepResult(I.getBB(), I.getResult(), Pointer.getAddr()));
-      break;
+        assert((GotWorklistLimit || I.getResult().isNonLocal() ||
+                !DT.isReachableFromEntry(BB)) &&
+               "Should only be here with transparent block");
+        foundBlock = true;
+        I.setResult(MemDepResult::getUnknown());
+        Result.push_back(
+            NonLocalDepResult(I.getBB(), I.getResult(), Pointer.getAddr()));
+        break;
+      }
+      (void)foundBlock; (void)GotWorklistLimit;
+      assert((foundBlock || GotWorklistLimit) && "Current block not in cache?");
     }
-    (void)foundBlock; (void)GotWorklistLimit;
-    assert((foundBlock || GotWorklistLimit) && "Current block not in cache?");
   }
 
   // Okay, we're done now. If we added new values to the cache, re-sort it.
diff --git a/llvm/test/Analysis/MemoryDependenceAnalysis/InvariantLoad.ll b/llvm/test/Analysis/MemoryDependenceAnalysis/InvariantLoad.ll
new file mode 100644
index 0000000..152cb17
--- /dev/null
+++ b/llvm/test/Analysis/MemoryDependenceAnalysis/InvariantLoad.ll
@@ -0,0 +1,173 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -gvn -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1"
+target triple = "x86_64-unknown-linux-gnu"
+
+declare void @llvm.memset.p0i8.i32(i8*, i8, i32, i1)
+declare void @foo(i8*)
+
+define i8 @test(i1 %cmp) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P:%.*]] = alloca i8
+; CHECK-NEXT:    store i8 5, i8* [[P]]
+; CHECK-NEXT:    br label [[HEADER:%.*]]
+; CHECK:       header:
+; CHECK-NEXT:    [[V:%.*]] = phi i8 [ 5, [[ENTRY:%.*]] ], [ -5, [[ALIVE:%.*]] ]
+; CHECK-NEXT:    [[I:%.*]] = phi i8 [ 0, [[ENTRY]] ], [ [[I_INC:%.*]], [[ALIVE]] ]
+; CHECK-NEXT:    br i1 [[CMP:%.*]], label [[ALIVE]], label [[DEAD:%.*]]
+; CHECK:       dead:
+; CHECK-NEXT:    call void @foo(i8* [[P]])
+; CHECK-NEXT:    [[I_1:%.*]] = add i8 [[I]], [[V]]
+; CHECK-NEXT:    br label [[ALIVE]]
+; CHECK:       alive:
+; CHECK-NEXT:    [[I_2:%.*]] = phi i8 [ [[I]], [[HEADER]] ], [ [[I_1]], [[DEAD]] ]
+; CHECK-NEXT:    store i8 -5, i8* [[P]]
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i32(i8* align 1 [[P]], i8 0, i32 1, i1 false)
+; CHECK-NEXT:    [[I_INC]] = add i8 [[I_2]], 1
+; CHECK-NEXT:    [[CMP_LOOP:%.*]] = icmp ugt i8 [[I_INC]], 100
+; CHECK-NEXT:    br i1 [[CMP_LOOP]], label [[EXIT:%.*]], label [[HEADER]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret i8 0
+;
+
+entry:
+  %p = alloca i8
+  %addr = getelementptr inbounds i8, i8* %p, i64 0
+  store i8 5, i8* %addr
+  br label %header
+header:
+  %i = phi i8 [0, %entry], [%i.inc, %backedge]
+  br i1 %cmp, label %alive, label %dead
+dead:
+  call void @foo(i8* %p)
+  %v = load i8, i8* %addr, !invariant.load !1
+  %i.1 = add i8 %i, %v
+  br label %alive
+alive:
+  %i.2 = phi i8 [%i, %header], [%i.1, %dead]
+  store i8 -5, i8* %addr
+  br label %backedge
+backedge:
+  call void @llvm.memset.p0i8.i32(i8* align 1 %p, i8 0, i32 1, i1 false)
+  %i.inc = add i8 %i.2, 1
+  %cmp.loop = icmp ugt i8 %i.inc, 100
+  br i1 %cmp.loop, label %exit, label %header
+exit:
+  %res = load i8, i8* %addr
+  ret i8 %res
+}
+
+; Check that the first two loads are not optimized out, while the one marked
+; with invariant.load reuses %res1.
+define i8 @test2(i1 %cmp, i8* %p) {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RES1:%.*]] = load i8, i8* [[P:%.*]]
+; CHECK-NEXT:    call void @foo(i8* [[P]])
+; CHECK-NEXT:    br i1 [[CMP:%.*]], label [[B2:%.*]], label [[B1:%.*]]
+; CHECK:       b1:
+; CHECK-NEXT:    [[RES2:%.*]] = load i8, i8* [[P]]
+; CHECK-NEXT:    [[RES3:%.*]] = add i8 [[RES1]], [[RES2]]
+; CHECK-NEXT:    br label [[ALIVE:%.*]]
+; CHECK:       b2:
+; CHECK-NEXT:    [[RES_DEAD:%.*]] = add i8 [[RES1]], [[RES1]]
+; CHECK-NEXT:    br label [[ALIVE]]
+; CHECK:       alive:
+; CHECK-NEXT:    [[RES_PHI:%.*]] = phi i8 [ [[RES3]], [[B1]] ], [ [[RES_DEAD]], [[B2]] ]
+; CHECK-NEXT:    ret i8 [[RES_PHI]]
+;
+
+entry:
+  %res1 = load i8, i8* %p
+  call void @foo(i8* %p)
+  br i1 %cmp, label %b2, label %b1
+b1:
+  %res2 = load i8, i8* %p
+  %res3 = add i8 %res1, %res2
+  br label %alive
+b2:
+  %v = load i8, i8* %p, !invariant.load !1
+  %res.dead = add i8 %v, %res1
+  br label %alive
+alive:
+  %res.phi = phi i8 [%res3, %b1], [%res.dead, %b2]
+  ret i8 %res.phi
+}
+
+; This is essentially the same test case as the one above, but with %b1 and %b2
+; swapped in the "br i1 %cmp, label %b1, label %b2" instruction. This helps
+; ensure that the results don't depend on the visiting order.
+define i8 @test3(i1 %cmp, i8* %p) {
+; CHECK-LABEL: @test3(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RES1:%.*]] = load i8, i8* [[P:%.*]]
+; CHECK-NEXT:    call void @foo(i8* [[P]])
+; CHECK-NEXT:    br i1 [[CMP:%.*]], label [[B1:%.*]], label [[B2:%.*]]
+; CHECK:       b1:
+; CHECK-NEXT:    [[RES2:%.*]] = load i8, i8* [[P]]
+; CHECK-NEXT:    [[RES3:%.*]] = add i8 [[RES1]], [[RES2]]
+; CHECK-NEXT:    br label [[ALIVE:%.*]]
+; CHECK:       b2:
+; CHECK-NEXT:    [[RES_DEAD:%.*]] = add i8 [[RES1]], [[RES1]]
+; CHECK-NEXT:    br label [[ALIVE]]
+; CHECK:       alive:
+; CHECK-NEXT:    [[RES_PHI:%.*]] = phi i8 [ [[RES3]], [[B1]] ], [ [[RES_DEAD]], [[B2]] ]
+; CHECK-NEXT:    ret i8 [[RES_PHI]]
+;
+entry:
+  %res1 = load i8, i8* %p
+  call void @foo(i8* %p)
+  br i1 %cmp, label %b1, label %b2
+b1:
+  %res2 = load i8, i8* %p
+  %res3 = add i8 %res1, %res2
+  br label %alive
+b2:
+  %v = load i8, i8* %p, !invariant.load !1
+  %res.dead = add i8 %v, %res1
+  br label %alive
+alive:
+  %res.phi = phi i8 [%res3, %b1], [%res.dead, %b2]
+  ret i8 %res.phi
+}
+
+
+; This is a reduced test case catching a regression in the first version of the
+; fix for invariant loads (https://reviews.llvm.org/D64405).
+define void @test4() {
+; CHECK-LABEL: @test4(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* inttoptr (i64 8 to float*), align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul float [[TMP0]], [[TMP0]]
+; CHECK-NEXT:    br label [[FUSION_LOOP_HEADER_DIM_1_PREHEADER:%.*]]
+; CHECK:       fusion.loop_header.dim.1.preheader:
+; CHECK-NEXT:    [[TMP2:%.*]] = phi float [ [[TMP0]], [[ENTRY:%.*]] ], [ [[DOTPRE:%.*]], [[FUSION_LOOP_HEADER_DIM_1_PREHEADER]] ]
+; CHECK-NEXT:    [[FUSION_INVAR_ADDRESS_DIM_0_03:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INVAR_INC3:%.*]], [[FUSION_LOOP_HEADER_DIM_1_PREHEADER]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* null, i64 0, i64 [[FUSION_INVAR_ADDRESS_DIM_0_03]], i64 0, i64 2
+; CHECK-NEXT:    [[TMP4:%.*]] = fmul float [[TMP2]], [[TMP2]]
+; CHECK-NEXT:    [[INVAR_INC3]] = add nuw nsw i64 [[FUSION_INVAR_ADDRESS_DIM_0_03]], 1
+; CHECK-NEXT:    [[DOTPHI_TRANS_INSERT:%.*]] = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* null, i64 0, i64 [[INVAR_INC3]], i64 0, i64 2
+; CHECK-NEXT:    [[DOTPRE]] = load float, float* [[DOTPHI_TRANS_INSERT]], align 4, !invariant.load !0
+; CHECK-NEXT:    br label [[FUSION_LOOP_HEADER_DIM_1_PREHEADER]]
+;
+entry:
+  %0 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* null, i64 0, i64 0, i64 0, i64 2
+  %1 = load float, float* %0, align 4
+  %2 = fmul float %1, %1
+  br label %fusion.loop_header.dim.1.preheader
+
+fusion.loop_header.dim.1.preheader:               ; preds = %fusion.loop_header.dim.1.preheader, %entry
+  %fusion.invar_address.dim.0.03 = phi i64 [ 0, %entry ], [ %invar.inc3, %fusion.loop_header.dim.1.preheader ]
+  %3 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* null, i64 0, i64 %fusion.invar_address.dim.0.03, i64 0, i64 2
+  %4 = load float, float* %3, align 4, !invariant.load !1
+  %5 = fmul float %4, %4
+  %6 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* null, i64 0, i64 %fusion.invar_address.dim.0.03, i64 0, i64 2
+  %7 = load float, float* %6, align 4, !invariant.load !1
+  %8 = fmul float %7, %7
+  %invar.inc3 = add nuw nsw i64 %fusion.invar_address.dim.0.03, 1
+  br label %fusion.loop_header.dim.1.preheader
+}
+
+!1 = !{}
--
2.7.4
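Illustration (not part of the patch): a minimal IR sketch of the query pattern
the fix is about, assuming GVN issues MemDep queries for each load of the same
pointer. The function name @sketch and the !0 metadata below are hypothetical
and chosen only for this example; @foo mirrors the helper used in the tests.
The !invariant.load query may legitimately report that its load has no
dependency on the call, but that answer must not be cached and then reused for
the ordinary loads, which are clobbered by @foo.

declare void @foo(i8*)

define i8 @sketch(i8* %p) {
entry:
  %v1 = load i8, i8* %p                       ; ordinary load before the call
  call void @foo(i8* %p)                      ; may write to %p
  %v2 = load i8, i8* %p, !invariant.load !0   ; has no dependency on @foo
  %v3 = load i8, i8* %p                       ; must still be clobbered by @foo
  %s1 = add i8 %v1, %v2
  %s2 = add i8 %s1, %v3
  ret i8 %s2
}

!0 = !{}

The new test cases @test2 and @test3 in the patch exercise essentially this
pattern across two branches, checking that the result computed for the
invariant load is not reused for the plain load, and vice versa.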