From 30bb5be38908b0006ed94124515e43774ee37915 Mon Sep 17 00:00:00 2001 From: Hongtao Yu Date: Mon, 26 Apr 2021 09:12:29 -0700 Subject: [PATCH] [CSSPGO] Unblock optimizations with pseudo probe instrumentation part 2. As a follow-up to D95982, this patch continues unblocking optimizations that are blocked by pseudo probe instrumentation. The optimizations unblocked are: - In-block load propagation. - In-block dead store elimination. - Memory copy optimization that turns stores to consecutive memory locations into a memset. These optimizations are local to a block, so they shouldn't affect the profile quality. Reviewed By: wmi Differential Revision: https://reviews.llvm.org/D100075 --- llvm/lib/Analysis/Loads.cpp | 4 +-- .../InstCombine/InstCombineLoadStoreAlloca.cpp | 2 +- llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp | 7 ++++ llvm/lib/Transforms/Scalar/Sink.cpp | 2 +- .../SampleProfile/pseudo-probe-instcombine.ll | 37 +++++++++++++++++++--- .../SampleProfile/pseudo-probe-memset.ll | 25 +++++++++++++++ 6 files changed, 69 insertions(+), 8 deletions(-) create mode 100644 llvm/test/Transforms/SampleProfile/pseudo-probe-memset.ll diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp index da4ee7d..1c55f48 100644 --- a/llvm/lib/Analysis/Loads.cpp +++ b/llvm/lib/Analysis/Loads.cpp @@ -532,7 +532,7 @@ Value *llvm::findAvailablePtrLoadStore( // We must ignore debug info directives when counting (otherwise they // would affect codegen). 
Instruction *Inst = &*--ScanFrom; - if (isa<DbgInfoIntrinsic>(Inst)) + if (Inst->isDebugOrPseudoInst()) continue; // Restore ScanFrom to expected value in case next test succeeds @@ -620,7 +620,7 @@ Value *llvm::FindAvailableLoadedValue(LoadInst *Load, AAResults &AA, SmallVector<Instruction *> MustNotAliasInsts; for (Instruction &Inst : make_range(++Load->getReverseIterator(), ScanBB->rend())) { - if (isa<DbgInfoIntrinsic>(&Inst)) + if (Inst.isDebugOrPseudoInst()) continue; if (MaxInstsToScan-- == 0) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index 77d9105..a1cf8e4 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -1396,7 +1396,7 @@ Instruction *InstCombinerImpl::visitStoreInst(StoreInst &SI) { --BBI; // Don't count debug info directives, lest they affect codegen, // and we skip pointer-to-pointer bitcasts, which are NOPs. - if (isa<DbgInfoIntrinsic>(BBI) || + if (BBI->isDebugOrPseudoInst() || (isa<BitCastInst>(BBI) && BBI->getType()->isPointerTy())) { ScanInsts++; continue; diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp index 83d475d..29f43f1 100644 --- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -399,6 +399,13 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst, } } + // Calls that only access inaccessible memory do not block merging + // accessible stores. + if (auto *CB = dyn_cast<CallBase>(BI)) { + if (CB->onlyAccessesInaccessibleMemory()) + continue; + } + if (!isa<StoreInst>(BI) && !isa<MemSetInst>(BI)) { // If the instruction is readnone, ignore it, otherwise bail out. 
We // don't even allow readonly here because we don't want something like: diff --git a/llvm/lib/Transforms/Scalar/Sink.cpp b/llvm/lib/Transforms/Scalar/Sink.cpp index 89cfbe3..8600aac 100644 --- a/llvm/lib/Transforms/Scalar/Sink.cpp +++ b/llvm/lib/Transforms/Scalar/Sink.cpp @@ -202,7 +202,7 @@ static bool ProcessBlock(BasicBlock &BB, DominatorTree &DT, LoopInfo &LI, if (!ProcessedBegin) --I; - if (isa<DbgInfoIntrinsic>(Inst)) + if (Inst->isDebugOrPseudoInst()) continue; if (SinkInstruction(Inst, Stores, DT, LI, AA)) { diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-instcombine.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-instcombine.ll index e5bb7bc..bbc0397 100644 --- a/llvm/test/Transforms/SampleProfile/pseudo-probe-instcombine.ll +++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-instcombine.ll @@ -1,4 +1,4 @@ -; RUN: opt -passes=instcombine -S < %s | FileCheck %s +; RUN: opt -passes=instcombine -available-load-scan-limit=2 -S < %s | FileCheck %s %struct.nonbonded = type { [2 x %struct.CompAtom*], [2 x %struct.CompAtomExt*], [2 x %struct.CompAtom*], [2 x %class.Vector*], [2 x %class.Vector*], [2 x i32], %class.Vector, double*, double*, %class.ComputeNonbondedWorkArrays*, %class.Pairlists*, i32, i32, double, double, i32, i32, i32, i32 } %struct.CompAtomExt = type { i32 } @@ -13,11 +13,11 @@ %class.ResizeArrayRaw.3 = type <{ %class.Vector*, i8*, i32, i32, i32, float, i32, [4 x i8] }> %class.Pairlists = type { i16*, i32, i32 } +define dso_local void @merge(%struct.nonbonded* nocapture readonly %params) local_unnamed_addr align 2 { ;; Check the minPart4 and minPart assignments are merged. 
+; CHECK-LABEL: @merge( ; CHECK-COUNT-1: getelementptr inbounds %struct.nonbonded, %struct.nonbonded* %params, i64 0, i32 16 ; CHECK-NOT: getelementptr inbounds %struct.nonbonded, %struct.nonbonded* %params, i64 0, i32 16 - -define dso_local void @_ZN20ComputeNonbondedUtil9calc_pairEP9nonbonded(%struct.nonbonded* nocapture readonly %params) local_unnamed_addr align 2 { entry: %savePairlists3 = getelementptr inbounds %struct.nonbonded, %struct.nonbonded* %params, i64 0, i32 11 %0 = load i32, i32* %savePairlists3, align 8 @@ -58,7 +58,36 @@ if.else147: ; preds = %if.then138 ret void } -declare dso_local void @_ZN9Pairlists8addIndexEv() align 2 +define i32 @load(i32* nocapture %a, i32* nocapture %b) { +;; Check the last load is deleted. +; CHECK-LABEL: @load( +; CHECK-NEXT: %1 = getelementptr inbounds i32, i32* %a, i64 1 +; CHECK-NEXT: %2 = load i32, i32* %1, align 8 +; CHECK-NEXT: %3 = getelementptr inbounds i32, i32* %b, i64 1 +; CHECK-NEXT: store i32 %2, i32* %3, align 8 +; CHECK-NEXT: call void @llvm.pseudoprobe(i64 5116412291814990879, i64 1, i32 0, i64 -1) +; CHECK-NEXT: ret i32 %[[#]] + %1 = getelementptr inbounds i32, i32* %a, i32 1 + %2 = load i32, i32* %1, align 8 + %3 = getelementptr inbounds i32, i32* %b, i32 1 + store i32 %2, i32* %3, align 8 + %4 = getelementptr inbounds i32, i32* %b, i32 1 + call void @llvm.pseudoprobe(i64 5116412291814990879, i64 1, i32 0, i64 -1) + %5 = load i32, i32* %4, align 8 + ret i32 %5 +} + +define void @dse(i32* %p) { +;; Check the first store is deleted. 
+; CHECK-LABEL: @dse( +; CHECK-NEXT: call void @llvm.pseudoprobe(i64 5116412291814990879, i64 1, i32 0, i64 -1) +; CHECK-NEXT: store i32 0, i32* [[P:%.*]], align 4 +; CHECK-NEXT: ret void + store i32 0, i32* %p + call void @llvm.pseudoprobe(i64 5116412291814990879, i64 1, i32 0, i64 -1) + store i32 0, i32* %p + ret void +} ; Function Attrs: inaccessiblememonly nounwind willreturn declare void @llvm.pseudoprobe(i64, i64, i32, i64) #0 diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-memset.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-memset.ll new file mode 100644 index 0000000..0b2b530 --- /dev/null +++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-memset.ll @@ -0,0 +1,25 @@ +; RUN: opt < %s -memcpyopt -S | FileCheck %s + +%struct.MV = type { i16, i16 } + +define void @test(i32* nocapture %c) nounwind optsize { +; All the stores in this example should be merged into a single memset. +; CHECK-NOT: store i32 -1 +; CHECK: call void @llvm.memset.p0i8.i64 + store i32 -1, i32* %c, align 4 + %1 = getelementptr inbounds i32, i32* %c, i32 1 + store i32 -1, i32* %1, align 4 + %2 = getelementptr inbounds i32, i32* %c, i32 2 + store i32 -1, i32* %2, align 4 + call void @llvm.pseudoprobe(i64 5116412291814990879, i64 1, i32 0, i64 -1) + %3 = getelementptr inbounds i32, i32* %c, i32 3 + store i32 -1, i32* %3, align 4 + %4 = getelementptr inbounds i32, i32* %c, i32 4 + store i32 -1, i32* %4, align 4 + ret void +} + +; Function Attrs: inaccessiblememonly nounwind willreturn +declare void @llvm.pseudoprobe(i64, i64, i32, i64) #0 + +attributes #0 = { inaccessiblememonly nounwind willreturn } -- 2.7.4