From 220f6e5271f2e6b39bf4e083c03cd3f91bb43685 Mon Sep 17 00:00:00 2001 From: Teresa Johnson Date: Wed, 28 Apr 2021 15:20:04 -0700 Subject: [PATCH] [SimplifyCFG] Ignore ephemeral values when counting insts for threading Ignore ephemeral values (only feeding llvm.assume intrinsics) when computing the instruction count to decide if a block is small enough for threading. This is similar to the handling of these values in the InlineCost computation. These instructions will eventually be removed and shouldn't count against code size (similar to the existing ignoring of phis). Without this change, when enabling -fwhole-program-vtables, which causes type test / assume sequences to be inserted by clang, we can get different threading decisions. In particular, when building with instrumentation FDO it can affect the optimizations decisions before FDO matching, leading to some mismatches. Differential Revision: https://reviews.llvm.org/D101494 --- llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 32 +++++++---- .../test/Transforms/SimplifyCFG/unprofitable-pr.ll | 63 +++++++++++++++++++++- 2 files changed, 84 insertions(+), 11 deletions(-) diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index be108c0..7e96cc8 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -151,9 +151,10 @@ static cl::opt MaxSpeculationDepth( "speculatively executed instructions")); static cl::opt -MaxSmallBlockSize("simplifycfg-max-small-block-size", cl::Hidden, cl::init(10), - cl::desc("Max size of a block which is still considered " - "small enough to thread through")); + MaxSmallBlockSize("simplifycfg-max-small-block-size", cl::Hidden, + cl::init(10), + cl::desc("Max size of a block which is still considered " + "small enough to thread through")); // Two is chosen to allow one negation and a logical combine. static cl::opt @@ -2527,19 +2528,32 @@ bool SimplifyCFGOpt::SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB, static bool BlockIsSimpleEnoughToThreadThrough(BasicBlock *BB) { int Size = 0; - for (Instruction &I : BB->instructionsWithoutDebug()) { - if (Size > MaxSmallBlockSize) - return false; // Don't clone large BB's. + SmallPtrSet EphValues; + auto IsEphemeral = [&](const Value *V) { + if (isa(V)) + return true; + return isSafeToSpeculativelyExecute(V) && + all_of(V->users(), + [&](const User *U) { return EphValues.count(U); }); + }; + // Walk the loop in reverse so that we can identify ephemeral values properly + // (values only feeding assumes). + for (Instruction &I : reverse(BB->instructionsWithoutDebug())) { // Can't fold blocks that contain noduplicate or convergent calls. if (CallInst *CI = dyn_cast(&I)) if (CI->cannotDuplicate() || CI->isConvergent()) return false; + // Ignore ephemeral values which are deleted during codegen. + if (IsEphemeral(&I)) + EphValues.insert(&I); // We will delete Phis while threading, so Phis should not be accounted in - // block's size - if (!isa(I)) - ++Size; + // block's size. + else if (!isa(I)) { + if (Size++ > MaxSmallBlockSize) + return false; // Don't clone large BB's. + } // We can only support instructions that do not define values that are // live outside of the current basic block. diff --git a/llvm/test/Transforms/SimplifyCFG/unprofitable-pr.ll b/llvm/test/Transforms/SimplifyCFG/unprofitable-pr.ll index 283922f..8801d57 100644 --- a/llvm/test/Transforms/SimplifyCFG/unprofitable-pr.ll +++ b/llvm/test/Transforms/SimplifyCFG/unprofitable-pr.ll @@ -1,10 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -simplifycfg -simplifycfg-require-and-preserve-domtree=1 -simplifycfg-max-small-block-size=10 -S < %s | FileCheck %s -; RUN: opt -passes=simplify-cfg -simplifycfg-max-small-block-size=10 -S < %s | FileCheck %s +; RUN: opt -simplifycfg -simplifycfg-require-and-preserve-domtree=1 -simplifycfg-max-small-block-size=6 -S < %s | FileCheck %s +; RUN: opt -passes=simplify-cfg -simplifycfg-max-small-block-size=6 -S < %s | FileCheck %s target datalayout = "e-p:64:64-p5:32:32-A5" declare void @llvm.assume(i1) +declare i1 @llvm.type.test(i8*, metadata) nounwind readnone define void @test_01(i1 %c, i64* align 1 %ptr) local_unnamed_addr #0 { ; CHECK-LABEL: @test_01( @@ -165,3 +166,61 @@ false2: ; preds = %true1 store volatile i64 3, i64* %ptr, align 8 ret void } + +; Try the max block size for PRE again but with the bitcast/type test/assume +; sequence used for whole program devirt. +define void @test_04(i1 %c, i64* align 1 %ptr, [3 x i8*]* %vtable) local_unnamed_addr #0 { +; CHECK-LABEL: @test_04( +; CHECK-NEXT: br i1 [[C:%.*]], label [[TRUE2_CRITEDGE:%.*]], label [[FALSE1:%.*]] +; CHECK: false1: +; CHECK-NEXT: store volatile i64 1, i64* [[PTR:%.*]], align 4 +; CHECK-NEXT: [[VTABLE:%.*]] = bitcast [3 x i8*]* %vtable to i8* +; CHECK-NEXT: [[P:%.*]] = call i1 @llvm.type.test(i8* [[VTABLE]], metadata !"foo") +; CHECK-NEXT: tail call void @llvm.assume(i1 [[P]]) +; CHECK-NEXT: store volatile i64 0, i64* [[PTR]], align 8 +; CHECK-NEXT: store volatile i64 -1, i64* [[PTR]], align 8 +; CHECK-NEXT: store volatile i64 -1, i64* [[PTR]], align 8 +; CHECK-NEXT: store volatile i64 -1, i64* [[PTR]], align 8 +; CHECK-NEXT: store volatile i64 -1, i64* [[PTR]], align 8 +; CHECK-NEXT: store volatile i64 -1, i64* [[PTR]], align 8 +; CHECK-NEXT: store volatile i64 3, i64* [[PTR]], align 8 +; CHECK-NEXT: ret void +; CHECK: true2.critedge: +; CHECK-NEXT: [[VTABLE:%.*]] = bitcast [3 x i8*]* %vtable to i8* +; CHECK-NEXT: [[P:%.*]] = call i1 @llvm.type.test(i8* [[VTABLE]], metadata !"foo") +; CHECK-NEXT: tail call void @llvm.assume(i1 [[P]]) +; CHECK-NEXT: store volatile i64 0, i64* [[PTR]], align 8 +; CHECK-NEXT: store volatile i64 -1, i64* [[PTR]], align 8 +; CHECK-NEXT: store volatile i64 -1, i64* [[PTR]], align 8 +; CHECK-NEXT: store volatile i64 -1, i64* [[PTR]], align 8 +; CHECK-NEXT: store volatile i64 -1, i64* [[PTR]], align 8 +; CHECK-NEXT: store volatile i64 -1, i64* [[PTR]], align 8 +; CHECK-NEXT: store volatile i64 2, i64* [[PTR]], align 8 +; CHECK-NEXT: ret void +; + br i1 %c, label %true1, label %false1 + +true1: ; preds = %false1, %0 + %vtablei8 = bitcast [3 x i8*]* %vtable to i8* + %p = call i1 @llvm.type.test(i8* %vtablei8, metadata !"foo") + tail call void @llvm.assume(i1 %p) + store volatile i64 0, i64* %ptr, align 8 + store volatile i64 -1, i64* %ptr, align 8 + store volatile i64 -1, i64* %ptr, align 8 + store volatile i64 -1, i64* %ptr, align 8 + store volatile i64 -1, i64* %ptr, align 8 + store volatile i64 -1, i64* %ptr, align 8 + br i1 %c, label %true2, label %false2 + +false1: ; preds = %0 + store volatile i64 1, i64* %ptr, align 4 + br label %true1 + +true2: ; preds = %true1 + store volatile i64 2, i64* %ptr, align 8 + ret void + +false2: ; preds = %true1 + store volatile i64 3, i64* %ptr, align 8 + ret void +} -- 2.7.4