From 220f6e5271f2e6b39bf4e083c03cd3f91bb43685 Mon Sep 17 00:00:00 2001
From: Teresa Johnson <tejohnson@google.com>
Date: Wed, 28 Apr 2021 15:20:04 -0700
Subject: [PATCH] [SimplifyCFG] Ignore ephemeral values when counting insts for
 threading

Ignore ephemeral values (only feeding llvm.assume intrinsics) when
computing the instruction count to decide if a block is small enough for
threading. This is similar to the handling of these values in the
InlineCost computation. These instructions will eventually be removed
and shouldn't count against code size (similar to the existing ignoring
of phis).

Without this change, when enabling -fwhole-program-vtables, which causes
type test / assume sequences to be inserted by clang, we can get
different threading decisions. In particular, when building with
instrumentation FDO it can affect the optimizations decisions before FDO
matching, leading to some mismatches.

Differential Revision: https://reviews.llvm.org/D101494
---
 llvm/lib/Transforms/Utils/SimplifyCFG.cpp          | 32 +++++++----
 .../test/Transforms/SimplifyCFG/unprofitable-pr.ll | 63 +++++++++++++++++++++-
 2 files changed, 84 insertions(+), 11 deletions(-)
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index be108c0..7e96cc8 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -151,9 +151,10 @@ static cl::opt<unsigned> MaxSpeculationDepth(
              "speculatively executed instructions"));
 
 static cl::opt<int>
-MaxSmallBlockSize("simplifycfg-max-small-block-size", cl::Hidden, cl::init(10),
-                  cl::desc("Max size of a block which is still considered "
-                           "small enough to thread through"));
+    MaxSmallBlockSize("simplifycfg-max-small-block-size", cl::Hidden,
+                      cl::init(10),
+                      cl::desc("Max size of a block which is still considered "
+                               "small enough to thread through"));
 
 // Two is chosen to allow one negation and a logical combine.
 static cl::opt<unsigned>
@@ -2527,19 +2528,32 @@ bool SimplifyCFGOpt::SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB,
 static bool BlockIsSimpleEnoughToThreadThrough(BasicBlock *BB) {
   int Size = 0;
 
-  for (Instruction &I : BB->instructionsWithoutDebug()) {
-    if (Size > MaxSmallBlockSize)
-      return false; // Don't clone large BB's.
+  SmallPtrSet<const Value *, 32> EphValues;
+  auto IsEphemeral = [&](const Value *V) {
+    if (isa<AssumeInst>(V))
+      return true;
+    return isSafeToSpeculativelyExecute(V) &&
+           all_of(V->users(),
+                  [&](const User *U) { return EphValues.count(U); });
+  };
 
+  // Walk the loop in reverse so that we can identify ephemeral values properly
+  // (values only feeding assumes).
+  for (Instruction &I : reverse(BB->instructionsWithoutDebug())) {
     // Can't fold blocks that contain noduplicate or convergent calls.
     if (CallInst *CI = dyn_cast<CallInst>(&I))
       if (CI->cannotDuplicate() || CI->isConvergent())
         return false;
 
+    // Ignore ephemeral values which are deleted during codegen.
+    if (IsEphemeral(&I))
+      EphValues.insert(&I);
     // We will delete Phis while threading, so Phis should not be accounted in
-    // block's size
-    if (!isa<PHINode>(I))
-      ++Size;
+    // block's size.
+    else if (!isa<PHINode>(I)) {
+      if (Size++ > MaxSmallBlockSize)
+        return false; // Don't clone large BB's.
+    }
 
     // We can only support instructions that do not define values that are
     // live outside of the current basic block.
diff --git a/llvm/test/Transforms/SimplifyCFG/unprofitable-pr.ll b/llvm/test/Transforms/SimplifyCFG/unprofitable-pr.ll
index 283922f..8801d57 100644
--- a/llvm/test/Transforms/SimplifyCFG/unprofitable-pr.ll
+++ b/llvm/test/Transforms/SimplifyCFG/unprofitable-pr.ll
@@ -1,10 +1,11 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -simplifycfg -simplifycfg-require-and-preserve-domtree=1 -simplifycfg-max-small-block-size=10 -S < %s | FileCheck %s
-; RUN: opt -passes=simplify-cfg -simplifycfg-max-small-block-size=10 -S < %s | FileCheck %s
+; RUN: opt -simplifycfg -simplifycfg-require-and-preserve-domtree=1 -simplifycfg-max-small-block-size=6 -S < %s | FileCheck %s
+; RUN: opt -passes=simplify-cfg -simplifycfg-max-small-block-size=6 -S < %s | FileCheck %s
 
 target datalayout = "e-p:64:64-p5:32:32-A5"
 
 declare void @llvm.assume(i1)
+declare i1 @llvm.type.test(i8*, metadata) nounwind readnone
 
 define void @test_01(i1 %c, i64* align 1 %ptr) local_unnamed_addr #0 {
 ; CHECK-LABEL: @test_01(
@@ -165,3 +166,61 @@ false2:                                           ; preds = %true1
   store volatile i64 3, i64* %ptr, align 8
   ret void
 }
+
+; Try the max block size for PRE again but with the bitcast/type test/assume
+; sequence used for whole program devirt.
+define void @test_04(i1 %c, i64* align 1 %ptr, [3 x i8*]* %vtable) local_unnamed_addr #0 {
+; CHECK-LABEL: @test_04(
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[TRUE2_CRITEDGE:%.*]], label [[FALSE1:%.*]]
+; CHECK:       false1:
+; CHECK-NEXT:    store volatile i64 1, i64* [[PTR:%.*]], align 4
+; CHECK-NEXT:    [[VTABLE:%.*]] = bitcast [3 x i8*]* %vtable to i8*
+; CHECK-NEXT:    [[P:%.*]] = call i1 @llvm.type.test(i8* [[VTABLE]], metadata !"foo")
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[P]])
+; CHECK-NEXT:    store volatile i64 0, i64* [[PTR]], align 8
+; CHECK-NEXT:    store volatile i64 -1, i64* [[PTR]], align 8
+; CHECK-NEXT:    store volatile i64 -1, i64* [[PTR]], align 8
+; CHECK-NEXT:    store volatile i64 -1, i64* [[PTR]], align 8
+; CHECK-NEXT:    store volatile i64 -1, i64* [[PTR]], align 8
+; CHECK-NEXT:    store volatile i64 -1, i64* [[PTR]], align 8
+; CHECK-NEXT:    store volatile i64 3, i64* [[PTR]], align 8
+; CHECK-NEXT:    ret void
+; CHECK:       true2.critedge:
+; CHECK-NEXT:    [[VTABLE:%.*]] = bitcast [3 x i8*]* %vtable to i8*
+; CHECK-NEXT:    [[P:%.*]] = call i1 @llvm.type.test(i8* [[VTABLE]], metadata !"foo")
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[P]])
+; CHECK-NEXT:    store volatile i64 0, i64* [[PTR]], align 8
+; CHECK-NEXT:    store volatile i64 -1, i64* [[PTR]], align 8
+; CHECK-NEXT:    store volatile i64 -1, i64* [[PTR]], align 8
+; CHECK-NEXT:    store volatile i64 -1, i64* [[PTR]], align 8
+; CHECK-NEXT:    store volatile i64 -1, i64* [[PTR]], align 8
+; CHECK-NEXT:    store volatile i64 -1, i64* [[PTR]], align 8
+; CHECK-NEXT:    store volatile i64 2, i64* [[PTR]], align 8
+; CHECK-NEXT:    ret void
+;
+  br i1 %c, label %true1, label %false1
+
+true1:                                            ; preds = %false1, %0
+  %vtablei8 = bitcast [3 x i8*]* %vtable to i8*
+  %p = call i1 @llvm.type.test(i8* %vtablei8, metadata !"foo")
+  tail call void @llvm.assume(i1 %p)
+  store volatile i64 0, i64* %ptr, align 8
+  store volatile i64 -1, i64* %ptr, align 8
+  store volatile i64 -1, i64* %ptr, align 8
+  store volatile i64 -1, i64* %ptr, align 8
+  store volatile i64 -1, i64* %ptr, align 8
+  store volatile i64 -1, i64* %ptr, align 8
+  br i1 %c, label %true2, label %false2
+
+false1:                                           ; preds = %0
+  store volatile i64 1, i64* %ptr, align 4
+  br label %true1
+
+true2:                                            ; preds = %true1
+  store volatile i64 2, i64* %ptr, align 8
+  ret void
+
+false2:                                           ; preds = %true1
+  store volatile i64 3, i64* %ptr, align 8
+  ret void
+}
-- 
2.7.4