From 93a2c2919f7339bd9e942ef69ebbf0883a2ef727 Mon Sep 17 00:00:00 2001
From: spupyrev <spupyrev@fb.com>
Date: Thu, 2 Dec 2021 11:59:33 -0800
Subject: [PATCH] profi - a flow-based profile inference algorithm: Part III
 (out of 3)
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

This is a continuation of D109860 and D109903.

An important challenge for profile inference is caused by the fact that the
sample profile is collected on a fully optimized binary, while the block and
edge frequencies are consumed on an early stage of the compilation that operates
with a non-optimized IR. As a result, some of the basic blocks may not have
associated sample counts, and it is up to the algorithm to deduce missing
frequencies. The problem is illustrated in the figure where three basic
blocks are not present in the optimized binary and hence, receive no samples
during profiling.

We found that it is beneficial to treat all such blocks equally. Otherwise the
compiler may decide that some blocks are âcoldâ and apply undesirable
optimizations (e.g., hot-cold splitting) regressing the performance. Therefore,
we want to distribute the counts evenly along the blocks with missing samples.
This is achieved by a post-processing step that identifies "dangling" subgraphs
consisting of basic blocks with no sampled counts; once the subgraphs are
found, we rebalance the flow so as every branch probability is 50:50 within the
subgraphs.

Our experiments indicate up to 1% performance win using the optimization on
some binaries and a significant improvement in the quality of profile counts
(when compared to ground-truth instrumentation-based counts)

{F19093045}

Reviewed By: hoy

Differential Revision: https://reviews.llvm.org/D109980
---
 .../Transforms/Utils/SampleProfileInference.cpp    | 193 +++++++++++++-
 .../Inputs/profile-inference-noprobes.prof         |   8 +
 .../Inputs/profile-inference-rebalance.prof        |  27 ++
 .../SampleProfile/profile-inference-noprobes.ll    |  54 ++++
 .../SampleProfile/profile-inference-rebalance.ll   | 290 +++++++++++++++++++++
 .../Transforms/SampleProfile/profile-inference.ll  |   1 +
 6 files changed, 564 insertions(+), 9 deletions(-)
 create mode 100644 llvm/test/Transforms/SampleProfile/Inputs/profile-inference-noprobes.prof
 create mode 100644 llvm/test/Transforms/SampleProfile/Inputs/profile-inference-rebalance.prof
 create mode 100644 llvm/test/Transforms/SampleProfile/profile-inference-noprobes.ll
 create mode 100644 llvm/test/Transforms/SampleProfile/profile-inference-rebalance.ll

diff --git a/llvm/lib/Transforms/Utils/SampleProfileInference.cpp b/llvm/lib/Transforms/Utils/SampleProfileInference.cpp
index daf5d75..2f2dff6 100644
--- a/llvm/lib/Transforms/Utils/SampleProfileInference.cpp
+++ b/llvm/lib/Transforms/Utils/SampleProfileInference.cpp
@@ -220,7 +220,7 @@ private:
       Now = Pred;
     }
 
-    assert(PathCapacity > 0 && "found incorrect augmenting path");
+    assert(PathCapacity > 0 && "found an incorrect augmenting path");
 
     // Update the flow along the path
     Now = Target;
@@ -271,7 +271,22 @@ private:
   uint64_t Target;
 };
 
-/// Post-processing adjustment of the control flow.
+/// A post-processing adjustment of control flow. It applies two steps by
+/// rerouting some flow and making it more realistic:
+///
+/// - First, it removes all isolated components ("islands") with a positive flow
+///   that are unreachable from the entry block. For every such component, we
+///   find the shortest from the entry to an exit passing through the component,
+///   and increase the flow by one unit along the path.
+///
+/// - Second, it identifies all "unknown subgraphs" consisting of basic blocks
+///   with no sampled counts. Then it rebalnces the flow that goes through such
+///   a subgraph so that each branch is taken with probability 50%.
+///   An unknown subgraph is such that for every two nodes u and v:
+///     - u dominates v and u is not unknown;
+///     - v post-dominates u; and
+///     - all inner-nodes of all (u,v)-paths are unknown.
+///
 class FlowAdjuster {
 public:
   FlowAdjuster(FlowFunction &Func) : Func(Func) {
@@ -281,14 +296,16 @@ public:
 
   // Run the post-processing
   void run() {
-    /// We adjust the control flow in a function so as to remove all
-    /// "isolated" components with positive flow that are unreachable
-    /// from the entry block. For every such component, we find the shortest
-    /// path from the entry to an exit passing through the component, and
-    /// increase the flow by one unit along the path.
+    /// Adjust the flow to get rid of isolated components.
     joinIsolatedComponents();
+
+    /// Rebalance the flow inside unknown subgraphs.
+    rebalanceUnknownSubgraphs();
   }
 
+  /// The probability for the first successor of a unknown subgraph
+  static constexpr double UnknownFirstSuccProbability = 0.5;
+
 private:
   void joinIsolatedComponents() {
     // Find blocks that are reachable from the source
@@ -315,7 +332,7 @@ private:
     }
   }
 
-  /// Run bfs from a given block along the jumps with a positive flow and mark
+  /// Run BFS from a given block along the jumps with a positive flow and mark
   /// all reachable blocks.
   void findReachable(uint64_t Src, std::vector<bool> &Visited) {
     if (Visited[Src])
@@ -435,6 +452,164 @@ private:
 
   uint64_t NumBlocks() const { return Func.Blocks.size(); }
 
+  /// Rebalance unknown subgraphs so as each branch splits with probabilities
+  /// UnknownFirstSuccProbability and 1 - UnknownFirstSuccProbability
+  void rebalanceUnknownSubgraphs() {
+    assert(UnknownFirstSuccProbability >= 0.0 &&
+           UnknownFirstSuccProbability <= 1.0 &&
+           "the share of the unknown successor should be between 0 and 1");
+    // Try to find unknown subgraphs from each non-unknown block
+    for (uint64_t I = 0; I < Func.Blocks.size(); I++) {
+      auto SrcBlock = &Func.Blocks[I];
+      // Do not attempt to find unknown successors from a unknown or a
+      // zero-flow block
+      if (SrcBlock->UnknownWeight || SrcBlock->Flow == 0)
+        continue;
+
+      std::vector<FlowBlock *> UnknownSuccs;
+      FlowBlock *DstBlock = nullptr;
+      // Find a unknown subgraphs starting at block SrcBlock
+      if (!findUnknownSubgraph(SrcBlock, DstBlock, UnknownSuccs))
+        continue;
+      // At the moment, we do not rebalance subgraphs containing cycles among
+      // unknown blocks
+      if (!isAcyclicSubgraph(SrcBlock, DstBlock, UnknownSuccs))
+        continue;
+
+      // Rebalance the flow
+      rebalanceUnknownSubgraph(SrcBlock, DstBlock, UnknownSuccs);
+    }
+  }
+
+  /// Find a unknown subgraph starting at block SrcBlock.
+  /// If the search is successful, the method sets DstBlock and UnknownSuccs.
+  bool findUnknownSubgraph(FlowBlock *SrcBlock, FlowBlock *&DstBlock,
+                           std::vector<FlowBlock *> &UnknownSuccs) {
+    // Run BFS from SrcBlock and make sure all paths are going through unknown
+    // blocks and end at a non-unknown DstBlock
+    auto Visited = std::vector<bool>(NumBlocks(), false);
+    std::queue<uint64_t> Queue;
+    DstBlock = nullptr;
+
+    Queue.push(SrcBlock->Index);
+    Visited[SrcBlock->Index] = true;
+    while (!Queue.empty()) {
+      auto &Block = Func.Blocks[Queue.front()];
+      Queue.pop();
+      // Process blocks reachable from Block
+      for (auto Jump : Block.SuccJumps) {
+        uint64_t Dst = Jump->Target;
+        if (Visited[Dst])
+          continue;
+        Visited[Dst] = true;
+        if (!Func.Blocks[Dst].UnknownWeight) {
+          // If we see non-unique non-unknown block reachable from SrcBlock,
+          // stop processing and skip rebalancing
+          FlowBlock *CandidateDstBlock = &Func.Blocks[Dst];
+          if (DstBlock != nullptr && DstBlock != CandidateDstBlock)
+            return false;
+          DstBlock = CandidateDstBlock;
+        } else {
+          Queue.push(Dst);
+          UnknownSuccs.push_back(&Func.Blocks[Dst]);
+        }
+      }
+    }
+
+    // If the list of unknown blocks is empty, we don't need rebalancing
+    if (UnknownSuccs.empty())
+      return false;
+    // If all reachable nodes from SrcBlock are unknown, skip rebalancing
+    if (DstBlock == nullptr)
+      return false;
+    // If any of the unknown blocks is an exit block, skip rebalancing
+    for (auto Block : UnknownSuccs) {
+      if (Block->isExit())
+        return false;
+    }
+
+    return true;
+  }
+
+  /// Verify if the given unknown subgraph is acyclic, and if yes, reorder
+  /// UnknownSuccs in the topological order (so that all jumps are "forward").
+  bool isAcyclicSubgraph(FlowBlock *SrcBlock, FlowBlock *DstBlock,
+                         std::vector<FlowBlock *> &UnknownSuccs) {
+    // Extract local in-degrees in the considered subgraph
+    auto LocalInDegree = std::vector<uint64_t>(NumBlocks(), 0);
+    for (auto Jump : SrcBlock->SuccJumps) {
+      LocalInDegree[Jump->Target]++;
+    }
+    for (uint64_t I = 0; I < UnknownSuccs.size(); I++) {
+      for (auto Jump : UnknownSuccs[I]->SuccJumps) {
+        LocalInDegree[Jump->Target]++;
+      }
+    }
+    // A loop containing SrcBlock
+    if (LocalInDegree[SrcBlock->Index] > 0)
+      return false;
+
+    std::vector<FlowBlock *> AcyclicOrder;
+    std::queue<uint64_t> Queue;
+    Queue.push(SrcBlock->Index);
+    while (!Queue.empty()) {
+      auto &Block = Func.Blocks[Queue.front()];
+      Queue.pop();
+      // Stop propagation once we reach DstBlock
+      if (Block.Index == DstBlock->Index)
+        break;
+
+      AcyclicOrder.push_back(&Block);
+      // Add to the queue all successors with zero local in-degree
+      for (auto Jump : Block.SuccJumps) {
+        uint64_t Dst = Jump->Target;
+        LocalInDegree[Dst]--;
+        if (LocalInDegree[Dst] == 0) {
+          Queue.push(Dst);
+        }
+      }
+    }
+
+    // If there is a cycle in the subgraph, AcyclicOrder contains only a subset
+    // of all blocks
+    if (UnknownSuccs.size() + 1 != AcyclicOrder.size())
+      return false;
+    UnknownSuccs = AcyclicOrder;
+    return true;
+  }
+
+  /// Rebalance a given subgraph.
+  void rebalanceUnknownSubgraph(FlowBlock *SrcBlock, FlowBlock *DstBlock,
+                                std::vector<FlowBlock *> &UnknownSuccs) {
+    assert(SrcBlock->Flow > 0 && "zero-flow block in unknown subgraph");
+    assert(UnknownSuccs.front() == SrcBlock && "incorrect order of unknowns");
+
+    for (auto Block : UnknownSuccs) {
+      // Block's flow is the sum of incoming flows
+      uint64_t TotalFlow = 0;
+      if (Block == SrcBlock) {
+        TotalFlow = Block->Flow;
+      } else {
+        for (auto Jump : Block->PredJumps) {
+          TotalFlow += Jump->Flow;
+        }
+        Block->Flow = TotalFlow;
+      }
+
+      // Process all successor jumps and update corresponding flow values
+      for (uint64_t I = 0; I < Block->SuccJumps.size(); I++) {
+        auto Jump = Block->SuccJumps[I];
+        if (I + 1 == Block->SuccJumps.size()) {
+          Jump->Flow = TotalFlow;
+          continue;
+        }
+        uint64_t Flow = uint64_t(TotalFlow * UnknownFirstSuccProbability);
+        Jump->Flow = Flow;
+        TotalFlow -= Flow;
+      }
+    }
+  }
+
   /// A constant indicating an arbitrary exit block of a function.
   static constexpr uint64_t AnyExitBlock = uint64_t(-1);
 
@@ -622,7 +797,7 @@ void verifyWeights(const FlowFunction &Func) {
     }
   }
 
-  // Run bfs from the source along edges with positive flow
+  // Run BFS from the source along edges with positive flow
   std::queue<uint64_t> Queue;
   auto Visited = std::vector<bool>(NumBlocks, false);
   Queue.push(Func.Entry);
diff --git a/llvm/test/Transforms/SampleProfile/Inputs/profile-inference-noprobes.prof b/llvm/test/Transforms/SampleProfile/Inputs/profile-inference-noprobes.prof
new file mode 100644
index 0000000..55c2bec
--- /dev/null
+++ b/llvm/test/Transforms/SampleProfile/Inputs/profile-inference-noprobes.prof
@@ -0,0 +1,8 @@
+test_4:100:13293
+ 0: 100
+ 1: 100
+ 2: 40
+
+empty:100:13293
+ 0: 7
+ 1: 100
diff --git a/llvm/test/Transforms/SampleProfile/Inputs/profile-inference-rebalance.prof b/llvm/test/Transforms/SampleProfile/Inputs/profile-inference-rebalance.prof
new file mode 100644
index 0000000..802f1b3
--- /dev/null
+++ b/llvm/test/Transforms/SampleProfile/Inputs/profile-inference-rebalance.prof
@@ -0,0 +1,27 @@
+countMultipliers:37078302:0
+ 2: 65536
+ 3: 10
+ 4: 65536
+ 7: 65536
+ 9: 65536
+ 10: 65536
+ !CFGChecksum: 223598586707
+
+countMultipliers2:37078302:0
+ 1: 2100
+ 2: 2000
+ 6: 2100
+ !CFGChecksum: 2235985
+
+countMultipliers3:37078302:0
+ 1: 100
+ 2: 100
+ 3: 100
+ !CFGChecksum: 22985
+
+countMultipliers4:37078302:0
+ 1: 100
+ 2: 50
+ 3: 50
+ 5: 100
+ !CFGChecksum: 2298578
diff --git a/llvm/test/Transforms/SampleProfile/profile-inference-noprobes.ll b/llvm/test/Transforms/SampleProfile/profile-inference-noprobes.ll
new file mode 100644
index 0000000..c270644
--- /dev/null
+++ b/llvm/test/Transforms/SampleProfile/profile-inference-noprobes.ll
@@ -0,0 +1,54 @@
+; RUN: opt < %s -passes=sample-profile -sample-profile-use-profi -sample-profile-file=%S/Inputs/profile-inference-noprobes.prof -S | FileCheck %s
+; RUN: opt < %s -passes=sample-profile -sample-profile-use-profi -sample-profile-file=%S/Inputs/profile-inference-noprobes.prof | opt -analyze -block-freq  -enable-new-pm=0 | FileCheck %s --check-prefix=CHECK2
+
+
+; The test verifies that profile inference can be applied for non-probe-based
+; profiles.
+;
+; +---------+     +----------+
+; | b3 [40] | <-- | b1 [100] |
+; +---------+     +----------+
+;                   |
+;                   |
+;                   v
+;                 +----------+
+;                 | b2 [0]   |
+;                 +----------+
+
+@yydebug = dso_local global i32 0, align 4
+
+define void @test_4() #0 !dbg !4 {
+;entry:
+;  ret void, !dbg !9
+b1:
+  %0 = load i32, i32* @yydebug, align 4
+  %cmp = icmp ne i32 %0, 0, !dbg !9
+  br i1 %cmp, label %b2, label %b3, !dbg !9
+; CHECK2: - b1: float = {{.*}}, int = {{.*}}, count = 100
+
+b2:
+  ret void
+; CHECK2: - b2: float = {{.*}}, int = {{.*}}, count = 60
+
+b3:
+  ret void, !dbg !10
+; CHECK2: - b3: float = {{.*}}, int = {{.*}}, count = 40
+}
+
+; CHECK: {{.*}} = !{!"function_entry_count", i64 100}
+; CHECK: {{.*}} = !{!"branch_weights", i32 60, i32 40}
+
+attributes #0 = { noinline nounwind uwtable "use-sample-profile"}
+
+!llvm.module.flags = !{!6, !7}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.7.0 (trunk 237249) (llvm/trunk 237261)", isOptimized: false, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2, retainedTypes: !2, globals: !2, imports: !2)
+!1 = !DIFile(filename: "entry_counts.c", directory: ".")
+!2 = !{}
+!4 = distinct !DISubprogram(name: "test_4", scope: !1, file: !1, line: 1, type: !5, isLocal: false, isDefinition: true, scopeLine: 1, isOptimized: false, unit: !0, retainedNodes: !2)
+!5 = !DISubroutineType(types: !2)
+!6 = !{i32 2, !"Dwarf Version", i32 4}
+!7 = !{i32 2, !"Debug Info Version", i32 3}
+!8 = !{!"clang version 3.7.0 (trunk 237249) (llvm/trunk 237261)"}
+!9 = !DILocation(line: 1, column: 15, scope: !4)
+!10 = !DILocation(line: 3, column: 15, scope: !4)
diff --git a/llvm/test/Transforms/SampleProfile/profile-inference-rebalance.ll b/llvm/test/Transforms/SampleProfile/profile-inference-rebalance.ll
new file mode 100644
index 0000000..36d2cdb
--- /dev/null
+++ b/llvm/test/Transforms/SampleProfile/profile-inference-rebalance.ll
@@ -0,0 +1,290 @@
+; RUN: opt < %s -passes=pseudo-probe,sample-profile -sample-profile-use-profi -sample-profile-file=%S/Inputs/profile-inference-rebalance.prof | opt -analyze -branch-prob -enable-new-pm=0 | FileCheck %s
+; RUN: opt < %s -passes=pseudo-probe,sample-profile -sample-profile-use-profi -sample-profile-file=%S/Inputs/profile-inference-rebalance.prof | opt -analyze -block-freq  -enable-new-pm=0 | FileCheck %s --check-prefix=CHECK2
+
+; The test contains a "dimanond" and a "triangle" that needs to be rebalanced
+; after basic profile inference.
+;
+;                  +----------------+
+;                  |    b11 [?]     |
+;                  +----------------+
+;                    |
+;                    v
+; +----------+     +----------------+
+; | b13 [10] | <-- | b12 [65536]    |
+; +----------+     +----------------+
+;                    |
+;                    v
+; +----------+     +----------------+
+; | b16 [?]  | <-- | b14 [65536]    |
+; +----------+     +----------------+
+;   |                |
+;   |                v
+;   |              +----------------+
+;   |              |    b15 [?]     |
+;   |              +----------------+
+;   |                |
+;   |                v
+;   |              +----------------+
+;   +------------> | b17 [65536]  | -+
+;                  +----------------+  |
+;                    |                 |
+;                    v                 |
+;                  +----------------+  |
+;                  |    b18 [?]     |  |
+;                  +----------------+  |
+;                    |                 |
+;                    v                 |
+;                  +----------------+  |
+;                  | b19 [65536]    | <+
+;                  +----------------+
+;                    |
+;                    v
+;                  +----------------+
+;                  | b110 [65536]   |
+;                  +----------------+
+
+@yydebug = dso_local global i32 0, align 4
+
+; Function Attrs: nounwind uwtable
+define dso_local i32 @countMultipliers(i32 %0, i32 %1) #0 {
+b11:
+  call void @llvm.pseudoprobe(i64 -5758218299531803684, i64 1, i32 0, i64 -1)
+  %cmp = icmp ne i32 %0, 0
+  br label %b12
+; CHECK2: - b11: float = {{.*}}, int = {{.*}}, count = 65546
+
+b12:
+  call void @llvm.pseudoprobe(i64 -5758218299531803684, i64 2, i32 0, i64 -1)
+  br i1 %cmp, label %b14, label %b13
+; CHECK2: - b12: float = {{.*}}, int = {{.*}}, count = 65546
+
+b13:
+  call void @llvm.pseudoprobe(i64 -5758218299531803684, i64 3, i32 0, i64 -1)
+  ret i32 %1
+; CHECK2: - b13: float = {{.*}}, int = {{.*}}, count = 10
+
+b14:
+  call void @llvm.pseudoprobe(i64 -5758218299531803684, i64 4, i32 0, i64 -1)
+  br i1 %cmp, label %b15, label %b16
+; CHECK:  edge b14 -> b15 probability is 0x40000000 / 0x80000000 = 50.00%
+; CHECK:  edge b14 -> b16 probability is 0x40000000 / 0x80000000 = 50.00%
+; CHECK2: - b14: float = {{.*}}, int = {{.*}}, count = 65536
+
+b15:
+  call void @llvm.pseudoprobe(i64 -5758218299531803684, i64 5, i32 0, i64 -1)
+  br label %b17
+; CHECK2: - b15: float = {{.*}}, int = {{.*}}, count = 32768
+
+b16:
+  call void @llvm.pseudoprobe(i64 -5758218299531803684, i64 6, i32 0, i64 -1)
+  br label %b17
+; CHECK2: - b16: float = {{.*}}, int = {{.*}}, count = 32768
+
+b17:
+  call void @llvm.pseudoprobe(i64 -5758218299531803684, i64 7, i32 0, i64 -1)
+  br i1 %cmp, label %b18, label %b19
+; CHECK:  edge b17 -> b18 probability is 0x40000000 / 0x80000000 = 50.00%
+; CHECK:  edge b17 -> b19 probability is 0x40000000 / 0x80000000 = 50.00%
+; CHECK2: - b17: float = {{.*}}, int = {{.*}}, count = 65536
+
+b18:
+  call void @llvm.pseudoprobe(i64 -5758218299531803684, i64 8, i32 0, i64 -1)
+  br label %b19
+; CHECK2: - b18: float = {{.*}}, int = {{.*}}, count = 32768
+
+b19:
+  call void @llvm.pseudoprobe(i64 -5758218299531803684, i64 9, i32 0, i64 -1)
+  br label %b110
+; CHECK2: - b19: float = {{.*}}, int = {{.*}}, count = 65536
+
+b110:
+  call void @llvm.pseudoprobe(i64 -5758218299531803684, i64 10, i32 0, i64 -1)
+  ret i32 %1
+; CHECK2: - b110: float = {{.*}}, int = {{.*}}, count = 65536
+}
+
+
+; The test contains a triangle comprised of dangling blocks.
+;
+;                     +-----------+
+;                     | b0 [2100] | -+
+;                     +-----------+  |
+;                       |            |
+;                       |            |
+;                       v            |
+;                     +-----------+  |
+;                  +- | b1 [2000] |  |
+;                  |  +-----------+  |
+;                  |    |            |
+;                  |    |            |
+;                  |    v            |
+; +--------+       |  +-----------+  |
+; | b4 [?] | <-----+- |  b2 [?]   |  |
+; +--------+       |  +-----------+  |
+;   |              |    |            |
+;   |              |    |            |
+;   |              |    v            |
+;   |              |  +-----------+  |
+;   |              +> |  b3 [?]   |  |
+;   |                 +-----------+  |
+;   |                   |            |
+;   |                   |            |
+;   |                   v            |
+;   |                 +-----------+  |
+;   +---------------> | b5 [2100] | <+
+;                     +-----------+
+
+define dso_local i32 @countMultipliers2(i32 %0, i32 %1) #0 {
+b0:
+  call void @llvm.pseudoprobe(i64 2506109673213838996, i64 1, i32 0, i64 -1)
+  %cmp = icmp ne i32 %0, 0
+  br i1 %cmp, label %b1, label %b5
+; CHECK:  edge b0 -> b1 probability is 0x79e79e7a / 0x80000000 = 95.24% [HOT edge]
+; CHECK:  edge b0 -> b5 probability is 0x06186186 / 0x80000000 = 4.76%
+; CHECK2: - b0: float = {{.*}}, int = {{.*}}, count = 2100
+
+b1:
+  call void @llvm.pseudoprobe(i64 2506109673213838996, i64 2, i32 0, i64 -1)
+  br i1 %cmp, label %b2, label %b3
+; CHECK:  edge b1 -> b2 probability is 0x40000000 / 0x80000000 = 50.00%
+; CHECK:  edge b1 -> b3 probability is 0x40000000 / 0x80000000 = 50.00%
+; CHECK2: - b1: float = {{.*}}, int = {{.*}}, count = 1973
+
+b2:
+  call void @llvm.pseudoprobe(i64 2506109673213838996, i64 3, i32 0, i64 -1)
+  br i1 %cmp, label %b3, label %b4
+; CHECK:  edge b2 -> b3 probability is 0x40000000 / 0x80000000 = 50.00%
+; CHECK:  edge b2 -> b4 probability is 0x40000000 / 0x80000000 = 50.00%
+; CHECK2: - b2: float = {{.*}}, int = {{.*}}, count = 955
+
+b3:
+  call void @llvm.pseudoprobe(i64 2506109673213838996, i64 4, i32 0, i64 -1)
+  br label %b5
+; CHECK:  edge b3 -> b5 probability is 0x80000000 / 0x80000000 = 100.00% [HOT edge]
+; CHECK2: - b3: float = {{.*}}, int = {{.*}}, count = 1527
+
+b4:
+  call void @llvm.pseudoprobe(i64 2506109673213838996, i64 5, i32 0, i64 -1)
+  br label %b5
+; CHECK:  edge b4 -> b5 probability is 0x80000000 / 0x80000000 = 100.00% [HOT edge]
+; CHECK2: - b4: float = {{.*}}, int = {{.*}}, count = 445
+
+b5:
+  call void @llvm.pseudoprobe(i64 2506109673213838996, i64 6, i32 0, i64 -1)
+  ret i32 %1
+; CHECK2: - b5: float = {{.*}}, int = {{.*}}, count = 2100
+
+}
+
+
+; The test contains a dangling subgraph that contains an exit dangling block.
+; No rebalancing is necessary here.
+;
+;                 +-----------+
+;                 | b31 [100] |
+;                 +-----------+
+;                   |
+;                   |
+;                   v
+; +---------+     +-----------+
+; | b34 [?] | <-- | b32 [100] |
+; +---------+     +-----------+
+;                   |
+;                   |
+;                   v
+;                 +-----------+
+;                 | b33 [100] |
+;                 +-----------+
+
+define dso_local i32 @countMultipliers3(i32 %0, i32 %1) #0 {
+b31:
+  call void @llvm.pseudoprobe(i64 -544905447084884130, i64 1, i32 0, i64 -1)
+  br label %b32
+; CHECK2: - b31: float = {{.*}}, int = {{.*}}, count = 100
+
+b32:
+  call void @llvm.pseudoprobe(i64 -544905447084884130, i64 2, i32 0, i64 -1)
+  %cmp = icmp ne i32 %0, 0
+  br i1 %cmp, label %b34, label %b33
+; CHECK:  edge b32 -> b34 probability is 0x00000000 / 0x80000000 = 0.00%
+; CHECK:  edge b32 -> b33 probability is 0x80000000 / 0x80000000 = 100.00% [HOT edge]
+; CHECK2: - b32: float = {{.*}}, int = {{.*}}, count = 100
+
+b33:
+  call void @llvm.pseudoprobe(i64 -544905447084884130, i64 3, i32 0, i64 -1)
+  ret i32 %1
+; CHECK2: - b33: float = {{.*}}, int = {{.*}}, count = 100
+
+b34:
+  call void @llvm.pseudoprobe(i64 -544905447084884130, i64 4, i32 0, i64 -1)
+  ret i32 %1
+; CHECK2: - b34: float = {{.*}}, int = {{.*}}, count = 0
+
+}
+
+; Another dangling subgraph (b42, b43, b44) containing a single dangling block.
+;
+;      +----------+     +-----------+
+;   +- | b42 [50] | <-- | b40 [100] |
+;   |  +----------+     +-----------+
+;   |    |                |
+;   |    |                |
+;   |    |                v
+;   |    |              +-----------+
+;   |    |              | b41 [50]  |
+;   |    |              +-----------+
+;   |    |                |
+;   |    |                |
+;   |    |                v
+;   |    |              +-----------+
+;   |    +------------> |  b43 [?]  |
+;   |                   +-----------+
+;   |                     |
+;   |                     |
+;   |                     v
+;   |                   +-----------+
+;   +-----------------> | b44 [100] |
+;                       +-----------+
+
+define dso_local i32 @countMultipliers4(i32 %0, i32 %1) #0 {
+b40:
+  call void @llvm.pseudoprobe(i64 -2989539179265513123, i64 1, i32 0, i64 -1)
+  %cmp = icmp ne i32 %0, 0
+  br i1 %cmp, label %b41, label %b42
+; CHECK2: - b40: float = {{.*}}, int = {{.*}}, count = 100
+
+b41:
+  call void @llvm.pseudoprobe(i64 -2989539179265513123, i64 2, i32 0, i64 -1)
+  br label %b43
+; CHECK2: - b41: float = {{.*}}, int = {{.*}}, count = 50
+
+b42:
+  call void @llvm.pseudoprobe(i64 -2989539179265513123, i64 3, i32 0, i64 -1)
+  br i1 %cmp, label %b43, label %b44
+; CHECK:  edge b42 -> b43 probability is 0x40000000 / 0x80000000 = 50.00%
+; CHECK:  edge b42 -> b44 probability is 0x40000000 / 0x80000000 = 50.00%
+; CHECK2: - b42: float = {{.*}}, int = {{.*}}, count = 50
+
+b43:
+  call void @llvm.pseudoprobe(i64 -2989539179265513123, i64 4, i32 0, i64 -1)
+  br label %b44
+; CHECK2: - b43: float = {{.*}}, int = {{.*}}, count = 75
+
+b44:
+  call void @llvm.pseudoprobe(i64 -2989539179265513123, i64 5, i32 0, i64 -1)
+  ret i32 %1
+; CHECK2: - b44: float = {{.*}}, int = {{.*}}, count = 100
+}
+
+; Function Attrs: inaccessiblememonly nounwind willreturn
+declare void @llvm.pseudoprobe(i64, i64, i32, i64) #4
+
+attributes #0 = { noinline nounwind uwtable "use-sample-profile" }
+attributes #4 = { inaccessiblememonly nounwind willreturn }
+
+!llvm.pseudo_probe_desc = !{!7, !8, !9, !10}
+
+!7 = !{i64 -5758218299531803684, i64 223598586707, !"countMultipliers", null}
+!8 = !{i64 2506109673213838996, i64 2235985, !"countMultipliers2", null}
+!9 = !{i64 -544905447084884130, i64 22985, !"countMultipliers3", null}
+!10 = !{i64 -2989539179265513123, i64 2298578, !"countMultipliers4", null}
diff --git a/llvm/test/Transforms/SampleProfile/profile-inference.ll b/llvm/test/Transforms/SampleProfile/profile-inference.ll
index 7f40358..7424a74 100644
--- a/llvm/test/Transforms/SampleProfile/profile-inference.ll
+++ b/llvm/test/Transforms/SampleProfile/profile-inference.ll
@@ -232,6 +232,7 @@ b9:
 }
 ; CHECK2: - b9: float = {{.*}}, int = {{.*}}, count = 5993
 
+
 declare void @llvm.pseudoprobe(i64, i64, i32, i64) #1
 
 attributes #0 = { noinline nounwind uwtable "use-sample-profile"}
-- 
2.7.4