--- /dev/null
+//===- Transforms/Utils/SampleProfileInference.h ----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This file provides the interface for the profile inference algorithm, profi.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_UTILS_SAMPLEPROFILEINFERENCE_H
+#define LLVM_TRANSFORMS_UTILS_SAMPLEPROFILEINFERENCE_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SmallVector.h"
+
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+
+namespace llvm {
+
+class BasicBlock;
+class Function;
+class MachineBasicBlock;
+class MachineFunction;
+
+namespace afdo_detail {
+
+template <class BlockT> struct TypeMap {};
+template <> struct TypeMap<BasicBlock> {
+ using BasicBlockT = BasicBlock;
+ using FunctionT = Function;
+};
+template <> struct TypeMap<MachineBasicBlock> {
+ using BasicBlockT = MachineBasicBlock;
+ using FunctionT = MachineFunction;
+};
+
+} // end namespace afdo_detail
+
+struct FlowJump;
+
+/// A wrapper of a binary basic block.
+struct FlowBlock {
+ uint64_t Index;
+ uint64_t Weight{0};
+ bool UnknownWeight{false};
+ uint64_t Flow{0};
+ bool HasSelfEdge{false};
+ std::vector<FlowJump *> SuccJumps;
+ std::vector<FlowJump *> PredJumps;
+
+ /// Check if it is the entry block in the function.
+ bool isEntry() const { return PredJumps.empty(); }
+
+ /// Check if it is an exit block in the function.
+ bool isExit() const { return SuccJumps.empty(); }
+};
+
+/// A wrapper of a jump between two basic blocks.
+struct FlowJump {
+ uint64_t Source;
+ uint64_t Target;
+ uint64_t Flow{0};
+ bool IsUnlikely{false};
+};
+
+/// A wrapper of binary function with basic blocks and jumps.
+struct FlowFunction {
+ std::vector<FlowBlock> Blocks;
+ std::vector<FlowJump> Jumps;
+ /// The index of the entry block.
+ uint64_t Entry;
+};
+
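+/// Apply the inference algorithm to a given flow function, filling in the
+/// Flow fields of its blocks and jumps. A minimal usage sketch (the two-block
+/// function below is illustrative and not taken from a real CFG):
+/// \code
+///   FlowFunction Func;
+///   Func.Blocks.resize(2);
+///   Func.Blocks[0].Index = 0;       // the entry block with a sampled weight
+///   Func.Blocks[0].Weight = 100;
+///   Func.Blocks[1].Index = 1;       // an exit block w/o a known weight
+///   Func.Blocks[1].UnknownWeight = true;
+///   FlowJump Jump;
+///   Jump.Source = 0;
+///   Jump.Target = 1;
+///   Func.Jumps.push_back(Jump);
+///   Func.Blocks[0].SuccJumps.push_back(&Func.Jumps[0]);
+///   Func.Blocks[1].PredJumps.push_back(&Func.Jumps[0]);
+///   Func.Entry = 0;
+///   applyFlowInference(Func);       // Block.Flow and Jump.Flow are now set
+/// \endcode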
+void applyFlowInference(FlowFunction &Func);
+
+/// Sample profile inference pass.
+template <typename BT> class SampleProfileInference {
+public:
+ using BasicBlockT = typename afdo_detail::TypeMap<BT>::BasicBlockT;
+ using FunctionT = typename afdo_detail::TypeMap<BT>::FunctionT;
+ using Edge = std::pair<const BasicBlockT *, const BasicBlockT *>;
+ using BlockWeightMap = DenseMap<const BasicBlockT *, uint64_t>;
+ using EdgeWeightMap = DenseMap<Edge, uint64_t>;
+ using BlockEdgeMap =
+ DenseMap<const BasicBlockT *, SmallVector<const BasicBlockT *, 8>>;
+
+ SampleProfileInference(FunctionT &F, BlockEdgeMap &Successors,
+ BlockWeightMap &SampleBlockWeights)
+ : F(F), Successors(Successors), SampleBlockWeights(SampleBlockWeights) {}
+
+ /// Apply the profile inference algorithm for a given function
+ void apply(BlockWeightMap &BlockWeights, EdgeWeightMap &EdgeWeights);
+
+private:
+ /// Try to infer branch probabilities, mimicking the implementation of
+ /// BranchProbabilityInfo. Branches that are unlikely to be taken are marked
+ /// so that the inference algorithm can avoid sending flow along the
+ /// corresponding edges.
+ void findUnlikelyJumps(const std::vector<const BasicBlockT *> &BasicBlocks,
+ BlockEdgeMap &Successors, FlowFunction &Func);
+
+ /// Determine whether the block is an exit in the CFG.
+ bool isExit(const BasicBlockT *BB);
+
+ /// Function.
+ const FunctionT &F;
+
+ /// Successors for each basic block in the CFG.
+ BlockEdgeMap &Successors;
+
+ /// Map basic blocks to their sampled weights.
+ BlockWeightMap &SampleBlockWeights;
+};
+
+template <typename BT>
+void SampleProfileInference<BT>::apply(BlockWeightMap &BlockWeights,
+ EdgeWeightMap &EdgeWeights) {
+ // Find all forward-reachable blocks, on which the inference algorithm will
+ // be applied.
+ df_iterator_default_set<const BasicBlockT *> Reachable;
+ for (auto *BB : depth_first_ext(&F, Reachable))
+ (void)BB /* Mark all reachable blocks */;
+
+ // Find all backward-reachable blocks, on which the inference algorithm will
+ // be applied.
+ df_iterator_default_set<const BasicBlockT *> InverseReachable;
+ for (const auto &BB : F) {
+ // An exit block is a block without any successors.
+ if (isExit(&BB)) {
+ for (auto *RBB : inverse_depth_first_ext(&BB, InverseReachable))
+ (void)RBB;
+ }
+ }
+
+ // Keep a stable order for reachable blocks
+ DenseMap<const BasicBlockT *, uint64_t> BlockIndex;
+ std::vector<const BasicBlockT *> BasicBlocks;
+ BlockIndex.reserve(Reachable.size());
+ BasicBlocks.reserve(Reachable.size());
+ for (const auto &BB : F) {
+ if (Reachable.count(&BB) && InverseReachable.count(&BB)) {
+ BlockIndex[&BB] = BasicBlocks.size();
+ BasicBlocks.push_back(&BB);
+ }
+ }
+
+ BlockWeights.clear();
+ EdgeWeights.clear();
+ bool HasSamples = false;
+ for (const auto *BB : BasicBlocks) {
+ auto It = SampleBlockWeights.find(BB);
+ if (It != SampleBlockWeights.end() && It->second > 0) {
+ HasSamples = true;
+ BlockWeights[BB] = It->second;
+ }
+ }
+ // Quit early for functions with a single block or ones w/o samples
+ if (BasicBlocks.size() <= 1 || !HasSamples) {
+ return;
+ }
+
+ // Create necessary objects
+ FlowFunction Func;
+ Func.Blocks.reserve(BasicBlocks.size());
+ // Create FlowBlocks
+ for (const auto *BB : BasicBlocks) {
+ FlowBlock Block;
+ if (SampleBlockWeights.find(BB) != SampleBlockWeights.end()) {
+ Block.UnknownWeight = false;
+ Block.Weight = SampleBlockWeights[BB];
+ } else {
+ Block.UnknownWeight = true;
+ Block.Weight = 0;
+ }
+ Block.Index = Func.Blocks.size();
+ Func.Blocks.push_back(Block);
+ }
+ // Create FlowJumps
+ for (const auto *BB : BasicBlocks) {
+ for (auto *Succ : Successors[BB]) {
+ if (!BlockIndex.count(Succ))
+ continue;
+ FlowJump Jump;
+ Jump.Source = BlockIndex[BB];
+ Jump.Target = BlockIndex[Succ];
+ Func.Jumps.push_back(Jump);
+ if (BB == Succ) {
+ Func.Blocks[BlockIndex[BB]].HasSelfEdge = true;
+ }
+ }
+ }
+ for (auto &Jump : Func.Jumps) {
+ Func.Blocks[Jump.Source].SuccJumps.push_back(&Jump);
+ Func.Blocks[Jump.Target].PredJumps.push_back(&Jump);
+ }
+
+ // Try to infer probabilities of jumps based on the contents of basic blocks
+ findUnlikelyJumps(BasicBlocks, Successors, Func);
+
+ // Find the entry block
+ for (size_t I = 0; I < Func.Blocks.size(); I++) {
+ if (Func.Blocks[I].isEntry()) {
+ Func.Entry = I;
+ break;
+ }
+ }
+
+ // Create and apply the inference network model.
+ applyFlowInference(Func);
+
+ // Extract the resulting block and edge weights computed by the inference
+ // algorithm.
+ for (const auto *BB : BasicBlocks) {
+ BlockWeights[BB] = Func.Blocks[BlockIndex[BB]].Flow;
+ }
+ for (auto &Jump : Func.Jumps) {
+ Edge E = std::make_pair(BasicBlocks[Jump.Source], BasicBlocks[Jump.Target]);
+ EdgeWeights[E] = Jump.Flow;
+ }
+
+#ifndef NDEBUG
+ // Unreachable blocks and edges should not have a weight.
+ for (auto &I : BlockWeights) {
+ assert(Reachable.contains(I.first));
+ assert(InverseReachable.contains(I.first));
+ }
+ for (auto &I : EdgeWeights) {
+ assert(Reachable.contains(I.first.first) &&
+ Reachable.contains(I.first.second));
+ assert(InverseReachable.contains(I.first.first) &&
+ InverseReachable.contains(I.first.second));
+ }
+#endif
+}
+
+template <typename BT>
+inline void SampleProfileInference<BT>::findUnlikelyJumps(
+ const std::vector<const BasicBlockT *> &BasicBlocks,
+ BlockEdgeMap &Successors, FlowFunction &Func) {}
+
+template <>
+inline void SampleProfileInference<BasicBlock>::findUnlikelyJumps(
+ const std::vector<const BasicBlockT *> &BasicBlocks,
+ BlockEdgeMap &Successors, FlowFunction &Func) {
+ for (auto &Jump : Func.Jumps) {
+ const auto *BB = BasicBlocks[Jump.Source];
+ const auto *Succ = BasicBlocks[Jump.Target];
+ const Instruction *TI = BB->getTerminator();
+ // If the block ends with an InvokeInst, mark the non-taken branch as
+ // unlikely; in that case, block Succ should be a landing pad.
+ if (Successors[BB].size() == 2 && Successors[BB].back() == Succ) {
+ if (isa<InvokeInst>(TI)) {
+ Jump.IsUnlikely = true;
+ }
+ }
+ const Instruction *SuccTI = Succ->getTerminator();
+ // If the target block terminates with an UnreachableInst, mark the jump
+ // as unlikely
+ if (SuccTI->getNumSuccessors() == 0) {
+ if (isa<UnreachableInst>(SuccTI)) {
+ Jump.IsUnlikely = true;
+ }
+ }
+ }
+}
+
+template <typename BT>
+inline bool SampleProfileInference<BT>::isExit(const BasicBlockT *BB) {
+ return BB->succ_empty();
+}
+
+template <>
+inline bool SampleProfileInference<BasicBlock>::isExit(const BasicBlock *BB) {
+ return succ_empty(BB);
+}
+
+} // end namespace llvm
+#endif // LLVM_TRANSFORMS_UTILS_SAMPLEPROFILEINFERENCE_H
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/GenericDomTree.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/SampleProfileInference.h"
#include "llvm/Transforms/Utils/SampleProfileLoaderBaseUtil.h"
namespace llvm {
} // end namespace afdo_detail
+extern cl::opt<bool> SampleProfileUseProfi;
+
template <typename BT> class SampleProfileLoaderBaseImpl {
public:
SampleProfileLoaderBaseImpl(std::string Name, std::string RemapName)
ArrayRef<BasicBlockT *> Descendants,
PostDominatorTreeT *DomTree);
void propagateWeights(FunctionT &F);
+ void applyProfi(FunctionT &F, BlockEdgeMap &Successors,
+ BlockWeightMap &SampleBlockWeights,
+ BlockWeightMap &BlockWeights, EdgeWeightMap &EdgeWeights);
uint64_t visitEdge(Edge E, unsigned *NumUnknownEdges, Edge *UnknownEdge);
void buildEdges(FunctionT &F);
bool propagateThroughEdges(FunctionT &F, bool UpdateBlockCount);
bool
computeAndPropagateWeights(FunctionT &F,
const DenseSet<GlobalValue::GUID> &InlinedGUIDs);
+ void initWeightPropagation(FunctionT &F,
+ const DenseSet<GlobalValue::GUID> &InlinedGUIDs);
+ void
+ finalizeWeightPropagation(FunctionT &F,
+ const DenseSet<GlobalValue::GUID> &InlinedGUIDs);
void emitCoverageRemarks(FunctionT &F);
/// Map basic blocks to their computed weights.
/// known).
template <typename BT>
void SampleProfileLoaderBaseImpl<BT>::propagateWeights(FunctionT &F) {
- bool Changed = true;
- unsigned I = 0;
-
- // If BB weight is larger than its corresponding loop's header BB weight,
- // use the BB weight to replace the loop header BB weight.
- for (auto &BI : F) {
- BasicBlockT *BB = &BI;
- LoopT *L = LI->getLoopFor(BB);
- if (!L) {
- continue;
+ // Flow-based profile inference is only usable with the BasicBlock
+ // instantiation of SampleProfileLoaderBaseImpl.
+ if (SampleProfileUseProfi) {
+ // Prepare block sample counts for inference.
+ BlockWeightMap SampleBlockWeights;
+ for (const auto &BI : F) {
+ ErrorOr<uint64_t> Weight = getBlockWeight(&BI);
+ if (Weight)
+ SampleBlockWeights[&BI] = Weight.get();
}
- BasicBlockT *Header = L->getHeader();
- if (Header && BlockWeights[BB] > BlockWeights[Header]) {
- BlockWeights[Header] = BlockWeights[BB];
+ // Fill in BlockWeights and EdgeWeights using an inference algorithm.
+ applyProfi(F, Successors, SampleBlockWeights, BlockWeights, EdgeWeights);
+ } else {
+ bool Changed = true;
+ unsigned I = 0;
+
+ // If BB weight is larger than its corresponding loop's header BB weight,
+ // use the BB weight to replace the loop header BB weight.
+ for (auto &BI : F) {
+ BasicBlockT *BB = &BI;
+ LoopT *L = LI->getLoopFor(BB);
+ if (!L) {
+ continue;
+ }
+ BasicBlockT *Header = L->getHeader();
+ if (Header && BlockWeights[BB] > BlockWeights[Header]) {
+ BlockWeights[Header] = BlockWeights[BB];
+ }
}
- }
- // Before propagation starts, build, for each block, a list of
- // unique predecessors and successors. This is necessary to handle
- // identical edges in multiway branches. Since we visit all blocks and all
- // edges of the CFG, it is cleaner to build these lists once at the start
- // of the pass.
- buildEdges(F);
+ // Propagate until we converge or we go past the iteration limit.
+ while (Changed && I++ < SampleProfileMaxPropagateIterations) {
+ Changed = propagateThroughEdges(F, false);
+ }
- // Propagate until we converge or we go past the iteration limit.
- while (Changed && I++ < SampleProfileMaxPropagateIterations) {
- Changed = propagateThroughEdges(F, false);
- }
+ // The first propagation propagates BB counts from annotated BBs to unknown
+ // BBs. The 2nd propagation pass resets edge weights, and uses all BB
+ // weights to propagate edge weights.
+ VisitedEdges.clear();
+ Changed = true;
+ while (Changed && I++ < SampleProfileMaxPropagateIterations) {
+ Changed = propagateThroughEdges(F, false);
+ }
- // The first propagation propagates BB counts from annotated BBs to unknown
- // BBs. The 2nd propagation pass resets edges weights, and use all BB weights
- // to propagate edge weights.
- VisitedEdges.clear();
- Changed = true;
- while (Changed && I++ < SampleProfileMaxPropagateIterations) {
- Changed = propagateThroughEdges(F, false);
+ // The 3rd propagation pass allows adjusting annotated BB weights that are
+ // obviously wrong.
+ Changed = true;
+ while (Changed && I++ < SampleProfileMaxPropagateIterations) {
+ Changed = propagateThroughEdges(F, true);
+ }
}
+}
- // The 3rd propagation pass allows adjust annotated BB weights that are
- // obviously wrong.
- Changed = true;
- while (Changed && I++ < SampleProfileMaxPropagateIterations) {
- Changed = propagateThroughEdges(F, true);
- }
+template <typename BT>
+void SampleProfileLoaderBaseImpl<BT>::applyProfi(
+ FunctionT &F, BlockEdgeMap &Successors, BlockWeightMap &SampleBlockWeights,
+ BlockWeightMap &BlockWeights, EdgeWeightMap &EdgeWeights) {
+ auto Infer = SampleProfileInference<BT>(F, Successors, SampleBlockWeights);
+ Infer.apply(BlockWeights, EdgeWeights);
}
/// Generate branch weight metadata for all branches in \p F.
Changed |= computeBlockWeights(F);
if (Changed) {
- // Add an entry count to the function using the samples gathered at the
- // function entry.
- // Sets the GUIDs that are inlined in the profiled binary. This is used
- // for ThinLink to make correct liveness analysis, and also make the IR
- // match the profiled binary before annotation.
- getFunction(F).setEntryCount(
- ProfileCount(Samples->getHeadSamples() + 1, Function::PCT_Real),
- &InlinedGUIDs);
+ // Initialize propagation.
+ initWeightPropagation(F, InlinedGUIDs);
+ // Propagate weights to all edges.
+ propagateWeights(F);
+
+ // Post-process propagated weights.
+ finalizeWeightPropagation(F, InlinedGUIDs);
+ }
+
+ return Changed;
+}
+
+template <typename BT>
+void SampleProfileLoaderBaseImpl<BT>::initWeightPropagation(
+ FunctionT &F, const DenseSet<GlobalValue::GUID> &InlinedGUIDs) {
+ // Add an entry count to the function using the samples gathered at the
+ // function entry.
+ // Sets the GUIDs that are inlined in the profiled binary. This is used
+ // by ThinLink to perform correct liveness analysis, and also to make the
+ // IR match the profiled binary before annotation.
+ getFunction(F).setEntryCount(
+ ProfileCount(Samples->getHeadSamples() + 1, Function::PCT_Real),
+ &InlinedGUIDs);
+
+ if (!SampleProfileUseProfi) {
// Compute dominance and loop info needed for propagation.
computeDominanceAndLoopInfo(F);
// Find equivalence classes.
findEquivalenceClasses(F);
-
- // Propagate weights to all edges.
- propagateWeights(F);
}
- return Changed;
+ // Before propagation starts, build, for each block, a list of
+ // unique predecessors and successors. This is necessary to handle
+ // identical edges in multiway branches. Since we visit all blocks and all
+ // edges of the CFG, it is cleaner to build these lists once at the start
+ // of the pass.
+ buildEdges(F);
+}
+
+template <typename BT>
+void SampleProfileLoaderBaseImpl<BT>::finalizeWeightPropagation(
+ FunctionT &F, const DenseSet<GlobalValue::GUID> &InlinedGUIDs) {
+ // If we utilize a flow-based count inference, then we trust the computed
+ // counts and set the entry count as computed by the algorithm. This is
+ // primarily done to sync the counts produced by profi and BFI inference,
+ // which uses the entry count for mass propagation.
+ // If profi produces a zero value for the entry count, we fall back to
+ // Samples->getHeadSamples() + 1 to avoid functions with a zero count.
+ if (SampleProfileUseProfi) {
+ const BasicBlockT *EntryBB = getEntryBB(&F);
+ if (BlockWeights[EntryBB] > 0) {
+ getFunction(F).setEntryCount(
+ ProfileCount(BlockWeights[EntryBB], Function::PCT_Real),
+ &InlinedGUIDs);
+ }
+ }
}
template <typename BT>
#include "llvm/Transforms/Instrumentation.h"
#include "llvm/Transforms/Utils/CallPromotionUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/SampleProfileInference.h"
#include "llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h"
#include "llvm/Transforms/Utils/SampleProfileLoaderBaseUtil.h"
#include <algorithm>
SmallVector<uint32_t, 4> Weights;
uint32_t MaxWeight = 0;
Instruction *MaxDestInst;
+ // Since profi treats multiple edges (multiway branches) as a single edge,
+ // we need to distribute the computed weight among the branches. We do
+ // this by evenly splitting the edge weight among destinations.
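+ // For example, a computed weight of 10 shared by three branches to the same
+ // destination is distributed as 4, 3, and 3, with the remainder going to
+ // the first branches (the numbers here are purely illustrative).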
+ DenseMap<const BasicBlock *, uint64_t> EdgeMultiplicity;
+ std::vector<uint64_t> EdgeIndex;
+ if (SampleProfileUseProfi) {
+ EdgeIndex.resize(TI->getNumSuccessors());
+ for (unsigned I = 0; I < TI->getNumSuccessors(); ++I) {
+ const BasicBlock *Succ = TI->getSuccessor(I);
+ EdgeIndex[I] = EdgeMultiplicity[Succ];
+ EdgeMultiplicity[Succ]++;
+ }
+ }
for (unsigned I = 0; I < TI->getNumSuccessors(); ++I) {
BasicBlock *Succ = TI->getSuccessor(I);
Edge E = std::make_pair(BB, Succ);
LLVM_DEBUG(dbgs() << " (saturated due to uint32_t overflow)");
Weight = std::numeric_limits<uint32_t>::max();
}
- // Weight is added by one to avoid propagation errors introduced by
- // 0 weights.
- Weights.push_back(static_cast<uint32_t>(Weight + 1));
+ if (!SampleProfileUseProfi) {
+ // The weight is incremented by one to avoid propagation errors introduced
+ // by zero weights.
+ Weights.push_back(static_cast<uint32_t>(Weight + 1));
+ } else {
+ // Profi creates proper weights that do not require the "+1" adjustment,
+ // but we evenly split the weight among branches with the same destination.
+ uint64_t W = Weight / EdgeMultiplicity[Succ];
+ // Round up, if needed, so that the first branches are hotter.
+ if (EdgeIndex[I] < Weight % EdgeMultiplicity[Succ])
+ W++;
+ Weights.push_back(static_cast<uint32_t>(W));
+ }
if (Weight != 0) {
if (Weight > MaxWeight) {
MaxWeight = Weight;
StripGCRelocates.cpp
SSAUpdater.cpp
SSAUpdaterBulk.cpp
+ SampleProfileInference.cpp
SampleProfileLoaderBaseUtil.cpp
SanitizerStats.cpp
SimplifyCFG.cpp
--- /dev/null
+//===- SampleProfileInference.cpp - Adjust sample profiles in the IR ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a profile inference algorithm. Given incomplete and
+// possibly imprecise block counts, the algorithm reconstructs realistic block
+// and edge counts that satisfy flow conservation rules while minimally
+// modifying the input block counts.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/SampleProfileInference.h"
+#include "llvm/Support/Debug.h"
+#include <queue>
+#include <set>
+
+using namespace llvm;
+#define DEBUG_TYPE "sample-profile-inference"
+
+namespace {
+
+/// A value indicating an infinite flow/capacity/weight of a block/edge.
+/// Not using numeric_limits<int64_t>::max(), as the values can be summed
+/// during execution.
+static constexpr int64_t INF = ((int64_t)1) << 50;
+
+/// The minimum-cost maximum flow algorithm.
+///
+/// The algorithm finds the maximum flow of minimum cost on a given (directed)
+/// network using a modified version of the classical Moore-Bellman-Ford
+/// approach. The algorithm applies a number of augmentation iterations in which
+/// flow is sent along paths of positive capacity from the source to the sink.
+/// The worst-case time complexity of the implementation is O(v(f)*m*n), where
+/// m is the number of edges, n is the number of vertices, and v(f) is the
+/// value of the maximum flow. However, the observed running time on typical
+/// instances is sub-quadratic, that is, o(n^2).
+///
+/// The input is a set of edges with specified costs and capacities, and a pair
+/// of nodes (source and sink). The output is the flow along each edge of the
+/// minimum total cost respecting the given edge capacities.
+class MinCostMaxFlow {
+public:
+ // Initialize algorithm's data structures for a network of a given size.
+ void initialize(uint64_t NodeCount, uint64_t SourceNode, uint64_t SinkNode) {
+ Source = SourceNode;
+ Target = SinkNode;
+
+ Nodes = std::vector<Node>(NodeCount);
+ Edges = std::vector<std::vector<Edge>>(NodeCount, std::vector<Edge>());
+ }
+
+ // Run the algorithm.
+ int64_t run() {
+ // Iteratively find augmenting paths and update the flow along them
+ size_t AugmentationIters = 0;
+ while (findAugmentingPath()) {
+ augmentFlowAlongPath();
+ AugmentationIters++;
+ }
+
+ // Compute the total flow and its cost
+ int64_t TotalCost = 0;
+ int64_t TotalFlow = 0;
+ for (uint64_t Src = 0; Src < Nodes.size(); Src++) {
+ for (auto &Edge : Edges[Src]) {
+ if (Edge.Flow > 0) {
+ TotalCost += Edge.Cost * Edge.Flow;
+ if (Src == Source)
+ TotalFlow += Edge.Flow;
+ }
+ }
+ }
+ LLVM_DEBUG(dbgs() << "Completed profi after " << AugmentationIters
+ << " iterations with " << TotalFlow << " total flow"
+ << " of " << TotalCost << " cost\n");
+ return TotalCost;
+ }
+
+ /// Add an edge to the network with a specified capacity and cost.
+ /// Multiple edges between a pair of nodes are allowed, but self-edges
+ /// are not supported.
+ void addEdge(uint64_t Src, uint64_t Dst, int64_t Capacity, int64_t Cost) {
+ assert(Capacity > 0 && "adding an edge of zero capacity");
+ assert(Src != Dst && "self-edges are not supported");
+
+ Edge SrcEdge;
+ SrcEdge.Dst = Dst;
+ SrcEdge.Cost = Cost;
+ SrcEdge.Capacity = Capacity;
+ SrcEdge.Flow = 0;
+ SrcEdge.RevEdgeIndex = Edges[Dst].size();
+
+ Edge DstEdge;
+ DstEdge.Dst = Src;
+ DstEdge.Cost = -Cost;
+ DstEdge.Capacity = 0;
+ DstEdge.Flow = 0;
+ DstEdge.RevEdgeIndex = Edges[Src].size();
+
+ Edges[Src].push_back(SrcEdge);
+ Edges[Dst].push_back(DstEdge);
+ }
+
+ /// Add an edge to the network with infinite capacity and a given cost.
+ void addEdge(uint64_t Src, uint64_t Dst, int64_t Cost) {
+ addEdge(Src, Dst, INF, Cost);
+ }
+
+ /// Get the total flow from a given source node.
+ /// Returns a list of pairs (target node, amount of flow to the target).
+ const std::vector<std::pair<uint64_t, int64_t>> getFlow(uint64_t Src) const {
+ std::vector<std::pair<uint64_t, int64_t>> Flow;
+ for (auto &Edge : Edges[Src]) {
+ if (Edge.Flow > 0)
+ Flow.push_back(std::make_pair(Edge.Dst, Edge.Flow));
+ }
+ return Flow;
+ }
+
+ /// Get the total flow between a pair of nodes.
+ int64_t getFlow(uint64_t Src, uint64_t Dst) const {
+ int64_t Flow = 0;
+ for (auto &Edge : Edges[Src]) {
+ if (Edge.Dst == Dst) {
+ Flow += Edge.Flow;
+ }
+ }
+ return Flow;
+ }
+
+ /// A cost of increasing a block's count by one.
+ static constexpr int64_t AuxCostInc = 10;
+ /// A cost of decreasing a block's count by one.
+ static constexpr int64_t AuxCostDec = 20;
+ /// A cost of increasing a count of zero-weight block by one.
+ static constexpr int64_t AuxCostIncZero = 11;
+ /// A cost of increasing the entry block's count by one.
+ static constexpr int64_t AuxCostIncEntry = 40;
+ /// A cost of decreasing the entry block's count by one.
+ static constexpr int64_t AuxCostDecEntry = 10;
+ /// A cost of taking an unlikely jump.
+ static constexpr int64_t AuxCostUnlikely = ((int64_t)1) << 20;
+
+private:
+ /// Check for existence of an augmenting path with a positive capacity.
+ bool findAugmentingPath() {
+ // Initialize data structures
+ for (auto &Node : Nodes) {
+ Node.Distance = INF;
+ Node.ParentNode = uint64_t(-1);
+ Node.ParentEdgeIndex = uint64_t(-1);
+ Node.Taken = false;
+ }
+
+ std::queue<uint64_t> Queue;
+ Queue.push(Source);
+ Nodes[Source].Distance = 0;
+ Nodes[Source].Taken = true;
+ while (!Queue.empty()) {
+ uint64_t Src = Queue.front();
+ Queue.pop();
+ Nodes[Src].Taken = false;
+ // Although the residual network contains edges with negative costs
+ // (in particular, backward edges), it can be shown that there are no
+ // negative-weight cycles and the following two invariants are maintained:
+ // (i) Dist[Source, V] >= 0 and (ii) Dist[V, Target] >= 0 for all nodes V,
+ // where Dist is the length of the shortest path between two nodes. This
+ // allows us to prune the search space of the path-finding algorithm using
+ // the following early-stop criteria:
+ // -- If we find a path with zero-distance from Source to Target, stop the
+ // search, as the path is the shortest since Dist[Source, Target] >= 0;
+ // -- If we have Dist[Source, V] > Dist[Source, Target], then do not
+ // process node V, as it is guaranteed _not_ to be on a shortest path
+ // from Source to Target; it follows from inequalities
+ // Dist[Source, Target] >= Dist[Source, V] + Dist[V, Target]
+ // >= Dist[Source, V]
+ if (Nodes[Target].Distance == 0)
+ break;
+ if (Nodes[Src].Distance > Nodes[Target].Distance)
+ continue;
+
+ // Process adjacent edges
+ for (uint64_t EdgeIdx = 0; EdgeIdx < Edges[Src].size(); EdgeIdx++) {
+ auto &Edge = Edges[Src][EdgeIdx];
+ if (Edge.Flow < Edge.Capacity) {
+ uint64_t Dst = Edge.Dst;
+ int64_t NewDistance = Nodes[Src].Distance + Edge.Cost;
+ if (Nodes[Dst].Distance > NewDistance) {
+ // Update the distance and the parent node/edge
+ Nodes[Dst].Distance = NewDistance;
+ Nodes[Dst].ParentNode = Src;
+ Nodes[Dst].ParentEdgeIndex = EdgeIdx;
+ // Add the node to the queue, if it is not there yet
+ if (!Nodes[Dst].Taken) {
+ Queue.push(Dst);
+ Nodes[Dst].Taken = true;
+ }
+ }
+ }
+ }
+ }
+
+ return Nodes[Target].Distance != INF;
+ }
+
+ /// Update the current flow along the augmenting path.
+ void augmentFlowAlongPath() {
+ // Find path capacity
+ int64_t PathCapacity = INF;
+ uint64_t Now = Target;
+ while (Now != Source) {
+ uint64_t Pred = Nodes[Now].ParentNode;
+ auto &Edge = Edges[Pred][Nodes[Now].ParentEdgeIndex];
+ PathCapacity = std::min(PathCapacity, Edge.Capacity - Edge.Flow);
+ Now = Pred;
+ }
+
+ assert(PathCapacity > 0 && "found incorrect augmenting path");
+
+ // Update the flow along the path
+ Now = Target;
+ while (Now != Source) {
+ uint64_t Pred = Nodes[Now].ParentNode;
+ auto &Edge = Edges[Pred][Nodes[Now].ParentEdgeIndex];
+ auto &RevEdge = Edges[Now][Edge.RevEdgeIndex];
+
+ Edge.Flow += PathCapacity;
+ RevEdge.Flow -= PathCapacity;
+
+ Now = Pred;
+ }
+ }
+
+ /// A node in a flow network.
+ struct Node {
+ /// The cost of the cheapest path from the source to the current node.
+ int64_t Distance;
+ /// The node preceding the current one in the path.
+ uint64_t ParentNode;
+ /// The index of the edge between ParentNode and the current node.
+ uint64_t ParentEdgeIndex;
+ /// An indicator of whether the current node is in a queue.
+ bool Taken;
+ };
+ /// An edge in a flow network.
+ struct Edge {
+ /// The cost of the edge.
+ int64_t Cost;
+ /// The capacity of the edge.
+ int64_t Capacity;
+ /// The current flow on the edge.
+ int64_t Flow;
+ /// The destination node of the edge.
+ uint64_t Dst;
+ /// The index of the reverse edge between Dst and the current node.
+ uint64_t RevEdgeIndex;
+ };
+
+ /// The set of network nodes.
+ std::vector<Node> Nodes;
+ /// The set of network edges.
+ std::vector<std::vector<Edge>> Edges;
+ /// Source node of the flow.
+ uint64_t Source;
+ /// Target (sink) node of the flow.
+ uint64_t Target;
+};
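+
+// A minimal sketch of how MinCostMaxFlow is driven (node ids and edge values
+// are illustrative; in profi the network is constructed by initializeNetwork
+// below):
+//   MinCostMaxFlow Network;
+//   Network.initialize(/*NodeCount=*/4, /*SourceNode=*/0, /*SinkNode=*/3);
+//   Network.addEdge(/*Src=*/0, /*Dst=*/1, /*Capacity=*/10, /*Cost=*/1);
+//   Network.addEdge(/*Src=*/1, /*Dst=*/3, /*Cost=*/2); // infinite capacity
+//   Network.run();
+//   int64_t Flow = Network.getFlow(/*Src=*/0, /*Dst=*/1); // yields 10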
+
+/// Initialize the flow network for a given function.
+///
+/// Every block is split into three nodes that are responsible for (i) an
+/// incoming flow, (ii) an outgoing flow, and (iii) penalizing an increase or
+/// reduction of the block weight.
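+///
+/// For a block B, the three nodes are indexed as Bin = 3*B, Bout = 3*B + 1,
+/// and Baux = 3*B + 2; a jump Src -> Dst is modeled as a network edge from
+/// node 3*Src + 1 to node 3*Dst.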
+void initializeNetwork(MinCostMaxFlow &Network, FlowFunction &Func) {
+ uint64_t NumBlocks = Func.Blocks.size();
+ assert(NumBlocks > 1 && "Too few blocks in a function");
+ LLVM_DEBUG(dbgs() << "Initializing profi for " << NumBlocks << " blocks\n");
+
+ // Pre-process data: make sure the entry weight is at least 1
+ if (Func.Blocks[Func.Entry].Weight == 0) {
+ Func.Blocks[Func.Entry].Weight = 1;
+ }
+ // Introduce dummy source/sink pairs to allow flow circulation.
+ // The nodes corresponding to blocks of Func have indices in the range
+ // [0..3 * NumBlocks); the dummy nodes are indexed by the next four values.
+ uint64_t S = 3 * NumBlocks;
+ uint64_t T = S + 1;
+ uint64_t S1 = S + 2;
+ uint64_t T1 = S + 3;
+
+ Network.initialize(3 * NumBlocks + 4, S1, T1);
+
+ // Create three nodes for every block of the function
+ for (uint64_t B = 0; B < NumBlocks; B++) {
+ auto &Block = Func.Blocks[B];
+ assert((!Block.UnknownWeight || Block.Weight == 0 || Block.isEntry()) &&
+ "non-zero weight of a block w/o weight except for an entry");
+
+ // Split every block into three nodes
+ uint64_t Bin = 3 * B;
+ uint64_t Bout = 3 * B + 1;
+ uint64_t Baux = 3 * B + 2;
+ if (Block.Weight > 0) {
+ Network.addEdge(S1, Bout, Block.Weight, 0);
+ Network.addEdge(Bin, T1, Block.Weight, 0);
+ }
+
+ // Edges from S and to T
+ assert((!Block.isEntry() || !Block.isExit()) &&
+ "a block cannot be an entry and an exit");
+ if (Block.isEntry()) {
+ Network.addEdge(S, Bin, 0);
+ } else if (Block.isExit()) {
+ Network.addEdge(Bout, T, 0);
+ }
+
+ // An auxiliary node to allow increase/reduction of block counts:
+ // We assume that decreasing block counts is more expensive than increasing
+ // them, and thus we set separate costs here. In the future we may want to
+ // tune the relative costs so as to maximize the quality of generated
+ // profiles.
+ int64_t AuxCostInc = MinCostMaxFlow::AuxCostInc;
+ int64_t AuxCostDec = MinCostMaxFlow::AuxCostDec;
+ if (Block.UnknownWeight) {
+ // Do not penalize changing weights of blocks w/o known profile count
+ AuxCostInc = 0;
+ AuxCostDec = 0;
+ } else {
+ // Increasing the count for "cold" blocks with zero initial count is more
+ // expensive than for "hot" ones
+ if (Block.Weight == 0) {
+ AuxCostInc = MinCostMaxFlow::AuxCostIncZero;
+ }
+ // Modifying the count of the entry block is expensive
+ if (Block.isEntry()) {
+ AuxCostInc = MinCostMaxFlow::AuxCostIncEntry;
+ AuxCostDec = MinCostMaxFlow::AuxCostDecEntry;
+ }
+ }
+ // For blocks with self-edges, do not penalize a reduction of the count,
+ // as all of the increase can be attributed to the self-edge
+ if (Block.HasSelfEdge) {
+ AuxCostDec = 0;
+ }
+
+ Network.addEdge(Bin, Baux, AuxCostInc);
+ Network.addEdge(Baux, Bout, AuxCostInc);
+ if (Block.Weight > 0) {
+ Network.addEdge(Bout, Baux, AuxCostDec);
+ Network.addEdge(Baux, Bin, AuxCostDec);
+ }
+ }
+
+ // Create a network edge for every jump
+ for (auto &Jump : Func.Jumps) {
+ uint64_t Src = Jump.Source;
+ uint64_t Dst = Jump.Target;
+ if (Src != Dst) {
+ uint64_t SrcOut = 3 * Src + 1;
+ uint64_t DstIn = 3 * Dst;
+ uint64_t Cost = Jump.IsUnlikely ? MinCostMaxFlow::AuxCostUnlikely : 0;
+ Network.addEdge(SrcOut, DstIn, Cost);
+ }
+ }
+
+ // Make sure we have a valid flow circulation
+ Network.addEdge(T, S, 0);
+}
+
+/// Extract resulting block and edge counts from the flow network.
+void extractWeights(MinCostMaxFlow &Network, FlowFunction &Func) {
+ uint64_t NumBlocks = Func.Blocks.size();
+
+ // Extract resulting block counts
+ for (uint64_t Src = 0; Src < NumBlocks; Src++) {
+ auto &Block = Func.Blocks[Src];
+ uint64_t SrcOut = 3 * Src + 1;
+ int64_t Flow = 0;
+ for (auto &Adj : Network.getFlow(SrcOut)) {
+ uint64_t DstIn = Adj.first;
+ int64_t DstFlow = Adj.second;
+ bool IsAuxNode = (DstIn < 3 * NumBlocks && DstIn % 3 == 2);
+ if (!IsAuxNode || Block.HasSelfEdge) {
+ Flow += DstFlow;
+ }
+ }
+ Block.Flow = Flow;
+ assert(Flow >= 0 && "negative block flow");
+ }
+
+ // Extract resulting jump counts
+ for (auto &Jump : Func.Jumps) {
+ uint64_t Src = Jump.Source;
+ uint64_t Dst = Jump.Target;
+ int64_t Flow = 0;
+ if (Src != Dst) {
+ uint64_t SrcOut = 3 * Src + 1;
+ uint64_t DstIn = 3 * Dst;
+ Flow = Network.getFlow(SrcOut, DstIn);
+ } else {
+ uint64_t SrcOut = 3 * Src + 1;
+ uint64_t SrcAux = 3 * Src + 2;
+ int64_t AuxFlow = Network.getFlow(SrcOut, SrcAux);
+ if (AuxFlow > 0)
+ Flow = AuxFlow;
+ }
+ Jump.Flow = Flow;
+ assert(Flow >= 0 && "negative jump flow");
+ }
+}
+
+#ifndef NDEBUG
+/// Verify that the computed flow values satisfy flow conservation rules
+void verifyWeights(const FlowFunction &Func) {
+ const uint64_t NumBlocks = Func.Blocks.size();
+ auto InFlow = std::vector<uint64_t>(NumBlocks, 0);
+ auto OutFlow = std::vector<uint64_t>(NumBlocks, 0);
+ for (auto &Jump : Func.Jumps) {
+ InFlow[Jump.Target] += Jump.Flow;
+ OutFlow[Jump.Source] += Jump.Flow;
+ }
+
+ uint64_t TotalInFlow = 0;
+ uint64_t TotalOutFlow = 0;
+ for (uint64_t I = 0; I < NumBlocks; I++) {
+ auto &Block = Func.Blocks[I];
+ if (Block.isEntry()) {
+ TotalInFlow += Block.Flow;
+ assert(Block.Flow == OutFlow[I] && "incorrectly computed control flow");
+ } else if (Block.isExit()) {
+ TotalOutFlow += Block.Flow;
+ assert(Block.Flow == InFlow[I] && "incorrectly computed control flow");
+ } else {
+ assert(Block.Flow == OutFlow[I] && "incorrectly computed control flow");
+ assert(Block.Flow == InFlow[I] && "incorrectly computed control flow");
+ }
+ }
+ assert(TotalInFlow == TotalOutFlow && "incorrectly computed control flow");
+}
+#endif
+
+} // end of anonymous namespace
+
+/// Apply the profile inference algorithm for a given flow function
+void llvm::applyFlowInference(FlowFunction &Func) {
+ // Create and apply an inference network model
+ auto InferenceNetwork = MinCostMaxFlow();
+ initializeNetwork(InferenceNetwork, Func);
+ InferenceNetwork.run();
+
+ // Extract flow values for every block and every edge
+ extractWeights(InferenceNetwork, Func);
+
+#ifndef NDEBUG
+ // Verify the result
+ verifyWeights(Func);
+#endif
+}
cl::desc("Use this option to turn off/on warnings about function with "
"samples but without debug information to use those samples. "));
+cl::opt<bool> SampleProfileUseProfi(
+ "sample-profile-use-profi", cl::init(false), cl::Hidden, cl::ZeroOrMore,
+ cl::desc("Use profi to infer block and edge counts."));
+
namespace sampleprofutil {
/// Return true if the given callsite is hot wrt to hot cutoff threshold.
--- /dev/null
+test_1:23968:0
+ 1: 100
+ 2: 60
+ 3: 40
+ !CFGChecksum: 4294967295
+
+test_2:23968:0
+ 1: 100
+ 3: 10
+ !CFGChecksum: 37753817093
+
+test_3:10000:0
+ 3: 13
+ 5: 89
+ !CFGChecksum: 69502983527
+
+sum_of_squares:23968:0
+ 2: 5993
+ 3: 1
+ 4: 5992
+ 5: 5992
+ 8: 5992
+ !CFGChecksum: 175862120757
--- /dev/null
+; RUN: opt < %s -passes=pseudo-probe,sample-profile -sample-profile-use-profi -sample-profile-file=%S/Inputs/profile-inference.prof | opt -analyze -branch-prob -enable-new-pm=0 | FileCheck %s
+; RUN: opt < %s -passes=pseudo-probe,sample-profile -sample-profile-use-profi -sample-profile-file=%S/Inputs/profile-inference.prof | opt -analyze -block-freq -enable-new-pm=0 | FileCheck %s --check-prefix=CHECK2
+
+; The test verifies that profile inference correctly builds branch probabilities
+; from sampling-based block counts.
+;
+; +---------+ +----------+
+; | b3 [40] | <-- | b1 [100] |
+; +---------+ +----------+
+; |
+; |
+; v
+; +----------+
+; | b2 [60] |
+; +----------+
+
+@yydebug = dso_local global i32 0, align 4
+
+; Function Attrs: nounwind uwtable
+define dso_local i32 @test_1() #0 {
+b1:
+ call void @llvm.pseudoprobe(i64 7964825052912775246, i64 1, i32 0, i64 -1)
+ %0 = load i32, i32* @yydebug, align 4
+ %cmp = icmp ne i32 %0, 0
+ br i1 %cmp, label %b2, label %b3
+; CHECK: edge b1 -> b2 probability is 0x4ccccccd / 0x80000000 = 60.00%
+; CHECK: edge b1 -> b3 probability is 0x33333333 / 0x80000000 = 40.00%
+; CHECK2: - b1: float = {{.*}}, int = {{.*}}, count = 100
+
+b2:
+ call void @llvm.pseudoprobe(i64 7964825052912775246, i64 2, i32 0, i64 -1)
+ ret i32 %0
+; CHECK2: - b2: float = {{.*}}, int = {{.*}}, count = 60
+
+b3:
+ call void @llvm.pseudoprobe(i64 7964825052912775246, i64 3, i32 0, i64 -1)
+ ret i32 %0
+; CHECK2: - b3: float = {{.*}}, int = {{.*}}, count = 40
+}
+
+
+; The test verifies that profile inference correctly builds branch probabilities
+; from sampling-based block counts in the presence of "dangling" probes (whose
+; block counts are missing).
+;
+; +---------+ +----------+
+; | b3 [10] | <-- | b1 [100] |
+; +---------+ +----------+
+; |
+; |
+; v
+; +----------+
+; | b2 [?] |
+; +----------+
+
+; Function Attrs: nounwind uwtable
+define dso_local i32 @test_2() #0 {
+b1:
+ call void @llvm.pseudoprobe(i64 -6216829535442445639, i64 1, i32 0, i64 -1)
+ %0 = load i32, i32* @yydebug, align 4
+ %cmp = icmp ne i32 %0, 0
+ br i1 %cmp, label %b2, label %b3
+; CHECK: edge b1 -> b2 probability is 0x73333333 / 0x80000000 = 90.00%
+; CHECK: edge b1 -> b3 probability is 0x0ccccccd / 0x80000000 = 10.00%
+; CHECK2: - b1: float = {{.*}}, int = {{.*}}, count = 100
+
+b2:
+ call void @llvm.pseudoprobe(i64 -6216829535442445639, i64 2, i32 0, i64 -1)
+ ret i32 %0
+; CHECK2: - b2: float = {{.*}}, int = {{.*}}, count = 90
+
+b3:
+ call void @llvm.pseudoprobe(i64 -6216829535442445639, i64 3, i32 0, i64 -1)
+ ret i32 %0
+}
+; CHECK2: - b3: float = {{.*}}, int = {{.*}}, count = 10
+
+
+; The test verifies that profi is able to infer block counts from hot subgraphs.
+;
+; +---------+ +---------+
+; | b4 [?] | <-- | b1 [?] |
+; +---------+ +---------+
+; | |
+; | |
+; v v
+; +---------+ +---------+
+; | b5 [89] | | b2 [?] |
+; +---------+ +---------+
+; |
+; |
+; v
+; +---------+
+; | b3 [13] |
+; +---------+
+
+; Function Attrs: nounwind uwtable
+define dso_local i32 @test_3() #0 {
+b1:
+ call void @llvm.pseudoprobe(i64 1649282507922421973, i64 1, i32 0, i64 -1)
+ %0 = load i32, i32* @yydebug, align 4
+ %cmp = icmp ne i32 %0, 0
+ br i1 %cmp, label %b2, label %b4
+; CHECK: edge b1 -> b2 probability is 0x10505050 / 0x80000000 = 12.75%
+; CHECK: edge b1 -> b4 probability is 0x6fafafb0 / 0x80000000 = 87.25%
+; CHECK2: - b1: float = {{.*}}, int = {{.*}}, count = 102
+
+b2:
+ call void @llvm.pseudoprobe(i64 1649282507922421973, i64 2, i32 0, i64 -1)
+ br label %b3
+; CHECK: edge b2 -> b3 probability is 0x80000000 / 0x80000000 = 100.00%
+; CHECK2: - b2: float = {{.*}}, int = {{.*}}, count = 13
+
+b3:
+ call void @llvm.pseudoprobe(i64 1649282507922421973, i64 3, i32 0, i64 -1)
+ ret i32 %0
+; CHECK2: - b3: float = {{.*}}, int = {{.*}}, count = 13
+
+b4:
+ call void @llvm.pseudoprobe(i64 1649282507922421973, i64 4, i32 0, i64 -1)
+ br label %b5
+; CHECK: edge b4 -> b5 probability is 0x80000000 / 0x80000000 = 100.00%
+; CHECK2: - b4: float = {{.*}}, int = {{.*}}, count = 89
+
+b5:
+ call void @llvm.pseudoprobe(i64 1649282507922421973, i64 5, i32 0, i64 -1)
+ ret i32 %0
+; CHECK2: - b5: float = {{.*}}, int = {{.*}}, count = 89
+}
+
+
+; A larger test to verify that profile inference correctly identifies hot parts
+; of the control-flow graph.
+;
+; +-----------+
+; | b1 [?] |
+; +-----------+
+; |
+; |
+; v
+; +--------+ +-----------+
+; | b3 [1] | <-- | b2 [5993] |
+; +--------+ +-----------+
+; | |
+; | |
+; | v
+; | +-----------+ +--------+
+; | | b4 [5992] | --> | b6 [?] |
+; | +-----------+ +--------+
+; | | |
+; | | |
+; | v |
+; | +-----------+ |
+; | | b5 [5992] | |
+; | +-----------+ |
+; | | |
+; | | |
+; | v |
+; | +-----------+ |
+; | | b7 [?] | |
+; | +-----------+ |
+; | | |
+; | | |
+; | v |
+; | +-----------+ |
+; | | b8 [5992] | <-----+
+; | +-----------+
+; | |
+; | |
+; | v
+; | +-----------+
+; +----------> | b9 [?] |
+; +-----------+
+
+; Function Attrs: nounwind uwtable
+define dso_local i32 @sum_of_squares() #0 {
+b1:
+ call void @llvm.pseudoprobe(i64 -907520326213521421, i64 1, i32 0, i64 -1)
+ %0 = load i32, i32* @yydebug, align 4
+ %cmp = icmp ne i32 %0, 0
+ br label %b2
+; CHECK: edge b1 -> b2 probability is 0x80000000 / 0x80000000 = 100.00%
+; CHECK2: - b1: float = {{.*}}, int = {{.*}}, count = 5993
+
+b2:
+ call void @llvm.pseudoprobe(i64 -907520326213521421, i64 2, i32 0, i64 -1)
+ br i1 %cmp, label %b4, label %b3
+; CHECK: edge b2 -> b4 probability is 0x7ffa8844 / 0x80000000 = 99.98%
+; CHECK: edge b2 -> b3 probability is 0x000577bc / 0x80000000 = 0.02%
+; CHECK2: - b2: float = {{.*}}, int = {{.*}}, count = 5993
+
+b3:
+ call void @llvm.pseudoprobe(i64 -907520326213521421, i64 3, i32 0, i64 -1)
+ br label %b9
+; CHECK: edge b3 -> b9 probability is 0x80000000 / 0x80000000 = 100.00%
+; CHECK2: - b3: float = {{.*}}, int = {{.*}}, count = 1
+
+b4:
+ call void @llvm.pseudoprobe(i64 -907520326213521421, i64 4, i32 0, i64 -1)
+ br i1 %cmp, label %b5, label %b6
+; CHECK: edge b4 -> b5 probability is 0x80000000 / 0x80000000 = 100.00%
+; CHECK: edge b4 -> b6 probability is 0x00000000 / 0x80000000 = 0.00%
+; CHECK2: - b4: float = {{.*}}, int = {{.*}}, count = 5992
+
+b5:
+ call void @llvm.pseudoprobe(i64 -907520326213521421, i64 5, i32 0, i64 -1)
+ br label %b7
+; CHECK: edge b5 -> b7 probability is 0x80000000 / 0x80000000 = 100.00%
+; CHECK2: - b5: float = {{.*}}, int = {{.*}}, count = 5992
+
+b6:
+ call void @llvm.pseudoprobe(i64 -907520326213521421, i64 6, i32 0, i64 -1)
+ br label %b8
+; CHECK: edge b6 -> b8 probability is 0x80000000 / 0x80000000 = 100.00%
+; CHECK2: - b6: float = {{.*}}, int = {{.*}}, count = 0
+
+b7:
+ call void @llvm.pseudoprobe(i64 -907520326213521421, i64 7, i32 0, i64 -1)
+ br label %b8
+; CHECK: edge b7 -> b8 probability is 0x80000000 / 0x80000000 = 100.00%
+; CHECK2: - b7: float = {{.*}}, int = {{.*}}, count = 5992
+
+b8:
+ call void @llvm.pseudoprobe(i64 -907520326213521421, i64 8, i32 0, i64 -1)
+ br label %b9
+; CHECK: edge b8 -> b9 probability is 0x80000000 / 0x80000000 = 100.00%
+; CHECK2: - b8: float = {{.*}}, int = {{.*}}, count = 5992
+
+b9:
+ call void @llvm.pseudoprobe(i64 -907520326213521421, i64 9, i32 0, i64 -1)
+ ret i32 %0
+}
+; CHECK2: - b9: float = {{.*}}, int = {{.*}}, count = 5993
+
+declare void @llvm.pseudoprobe(i64, i64, i32, i64) #1
+
+attributes #0 = { noinline nounwind uwtable "use-sample-profile"}
+attributes #1 = { nounwind }
+
+!llvm.pseudo_probe_desc = !{!6, !7, !8, !9}
+
+!6 = !{i64 7964825052912775246, i64 4294967295, !"test_1", null}
+!7 = !{i64 -6216829535442445639, i64 37753817093, !"test_2", null}
+!8 = !{i64 1649282507922421973, i64 69502983527, !"test_3", null}
+!9 = !{i64 -907520326213521421, i64 175862120757, !"sum_of_squares", null}