[BOLT] stale profile matching [part 1 out of 2]
author     spupyrev <spupyrev@fb.com>
           Thu, 16 Feb 2023 18:52:04 +0000 (10:52 -0800)
committer  spupyrev <spupyrev@fb.com>
           Tue, 6 Jun 2023 19:13:52 +0000 (12:13 -0700)
BOLT often has to deal with profiles collected on binaries built several
revisions behind the release. As a result, a certain percentage of functions
is considered stale and is not optimized. This diff adds the ability to match
the profile to functions that are not 100% binary identical, which increases
optimization coverage and boosts application performance.

The algorithm consists of two phases, matching and inference:
- In the matching phase, we try to "guess" as many block and jump counts from
  the stale profile as possible. To this end, the content of each basic block
  is hashed and stored in the (YAML) profile. When BOLT optimizes a binary,
  it computes block hashes and identifies the corresponding entries in the
  stale profile (see the sketch below). The result is a partial profile for
  every CFG in the binary.
- In the inference phase, we employ a network flow-based algorithm (profi) to
  reconstruct "realistic" block and jump counts from the partial profile
  generated in the first stage. In practice, we don't always produce a proper
  profile, but the majority (typically >90%) of CFGs get correct counts.
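
To make the matching phase concrete, below is a minimal, self-contained sketch
(plain C++; not the BOLT implementation, which operates on FlowBlocks in
StaleProfileMatching.cpp). Stale counts keyed by block content hash are
transferred to whichever blocks of the new binary still hash to the same value:

#include <cstdint>
#include <optional>
#include <unordered_map>
#include <vector>

// A stale profile: block content hash -> sampled execution count.
using StaleProfile = std::unordered_map<uint64_t, uint64_t>;

// For each block of the new binary (given by its content hash), return the
// matched count, if any; unmatched blocks are left for the inference phase.
std::vector<std::optional<uint64_t>>
matchByHash(const std::vector<uint64_t> &NewBlockHashes,
            const StaleProfile &Profile) {
  std::vector<std::optional<uint64_t>> Matched(NewBlockHashes.size());
  for (size_t I = 0; I < NewBlockHashes.size(); ++I) {
    auto It = Profile.find(NewBlockHashes[I]);
    if (It != Profile.end())
      Matched[I] = It->second; // unchanged block: reuse the stale count
  }
  return Matched;
}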

This is the first part of the change; the next stacked diff extends the block
hashing and provides performance evaluation numbers.

Reviewed By: maksfb

Differential Revision: https://reviews.llvm.org/D144500

bolt/include/bolt/Core/BinaryFunction.h
bolt/include/bolt/Profile/YAMLProfileReader.h
bolt/lib/Passes/BinaryPasses.cpp
bolt/lib/Profile/CMakeLists.txt
bolt/lib/Profile/StaleProfileMatching.cpp [new file with mode: 0644]
bolt/lib/Profile/YAMLProfileReader.cpp
bolt/test/X86/Inputs/blarge_profile_stale.yaml [new file with mode: 0644]
bolt/test/X86/reader-stale-yaml.test [new file with mode: 0644]

index 60d079c..cfc586d 100644 (file)
@@ -384,6 +384,10 @@ private:
   /// Indicates the type of profile the function is using.
   uint16_t ProfileFlags{PF_NONE};
 
+  /// True if the function's input profile data was inaccurate but has been
+  /// adjusted by the profile inference algorithm.
+  bool HasInferredProfile{false};
+
   /// For functions with mismatched profile we store all call profile
   /// information at a function level (as opposed to tying it to
   /// specific call sites).
@@ -1566,6 +1570,12 @@ public:
   /// Return flags describing a profile for this function.
   uint16_t getProfileFlags() const { return ProfileFlags; }
 
+  /// Return true if the function's input profile data was inaccurate but has
+  /// been corrected by the profile inference algorithm.
+  bool hasInferredProfile() const { return HasInferredProfile; }
+
+  void setHasInferredProfile(bool Inferred) { HasInferredProfile = Inferred; }
+
   void addCFIInstruction(uint64_t Offset, MCCFIInstruction &&Inst) {
     assert(!Instructions.empty());
 
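
The new flag is consumed like the other profile markers; a sketch of a typical
check (a code fragment, assuming a BinaryContext &BC in scope, as in
PrintProgramStats below):

  // Count functions whose profile was reconstructed by the inference
  // algorithm rather than matched exactly.
  uint64_t NumInferred = 0;
  for (auto &BFI : BC.getBinaryFunctions()) {
    const BinaryFunction &Function = BFI.second;
    if (Function.hasValidProfile() && Function.hasInferredProfile())
      ++NumInferred;
  }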
index 255ab07..f6e6021 100644 (file)
@@ -70,6 +70,10 @@ private:
   bool parseFunctionProfile(BinaryFunction &Function,
                             const yaml::bolt::BinaryFunctionProfile &YamlBF);
 
+  /// Infer function profile from stale data (collected on older binaries).
+  bool inferStaleProfile(BinaryFunction &Function,
+                         const yaml::bolt::BinaryFunctionProfile &YamlBF);
+
   /// Initialize maps for profile matching.
   void buildNameMaps(std::map<uint64_t, BinaryFunction> &Functions);
 
index 8721338..824f3e1 100644 (file)
@@ -1341,10 +1341,13 @@ void PrintProfileStats::runOnFunctions(BinaryContext &BC) {
 void PrintProgramStats::runOnFunctions(BinaryContext &BC) {
   uint64_t NumRegularFunctions = 0;
   uint64_t NumStaleProfileFunctions = 0;
+  uint64_t NumAllStaleFunctions = 0;
+  uint64_t NumInferredFunctions = 0;
   uint64_t NumNonSimpleProfiledFunctions = 0;
   uint64_t NumUnknownControlFlowFunctions = 0;
   uint64_t TotalSampleCount = 0;
   uint64_t StaleSampleCount = 0;
+  uint64_t InferredSampleCount = 0;
   std::vector<const BinaryFunction *> ProfiledFunctions;
   const char *StaleFuncsHeader = "BOLT-INFO: Functions with stale profile:\n";
   for (auto &BFI : BC.getBinaryFunctions()) {
@@ -1379,6 +1382,11 @@ void PrintProgramStats::runOnFunctions(BinaryContext &BC) {
 
     if (Function.hasValidProfile()) {
       ProfiledFunctions.push_back(&Function);
+      if (Function.hasInferredProfile()) {
+        ++NumInferredFunctions;
+        InferredSampleCount += SampleCount;
+        ++NumAllStaleFunctions;
+      }
     } else {
       if (opts::ReportStaleFuncs) {
         outs() << StaleFuncsHeader;
@@ -1387,6 +1395,7 @@ void PrintProgramStats::runOnFunctions(BinaryContext &BC) {
       }
       ++NumStaleProfileFunctions;
       StaleSampleCount += SampleCount;
+      ++NumAllStaleFunctions;
     }
   }
   BC.NumProfiledFuncs = ProfiledFunctions.size();
@@ -1433,6 +1442,16 @@ void PrintProgramStats::runOnFunctions(BinaryContext &BC) {
       exit(1);
     }
   }
+  if (NumInferredFunctions) {
+    outs() << format("BOLT-INFO: inferred profile for %d (%.2f%% of profiled, "
+                     "%.2f%% of stale) functions responsible for %.2f%% samples"
+                     " (%zu out of %zu)\n",
+                     NumInferredFunctions,
+                     100.0 * NumInferredFunctions / NumAllProfiledFunctions,
+                     100.0 * NumInferredFunctions / NumAllStaleFunctions,
+                     100.0 * InferredSampleCount / TotalSampleCount,
+                     InferredSampleCount, TotalSampleCount);
+  }
 
   if (const uint64_t NumUnusedObjects = BC.getNumUnusedProfiledObjects()) {
     outs() << "BOLT-INFO: profile for " << NumUnusedObjects
index 0b5d771..3a31a9c 100644 (file)
@@ -4,6 +4,7 @@ add_llvm_library(LLVMBOLTProfile
   DataReader.cpp
   Heatmap.cpp
   ProfileReaderBase.cpp
+  StaleProfileMatching.cpp
   YAMLProfileReader.cpp
   YAMLProfileWriter.cpp
 
@@ -11,6 +12,7 @@ add_llvm_library(LLVMBOLTProfile
 
   LINK_COMPONENTS
   Support
+  TransformUtils
   )
 
 target_link_libraries(LLVMBOLTProfile
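
TransformUtils becomes a dependency because the inference reuses LLVM's sample
profile inference utilities (llvm/Transforms/Utils/SampleProfileInference.h,
included by the new file below).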
diff --git a/bolt/lib/Profile/StaleProfileMatching.cpp b/bolt/lib/Profile/StaleProfileMatching.cpp
new file mode 100644 (file)
index 0000000..b6c18c2
--- /dev/null
@@ -0,0 +1,546 @@
+//===- bolt/Profile/StaleProfileMatching.cpp - Profile data matching   ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// BOLT often has to deal with profiles collected on binaries built several
+// revisions behind the release. As a result, a certain percentage of functions
+// is considered stale and is not optimized. This file implements the ability
+// to match the profile to functions that are not 100% binary identical, thus
+// increasing optimization coverage and boosting application performance.
+//
+// The algorithm consists of two phases, matching and inference:
+// - In the matching phase, we try to "guess" as many block and jump counts
+//   from the stale profile as possible. To this end, the content of each
+//   basic block is hashed and stored in the (YAML) profile. When BOLT
+//   optimizes a binary, it computes block hashes and identifies the
+//   corresponding entries in the stale profile. The result is a partial
+//   profile for every CFG in the binary.
+// - In the inference phase, we employ a network flow-based algorithm (profi)
+//   to reconstruct "realistic" block and jump counts from the partial profile
+//   generated in the first stage. In practice, we don't always produce a
+//   proper profile, but the majority (typically >90%) of CFGs get correct
+//   counts.
+//
+//===----------------------------------------------------------------------===//
+
+#include "bolt/Core/HashUtilities.h"
+#include "bolt/Profile/YAMLProfileReader.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/Utils/SampleProfileInference.h"
+
+#include <queue>
+
+using namespace llvm;
+
+namespace opts {
+
+extern cl::OptionCategory BoltOptCategory;
+
+cl::opt<bool>
+    InferStaleProfile("infer-stale-profile",
+                      cl::desc("Infer counts from stale profile data."),
+                      cl::init(false), cl::Hidden, cl::cat(BoltOptCategory));
+
+cl::opt<unsigned> StaleMatchingMaxFuncSize(
+    "stale-matching-max-func-size",
+    cl::desc("The maximum size of a function to consider for inference."),
+    cl::init(10000), cl::Hidden, cl::cat(BoltOptCategory));
+
+// Parameters of the profile inference algorithm. The default values are tuned
+// on several benchmarks.
+cl::opt<bool> StaleMatchingEvenFlowDistribution(
+    "stale-matching-even-flow-distribution",
+    cl::desc("Try to evenly distribute flow when there are multiple equally "
+             "likely options."),
+    cl::init(true), cl::ReallyHidden, cl::cat(BoltOptCategory));
+
+cl::opt<bool> StaleMatchingRebalanceUnknown(
+    "stale-matching-rebalance-unknown",
+    cl::desc("Evenly re-distribute flow among unknown subgraphs."),
+    cl::init(false), cl::ReallyHidden, cl::cat(BoltOptCategory));
+
+cl::opt<bool> StaleMatchingJoinIslands(
+    "stale-matching-join-islands",
+    cl::desc("Join isolated components having positive flow."), cl::init(true),
+    cl::ReallyHidden, cl::cat(BoltOptCategory));
+
+cl::opt<unsigned> StaleMatchingCostBlockInc(
+    "stale-matching-cost-block-inc",
+    cl::desc("The cost of increasing a block's count by one."), cl::init(110),
+    cl::ReallyHidden, cl::cat(BoltOptCategory));
+
+cl::opt<unsigned> StaleMatchingCostBlockDec(
+    "stale-matching-cost-block-dec",
+    cl::desc("The cost of decreasing a block's count by one."), cl::init(100),
+    cl::ReallyHidden, cl::cat(BoltOptCategory));
+
+cl::opt<unsigned> StaleMatchingCostBlockEntryInc(
+    "stale-matching-cost-block-entry-inc",
+    cl::desc("The cost of increasing the entry block's count by one."),
+    cl::init(110), cl::ReallyHidden, cl::cat(BoltOptCategory));
+
+cl::opt<unsigned> StaleMatchingCostBlockEntryDec(
+    "stale-matching-cost-block-entry-dec",
+    cl::desc("The cost of decreasing the entry block's count by one."),
+    cl::init(100), cl::ReallyHidden, cl::cat(BoltOptCategory));
+
+cl::opt<unsigned> StaleMatchingCostBlockZeroInc(
+    "stale-matching-cost-block-zero-inc",
+    cl::desc("The cost of increasing a count of zero-weight block by one."),
+    cl::init(10), cl::Hidden, cl::cat(BoltOptCategory));
+
+cl::opt<unsigned> StaleMatchingCostBlockUnknownInc(
+    "stale-matching-cost-block-unknown-inc",
+    cl::desc("The cost of increasing an unknown block's count by one."),
+    cl::init(10), cl::ReallyHidden, cl::cat(BoltOptCategory));
+
+cl::opt<unsigned> StaleMatchingCostJumpInc(
+    "stale-matching-cost-jump-inc",
+    cl::desc("The cost of increasing a jump's count by one."), cl::init(100),
+    cl::ReallyHidden, cl::cat(BoltOptCategory));
+
+cl::opt<unsigned> StaleMatchingCostJumpFTInc(
+    "stale-matching-cost-jump-ft-inc",
+    cl::desc("The cost of increasing a fall-through jump's count by one."),
+    cl::init(100), cl::ReallyHidden, cl::cat(BoltOptCategory));
+
+cl::opt<unsigned> StaleMatchingCostJumpDec(
+    "stale-matching-cost-jump-dec",
+    cl::desc("The cost of decreasing a jump's count by one."), cl::init(110),
+    cl::ReallyHidden, cl::cat(BoltOptCategory));
+
+cl::opt<unsigned> StaleMatchingCostJumpFTDec(
+    "stale-matching-cost-jump-ft-dec",
+    cl::desc("The cost of decreasing a fall-through jump's count by one."),
+    cl::init(110), cl::ReallyHidden, cl::cat(BoltOptCategory));
+
+cl::opt<unsigned> StaleMatchingCostJumpUnknownInc(
+    "stale-matching-cost-jump-unknown-inc",
+    cl::desc("The cost of increasing an unknown jump's count by one."),
+    cl::init(50), cl::ReallyHidden, cl::cat(BoltOptCategory));
+
+cl::opt<unsigned> StaleMatchingCostJumpUnknownFTInc(
+    "stale-matching-cost-jump-unknown-ft-inc",
+    cl::desc(
+        "The cost of increasing an unknown fall-through jump's count by one."),
+    cl::init(5), cl::ReallyHidden, cl::cat(BoltOptCategory));
+
+} // namespace opts
+
+namespace llvm {
+namespace bolt {
+
+/// Create a wrapper flow function to use with the profile inference algorithm,
+/// and initialize its jumps and metadata.
+FlowFunction
+createFlowFunction(const BinaryFunction::BasicBlockOrderType &BlockOrder) {
+  FlowFunction Func;
+
+  // Add a special "dummy" source so that there is always a unique entry point.
+  // Because of this extra source, every other block in the FlowFunction
+  // satisfies Block.Index == BB->getLayoutIndex() + 1
+  FlowBlock EntryBlock;
+  EntryBlock.Index = 0;
+  Func.Blocks.push_back(EntryBlock);
+
+  // Create FlowBlock for every basic block in the binary function
+  for (const BinaryBasicBlock *BB : BlockOrder) {
+    Func.Blocks.emplace_back();
+    FlowBlock &Block = Func.Blocks.back();
+    Block.Index = Func.Blocks.size() - 1;
+    (void)BB;
+    assert(Block.Index == BB->getLayoutIndex() + 1 &&
+           "incorrectly assigned basic block index");
+  }
+
+  // Create FlowJump for each jump between basic blocks in the binary function
+  std::vector<uint64_t> InDegree(Func.Blocks.size(), 0);
+  for (const BinaryBasicBlock *SrcBB : BlockOrder) {
+    std::unordered_set<const BinaryBasicBlock *> UniqueSuccs;
+    // Collect regular jumps
+    for (const BinaryBasicBlock *DstBB : SrcBB->successors()) {
+      // Ignoring parallel edges
+      if (UniqueSuccs.find(DstBB) != UniqueSuccs.end())
+        continue;
+
+      Func.Jumps.emplace_back();
+      FlowJump &Jump = Func.Jumps.back();
+      Jump.Source = SrcBB->getLayoutIndex() + 1;
+      Jump.Target = DstBB->getLayoutIndex() + 1;
+      InDegree[Jump.Target]++;
+      UniqueSuccs.insert(DstBB);
+    }
+    // Collect jumps to landing pads
+    for (const BinaryBasicBlock *DstBB : SrcBB->landing_pads()) {
+      // Ignoring parallel edges
+      if (UniqueSuccs.find(DstBB) != UniqueSuccs.end())
+        continue;
+
+      Func.Jumps.emplace_back();
+      FlowJump &Jump = Func.Jumps.back();
+      Jump.Source = SrcBB->getLayoutIndex() + 1;
+      Jump.Target = DstBB->getLayoutIndex() + 1;
+      InDegree[Jump.Target]++;
+      UniqueSuccs.insert(DstBB);
+    }
+  }
+
+  // Add dummy edges from the extra source: every entry block and every block
+  // without predecessors gets an edge from block 0; edges to non-entry blocks
+  // are marked unlikely
+  assert(InDegree[0] == 0 && "dummy entry blocks shouldn't have predecessors");
+  for (uint64_t I = 1; I < Func.Blocks.size(); I++) {
+    const BinaryBasicBlock *BB = BlockOrder[I - 1];
+    if (BB->isEntryPoint() || InDegree[I] == 0) {
+      Func.Jumps.emplace_back();
+      FlowJump &Jump = Func.Jumps.back();
+      Jump.Source = 0;
+      Jump.Target = I;
+      if (!BB->isEntryPoint())
+        Jump.IsUnlikely = true;
+    }
+  }
+
+  // Create necessary metadata for the flow function
+  for (FlowJump &Jump : Func.Jumps) {
+    Func.Blocks.at(Jump.Source).SuccJumps.push_back(&Jump);
+    Func.Blocks.at(Jump.Target).PredJumps.push_back(&Jump);
+  }
+  return Func;
+}
+
+/// Assign initial block/jump weights based on the stale profile data. The goal
+/// is to extract as much information from the stale profile as possible. Here
+/// we assume that each basic block is specified via a hash value computed from
+/// its content and the hashes of the unchanged basic blocks stay the same
+/// across different revisions of the binary.
+/// Whenever there is a count in the profile with the hash corresponding to one
+/// of the basic blocks in the binary, the count is "matched" to the block.
+/// Similarly, if both the source and the target of a count in the profile are
+/// matched to a jump in the binary, the count is recorded in CFG.
+void matchWeightsByHashes(const BinaryFunction::BasicBlockOrderType &BlockOrder,
+                          const yaml::bolt::BinaryFunctionProfile &YamlBF,
+                          FlowFunction &Func) {
+  assert(Func.Blocks.size() == BlockOrder.size() + 1);
+  // Initialize stale matcher
+  DenseMap<uint64_t, std::vector<FlowBlock *>> HashToBlocks;
+  for (uint64_t I = 0; I < BlockOrder.size(); I++) {
+    const BinaryBasicBlock *BB = BlockOrder[I];
+    assert(BB->getHash() != 0 && "empty hash of BinaryBasicBlock");
+    HashToBlocks[BB->getHash()].push_back(&Func.Blocks[I + 1]);
+  }
+
+  // Index in yaml profile => corresponding (matched) block
+  DenseMap<uint64_t, const FlowBlock *> MatchedBlocks;
+  // Match blocks from the profile to the blocks in CFG
+  for (const yaml::bolt::BinaryBasicBlockProfile &YamlBB : YamlBF.Blocks) {
+    assert(YamlBB.Hash != 0 && "empty hash of BinaryBasicBlockProfile");
+    auto It = HashToBlocks.find(YamlBB.Hash);
+    if (It != HashToBlocks.end()) {
+      const FlowBlock *MatchedBlock = It->second.front();
+      MatchedBlocks[YamlBB.Index] = MatchedBlock;
+    }
+  }
+
+  // Match jumps from the profile to the jumps from CFG
+  std::vector<uint64_t> OutWeight(Func.Blocks.size(), 0);
+  std::vector<uint64_t> InWeight(Func.Blocks.size(), 0);
+  for (const yaml::bolt::BinaryBasicBlockProfile &YamlBB : YamlBF.Blocks) {
+    for (const yaml::bolt::SuccessorInfo &YamlSI : YamlBB.Successors) {
+      if (YamlSI.Count == 0)
+        continue;
+
+      // Try to find the jump for a given (src, dst) pair from the profile and
+      // assign the jump weight based on the profile count
+      const uint64_t SrcIndex = YamlBB.Index;
+      const uint64_t DstIndex = YamlSI.Index;
+
+      const FlowBlock *MatchedSrcBlock =
+          MatchedBlocks.find(SrcIndex) != MatchedBlocks.end()
+              ? MatchedBlocks[SrcIndex]
+              : nullptr;
+      const FlowBlock *MatchedDstBlock =
+          MatchedBlocks.find(DstIndex) != MatchedBlocks.end()
+              ? MatchedBlocks[DstIndex]
+              : nullptr;
+
+      if (MatchedSrcBlock != nullptr && MatchedDstBlock != nullptr) {
+        // Find a jump between the two blocks
+        FlowJump *Jump = nullptr;
+        for (FlowJump *SuccJump : MatchedSrcBlock->SuccJumps) {
+          if (SuccJump->Target == MatchedDstBlock->Index) {
+            Jump = SuccJump;
+            break;
+          }
+        }
+        // Assign the weight, if the corresponding jump is found
+        if (Jump != nullptr) {
+          Jump->Weight = YamlSI.Count;
+          Jump->HasUnknownWeight = false;
+        }
+      }
+      // Assign the weight for the src block, if it is found
+      if (MatchedSrcBlock != nullptr)
+        OutWeight[MatchedSrcBlock->Index] += YamlSI.Count;
+      // Assign the weight for the dst block, if it is found
+      if (MatchedDstBlock != nullptr)
+        InWeight[MatchedDstBlock->Index] += YamlSI.Count;
+    }
+  }
+
+  // Assign block counts based on in-/out- jumps
+  for (FlowBlock &Block : Func.Blocks) {
+    if (OutWeight[Block.Index] == 0 && InWeight[Block.Index] == 0) {
+      assert(Block.HasUnknownWeight && "unmatched block with positive count");
+      continue;
+    }
+    Block.HasUnknownWeight = false;
+    Block.Weight = std::max(OutWeight[Block.Index], InWeight[Block.Index]);
+  }
+}
+
+/// The function finds all blocks that are unreachable from the entry block or
+/// have no path to an exit, and marks all such blocks 'cold' so that profi
+/// does not send any flow to them.
+void preprocessUnreachableBlocks(FlowFunction &Func) {
+  const uint64_t NumBlocks = Func.Blocks.size();
+
+  // Start bfs from the source
+  std::queue<uint64_t> Queue;
+  std::vector<bool> VisitedEntry(NumBlocks, false);
+  for (uint64_t I = 0; I < NumBlocks; I++) {
+    FlowBlock &Block = Func.Blocks[I];
+    if (Block.isEntry()) {
+      Queue.push(I);
+      VisitedEntry[I] = true;
+      break;
+    }
+  }
+  while (!Queue.empty()) {
+    const uint64_t Src = Queue.front();
+    Queue.pop();
+    for (FlowJump *Jump : Func.Blocks[Src].SuccJumps) {
+      const uint64_t Dst = Jump->Target;
+      if (!VisitedEntry[Dst]) {
+        Queue.push(Dst);
+        VisitedEntry[Dst] = true;
+      }
+    }
+  }
+
+  // Start bfs from all sinks
+  std::vector<bool> VisitedExit(NumBlocks, false);
+  for (uint64_t I = 0; I < NumBlocks; I++) {
+    FlowBlock &Block = Func.Blocks[I];
+    if (Block.isExit() && VisitedEntry[I]) {
+      Queue.push(I);
+      VisitedExit[I] = true;
+    }
+  }
+  while (!Queue.empty()) {
+    const uint64_t Src = Queue.front();
+    Queue.pop();
+    for (FlowJump *Jump : Func.Blocks[Src].PredJumps) {
+      const uint64_t Dst = Jump->Source;
+      if (!VisitedExit[Dst]) {
+        Queue.push(Dst);
+        VisitedExit[Dst] = true;
+      }
+    }
+  }
+
+  // Zero the weight of blocks that failed either reachability check so that
+  // no flow is sent to them
+  for (uint64_t I = 0; I < NumBlocks; I++) {
+    FlowBlock &Block = Func.Blocks[I];
+    if (Block.Weight == 0)
+      continue;
+    if (!VisitedEntry[I] || !VisitedExit[I]) {
+      Block.Weight = 0;
+      Block.HasUnknownWeight = true;
+      Block.IsUnlikely = true;
+      for (FlowJump *Jump : Block.SuccJumps) {
+        if (Jump->Source == Block.Index && Jump->Target == Block.Index) {
+          Jump->Weight = 0;
+          Jump->HasUnknownWeight = true;
+          Jump->IsUnlikely = true;
+        }
+      }
+    }
+  }
+}
+
+/// Decide if stale profile matching can be applied for a given function.
+/// Currently we skip inference for (very) large instances and for instances
+/// having "unexpected" control flow (e.g., having no sink basic blocks).
+bool canApplyInference(const FlowFunction &Func) {
+  if (Func.Blocks.size() > opts::StaleMatchingMaxFuncSize)
+    return false;
+
+  bool HasExitBlocks = llvm::any_of(
+      Func.Blocks, [&](const FlowBlock &Block) { return Block.isExit(); });
+  if (!HasExitBlocks)
+    return false;
+
+  return true;
+}
+
+/// Apply the profile inference algorithm for a given flow function.
+void applyInference(FlowFunction &Func) {
+  ProfiParams Params;
+  // Set the params from the command-line flags.
+  Params.EvenFlowDistribution = opts::StaleMatchingEvenFlowDistribution;
+  Params.RebalanceUnknown = opts::StaleMatchingRebalanceUnknown;
+  Params.JoinIslands = opts::StaleMatchingJoinIslands;
+
+  Params.CostBlockInc = opts::StaleMatchingCostBlockInc;
+  Params.CostBlockDec = opts::StaleMatchingCostBlockDec;
+  Params.CostBlockEntryInc = opts::StaleMatchingCostBlockEntryInc;
+  Params.CostBlockEntryDec = opts::StaleMatchingCostBlockEntryDec;
+  Params.CostBlockZeroInc = opts::StaleMatchingCostBlockZeroInc;
+  Params.CostBlockUnknownInc = opts::StaleMatchingCostBlockUnknownInc;
+
+  Params.CostJumpInc = opts::StaleMatchingCostJumpInc;
+  Params.CostJumpFTInc = opts::StaleMatchingCostJumpFTInc;
+  Params.CostJumpDec = opts::StaleMatchingCostJumpDec;
+  Params.CostJumpFTDec = opts::StaleMatchingCostJumpFTDec;
+  Params.CostJumpUnknownInc = opts::StaleMatchingCostJumpUnknownInc;
+  Params.CostJumpUnknownFTInc = opts::StaleMatchingCostJumpUnknownFTInc;
+
+  applyFlowInference(Params, Func);
+}
+
+/// Collect inferred counts from the flow function and update annotations in
+/// the binary function.
+void assignProfile(BinaryFunction &BF,
+                   const BinaryFunction::BasicBlockOrderType &BlockOrder,
+                   FlowFunction &Func) {
+  BinaryContext &BC = BF.getBinaryContext();
+
+  assert(Func.Blocks.size() == BlockOrder.size() + 1);
+  for (uint64_t I = 0; I < BlockOrder.size(); I++) {
+    FlowBlock &Block = Func.Blocks[I + 1];
+    BinaryBasicBlock *BB = BlockOrder[I];
+
+    // Update block's count
+    BB->setExecutionCount(Block.Flow);
+
+    // Update jump counts: (i) clean existing counts and then (ii) set new ones
+    auto BI = BB->branch_info_begin();
+    for (const BinaryBasicBlock *DstBB : BB->successors()) {
+      (void)DstBB;
+      BI->Count = 0;
+      BI->MispredictedCount = 0;
+      ++BI;
+    }
+    for (FlowJump *Jump : Block.SuccJumps) {
+      if (Jump->IsUnlikely)
+        continue;
+      if (Jump->Flow == 0)
+        continue;
+
+      BinaryBasicBlock &SuccBB = *BlockOrder[Jump->Target - 1];
+      // Check if the edge corresponds to a regular jump or a landing pad
+      if (BB->getSuccessor(SuccBB.getLabel())) {
+        BinaryBasicBlock::BinaryBranchInfo &BI = BB->getBranchInfo(SuccBB);
+        BI.Count += Jump->Flow;
+      } else {
+        BinaryBasicBlock *LP = BB->getLandingPad(SuccBB.getLabel());
+        if (LP && LP->getKnownExecutionCount() < Jump->Flow)
+          LP->setExecutionCount(Jump->Flow);
+      }
+    }
+
+    // Update call-site annotations
+    auto setOrUpdateAnnotation = [&](MCInst &Instr, StringRef Name,
+                                     uint64_t Count) {
+      if (BC.MIB->hasAnnotation(Instr, Name))
+        BC.MIB->removeAnnotation(Instr, Name);
+      // Do not add zero-count annotations
+      if (Count == 0)
+        return;
+      BC.MIB->addAnnotation(Instr, Name, Count);
+    };
+
+    for (MCInst &Instr : *BB) {
+      // Ignore pseudo instructions
+      if (BC.MIB->isPseudo(Instr))
+        continue;
+      // Ignore jump tables
+      const MCInst *LastInstr = BB->getLastNonPseudoInstr();
+      if (BC.MIB->getJumpTable(*LastInstr) && LastInstr == &Instr)
+        continue;
+
+      if (BC.MIB->isIndirectCall(Instr) || BC.MIB->isIndirectBranch(Instr)) {
+        auto &ICSP = BC.MIB->getOrCreateAnnotationAs<IndirectCallSiteProfile>(
+            Instr, "CallProfile");
+        if (!ICSP.empty()) {
+          // Try to evenly distribute the counts among the call sites
+          const uint64_t TotalCount = Block.Flow;
+          const uint64_t NumSites = ICSP.size();
+          for (uint64_t Idx = 0; Idx < ICSP.size(); Idx++) {
+            IndirectCallProfile &CSP = ICSP[Idx];
+            uint64_t CountPerSite = TotalCount / NumSites;
+            // When counts cannot be exactly distributed, increase by 1 the
+            // counts of the first (TotalCount % NumSites) call sites
+            if (Idx < TotalCount % NumSites)
+              CountPerSite++;
+            CSP.Count = CountPerSite;
+          }
+        } else {
+          ICSP.emplace_back(nullptr, Block.Flow, 0);
+        }
+      } else if (BC.MIB->getConditionalTailCall(Instr)) {
+        // We don't know exactly how many times the conditional tail call is
+        // executed; conservatively, set it to the count of the block
+        setOrUpdateAnnotation(Instr, "CTCTakenCount", Block.Flow);
+        BC.MIB->removeAnnotation(Instr, "CTCMispredCount");
+      } else if (BC.MIB->isCall(Instr)) {
+        setOrUpdateAnnotation(Instr, "Count", Block.Flow);
+      }
+    }
+  }
+
+  // Update function's execution count and mark the function inferred.
+  BF.setExecutionCount(Func.Blocks[0].Flow);
+  BF.setHasInferredProfile(true);
+}
+
+bool YAMLProfileReader::inferStaleProfile(
+    BinaryFunction &BF, const yaml::bolt::BinaryFunctionProfile &YamlBF) {
+  // Make sure that block indices and hashes are up to date
+  BF.getLayout().updateLayoutIndices();
+  BF.computeBlockHashes();
+
+  const BinaryFunction::BasicBlockOrderType BlockOrder(
+      BF.getLayout().block_begin(), BF.getLayout().block_end());
+
+  // Create a wrapper flow function to use with the profile inference algorithm
+  FlowFunction Func = createFlowFunction(BlockOrder);
+
+  // Match as many block/jump counts from the stale profile as possible
+  matchWeightsByHashes(BlockOrder, YamlBF, Func);
+
+  // Adjust the flow function by marking unreachable blocks Unlikely so that
+  // they don't get any counts assigned
+  preprocessUnreachableBlocks(Func);
+
+  // Check if profile inference can be applied for the instance
+  if (!canApplyInference(Func))
+    return false;
+
+  // Apply the profile inference algorithm
+  applyInference(Func);
+
+  // Collect inferred counts and update function annotations
+  assignProfile(BF, BlockOrder, Func);
+
+  // As of now, we always mark the binary function as having a "correct"
+  // profile. In the future, we may discard the results for instances with
+  // poor inference metrics and keep such functions unoptimized.
+  return true;
+}
+
+} // end namespace bolt
+} // end namespace llvm
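
For reference, here is a minimal stand-alone use of the underlying profi API
(FlowFunction, ProfiParams, and applyFlowInference come from
llvm/Transforms/Utils/SampleProfileInference.h, exactly as used above). This is
a sketch with illustrative parameter values, not BOLT code: a three-block chain
in which only the middle block kept a matched count, so inference has to
propagate the flow along the whole chain.

#include "llvm/Transforms/Utils/SampleProfileInference.h"

using namespace llvm;

void runToyInference() {
  // Build a chain: block 0 (entry) -> block 1 -> block 2 (exit).
  FlowFunction Func;
  Func.Blocks.resize(3);
  for (uint64_t I = 0; I < 3; I++)
    Func.Blocks[I].Index = I;
  // Only the middle block has a known (matched) weight.
  Func.Blocks[1].Weight = 100;
  Func.Blocks[1].HasUnknownWeight = false;
  // Jumps 0->1 and 1->2; their weights are unknown and will be inferred.
  Func.Jumps.resize(2);
  for (uint64_t I = 0; I < 2; I++) {
    Func.Jumps[I].Source = I;
    Func.Jumps[I].Target = I + 1;
    Func.Blocks[I].SuccJumps.push_back(&Func.Jumps[I]);
    Func.Blocks[I + 1].PredJumps.push_back(&Func.Jumps[I]);
  }

  ProfiParams Params;
  // Mirror a few of the defaults exposed by the flags above.
  Params.EvenFlowDistribution = true;
  Params.CostBlockInc = 110;
  Params.CostBlockDec = 100;
  applyFlowInference(Params, Func);
  // After inference, Blocks[I].Flow and Jumps[I].Flow hold the reconstructed
  // counts; for this chain, each is expected to be 100.
}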
index 0e4ab57..3fb86c0 100644 (file)
@@ -20,6 +20,7 @@ namespace opts {
 
 extern cl::opt<unsigned> Verbosity;
 extern cl::OptionCategory BoltOptCategory;
+extern cl::opt<bool> InferStaleProfile;
 
 static llvm::cl::opt<bool>
     IgnoreHash("profile-ignore-hash",
@@ -239,6 +240,16 @@ bool YAMLProfileReader::parseFunctionProfile(
            << MismatchedCalls << " calls, and " << MismatchedEdges
            << " edges in profile did not match function " << BF << '\n';
 
+  if (!ProfileMatched && opts::InferStaleProfile) {
+    if (opts::Verbosity >= 1)
+      outs() << "BOLT-INFO: applying profile inference for "
+             << "\"" << BF.getPrintName() << "\"\n";
+    if (inferStaleProfile(BF, YamlBF)) {
+      ProfileMatched = true;
+      BF.markProfiled(YamlBP.Header.Flags);
+    }
+  }
+
   return ProfileMatched;
 }
 
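
With this change, inference remains opt-in: it only runs when exact matching
fails and --infer-stale-profile is set. A typical invocation (paths are
hypothetical; --b points at the YAML profile, as in the test's RUN line below):

  llvm-bolt app.exe -o app.bolt.exe --b profile.yaml --infer-stale-profile=1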
diff --git a/bolt/test/X86/Inputs/blarge_profile_stale.yaml b/bolt/test/X86/Inputs/blarge_profile_stale.yaml
new file mode 100644 (file)
index 0000000..62a4bbc
--- /dev/null
@@ -0,0 +1,51 @@
+---
+header:
+  profile-version: 1
+  binary-name:     'reader-yaml.test.tmp.exe'
+  binary-build-id: '<unknown>'
+  profile-flags:   [ lbr ]
+  profile-origin:  branch profile reader
+  profile-events:  ''
+functions:
+  - name:            SolveCubic
+    fid:             6
+    hash:            0xC6E9098E973BBE19
+    exec:            151
+    nblocks:         18
+    blocks:
+      - bid:             0
+        insns:           43
+        hash:            0xD2411AC186118199
+        exec:            151
+        succ:            [ { bid: 1, cnt: 4, mis: 2 }, { bid: 11, cnt: 0 } ]
+      - bid:             1
+        insns:           7
+        hash:            0xDF0C9CC1FEAA70C3
+        succ:            [ { bid: 10, cnt: 0 }, { bid: 2, cnt: 0 } ]
+      - bid:             13
+        insns:           26
+        hash:            0xF05DC5524E99E56F
+        succ:            [ { bid: 15, cnt: 89 }, { bid: 14, cnt: 0 } ]
+      - bid:             15
+        insns:           9
+        hash:            0xB2E8338276A9834E
+  - name:            usqrt
+    fid:             7
+    hash:            0x8B62B1F9AD81EA35
+    exec:            20
+    nblocks:         6
+    blocks:
+      - bid:             0
+        insns:           4
+        hash:            0xE3FEB842A6548CCF
+        exec:            20
+        succ:            [ { bid: 1, cnt: 0 } ]
+      - bid:             1
+        insns:           9
+        hash:            0x85948FF2924613B7
+        succ:            [ { bid: 3, cnt: 320, mis: 171 }, { bid: 2, cnt: 0 } ]
+      - bid:             3
+        insns:           2
+        hash:            0x41D8DB2D2B01F411
+        succ:            [ { bid: 1, cnt: 300, mis: 33 }, { bid: 4, cnt: 20 } ]
+...
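
For reference, the per-block fields in the profile above: bid is the block
index, insns the number of instructions, hash the block content hash used for
matching, exec the block execution count, and succ the list of successor edges
with taken (cnt) and mispredicted (mis) counts.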
diff --git a/bolt/test/X86/reader-stale-yaml.test b/bolt/test/X86/reader-stale-yaml.test
new file mode 100644 (file)
index 0000000..47b25fc
--- /dev/null
@@ -0,0 +1,38 @@
+# This test checks that the YAML profile reader in llvm-bolt reads data
+# correctly and that stale profile data is corrected by inference.
+
+RUN: yaml2obj %p/Inputs/blarge.yaml &> %t.exe
+RUN: llvm-bolt %t.exe -o /dev/null --b %p/Inputs/blarge_profile_stale.yaml --print-cfg --print-only=usqrt --infer-stale-profile=1 --profile-ignore-hash=1 \
+RUN:   2>&1 | FileCheck %s -check-prefix=CHECK
+
+# Verify that the YAML reader works as expected.
+CHECK:  pre-processing profile using YAML profile reader
+
+# Verify the inferred counts for "usqrt", which has a stale profile:
+#   - the profile specifies nblocks=6 while the binary has only 5 blocks,
+#     which makes the profile stale
+#   - the block with bid=0 has an incorrect (missing) count, which is inferred
+CHECK:    Binary Function "usqrt" after building cfg {
+CHECK:      State       : CFG constructed
+CHECK:      Address     : 0x401170
+CHECK:      Size        : 0x43
+CHECK:      Section     : .text
+CHECK:      IsSimple    : 1
+CHECK:      BB Count    : 5
+CHECK:      Exec Count  : 20
+CHECK:      Branch Count: 640
+CHECK:    }
+
+# Verify block counts.
+CHECK:    .LBB01 (4 instructions, align : 1)
+CHECK:      Successors: .Ltmp[[#BB13:]] (mispreds: 0, count: 20)
+CHECK:    .Ltmp[[#BB13]] (9 instructions, align : 1)
+CHECK:      Successors: .Ltmp[[#BB12:]] (mispreds: 0, count: 320), .LFT[[#BB0:]] (mispreds: 0, count: 0)
+CHECK:    .LFT[[#BB0]] (2 instructions, align : 1)
+CHECK:      Successors: .Ltmp[[#BB12]] (mispreds: 0, count: 0)
+CHECK:    .Ltmp[[#BB12]] (2 instructions, align : 1)
+CHECK:      Successors: .Ltmp[[#BB13]] (mispreds: 0, count: 300), .LFT[[#BB1:]] (mispreds: 0, count: 20)
+CHECK:    .LFT[[#BB1]] (2 instructions, align : 1)
+
+# Check the overall inference stats.
+CHECK:  2 out of 7 functions in the binary (28.6%) have non-empty execution profile
+CHECK:  inferred profile for 1 (50.00% of profiled, 100.00% of stale) functions responsible for 87.31% of samples (640 out of 733)