From 2316a10fe59a7839b705dace0724529fd22d9f99 Mon Sep 17 00:00:00 2001 From: spupyrev Date: Wed, 22 Mar 2023 14:18:03 -0700 Subject: [PATCH] [BOLT] stale profile matching [part 2 out of 2] MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit This is a first "serious" version of stale profile matching in BOLT. This diff extends the hash computation for basic blocks so that we can apply fuzzy hash-based matching. The idea is to compute several "versions" of a hash value for a basic block. A loose version of a hash (computed by ignoring instruction operands) allows matching blocks in functions whose content has been changed, while stricter hash values (considering instruction opcodes with operands and even based on hashes of block's successors/predecessors) allow resolving collisions. In order to save space and build time, individual hash components are blended into a single uint64_t. There are likely numerous ways of improving hash computation, but already this simple variant provides significant perf benefits. **Perf testing** on the clang binary: collecting data on clang-10 and using it to optimize clang-11 (with ~1 year of commits in between). 
Next, we compare - //stale_clang// (clang-11 optimized with profile collected on clang-10 with **infer-stale-profile=0**) - //opt_clang// (clang-11 optimized with profile collected on clang-11) - //infer_clang// (clang-11 optimized with profile collected on clang-10 with **infer-stale-profile=1**) `LTO-only` mode: //stale_clang// vs //opt_clang//: task-clock [delta(%): 9.4252 ± 1.6582, p-value: 0.000002] (That is, there is a ~9.5% perf regression) //infer_clang// vs //opt_clang//: task-clock [delta(%): 2.1834 ± 1.8158, p-value: 0.040702] (That is, the regression is reduced to ~2%) Related BOLT logs: ``` BOLT-INFO: identified 2114 (18.61%) stale functions responsible for 30.96% samples BOLT-INFO: inferred profile for 2101 (18.52% of all profiled) functions responsible for 30.95% samples ``` `LTO+AutoFDO` mode: //stale_clang// vs //opt_clang//: task-clock [delta(%): 19.1293 ± 1.4131, p-value: 0.000002] //infer_clang// vs //opt_clang//: task-clock [delta(%): 7.4364 ± 1.3343, p-value: 0.000002] Related BOLT logs: ``` BOLT-INFO: identified 5452 (50.27%) stale functions responsible for 85.34% samples BOLT-INFO: inferred profile for 5442 (50.23% of all profiled) functions responsible for 85.33% samples ``` Reviewed By: Amir Differential Revision: https://reviews.llvm.org/D146661 --- bolt/lib/Core/BinaryFunction.cpp | 8 - bolt/lib/Profile/StaleProfileMatching.cpp | 200 ++++++++++++++++++++++++- bolt/test/X86/Inputs/blarge_profile_stale.yaml | 6 +- 3 files changed, 197 insertions(+), 17 deletions(-) diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp index cef35f1..d296d11 100644 --- a/bolt/lib/Core/BinaryFunction.cpp +++ b/bolt/lib/Core/BinaryFunction.cpp @@ -3611,14 +3611,6 @@ size_t BinaryFunction::computeHash(bool UseDFS, return Hash = std::hash{}(HashString); } -void BinaryFunction::computeBlockHashes() const { - for (const BinaryBasicBlock *BB : BasicBlocks) { - std::string Hash = - hashBlock(BC, *BB, [](const MCOperand &Op) { return 
std::string(); }); - BB->setHash(std::hash{}(Hash)); - } -} - void BinaryFunction::insertBasicBlocks( BinaryBasicBlock *Start, std::vector> &&NewBBs, diff --git a/bolt/lib/Profile/StaleProfileMatching.cpp b/bolt/lib/Profile/StaleProfileMatching.cpp index b6c18c2..f7bd193 100644 --- a/bolt/lib/Profile/StaleProfileMatching.cpp +++ b/bolt/lib/Profile/StaleProfileMatching.cpp @@ -33,6 +33,9 @@ #include +#undef DEBUG_TYPE +#define DEBUG_TYPE "bolt-prof" + using namespace llvm; namespace opts { @@ -133,6 +136,176 @@ cl::opt StaleMatchingCostJumpUnknownFTInc( namespace llvm { namespace bolt { +/// An object wrapping several components of a basic block hash. The combined +/// (blended) hash is represented and stored as one uint64_t, while individual +/// components are of smaller size (e.g., uint16_t or uint8_t). +struct BlendedBlockHash { +private: + static uint64_t combineHashes(uint16_t Hash1, uint16_t Hash2, uint16_t Hash3, + uint16_t Hash4) { + uint64_t Hash = 0; + + Hash |= uint64_t(Hash4); + Hash <<= 16; + + Hash |= uint64_t(Hash3); + Hash <<= 16; + + Hash |= uint64_t(Hash2); + Hash <<= 16; + + Hash |= uint64_t(Hash1); + + return Hash; + } + + static void parseHashes(uint64_t Hash, uint16_t &Hash1, uint16_t &Hash2, + uint16_t &Hash3, uint16_t &Hash4) { + Hash1 = Hash & 0xffff; + Hash >>= 16; + + Hash2 = Hash & 0xffff; + Hash >>= 16; + + Hash3 = Hash & 0xffff; + Hash >>= 16; + + Hash4 = Hash & 0xffff; + Hash >>= 16; + } + +public: + explicit BlendedBlockHash() {} + + explicit BlendedBlockHash(uint64_t CombinedHash) { + parseHashes(CombinedHash, Offset, OpcodeHash, InstrHash, NeighborHash); + } + + /// Combine the blended hash into uint64_t. + uint64_t combine() const { + return combineHashes(Offset, OpcodeHash, InstrHash, NeighborHash); + } + + /// Compute a distance between two given blended hashes. The smaller the + /// distance, the more similar two blocks are. For identical basic blocks, + /// the distance is zero. 
+ uint64_t distance(const BlendedBlockHash &BBH) const { + assert(OpcodeHash == BBH.OpcodeHash && + "incorrect blended hash distance computation"); + uint64_t Dist = 0; + // Account for NeighborHash + Dist += NeighborHash == BBH.NeighborHash ? 0 : 1; + Dist <<= 16; + // Account for InstrHash + Dist += InstrHash == BBH.InstrHash ? 0 : 1; + Dist <<= 16; + // Account for Offset + Dist += (Offset >= BBH.Offset ? Offset - BBH.Offset : BBH.Offset - Offset); + return Dist; + } + + /// The offset of the basic block from the function start. + uint16_t Offset{0}; + /// (Loose) Hash of the basic block instructions, excluding operands. + uint16_t OpcodeHash{0}; + /// (Strong) Hash of the basic block instructions, including opcodes and + /// operands. + uint16_t InstrHash{0}; + /// Hash of the (loose) basic block together with (loose) hashes of its + /// successors and predecessors. + uint16_t NeighborHash{0}; +}; + +/// The object is used to identify and match basic blocks in a BinaryFunction +/// given their hashes computed on a binary built from several revisions behind +/// release. +class StaleMatcher { +public: + /// Initialize stale matcher. + void init(const std::vector &Blocks, + const std::vector &Hashes) { + assert(Blocks.size() == Hashes.size() && + "incorrect matcher initialization"); + for (size_t I = 0; I < Blocks.size(); I++) { + FlowBlock *Block = Blocks[I]; + uint16_t OpHash = Hashes[I].OpcodeHash; + OpHashToBlocks[OpHash].push_back(std::make_pair(Hashes[I], Block)); + } + } + + /// Find the most similar block for a given hash. 
+ const FlowBlock *matchBlock(BlendedBlockHash BlendedHash) const { + auto BlockIt = OpHashToBlocks.find(BlendedHash.OpcodeHash); + if (BlockIt == OpHashToBlocks.end()) { + return nullptr; + } + FlowBlock *BestBlock = nullptr; + uint64_t BestDist = std::numeric_limits::max(); + for (auto It : BlockIt->second) { + FlowBlock *Block = It.second; + BlendedBlockHash Hash = It.first; + uint64_t Dist = Hash.distance(BlendedHash); + if (BestBlock == nullptr || Dist < BestDist) { + BestDist = Dist; + BestBlock = Block; + } + } + return BestBlock; + } + +private: + using HashBlockPairType = std::pair; + std::unordered_map> OpHashToBlocks; +}; + +void BinaryFunction::computeBlockHashes() const { + if (size() == 0) + return; + + assert(hasCFG() && "the function is expected to have CFG"); + + std::vector BlendedHashes(BasicBlocks.size()); + std::vector OpcodeHashes(BasicBlocks.size()); + // Initialize hash components + for (size_t I = 0; I < BasicBlocks.size(); I++) { + const BinaryBasicBlock *BB = BasicBlocks[I]; + assert(BB->getIndex() == I && "incorrect block index"); + BlendedHashes[I].Offset = BB->getOffset(); + // Hashing complete instructions + std::string InstrHashStr = hashBlock( + BC, *BB, [&](const MCOperand &Op) { return hashInstOperand(BC, Op); }); + uint64_t InstrHash = std::hash{}(InstrHashStr); + BlendedHashes[I].InstrHash = hash_64_to_16(InstrHash); + // Hashing opcodes + std::string OpcodeHashStr = + hashBlock(BC, *BB, [](const MCOperand &Op) { return std::string(); }); + OpcodeHashes[I] = std::hash{}(OpcodeHashStr); + BlendedHashes[I].OpcodeHash = hash_64_to_16(OpcodeHashes[I]); + } + + // Initialize neighbor hash + for (size_t I = 0; I < BasicBlocks.size(); I++) { + const BinaryBasicBlock *BB = BasicBlocks[I]; + uint64_t Hash = OpcodeHashes[I]; + // Append hashes of successors + for (BinaryBasicBlock *SuccBB : BB->successors()) { + uint64_t SuccHash = OpcodeHashes[SuccBB->getIndex()]; + Hash = hashing::detail::hash_16_bytes(Hash, SuccHash); + } + // Append 
hashes of predecessors + for (BinaryBasicBlock *PredBB : BB->predecessors()) { + uint64_t PredHash = OpcodeHashes[PredBB->getIndex()]; + Hash = hashing::detail::hash_16_bytes(Hash, PredHash); + } + BlendedHashes[I].NeighborHash = hash_64_to_16(Hash); + } + + // Assign hashes + for (size_t I = 0; I < BasicBlocks.size(); I++) { + const BinaryBasicBlock *BB = BasicBlocks[I]; + BB->setHash(BlendedHashes[I].combine()); + } +} /// Create a wrapper flow function to use with the profile inference algorithm, /// and initialize its jumps and metadata. FlowFunction @@ -224,23 +397,38 @@ void matchWeightsByHashes(const BinaryFunction::BasicBlockOrderType &BlockOrder, const yaml::bolt::BinaryFunctionProfile &YamlBF, FlowFunction &Func) { assert(Func.Blocks.size() == BlockOrder.size() + 1); - // Initialize stale matcher - DenseMap> HashToBlocks; + + std::vector Blocks; + std::vector BlendedHashes; for (uint64_t I = 0; I < BlockOrder.size(); I++) { const BinaryBasicBlock *BB = BlockOrder[I]; assert(BB->getHash() != 0 && "empty hash of BinaryBasicBlock"); - HashToBlocks[BB->getHash()].push_back(&Func.Blocks[I + 1]); + Blocks.push_back(&Func.Blocks[I + 1]); + BlendedBlockHash BlendedHash(BB->getHash()); + BlendedHashes.push_back(BlendedHash); + LLVM_DEBUG(dbgs() << "BB with index " << I << " has hash = " + << Twine::utohexstr(BB->getHash()) << "\n"); } + StaleMatcher Matcher; + Matcher.init(Blocks, BlendedHashes); // Index in yaml profile => corresponding (matched) block DenseMap MatchedBlocks; // Match blocks from the profile to the blocks in CFG for (const yaml::bolt::BinaryBasicBlockProfile &YamlBB : YamlBF.Blocks) { assert(YamlBB.Hash != 0 && "empty hash of BinaryBasicBlockProfile"); - auto It = HashToBlocks.find(YamlBB.Hash); - if (It != HashToBlocks.end()) { - const FlowBlock *MatchedBlock = It->second.front(); + BlendedBlockHash BlendedHash(YamlBB.Hash); + const FlowBlock *MatchedBlock = Matcher.matchBlock(BlendedHash); + if (MatchedBlock != nullptr) { 
MatchedBlocks[YamlBB.Index] = MatchedBlock; + LLVM_DEBUG(dbgs() << "Matched yaml block with bid = " << YamlBB.Index + << " and hash = " << Twine::utohexstr(YamlBB.Hash) + << " to BB with index = " << MatchedBlock->Index - 1 + << "\n"); + } else { + LLVM_DEBUG( + dbgs() << "Couldn't match yaml block with bid = " << YamlBB.Index + << " and hash = " << Twine::utohexstr(YamlBB.Hash) << "\n"); } } diff --git a/bolt/test/X86/Inputs/blarge_profile_stale.yaml b/bolt/test/X86/Inputs/blarge_profile_stale.yaml index 62a4bbc..afe76ed 100644 --- a/bolt/test/X86/Inputs/blarge_profile_stale.yaml +++ b/bolt/test/X86/Inputs/blarge_profile_stale.yaml @@ -37,15 +37,15 @@ functions: blocks: - bid: 0 insns: 4 - hash: 0xE3FEB842A6548CCF + hash: 0xb1e5b76571270000 exec: 20 succ: [ { bid: 1, cnt: 0 } ] - bid: 1 insns: 9 - hash: 0x85948FF2924613B7 + hash: 0x587e93788b970010 succ: [ { bid: 3, cnt: 320, mis: 171 }, { bid: 2, cnt: 0 } ] - bid: 3 insns: 2 - hash: 0x41D8DB2D2B01F411 + hash: 0x20e605d745e50039 succ: [ { bid: 1, cnt: 300, mis: 33 }, { bid: 4, cnt: 20 } ] ... -- 2.7.4