From a993720397ea316ac2866d2354d6fe6b4e97169a Mon Sep 17 00:00:00 2001 From: Vedant Kumar Date: Thu, 20 Feb 2020 18:23:01 -0800 Subject: [PATCH] [LiveDebugValues] Encode register location within VarLoc IDs [3/3] This is part 3 of a 3-part series to address a compile-time explosion issue in LiveDebugValues. --- Start encoding register locations within VarLoc IDs, and take advantage of this encoding to speed up transferRegisterDef. There is no fundamental algorithmic change: this patch simply swaps out SparseBitVector in favor of CoalescingBitVector. That changes iteration order (hence the test updates), but otherwise this patch is NFCI. The only interesting change is in transferRegisterDef. Instead of doing: ``` KillSet = {} for (ID : OpenRanges.getVarLocs()) if (DeadRegs.count(ID)) KillSet.add(ID) ``` We now do: ``` KillSet = {} for (Reg : DeadRegs) for (ID : intervalsReservedForReg(Reg, OpenRanges.getVarLocs())) KillSet.add(ID) ``` By not visiting each open location every time we visit an instruction, this eliminates some potentially quadratic behavior. The new implementation basically does a constant amount of work per instruction because the interval map lookups are very fast. For a file in WebKit, this brings the time spent in LiveDebugValues down from ~2.5 minutes to 4 seconds, reducing compile time spent in that pass from 28% of the total to just over 1%. 
Before: ``` 2.49 min 27.8% 0 s LiveDebugValues::process 2.41 min 27.0% 5.40 s LiveDebugValues::transferRegisterDef 1.51 min 16.9% 1.51 min LiveDebugValues::VarLoc::isDescribedByReg() const 32.73 s 6.1% 8.70 s llvm::SparseBitVector<128u>::SparseBitVectorIterator::operator++() ``` After: ``` 4.53 s 1.1% 0 s LiveDebugValues::process 3.00 s 0.7% 107.00 ms LiveDebugValues::transferRegisterCopy 892.00 ms 0.2% 406.00 ms LiveDebugValues::transferSpillOrRestoreInst 404.00 ms 0.1% 32.00 ms LiveDebugValues::transferRegisterDef 110.00 ms 0.0% 2.00 ms LiveDebugValues::getUsedRegs 57.00 ms 0.0% 1.00 ms std::__1::vector<>::push_back 40.00 ms 0.0% 1.00 ms llvm::CoalescingBitVector<>::find(unsigned long long) ``` FWIW, I tried the same approach using SparseBitVector, but got bad results. To do that, I had to extend SparseBitVector to support 64-bit indices and expose its lower bound operation. The problem with this is that the performance is very hard to predict: SparseBitVector's lower bound operation falls back to O(n) linear scans in a std::list if you're not /very/ careful about managing iteration order. When I profiled this the performance looked worse than the baseline. You can see the full CoalescingBitVector-based implementation here: https://github.com/vedantk/llvm-project/commits/try-coalescing You can see the full SparseBitVector-based implementation here: https://github.com/vedantk/llvm-project/commits/try-sparsebitvec-find Depends on D74984 and D74985. 
Differential Revision: https://reviews.llvm.org/D74986 --- llvm/lib/CodeGen/LiveDebugValues.cpp | 227 ++++++++++++++------- .../DebugInfo/MIR/X86/entry-values-diamond-bbs.mir | 5 +- .../MIR/X86/multiple-param-dbg-value-entry.mir | 2 +- 3 files changed, 163 insertions(+), 71 deletions(-) diff --git a/llvm/lib/CodeGen/LiveDebugValues.cpp b/llvm/lib/CodeGen/LiveDebugValues.cpp index f294969..ad54378 100644 --- a/llvm/lib/CodeGen/LiveDebugValues.cpp +++ b/llvm/lib/CodeGen/LiveDebugValues.cpp @@ -26,12 +26,12 @@ /// //===----------------------------------------------------------------------===// +#include "llvm/ADT/CoalescingBitVector.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/SparseBitVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/UniqueVector.h" #include "llvm/CodeGen/LexicalScopes.h" @@ -111,6 +111,7 @@ static bool isRegOtherThanSPAndFP(const MachineOperand &Op, namespace { using DefinedRegsSet = SmallSet<Register, 32>; +using VarLocSet = CoalescingBitVector<uint64_t>; /// A type-checked pair of {Register Location (or 0), Index}, used to index /// into a \ref VarLocMap. This can be efficiently converted to a 64-bit int @@ -120,6 +121,10 @@ using DefinedRegsSet = SmallSet<Register, 32>; /// Why encode a location /into/ the VarLocMap index? This makes it possible /// to find the open VarLocs killed by a register def very quickly. This is a /// performance-critical operation for LiveDebugValues. /// +/// TODO: Consider adding reserved intervals for kinds of VarLocs other than +/// RegisterKind, like SpillLocKind or EntryValueKind, to optimize iteration +/// over open locations.
struct LocIndex { uint32_t Location; // Physical registers live in the range [1;2^30) (see // \ref MCRegister), so we have plenty of range left here @@ -136,6 +141,12 @@ struct LocIndex { static LocIndex fromRawInteger(uint64_t ID) { return {static_cast<uint32_t>(ID >> 32), static_cast<uint32_t>(ID)}; } + + /// Get the start of the interval reserved for VarLocs of kind RegisterKind + /// which reside in \p Reg. The end is at rawIndexForReg(Reg+1)-1. + static uint64_t rawIndexForReg(uint32_t Reg) { + return LocIndex(Reg, 0).getAsRawInteger(); + } }; class LiveDebugValues : public MachineFunctionPass { @@ -145,6 +156,7 @@ private: const TargetFrameLowering *TFI; BitVector CalleeSavedRegs; LexicalScopes LS; + VarLocSet::Allocator Alloc; enum struct TransferKind { TransferCopy, TransferSpill, TransferRestore }; @@ -433,23 +445,44 @@ private: } }; + /// VarLocMap is used for two things: + /// 1) Assigning a unique LocIndex to a VarLoc. This LocIndex can be used to + /// virtually insert a VarLoc into a VarLocSet. + /// 2) Given a LocIndex, look up the unique associated VarLoc. class VarLocMap { - UniqueVector<VarLoc> VarLocs; + /// Map a VarLoc to an index within the vector reserved for its location + /// within Loc2Vars. + std::map<VarLoc, uint32_t> Var2Index; + + /// Map a location to a vector which holds VarLocs which live in that + /// location. + SmallDenseMap<uint32_t, std::vector<VarLoc>> Loc2Vars; public: + /// Retrieve a unique LocIndex for \p VL. LocIndex insert(const VarLoc &VL) { - uint32_t Index = VarLocs.insert(VL); - return {0, Index}; + uint32_t Location = VL.isDescribedByReg(); + uint32_t &Index = Var2Index[VL]; + if (!Index) { + auto &Vars = Loc2Vars[Location]; + Vars.push_back(VL); + Index = Vars.size(); + } + return {Location, Index - 1}; } - const VarLoc &operator[](LocIndex ID) const { return VarLocs[ID.Index]; } + /// Retrieve the unique VarLoc associated with \p ID.
+ const VarLoc &operator[](LocIndex ID) const { + auto LocIt = Loc2Vars.find(ID.Location); + assert(LocIt != Loc2Vars.end() && "Location not tracked"); + return LocIt->second[ID.Index]; + } }; - using VarLocSet = SparseBitVector<>; using VarLocInMBB = SmallDenseMap<const MachineBasicBlock *, VarLocSet>; struct TransferDebugPair { - MachineInstr *TransferInst; /// Instruction where this transfer occurs. - LocIndex LocationID; /// Location number for the transfer dest. + MachineInstr *TransferInst; ///< Instruction where this transfer occurs. + LocIndex LocationID; ///< Location number for the transfer dest. }; using TransferMap = SmallVector<TransferDebugPair, 4>; @@ -484,7 +517,8 @@ private: OverlapMap &OverlappingFragments; public: - OpenRangesSet(OverlapMap &_OLapMap) : OverlappingFragments(_OLapMap) {} + OpenRangesSet(VarLocSet::Allocator &Alloc, OverlapMap &_OLapMap) + : VarLocs(Alloc), OverlappingFragments(_OLapMap) {} const VarLocSet &getVarLocs() const { return VarLocs; } @@ -525,6 +559,28 @@ private: } }; + /// Collect all VarLoc IDs from \p CollectFrom for VarLocs which are located + /// in \p Reg, of kind RegisterKind. Insert collected IDs in \p Collected. + void collectIDsForReg(VarLocSet &Collected, uint32_t Reg, + const VarLocSet &CollectFrom) const; + + /// Get the registers which are used by VarLocs of kind RegisterKind tracked + /// by \p CollectFrom. + void getUsedRegs(const VarLocSet &CollectFrom, + SmallVectorImpl<uint32_t> &UsedRegs) const; + + VarLocSet &getVarLocsInMBB(const MachineBasicBlock *MBB, VarLocInMBB &Locs) { + auto Result = Locs.try_emplace(MBB, Alloc); + return Result.first->second; + } + + const VarLocSet &getVarLocsInMBB(const MachineBasicBlock *MBB, + const VarLocInMBB &Locs) const { + auto It = Locs.find(MBB); + assert(It != Locs.end() && "MBB not in map"); + return It->second; + } + /// Tests whether this instruction is a spill to a stack location.
bool isSpillInstruction(const MachineInstr &MI, MachineFunction *MF); @@ -566,7 +622,7 @@ private: VarLocMap &VarLocIDs, const VarLoc &EntryVL); void emitEntryValues(MachineInstr &MI, OpenRangesSet &OpenRanges, VarLocMap &VarLocIDs, TransferMap &Transfers, - SparseBitVector<> &KillSet); + VarLocSet &KillSet); void recordEntryValue(const MachineInstr &MI, const DefinedRegsSet &DefinedRegs, OpenRangesSet &OpenRanges, VarLocMap &VarLocIDs); @@ -711,6 +767,40 @@ LiveDebugValues::OpenRangesSet::getEntryValueBackup(DebugVariable Var) { return llvm::None; } +void LiveDebugValues::collectIDsForReg(VarLocSet &Collected, uint32_t Reg, + const VarLocSet &CollectFrom) const { + // The half-open interval [FirstIndexForReg, FirstInvalidIndex) contains all + // possible VarLoc IDs for VarLocs of kind RegisterKind which live in Reg. + uint64_t FirstIndexForReg = LocIndex::rawIndexForReg(Reg); + uint64_t FirstInvalidIndex = LocIndex::rawIndexForReg(Reg + 1); + // Iterate through that half-open interval and collect all the set IDs. + for (auto It = CollectFrom.find(FirstIndexForReg), End = CollectFrom.end(); + It != End && *It < FirstInvalidIndex; ++It) + Collected.set(*It); +} + +void LiveDebugValues::getUsedRegs(const VarLocSet &CollectFrom, + SmallVectorImpl<uint32_t> &UsedRegs) const { + // All register-based VarLocs are assigned indices greater than or equal to + // FirstRegIndex. + uint64_t FirstRegIndex = LocIndex::rawIndexForReg(1); + for (auto It = CollectFrom.find(FirstRegIndex), End = CollectFrom.end(); + It != End;) { + // We found a VarLoc ID for a VarLoc that lives in a register. Figure out + // which register and add it to UsedRegs. + uint32_t FoundReg = LocIndex::fromRawInteger(*It).Location; + assert((UsedRegs.empty() || FoundReg != UsedRegs.back()) && + "Duplicate used reg"); + UsedRegs.push_back(FoundReg); + + // Skip to the next /set/ register.
Note that this finds a lower bound, so + // even if there aren't any VarLocs living in `FoundReg+1`, we're still + // guaranteed to move on to the next register (or to end()). + uint64_t NextRegIndex = LocIndex::rawIndexForReg(FoundReg + 1); + It = CollectFrom.find(NextRegIndex); + } +} + //===----------------------------------------------------------------------===// // Debug Range Extension Implementation //===----------------------------------------------------------------------===// @@ -723,7 +813,9 @@ void LiveDebugValues::printVarLocInMBB(const MachineFunction &MF, raw_ostream &Out) const { Out << '\n' << msg << '\n'; for (const MachineBasicBlock &BB : MF) { - const VarLocSet &L = V.lookup(&BB); + if (!V.count(&BB)) + continue; + const VarLocSet &L = getVarLocsInMBB(&BB, V); if (L.empty()) continue; Out << "MBB: " << BB.getNumber() << ":\n"; @@ -863,7 +955,7 @@ void LiveDebugValues::emitEntryValues(MachineInstr &MI, OpenRangesSet &OpenRanges, VarLocMap &VarLocIDs, TransferMap &Transfers, - SparseBitVector<> &KillSet) { + VarLocSet &KillSet) { for (uint64_t ID : KillSet) { LocIndex Idx = LocIndex::fromRawInteger(ID); const VarLoc &VL = VarLocIDs[Idx]; @@ -971,55 +1063,55 @@ void LiveDebugValues::transferRegisterDef( MachineFunction *MF = MI.getMF(); const TargetLowering *TLI = MF->getSubtarget().getTargetLowering(); unsigned SP = TLI->getStackPointerRegisterToSaveRestore(); - SparseBitVector<> KillSet; + + // Find the regs killed by MI, and find regmasks of preserved regs. // Max out the number of statically allocated elements in `DeadRegs`, as this - // prevents fallback to std::set::count() operations. As often as possible, we - // want a linear scan here. - SmallSet<uint32_t, 32> DeadRegs; - SmallVector<const uint32_t *, 4> DeadRegMasks; + // prevents fallback to std::set::count() operations. + SmallSet<uint32_t, 32> DeadRegs; + SmallVector<const uint32_t *, 4> RegMasks; for (const MachineOperand &MO : MI.operands()) { - // Determine whether the operand is a register def.
Assume that call - // instructions never clobber SP, because some backends (e.g., AArch64) - // never list SP in the regmask. + // Determine whether the operand is a register def. if (MO.isReg() && MO.isDef() && MO.getReg() && Register::isPhysicalRegister(MO.getReg()) && !(MI.isCall() && MO.getReg() == SP)) { - // Remove ranges of all aliased registers. Note: the actual removal is - // done after we finish visiting MachineOperands, for performance reasons. + // Remove ranges of all aliased registers. for (MCRegAliasIterator RAI(MO.getReg(), TRI, true); RAI.isValid(); ++RAI) // FIXME: Can we break out of this loop early if no insertion occurs? DeadRegs.insert(*RAI); } else if (MO.isRegMask()) { - // Remove ranges of all clobbered registers. Register masks don't usually - // list SP as preserved. While the debug info may be off for an - // instruction or two around callee-cleanup calls, transferring the - // DEBUG_VALUE across the call is still a better user experience. Note: - // the actual removal is done after we finish visiting MachineOperands, - // for performance reasons. - DeadRegMasks.push_back(MO.getRegMask()); + RegMasks.push_back(MO.getRegMask()); } } - // For performance reasons, it's critical to iterate over the open var locs - // at most once. - for (uint64_t ID : OpenRanges.getVarLocs()) { - LocIndex Idx = LocIndex::fromRawInteger(ID); - unsigned Reg = VarLocIDs[Idx].isDescribedByReg(); - if (!Reg) - continue; - if (DeadRegs.count(Reg)) { - KillSet.set(ID); - continue; - } + // Erase VarLocs which reside in one of the dead registers. For performance + // reasons, it's critical to not iterate over the full set of open VarLocs. + // Iterate over the set of dying/used regs instead. 
+ VarLocSet KillSet(Alloc); + for (uint32_t DeadReg : DeadRegs) + collectIDsForReg(KillSet, DeadReg, OpenRanges.getVarLocs()); + if (!RegMasks.empty()) { + SmallVector<uint32_t, 32> UsedRegs; + getUsedRegs(OpenRanges.getVarLocs(), UsedRegs); + for (uint32_t Reg : UsedRegs) { + // The VarLocs residing in this register are already in the kill set. + if (DeadRegs.count(Reg)) + continue; - if (Reg == SP) - continue; - bool AnyRegMaskKillsReg = - any_of(DeadRegMasks, [Reg](const uint32_t *RegMask) { - return MachineOperand::clobbersPhysReg(RegMask, Reg); - }); - if (AnyRegMaskKillsReg) - KillSet.set(ID); + // Remove ranges of all clobbered registers. Register masks don't usually + // list SP as preserved. Assume that call instructions never clobber SP, + // because some backends (e.g., AArch64) never list SP in the regmask. + // While the debug info may be off for an instruction or two around + // callee-cleanup calls, transferring the DEBUG_VALUE across the call is + // still a better user experience. + if (Reg == SP) + continue; + bool AnyRegMaskKillsReg = + any_of(RegMasks, [Reg](const uint32_t *RegMask) { + return MachineOperand::clobbersPhysReg(RegMask, Reg); + }); + if (AnyRegMaskKillsReg) + collectIDsForReg(KillSet, Reg, OpenRanges.getVarLocs()); + } } OpenRanges.erase(KillSet, VarLocIDs); @@ -1119,7 +1211,7 @@ void LiveDebugValues::transferSpillOrRestoreInst(MachineInstr &MI, // First, if there are any DBG_VALUEs pointing at a spill slot that is // written to, then close the variable location. The value in memory // will have changed.
- VarLocSet KillSet; + VarLocSet KillSet(Alloc); if (isSpillInstruction(MI, MF)) { Loc = extractSpillBaseRegAndOffset(MI); for (uint64_t ID : OpenRanges.getVarLocs()) { @@ -1264,7 +1356,7 @@ bool LiveDebugValues::transferTerminator(MachineBasicBlock *CurMBB, dbgs() << "Add to OutLocs in MBB #" << CurMBB->getNumber() << ": "; VarLocIDs[LocIndex::fromRawInteger(ID)].dump(TRI); }); - VarLocSet &VLS = OutLocs[CurMBB]; + VarLocSet &VLS = getVarLocsInMBB(CurMBB, OutLocs); Changed = VLS != OpenRanges.getVarLocs(); // New OutLocs set may be different due to spill, restore or register // copy instruction processing. @@ -1356,7 +1448,7 @@ bool LiveDebugValues::join( LLVM_DEBUG(dbgs() << "join MBB: " << MBB.getNumber() << "\n"); bool Changed = false; - VarLocSet InLocsT; // Temporary incoming locations. + VarLocSet InLocsT(Alloc); // Temporary incoming locations. // For all predecessors of this MBB, find the set of VarLocs that // can be joined. @@ -1399,7 +1491,7 @@ bool LiveDebugValues::join( } // Filter out DBG_VALUES that are out of scope. - VarLocSet KillSet; + VarLocSet KillSet(Alloc); bool IsArtificial = ArtificialBlocks.count(&MBB); if (!IsArtificial) { for (uint64_t ID : InLocsT) { @@ -1421,32 +1513,28 @@ bool LiveDebugValues::join( assert((NumVisited || MBB.pred_empty()) && "Should have processed at least one predecessor"); - VarLocSet &ILS = InLocs[&MBB]; - VarLocSet &Pending = PendingInLocs[&MBB]; + VarLocSet &ILS = getVarLocsInMBB(&MBB, InLocs); + VarLocSet &Pending = getVarLocsInMBB(&MBB, PendingInLocs); // New locations will have DBG_VALUE insts inserted at the start of the // block, after location propagation has finished. Record the insertions // that we need to perform in the Pending set. 
VarLocSet Diff = InLocsT; Diff.intersectWithComplement(ILS); - for (auto ID : Diff) { - Pending.set(ID); - ILS.set(ID); - ++NumInserted; - Changed = true; - } + Pending.set(Diff); + ILS.set(Diff); + NumInserted += Diff.count(); + Changed |= !Diff.empty(); // We may have lost locations by learning about a predecessor that either // loses or moves a variable. Find any locations in ILS that are not in the // new in-locations, and delete those. VarLocSet Removed = ILS; Removed.intersectWithComplement(InLocsT); - for (auto ID : Removed) { - Pending.reset(ID); - ILS.reset(ID); - ++NumRemoved; - Changed = true; - } + Pending.intersectWithComplement(Removed); + ILS.intersectWithComplement(Removed); + NumRemoved += Removed.count(); + Changed |= !Removed.empty(); return Changed; } @@ -1567,7 +1655,7 @@ bool LiveDebugValues::ExtendRanges(MachineFunction &MF) { VarLocMap VarLocIDs; // Map VarLoc<>unique ID for use in bitvectors. OverlapMap OverlapFragments; // Map of overlapping variable fragments. - OpenRangesSet OpenRanges(OverlapFragments); + OpenRangesSet OpenRanges(Alloc, OverlapFragments); // Ranges that are open until end of bb. VarLocInMBB OutLocs; // Ranges that exist beyond bb. VarLocInMBB InLocs; // Ranges that are incoming after joining. @@ -1607,7 +1695,7 @@ bool LiveDebugValues::ExtendRanges(MachineFunction &MF) { // Initialize per-block structures and scan for fragment overlaps. for (auto &MBB : MF) { - PendingInLocs[&MBB] = VarLocSet(); + PendingInLocs.try_emplace(&MBB, Alloc); for (auto &MI : MBB) { if (MI.isDebugValue()) @@ -1659,7 +1747,8 @@ bool LiveDebugValues::ExtendRanges(MachineFunction &MF) { // examine spill, copy and restore instructions to see whether they // operate with registers that correspond to user variables. // First load any pending inlocs. 
- OpenRanges.insertFromLocSet(PendingInLocs[MBB], VarLocIDs); + OpenRanges.insertFromLocSet(getVarLocsInMBB(MBB, PendingInLocs), + VarLocIDs); for (auto &MI : *MBB) process(MI, OpenRanges, VarLocIDs, Transfers); OLChanged |= transferTerminator(MBB, OpenRanges, OutLocs, VarLocIDs); diff --git a/llvm/test/DebugInfo/MIR/X86/entry-values-diamond-bbs.mir b/llvm/test/DebugInfo/MIR/X86/entry-values-diamond-bbs.mir index 2396daa..5e97187 100644 --- a/llvm/test/DebugInfo/MIR/X86/entry-values-diamond-bbs.mir +++ b/llvm/test/DebugInfo/MIR/X86/entry-values-diamond-bbs.mir @@ -4,6 +4,8 @@ # block structure relevant to the debug entry values propagation. # # CHECK: ![[ARG_B:.*]] = !DILocalVariable(name: "b" +# CHECK: ![[ARG_C:.*]] = !DILocalVariable(name: "c" +# CHECK: ![[ARG_Q:.*]] = !DILocalVariable(name: "q" # CHECK: bb.0.entry # CHECK: DBG_VALUE $esi, $noreg, ![[ARG_B]], !DIExpression() # CHECK: bb.1.if.then @@ -21,8 +23,9 @@ # CHECK-NEXT: $ebx = MOV32ri 2 # CHECK-NEXT: DBG_VALUE $esi, $noreg, ![[ARG_B]], !DIExpression(DW_OP_LLVM_entry_value, 1) # CHECK: bb.3.if.end +# CHECK-NEXT: DBG_VALUE $edx, $noreg, ![[ARG_Q]], !DIExpression() +# CHECK-NEXT: DBG_VALUE $ebp, $noreg, ![[ARG_C]], !DIExpression() # CHECK-NEXT: DBG_VALUE $esi, $noreg, ![[ARG_B]], !DIExpression(DW_OP_LLVM_entry_value, 1) -# --- | ; ModuleID = 'test.c' source_filename = "test.c" diff --git a/llvm/test/DebugInfo/MIR/X86/multiple-param-dbg-value-entry.mir b/llvm/test/DebugInfo/MIR/X86/multiple-param-dbg-value-entry.mir index fd154ed..baa5a00 100644 --- a/llvm/test/DebugInfo/MIR/X86/multiple-param-dbg-value-entry.mir +++ b/llvm/test/DebugInfo/MIR/X86/multiple-param-dbg-value-entry.mir @@ -10,8 +10,8 @@ # Verify that DW_OP_LLVM_entry_values are generated for parameters with multiple # DBG_VALUEs at entry block. 
# CHECK: DBG_VALUE $edi, $noreg, !{{.*}}, !DIExpression(DW_OP_LLVM_entry_value, 1), debug-location {{.*}} -# CHECK: DBG_VALUE $edx, $noreg, !{{.*}}, !DIExpression(DW_OP_LLVM_entry_value, 1), debug-location {{.*}} # CHECK: DBG_VALUE $esi, $noreg, !{{.*}}, !DIExpression(DW_OP_LLVM_entry_value, 1), debug-location {{.*}} +# CHECK: DBG_VALUE $edx, $noreg, !{{.*}}, !DIExpression(DW_OP_LLVM_entry_value, 1), debug-location {{.*}} --- | ; ModuleID = 'multiple-param-dbg-value-entry.ll' -- 2.7.4