class AllocCombinerPass : public BinaryFunctionPass {
/// Stats aggregating variables
uint64_t NumCombined{0};
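+ /// Dynamic execution count accumulated over all combined adjustments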
+ uint64_t DynamicCountCombined{0};
DenseSet<const BinaryFunction *> FuncsChanged;
void combineAdjustments(BinaryFunction &BF);
void run() {
derived().preflight();
+ if (Func.begin() == Func.end())
+ return;
// Initialize state for all points of the function
for (BinaryBasicBlock &BB : Func) {
StateTy &St = getOrCreateStateAt(BB);
St = derived().getStartingStateAtBB(BB);
for (MCInst &Inst : BB) {
StateTy &St = getOrCreateStateAt(Inst);
St = derived().getStartingStateAtPoint(Inst);
}
}
- assert(Func.begin() != Func.end() && "Unexpected empty function");
std::queue<BinaryBasicBlock *> Worklist;
// TODO: Pushing this in a DFS ordering will greatly speed up the dataflow
/// Analysis stats counters
uint64_t NumFunctionsNotOptimized{0};
uint64_t NumFunctionsFailedRestoreFI{0};
- uint64_t CountFunctionsNotOptimized{0};
uint64_t CountFunctionsFailedRestoreFI{0};
uint64_t CountDenominator{0};
/// Stats aggregating variables
uint64_t NumRedundantLoads{0};
uint64_t NumRedundantStores{0};
- uint64_t NumLoadsChangedToReg{0};
- uint64_t NumLoadsChangedToImm{0};
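+ /// Execution frequencies (dynamic counts) of the optimized instructions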
+ uint64_t FreqRedundantLoads{0};
+ uint64_t FreqRedundantStores{0};
+ uint64_t FreqLoadsChangedToReg{0};
+ uint64_t FreqLoadsChangedToImm{0};
uint64_t NumLoadsDeleted{0};
+ uint64_t FreqLoadsDeleted{0};
DenseSet<const BinaryFunction *> FuncsChanged;
/// Pass stats
static std::atomic_uint64_t SpillsMovedRegularMode;
static std::atomic_uint64_t SpillsMovedPushPopMode;
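+ /// Dynamic counts of moved and failed spills, plus total instruction and
+ /// store executions used as denominators for the reported percentages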
+ static std::atomic_uint64_t SpillsMovedDynamicCount;
+ static std::atomic_uint64_t SpillsFailedDynamicCount;
+ static std::atomic_uint64_t InstrDynamicCount;
+ static std::atomic_uint64_t StoreDynamicCount;
Optional<unsigned> AnnotationIndex;
BC.MIB->removeAnnotation(Inst, getAnnotationIndex());
}
- bool perform();
+ bool perform(bool HotOnly = false);
static void printStats();
};
uint64_t BBExecCount = BB->getExecutionCount();
if (BBExecCount == BinaryBasicBlock::COUNT_NO_PROFILE)
continue;
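+ // Weigh each block by its number of real (non-pseudo) instructions so
+ // the score approximates the function's dynamic instruction count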
- TotalScore += BBExecCount;
+ TotalScore += BBExecCount * BB->getNumNonPseudos();
}
FunctionScore = TotalScore;
return FunctionScore;
BB.eraseInstruction(BB.findInstruction(Prev));
++NumCombined;
+ DynamicCountCombined += BB.getKnownExecutionCount();
FuncsChanged.insert(&BF);
Prev = &Inst;
}
});
outs() << "BOLT-INFO: Allocation combiner: " << NumCombined
- << " empty spaces coalesced.\n";
+ << " empty spaces coalesced (dyn count: " << DynamicCountCombined
+ << ").\n";
}
} // end namespace bolt
}
for (auto &I : BC.getBinaryFunctions()) {
- uint64_t Count = I.second.getExecutionCount();
- if (Count != BinaryFunction::COUNT_NO_PROFILE)
- CountDenominator += Count;
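+ // getFunctionScore() already skips blocks without profile data, so the
+ // explicit COUNT_NO_PROFILE check is no longer needed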
+ CountDenominator += I.second.getFunctionScore();
// "shouldOptimize" for passes that run after finalize
if (!(I.second.isSimple() && I.second.hasCFG() && !I.second.isIgnored()) ||
!opts::shouldFrameOptimize(I.second)) {
++NumFunctionsNotOptimized;
- if (Count != BinaryFunction::COUNT_NO_PROFILE)
- CountFunctionsNotOptimized += Count;
continue;
}
"FA breakdown", opts::TimeFA);
if (!restoreFrameIndex(I.second)) {
++NumFunctionsFailedRestoreFI;
- uint64_t Count = I.second.getExecutionCount();
- if (Count != BinaryFunction::COUNT_NO_PROFILE)
- CountFunctionsFailedRestoreFI += Count;
+ CountFunctionsFailedRestoreFI += I.second.getFunctionScore();
continue;
}
}
void FrameAnalysis::printStats() {
outs() << "BOLT-INFO: FRAME ANALYSIS: " << NumFunctionsNotOptimized
- << " function(s) "
- << format("(%.1lf%% dyn cov)",
- (100.0 * CountFunctionsNotOptimized / CountDenominator))
- << " were not optimized.\n"
+ << " function(s) were not optimized.\n"
<< "BOLT-INFO: FRAME ANALYSIS: " << NumFunctionsFailedRestoreFI
<< " function(s) "
<< format("(%.1lf%% dyn cov)",
continue;
++NumRedundantLoads;
+ FreqRedundantLoads += BB.getKnownExecutionCount();
Changed = true;
LLVM_DEBUG(dbgs() << "Redundant load instruction: ");
LLVM_DEBUG(Inst.dump());
LLVM_DEBUG(dbgs() << "FAILED to change operand to a reg\n");
break;
}
- ++NumLoadsChangedToReg;
+ FreqLoadsChangedToReg += BB.getKnownExecutionCount();
MIB->removeAnnotation(Inst, "FrameAccessEntry");
LLVM_DEBUG(dbgs() << "Changed operand to a reg\n");
if (MIB->isRedundantMove(Inst)) {
++NumLoadsDeleted;
+ FreqLoadsDeleted += BB.getKnownExecutionCount();
LLVM_DEBUG(dbgs() << "Created a redundant move\n");
// Delete it!
ToErase.push_front(std::make_pair(&BB, &Inst));
if (!MIB->replaceMemOperandWithImm(Inst, StringRef(Buf, 8), 0)) {
LLVM_DEBUG(dbgs() << "FAILED\n");
} else {
- ++NumLoadsChangedToImm;
+ FreqLoadsChangedToImm += BB.getKnownExecutionCount();
MIB->removeAnnotation(Inst, "FrameAccessEntry");
LLVM_DEBUG(dbgs() << "Ok\n");
}
continue;
++NumRedundantStores;
+ FreqRedundantStores += BB.getKnownExecutionCount();
Changed = true;
LLVM_DEBUG(dbgs() << "Unused store instruction: ");
LLVM_DEBUG(Inst.dump());
outs() << "BOLT-INFO: FOP optimized " << NumRedundantLoads
<< " redundant load(s) and " << NumRedundantStores
<< " unused store(s)\n";
- outs() << "BOLT-INFO: FOP changed " << NumLoadsChangedToReg
- << " load(s) to use a register instead of a stack access, and "
- << NumLoadsChangedToImm << " to use an immediate.\n"
- << "BOLT-INFO: FOP deleted " << NumLoadsDeleted << " load(s) and "
- << NumRedundantStores << " store(s).\n";
+ outs() << "BOLT-INFO: Frequency of redundant loads is " << FreqRedundantLoads
+ << " and frequency of unused stores is " << FreqRedundantStores
+ << "\n";
+ outs() << "BOLT-INFO: Frequency of loads changed to use a register is "
+ << FreqLoadsChangedToReg
+ << " and frequency of loads changed to use an immediate is "
+ << FreqLoadsChangedToImm << "\n";
+ outs() << "BOLT-INFO: FOP deleted " << NumLoadsDeleted
+ << " load(s) (dyn count: " << FreqLoadsDeleted << ") and "
+ << NumRedundantStores << " store(s)\n";
FA->printStats();
ShrinkWrapping::printStats();
}
BC.MIB->getOrCreateAnnotationIndex("AccessesDeletedPos");
BC.MIB->getOrCreateAnnotationIndex("DeleteMe");
- ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) {
- if (!FA.hasFrameInfo(BF))
- return true;
-
- if (opts::FrameOptimization == FOP_HOT &&
- (BF.getKnownExecutionCount() < BC.getHotThreshold()))
- return true;
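+ // Keep the ten hottest functions changed by the pass, sorted by
+ // descending execution count (populated under LLVM_DEBUG)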
+ std::vector<std::pair<uint64_t, const BinaryFunction *>> Top10Funcs;
+ auto LogFunc = [&](BinaryFunction &BF) {
+ auto Lower = std::lower_bound(
+ Top10Funcs.begin(), Top10Funcs.end(), BF.getKnownExecutionCount(),
+ [](const std::pair<uint64_t, const BinaryFunction *> &Elmt,
+ uint64_t Value) { return Elmt.first > Value; });
+ if (Lower == Top10Funcs.end() && Top10Funcs.size() >= 10)
+ return;
+ Top10Funcs.insert(Lower,
+ std::make_pair<>(BF.getKnownExecutionCount(), &BF));
+ if (Top10Funcs.size() > 10)
+ Top10Funcs.resize(10);
+ };
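+ // LogFunc is referenced only inside LLVM_DEBUG; the (void) cast silences
+ // unused-variable warnings in release builds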
+ (void)LogFunc;
- if (BF.getKnownExecutionCount() == 0)
+ ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) {
+ if (BF.getFunctionScore() == 0)
return true;
return false;
};
+ const bool HotOnly = opts::FrameOptimization == FOP_HOT;
+
ParallelUtilities::WorkFuncWithAllocTy WorkFunction =
[&](BinaryFunction &BF, MCPlusBuilder::AllocatorIdTy AllocatorId) {
DataflowInfoManager Info(BF, &RA, &FA, AllocatorId);
ShrinkWrapping SW(FA, BF, Info, AllocatorId);
- if (SW.perform()) {
+ if (SW.perform(HotOnly)) {
std::lock_guard<std::mutex> Lock(FuncsChangedMutex);
FuncsChanged.insert(&BF);
+ LLVM_DEBUG(LogFunc(BF));
}
};
ParallelUtilities::runOnEachFunctionWithUniqueAllocId(
BC, ParallelUtilities::SchedulingPolicy::SP_INST_QUADRATIC, WorkFunction,
SkipPredicate, "shrink-wrapping");
+
+ if (!Top10Funcs.empty()) {
+ outs() << "BOLT-INFO: top 10 functions changed by shrink wrapping:\n";
+ for (const auto &Elmt : Top10Funcs)
+ outs() << Elmt.first << " : " << Elmt.second->getPrintName() << "\n";
+ }
}
} // namespace bolt
std::atomic_uint64_t ShrinkWrapping::SpillsMovedRegularMode{0};
std::atomic_uint64_t ShrinkWrapping::SpillsMovedPushPopMode{0};
+std::atomic_uint64_t ShrinkWrapping::SpillsMovedDynamicCount{0};
+std::atomic_uint64_t ShrinkWrapping::SpillsFailedDynamicCount{0};
+std::atomic_uint64_t ShrinkWrapping::InstrDynamicCount{0};
+std::atomic_uint64_t ShrinkWrapping::StoreDynamicCount{0};
using BBIterTy = BinaryBasicBlock::iterator;
// Keeps info about successfully moved regs: reg index, save position and
// save size
std::vector<std::tuple<unsigned, MCInst *, size_t>> MovedRegs;
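+ // Estimated dynamic win (store executions saved) summed over all CSRs
+ // moved in this function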
+ uint64_t TotalEstimatedWin = 0;
for (unsigned I = 0, E = BC.MRI->getNumRegs(); I != E; ++I) {
MCInst *BestPosSave = nullptr;
- uint64_t TotalEstimatedWin = 0;
- if (!isBestSavePosCold(I, BestPosSave, TotalEstimatedWin))
+ uint64_t EstimatedWin = 0;
+ if (!isBestSavePosCold(I, BestPosSave, EstimatedWin))
continue;
SmallVector<ProgramPoint, 4> RestorePoints =
- doRestorePlacement(BestPosSave, I, TotalEstimatedWin);
- if (RestorePoints.empty())
+ doRestorePlacement(BestPosSave, I, EstimatedWin);
+ if (RestorePoints.empty()) {
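+ // No valid restore placement: count the estimated win as a failed
+ // spill move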
+ SpillsFailedDynamicCount += EstimatedWin;
continue;
+ }
const FrameIndexEntry *FIESave = CSA.SaveFIEByReg[I];
const FrameIndexEntry *FIELoad = CSA.LoadFIEByReg[I];
// If we don't know stack state at this point, bail
if ((SPFP.first == SPT.SUPERPOSITION || SPFP.first == SPT.EMPTY) &&
- (SPFP.second == SPT.SUPERPOSITION || SPFP.second == SPT.EMPTY))
+ (SPFP.second == SPT.SUPERPOSITION || SPFP.second == SPT.EMPTY)) {
+ SpillsFailedDynamicCount += EstimatedWin;
continue;
+ }
// Operation mode: if true, will insert push/pops instead of loads/restores
bool UsePushPops = validatePushPopsMode(I, BestPosSave, SaveOffset);
scheduleOldSaveRestoresRemoval(I, UsePushPops);
scheduleSaveRestoreInsertions(I, BestPosSave, RestorePoints, UsePushPops);
MovedRegs.emplace_back(std::make_tuple(I, BestPosSave, SaveSize));
+ TotalEstimatedWin += EstimatedWin;
}
// Revert push-pop mode if it failed for a single CSR
}
}
}
+ SpillsMovedDynamicCount += TotalEstimatedWin;
// Update statistics
if (!UsedPushPopMode) {
}
}
-bool ShrinkWrapping::perform() {
+bool ShrinkWrapping::perform(bool HotOnly) {
HasDeletedOffsetCFIs = BitVector(BC.MRI->getNumRegs(), false);
PushOffsetByReg = std::vector<int64_t>(BC.MRI->getNumRegs(), 0LL);
PopOffsetByReg = std::vector<int64_t>(BC.MRI->getNumRegs(), 0LL);
DomOrder = std::vector<MCPhysReg>(BC.MRI->getNumRegs(), 0);
+ // Update pass statistics
+ uint64_t TotalInstrs = 0ULL;
+ uint64_t TotalStoreInstrs = 0ULL;
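+ // Profile-weighted totals of all real instructions, and of store
+ // instructions in particular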
+ for (BinaryBasicBlock *BB : BF.layout()) {
+ uint64_t BBExecCount = BB->getExecutionCount();
+ if (!BBExecCount || BBExecCount == BinaryBasicBlock::COUNT_NO_PROFILE)
+ continue;
+ for (const auto &Instr : *BB) {
+ if (BC.MIB->isPseudo(Instr))
+ continue;
+ if (BC.MIB->isStore(Instr))
+ TotalStoreInstrs += BBExecCount;
+ TotalInstrs += BBExecCount;
+ }
+ }
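+ // Totals are accumulated before the bail-outs below so the percentages
+ // printed in printStats() are relative to every function the pass examined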
+ InstrDynamicCount += TotalInstrs;
+ StoreDynamicCount += TotalStoreInstrs;
+
+ if (!FA.hasFrameInfo(BF))
+ return false;
+
+ if (HotOnly && (BF.getKnownExecutionCount() < BC.getHotThreshold()))
+ return false;
+
if (BF.checkForAmbiguousJumpTables()) {
LLVM_DEBUG(dbgs() << "BOLT-DEBUG: ambiguous JTs in " << BF.getPrintName()
<< ".\n");
outs() << "BOLT-INFO: Shrink wrapping moved " << SpillsMovedRegularMode
<< " spills inserting load/stores and " << SpillsMovedPushPopMode
<< " spills inserting push/pops\n";
+ if (!InstrDynamicCount || !StoreDynamicCount)
+ return;
+ outs() << "BOLT-INFO: Shrink wrapping reduced " << SpillsMovedDynamicCount
+ << " store executions ("
+ << format("%.1lf%%",
+ (100.0 * SpillsMovedDynamicCount / InstrDynamicCount))
+ << " total instructions executed, "
+ << format("%.1lf%%",
+ (100.0 * SpillsMovedDynamicCount / StoreDynamicCount))
+ << " store instructions)\n";
+ outs() << "BOLT-INFO: Shrink wrapping failed at reducing "
+ << SpillsFailedDynamicCount << " store executions ("
+ << format("%.1lf%%",
+ (100.0 * SpillsFailedDynamicCount / InstrDynamicCount))
+ << " total instructions executed, "
+ << format("%.1lf%%",
+ (100.0 * SpillsFailedDynamicCount / StoreDynamicCount))
+ << " store instructions)\n";
}
// Operators necessary as a result of using MCAnnotation
# Check shrink wrapping results:
# CHECK: BOLT-INFO: Shrink wrapping moved 0 spills inserting load/stores and 2 spills inserting push/pops
+# CHECK: BOLT-INFO: Shrink wrapping reduced 6 store executions (28.6% total instructions executed, 100.0% store instructions)
+# CHECK: BOLT-INFO: Shrink wrapping failed at reducing 0 store executions (0.0% total instructions executed, 0.0% store instructions)
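+# The moved stores account for 6 executions: 6 of ~21 dynamic instructions
+# (28.6%) and all 6 dynamic stores (100.0%)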
# Check that order is correct
# CHECK: Binary Function "_start" after frame-optimizer