From ae7f468073e448d9a5c39364a5cc4241525c4896 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Sat, 4 Dec 2021 13:14:15 +0100 Subject: [PATCH] [NewPM] Fix MergeFunctions scheduling MergeFunctions (as well as HotColdSplitting and IROutliner) are incorrectly scheduled under the new pass manager. The code makes it look like they run towards the end of the module optimization pipeline (as they should), while in reality they run at the start. This is because the OptimizePM populated around them is only scheduled later. I'm fixing this by moving these three passes until after OptimizePM to avoid splitting the function pass pipeline. It doesn't seem important to me that some of the function passes run after these late module passes. Differential Revision: https://reviews.llvm.org/D115098 --- llvm/lib/Passes/PassBuilderPipelines.cpp | 34 +++++++++++----------- .../PhaseOrdering/X86/merge-functions.ll | 11 ++----- 2 files changed, 19 insertions(+), 26 deletions(-) diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index 0cc329c..490c698 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -1182,23 +1182,6 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level, addVectorPasses(Level, OptimizePM, /* IsFullLTO */ false); - // Split out cold code. Splitting is done late to avoid hiding context from - // other optimizations and inadvertently regressing performance. The tradeoff - // is that this has a higher code size cost than splitting early. - if (EnableHotColdSplit && !LTOPreLink) - MPM.addPass(HotColdSplittingPass()); - - // Search the code for similar regions of code. If enough similar regions can - // be found where extracting the regions into their own function will decrease - // the size of the program, we extract the regions, a deduplicate the - // structurally similar regions. 
- if (EnableIROutliner) - MPM.addPass(IROutlinerPass()); - - // Merge functions if requested. - if (PTO.MergeFunctions) - MPM.addPass(MergeFunctionsPass()); - // LoopSink pass sinks instructions hoisted by LICM, which serves as a // canonicalization pass that enables other optimizations. As a result, // LoopSink pass needs to be a very late IR pass to avoid undoing LICM @@ -1226,6 +1209,23 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level, for (auto &C : OptimizerLastEPCallbacks) C(MPM, Level); + // Split out cold code. Splitting is done late to avoid hiding context from + // other optimizations and inadvertently regressing performance. The tradeoff + // is that this has a higher code size cost than splitting early. + if (EnableHotColdSplit && !LTOPreLink) + MPM.addPass(HotColdSplittingPass()); + + // Search the code for similar regions of code. If enough similar regions can + // be found where extracting the regions into their own function will decrease + // the size of the program, we extract the regions, a deduplicate the + // structurally similar regions. + if (EnableIROutliner) + MPM.addPass(IROutlinerPass()); + + // Merge functions if requested. 
+ if (PTO.MergeFunctions) + MPM.addPass(MergeFunctionsPass()); + if (PTO.CallGraphProfile) MPM.addPass(CGProfilePass()); diff --git a/llvm/test/Transforms/PhaseOrdering/X86/merge-functions.ll b/llvm/test/Transforms/PhaseOrdering/X86/merge-functions.ll index aa83692..39cd34a 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/merge-functions.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/merge-functions.ll @@ -90,15 +90,8 @@ bb3: ; preds = %bb1, %bb2 define i1 @test2(i32 %c) { ; CHECK-LABEL: @test2( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[SWITCH_TABLEIDX:%.*]] = add i32 [[C:%.*]], -100 -; CHECK-NEXT: [[TMP0:%.*]] = icmp ult i32 [[SWITCH_TABLEIDX]], 20 -; CHECK-NEXT: [[SWITCH_CAST:%.*]] = trunc i32 [[SWITCH_TABLEIDX]] to i20 -; CHECK-NEXT: [[SWITCH_DOWNSHIFT:%.*]] = lshr i20 -490991, [[SWITCH_CAST]] -; CHECK-NEXT: [[TMP1:%.*]] = and i20 [[SWITCH_DOWNSHIFT]], 1 -; CHECK-NEXT: [[SWITCH_MASKED:%.*]] = icmp ne i20 [[TMP1]], 0 -; CHECK-NEXT: [[I_0:%.*]] = select i1 [[TMP0]], i1 [[SWITCH_MASKED]], i1 false -; CHECK-NEXT: ret i1 [[I_0]] +; CHECK-NEXT: [[TMP2:%.*]] = tail call i1 @test1(i32 [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] +; CHECK-NEXT: ret i1 [[TMP2]] ; entry: %i = alloca i8, align 1 -- 2.7.4