From 2f32e5d84d3483a0d6170fc61d2cceb49fc930a3 Mon Sep 17 00:00:00 2001
From: Teresa Johnson
Date: Fri, 20 Sep 2019 23:29:17 +0000
Subject: [PATCH] [Inliner] Remove incorrect early exit during switch cost
 computation

Summary:
CallAnalyzer::visitSwitchInst has an early exit when the estimated lower
bound of the switch cost will put the overall cost of the inline above the
threshold. However, this code does not correctly estimate the lower bound
for switches that can be transformed into bit tests, leading to
unnecessarily lost inlines, and also to differing behavior when
optimization remarks are enabled.

First, the early exit is controlled by whether ComputeFullInlineCost is
enabled or not, and that in turn is disabled by default but enabled when
enabling -pass-remarks=missed. This by itself wouldn't lead to a problem,
except that, as described below, the estimated lower bound can be above the
real lower bound, so we can sometimes get different inline decisions with
inline remarks enabled, which is problematic.

The early exit was added along with a new switch cost model in D31085. It
was added due to a concern one reviewer raised about compile time for large
switches:
https://reviews.llvm.org/D31085?id=94559#inline-276200

However, the code just below it calls getEstimatedNumberOfCaseClusters,
which in turn immediately calls BasicTTIImpl::getEstimatedNumberOfCaseClusters,
which in the worst case does a linear scan of the cases to get the high and
low values. The bit test handling in particular is guarded by whether the
number of cases fits into the max bit width. There is no suggestion that
anyone measured a compile-time issue; it appears to be theoretical.

The problem is that the reviewer's comment about the lower bound calculation
is incorrect, specifically in the case of a switch that can be lowered to a
bit test. This wasn't followed up on in the comment thread, but the author
did add a FIXME to that effect above the early exit when they subsequently
revised the patch.

As a result, we were incorrectly exiting early and not inlining functions
with switch statements that would be lowered to bit tests in cases where we
were nearing the threshold. Combined with the fact that this early exit was
skipped with opt remarks enabled, this caused different inlining decisions
to be made when -pass-remarks=missed is enabled to debug the missing inline.

Remove the early exit for the above reasons.

I also copied over an existing AArch64 inlining test to X86, and adjusted
the threshold so that the bit test inline only occurs with the fix in this
patch.

Reviewers: davidxl

Subscribers: eraman, kristof.beyls, haicheng, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D67716

llvm-svn: 372440
---
 llvm/lib/Analysis/InlineCost.cpp          |  13 ---
 llvm/test/Transforms/Inline/X86/switch.ll | 160 ++++++++++++++++++++++++++++++
 2 files changed, 160 insertions(+), 13 deletions(-)
 create mode 100644 llvm/test/Transforms/Inline/X86/switch.ll

diff --git a/llvm/lib/Analysis/InlineCost.cpp b/llvm/lib/Analysis/InlineCost.cpp
index c6e5c1c..57dee45 100644
--- a/llvm/lib/Analysis/InlineCost.cpp
+++ b/llvm/lib/Analysis/InlineCost.cpp
@@ -1453,19 +1453,6 @@ bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
   // Maximum valid cost increased in this function.
   int CostUpperBound = INT_MAX - InlineConstants::InstrCost - 1;
 
-  // Exit early for a large switch, assuming one case needs at least one
-  // instruction.
-  // FIXME: This is not true for a bit test, but ignore such case for now to
-  // save compile-time.
-  int64_t CostLowerBound =
-      std::min((int64_t)CostUpperBound,
-               (int64_t)SI.getNumCases() * InlineConstants::InstrCost + Cost);
-
-  if (CostLowerBound > Threshold && !ComputeFullInlineCost) {
-    addCost((int64_t)SI.getNumCases() * InlineConstants::InstrCost);
-    return false;
-  }
-
   unsigned JumpTableSize = 0;
   unsigned NumCaseCluster =
       TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize);
diff --git a/llvm/test/Transforms/Inline/X86/switch.ll b/llvm/test/Transforms/Inline/X86/switch.ll
new file mode 100644
index 0000000..5565c7a
--- /dev/null
+++ b/llvm/test/Transforms/Inline/X86/switch.ll
@@ -0,0 +1,160 @@
+; RUN: opt < %s -inline -inline-threshold=1 -S -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
+; RUN: opt < %s -passes='cgscc(inline)' -inline-threshold=1 -S -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
+
+define i32 @callee_range(i32 %a, i32* %P) {
+  switch i32 %a, label %sw.default [
+  i32 0, label %sw.bb0
+  i32 1000, label %sw.bb1
+  i32 2000, label %sw.bb1
+  i32 3000, label %sw.bb1
+  i32 4000, label %sw.bb1
+  i32 5000, label %sw.bb1
+  i32 6000, label %sw.bb1
+  i32 7000, label %sw.bb1
+  i32 8000, label %sw.bb1
+  i32 9000, label %sw.bb1
+  ]
+
+sw.default:
+  store volatile i32 %a, i32* %P
+  br label %return
+sw.bb0:
+  store volatile i32 %a, i32* %P
+  br label %return
+sw.bb1:
+  store volatile i32 %a, i32* %P
+  br label %return
+return:
+  ret i32 42
+}
+
+define i32 @caller_range(i32 %a, i32* %P) {
+; CHECK-LABEL: @caller_range(
+; CHECK: call i32 @callee_range
+  %r = call i32 @callee_range(i32 %a, i32* %P)
+  ret i32 %r
+}
+
+define i32 @callee_bittest(i32 %a, i32* %P) {
+  switch i32 %a, label %sw.default [
+  i32 0, label %sw.bb0
+  i32 1, label %sw.bb1
+  i32 2, label %sw.bb2
+  i32 3, label %sw.bb0
+  i32 4, label %sw.bb1
+  i32 5, label %sw.bb2
+  i32 6, label %sw.bb0
+  i32 7, label %sw.bb1
+  i32 8, label %sw.bb2
+  ]
+
+sw.default:
+  store volatile i32 %a, i32* %P
+  br label %return
+
+sw.bb0:
+  store volatile i32 %a, i32* %P
+  br label %return
+
+sw.bb1:
+  store volatile i32 %a, i32* %P
+  br label %return
+
+sw.bb2:
+  br label %return
+
+return:
+  ret i32 42
+}
+
+
+define i32 @caller_bittest(i32 %a, i32* %P) {
+; CHECK-LABEL: @caller_bittest(
+; CHECK-NOT: call i32 @callee_bittest
+  %r= call i32 @callee_bittest(i32 %a, i32* %P)
+  ret i32 %r
+}
+
+define i32 @callee_jumptable(i32 %a, i32* %P) {
+  switch i32 %a, label %sw.default [
+  i32 1001, label %sw.bb101
+  i32 1002, label %sw.bb102
+  i32 1003, label %sw.bb103
+  i32 1004, label %sw.bb104
+  i32 1005, label %sw.bb101
+  i32 1006, label %sw.bb102
+  i32 1007, label %sw.bb103
+  i32 1008, label %sw.bb104
+  i32 1009, label %sw.bb101
+  i32 1010, label %sw.bb102
+  i32 1011, label %sw.bb103
+  i32 1012, label %sw.bb104
+  ]
+
+sw.default:
+  br label %return
+
+sw.bb101:
+  store volatile i32 %a, i32* %P
+  br label %return
+
+sw.bb102:
+  store volatile i32 %a, i32* %P
+  br label %return
+
+sw.bb103:
+  store volatile i32 %a, i32* %P
+  br label %return
+
+sw.bb104:
+  store volatile i32 %a, i32* %P
+  br label %return
+
+return:
+  ret i32 42
+}
+
+define i32 @caller_jumptable(i32 %a, i32 %b, i32* %P) {
+; CHECK-LABEL: @caller_jumptable(
+; CHECK: call i32 @callee_jumptable
+  %r = call i32 @callee_jumptable(i32 %b, i32* %P)
+  ret i32 %r
+}
+
+
+define internal i32 @callee_negativeCost(i32 %t) {
+entry:
+  switch i32 %t, label %sw.default [
+  i32 1, label %sw.bb
+  i32 0, label %sw.bb1
+  i32 42, label %sw.bb2
+  i32 43, label %sw.bb3
+  ]
+
+sw.bb:                                            ; preds = %entry
+  br label %cleanup
+
+sw.bb1:                                           ; preds = %entry
+  br label %cleanup
+
+sw.bb2:                                           ; preds = %entry
+  br label %cleanup
+
+sw.bb3:                                           ; preds = %entry
+  br label %cleanup
+
+sw.default:                                       ; preds = %entry
+  br label %cleanup
+
+cleanup:                                          ; preds = %sw.default, %sw.bb3, %sw.bb2, %sw.bb1, %sw.bb
+  %retval.0 = phi i32 [ 1, %sw.default ], [ 3, %sw.bb3 ], [ 2, %sw.bb2 ], [ 0, %sw.bb1 ], [ 0, %sw.bb ]
+  ret i32 %retval.0
+}
+
+define i32 @caller_negativeCost(i32 %t) {
+; CHECK-LABEL: @caller_negativeCost(
+; CHECK-NOT: call i32 @callee_negativeCost
+entry:
+  %call = call i32 @callee_negativeCost(i32 %t)
+  ret i32 %call
+}
-- 
2.7.4
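
A rough standalone sketch of why the removed lower bound over-rejects a
bit-test switch such as @callee_bittest above (plain C++, not LLVM code; the
InstrCost value mirrors InlineConstants::InstrCost at the time of this patch,
while the threshold and the three-instruction bit-test estimate are
assumptions chosen only for illustration):

// Illustration only: assumed values, not taken from the LLVM cost model itself.
#include <cstdio>

int main() {
  const int InstrCost = 5;   // InlineConstants::InstrCost at the time of this patch
  const int NumCases = 9;    // number of cases in @callee_bittest
  const int Threshold = 25;  // hypothetical inline threshold for illustration

  // Removed lower-bound estimate: at least one instruction per case.
  int EarlyExitEstimate = NumCases * InstrCost;  // 45 > Threshold, inline rejected

  // A bit-test lowering needs only a handful of instructions (roughly a range
  // check, a shift/mask, and a branch), independent of the number of cases.
  int BitTestEstimate = 3 * InstrCost;           // 15 <= Threshold, inline viable

  std::printf("early-exit estimate: %d, bit-test estimate: %d, threshold: %d\n",
              EarlyExitEstimate, BitTestEstimate, Threshold);
  return 0;
}

With nine cases the per-case estimate is 45, well above the assumed threshold
of 25, while a bit-test lowering stays around 15, which is why the early exit
rejected inlines that the full cost computation would have accepted.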