From fb45f3c9486f5d9e3003db95386432562b23577c Mon Sep 17 00:00:00 2001 From: Ruobing Han Date: Sun, 4 Sep 2022 12:56:25 -0400 Subject: [PATCH] [SimpleLoopUnswitch] Skip non-trivial unswitching of cold functions On the current main branch, non-trivial unswitching is skipped for every cold loop. As reported in D129599, skipping these cold loops causes a regression in the SPEC benchmarks. Therefore, instead of skipping all cold loops, only skip loops that are in cold functions. Reviewed By: alexgatea, aeubanks Differential Revision: https://reviews.llvm.org/D133275 --- llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp | 2 +- .../SimpleLoopUnswitch/PGO-nontrivial-unswitch.ll | 80 +++------------------- .../SimpleLoopUnswitch/PGO-nontrivial-unswitch2.ll | 71 +++++++++++-------- 3 files changed, 51 insertions(+), 102 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp index 70f97d8..e1d5bb5 100644 --- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp +++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp @@ -3086,7 +3086,7 @@ unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI, AssumptionCache &AC, // Skip cold loops, as unswitching them brings little benefit // but increases the code size if (PSI && PSI->hasProfileSummary() && BFI && - PSI->isColdBlock(L.getHeader(), BFI)) { + PSI->isFunctionColdInCallGraph(L.getHeader()->getParent(), *BFI)) { LLVM_DEBUG(dbgs() << " Skip cold loop: " << L << "\n"); return false; } diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/PGO-nontrivial-unswitch.ll b/llvm/test/Transforms/SimpleLoopUnswitch/PGO-nontrivial-unswitch.ll index c442f04..eeb5014 100644 --- a/llvm/test/Transforms/SimpleLoopUnswitch/PGO-nontrivial-unswitch.ll +++ b/llvm/test/Transforms/SimpleLoopUnswitch/PGO-nontrivial-unswitch.ll @@ -6,89 +6,27 @@ declare i32 @a() declare i32 @b() - +; Check loops in cold functions will not be applied non-trivial loop unswitch define void @f1(i32 %i, i1 %cond, i1 
%hot_cond, i1 %cold_cond, i1* %ptr) !prof !0 { ; CHECK-LABEL: @f1( ; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[ENTRY_HOT_LOOP:%.*]] -; CHECK: entry_hot_loop: -; CHECK-NEXT: br i1 [[HOT_COND:%.*]], label [[HOT_LOOP_BEGIN_PREHEADER:%.*]], label [[HOT_LOOP_EXIT:%.*]], !prof [[PROF15:![0-9]+]] -; CHECK: hot_loop_begin.preheader: -; CHECK-NEXT: br i1 [[COND:%.*]], label [[HOT_LOOP_BEGIN_PREHEADER_SPLIT_US:%.*]], label [[HOT_LOOP_BEGIN_PREHEADER_SPLIT:%.*]] -; CHECK: hot_loop_begin.preheader.split.us: -; CHECK-NEXT: br label [[HOT_LOOP_BEGIN_US:%.*]] -; CHECK: hot_loop_begin.us: -; CHECK-NEXT: br label [[HOT_LOOP_A_US:%.*]] -; CHECK: hot_loop_a.us: -; CHECK-NEXT: [[TMP0:%.*]] = call i32 @a() -; CHECK-NEXT: br label [[HOT_LOOP_LATCH_US:%.*]] -; CHECK: hot_loop_latch.us: -; CHECK-NEXT: [[V1_US:%.*]] = load i1, i1* [[PTR:%.*]], align 1 -; CHECK-NEXT: br i1 [[V1_US]], label [[HOT_LOOP_BEGIN_US]], label [[HOT_LOOP_EXIT_LOOPEXIT_SPLIT_US:%.*]] -; CHECK: hot_loop_exit.loopexit.split.us: -; CHECK-NEXT: br label [[HOT_LOOP_EXIT_LOOPEXIT:%.*]] -; CHECK: hot_loop_begin.preheader.split: -; CHECK-NEXT: br label [[HOT_LOOP_BEGIN:%.*]] -; CHECK: hot_loop_begin: -; CHECK-NEXT: br label [[HOT_LOOP_B:%.*]] -; CHECK: hot_loop_b: -; CHECK-NEXT: [[TMP1:%.*]] = call i32 @b() -; CHECK-NEXT: br label [[HOT_LOOP_LATCH:%.*]] -; CHECK: hot_loop_latch: -; CHECK-NEXT: [[V1:%.*]] = load i1, i1* [[PTR]], align 1 -; CHECK-NEXT: br i1 [[V1]], label [[HOT_LOOP_BEGIN]], label [[HOT_LOOP_EXIT_LOOPEXIT_SPLIT:%.*]] -; CHECK: hot_loop_exit.loopexit.split: -; CHECK-NEXT: br label [[HOT_LOOP_EXIT_LOOPEXIT]] -; CHECK: hot_loop_exit.loopexit: -; CHECK-NEXT: br label [[HOT_LOOP_EXIT]] -; CHECK: hot_loop_exit: -; CHECK-NEXT: br label [[ENTRY_COLD_LOOP:%.*]] -; CHECK: entry_cold_loop: -; CHECK-NEXT: br i1 [[COLD_COND:%.*]], label [[COLD_LOOP_BEGIN_PREHEADER:%.*]], label [[COLD_LOOP_EXIT:%.*]], !prof [[PROF16:![0-9]+]] -; CHECK: cold_loop_begin.preheader: ; CHECK-NEXT: br label [[COLD_LOOP_BEGIN:%.*]] ; CHECK: 
cold_loop_begin: -; CHECK-NEXT: br i1 [[COND]], label [[COLD_LOOP_A:%.*]], label [[COLD_LOOP_B:%.*]] +; CHECK-NEXT: br i1 [[COND:%.*]], label [[COLD_LOOP_A:%.*]], label [[COLD_LOOP_B:%.*]] ; CHECK: cold_loop_a: -; CHECK-NEXT: [[TMP2:%.*]] = call i32 @a() +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @a() ; CHECK-NEXT: br label [[COLD_LOOP_LATCH:%.*]] ; CHECK: cold_loop_b: -; CHECK-NEXT: [[TMP3:%.*]] = call i32 @b() +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @b() ; CHECK-NEXT: br label [[COLD_LOOP_LATCH]] ; CHECK: cold_loop_latch: -; CHECK-NEXT: [[V2:%.*]] = load i1, i1* [[PTR]], align 1 -; CHECK-NEXT: br i1 [[V2]], label [[COLD_LOOP_BEGIN]], label [[COLD_LOOP_EXIT_LOOPEXIT:%.*]] -; CHECK: cold_loop_exit.loopexit: -; CHECK-NEXT: br label [[COLD_LOOP_EXIT]] +; CHECK-NEXT: [[V2:%.*]] = load i1, i1* [[PTR:%.*]], align 1 +; CHECK-NEXT: br i1 [[V2]], label [[COLD_LOOP_BEGIN]], label [[COLD_LOOP_EXIT:%.*]] ; CHECK: cold_loop_exit: ; CHECK-NEXT: ret void ; entry: - br label %entry_hot_loop - -entry_hot_loop: - br i1 %hot_cond, label %hot_loop_begin, label %hot_loop_exit, !prof !15 - -hot_loop_begin: - br i1 %cond, label %hot_loop_a, label %hot_loop_b - -hot_loop_a: - call i32 @a() - br label %hot_loop_latch - -hot_loop_b: - call i32 @b() - br label %hot_loop_latch - -hot_loop_latch: - %v1 = load i1, i1* %ptr - br i1 %v1, label %hot_loop_begin, label %hot_loop_exit - -hot_loop_exit: - br label %entry_cold_loop - -entry_cold_loop: - br i1 %cold_cond, label %cold_loop_begin, label %cold_loop_exit, !prof !16 + br label %cold_loop_begin cold_loop_begin: br i1 %cond, label %cold_loop_a, label %cold_loop_b @@ -110,7 +48,7 @@ cold_loop_exit: } !llvm.module.flags = !{!1} -!0 = !{!"function_entry_count", i64 400} +!0 = !{!"function_entry_count", i64 0} !1 = !{i32 1, !"ProfileSummary", !2} !2 = !{!3, !4, !5, !6, !7, !8, !9, !10} !3 = !{!"ProfileFormat", !"InstrProf"} @@ -125,5 +63,3 @@ cold_loop_exit: !12 = !{i32 10000, i64 100, i32 1} !13 = !{i32 999000, i64 100, i32 1} !14 = !{i32 999999, i64 
1, i32 2} -!15 = !{!"branch_weights", i32 100, i32 0} -!16 = !{!"branch_weights", i32 0, i32 100} diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/PGO-nontrivial-unswitch2.ll b/llvm/test/Transforms/SimpleLoopUnswitch/PGO-nontrivial-unswitch2.ll index cc3df2f..452b4d8 100644 --- a/llvm/test/Transforms/SimpleLoopUnswitch/PGO-nontrivial-unswitch2.ll +++ b/llvm/test/Transforms/SimpleLoopUnswitch/PGO-nontrivial-unswitch2.ll @@ -8,25 +8,38 @@ declare i32 @b() ; Check loops will be applied non-trivial loop unswitch in a non-cold function, ; even loop headers are cold -define void @f1(i32 %i, i1 %cond, i1 %hot_cond, i1 %cold_cond, i1* %ptr) !prof !0 { +define void @f1(i32 %i, i1 %cond, i1 %hot_cond, i1 %cold_cond, i1* %ptr) !prof !14 { ; CHECK-LABEL: @f1( ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[ENTRY_COLD_LOOP:%.*]] ; CHECK: entry_cold_loop: ; CHECK-NEXT: br i1 [[COLD_COND:%.*]], label [[COLD_LOOP_BEGIN_PREHEADER:%.*]], label [[COLD_LOOP_EXIT:%.*]], !prof [[PROF15:![0-9]+]] ; CHECK: cold_loop_begin.preheader: +; CHECK-NEXT: br i1 [[COND:%.*]], label [[COLD_LOOP_BEGIN_PREHEADER_SPLIT_US:%.*]], label [[COLD_LOOP_BEGIN_PREHEADER_SPLIT:%.*]] +; CHECK: cold_loop_begin.preheader.split.us: +; CHECK-NEXT: br label [[COLD_LOOP_BEGIN_US:%.*]] +; CHECK: cold_loop_begin.us: +; CHECK-NEXT: br label [[COLD_LOOP_A_US:%.*]] +; CHECK: cold_loop_a.us: +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @a() +; CHECK-NEXT: br label [[COLD_LOOP_LATCH_US:%.*]] +; CHECK: cold_loop_latch.us: +; CHECK-NEXT: [[V2_US:%.*]] = load i1, i1* [[PTR:%.*]], align 1 +; CHECK-NEXT: br i1 [[V2_US]], label [[COLD_LOOP_BEGIN_US]], label [[COLD_LOOP_EXIT_LOOPEXIT_SPLIT_US:%.*]] +; CHECK: cold_loop_exit.loopexit.split.us: +; CHECK-NEXT: br label [[COLD_LOOP_EXIT_LOOPEXIT:%.*]] +; CHECK: cold_loop_begin.preheader.split: ; CHECK-NEXT: br label [[COLD_LOOP_BEGIN:%.*]] ; CHECK: cold_loop_begin: -; CHECK-NEXT: br i1 [[COND:%.*]], label [[COLD_LOOP_A:%.*]], label [[COLD_LOOP_B:%.*]] -; CHECK: cold_loop_a: -; CHECK-NEXT: 
[[TMP0:%.*]] = call i32 @a() -; CHECK-NEXT: br label [[COLD_LOOP_LATCH:%.*]] +; CHECK-NEXT: br label [[COLD_LOOP_B:%.*]] ; CHECK: cold_loop_b: ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @b() -; CHECK-NEXT: br label [[COLD_LOOP_LATCH]] +; CHECK-NEXT: br label [[COLD_LOOP_LATCH:%.*]] ; CHECK: cold_loop_latch: -; CHECK-NEXT: [[V2:%.*]] = load i1, i1* [[PTR:%.*]], align 1 -; CHECK-NEXT: br i1 [[V2]], label [[COLD_LOOP_BEGIN]], label [[COLD_LOOP_EXIT_LOOPEXIT:%.*]] +; CHECK-NEXT: [[V2:%.*]] = load i1, i1* [[PTR]], align 1 +; CHECK-NEXT: br i1 [[V2]], label [[COLD_LOOP_BEGIN]], label [[COLD_LOOP_EXIT_LOOPEXIT_SPLIT:%.*]] +; CHECK: cold_loop_exit.loopexit.split: +; CHECK-NEXT: br label [[COLD_LOOP_EXIT_LOOPEXIT]] ; CHECK: cold_loop_exit.loopexit: ; CHECK-NEXT: br label [[COLD_LOOP_EXIT]] ; CHECK: cold_loop_exit: @@ -36,17 +49,17 @@ entry: br label %entry_cold_loop entry_cold_loop: - br i1 %cold_cond, label %cold_loop_begin, label %cold_loop_exit, !prof !16 + br i1 %cold_cond, label %cold_loop_begin, label %cold_loop_exit, !prof !15 cold_loop_begin: br i1 %cond, label %cold_loop_a, label %cold_loop_b cold_loop_a: - call i32 @a() + %0 = call i32 @a() br label %cold_loop_latch cold_loop_b: - call i32 @b() + %1 = call i32 @b() br label %cold_loop_latch cold_loop_latch: @@ -57,21 +70,21 @@ cold_loop_exit: ret void } -!llvm.module.flags = !{!1} -!0 = !{!"function_entry_count", i64 400} -!1 = !{i32 1, !"ProfileSummary", !2} -!2 = !{!3, !4, !5, !6, !7, !8, !9, !10} -!3 = !{!"ProfileFormat", !"InstrProf"} -!4 = !{!"TotalCount", i64 10000} -!5 = !{!"MaxCount", i64 10} -!6 = !{!"MaxInternalCount", i64 1} -!7 = !{!"MaxFunctionCount", i64 1000} -!8 = !{!"NumCounts", i64 3} -!9 = !{!"NumFunctions", i64 3} -!10 = !{!"DetailedSummary", !11} -!11 = !{!12, !13, !14} -!12 = !{i32 10000, i64 100, i32 1} -!13 = !{i32 999000, i64 100, i32 1} -!14 = !{i32 999999, i64 1, i32 2} -!15 = !{!"branch_weights", i32 100, i32 0} -!16 = !{!"branch_weights", i32 0, i32 100} +!llvm.module.flags = !{!0} + +!0 = 
!{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 400} +!15 = !{!"branch_weights", i32 0, i32 100} -- 2.7.4