From dfb40d3fd7a261d7e3f697242df4680f1e6780ff Mon Sep 17 00:00:00 2001 From: Teresa Johnson Date: Sun, 19 Mar 2023 08:45:28 -0700 Subject: [PATCH] [SimpleLoopUnswitch] Skip non-trivial unswitching of cold loop nests This fixes a compile time issue due to guarding loop unswitching based on whether the enclosing function is cold. That approach is very inefficient in the case of large cold functions that contain numerous loops, since the loop pass calls isFunctionColdInCallGraph once per loop, and that function walks all BBs in the function (twice for Sample PGO) looking for any non-cold blocks. Originally, this code only checked if the current Loop's header was cold (D129599). However, that apparently caused a slowdown on a SPEC benchmark, and the example given was that of a cold inner loop nested in a non-cold outer loop (see comments in D129599). The fix was to check if the whole function is cold, done in D133275. This is overkill, and we can simply check if the header of any loop in the current loop's loop nest is non-cold (looking at both outer and inner loops). This patch drops the compile time for a large module by 40% with this approach. I also updated PGO-nontrivial-unswitch2.ll since it only had one cold loop in a non-cold function, so that it instead had IR based off the example given in the comments relating to the SPEC degradation in D129599. I confirmed that the new version of the test fails with the original check done in D129599 of only the current loop's header coldness. Similarly updated test PGO-nontrivial-unswitch.ll to contain a cold loop in a cold loop nest, and created PGO-nontrivial-unswitch3.ll to contain a non-cold loop in a non-cold loop nest. 
Differential Revision: https://reviews.llvm.org/D146383 --- llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp | 31 ++- .../SimpleLoopUnswitch/PGO-nontrivial-unswitch.ll | 161 +++++++++----- .../SimpleLoopUnswitch/PGO-nontrivial-unswitch2.ll | 236 ++++++++++++++------- .../SimpleLoopUnswitch/PGO-nontrivial-unswitch3.ll | 184 ++++++++++++++++ 4 files changed, 485 insertions(+), 127 deletions(-) create mode 100644 llvm/test/Transforms/SimpleLoopUnswitch/PGO-nontrivial-unswitch3.ll diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp index 204aaf7..819653c 100644 --- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp +++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp @@ -3493,10 +3493,33 @@ unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI, AssumptionCache &AC, if (L.getHeader()->getParent()->hasOptSize()) return false; - // Skip cold loops, as unswitching them brings little benefit - // but increases the code size - if (PSI && PSI->hasProfileSummary() && BFI && - PSI->isFunctionColdInCallGraph(L.getHeader()->getParent(), *BFI)) { + // Returns true if Loop L's loop nest is cold, i.e. if the headers of L, + // of the loops L is nested in, and of the loops nested in L are all cold. + auto IsLoopNestCold = [&](const Loop *L) { + // Check L and all of its parent loops. + auto *Parent = L; + while (Parent) { + if (!PSI->isColdBlock(Parent->getHeader(), BFI)) + return false; + Parent = Parent->getParentLoop(); + } + // Next check all loops nested within L. 
+ SmallVector<const Loop *, 4> Worklist; + Worklist.insert(Worklist.end(), L->getSubLoops().begin(), + L->getSubLoops().end()); + while (!Worklist.empty()) { + auto *CurLoop = Worklist.pop_back_val(); + if (!PSI->isColdBlock(CurLoop->getHeader(), BFI)) + return false; + Worklist.insert(Worklist.end(), CurLoop->getSubLoops().begin(), + CurLoop->getSubLoops().end()); + } + return true; + }; + + // Skip cold loops in cold loop nests, as unswitching them brings little + // benefit but increases the code size + if (PSI && PSI->hasProfileSummary() && BFI && IsLoopNestCold(&L)) { LLVM_DEBUG(dbgs() << " Skip cold loop: " << L << "\n"); return false; } diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/PGO-nontrivial-unswitch.ll b/llvm/test/Transforms/SimpleLoopUnswitch/PGO-nontrivial-unswitch.ll index daff716..f1ffcc7 100644 --- a/llvm/test/Transforms/SimpleLoopUnswitch/PGO-nontrivial-unswitch.ll +++ b/llvm/test/Transforms/SimpleLoopUnswitch/PGO-nontrivial-unswitch.ll @@ -1,65 +1,124 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 ; RUN: opt < %s -passes='require<profile-summary>,function(loop-mssa(simple-loop-unswitch<nontrivial>))' -S | FileCheck %s -; This test checks for a crash. -; RUN: opt < %s -passes=simple-loop-unswitch -aa-pipeline= -disable-output -declare i32 @a() -declare i32 @b() -; Check loops in cold functions will not be applied non-trivial loop unswitch -define void @f1(i32 %i, i1 %cond, i1 %hot_cond, i1 %cold_cond, ptr %ptr) !prof !0 { -; CHECK-LABEL: @f1( +;; Check that non-trivial loop unswitching is not applied to a cold loop in a +;; cold loop nest. + +;; IR was generated from the following loop nest, profiled when called +;; with M=0 and N=0. 
+;; void hotFunction(bool cond, int M, int N, int * A, int *B, int *C) { +;; for (unsigned j = 0; j < M; j++) +;; for (unsigned i=0; i < N; i++) { +;; A[i] = B[i] + C[i]; +;; if (cond) do_something(); +;; } +;; } + +define void @_Z11hotFunctionbiiPiS_S_(i1 %cond, i32 %M, i32 %N, ptr %A, ptr %B, ptr %C) !prof !36 { +; CHECK-LABEL: define void @_Z11hotFunctionbiiPiS_S_ +; CHECK-SAME: (i1 [[COND:%.*]], i32 [[M:%.*]], i32 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) !prof [[PROF16:![0-9]+]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[COLD_LOOP_BEGIN:%.*]] -; CHECK: cold_loop_begin: -; CHECK-NEXT: br i1 [[COND:%.*]], label [[COLD_LOOP_A:%.*]], label [[COLD_LOOP_B:%.*]] -; CHECK: cold_loop_a: -; CHECK-NEXT: [[TMP0:%.*]] = call i32 @a() -; CHECK-NEXT: br label [[COLD_LOOP_LATCH:%.*]] -; CHECK: cold_loop_b: -; CHECK-NEXT: [[TMP1:%.*]] = call i32 @b() -; CHECK-NEXT: br label [[COLD_LOOP_LATCH]] -; CHECK: cold_loop_latch: -; CHECK-NEXT: [[V2:%.*]] = load i1, ptr [[PTR:%.*]], align 1 -; CHECK-NEXT: br i1 [[V2]], label [[COLD_LOOP_BEGIN]], label [[COLD_LOOP_EXIT:%.*]] -; CHECK: cold_loop_exit: +; CHECK-NEXT: [[CMP19_NOT:%.*]] = icmp eq i32 [[M]], 0 +; CHECK-NEXT: br i1 [[CMP19_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER_LR_PH:%.*]], !prof [[PROF17:![0-9]+]] +; CHECK: for.cond1.preheader.lr.ph: +; CHECK-NEXT: [[CMP217_NOT:%.*]] = icmp eq i32 [[N]], 0 +; CHECK-NEXT: br label [[FOR_COND1_PREHEADER:%.*]] +; CHECK: for.cond1.preheader: +; CHECK-NEXT: [[J_020:%.*]] = phi i32 [ 0, [[FOR_COND1_PREHEADER_LR_PH]] ], [ [[INC10:%.*]], [[FOR_COND_CLEANUP3:%.*]] ] +; CHECK-NEXT: br i1 [[CMP217_NOT]], label [[FOR_COND_CLEANUP3]], label [[FOR_BODY4_PREHEADER:%.*]] +; CHECK: for.body4.preheader: +; CHECK-NEXT: br label [[FOR_BODY4:%.*]] +; CHECK: for.cond.cleanup.loopexit: +; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] +; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void +; CHECK: for.cond.cleanup3.loopexit: +; CHECK-NEXT: br label [[FOR_COND_CLEANUP3]] +; 
CHECK: for.cond.cleanup3: +; CHECK-NEXT: [[INC10]] = add nuw i32 [[J_020]], 1 +; CHECK-NEXT: [[EXITCOND22_NOT:%.*]] = icmp eq i32 [[INC10]], [[M]] +; CHECK-NEXT: br i1 [[EXITCOND22_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_COND1_PREHEADER]], !prof [[PROF17]] +; CHECK: for.body4: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 0, [[FOR_BODY4_PREHEADER]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX6]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]] +; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX8]], align 4 +; CHECK-NEXT: br i1 [[COND]], label [[IF_THEN:%.*]], label [[FOR_INC]] +; CHECK: if.then: +; CHECK-NEXT: tail call void @_Z12do_somethingv() +; CHECK-NEXT: br label [[FOR_INC]] +; CHECK: for.inc: +; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP3_LOOPEXIT:%.*]], label [[FOR_BODY4]] ; entry: - br label %cold_loop_begin + %cmp19.not = icmp eq i32 %M, 0 + br i1 %cmp19.not, label %for.cond.cleanup, label %for.cond1.preheader.lr.ph, !prof !37 -cold_loop_begin: - br i1 %cond, label %cold_loop_a, label %cold_loop_b +for.cond1.preheader.lr.ph: + %cmp217.not = icmp eq i32 %N, 0 + br label %for.cond1.preheader -cold_loop_a: - call i32 @a() - br label %cold_loop_latch +for.cond1.preheader: + %j.020 = phi i32 [ 0, %for.cond1.preheader.lr.ph ], [ %inc10, %for.cond.cleanup3 ] + br i1 %cmp217.not, label 
%for.cond.cleanup3, label %for.body4 -cold_loop_b: - call i32 @b() - br label %cold_loop_latch +for.cond.cleanup: + ret void -cold_loop_latch: - %v2 = load i1, ptr %ptr - br i1 %v2, label %cold_loop_begin, label %cold_loop_exit +for.cond.cleanup3: + %inc10 = add nuw i32 %j.020, 1 + %exitcond22.not = icmp eq i32 %inc10, %M + br i1 %exitcond22.not, label %for.cond.cleanup, label %for.cond1.preheader, !prof !37 -cold_loop_exit: - ret void +for.body4: + %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %for.cond1.preheader ] + %arrayidx = getelementptr inbounds i32, ptr %B, i64 %indvars.iv + %0 = load i32, ptr %arrayidx, align 4 + %arrayidx6 = getelementptr inbounds i32, ptr %C, i64 %indvars.iv + %1 = load i32, ptr %arrayidx6, align 4 + %add = add nsw i32 %1, %0 + %arrayidx8 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv + store i32 %add, ptr %arrayidx8, align 4 + br i1 %cond, label %if.then, label %for.inc + +if.then: + tail call void @_Z12do_somethingv() + br label %for.inc + +for.inc: + %wide.trip.count = zext i32 %N to i64 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup3, label %for.body4 } -!llvm.module.flags = !{!1} -!0 = !{!"function_entry_count", i64 0} -!1 = !{i32 1, !"ProfileSummary", !2} -!2 = !{!3, !4, !5, !6, !7, !8, !9, !10} -!3 = !{!"ProfileFormat", !"InstrProf"} -!4 = !{!"TotalCount", i64 10000} -!5 = !{!"MaxCount", i64 10} -!6 = !{!"MaxInternalCount", i64 1} -!7 = !{!"MaxFunctionCount", i64 1000} -!8 = !{!"NumCounts", i64 3} -!9 = !{!"NumFunctions", i64 3} -!10 = !{!"DetailedSummary", !11} -!11 = !{!12, !13, !14} -!12 = !{i32 10000, i64 100, i32 1} -!13 = !{i32 999000, i64 100, i32 1} -!14 = !{i32 999999, i64 1, i32 2} +declare void @_Z12do_somethingv() + +!llvm.module.flags = !{!6} + +!6 = !{i32 1, !"ProfileSummary", !7} +!7 = !{!8, !9, !10, !11, !12, !13, !14, !15, !16, !17} +!8 = !{!"ProfileFormat", !"InstrProf"} +!9 = 
!{!"TotalCount", i64 1002} +!10 = !{!"MaxCount", i64 1000} +!11 = !{!"MaxInternalCount", i64 1000} +!12 = !{!"MaxFunctionCount", i64 1} +!13 = !{!"NumCounts", i64 6} +!14 = !{!"NumFunctions", i64 3} +!15 = !{!"IsPartialProfile", i64 0} +!16 = !{!"PartialProfileRatio", double 0.000000e+00} +!17 = !{!"DetailedSummary", !18} +!18 = !{!19, !31, !34} +!19 = !{i32 10000, i64 1000, i32 1} +!31 = !{i32 999000, i64 1000, i32 1} +!34 = !{i32 999999, i64 1, i32 3} +!36 = !{!"function_entry_count", i64 1} +!37 = !{!"branch_weights", i32 1, i32 0} diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/PGO-nontrivial-unswitch2.ll b/llvm/test/Transforms/SimpleLoopUnswitch/PGO-nontrivial-unswitch2.ll index 2f7acac..ad674ed 100644 --- a/llvm/test/Transforms/SimpleLoopUnswitch/PGO-nontrivial-unswitch2.ll +++ b/llvm/test/Transforms/SimpleLoopUnswitch/PGO-nontrivial-unswitch2.ll @@ -1,90 +1,182 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 ; RUN: opt < %s -passes='require<profile-summary>,function(loop-mssa(simple-loop-unswitch<nontrivial>))' -S | FileCheck %s -declare i32 @a() -declare i32 @b() +;; Check that non-trivial loop unswitching is applied to a cold loop in a +;; non-cold loop nest. -; Check loops will be applied non-trivial loop unswitch in a non-cold function, -; even loop headers are cold +;; IR was generated from the following loop nest, profiled when called +;; with M=1000 and N=0. 
+;; void hotFunction(bool cond, int M, int N, int * A, int *B, int *C) { +;; for (unsigned j = 0; j < M; j++) +;; for (unsigned i=0; i < N; i++) { +;; A[i] = B[i] + C[i]; +;; if (cond) do_something(); +;; } +;; } -define void @f1(i32 %i, i1 %cond, i1 %hot_cond, i1 %cold_cond, ptr %ptr) !prof !14 { -; CHECK-LABEL: @f1( +define void @_Z11hotFunctionbiiPiS_S_(i1 %cond, i32 %M, i32 %N, ptr %A, ptr %B, ptr %C) !prof !36 { +; CHECK-LABEL: define void @_Z11hotFunctionbiiPiS_S_ +; CHECK-SAME: (i1 [[COND:%.*]], i32 [[M:%.*]], i32 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) !prof [[PROF33:![0-9]+]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[ENTRY_COLD_LOOP:%.*]] -; CHECK: entry_cold_loop: -; CHECK-NEXT: br i1 [[COLD_COND:%.*]], label [[COLD_LOOP_BEGIN_PREHEADER:%.*]], label [[COLD_LOOP_EXIT:%.*]], !prof [[PROF15:![0-9]+]] -; CHECK: cold_loop_begin.preheader: -; CHECK-NEXT: br i1 [[COND:%.*]], label [[COLD_LOOP_BEGIN_PREHEADER_SPLIT_US:%.*]], label [[COLD_LOOP_BEGIN_PREHEADER_SPLIT:%.*]] -; CHECK: cold_loop_begin.preheader.split.us: -; CHECK-NEXT: br label [[COLD_LOOP_BEGIN_US:%.*]] -; CHECK: cold_loop_begin.us: -; CHECK-NEXT: br label [[COLD_LOOP_A_US:%.*]] -; CHECK: cold_loop_a.us: -; CHECK-NEXT: [[TMP0:%.*]] = call i32 @a() -; CHECK-NEXT: br label [[COLD_LOOP_LATCH_US:%.*]] -; CHECK: cold_loop_latch.us: -; CHECK-NEXT: [[V2_US:%.*]] = load i1, ptr [[PTR:%.*]], align 1 -; CHECK-NEXT: br i1 [[V2_US]], label [[COLD_LOOP_BEGIN_US]], label [[COLD_LOOP_EXIT_LOOPEXIT_SPLIT_US:%.*]] -; CHECK: cold_loop_exit.loopexit.split.us: -; CHECK-NEXT: br label [[COLD_LOOP_EXIT_LOOPEXIT:%.*]] -; CHECK: cold_loop_begin.preheader.split: -; CHECK-NEXT: br label [[COLD_LOOP_BEGIN:%.*]] -; CHECK: cold_loop_begin: -; CHECK-NEXT: br label [[COLD_LOOP_B:%.*]] -; CHECK: cold_loop_b: -; CHECK-NEXT: [[TMP1:%.*]] = call i32 @b() -; CHECK-NEXT: br label [[COLD_LOOP_LATCH:%.*]] -; CHECK: cold_loop_latch: -; CHECK-NEXT: [[V2:%.*]] = load i1, ptr [[PTR]], align 1 -; CHECK-NEXT: br i1 [[V2]], 
label [[COLD_LOOP_BEGIN]], label [[COLD_LOOP_EXIT_LOOPEXIT_SPLIT:%.*]] -; CHECK: cold_loop_exit.loopexit.split: -; CHECK-NEXT: br label [[COLD_LOOP_EXIT_LOOPEXIT]] -; CHECK: cold_loop_exit.loopexit: -; CHECK-NEXT: br label [[COLD_LOOP_EXIT]] -; CHECK: cold_loop_exit: +; CHECK-NEXT: [[CMP19_NOT:%.*]] = icmp eq i32 [[M]], 0 +; CHECK-NEXT: br i1 [[CMP19_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER_LR_PH:%.*]], !prof [[PROF34:![0-9]+]] +; CHECK: for.cond1.preheader.lr.ph: +; CHECK-NEXT: [[CMP217_NOT:%.*]] = icmp eq i32 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP217_NOT]], label [[FOR_COND1_PREHEADER_LR_PH_SPLIT_US:%.*]], label [[FOR_COND1_PREHEADER_LR_PH_SPLIT:%.*]], !prof [[PROF35:![0-9]+]] +; CHECK: for.cond1.preheader.lr.ph.split.us: +; CHECK-NEXT: br label [[FOR_COND1_PREHEADER_US:%.*]] +; CHECK: for.cond1.preheader.us: +; CHECK-NEXT: [[J_020_US:%.*]] = phi i32 [ 0, [[FOR_COND1_PREHEADER_LR_PH_SPLIT_US]] ], [ [[INC10_US:%.*]], [[FOR_COND_CLEANUP3_US:%.*]] ] +; CHECK-NEXT: br label [[FOR_COND_CLEANUP3_US]] +; CHECK: for.cond.cleanup3.us: +; CHECK-NEXT: [[INC10_US]] = add nuw i32 [[J_020_US]], 1 +; CHECK-NEXT: [[EXITCOND22_NOT_US:%.*]] = icmp eq i32 [[INC10_US]], [[M]] +; CHECK-NEXT: br i1 [[EXITCOND22_NOT_US]], label [[FOR_COND_CLEANUP_LOOPEXIT_SPLIT_US:%.*]], label [[FOR_COND1_PREHEADER_US]], !prof [[PROF34]] +; CHECK: for.cond.cleanup.loopexit.split.us: +; CHECK-NEXT: br label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] +; CHECK: for.cond1.preheader.lr.ph.split: +; CHECK-NEXT: br i1 [[COND]], label [[FOR_COND1_PREHEADER_LR_PH_SPLIT_SPLIT_US:%.*]], label [[FOR_COND1_PREHEADER_LR_PH_SPLIT_SPLIT:%.*]] +; CHECK: for.cond1.preheader.lr.ph.split.split.us: +; CHECK-NEXT: br label [[FOR_COND1_PREHEADER_US1:%.*]] +; CHECK: for.cond1.preheader.us1: +; CHECK-NEXT: [[J_020_US2:%.*]] = phi i32 [ 0, [[FOR_COND1_PREHEADER_LR_PH_SPLIT_SPLIT_US]] ], [ [[INC10_US4:%.*]], [[FOR_COND_CLEANUP3_US3:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY4_PREHEADER_US:%.*]] +; CHECK: 
for.cond.cleanup3.us3: +; CHECK-NEXT: [[INC10_US4]] = add nuw i32 [[J_020_US2]], 1 +; CHECK-NEXT: [[EXITCOND22_NOT_US5:%.*]] = icmp eq i32 [[INC10_US4]], [[M]] +; CHECK-NEXT: br i1 [[EXITCOND22_NOT_US5]], label [[FOR_COND_CLEANUP_LOOPEXIT_SPLIT_SPLIT_US:%.*]], label [[FOR_COND1_PREHEADER_US1]], !prof [[PROF34]] +; CHECK: for.body4.preheader.us: +; CHECK-NEXT: br label [[FOR_BODY4_PREHEADER_SPLIT_US_US:%.*]] +; CHECK: for.cond.cleanup3.loopexit.us: +; CHECK-NEXT: br label [[FOR_COND_CLEANUP3_US3]] +; CHECK: for.body4.preheader.split.us.us: +; CHECK-NEXT: br label [[FOR_BODY4_US_US:%.*]] +; CHECK: for.body4.us.us: +; CHECK-NEXT: [[INDVARS_IV_US_US:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_US_US:%.*]], [[FOR_INC_US_US:%.*]] ], [ 0, [[FOR_BODY4_PREHEADER_SPLIT_US_US]] ] +; CHECK-NEXT: [[ARRAYIDX_US_US:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV_US_US]] +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX_US_US]], align 4 +; CHECK-NEXT: [[ARRAYIDX6_US_US:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDVARS_IV_US_US]] +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX6_US_US]], align 4 +; CHECK-NEXT: [[ADD_US_US:%.*]] = add nsw i32 [[TMP1]], [[TMP0]] +; CHECK-NEXT: [[ARRAYIDX8_US_US:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV_US_US]] +; CHECK-NEXT: store i32 [[ADD_US_US]], ptr [[ARRAYIDX8_US_US]], align 4 +; CHECK-NEXT: br label [[IF_THEN_US_US:%.*]] +; CHECK: if.then.us.us: +; CHECK-NEXT: tail call void @_Z12do_somethingv() +; CHECK-NEXT: br label [[FOR_INC_US_US]] +; CHECK: for.inc.us.us: +; CHECK-NEXT: [[WIDE_TRIP_COUNT_US_US:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: [[INDVARS_IV_NEXT_US_US]] = add nuw nsw i64 [[INDVARS_IV_US_US]], 1 +; CHECK-NEXT: [[EXITCOND_NOT_US_US:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_US_US]], [[WIDE_TRIP_COUNT_US_US]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT_US_US]], label [[FOR_COND_CLEANUP3_LOOPEXIT_SPLIT_US_US:%.*]], label [[FOR_BODY4_US_US]], !prof [[PROF35]] +; CHECK: 
for.cond.cleanup3.loopexit.split.us.us: +; CHECK-NEXT: br label [[FOR_COND_CLEANUP3_LOOPEXIT_US:%.*]] +; CHECK: for.cond.cleanup.loopexit.split.split.us: +; CHECK-NEXT: br label [[FOR_COND_CLEANUP_LOOPEXIT_SPLIT:%.*]] +; CHECK: for.cond1.preheader.lr.ph.split.split: +; CHECK-NEXT: br label [[FOR_COND1_PREHEADER:%.*]] +; CHECK: for.cond1.preheader: +; CHECK-NEXT: [[J_020:%.*]] = phi i32 [ 0, [[FOR_COND1_PREHEADER_LR_PH_SPLIT_SPLIT]] ], [ [[INC10:%.*]], [[FOR_COND_CLEANUP3:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY4_PREHEADER:%.*]] +; CHECK: for.body4.preheader: +; CHECK-NEXT: br label [[FOR_BODY4_PREHEADER_SPLIT:%.*]] +; CHECK: for.body4.preheader.split: +; CHECK-NEXT: br label [[FOR_BODY4:%.*]] +; CHECK: for.cond.cleanup.loopexit.split.split: +; CHECK-NEXT: br label [[FOR_COND_CLEANUP_LOOPEXIT_SPLIT]] +; CHECK: for.cond.cleanup.loopexit.split: +; CHECK-NEXT: br label [[FOR_COND_CLEANUP_LOOPEXIT]] +; CHECK: for.cond.cleanup.loopexit: +; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] +; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void +; CHECK: for.cond.cleanup3.loopexit.split: +; CHECK-NEXT: br label [[FOR_COND_CLEANUP3_LOOPEXIT:%.*]] +; CHECK: for.cond.cleanup3.loopexit: +; CHECK-NEXT: br label [[FOR_COND_CLEANUP3]] +; CHECK: for.cond.cleanup3: +; CHECK-NEXT: [[INC10]] = add nuw i32 [[J_020]], 1 +; CHECK-NEXT: [[EXITCOND22_NOT:%.*]] = icmp eq i32 [[INC10]], [[M]] +; CHECK-NEXT: br i1 [[EXITCOND22_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT_SPLIT_SPLIT:%.*]], label [[FOR_COND1_PREHEADER]], !prof [[PROF34]] +; CHECK: for.body4: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 0, [[FOR_BODY4_PREHEADER_SPLIT]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX6]], align 4 +; 
CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], [[TMP2]] +; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX8]], align 4 +; CHECK-NEXT: br label [[FOR_INC]] +; CHECK: for.inc: +; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP3_LOOPEXIT_SPLIT:%.*]], label [[FOR_BODY4]], !prof [[PROF35]] ; entry: - br label %entry_cold_loop + %cmp19.not = icmp eq i32 %M, 0 + br i1 %cmp19.not, label %for.cond.cleanup, label %for.cond1.preheader.lr.ph, !prof !37 -entry_cold_loop: - br i1 %cold_cond, label %cold_loop_begin, label %cold_loop_exit, !prof !15 +for.cond1.preheader.lr.ph: + %cmp217.not = icmp eq i32 %N, 0 + br label %for.cond1.preheader -cold_loop_begin: - br i1 %cond, label %cold_loop_a, label %cold_loop_b +for.cond1.preheader: + %j.020 = phi i32 [ 0, %for.cond1.preheader.lr.ph ], [ %inc10, %for.cond.cleanup3 ] + br i1 %cmp217.not, label %for.cond.cleanup3, label %for.body4, !prof !38 -cold_loop_a: - %0 = call i32 @a() - br label %cold_loop_latch +for.cond.cleanup: + ret void -cold_loop_b: - %1 = call i32 @b() - br label %cold_loop_latch +for.cond.cleanup3: + %inc10 = add nuw i32 %j.020, 1 + %exitcond22.not = icmp eq i32 %inc10, %M + br i1 %exitcond22.not, label %for.cond.cleanup, label %for.cond1.preheader, !prof !37 -cold_loop_latch: - %v2 = load i1, ptr %ptr - br i1 %v2, label %cold_loop_begin, label %cold_loop_exit +for.body4: + %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %for.cond1.preheader ] + %arrayidx = getelementptr inbounds i32, ptr %B, i64 %indvars.iv + %0 = load i32, ptr %arrayidx, align 4 + %arrayidx6 = getelementptr inbounds i32, ptr %C, i64 %indvars.iv + %1 = load i32, ptr %arrayidx6, align 4 + %add = add nsw i32 %1, %0 
+ %arrayidx8 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv + store i32 %add, ptr %arrayidx8, align 4 + br i1 %cond, label %if.then, label %for.inc -cold_loop_exit: - ret void +if.then: + tail call void @_Z12do_somethingv() + br label %for.inc + +for.inc: + %wide.trip.count = zext i32 %N to i64 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup3, label %for.body4, !prof !38 } -!llvm.module.flags = !{!0} +declare void @_Z12do_somethingv() + +!llvm.module.flags = !{!6} -!0 = !{i32 1, !"ProfileSummary", !1} -!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} -!2 = !{!"ProfileFormat", !"InstrProf"} -!3 = !{!"TotalCount", i64 10000} -!4 = !{!"MaxCount", i64 10} -!5 = !{!"MaxInternalCount", i64 1} -!6 = !{!"MaxFunctionCount", i64 1000} -!7 = !{!"NumCounts", i64 3} -!8 = !{!"NumFunctions", i64 3} -!9 = !{!"DetailedSummary", !10} -!10 = !{!11, !12, !13} -!11 = !{i32 10000, i64 100, i32 1} -!12 = !{i32 999000, i64 100, i32 1} -!13 = !{i32 999999, i64 1, i32 2} -!14 = !{!"function_entry_count", i64 400} -!15 = !{!"branch_weights", i32 0, i32 100} +!6 = !{i32 1, !"ProfileSummary", !7} +!7 = !{!8, !9, !10, !11, !12, !13, !14, !15, !16, !17} +!8 = !{!"ProfileFormat", !"InstrProf"} +!9 = !{!"TotalCount", i64 1002} +!10 = !{!"MaxCount", i64 1000} +!11 = !{!"MaxInternalCount", i64 1000} +!12 = !{!"MaxFunctionCount", i64 1} +!13 = !{!"NumCounts", i64 6} +!14 = !{!"NumFunctions", i64 3} +!15 = !{!"IsPartialProfile", i64 0} +!16 = !{!"PartialProfileRatio", double 0.000000e+00} +!17 = !{!"DetailedSummary", !18} +!18 = !{!19, !31, !34} +!19 = !{i32 10000, i64 1000, i32 1} +!31 = !{i32 999000, i64 1000, i32 1} +!34 = !{i32 999999, i64 1, i32 3} +!36 = !{!"function_entry_count", i64 1} +!37 = !{!"branch_weights", i32 1, i32 1000} +!38 = !{!"branch_weights", i32 1000, i32 0} diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/PGO-nontrivial-unswitch3.ll 
b/llvm/test/Transforms/SimpleLoopUnswitch/PGO-nontrivial-unswitch3.ll new file mode 100644 index 0000000..59b8404 --- /dev/null +++ b/llvm/test/Transforms/SimpleLoopUnswitch/PGO-nontrivial-unswitch3.ll @@ -0,0 +1,184 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 + +; RUN: opt < %s -passes='require<profile-summary>,function(loop-mssa(simple-loop-unswitch<nontrivial>))' -S | FileCheck %s + +;; Check that non-trivial loop unswitching is applied to a non-cold loop in a +;; non-cold loop nest. + +;; IR was generated from the following loop nest, profiled when called +;; with M=1000 and N=10. +;; void hotFunction(bool cond, int M, int N, int * A, int *B, int *C) { +;; for (unsigned j = 0; j < M; j++) +;; for (unsigned i=0; i < N; i++) { +;; A[i] = B[i] + C[i]; +;; if (cond) do_something(); +;; } +;; } + +define void @_Z11hotFunctionbiiPiS_S_(i1 %cond, i32 %M, i32 %N, ptr %A, ptr %B, ptr %C) !prof !36 { +; CHECK-LABEL: define void @_Z11hotFunctionbiiPiS_S_ +; CHECK-SAME: (i1 [[COND:%.*]], i32 [[M:%.*]], i32 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) !prof [[PROF18:![0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP19_NOT:%.*]] = icmp eq i32 [[M]], 0 +; CHECK-NEXT: br i1 [[CMP19_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER_LR_PH:%.*]], !prof [[PROF19:![0-9]+]] +; CHECK: for.cond1.preheader.lr.ph: +; CHECK-NEXT: [[CMP217_NOT:%.*]] = icmp eq i32 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP217_NOT]], label [[FOR_COND1_PREHEADER_LR_PH_SPLIT_US:%.*]], label [[FOR_COND1_PREHEADER_LR_PH_SPLIT:%.*]], !prof [[PROF20:![0-9]+]] +; CHECK: for.cond1.preheader.lr.ph.split.us: +; CHECK-NEXT: br label [[FOR_COND1_PREHEADER_US:%.*]] +; CHECK: for.cond1.preheader.us: +; CHECK-NEXT: [[J_020_US:%.*]] = phi i32 [ 0, [[FOR_COND1_PREHEADER_LR_PH_SPLIT_US]] ], [ [[INC10_US:%.*]], [[FOR_COND_CLEANUP3_US:%.*]] ] +; CHECK-NEXT: br label [[FOR_COND_CLEANUP3_US]] +; CHECK: for.cond.cleanup3.us: +; CHECK-NEXT: [[INC10_US]] = add nuw i32 
[[J_020_US]], 1 +; CHECK-NEXT: [[EXITCOND22_NOT_US:%.*]] = icmp eq i32 [[INC10_US]], [[M]] +; CHECK-NEXT: br i1 [[EXITCOND22_NOT_US]], label [[FOR_COND_CLEANUP_LOOPEXIT_SPLIT_US:%.*]], label [[FOR_COND1_PREHEADER_US]], !prof [[PROF19]] +; CHECK: for.cond.cleanup.loopexit.split.us: +; CHECK-NEXT: br label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] +; CHECK: for.cond1.preheader.lr.ph.split: +; CHECK-NEXT: br i1 [[COND]], label [[FOR_COND1_PREHEADER_LR_PH_SPLIT_SPLIT_US:%.*]], label [[FOR_COND1_PREHEADER_LR_PH_SPLIT_SPLIT:%.*]] +; CHECK: for.cond1.preheader.lr.ph.split.split.us: +; CHECK-NEXT: br label [[FOR_COND1_PREHEADER_US1:%.*]] +; CHECK: for.cond1.preheader.us1: +; CHECK-NEXT: [[J_020_US2:%.*]] = phi i32 [ 0, [[FOR_COND1_PREHEADER_LR_PH_SPLIT_SPLIT_US]] ], [ [[INC10_US4:%.*]], [[FOR_COND_CLEANUP3_US3:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY4_PREHEADER_US:%.*]] +; CHECK: for.cond.cleanup3.us3: +; CHECK-NEXT: [[INC10_US4]] = add nuw i32 [[J_020_US2]], 1 +; CHECK-NEXT: [[EXITCOND22_NOT_US5:%.*]] = icmp eq i32 [[INC10_US4]], [[M]] +; CHECK-NEXT: br i1 [[EXITCOND22_NOT_US5]], label [[FOR_COND_CLEANUP_LOOPEXIT_SPLIT_SPLIT_US:%.*]], label [[FOR_COND1_PREHEADER_US1]], !prof [[PROF19]] +; CHECK: for.body4.preheader.us: +; CHECK-NEXT: br label [[FOR_BODY4_PREHEADER_SPLIT_US_US:%.*]] +; CHECK: for.cond.cleanup3.loopexit.us: +; CHECK-NEXT: br label [[FOR_COND_CLEANUP3_US3]] +; CHECK: for.body4.preheader.split.us.us: +; CHECK-NEXT: br label [[FOR_BODY4_US_US:%.*]] +; CHECK: for.body4.us.us: +; CHECK-NEXT: [[INDVARS_IV_US_US:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_US_US:%.*]], [[FOR_INC_US_US:%.*]] ], [ 0, [[FOR_BODY4_PREHEADER_SPLIT_US_US]] ] +; CHECK-NEXT: [[ARRAYIDX_US_US:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV_US_US]] +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX_US_US]], align 4 +; CHECK-NEXT: [[ARRAYIDX6_US_US:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDVARS_IV_US_US]] +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX6_US_US]], 
align 4 +; CHECK-NEXT: [[ADD_US_US:%.*]] = add nsw i32 [[TMP1]], [[TMP0]] +; CHECK-NEXT: [[ARRAYIDX8_US_US:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV_US_US]] +; CHECK-NEXT: store i32 [[ADD_US_US]], ptr [[ARRAYIDX8_US_US]], align 4 +; CHECK-NEXT: br label [[IF_THEN_US_US:%.*]] +; CHECK: if.then.us.us: +; CHECK-NEXT: tail call void @_Z12do_somethingv() +; CHECK-NEXT: br label [[FOR_INC_US_US]] +; CHECK: for.inc.us.us: +; CHECK-NEXT: [[WIDE_TRIP_COUNT_US_US:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: [[INDVARS_IV_NEXT_US_US]] = add nuw nsw i64 [[INDVARS_IV_US_US]], 1 +; CHECK-NEXT: [[EXITCOND_NOT_US_US:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_US_US]], [[WIDE_TRIP_COUNT_US_US]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT_US_US]], label [[FOR_COND_CLEANUP3_LOOPEXIT_SPLIT_US_US:%.*]], label [[FOR_BODY4_US_US]], !prof [[PROF20]] +; CHECK: for.cond.cleanup3.loopexit.split.us.us: +; CHECK-NEXT: br label [[FOR_COND_CLEANUP3_LOOPEXIT_US:%.*]] +; CHECK: for.cond.cleanup.loopexit.split.split.us: +; CHECK-NEXT: br label [[FOR_COND_CLEANUP_LOOPEXIT_SPLIT:%.*]] +; CHECK: for.cond1.preheader.lr.ph.split.split: +; CHECK-NEXT: br label [[FOR_COND1_PREHEADER:%.*]] +; CHECK: for.cond1.preheader: +; CHECK-NEXT: [[J_020:%.*]] = phi i32 [ 0, [[FOR_COND1_PREHEADER_LR_PH_SPLIT_SPLIT]] ], [ [[INC10:%.*]], [[FOR_COND_CLEANUP3:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY4_PREHEADER:%.*]] +; CHECK: for.body4.preheader: +; CHECK-NEXT: br label [[FOR_BODY4_PREHEADER_SPLIT:%.*]] +; CHECK: for.body4.preheader.split: +; CHECK-NEXT: br label [[FOR_BODY4:%.*]] +; CHECK: for.cond.cleanup.loopexit.split.split: +; CHECK-NEXT: br label [[FOR_COND_CLEANUP_LOOPEXIT_SPLIT]] +; CHECK: for.cond.cleanup.loopexit.split: +; CHECK-NEXT: br label [[FOR_COND_CLEANUP_LOOPEXIT]] +; CHECK: for.cond.cleanup.loopexit: +; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; CHECK: for.cond.cleanup3.loopexit.split: +; CHECK-NEXT: br label 
[[FOR_COND_CLEANUP3_LOOPEXIT:%.*]] +; CHECK: for.cond.cleanup3.loopexit: +; CHECK-NEXT: br label [[FOR_COND_CLEANUP3]] +; CHECK: for.cond.cleanup3: +; CHECK-NEXT: [[INC10]] = add nuw i32 [[J_020]], 1 +; CHECK-NEXT: [[EXITCOND22_NOT:%.*]] = icmp eq i32 [[INC10]], [[M]] +; CHECK-NEXT: br i1 [[EXITCOND22_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT_SPLIT_SPLIT:%.*]], label [[FOR_COND1_PREHEADER]], !prof [[PROF19]] +; CHECK: for.body4: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 0, [[FOR_BODY4_PREHEADER_SPLIT]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX6]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], [[TMP2]] +; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX8]], align 4 +; CHECK-NEXT: br label [[FOR_INC]] +; CHECK: for.inc: +; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP3_LOOPEXIT_SPLIT:%.*]], label [[FOR_BODY4]], !prof [[PROF20]] +; +entry: + %cmp19.not = icmp eq i32 %M, 0 + br i1 %cmp19.not, label %for.cond.cleanup, label %for.cond1.preheader.lr.ph, !prof !37 + +for.cond1.preheader.lr.ph: + %cmp217.not = icmp eq i32 %N, 0 + br label %for.cond1.preheader + +for.cond1.preheader: + %j.020 = phi i32 [ 0, %for.cond1.preheader.lr.ph ], [ %inc10, %for.cond.cleanup3 ] + br i1 %cmp217.not, label %for.cond.cleanup3, label %for.body4, !prof !38 + +for.cond.cleanup: + ret void + +for.cond.cleanup3: + %inc10 = add nuw i32 %j.020, 1 + 
%exitcond22.not = icmp eq i32 %inc10, %M + br i1 %exitcond22.not, label %for.cond.cleanup, label %for.cond1.preheader, !prof !37 + +for.body4: + %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %for.cond1.preheader ] + %arrayidx = getelementptr inbounds i32, ptr %B, i64 %indvars.iv + %0 = load i32, ptr %arrayidx, align 4 + %arrayidx6 = getelementptr inbounds i32, ptr %C, i64 %indvars.iv + %1 = load i32, ptr %arrayidx6, align 4 + %add = add nsw i32 %1, %0 + %arrayidx8 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv + store i32 %add, ptr %arrayidx8, align 4 + br i1 %cond, label %if.then, label %for.inc + +if.then: + tail call void @_Z12do_somethingv() + br label %for.inc + +for.inc: + %wide.trip.count = zext i32 %N to i64 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup3, label %for.body4, !prof !38 +} + +declare void @_Z12do_somethingv() + +!llvm.module.flags = !{!6} + +!6 = !{i32 1, !"ProfileSummary", !7} +!7 = !{!8, !9, !10, !11, !12, !13, !14, !15, !16, !17} +!8 = !{!"ProfileFormat", !"InstrProf"} +!9 = !{!"TotalCount", i64 1002} +!10 = !{!"MaxCount", i64 1000} +!11 = !{!"MaxInternalCount", i64 1000} +!12 = !{!"MaxFunctionCount", i64 1} +!13 = !{!"NumCounts", i64 6} +!14 = !{!"NumFunctions", i64 3} +!15 = !{!"IsPartialProfile", i64 0} +!16 = !{!"PartialProfileRatio", double 0.000000e+00} +!17 = !{!"DetailedSummary", !18} +!18 = !{!19, !29, !30, !32, !34} +!19 = !{i32 10000, i64 10000, i32 3} +!29 = !{i32 950000, i64 10000, i32 3} +!30 = !{i32 990000, i64 1000, i32 4} +!32 = !{i32 999900, i64 1000, i32 4} +!34 = !{i32 999999, i64 1, i32 6} +!36 = !{!"function_entry_count", i64 1} +!37 = !{!"branch_weights", i32 1, i32 1000} +!38 = !{!"branch_weights", i32 1000, i32 10000} -- 2.7.4