From 3c42f1c3c90c5c20f3539048f1ed3f685cc914c6 Mon Sep 17 00:00:00 2001 From: Yaxun Liu Date: Fri, 2 Mar 2018 16:22:32 +0000 Subject: [PATCH] LoopUnroll: respect pragma unroll when AllowRemainder is disabled Currently, when AllowRemainder is disabled, the pragma unroll count is not respected even when there is no remainder. This bug causes a loop to be fully unrolled in many cases even though the user specifies an unroll count. It especially affects OpenCL/CUDA, since in many cases a loop contains convergent instructions and AllowRemainder is currently disabled for such loops. Differential Revision: https://reviews.llvm.org/D43826 llvm-svn: 326585 --- llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp | 2 +- llvm/test/Transforms/LoopUnroll/convergent.ll | 96 +++++++++++++++++++++++ llvm/test/Transforms/LoopUnroll/unroll-pragmas.ll | 63 +++++++++------ 3 files changed, 134 insertions(+), 27 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp index 15e7da5..1e3bd25 100644 --- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -729,7 +729,7 @@ static bool computeUnrollCount( UP.Runtime = true; UP.AllowExpensiveTripCount = true; UP.Force = true; - if (UP.AllowRemainder && + if ((UP.AllowRemainder || (TripMultiple % PragmaCount == 0)) && getUnrolledLoopSize(LoopSize, UP) < PragmaUnrollThreshold) return true; } diff --git a/llvm/test/Transforms/LoopUnroll/convergent.ll b/llvm/test/Transforms/LoopUnroll/convergent.ll index 4109e96..8417c37 100644 --- a/llvm/test/Transforms/LoopUnroll/convergent.ll +++ b/llvm/test/Transforms/LoopUnroll/convergent.ll @@ -80,4 +80,100 @@ exit: ret i32 0 } +; This loop contains a convergent instruction. Since the pragma loop unroll +; count 2 divides trip count 4. The loop unroll should respect the pragma. 
+; CHECK-LABEL: @pragma_unroll_divisible_trip_count +define void @pragma_unroll_divisible_trip_count() { +entry: + br label %l3, !llvm.loop !1 + +l3: + %x.0 = phi i32 [ 0, %entry ], [ %inc, %l3 ] +; CHECK: call void @f() +; CHECK: call void @f() +; CHECK-NOT: call void @f() + call void @f() convergent + %inc = add nsw i32 %x.0, 1 + %exitcond = icmp eq i32 %inc, 4 + br i1 %exitcond, label %exit, label %l3, !llvm.loop !1 + +exit: + ret void +} + +; This loop contains a convergent instruction. Since the pragma loop unroll +; count 2 divides trip multiple 2. The loop unroll should respect the pragma. +; CHECK-LABEL: @pragma_unroll_divisible_trip_multiple +define i32 @pragma_unroll_divisible_trip_multiple(i32 %n) { +entry: + %loop_ctl = mul nsw i32 %n, 2 + br label %l3, !llvm.loop !1 + +l3: + %x.0 = phi i32 [ 0, %entry ], [ %inc, %l3 ] +; CHECK: call void @f() +; CHECK: call void @f() +; CHECK-NOT: call void @f() + call void @f() convergent + %inc = add nsw i32 %x.0, 1 + %exitcond = icmp eq i32 %inc, %loop_ctl + br i1 %exitcond, label %exit, label %l3, !llvm.loop !1 + +exit: + ret i32 0 +} + +; This loop contains a convergent instruction. Since the pragma loop unroll +; count 2 is unknown to divide runtime trip count, the loop is not unrolled +; since remainder is forbidden for unrolling convergent loop. +; ToDo: Forbidding remainder for unrolling convergent loop may be relaxed +; in the future. +; CHECK-LABEL: @pragma_unroll_indivisible_runtime_trip_count +define i32 @pragma_unroll_indivisible_runtime_trip_count(i32 %n) { +entry: + br label %l3, !llvm.loop !1 + +l3: + %x.0 = phi i32 [ 0, %entry ], [ %inc, %l3 ] +; CHECK: call void @f() +; CHECK-NOT: call void @f() + call void @f() convergent + %inc = add nsw i32 %x.0, 1 + %exitcond = icmp eq i32 %inc, %n + br i1 %exitcond, label %exit, label %l3, !llvm.loop !1 + +exit: + ret i32 0 +} + +; This loop contains a convergent instruction. 
Since the pragma loop unroll +; count 2 does not divide trip count 5, the loop is not unrolled by 2 +; since remainder is forbidden for unrolling convergent loop. Instead, the +; loop gets fully unrolled. +; ToDo: Forbidding remainder for unrolling convergent loop may be relaxed +; in the future. +; CHECK-LABEL: @pragma_unroll_indivisible_trip_count +define i32 @pragma_unroll_indivisible_trip_count() { +entry: + br label %l3, !llvm.loop !1 + +l3: + %x.0 = phi i32 [ 0, %entry ], [ %inc, %l3 ] +; CHECK: call void @f() +; CHECK: call void @f() +; CHECK: call void @f() +; CHECK: call void @f() +; CHECK: call void @f() +; CHECK-NOT: call void @f() + call void @f() convergent + %inc = add nsw i32 %x.0, 1 + %exitcond = icmp eq i32 %inc, 5 + br i1 %exitcond, label %exit, label %l3, !llvm.loop !1 + +exit: + ret i32 0 +} + !0 = !{!0, !{!"llvm.loop.unroll.count", i32 16}} +!1 = !{!1, !{!"llvm.loop.unroll.count", i32 2}} + diff --git a/llvm/test/Transforms/LoopUnroll/unroll-pragmas.ll b/llvm/test/Transforms/LoopUnroll/unroll-pragmas.ll index 88f32c9..afc70fb 100644 --- a/llvm/test/Transforms/LoopUnroll/unroll-pragmas.ll +++ b/llvm/test/Transforms/LoopUnroll/unroll-pragmas.ll @@ -1,5 +1,6 @@ -; RUN: opt < %s -loop-unroll -pragma-unroll-threshold=1024 -S | FileCheck %s -; RUN: opt < %s -loop-unroll -loop-unroll -pragma-unroll-threshold=1024 -S | FileCheck %s +; RUN: opt < %s -loop-unroll -pragma-unroll-threshold=1024 -S | FileCheck -check-prefixes=CHECK,REM %s +; RUN: opt < %s -loop-unroll -loop-unroll -pragma-unroll-threshold=1024 -S | FileCheck -check-prefixes=CHECK,REM %s +; RUN: opt < %s -loop-unroll -unroll-allow-remainder=0 -pragma-unroll-threshold=1024 -S | FileCheck -check-prefixes=CHECK,NOREM %s ; ; Run loop unrolling twice to verify that loop unrolling metadata is properly ; removed and further unrolling is disabled after the pass is run once. 
@@ -168,20 +169,24 @@ for.end: ; preds = %for.body, %entry ; #pragma clang loop unroll_count(4) ; Loop has a runtime trip count. Runtime unrolling should occur and loop -; should be duplicated (original and 4x unrolled). +; should be duplicated (original and 4x unrolled) if remainder is allowed, +; otherwise loop should not be unrolled. ; ; CHECK-LABEL: @runtime_loop_with_count4( ; CHECK: for.body ; CHECK: store -; CHECK: store -; CHECK: store -; CHECK: store +; REM: store +; REM: store +; REM: store ; CHECK-NOT: store ; CHECK: br i1 -; CHECK: for.body.epil: -; CHECK: store +; REM: for.body.epil: +; REM: store +; NOREM-NOT: for.body.epil: +; NOREM-NOT: store ; CHECK-NOT: store -; CHECK: br i1 +; REM: br i1 +; NOREM-NOT: br i1 define void @runtime_loop_with_count4(i32* nocapture %a, i32 %b) { entry: %cmp3 = icmp sgt i32 %b, 0 @@ -284,24 +289,27 @@ for.end: ; preds = %for.body ; #pragma clang loop unroll(enable) ; Loop has a runtime trip count and should be runtime unrolled and duplicated -; (original and 8x). +; (original and 8x) if remainder is allowed, otherwise it should not be +; unrolled. ; ; CHECK-LABEL: @runtime_loop_with_enable( ; CHECK: for.body: ; CHECK: store i32 -; CHECK: store i32 -; CHECK: store i32 -; CHECK: store i32 -; CHECK: store i32 -; CHECK: store i32 -; CHECK: store i32 -; CHECK: store i32 +; REM: store i32 +; REM: store i32 +; REM: store i32 +; REM: store i32 +; REM: store i32 +; REM: store i32 +; REM: store i32 ; CHECK-NOT: store i32 ; CHECK: br i1 -; CHECK: for.body.epil: -; CHECK: store +; REM: for.body.epil: +; NOREM-NOT: for.body.epil: +; REM: store ; CHECK-NOT: store -; CHECK: br i1 +; REM: br i1 +; NOREM-NOT: br i1 define void @runtime_loop_with_enable(i32* nocapture %a, i32 %b) { entry: %cmp3 = icmp sgt i32 %b, 0 @@ -325,19 +333,22 @@ for.end: ; preds = %for.body, %entry ; #pragma clang loop unroll_count(3) ; Loop has a runtime trip count. Runtime unrolling should occur and loop -; should be duplicated (original and 3x unrolled). 
+; should be duplicated (original and 3x unrolled) if remainder is allowed, +; otherwise it should not be unrolled. ; ; CHECK-LABEL: @runtime_loop_with_count3( ; CHECK: for.body ; CHECK: store -; CHECK: store -; CHECK: store +; REM: store +; REM: store ; CHECK-NOT: store ; CHECK: br i1 -; CHECK: for.body.epil: -; CHECK: store +; REM: for.body.epil: +; REM: store +; NOREM-NOT: for.body.epil: +; NOREM-NOT: store ; CHECK-NOT: store -; CHECK: br i1 +; REM: br i1 define void @runtime_loop_with_count3(i32* nocapture %a, i32 %b) { entry: %cmp3 = icmp sgt i32 %b, 0 -- 2.7.4