Reapply "[LoopUnroll] Use the upper bound of the loop trip count to fullly unroll...
authorHaicheng Wu <haicheng@codeaurora.org>
Wed, 12 Oct 2016 21:29:38 +0000 (21:29 +0000)
committerHaicheng Wu <haicheng@codeaurora.org>
Wed, 12 Oct 2016 21:29:38 +0000 (21:29 +0000)
Reappy r284044 after revert in r284051. Krzysztof fixed the error in r284049.

The original summary:

This patch tries to fully unroll loops having break statement like this

for (int i = 0; i < 8; i++) {
    if (a[i] == value) {
        found = true;
        break;
    }
}

GCC can fully unroll such loops, but currently LLVM cannot because LLVM only
supports loops having exact constant trip counts.

The upper bound of the trip count can be obtained from calling
ScalarEvolution::getMaxBackedgeTakenCount(). Part of the patch is the
refactoring work in SCEV to prevent duplicating code.

The feature of using the upper bound is enabled under the same circumstance
when runtime unrolling is enabled since both are used to unroll loops without
knowing the exact constant trip count.

llvm-svn: 284053

llvm/include/llvm/Analysis/ScalarEvolution.h
llvm/include/llvm/Analysis/TargetTransformInfo.h
llvm/include/llvm/CodeGen/BasicTTIImpl.h
llvm/include/llvm/Transforms/Scalar.h
llvm/include/llvm/Transforms/Scalar/LoopUnrollPass.h
llvm/include/llvm/Transforms/Utils/UnrollLoop.h
llvm/lib/Analysis/ScalarEvolution.cpp
llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
llvm/lib/Transforms/Utils/LoopUnroll.cpp
llvm/test/Transforms/LoopUnroll/AArch64/full-unroll-trip-count-upper-bound.ll [new file with mode: 0644]

index ac3ed68..3f92a32 100644 (file)
@@ -1330,6 +1330,11 @@ public:
   /// prematurely via another branch.
   unsigned getSmallConstantTripCount(Loop *L, BasicBlock *ExitingBlock);
 
+  /// Returns the upper bound of the loop trip count as a normal unsigned
+  /// value.
+  /// Returns 0 if the trip count is unknown or not constant.
+  unsigned getSmallConstantMaxTripCount(Loop *L);
+
   /// Returns the largest constant divisor of the trip count of the
   /// loop if it is a single-exit loop and we can compute a small maximum for
   /// that loop.
index c8224b3..0e96572 100644 (file)
@@ -290,6 +290,8 @@ public:
     /// Apply loop unroll on any kind of loop
     /// (mainly to loops that fail runtime unrolling).
     bool Force;
+    /// Allow using trip count upper bound to unroll loops.
+    bool UpperBound;
   };
 
   /// \brief Get target-customized preferences for the generic loop unrolling
index b1a29d5..53b1c4d 100644 (file)
@@ -284,7 +284,8 @@ public:
     }
 
     // Enable runtime and partial unrolling up to the specified size.
-    UP.Partial = UP.Runtime = true;
+    // Enable using trip count upper bound to unroll loops.
+    UP.Partial = UP.Runtime = UP.UpperBound = true;
     UP.PartialThreshold = MaxOps;
 
     // Avoid unrolling when optimizing for size.
index 511dfed..1a60495 100644 (file)
@@ -169,8 +169,9 @@ Pass *createLoopInstSimplifyPass();
 // LoopUnroll - This pass is a simple loop unrolling pass.
 //
 Pass *createLoopUnrollPass(int Threshold = -1, int Count = -1,
-                           int AllowPartial = -1, int Runtime = -1);
-// Create an unrolling pass for full unrolling only.
+                           int AllowPartial = -1, int Runtime = -1,
+                           int UpperBound = -1);
+// Create an unrolling pass for full unrolling that uses exact trip count only.
 Pass *createSimpleLoopUnrollPass();
 
 //===----------------------------------------------------------------------===//
index b140a93..74a7258 100644 (file)
@@ -21,6 +21,7 @@ struct LoopUnrollPass : public PassInfoMixin<LoopUnrollPass> {
   Optional<unsigned> ProvidedThreshold;
   Optional<bool> ProvidedAllowPartial;
   Optional<bool> ProvidedRuntime;
+  Optional<bool> ProvidedUpperBound;
 
   PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM);
 };
index 4ccb0e0..74dc2b7 100644 (file)
@@ -32,8 +32,8 @@ class ScalarEvolution;
 
 bool UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool Force,
                 bool AllowRuntime, bool AllowExpensiveTripCount,
-                unsigned TripMultiple, LoopInfo *LI, ScalarEvolution *SE,
-                DominatorTree *DT, AssumptionCache *AC,
+                bool UseUpperBound, unsigned TripMultiple, LoopInfo *LI,
+                ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC,
                 OptimizationRemarkEmitter *ORE, bool PreserveLCSSA);
 
 bool UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
index 70e0c29..8389e3c 100644 (file)
@@ -5292,6 +5292,20 @@ const SCEV *ScalarEvolution::createSCEV(Value *V) {
 //                   Iteration Count Computation Code
 //
 
+static unsigned getConstantTripCount(const SCEVConstant *ExitCount) {
+  if (!ExitCount)
+    return 0;
+
+  ConstantInt *ExitConst = ExitCount->getValue();
+
+  // Guard against huge trip counts.
+  if (ExitConst->getValue().getActiveBits() > 32)
+    return 0;
+
+  // In case of integer overflow, this returns 0, which is correct.
+  return ((unsigned)ExitConst->getZExtValue()) + 1;
+}
+
 unsigned ScalarEvolution::getSmallConstantTripCount(Loop *L) {
   if (BasicBlock *ExitingBB = L->getExitingBlock())
     return getSmallConstantTripCount(L, ExitingBB);
@@ -5307,17 +5321,13 @@ unsigned ScalarEvolution::getSmallConstantTripCount(Loop *L,
          "Exiting block must actually branch out of the loop!");
   const SCEVConstant *ExitCount =
       dyn_cast<SCEVConstant>(getExitCount(L, ExitingBlock));
-  if (!ExitCount)
-    return 0;
-
-  ConstantInt *ExitConst = ExitCount->getValue();
-
-  // Guard against huge trip counts.
-  if (ExitConst->getValue().getActiveBits() > 32)
-    return 0;
+  return getConstantTripCount(ExitCount);
+}
 
-  // In case of integer overflow, this returns 0, which is correct.
-  return ((unsigned)ExitConst->getZExtValue()) + 1;
+unsigned ScalarEvolution::getSmallConstantMaxTripCount(Loop *L) {
+  const auto *MaxExitCount =
+      dyn_cast<SCEVConstant>(getMaxBackedgeTakenCount(L));
+  return getConstantTripCount(MaxExitCount);
 }
 
 unsigned ScalarEvolution::getSmallConstantTripMultiple(Loop *L) {
index 5ab61f1..e4b181b 100644 (file)
@@ -92,6 +92,11 @@ static cl::opt<bool>
     UnrollRuntime("unroll-runtime", cl::ZeroOrMore, cl::Hidden,
                   cl::desc("Unroll loops with run-time trip counts"));
 
+static cl::opt<unsigned> UnrollMaxUpperBound(
+    "unroll-max-upperbound", cl::init(8), cl::Hidden,
+    cl::desc(
+        "The max of trip count upper bound that is considered in unrolling"));
+
 static cl::opt<unsigned> PragmaUnrollThreshold(
     "pragma-unroll-threshold", cl::init(16 * 1024), cl::Hidden,
     cl::desc("Unrolled size limit for loops with an unroll(full) or "
@@ -107,7 +112,7 @@ static const unsigned NoThreshold = UINT_MAX;
 static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(
     Loop *L, const TargetTransformInfo &TTI, Optional<unsigned> UserThreshold,
     Optional<unsigned> UserCount, Optional<bool> UserAllowPartial,
-    Optional<bool> UserRuntime) {
+    Optional<bool> UserRuntime, Optional<bool> UserUpperBound) {
   TargetTransformInfo::UnrollingPreferences UP;
 
   // Set up the defaults
@@ -126,6 +131,7 @@ static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(
   UP.AllowRemainder = true;
   UP.AllowExpensiveTripCount = false;
   UP.Force = false;
+  UP.UpperBound = false;
 
   // Override with any target specific settings
   TTI.getUnrollingPreferences(L, UP);
@@ -156,6 +162,8 @@ static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(
     UP.AllowRemainder = UnrollAllowRemainder;
   if (UnrollRuntime.getNumOccurrences() > 0)
     UP.Runtime = UnrollRuntime;
+  if (UnrollMaxUpperBound == 0)
+    UP.UpperBound = false;
 
   // Apply user values provided by argument
   if (UserThreshold.hasValue()) {
@@ -168,6 +176,8 @@ static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(
     UP.Partial = *UserAllowPartial;
   if (UserRuntime.hasValue())
     UP.Runtime = *UserRuntime;
+  if (UserUpperBound.hasValue())
+    UP.UpperBound = *UserUpperBound;
 
   return UP;
 }
@@ -691,13 +701,11 @@ static bool canUnrollCompletely(Loop *L, unsigned Threshold,
 
 // Returns true if unroll count was set explicitly.
 // Calculates unroll count and writes it to UP.Count.
-static bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI,
-                               DominatorTree &DT, LoopInfo *LI,
-                               ScalarEvolution *SE,
-                               OptimizationRemarkEmitter *ORE,
-                               unsigned TripCount, unsigned TripMultiple,
-                               unsigned LoopSize,
-                               TargetTransformInfo::UnrollingPreferences &UP) {
+static bool computeUnrollCount(
+    Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI,
+    ScalarEvolution *SE, OptimizationRemarkEmitter *ORE, unsigned &TripCount,
+    unsigned MaxTripCount, unsigned &TripMultiple, unsigned LoopSize,
+    TargetTransformInfo::UnrollingPreferences &UP, bool &UseUpperBound) {
   // BEInsns represents number of instructions optimized when "back edge"
   // becomes "fall through" in unrolled loop.
   // For now we count a conditional branch on a backedge and a comparison
@@ -749,14 +757,27 @@ static bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI,
   }
 
   // 3rd priority is full unroll count.
-  // Full unroll make sense only when TripCount could be staticaly calculated.
+  // Full unroll makes sense only when TripCount or its upper bound could be
+  // statically calculated.
   // Also we need to check if we exceed FullUnrollMaxCount.
-  if (TripCount && TripCount <= UP.FullUnrollMaxCount) {
+  // If using the upper bound to unroll, TripMultiple should be set to 1 because
+  // we do not know when loop may exit.
+  // MaxTripCount and ExactTripCount cannot both be non zero since we only
+  // compute the former when the latter is zero.
+  unsigned ExactTripCount = TripCount;
+  assert((ExactTripCount == 0 || MaxTripCount == 0) &&
+         "ExtractTripCound and MaxTripCount cannot both be non zero.");
+  unsigned FullUnrollTripCount = ExactTripCount ? ExactTripCount : MaxTripCount;
+  if (FullUnrollTripCount && FullUnrollTripCount <= UP.FullUnrollMaxCount) {
     // When computing the unrolled size, note that BEInsns are not replicated
     // like the rest of the loop body.
-    UnrolledSize = (uint64_t)(LoopSize - BEInsns) * TripCount + BEInsns;
+    UnrolledSize =
+        (uint64_t)(LoopSize - BEInsns) * FullUnrollTripCount + BEInsns;
     if (canUnrollCompletely(L, UP.Threshold, 100, UP.DynamicCostSavingsDiscount,
                             UnrolledSize, UnrolledSize)) {
+      UseUpperBound = (MaxTripCount == FullUnrollTripCount);
+      TripCount = FullUnrollTripCount;
+      TripMultiple = UP.UpperBound ? 1 : TripMultiple;
       UP.Count = TripCount;
       return ExplicitUnroll;
     } else {
@@ -764,12 +785,15 @@ static bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI,
       // helps to remove a significant number of instructions.
       // To check that, run additional analysis on the loop.
       if (Optional<EstimatedUnrollCost> Cost = analyzeLoopUnrollCost(
-              L, TripCount, DT, *SE, TTI,
+              L, FullUnrollTripCount, DT, *SE, TTI,
               UP.Threshold + UP.DynamicCostSavingsDiscount))
         if (canUnrollCompletely(L, UP.Threshold,
                                 UP.PercentDynamicCostSavedThreshold,
                                 UP.DynamicCostSavingsDiscount,
                                 Cost->UnrolledCost, Cost->RolledDynamicCost)) {
+          UseUpperBound = (MaxTripCount == FullUnrollTripCount);
+          TripCount = FullUnrollTripCount;
+          TripMultiple = UP.UpperBound ? 1 : TripMultiple;
           UP.Count = TripCount;
           return ExplicitUnroll;
         }
@@ -909,7 +933,8 @@ static bool tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
                             Optional<unsigned> ProvidedCount,
                             Optional<unsigned> ProvidedThreshold,
                             Optional<bool> ProvidedAllowPartial,
-                            Optional<bool> ProvidedRuntime) {
+                            Optional<bool> ProvidedRuntime,
+                            Optional<bool> ProvidedUpperBound) {
   DEBUG(dbgs() << "Loop Unroll: F[" << L->getHeader()->getParent()->getName()
                << "] Loop %" << L->getHeader()->getName() << "\n");
   if (HasUnrollDisablePragma(L)) {
@@ -939,6 +964,7 @@ static bool tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
 
   // Find trip count and trip multiple if count is not available
   unsigned TripCount = 0;
+  unsigned MaxTripCount = 0;
   unsigned TripMultiple = 1;
   // If there are multiple exiting blocks but one of them is the latch, use the
   // latch for the trip count estimation. Otherwise insist on a single exiting
@@ -953,7 +979,7 @@ static bool tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
 
   TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences(
       L, TTI, ProvidedThreshold, ProvidedCount, ProvidedAllowPartial,
-      ProvidedRuntime);
+      ProvidedRuntime, ProvidedUpperBound);
 
   // Exit early if unrolling is disabled.
   if (UP.Threshold == 0 && (!UP.Partial || UP.PartialThreshold == 0))
@@ -974,8 +1000,23 @@ static bool tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
   if (Convergent)
     UP.AllowRemainder = false;
 
-  bool IsCountSetExplicitly = computeUnrollCount(
-      L, TTI, DT, LI, SE, &ORE, TripCount, TripMultiple, LoopSize, UP);
+  // Try to find the trip count upper bound if it is allowed and we cannot find
+  // exact trip count.
+  if (UP.UpperBound) {
+    if (!TripCount) {
+      MaxTripCount = SE->getSmallConstantMaxTripCount(L);
+      // Only unroll with small upper bound.
+      if (MaxTripCount > UnrollMaxUpperBound)
+        MaxTripCount = 0;
+    }
+  }
+
+  // computeUnrollCount() decides whether it is beneficial to use upper bound to
+  // fully unroll the loop.
+  bool UseUpperBound = false;
+  bool IsCountSetExplicitly =
+      computeUnrollCount(L, TTI, DT, LI, SE, &ORE, TripCount, MaxTripCount,
+                         TripMultiple, LoopSize, UP, UseUpperBound);
   if (!UP.Count)
     return false;
   // Unroll factor (Count) must be less or equal to TripCount.
@@ -984,8 +1025,8 @@ static bool tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
 
   // Unroll the loop.
   if (!UnrollLoop(L, UP.Count, TripCount, UP.Force, UP.Runtime,
-                  UP.AllowExpensiveTripCount, TripMultiple, LI, SE, &DT, &AC,
-                  &ORE, PreserveLCSSA))
+                  UP.AllowExpensiveTripCount, UseUpperBound, TripMultiple, LI,
+                  SE, &DT, &AC, &ORE, PreserveLCSSA))
     return false;
 
   // If loop has an unroll count pragma or unrolled by explicitly set count
@@ -1001,10 +1042,11 @@ public:
   static char ID; // Pass ID, replacement for typeid
   LoopUnroll(Optional<unsigned> Threshold = None,
              Optional<unsigned> Count = None,
-             Optional<bool> AllowPartial = None, Optional<bool> Runtime = None)
+             Optional<bool> AllowPartial = None, Optional<bool> Runtime = None,
+             Optional<bool> UpperBound = None)
       : LoopPass(ID), ProvidedCount(std::move(Count)),
         ProvidedThreshold(Threshold), ProvidedAllowPartial(AllowPartial),
-        ProvidedRuntime(Runtime) {
+        ProvidedRuntime(Runtime), ProvidedUpperBound(UpperBound) {
     initializeLoopUnrollPass(*PassRegistry::getPassRegistry());
   }
 
@@ -1012,6 +1054,7 @@ public:
   Optional<unsigned> ProvidedThreshold;
   Optional<bool> ProvidedAllowPartial;
   Optional<bool> ProvidedRuntime;
+  Optional<bool> ProvidedUpperBound;
 
   bool runOnLoop(Loop *L, LPPassManager &) override {
     if (skipLoop(L))
@@ -1033,7 +1076,8 @@ public:
 
     return tryToUnrollLoop(L, DT, LI, SE, TTI, AC, ORE, PreserveLCSSA,
                            ProvidedCount, ProvidedThreshold,
-                           ProvidedAllowPartial, ProvidedRuntime);
+                           ProvidedAllowPartial, ProvidedRuntime,
+                           ProvidedUpperBound);
   }
 
   /// This transformation requires natural loop information & requires that
@@ -1057,7 +1101,7 @@ INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
 INITIALIZE_PASS_END(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
 
 Pass *llvm::createLoopUnrollPass(int Threshold, int Count, int AllowPartial,
-                                 int Runtime) {
+                                 int Runtime, int UpperBound) {
   // TODO: It would make more sense for this function to take the optionals
   // directly, but that's dangerous since it would silently break out of tree
   // callers.
@@ -1065,11 +1109,12 @@ Pass *llvm::createLoopUnrollPass(int Threshold, int Count, int AllowPartial,
                         Count == -1 ? None : Optional<unsigned>(Count),
                         AllowPartial == -1 ? None
                                            : Optional<bool>(AllowPartial),
-                        Runtime == -1 ? None : Optional<bool>(Runtime));
+                        Runtime == -1 ? None : Optional<bool>(Runtime),
+                        UpperBound == -1 ? None : Optional<bool>(UpperBound));
 }
 
 Pass *llvm::createSimpleLoopUnrollPass() {
-  return llvm::createLoopUnrollPass(-1, -1, 0, 0);
+  return llvm::createLoopUnrollPass(-1, -1, 0, 0, 0);
 }
 
 PreservedAnalyses LoopUnrollPass::run(Loop &L, LoopAnalysisManager &AM) {
@@ -1098,9 +1143,10 @@ PreservedAnalyses LoopUnrollPass::run(Loop &L, LoopAnalysisManager &AM) {
     report_fatal_error("LoopUnrollPass: OptimizationRemarkEmitterAnalysis not "
                        "cached at a higher level");
 
-  bool Changed = tryToUnrollLoop(
-      &L, *DT, LI, SE, *TTI, *AC, *ORE, /*PreserveLCSSA*/ true, ProvidedCount,
-      ProvidedThreshold, ProvidedAllowPartial, ProvidedRuntime);
+  bool Changed =
+      tryToUnrollLoop(&L, *DT, LI, SE, *TTI, *AC, *ORE, /*PreserveLCSSA*/ true,
+                      ProvidedCount, ProvidedThreshold, ProvidedAllowPartial,
+                      ProvidedRuntime, ProvidedUpperBound);
 
   if (!Changed)
     return PreservedAnalyses::all();
index 77b744e..8472247 100644 (file)
@@ -187,6 +187,10 @@ static bool needToInsertPhisForLCSSA(Loop *L, std::vector<BasicBlock *> Blocks,
 /// iterations via an early branch, but control may not exit the loop from the
 /// LatchBlock's terminator prior to TripCount iterations.
 ///
+/// PreserveCondBr indicates whether the conditional branch of the LatchBlock
+/// needs to be preserved.  It is needed when we use trip count upper bound to
+/// fully unroll the loop.
+///
 /// Similarly, TripMultiple divides the number of times that the LatchBlock may
 /// execute without exiting the loop.
 ///
@@ -203,9 +207,10 @@ static bool needToInsertPhisForLCSSA(Loop *L, std::vector<BasicBlock *> Blocks,
 /// DominatorTree if they are non-null.
 bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool Force,
                       bool AllowRuntime, bool AllowExpensiveTripCount,
-                      unsigned TripMultiple, LoopInfo *LI, ScalarEvolution *SE,
-                      DominatorTree *DT, AssumptionCache *AC,
-                      OptimizationRemarkEmitter *ORE, bool PreserveLCSSA) {
+                      bool PreserveCondBr, unsigned TripMultiple, LoopInfo *LI,
+                      ScalarEvolution *SE, DominatorTree *DT,
+                      AssumptionCache *AC, OptimizationRemarkEmitter *ORE,
+                      bool PreserveLCSSA) {
   BasicBlock *Preheader = L->getLoopPreheader();
   if (!Preheader) {
     DEBUG(dbgs() << "  Can't unroll; loop preheader-insertion failed.\n");
@@ -539,12 +544,16 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool Force,
     if (CompletelyUnroll) {
       if (j == 0)
         Dest = LoopExit;
-      NeedConditional = false;
-    }
-
-    // If we know the trip count or a multiple of it, we can safely use an
-    // unconditional branch for some iterations.
-    if (j != BreakoutTrip && (TripMultiple == 0 || j % TripMultiple != 0)) {
+      // If using trip count upper bound to completely unroll, we need to keep
+      // the conditional branch except the last one because the loop may exit
+      // after any iteration.
+      assert(NeedConditional &&
+             "NeedCondition cannot be modified by both complete "
+             "unrolling and runtime unrolling");
+      NeedConditional = (PreserveCondBr && j);
+    } else if (j != BreakoutTrip && (TripMultiple == 0 || j % TripMultiple != 0)) {
+      // If we know the trip count or a multiple of it, we can safely use an
+      // unconditional branch for some iterations.
       NeedConditional = false;
     }
 
diff --git a/llvm/test/Transforms/LoopUnroll/AArch64/full-unroll-trip-count-upper-bound.ll b/llvm/test/Transforms/LoopUnroll/AArch64/full-unroll-trip-count-upper-bound.ll
new file mode 100644 (file)
index 0000000..5c70a26
--- /dev/null
@@ -0,0 +1,43 @@
+; RUN: opt -loop-unroll -S -mtriple aarch64 -mcpu=cortex-a57 %s | FileCheck %s -check-prefix=UNROLL
+; RUN: opt -loop-unroll -unroll-max-upperbound=0 -S -mtriple aarch64 -mcpu=cortex-a57 %s | FileCheck %s -check-prefix=NOUNROLL
+
+; This IR comes from this C code:
+;
+;   for (int i = 0; i < 4; i++) {
+;     if (src[i] == 1) {
+;       *dst = i;
+;       break;
+;     }
+;   }
+;
+; This test is meant to check that this loop is unrolled into four iterations.
+
+; UNROLL-LABEL: @test
+; UNROLL: load i32, i32*
+; UNROLL: load i32, i32*
+; UNROLL: load i32, i32*
+; UNROLL: load i32, i32*
+; UNROLL-NOT: load i32, i32*
+; NOUNROLL-LABEL: @test
+; NOUNROLL: load i32, i32*
+; NOUNROLL-NOT: load i32, i32*
+
+define void @test(i32* %dst, i32* %src) {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %0 = sext i32 %i to i64
+  %1 = getelementptr inbounds i32, i32* %src, i64 %0
+  %2 = load i32, i32* %1
+  %inc = add nsw i32 %i, 1
+  %cmp1 = icmp slt i32 %inc, 4
+  %cmp3 = icmp eq i32 %2, 1 
+  %or.cond = and i1 %cmp3, %cmp1
+  br i1 %or.cond, label %for.body, label %exit
+
+exit:                                          ; preds = %for.body
+  store i32 %i, i32* %dst
+  ret void
+}