From 8bbabee21aaf9db23734a55b6b7a751f297ea9c2 Mon Sep 17 00:00:00 2001
From: David L Kreitzer
Date: Fri, 16 Sep 2016 14:38:13 +0000
Subject: [PATCH] Reapplying r278731 after fixing the problem that caused it
 to be reverted. Enhance SCEV to compute the trip count for some loops with
 unknown stride.

Patch by Pankaj Chawla

Differential Revision: https://reviews.llvm.org/D22377

llvm-svn: 281732
---
 llvm/include/llvm/Analysis/ScalarEvolution.h     |   7 ++
 llvm/lib/Analysis/ScalarEvolution.cpp            | 115 ++++++++++++++++++---
 .../ScalarEvolution/trip-count-unknown-stride.ll |  62 +++++++++++
 3 files changed, 169 insertions(+), 15 deletions(-)
 create mode 100644 llvm/test/Analysis/ScalarEvolution/trip-count-unknown-stride.ll

diff --git a/llvm/include/llvm/Analysis/ScalarEvolution.h b/llvm/include/llvm/Analysis/ScalarEvolution.h
index 6321165..a97c6b7 100644
--- a/llvm/include/llvm/Analysis/ScalarEvolution.h
+++ b/llvm/include/llvm/Analysis/ScalarEvolution.h
@@ -803,6 +803,13 @@ namespace llvm {
     /// Cache for \c loopHasNoAbnormalExits.
     DenseMap<const Loop *, bool> LoopHasNoAbnormalExits;
 
+    /// Cache for \c loopHasNoSideEffects.
+    DenseMap<const Loop *, bool> LoopHasNoSideEffects;
+
+    /// Returns true if \p L contains no instruction that can have side effects
+    /// (i.e. via throwing an exception, volatile or atomic access).
+    bool loopHasNoSideEffects(const Loop *L);
+
     /// Returns true if \p L contains no instruction that can abnormally exit
     /// the loop (i.e. via throwing an exception, by terminating the thread
     /// cleanly or by infinite looping in a called function). Strictly
diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index 3a922229..e97e668 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -4953,6 +4953,28 @@ bool ScalarEvolution::isAddRecNeverPoison(const Instruction *I, const Loop *L) {
   return LatchControlDependentOnPoison && loopHasNoAbnormalExits(L);
 }
 
+bool ScalarEvolution::loopHasNoSideEffects(const Loop *L) {
+  auto Itr = LoopHasNoSideEffects.find(L);
+  if (Itr == LoopHasNoSideEffects.end()) {
+    auto NoSideEffectsInBB = [&](BasicBlock *BB) {
+      return all_of(*BB, [](Instruction &I) {
+        // Non-atomic, non-volatile stores are ok.
+        if (auto *SI = dyn_cast<StoreInst>(&I))
+          return SI->isSimple();
+
+        return !I.mayHaveSideEffects();
+      });
+    };
+
+    auto InsertPair = LoopHasNoSideEffects.insert(
+        {L, all_of(L->getBlocks(), NoSideEffectsInBB)});
+    assert(InsertPair.second && "We just checked!");
+    Itr = InsertPair.first;
+  }
+
+  return Itr->second;
+}
+
 bool ScalarEvolution::loopHasNoAbnormalExits(const Loop *L) {
   auto Itr = LoopHasNoAbnormalExits.find(L);
   if (Itr == LoopHasNoAbnormalExits.end()) {
@@ -5540,6 +5562,7 @@ void ScalarEvolution::forgetLoop(const Loop *L) {
     forgetLoop(I);
 
   LoopHasNoAbnormalExits.erase(L);
+  LoopHasNoSideEffects.erase(L);
 }
 
 void ScalarEvolution::forgetValue(Value *V) {
@@ -8614,6 +8637,8 @@ bool ScalarEvolution::isImpliedCondOperandsViaRanges(ICmpInst::Predicate Pred,
 
 bool ScalarEvolution::doesIVOverflowOnLT(const SCEV *RHS, const SCEV *Stride,
                                          bool IsSigned, bool NoWrap) {
+  assert(isKnownPositive(Stride) && "Positive stride expected!");
+
   if (NoWrap) return false;
 
   unsigned BitWidth = getTypeSizeInBits(RHS->getType());
@@ -8682,11 +8707,15 @@ ScalarEvolution::howManyLessThans(const SCEV *LHS, const SCEV *RHS,
     return getCouldNotCompute();
 
   const SCEVAddRecExpr *IV = dyn_cast<SCEVAddRecExpr>(LHS);
-  if (!IV && AllowPredicates)
+  bool PredicatedIV = false;
+
+  if (!IV && AllowPredicates) {
     // Try to make this an AddRec using runtime tests, in the first X
     // iterations of this loop, where X is the SCEV expression found by the
     // algorithm below.
     IV = convertSCEVToAddRecWithPredicates(LHS, L, P);
+    PredicatedIV = true;
+  }
 
   // Avoid weird loops
   if (!IV || IV->getLoop() != L || !IV->isAffine())
@@ -8697,15 +8726,62 @@ ScalarEvolution::howManyLessThans(const SCEV *LHS, const SCEV *RHS,
 
   const SCEV *Stride = IV->getStepRecurrence(*this);
 
-  // Avoid negative or zero stride values
-  if (!isKnownPositive(Stride))
-    return getCouldNotCompute();
+  bool PositiveStride = isKnownPositive(Stride);
 
-  // Avoid proven overflow cases: this will ensure that the backedge taken count
-  // will not generate any unsigned overflow. Relaxed no-overflow conditions
-  // exploit NoWrapFlags, allowing to optimize in presence of undefined
-  // behaviors like the case of C language.
-  if (!Stride->isOne() && doesIVOverflowOnLT(RHS, Stride, IsSigned, NoWrap))
+  // Avoid negative or zero stride values.
+  if (!PositiveStride) {
+    // We can compute the correct backedge taken count for loops with unknown
+    // strides if we can prove that the loop is not an infinite loop with side
+    // effects. Here's the loop structure we are trying to handle -
+    //
+    // i = start
+    // do {
+    //   A[i] = i;
+    //   i += s;
+    // } while (i < end);
+    //
+    // The backedge taken count for such loops is evaluated as -
+    // (max(end, start + stride) - start - 1) /u stride
+    //
+    // The additional preconditions that we need to check to prove the
+    // correctness of the above formula are as follows -
+    //
+    // a) IV is either nuw or nsw depending upon signedness (indicated by the
+    //    NoWrap flag).
+    // b) the loop is single-exit with no side effects.
+    //
+    //
+    // Precondition a) implies that if the stride is negative, this is a
+    // single-trip loop. The backedge taken count formula reduces to zero in
+    // this case.
+    //
+    // Precondition b) implies that the unknown stride cannot be zero;
+    // otherwise we have UB.
+    //
+    // The positive stride case is the same as isKnownPositive(Stride) returning
+    // true (original behavior of the function).
+    //
+    // We want to make sure that the stride is truly unknown as there are edge
+    // cases where ScalarEvolution propagates no wrap flags to the
+    // post-increment/decrement IV even though the increment/decrement operation
+    // itself is wrapping. The computed backedge taken count may be wrong in
+    // such cases. This is prevented by checking that the stride is not known to
+    // be either positive or non-positive. For example, no wrap flags are
+    // propagated to the post-increment IV of this loop with a trip count of 2 -
+    //
+    // unsigned char i;
+    // for (i = 127; i < 128; i += 129)
+    //   A[i] = i;
+    //
+    if (PredicatedIV || !NoWrap || isKnownNonPositive(Stride) ||
+        !loopHasNoSideEffects(L))
+      return getCouldNotCompute();
+
+  } else if (!Stride->isOne() &&
+             doesIVOverflowOnLT(RHS, Stride, IsSigned, NoWrap))
+    // Avoid proven overflow cases: this will ensure that the backedge taken
+    // count will not generate any unsigned overflow. Relaxed no-overflow
+    // conditions exploit NoWrapFlags, allowing optimization in the presence
+    // of undefined behavior, as in the C language.
     return getCouldNotCompute();
 
   ICmpInst::Predicate Cond = IsSigned ? ICmpInst::ICMP_SLT
@@ -8720,12 +8796,21 @@ ScalarEvolution::howManyLessThans(const SCEV *LHS, const SCEV *RHS,
   APInt MinStart = IsSigned ? getSignedRange(Start).getSignedMin()
                             : getUnsignedRange(Start).getUnsignedMin();
 
-  APInt MinStride = IsSigned ? getSignedRange(Stride).getSignedMin()
-                             : getUnsignedRange(Stride).getUnsignedMin();
-
   unsigned BitWidth = getTypeSizeInBits(LHS->getType());
-  APInt Limit = IsSigned ? APInt::getSignedMaxValue(BitWidth) - (MinStride - 1)
-                         : APInt::getMaxValue(BitWidth) - (MinStride - 1);
+
+  APInt StrideForMaxBECount;
+
+  if (PositiveStride)
+    StrideForMaxBECount = IsSigned ? getSignedRange(Stride).getSignedMin()
+                                   : getUnsignedRange(Stride).getUnsignedMin();
+  else
+    // Using a stride of 1 is safe when computing max backedge taken count for
+    // a loop with unknown stride.
+    StrideForMaxBECount = APInt(BitWidth, 1, IsSigned);
+
+  APInt Limit =
+      IsSigned ? APInt::getSignedMaxValue(BitWidth) - (StrideForMaxBECount - 1)
+               : APInt::getMaxValue(BitWidth) - (StrideForMaxBECount - 1);
 
   // Although End can be a MAX expression we estimate MaxEnd considering only
   // the case End = RHS. This is safe because in the other case (End - Start)
@@ -8739,7 +8824,7 @@ ScalarEvolution::howManyLessThans(const SCEV *LHS, const SCEV *RHS,
     MaxBECount = BECount;
   else
     MaxBECount = computeBECount(getConstant(MaxEnd - MinStart),
-                                getConstant(MinStride), false);
+                                getConstant(StrideForMaxBECount), false);
 
   if (isa<SCEVCouldNotCompute>(MaxBECount))
     MaxBECount = BECount;
diff --git a/llvm/test/Analysis/ScalarEvolution/trip-count-unknown-stride.ll b/llvm/test/Analysis/ScalarEvolution/trip-count-unknown-stride.ll
new file mode 100644
index 0000000..60370d6
--- /dev/null
+++ b/llvm/test/Analysis/ScalarEvolution/trip-count-unknown-stride.ll
@@ -0,0 +1,62 @@
+; RUN: opt < %s -analyze -scalar-evolution | FileCheck %s
+
+; ScalarEvolution should be able to compute the trip count of the loop by
+; proving that this is not an infinite loop with side effects.
+
+; CHECK: Determining loop execution counts for: @foo1
+; CHECK: backedge-taken count is ((-1 + %n) /u %s)
+
+; We should have a conservative estimate of the max backedge-taken count for
+; loops with unknown stride.
+; CHECK: max backedge-taken count is -1
+
+target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
+
+; Function Attrs: norecurse nounwind
+define void @foo1(i32* nocapture %A, i32 %n, i32 %s) #0 {
+entry:
+  %cmp4 = icmp sgt i32 %n, 0
+  br i1 %cmp4, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.05 = phi i32 [ %add, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.05
+  %0 = load i32, i32* %arrayidx, align 4
+  %inc = add nsw i32 %0, 1
+  store i32 %inc, i32* %arrayidx, align 4
+  %add = add nsw i32 %i.05, %s
+  %cmp = icmp slt i32 %add, %n
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+
+; Check that we are able to compute the trip count of a loop without an entry guard.
+; CHECK: Determining loop execution counts for: @foo2
+; CHECK: backedge-taken count is ((-1 + (%n smax %s)) /u %s)
+
+; We should have a conservative estimate of the max backedge-taken count for
+; loops with unknown stride.
+; CHECK: max backedge-taken count is -1
+
+; Function Attrs: norecurse nounwind
+define void @foo2(i32* nocapture %A, i32 %n, i32 %s) #0 {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.05 = phi i32 [ %add, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.05
+  %0 = load i32, i32* %arrayidx, align 4
+  %inc = add nsw i32 %0, 1
+  store i32 %inc, i32* %arrayidx, align 4
+  %add = add nsw i32 %i.05, %s
+  %cmp = icmp slt i32 %add, %n
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
-- 
2.7.4
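
As a sanity check on the formula documented in howManyLessThans above, here is
a minimal standalone sketch, not part of the patch, that compares
(max(end, start + stride) - start - 1) /u stride against a direct simulation
of the do/while loop shape the patch targets. The function names
(formulaBTC, simulatedBTC) and the input triples are hypothetical; the sketch
uses unsigned 32-bit arithmetic throughout, whereas the real code selects
signed or unsigned ranges based on IsSigned, and it assumes the patch's
preconditions hold (the IV does not wrap, and the stride is nonzero).

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdio>

// The formula from the patch comment: "/u" in SCEV notation is unsigned
// division, and "max" here is the unsigned max. Precondition b) rules out a
// zero stride, which this sketch simply asserts.
uint32_t formulaBTC(uint32_t start, uint32_t stride, uint32_t end) {
  assert(stride != 0 && "precondition b) rules out a zero stride");
  uint32_t umax = std::max(end, start + stride);
  return (umax - start - 1) / stride;
}

// Directly run "i = start; do { ... i += stride; } while (i < end);" and
// count how many times the latch branches back to the loop header.
uint32_t simulatedBTC(uint32_t start, uint32_t stride, uint32_t end) {
  uint32_t btc = 0;
  uint32_t i = start;
  i += stride;      // first trip through the body
  while (i < end) { // each true latch test is one taken backedge
    ++btc;
    i += stride;
  }
  return btc;
}

int main() {
  // Hypothetical (start, stride, end) triples chosen so the IV never wraps,
  // matching precondition a) of the patch comment.
  struct Case { uint32_t start, stride, end; } cases[] = {
      {0, 3, 10}, // body runs for i = 0, 3, 6, 9 -> 3 backedges taken
      {0, 1, 1},  // single-trip loop -> 0 backedges taken
      {5, 7, 6},  // stride overshoots end on the first trip -> 0 backedges
  };
  for (const Case &c : cases) {
    uint32_t f = formulaBTC(c.start, c.stride, c.end);
    uint32_t s = simulatedBTC(c.start, c.stride, c.end);
    printf("start=%u stride=%u end=%u formula=%u simulated=%u\n",
           c.start, c.stride, c.end, f, s);
    assert(f == s && "formula disagrees with the simulated loop");
  }
  return 0;
}

The last two cases exercise the boundary where the max(end, start + stride)
term clamps the count to zero: the body still runs exactly once, but the
backedge is never taken, which is the single-trip behavior the patch comment
describes for strides that turn out to be negative at run time.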