From 3622fbfc6878952eed3545fb60926e704729f0d1 Mon Sep 17 00:00:00 2001
From: Elena Demikhovsky <elena.demikhovsky@intel.com>
Date: Sun, 28 Aug 2016 08:53:53 +0000
Subject: [PATCH] [Loop Vectorizer] Fixed memory confilict checks.

Fixed a bug in run-time checks for possible memory conflicts inside loop.
The bug is in Low <-> High boundaries calculation. The High boundary should be calculated as "last memory access pointer + element size".

Differential revision: https://reviews.llvm.org/D23176

llvm-svn: 279930
---
 llvm/include/llvm/Analysis/LoopAccessAnalysis.h    |  6 ++-
 llvm/lib/Analysis/LoopAccessAnalysis.cpp           | 32 ++++++++++++--
 .../memcheck-off-by-one-error.ll                   | 51 ++++++++++++++++++++++
 .../LoopAccessAnalysis/number-of-memchecks.ll      | 18 ++++----
 .../LoopAccessAnalysis/reverse-memcheck-bounds.ll  |  4 +-
 .../LoopVectorize/runtime-check-readonly.ll        |  8 ++--
 llvm/test/Transforms/LoopVectorize/tbaa-nodep.ll   | 14 +++---
 .../LoopVersioningLICM/loopversioningLICM1.ll      |  6 +--
 8 files changed, 109 insertions(+), 30 deletions(-)
 create mode 100644 llvm/test/Analysis/LoopAccessAnalysis/memcheck-off-by-one-error.ll
diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
index a9674fb..b427478 100644
--- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -335,9 +335,11 @@ public:
   struct PointerInfo {
     /// Holds the pointer value that we need to check.
     TrackingVH<Value> PointerValue;
-    /// Holds the pointer value at the beginning of the loop.
+    /// Holds the smallest byte address accessed by the pointer throughout all
+    /// iterations of the loop.
     const SCEV *Start;
-    /// Holds the pointer value at the end of the loop.
+    /// Holds the largest byte address accessed by the pointer throughout all
+    /// iterations of the loop, plus 1.
     const SCEV *End;
     /// Holds the information if this pointer is used for writing to memory.
     bool IsWritePtr;
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index cb0388d..8c761b4 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -149,6 +149,19 @@ const SCEV *llvm::replaceSymbolicStrideSCEV(PredicatedScalarEvolution &PSE,
   return OrigSCEV;
 }
 
+/// Calculate Start and End points of memory access.
+/// Let's assume A is the first access and B is a memory access on N-th loop
+/// iteration. Then B is calculated as:  
+///   B = A + Step*N . 
+/// Step value may be positive or negative.
+/// N is a calculated back-edge taken count:
+///     N = (TripCount > 0) ? RoundDown(TripCount -1 , VF) : 0
+/// Start and End points are calculated in the following way:
+/// Start = UMIN(A, B) ; End = UMAX(A, B) + SizeOfElt,
+/// where SizeOfElt is the size of single memory access in bytes.
+///
+/// There is no conflict when the intervals are disjoint:
+/// NoConflict = (P2.Start >= P1.End) || (P1.Start >= P2.End)
 void RuntimePointerChecking::insert(Loop *Lp, Value *Ptr, bool WritePtr,
                                     unsigned DepSetId, unsigned ASId,
                                     const ValueToValueMap &Strides,
@@ -177,12 +190,17 @@ void RuntimePointerChecking::insert(Loop *Lp, Value *Ptr, bool WritePtr,
       if (CStep->getValue()->isNegative())
         std::swap(ScStart, ScEnd);
     } else {
-      // Fallback case: the step is not constant, but the we can still
+      // Fallback case: the step is not constant, but we can still
       // get the upper and lower bounds of the interval by using min/max
       // expressions.
       ScStart = SE->getUMinExpr(ScStart, ScEnd);
       ScEnd = SE->getUMaxExpr(AR->getStart(), ScEnd);
     }
+    // Add the size of the pointed element to ScEnd.
+    unsigned EltSize =
+      Ptr->getType()->getPointerElementType()->getScalarSizeInBits() / 8;
+    const SCEV *EltSizeSCEV = SE->getConstant(ScEnd->getType(), EltSize);
+    ScEnd = SE->getAddExpr(ScEnd, EltSizeSCEV);
   }
 
   Pointers.emplace_back(Ptr, ScStart, ScEnd, WritePtr, DepSetId, ASId, Sc);
@@ -1870,9 +1888,17 @@ std::pair<Instruction *, Instruction *> LoopAccessInfo::addRuntimeChecks(
     Value *End0 =   ChkBuilder.CreateBitCast(A.End,   PtrArithTy1, "bc");
     Value *End1 =   ChkBuilder.CreateBitCast(B.End,   PtrArithTy0, "bc");
 
-    Value *Cmp0 = ChkBuilder.CreateICmpULE(Start0, End1, "bound0");
+    // [A|B].Start points to the first accessed byte under base [A|B].
+    // [A|B].End points to the last accessed byte, plus one.
+    // There is no conflict when the intervals are disjoint:
+    // NoConflict = (B.Start >= A.End) || (A.Start >= B.End)
+    //
+    // bound0 = (B.Start < A.End)
+    // bound1 = (A.Start < B.End)
+    //  IsConflict = bound0 & bound1
+    Value *Cmp0 = ChkBuilder.CreateICmpULT(Start0, End1, "bound0");
     FirstInst = getFirstInst(FirstInst, Cmp0, Loc);
-    Value *Cmp1 = ChkBuilder.CreateICmpULE(Start1, End0, "bound1");
+    Value *Cmp1 = ChkBuilder.CreateICmpULT(Start1, End0, "bound1");
     FirstInst = getFirstInst(FirstInst, Cmp1, Loc);
     Value *IsConflict = ChkBuilder.CreateAnd(Cmp0, Cmp1, "found.conflict");
     FirstInst = getFirstInst(FirstInst, IsConflict, Loc);
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/memcheck-off-by-one-error.ll b/llvm/test/Analysis/LoopAccessAnalysis/memcheck-off-by-one-error.ll
new file mode 100644
index 0000000..01813c8
--- /dev/null
+++ b/llvm/test/Analysis/LoopAccessAnalysis/memcheck-off-by-one-error.ll
@@ -0,0 +1,51 @@
+; RUN: opt -analyze --loop-accesses %s | FileCheck %s
+
+; This test verifies run-time boundary check of memory accesses.
+; The original loop:
+;   void fastCopy(const char* src, char* op) {
+;     int len = 32;
+;     while (len > 0) {
+;       *(reinterpret_cast<long long*>(op)) = *(reinterpret_cast<const long long*>(src));
+;       src += 8;
+;       op += 8;
+;       len -= 8;
+;     }
+;   }
+; Boundaries calculations before this patch:
+; (Low: %src High: (24 + %src))
+; and the actual distance between two pointers was 31,  (%op - %src = 31)
+; IsConflict = (24 > 31) = false -> execution is directed to the vectorized loop.
+; The loop was vectorized to 4, 32 byte memory access ( <4 x i64> ),
+; store a value at *%op touched memory under *%src.
+
+;CHECK: Printing analysis 'Loop Access Analysis' for function 'fastCopy'
+;CHECK: (Low: %op High: (32 + %op))
+;CHECK: (Low: %src High: (32 + %src))
+
+define void @fastCopy(i8* nocapture readonly %src, i8* nocapture %op) {
+entry:
+  br label %while.body.preheader
+
+while.body.preheader:                             ; preds = %entry
+  br label %while.body
+
+while.body:                                       ; preds = %while.body.preheader, %while.body
+  %len.addr.07 = phi i32 [ %sub, %while.body ], [ 32, %while.body.preheader ]
+  %op.addr.06 = phi i8* [ %add.ptr1, %while.body ], [ %op, %while.body.preheader ]
+  %src.addr.05 = phi i8* [ %add.ptr, %while.body ], [ %src, %while.body.preheader ]
+  %0 = bitcast i8* %src.addr.05 to i64*
+  %1 = load i64, i64* %0, align 8
+  %2 = bitcast i8* %op.addr.06 to i64*
+  store i64 %1, i64* %2, align 8
+  %add.ptr = getelementptr inbounds i8, i8* %src.addr.05, i64 8
+  %add.ptr1 = getelementptr inbounds i8, i8* %op.addr.06, i64 8
+  %sub = add nsw i32 %len.addr.07, -8
+  %cmp = icmp sgt i32 %len.addr.07, 8
+  br i1 %cmp, label %while.body, label %while.end.loopexit
+
+while.end.loopexit:                               ; preds = %while.body
+  br label %while.end
+
+while.end:                                        ; preds = %while.end.loopexit, %entry
+  ret void
+}
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/number-of-memchecks.ll b/llvm/test/Analysis/LoopAccessAnalysis/number-of-memchecks.ll
index a9626f4d..2bae486 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/number-of-memchecks.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/number-of-memchecks.ll
@@ -96,15 +96,15 @@ for.end:                                          ; preds = %for.body
 ; CHECK-NEXT:       %arrayidxB = getelementptr inbounds i16, i16* %b, i64 %ind
 ; CHECK-NEXT:   Grouped accesses:
 ; CHECK-NEXT:    Group {{.*}}[[ZERO]]:
-; CHECK-NEXT:       (Low: %c High: (78 + %c))
+; CHECK-NEXT:       (Low: %c High: (80 + %c))
 ; CHECK-NEXT:         Member: {(2 + %c)<nsw>,+,4}
 ; CHECK-NEXT:         Member: {%c,+,4}
 ; CHECK-NEXT:     Group {{.*}}[[ONE]]:
-; CHECK-NEXT:       (Low: %a High: (40 + %a))
+; CHECK-NEXT:       (Low: %a High: (42 + %a))
 ; CHECK-NEXT:         Member: {(2 + %a)<nsw>,+,2}
 ; CHECK-NEXT:         Member: {%a,+,2}
 ; CHECK-NEXT:     Group {{.*}}[[TWO]]:
-; CHECK-NEXT:       (Low: %b High: (38 + %b))
+; CHECK-NEXT:       (Low: %b High: (40 + %b))
 ; CHECK-NEXT:         Member: {%b,+,2}
 
 define void @testg(i16* %a,
@@ -168,15 +168,15 @@ for.end:                                          ; preds = %for.body
 ; CHECK-NEXT:         %arrayidxB = getelementptr i16, i16* %b, i64 %ind
 ; CHECK-NEXT:   Grouped accesses:
 ; CHECK-NEXT:     Group {{.*}}[[ZERO]]:
-; CHECK-NEXT:       (Low: %c High: (78 + %c))
+; CHECK-NEXT:       (Low: %c High: (80 + %c))
 ; CHECK-NEXT:         Member: {(2 + %c)<nsw>,+,4}
 ; CHECK-NEXT:         Member: {%c,+,4}
 ; CHECK-NEXT:     Group {{.*}}[[ONE]]:
-; CHECK-NEXT:       (Low: %a High: (40 + %a))
+; CHECK-NEXT:       (Low: %a High: (42 + %a))
 ; CHECK-NEXT:         Member: {(2 + %a),+,2}
 ; CHECK-NEXT:         Member: {%a,+,2}
 ; CHECK-NEXT:     Group {{.*}}[[TWO]]:
-; CHECK-NEXT:       (Low: %b High: (38 + %b))
+; CHECK-NEXT:       (Low: %b High: (40 + %b))
 ; CHECK-NEXT:         Member: {%b,+,2}
 
 define void @testh(i16* %a,
@@ -247,13 +247,13 @@ for.end:                                          ; preds = %for.body
 ; CHECK-NEXT:       %arrayidxA2 = getelementptr i16, i16* %a, i64 %ind2
 ; CHECK-NEXT:   Grouped accesses:
 ; CHECK-NEXT:     Group {{.*}}[[ZERO]]:
-; CHECK-NEXT:       (Low: ((2 * %offset) + %a)<nsw> High: (9998 + (2 * %offset) + %a))
+; CHECK-NEXT:       (Low: ((2 * %offset) + %a)<nsw> High: (10000 + (2 * %offset) + %a))
 ; CHECK-NEXT:         Member: {((2 * %offset) + %a)<nsw>,+,2}<nsw><%for.body>
 ; CHECK-NEXT:     Group {{.*}}[[ONE]]:
-; CHECK-NEXT:       (Low: %a High: (9998 + %a))
+; CHECK-NEXT:       (Low: %a High: (10000 + %a))
 ; CHECK-NEXT:         Member: {%a,+,2}<%for.body>
 ; CHECK-NEXT:     Group {{.*}}[[TWO]]:
-; CHECK-NEXT:       (Low: (20000 + %a) High: (29998 + %a))
+; CHECK-NEXT:       (Low: (20000 + %a) High: (30000 + %a))
 ; CHECK-NEXT:         Member: {(20000 + %a),+,2}<%for.body>
 
 define void @testi(i16* %a,
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/reverse-memcheck-bounds.ll b/llvm/test/Analysis/LoopAccessAnalysis/reverse-memcheck-bounds.ll
index 607e007..405a475 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/reverse-memcheck-bounds.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/reverse-memcheck-bounds.ll
@@ -16,7 +16,7 @@ target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64--linux-gnueabi"
 
 ; CHECK: function 'f':
-; CHECK: (Low: (20000 + %a) High: (60000 + %a)<nsw>)
+; CHECK: (Low: (20000 + %a) High: (60004 + %a))
 
 @B = common global i32* null, align 8
 @A = common global i32* null, align 8
@@ -59,7 +59,7 @@ for.end:                                          ; preds = %for.body
 ; Here it is not obvious what the limits are, since 'step' could be negative.
 
 ; CHECK: Low: (-1 + (-1 * ((-60001 + (-1 * %a)) umax (-60001 + (40000 * %step) + (-1 * %a)))))
-; CHECK: High: ((60000 + %a)<nsw> umax (60000 + (-40000 * %step) + %a))
+; CHECK: High: (4 + ((60000 + %a)<nsw> umax (60000 + (-40000 * %step) + %a)))
 
 define void @g(i64 %step) {
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/runtime-check-readonly.ll b/llvm/test/Transforms/LoopVectorize/runtime-check-readonly.ll
index a3b5a59..e91d5b8 100644
--- a/llvm/test/Transforms/LoopVectorize/runtime-check-readonly.ll
+++ b/llvm/test/Transforms/LoopVectorize/runtime-check-readonly.ll
@@ -8,10 +8,10 @@ target triple = "x86_64-apple-macosx10.8.0"
 ;CHECK: br
 ;CHECK: getelementptr
 ;CHECK-DAG: getelementptr
-;CHECK-DAG: icmp uge
-;CHECK-DAG: icmp uge
-;CHECK-DAG: icmp uge
-;CHECK-DAG: icmp uge
+;CHECK-DAG: icmp ugt
+;CHECK-DAG: icmp ugt
+;CHECK-DAG: icmp ugt
+;CHECK-DAG: icmp ugt
 ;CHECK-DAG: and
 ;CHECK-DAG: and
 ;CHECK: br
diff --git a/llvm/test/Transforms/LoopVectorize/tbaa-nodep.ll b/llvm/test/Transforms/LoopVectorize/tbaa-nodep.ll
index 06d0002..3e79d47 100644
--- a/llvm/test/Transforms/LoopVectorize/tbaa-nodep.ll
+++ b/llvm/test/Transforms/LoopVectorize/tbaa-nodep.ll
@@ -36,7 +36,7 @@ for.end:                                          ; preds = %for.body
 ; CHECK: ret i32 0
 
 ; CHECK-NOTBAA-LABEL: @test1
-; CHECK-NOTBAA: icmp uge i32*
+; CHECK-NOTBAA: icmp ugt i32*
 
 ; CHECK-NOTBAA: load <4 x float>, <4 x float>* %{{.*}}, align 4, !tbaa
 ; CHECK-NOTBAA: store <4 x i32> %{{.*}}, <4 x i32>* %{{.*}}, align 4, !tbaa
@@ -70,8 +70,8 @@ for.end:                                          ; preds = %for.body
 ; required. Without TBAA, however, two checks are required.
 
 ; CHECK-LABEL: @test2
-; CHECK: icmp uge float*
-; CHECK: icmp uge float*
+; CHECK: icmp ugt float*
+; CHECK: icmp ugt float*
 ; CHECK-NOT: icmp uge i32*
 
 ; CHECK: load <4 x float>, <4 x float>* %{{.*}}, align 4, !tbaa
@@ -80,10 +80,10 @@ for.end:                                          ; preds = %for.body
 ; CHECK: ret i32 0
 
 ; CHECK-NOTBAA-LABEL: @test2
-; CHECK-NOTBAA: icmp uge float*
-; CHECK-NOTBAA: icmp uge float*
-; CHECK-NOTBAA-DAG: icmp uge float*
-; CHECK-NOTBAA-DAG: icmp uge i32*
+; CHECK-NOTBAA: icmp ugt float*
+; CHECK-NOTBAA: icmp ugt float*
+; CHECK-NOTBAA-DAG: icmp ugt float*
+; CHECK-NOTBAA-DAG: icmp ugt i32*
 
 ; CHECK-NOTBAA: load <4 x float>, <4 x float>* %{{.*}}, align 4, !tbaa
 ; CHECK-NOTBAA: store <4 x float> %{{.*}}, <4 x float>* %{{.*}}, align 4, !tbaa
diff --git a/llvm/test/Transforms/LoopVersioningLICM/loopversioningLICM1.ll b/llvm/test/Transforms/LoopVersioningLICM/loopversioningLICM1.ll
index 1c65eed..ff6c250 100644
--- a/llvm/test/Transforms/LoopVersioningLICM/loopversioningLICM1.ll
+++ b/llvm/test/Transforms/LoopVersioningLICM/loopversioningLICM1.ll
@@ -8,15 +8,15 @@
 ; CHECK-NEXT:   Loop Versioning found to be beneficial
 ;
 ; CHECK: for.body3:
-; CHECK-NEXT: %add86 = phi i32 [ %arrayidx7.promoted, %for.body3.ph ], [ %add8, %for.body3 ]
+; CHECK-NEXT: %[[induction:.*]] = phi i32 [ %arrayidx7.promoted, %for.body3.ph ], [ %add8, %for.body3 ]
 ; CHECK-NEXT: %j.113 = phi i32 [ %j.016, %for.body3.ph ], [ %inc, %for.body3 ]
 ; CHECK-NEXT: %idxprom = zext i32 %j.113 to i64
 ; CHECK-NEXT: %arrayidx = getelementptr inbounds i32, i32* %var1, i64 %idxprom
 ; CHECK-NEXT: store i32 %add, i32* %arrayidx, align 4, !alias.scope !2, !noalias !2
-; CHECK-NEXT: %add8 = add nsw i32 %add86, %add
+; CHECK-NEXT: %add8 = add nsw i32 %[[induction]], %add
 ; CHECK-NEXT: %inc = add nuw i32 %j.113, 1
 ; CHECK-NEXT: %cmp2 = icmp ult i32 %inc, %itr
-; CHECK-NEXT: br i1 %cmp2, label %for.body3, label %for.inc11.loopexit.loopexit5, !llvm.loop !5
+; CHECK-NEXT: br i1 %cmp2, label %for.body3, label %for.inc11.loopexit.loopexit6, !llvm.loop !5
 define i32 @foo(i32* nocapture %var1, i32* nocapture readnone %var2, i32* nocapture %var3, i32 %itr) #0 {
 entry:
   %cmp14 = icmp eq i32 %itr, 0
-- 
2.7.4