From 3622fbfc6878952eed3545fb60926e704729f0d1 Mon Sep 17 00:00:00 2001 From: Elena Demikhovsky Date: Sun, 28 Aug 2016 08:53:53 +0000 Subject: [PATCH] [Loop Vectorizer] Fixed memory confilict checks. Fixed a bug in run-time checks for possible memory conflicts inside loop. The bug is in Low <-> High boundaries calculation. The High boundary should be calculated as "last memory access pointer + element size". Differential revision: https://reviews.llvm.org/D23176 llvm-svn: 279930 --- llvm/include/llvm/Analysis/LoopAccessAnalysis.h | 6 ++- llvm/lib/Analysis/LoopAccessAnalysis.cpp | 32 ++++++++++++-- .../memcheck-off-by-one-error.ll | 51 ++++++++++++++++++++++ .../LoopAccessAnalysis/number-of-memchecks.ll | 18 ++++---- .../LoopAccessAnalysis/reverse-memcheck-bounds.ll | 4 +- .../LoopVectorize/runtime-check-readonly.ll | 8 ++-- llvm/test/Transforms/LoopVectorize/tbaa-nodep.ll | 14 +++--- .../LoopVersioningLICM/loopversioningLICM1.ll | 6 +-- 8 files changed, 109 insertions(+), 30 deletions(-) create mode 100644 llvm/test/Analysis/LoopAccessAnalysis/memcheck-off-by-one-error.ll diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h index a9674fb..b427478 100644 --- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h +++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h @@ -335,9 +335,11 @@ public: struct PointerInfo { /// Holds the pointer value that we need to check. TrackingVH PointerValue; - /// Holds the pointer value at the beginning of the loop. + /// Holds the smallest byte address accessed by the pointer throughout all + /// iterations of the loop. const SCEV *Start; - /// Holds the pointer value at the end of the loop. + /// Holds the largest byte address accessed by the pointer throughout all + /// iterations of the loop, plus 1. const SCEV *End; /// Holds the information if this pointer is used for writing to memory. bool IsWritePtr; diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp index cb0388d..8c761b4 100644 --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -149,6 +149,19 @@ const SCEV *llvm::replaceSymbolicStrideSCEV(PredicatedScalarEvolution &PSE, return OrigSCEV; } +/// Calculate Start and End points of memory access. +/// Let's assume A is the first access and B is a memory access on N-th loop +/// iteration. Then B is calculated as: +/// B = A + Step*N . +/// Step value may be positive or negative. +/// N is a calculated back-edge taken count: +/// N = (TripCount > 0) ? RoundDown(TripCount -1 , VF) : 0 +/// Start and End points are calculated in the following way: +/// Start = UMIN(A, B) ; End = UMAX(A, B) + SizeOfElt, +/// where SizeOfElt is the size of single memory access in bytes. +/// +/// There is no conflict when the intervals are disjoint: +/// NoConflict = (P2.Start >= P1.End) || (P1.Start >= P2.End) void RuntimePointerChecking::insert(Loop *Lp, Value *Ptr, bool WritePtr, unsigned DepSetId, unsigned ASId, const ValueToValueMap &Strides, @@ -177,12 +190,17 @@ void RuntimePointerChecking::insert(Loop *Lp, Value *Ptr, bool WritePtr, if (CStep->getValue()->isNegative()) std::swap(ScStart, ScEnd); } else { - // Fallback case: the step is not constant, but the we can still + // Fallback case: the step is not constant, but we can still // get the upper and lower bounds of the interval by using min/max // expressions. ScStart = SE->getUMinExpr(ScStart, ScEnd); ScEnd = SE->getUMaxExpr(AR->getStart(), ScEnd); } + // Add the size of the pointed element to ScEnd. + unsigned EltSize = + Ptr->getType()->getPointerElementType()->getScalarSizeInBits() / 8; + const SCEV *EltSizeSCEV = SE->getConstant(ScEnd->getType(), EltSize); + ScEnd = SE->getAddExpr(ScEnd, EltSizeSCEV); } Pointers.emplace_back(Ptr, ScStart, ScEnd, WritePtr, DepSetId, ASId, Sc); @@ -1870,9 +1888,17 @@ std::pair LoopAccessInfo::addRuntimeChecks( Value *End0 = ChkBuilder.CreateBitCast(A.End, PtrArithTy1, "bc"); Value *End1 = ChkBuilder.CreateBitCast(B.End, PtrArithTy0, "bc"); - Value *Cmp0 = ChkBuilder.CreateICmpULE(Start0, End1, "bound0"); + // [A|B].Start points to the first accessed byte under base [A|B]. + // [A|B].End points to the last accessed byte, plus one. + // There is no conflict when the intervals are disjoint: + // NoConflict = (B.Start >= A.End) || (A.Start >= B.End) + // + // bound0 = (B.Start < A.End) + // bound1 = (A.Start < B.End) + // IsConflict = bound0 & bound1 + Value *Cmp0 = ChkBuilder.CreateICmpULT(Start0, End1, "bound0"); FirstInst = getFirstInst(FirstInst, Cmp0, Loc); - Value *Cmp1 = ChkBuilder.CreateICmpULE(Start1, End0, "bound1"); + Value *Cmp1 = ChkBuilder.CreateICmpULT(Start1, End0, "bound1"); FirstInst = getFirstInst(FirstInst, Cmp1, Loc); Value *IsConflict = ChkBuilder.CreateAnd(Cmp0, Cmp1, "found.conflict"); FirstInst = getFirstInst(FirstInst, IsConflict, Loc); diff --git a/llvm/test/Analysis/LoopAccessAnalysis/memcheck-off-by-one-error.ll b/llvm/test/Analysis/LoopAccessAnalysis/memcheck-off-by-one-error.ll new file mode 100644 index 0000000..01813c8 --- /dev/null +++ b/llvm/test/Analysis/LoopAccessAnalysis/memcheck-off-by-one-error.ll @@ -0,0 +1,51 @@ +; RUN: opt -analyze --loop-accesses %s | FileCheck %s + +; This test verifies run-time boundary check of memory accesses. +; The original loop: +; void fastCopy(const char* src, char* op) { +; int len = 32; +; while (len > 0) { +; *(reinterpret_cast(op)) = *(reinterpret_cast(src)); +; src += 8; +; op += 8; +; len -= 8; +; } +; } +; Boundaries calculations before this patch: +; (Low: %src High: (24 + %src)) +; and the actual distance between two pointers was 31, (%op - %src = 31) +; IsConflict = (24 > 31) = false -> execution is directed to the vectorized loop. +; The loop was vectorized to 4, 32 byte memory access ( <4 x i64> ), +; store a value at *%op touched memory under *%src. + +;CHECK: Printing analysis 'Loop Access Analysis' for function 'fastCopy' +;CHECK: (Low: %op High: (32 + %op)) +;CHECK: (Low: %src High: (32 + %src)) + +define void @fastCopy(i8* nocapture readonly %src, i8* nocapture %op) { +entry: + br label %while.body.preheader + +while.body.preheader: ; preds = %entry + br label %while.body + +while.body: ; preds = %while.body.preheader, %while.body + %len.addr.07 = phi i32 [ %sub, %while.body ], [ 32, %while.body.preheader ] + %op.addr.06 = phi i8* [ %add.ptr1, %while.body ], [ %op, %while.body.preheader ] + %src.addr.05 = phi i8* [ %add.ptr, %while.body ], [ %src, %while.body.preheader ] + %0 = bitcast i8* %src.addr.05 to i64* + %1 = load i64, i64* %0, align 8 + %2 = bitcast i8* %op.addr.06 to i64* + store i64 %1, i64* %2, align 8 + %add.ptr = getelementptr inbounds i8, i8* %src.addr.05, i64 8 + %add.ptr1 = getelementptr inbounds i8, i8* %op.addr.06, i64 8 + %sub = add nsw i32 %len.addr.07, -8 + %cmp = icmp sgt i32 %len.addr.07, 8 + br i1 %cmp, label %while.body, label %while.end.loopexit + +while.end.loopexit: ; preds = %while.body + br label %while.end + +while.end: ; preds = %while.end.loopexit, %entry + ret void +} diff --git a/llvm/test/Analysis/LoopAccessAnalysis/number-of-memchecks.ll b/llvm/test/Analysis/LoopAccessAnalysis/number-of-memchecks.ll index a9626f4d..2bae486 100644 --- a/llvm/test/Analysis/LoopAccessAnalysis/number-of-memchecks.ll +++ b/llvm/test/Analysis/LoopAccessAnalysis/number-of-memchecks.ll @@ -96,15 +96,15 @@ for.end: ; preds = %for.body ; CHECK-NEXT: %arrayidxB = getelementptr inbounds i16, i16* %b, i64 %ind ; CHECK-NEXT: Grouped accesses: ; CHECK-NEXT: Group {{.*}}[[ZERO]]: -; CHECK-NEXT: (Low: %c High: (78 + %c)) +; CHECK-NEXT: (Low: %c High: (80 + %c)) ; CHECK-NEXT: Member: {(2 + %c),+,4} ; CHECK-NEXT: Member: {%c,+,4} ; CHECK-NEXT: Group {{.*}}[[ONE]]: -; CHECK-NEXT: (Low: %a High: (40 + %a)) +; CHECK-NEXT: (Low: %a High: (42 + %a)) ; CHECK-NEXT: Member: {(2 + %a),+,2} ; CHECK-NEXT: Member: {%a,+,2} ; CHECK-NEXT: Group {{.*}}[[TWO]]: -; CHECK-NEXT: (Low: %b High: (38 + %b)) +; CHECK-NEXT: (Low: %b High: (40 + %b)) ; CHECK-NEXT: Member: {%b,+,2} define void @testg(i16* %a, @@ -168,15 +168,15 @@ for.end: ; preds = %for.body ; CHECK-NEXT: %arrayidxB = getelementptr i16, i16* %b, i64 %ind ; CHECK-NEXT: Grouped accesses: ; CHECK-NEXT: Group {{.*}}[[ZERO]]: -; CHECK-NEXT: (Low: %c High: (78 + %c)) +; CHECK-NEXT: (Low: %c High: (80 + %c)) ; CHECK-NEXT: Member: {(2 + %c),+,4} ; CHECK-NEXT: Member: {%c,+,4} ; CHECK-NEXT: Group {{.*}}[[ONE]]: -; CHECK-NEXT: (Low: %a High: (40 + %a)) +; CHECK-NEXT: (Low: %a High: (42 + %a)) ; CHECK-NEXT: Member: {(2 + %a),+,2} ; CHECK-NEXT: Member: {%a,+,2} ; CHECK-NEXT: Group {{.*}}[[TWO]]: -; CHECK-NEXT: (Low: %b High: (38 + %b)) +; CHECK-NEXT: (Low: %b High: (40 + %b)) ; CHECK-NEXT: Member: {%b,+,2} define void @testh(i16* %a, @@ -247,13 +247,13 @@ for.end: ; preds = %for.body ; CHECK-NEXT: %arrayidxA2 = getelementptr i16, i16* %a, i64 %ind2 ; CHECK-NEXT: Grouped accesses: ; CHECK-NEXT: Group {{.*}}[[ZERO]]: -; CHECK-NEXT: (Low: ((2 * %offset) + %a) High: (9998 + (2 * %offset) + %a)) +; CHECK-NEXT: (Low: ((2 * %offset) + %a) High: (10000 + (2 * %offset) + %a)) ; CHECK-NEXT: Member: {((2 * %offset) + %a),+,2}<%for.body> ; CHECK-NEXT: Group {{.*}}[[ONE]]: -; CHECK-NEXT: (Low: %a High: (9998 + %a)) +; CHECK-NEXT: (Low: %a High: (10000 + %a)) ; CHECK-NEXT: Member: {%a,+,2}<%for.body> ; CHECK-NEXT: Group {{.*}}[[TWO]]: -; CHECK-NEXT: (Low: (20000 + %a) High: (29998 + %a)) +; CHECK-NEXT: (Low: (20000 + %a) High: (30000 + %a)) ; CHECK-NEXT: Member: {(20000 + %a),+,2}<%for.body> define void @testi(i16* %a, diff --git a/llvm/test/Analysis/LoopAccessAnalysis/reverse-memcheck-bounds.ll b/llvm/test/Analysis/LoopAccessAnalysis/reverse-memcheck-bounds.ll index 607e007..405a475 100644 --- a/llvm/test/Analysis/LoopAccessAnalysis/reverse-memcheck-bounds.ll +++ b/llvm/test/Analysis/LoopAccessAnalysis/reverse-memcheck-bounds.ll @@ -16,7 +16,7 @@ target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" target triple = "aarch64--linux-gnueabi" ; CHECK: function 'f': -; CHECK: (Low: (20000 + %a) High: (60000 + %a)) +; CHECK: (Low: (20000 + %a) High: (60004 + %a)) @B = common global i32* null, align 8 @A = common global i32* null, align 8 @@ -59,7 +59,7 @@ for.end: ; preds = %for.body ; Here it is not obvious what the limits are, since 'step' could be negative. ; CHECK: Low: (-1 + (-1 * ((-60001 + (-1 * %a)) umax (-60001 + (40000 * %step) + (-1 * %a))))) -; CHECK: High: ((60000 + %a) umax (60000 + (-40000 * %step) + %a)) +; CHECK: High: (4 + ((60000 + %a) umax (60000 + (-40000 * %step) + %a))) define void @g(i64 %step) { entry: diff --git a/llvm/test/Transforms/LoopVectorize/runtime-check-readonly.ll b/llvm/test/Transforms/LoopVectorize/runtime-check-readonly.ll index a3b5a59..e91d5b8 100644 --- a/llvm/test/Transforms/LoopVectorize/runtime-check-readonly.ll +++ b/llvm/test/Transforms/LoopVectorize/runtime-check-readonly.ll @@ -8,10 +8,10 @@ target triple = "x86_64-apple-macosx10.8.0" ;CHECK: br ;CHECK: getelementptr ;CHECK-DAG: getelementptr -;CHECK-DAG: icmp uge -;CHECK-DAG: icmp uge -;CHECK-DAG: icmp uge -;CHECK-DAG: icmp uge +;CHECK-DAG: icmp ugt +;CHECK-DAG: icmp ugt +;CHECK-DAG: icmp ugt +;CHECK-DAG: icmp ugt ;CHECK-DAG: and ;CHECK-DAG: and ;CHECK: br diff --git a/llvm/test/Transforms/LoopVectorize/tbaa-nodep.ll b/llvm/test/Transforms/LoopVectorize/tbaa-nodep.ll index 06d0002..3e79d47 100644 --- a/llvm/test/Transforms/LoopVectorize/tbaa-nodep.ll +++ b/llvm/test/Transforms/LoopVectorize/tbaa-nodep.ll @@ -36,7 +36,7 @@ for.end: ; preds = %for.body ; CHECK: ret i32 0 ; CHECK-NOTBAA-LABEL: @test1 -; CHECK-NOTBAA: icmp uge i32* +; CHECK-NOTBAA: icmp ugt i32* ; CHECK-NOTBAA: load <4 x float>, <4 x float>* %{{.*}}, align 4, !tbaa ; CHECK-NOTBAA: store <4 x i32> %{{.*}}, <4 x i32>* %{{.*}}, align 4, !tbaa @@ -70,8 +70,8 @@ for.end: ; preds = %for.body ; required. Without TBAA, however, two checks are required. ; CHECK-LABEL: @test2 -; CHECK: icmp uge float* -; CHECK: icmp uge float* +; CHECK: icmp ugt float* +; CHECK: icmp ugt float* ; CHECK-NOT: icmp uge i32* ; CHECK: load <4 x float>, <4 x float>* %{{.*}}, align 4, !tbaa @@ -80,10 +80,10 @@ for.end: ; preds = %for.body ; CHECK: ret i32 0 ; CHECK-NOTBAA-LABEL: @test2 -; CHECK-NOTBAA: icmp uge float* -; CHECK-NOTBAA: icmp uge float* -; CHECK-NOTBAA-DAG: icmp uge float* -; CHECK-NOTBAA-DAG: icmp uge i32* +; CHECK-NOTBAA: icmp ugt float* +; CHECK-NOTBAA: icmp ugt float* +; CHECK-NOTBAA-DAG: icmp ugt float* +; CHECK-NOTBAA-DAG: icmp ugt i32* ; CHECK-NOTBAA: load <4 x float>, <4 x float>* %{{.*}}, align 4, !tbaa ; CHECK-NOTBAA: store <4 x float> %{{.*}}, <4 x float>* %{{.*}}, align 4, !tbaa diff --git a/llvm/test/Transforms/LoopVersioningLICM/loopversioningLICM1.ll b/llvm/test/Transforms/LoopVersioningLICM/loopversioningLICM1.ll index 1c65eed..ff6c250 100644 --- a/llvm/test/Transforms/LoopVersioningLICM/loopversioningLICM1.ll +++ b/llvm/test/Transforms/LoopVersioningLICM/loopversioningLICM1.ll @@ -8,15 +8,15 @@ ; CHECK-NEXT: Loop Versioning found to be beneficial ; ; CHECK: for.body3: -; CHECK-NEXT: %add86 = phi i32 [ %arrayidx7.promoted, %for.body3.ph ], [ %add8, %for.body3 ] +; CHECK-NEXT: %[[induction:.*]] = phi i32 [ %arrayidx7.promoted, %for.body3.ph ], [ %add8, %for.body3 ] ; CHECK-NEXT: %j.113 = phi i32 [ %j.016, %for.body3.ph ], [ %inc, %for.body3 ] ; CHECK-NEXT: %idxprom = zext i32 %j.113 to i64 ; CHECK-NEXT: %arrayidx = getelementptr inbounds i32, i32* %var1, i64 %idxprom ; CHECK-NEXT: store i32 %add, i32* %arrayidx, align 4, !alias.scope !2, !noalias !2 -; CHECK-NEXT: %add8 = add nsw i32 %add86, %add +; CHECK-NEXT: %add8 = add nsw i32 %[[induction]], %add ; CHECK-NEXT: %inc = add nuw i32 %j.113, 1 ; CHECK-NEXT: %cmp2 = icmp ult i32 %inc, %itr -; CHECK-NEXT: br i1 %cmp2, label %for.body3, label %for.inc11.loopexit.loopexit5, !llvm.loop !5 +; CHECK-NEXT: br i1 %cmp2, label %for.body3, label %for.inc11.loopexit.loopexit6, !llvm.loop !5 define i32 @foo(i32* nocapture %var1, i32* nocapture readnone %var2, i32* nocapture %var3, i32 %itr) #0 { entry: %cmp14 = icmp eq i32 %itr, 0 -- 2.7.4