From 50dfc9e35d72bf783ebc514ad1e48bd4d0767c5d Mon Sep 17 00:00:00 2001 From: Igor Kirillov Date: Thu, 25 May 2023 12:53:17 +0000 Subject: [PATCH] [LoopLoadElimination] Add support for stride equal to -1 This patch extends all the benefits provided by the LoopLoadElimination pass to descending loops. Differential Revision: https://reviews.llvm.org/D151448 --- llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp | 25 +++++++++++------ llvm/test/Transforms/LoopLoadElim/backward.ll | 31 ++++++++++++++++++++++ 2 files changed, 48 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp b/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp index e32b97e..179ccde 100644 --- a/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp +++ b/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp @@ -88,8 +88,9 @@ struct StoreToLoadForwardingCandidate { StoreToLoadForwardingCandidate(LoadInst *Load, StoreInst *Store) : Load(Load), Store(Store) {} - /// Return true if the dependence from the store to the load has a - /// distance of one. E.g. A[i+1] = A[i] + /// Return true if the dependence from the store to the load has an + /// absolute distance of one. + /// E.g. A[i+1] = A[i] (or A[i-1] = A[i] for descending loop) bool isDependenceDistanceOfOne(PredicatedScalarEvolution &PSE, Loop *L) const { Value *LoadPtr = Load->getPointerOperand(); @@ -103,11 +104,19 @@ struct StoreToLoadForwardingCandidate { DL.getTypeSizeInBits(getLoadStoreType(Store)) && "Should be a known dependence"); - // Currently we only support accesses with unit stride. FIXME: we should be - able to handle non unit stirde as well as long as the stride is equal to - the dependence distance. 
- if (getPtrStride(PSE, LoadType, LoadPtr, L).value_or(0) != 1 || - getPtrStride(PSE, LoadType, StorePtr, L).value_or(0) != 1 + int64_t StrideLoad = getPtrStride(PSE, LoadType, LoadPtr, L).value_or(0); + int64_t StrideStore = getPtrStride(PSE, LoadType, StorePtr, L).value_or(0); + if (!StrideLoad || !StrideStore || StrideLoad != StrideStore) + return false; + + // TODO: This check for stride values other than 1 and -1 can be eliminated. + // However, doing so may cause the LoopAccessAnalysis to overcompensate, + // generating numerous non-wrap runtime checks that may undermine the + // benefits of load elimination. To safely implement support for non-unit + // strides, we would need to ensure either that the processed case does not + // require these additional checks, or improve the LAA to handle them more + // efficiently, or potentially both. + if (std::abs(StrideLoad) != 1) return false; unsigned TypeByteSize = DL.getTypeAllocSize(const_cast<Type *>(LoadType)); @@ -120,7 +129,7 @@ struct StoreToLoadForwardingCandidate { auto *Dist = cast<SCEVConstant>( PSE.getSE()->getMinusSCEV(StorePtrSCEV, LoadPtrSCEV)); const APInt &Val = Dist->getAPInt(); - return Val == TypeByteSize; + return Val == TypeByteSize * StrideLoad; } Value *getLoadPtr() const { return Load->getPointerOperand(); } diff --git a/llvm/test/Transforms/LoopLoadElim/backward.ll b/llvm/test/Transforms/LoopLoadElim/backward.ll index 01939df..e55d25d 100644 --- a/llvm/test/Transforms/LoopLoadElim/backward.ll +++ b/llvm/test/Transforms/LoopLoadElim/backward.ll @@ -30,3 +30,34 @@ for.body: ; preds = %for.body, %entry for.end: ; preds = %for.body ret void } + +; Same but loop is descending. 
+; +; for (unsigned i = N; i > 0; i--) +; A[i-1] = A[i] + B[i]; +define void @g(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, i64 %N) { +entry: +; CHECK: %0 = shl i64 %N, 2 +; CHECK: %scevgep = getelementptr i8, ptr %A, i64 %0 +; CHECK: %load_initial = load i32, ptr %scevgep, align 4 + br label %for.body + +for.body: ; preds = %for.body, %entry +; CHECK: %store_forwarded = phi i32 [ %load_initial, %entry ], [ %add, %for.body ] + %i.09 = phi i64 [ %sub, %for.body ], [ %N, %entry ] + %arrayidx = getelementptr inbounds i32, ptr %A, i64 %i.09 + %load = load i32, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds i32, ptr %B, i64 %i.09 + %load_1 = load i32, ptr %arrayidx1, align 4 +; CHECK: %add = add i32 %load_1, %store_forwarded + %add = add i32 %load_1, %load + %sub = add i64 %i.09, -1 + %arrayidx2 = getelementptr inbounds i32, ptr %A, i64 %sub + store i32 %add, ptr %arrayidx2, align 4 + %cmp.not = icmp eq i64 %sub, 0 + br i1 %cmp.not, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + -- 2.7.4