From 7cdf01ef58ac9d181fd02cdee69102d26ec24a2d Mon Sep 17 00:00:00 2001 From: Andrew Kaylor Date: Thu, 11 Aug 2016 18:28:33 +0000 Subject: [PATCH] Target independent codesize heuristics for Loop Idiom Recognition Patch by Sunita Marathe Differential Revision: https://reviews.llvm.org/D21449 llvm-svn: 278378 --- llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp | 48 +++++- .../LoopIdiom/lir-heurs-multi-block-loop.ll | 182 +++++++++++++++++++++ 2 files changed, 227 insertions(+), 3 deletions(-) create mode 100644 llvm/test/Transforms/LoopIdiom/lir-heurs-multi-block-loop.ll diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index 90e8bfa..5d156b8 100644 --- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -11,6 +11,12 @@ // non-loop form. In cases that this kicks in, it can be a significant // performance win. // +// If compiling for code size we avoid idiom recognition if the resulting +// code could be larger than the code for the original loop. One way this could +// happen is if the loop is not removable after idiom recognition due to the +// presence of non-idiom instructions. The initial implementation of the +// heuristics applies to idioms in multi-block loops. +// //===----------------------------------------------------------------------===// // // TODO List: @@ -65,6 +71,12 @@ using namespace llvm; STATISTIC(NumMemSet, "Number of memset's formed from loop stores"); STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores"); +static cl::opt UseLIRCodeSizeHeurs( + "use-lir-code-size-heurs", + cl::desc("Use loop idiom recognition code size heuristics when compiling" + "with -Os/-Oz"), + cl::init(true), cl::Hidden); + namespace { class LoopIdiomRecognize { @@ -76,6 +88,7 @@ class LoopIdiomRecognize { TargetLibraryInfo *TLI; const TargetTransformInfo *TTI; const DataLayout *DL; + bool ApplyCodeSizeHeuristics; public: explicit LoopIdiomRecognize(AliasAnalysis *AA, DominatorTree *DT, @@ -117,8 +130,10 @@ private: Instruction *TheStore, SmallPtrSetImpl &Stores, const SCEVAddRecExpr *Ev, const SCEV *BECount, - bool NegStride); + bool NegStride, bool IsLoopMemset = false); bool processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *BECount); + bool avoidLIRForMultiBlockLoop(bool IsMemset = false, + bool IsLoopMemset = false); /// @} /// \name Noncountable Loop Idiom Handling @@ -229,6 +244,10 @@ bool LoopIdiomRecognize::runOnLoop(Loop *L) { if (Name == "memset" || Name == "memcpy") return false; + // Determine if code size heuristics need to be applied. + ApplyCodeSizeHeuristics = + L->getHeader()->getParent()->optForSize() && UseLIRCodeSizeHeurs; + HasMemset = TLI->has(LibFunc::memset); HasMemsetPattern = TLI->has(LibFunc::memset_pattern16); HasMemcpy = TLI->has(LibFunc::memcpy); @@ -689,7 +708,7 @@ bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI, bool NegStride = SizeInBytes == -Stride; return processLoopStridedStore(Pointer, (unsigned)SizeInBytes, MSI->getAlignment(), SplatValue, MSI, MSIs, Ev, - BECount, NegStride); + BECount, NegStride, /*IsLoopMemset=*/true); } /// mayLoopAccessLocation - Return true if the specified loop might access the @@ -745,7 +764,7 @@ bool LoopIdiomRecognize::processLoopStridedStore( Value *DestPtr, unsigned StoreSize, unsigned StoreAlignment, Value *StoredVal, Instruction *TheStore, SmallPtrSetImpl &Stores, const SCEVAddRecExpr *Ev, - const SCEV *BECount, bool NegStride) { + const SCEV *BECount, bool NegStride, bool IsLoopMemset) { Value *SplatValue = isBytewiseValue(StoredVal); Constant *PatternValue = nullptr; @@ -786,6 +805,9 @@ bool LoopIdiomRecognize::processLoopStridedStore( return false; } + if (avoidLIRForMultiBlockLoop(/*IsMemset=*/true, IsLoopMemset)) + return false; + // Okay, everything looks good, insert the memset. // The # stored bytes is (BECount+1)*Size. Expand the trip count out to @@ -917,6 +939,9 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI, return false; } + if (avoidLIRForMultiBlockLoop()) + return false; + // Okay, everything is safe, we can transform this! // The # stored bytes is (BECount+1)*Size. Expand the trip count out to @@ -948,6 +973,23 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI, return true; } +// When compiling for codesize we avoid idiom recognition for a multi-block loop +// unless it is a loop_memset idiom or a memset/memcpy idiom in a nested loop. +// +bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset, + bool IsLoopMemset) { + if (ApplyCodeSizeHeuristics && CurLoop->getNumBlocks() > 1) { + if (!CurLoop->getParentLoop() && (!IsMemset || !IsLoopMemset)) { + DEBUG(dbgs() << " " << CurLoop->getHeader()->getParent()->getName() + << " : LIR " << (IsMemset ? "Memset" : "Memcpy") + << " avoided: multi-block top-level loop\n"); + return true; + } + } + + return false; +} + bool LoopIdiomRecognize::runOnNoncountableLoop() { return recognizePopcount(); } diff --git a/llvm/test/Transforms/LoopIdiom/lir-heurs-multi-block-loop.ll b/llvm/test/Transforms/LoopIdiom/lir-heurs-multi-block-loop.ll new file mode 100644 index 0000000..61c1469 --- /dev/null +++ b/llvm/test/Transforms/LoopIdiom/lir-heurs-multi-block-loop.ll @@ -0,0 +1,182 @@ +; RUN: opt -basicaa -loop-idiom -use-lir-code-size-heurs=true < %s -S | FileCheck %s + +; When compiling for codesize we avoid idiom recognition for a +; multi-block loop unless it is one of +; - a loop_memset idiom, or +; - a memset/memcpy idiom in a nested loop. + +declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) +@APPLES = common global i32 0, align 4 +@ORANGES = common global i32 0, align 4 + +; LIR allowed: loop_memset idiom in multi-block loop. +; =================================================== +; CHECK-LABEL: @LoopMemset +; CHECK: for.body.preheader: +; CHECK: call void @llvm.memset +; CHECK: for.body: +; +define i32 @LoopMemset([2048 x i8]* noalias nocapture %DST, i32 %SIZE) local_unnamed_addr optsize { +entry: + %cmp12 = icmp sgt i32 %SIZE, 0 + br i1 %cmp12, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.inc + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ] + %BASKET.013 = phi i32 [ %BASKET.1, %for.inc ], [ 0, %for.body.preheader ] + %arraydecay = getelementptr inbounds [2048 x i8], [2048 x i8]* %DST, i64 %indvars.iv, i64 0 + tail call void @llvm.memset.p0i8.i64(i8* %arraydecay, i8 -1, i64 2048, i32 1, i1 false) + %0 = trunc i64 %indvars.iv to i32 + %rem11 = and i32 %0, 1 + %cmp1 = icmp eq i32 %rem11, 0 + %1 = load i32, i32* @ORANGES, align 4 + %2 = load i32, i32* @APPLES, align 4 + br i1 %cmp1, label %if.then, label %if.else + +if.else: ; preds = %for.body + %dec3 = add nsw i32 %2, -1 + store i32 %dec3, i32* @APPLES, align 4 + br label %for.inc + +if.then: ; preds = %for.body + %dec = add nsw i32 %1, -1 + store i32 %dec, i32* @ORANGES, align 4 + br label %for.inc + +for.inc: ; preds = %if.then, %if.else + %.pn = phi i32 [ %2, %if.then ], [ %1, %if.else ] + %BASKET.1 = add nsw i32 %.pn, %BASKET.013 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp ne i32 %lftr.wideiv, %SIZE + br i1 %exitcond, label %for.body, label %for.end.loopexit + +for.end.loopexit: ; preds = %for.inc + %BASKET.1.lcssa = phi i32 [ %BASKET.1, %for.inc ] + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + %BASKET.0.lcssa = phi i32 [ 0, %entry ], [ %BASKET.1.lcssa, %for.end.loopexit ] + ret i32 %BASKET.0.lcssa +} + +; LIR allowed: memset idiom in multi-block nested loop, +; which is recognized as a loop_memset in its turn. +; ===================================================== +; CHECK-LABEL: @NestedMemset_LoopMemset +; CHECK: for.cond1.preheader.preheader: +; CHECK: call void @llvm.memset +; CHECK: for.cond1.preheader: +; +define i32 @NestedMemset_LoopMemset([2046 x i8]* noalias nocapture %DST, i32 %SIZE) local_unnamed_addr optsize { +entry: + %cmp25 = icmp sgt i32 %SIZE, 0 + br i1 %cmp25, label %for.cond1.preheader.preheader, label %for.end11 + +for.cond1.preheader.preheader: ; preds = %entry + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %for.cond1.preheader.preheader, %for.inc9 + %i.027 = phi i32 [ %inc10, %for.inc9 ], [ 0, %for.cond1.preheader.preheader ] + %BASKET.026 = phi i32 [ %BASKET.2.lcssa, %for.inc9 ], [ 0, %for.cond1.preheader.preheader ] + %idxprom4 = sext i32 %i.027 to i64 + %rem22 = and i32 %i.027, 1 + %cmp6 = icmp eq i32 %rem22, 0 + br label %for.body3 + +for.body3: ; preds = %for.cond1.preheader, %for.inc + %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.inc ] + %BASKET.123 = phi i32 [ %BASKET.026, %for.cond1.preheader ], [ %BASKET.2, %for.inc ] + %arrayidx5 = getelementptr inbounds [2046 x i8], [2046 x i8]* %DST, i64 %idxprom4, i64 %indvars.iv + store i8 -1, i8* %arrayidx5, align 1 + %0 = load i32, i32* @APPLES, align 4 + %1 = load i32, i32* @ORANGES, align 4 + br i1 %cmp6, label %if.then, label %if.else + +if.else: ; preds = %for.body3 + %dec8 = add nsw i32 %0, -1 + store i32 %dec8, i32* @APPLES, align 4 + br label %for.inc + +if.then: ; preds = %for.body3 + %dec = add nsw i32 %1, -1 + store i32 %dec, i32* @ORANGES, align 4 + br label %for.inc + +for.inc: ; preds = %if.then, %if.else + %.pn = phi i32 [ %0, %if.then ], [ %1, %if.else ] + %BASKET.2 = add nsw i32 %.pn, %BASKET.123 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp ne i64 %indvars.iv.next, 2046 + br i1 %exitcond, label %for.body3, label %for.inc9 + +for.inc9: ; preds = %for.inc + %BASKET.2.lcssa = phi i32 [ %BASKET.2, %for.inc ] + %inc10 = add nsw i32 %i.027, 1 + %cmp = icmp slt i32 %inc10, %SIZE + br i1 %cmp, label %for.cond1.preheader, label %for.end11.loopexit + +for.end11.loopexit: ; preds = %for.inc9 + %BASKET.2.lcssa.lcssa = phi i32 [ %BASKET.2.lcssa, %for.inc9 ] + br label %for.end11 + +for.end11: ; preds = %for.end11.loopexit, %entry + %BASKET.0.lcssa = phi i32 [ 0, %entry ], [ %BASKET.2.lcssa.lcssa, %for.end11.loopexit ] + ret i32 %BASKET.0.lcssa +} + +; LIR avoided: memset idiom in multi-block top-level loop. +; ======================================================== +; CHECK-LABEL: @Non_NestedMemset +; CHECK-NOT: call void @llvm.memset +; +define i32 @Non_NestedMemset(i8* noalias nocapture %DST, i32 %SIZE) local_unnamed_addr optsize { +entry: + %cmp12 = icmp sgt i32 %SIZE, 0 + br i1 %cmp12, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.inc + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ] + %BASKET.013 = phi i32 [ %BASKET.1, %for.inc ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds i8, i8* %DST, i64 %indvars.iv + store i8 -1, i8* %arrayidx, align 1 + %0 = trunc i64 %indvars.iv to i32 + %rem11 = and i32 %0, 1 + %cmp1 = icmp eq i32 %rem11, 0 + %1 = load i32, i32* @ORANGES, align 4 + %2 = load i32, i32* @APPLES, align 4 + br i1 %cmp1, label %if.then, label %if.else + +if.else: ; preds = %for.body + %dec3 = add nsw i32 %2, -1 + store i32 %dec3, i32* @APPLES, align 4 + br label %for.inc + +if.then: ; preds = %for.body + %dec = add nsw i32 %1, -1 + store i32 %dec, i32* @ORANGES, align 4 + br label %for.inc + +for.inc: ; preds = %if.then, %if.else + %.pn = phi i32 [ %2, %if.then ], [ %1, %if.else ] + %BASKET.1 = add nsw i32 %.pn, %BASKET.013 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp ne i32 %lftr.wideiv, %SIZE + br i1 %exitcond, label %for.body, label %for.end.loopexit + +for.end.loopexit: ; preds = %for.inc + %BASKET.1.lcssa = phi i32 [ %BASKET.1, %for.inc ] + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + %BASKET.0.lcssa = phi i32 [ 0, %entry ], [ %BASKET.1.lcssa, %for.end.loopexit ] + ret i32 %BASKET.0.lcssa +} + -- 2.7.4