From deb5095833a834e0ef5f784138da53e66febff05 Mon Sep 17 00:00:00 2001 From: Han Zhu Date: Mon, 8 Feb 2021 17:24:25 -0800 Subject: [PATCH] [loop-idiom] Hoist loop memcpys to loop preheader Summary: Test Plan: Reviewers: Subscribers: Tasks: Tags: Blame Revision: Differential Revision: https://phabricator.intern.facebook.com/D26380397 --- llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp | 200 ++++++++++--- .../LoopIdiom/memcpy-debugify-remarks.ll | 2 +- llvm/test/Transforms/LoopIdiom/memcpy-intrinsic.ll | 309 +++++++++++++++++++++ .../LoopIdiom/memset-debugify-remarks.ll | 2 +- 4 files changed, 468 insertions(+), 45 deletions(-) create mode 100644 llvm/test/Transforms/LoopIdiom/memcpy-intrinsic.ll diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index 596caf5..9fabbd0 100644 --- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -205,6 +205,13 @@ private: enum class ForMemset { No, Yes }; bool processLoopStores(SmallVectorImpl &SL, const SCEV *BECount, ForMemset For); + + template + bool processLoopMemIntrinsic( + BasicBlock *BB, + bool (LoopIdiomRecognize::*Processor)(MemInst *, const SCEV *), + const SCEV *BECount); + bool processLoopMemCpy(MemCpyInst *MCI, const SCEV *BECount); bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount); bool processLoopStridedStore(Value *DestPtr, unsigned StoreSize, @@ -214,6 +221,13 @@ private: const SCEVAddRecExpr *Ev, const SCEV *BECount, bool NegStride, bool IsLoopMemset = false); bool processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *BECount); + bool processLoopStoreOfLoopLoad(Value *DestPtr, Value *SourcePtr, + unsigned StoreSize, MaybeAlign StoreAlign, + MaybeAlign LoadAlign, Instruction *TheStore, + Instruction *TheLoad, + const SCEVAddRecExpr *StoreEv, + const SCEVAddRecExpr *LoadEv, + const SCEV *BECount); bool avoidLIRForMultiBlockLoop(bool IsMemset = false, bool IsLoopMemset = false); @@ 
-628,22 +642,10 @@ bool LoopIdiomRecognize::runOnLoopBlock( for (auto &SI : StoreRefsForMemcpy) MadeChange |= processLoopStoreOfLoopLoad(SI, BECount); - for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { - Instruction *Inst = &*I++; - // Look for memset instructions, which may be optimized to a larger memset. - if (MemSetInst *MSI = dyn_cast(Inst)) { - WeakTrackingVH InstPtr(&*I); - if (!processLoopMemSet(MSI, BECount)) - continue; - MadeChange = true; - - // If processing the memset invalidated our iterator, start over from the - // top of the block. - if (!InstPtr) - I = BB->begin(); - continue; - } - } + MadeChange |= processLoopMemIntrinsic( + BB, &LoopIdiomRecognize::processLoopMemCpy, BECount); + MadeChange |= processLoopMemIntrinsic( + BB, &LoopIdiomRecognize::processLoopMemSet, BECount); return MadeChange; } @@ -792,6 +794,80 @@ bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl &SL, return Changed; } +/// processLoopMemIntrinsic - Template function for calling different processor +/// functions based on mem intrinsic type. +template +bool LoopIdiomRecognize::processLoopMemIntrinsic( + BasicBlock *BB, + bool (LoopIdiomRecognize::*Processor)(MemInst *, const SCEV *), + const SCEV *BECount) { + bool MadeChange = false; + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { + Instruction *Inst = &*I++; + // Look for memory instructions, which may be optimized to a larger one. + if (MemInst *MI = dyn_cast(Inst)) { + WeakTrackingVH InstPtr(&*I); + if (!(this->*Processor)(MI, BECount)) + continue; + MadeChange = true; + + // If processing the instruction invalidated our iterator, start over from + // the top of the block. 
+ if (!InstPtr) + I = BB->begin(); + continue; + } + } + return MadeChange; +} + +/// processLoopMemCpy - See if this memcpy can be promoted to a large memcpy +bool LoopIdiomRecognize::processLoopMemCpy(MemCpyInst *MCI, + const SCEV *BECount) { + // We can only handle non-volatile memcpys with a constant size. + if (MCI->isVolatile() || !isa(MCI->getLength())) + return false; + + // If we're not allowed to hack on memcpy, we fail. + if (!HasMemcpy || DisableLIRP::Memcpy) + return false; + + Value *Dest = MCI->getDest(); + Value *Source = MCI->getSource(); + if (!Dest || !Source) + return false; + + // See if the load and store pointer expressions are AddRec like {base,+,1} on + // the current loop, which indicates a strided load and store. If we have + // something else, it's a random load or store we can't handle. + const SCEVAddRecExpr *StoreEv = dyn_cast(SE->getSCEV(Dest)); + if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine()) + return false; + const SCEVAddRecExpr *LoadEv = dyn_cast(SE->getSCEV(Source)); + if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine()) + return false; + + // Reject memcpys that are so large that they overflow an unsigned. + uint64_t SizeInBytes = cast(MCI->getLength())->getZExtValue(); + if ((SizeInBytes >> 32) != 0) + return false; + + // Check if the stride matches the size of the memcpy. If so, then we know + // that every byte is touched in the loop. + const SCEVConstant *ConstStride = + dyn_cast(StoreEv->getOperand(1)); + if (!ConstStride) + return false; + + APInt Stride = ConstStride->getAPInt(); + if (SizeInBytes != Stride && SizeInBytes != -Stride) + return false; + + return processLoopStoreOfLoopLoad(Dest, Source, (unsigned)SizeInBytes, + MCI->getDestAlign(), MCI->getSourceAlign(), + MCI, MCI, StoreEv, LoadEv, BECount); +} + /// processLoopMemSet - See if this memset can be promoted to a large memset. 
bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI, const SCEV *BECount) { @@ -800,7 +876,7 @@ bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI, return false; // If we're not allowed to hack on memset, we fail. - if (!HasMemset) + if (!HasMemset || DisableLIRP::Memset) return false; Value *Pointer = MSI->getDest(); @@ -1040,9 +1116,11 @@ bool LoopIdiomRecognize::processLoopStridedStore( ORE.emit([&]() { return OptimizationRemark(DEBUG_TYPE, "ProcessLoopStridedStore", NewCall->getDebugLoc(), Preheader) - << "Transformed loop-strided store into a call to " + << "Transformed loop-strided store in " + << ore::NV("Function", TheStore->getFunction()) + << " function into a call to " << ore::NV("NewFunction", NewCall->getCalledFunction()) - << "() function"; + << "() intrinsic"; }); // Okay, the memset has been formed. Zap the original store and anything that @@ -1068,20 +1146,25 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI, Value *StorePtr = SI->getPointerOperand(); const SCEVAddRecExpr *StoreEv = cast(SE->getSCEV(StorePtr)); - APInt Stride = getStoreStride(StoreEv); unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType()); - bool NegStride = StoreSize == -Stride; // The store must be feeding a non-volatile load. LoadInst *LI = cast(SI->getValueOperand()); assert(LI->isUnordered() && "Expected only non-volatile non-ordered loads."); - // See if the pointer expression is an AddRec like {base,+,1} on the current - // loop, which indicates a strided load. If we have something else, it's a - // random load we can't handle. 
const SCEVAddRecExpr *LoadEv = cast(SE->getSCEV(LI->getPointerOperand())); + Value *LoadPtr = LI->getPointerOperand(); + return processLoopStoreOfLoopLoad(StorePtr, LoadPtr, StoreSize, + SI->getAlign(), LI->getAlign(), SI, LI, + StoreEv, LoadEv, BECount); +} +bool LoopIdiomRecognize::processLoopStoreOfLoopLoad( + Value *DestPtr, Value *SourcePtr, unsigned StoreSize, MaybeAlign StoreAlign, + MaybeAlign LoadAlign, Instruction *TheStore, Instruction *TheLoad, + const SCEVAddRecExpr *StoreEv, const SCEVAddRecExpr *LoadEv, + const SCEV *BECount) { // The trip count of the loop and the base pointer of the addrec SCEV is // guaranteed to be loop invariant, which means that it should dominate the // header. This allows us to insert code for it in the preheader. @@ -1093,9 +1176,12 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI, bool Changed = false; const SCEV *StrStart = StoreEv->getStart(); - unsigned StrAS = SI->getPointerAddressSpace(); + unsigned StrAS = DestPtr->getType()->getPointerAddressSpace(); Type *IntIdxTy = Builder.getIntNTy(DL->getIndexSizeInBits(StrAS)); + APInt Stride = getStoreStride(StoreEv); + bool NegStride = StoreSize == -Stride; + // Handle negative strided loops. if (NegStride) StrStart = getStartForNegStride(StrStart, BECount, IntIdxTy, StoreSize, SE); @@ -1119,13 +1205,26 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI, Changed = true; SmallPtrSet Stores; - Stores.insert(SI); + Stores.insert(TheStore); + + bool IsMemCpy = isa(TheStore); + const std::string InstRemark = IsMemCpy ? 
"memcpy" : "load and store"; + if (mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop, BECount, - StoreSize, *AA, Stores)) + StoreSize, *AA, Stores)) { + ORE.emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessStore", + TheStore) + << ore::NV("Inst", InstRemark) << " in " + << ore::NV("Function", TheStore->getFunction()) + << " function will not be hoisted: " + << ore::NV("Reason", "The loop may access store location"); + }); return Changed; + } const SCEV *LdStart = LoadEv->getStart(); - unsigned LdAS = LI->getPointerAddressSpace(); + unsigned LdAS = SourcePtr->getType()->getPointerAddressSpace(); // Handle negative strided loops. if (NegStride) @@ -1136,9 +1235,21 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI, Value *LoadBasePtr = Expander.expandCodeFor( LdStart, Builder.getInt8PtrTy(LdAS), Preheader->getTerminator()); + // If the store is a memcpy instruction, we must check if it will write to + // the load memory locations. So remove it from the ignored stores. + if (IsMemCpy) + Stores.erase(TheStore); if (mayLoopAccessLocation(LoadBasePtr, ModRefInfo::Mod, CurLoop, BECount, - StoreSize, *AA, Stores)) + StoreSize, *AA, Stores)) { + ORE.emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessLoad", TheLoad) + << ore::NV("Inst", InstRemark) << " in " + << ore::NV("Function", TheStore->getFunction()) + << " function will not be hoisted: " + << ore::NV("Reason", "The loop may access load location"); + }); return Changed; + } if (avoidLIRForMultiBlockLoop()) return Changed; @@ -1155,15 +1266,15 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI, // Check whether to generate an unordered atomic memcpy: // If the load or store are atomic, then they must necessarily be unordered // by previous checks. 
- if (!SI->isAtomic() && !LI->isAtomic()) - NewCall = Builder.CreateMemCpy(StoreBasePtr, SI->getAlign(), LoadBasePtr, - LI->getAlign(), NumBytes); + if (!TheStore->isAtomic() && !TheLoad->isAtomic()) + NewCall = Builder.CreateMemCpy(StoreBasePtr, StoreAlign, LoadBasePtr, + LoadAlign, NumBytes); else { // We cannot allow unaligned ops for unordered load/store, so reject // anything where the alignment isn't at least the element size. - const Align StoreAlign = SI->getAlign(); - const Align LoadAlign = LI->getAlign(); - if (StoreAlign < StoreSize || LoadAlign < StoreSize) + assert((StoreAlign.hasValue() && LoadAlign.hasValue()) && + "Expect unordered load/store to have align."); + if (StoreAlign.getValue() < StoreSize || LoadAlign.getValue() < StoreSize) return Changed; // If the element.atomic memcpy is not lowered into explicit @@ -1177,10 +1288,10 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI, // Note that unordered atomic loads/stores are *required* by the spec to // have an alignment but non-atomic loads/stores may not. 
NewCall = Builder.CreateElementUnorderedAtomicMemCpy( - StoreBasePtr, StoreAlign, LoadBasePtr, LoadAlign, NumBytes, - StoreSize); + StoreBasePtr, StoreAlign.getValue(), LoadBasePtr, LoadAlign.getValue(), + NumBytes, StoreSize); } - NewCall->setDebugLoc(SI->getDebugLoc()); + NewCall->setDebugLoc(TheStore->getDebugLoc()); if (MSSAU) { MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB( @@ -1189,8 +1300,9 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI, } LLVM_DEBUG(dbgs() << " Formed memcpy: " << *NewCall << "\n" - << " from load ptr=" << *LoadEv << " at: " << *LI << "\n" - << " from store ptr=" << *StoreEv << " at: " << *SI + << " from load ptr=" << *LoadEv << " at: " << *TheLoad + << "\n" + << " from store ptr=" << *StoreEv << " at: " << *TheStore << "\n"); ORE.emit([&]() { @@ -1198,14 +1310,16 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI, NewCall->getDebugLoc(), Preheader) << "Formed a call to " << ore::NV("NewFunction", NewCall->getCalledFunction()) - << "() function"; + << "() intrinsic from " << ore::NV("Inst", InstRemark) + << " instruction in " << ore::NV("Function", TheStore->getFunction()) + << " function"; }); // Okay, the memcpy has been formed. Zap the original store and anything that // feeds into it. if (MSSAU) - MSSAU->removeMemoryAccess(SI, true); - deleteDeadInstruction(SI); + MSSAU->removeMemoryAccess(TheStore, true); + deleteDeadInstruction(TheStore); if (MSSAU && VerifyMemorySSA) MSSAU->getMemorySSA()->verifyMemorySSA(); ++NumMemCpy; diff --git a/llvm/test/Transforms/LoopIdiom/memcpy-debugify-remarks.ll b/llvm/test/Transforms/LoopIdiom/memcpy-debugify-remarks.ll index 3578540..6f817f2 100644 --- a/llvm/test/Transforms/LoopIdiom/memcpy-debugify-remarks.ll +++ b/llvm/test/Transforms/LoopIdiom/memcpy-debugify-remarks.ll @@ -6,7 +6,7 @@ target triple = "x86_64-unknown-linux-gnu" ; Check that everything still works when debuginfo is present, and that it is reasonably propagated. 
-; CHECK: remark: :6:1: Formed a call to llvm.memcpy.p0i8.p0i8.i64() function +; CHECK: remark: :6:1: Formed a call to llvm.memcpy.p0i8.p0i8.i64() intrinsic from load and store instruction in test6_dest_align function define void @test6_dest_align(i32* noalias align 1 %Base, i32* noalias align 4 %Dest, i64 %Size) nounwind ssp { ; CHECK-LABEL: @test6_dest_align( diff --git a/llvm/test/Transforms/LoopIdiom/memcpy-intrinsic.ll b/llvm/test/Transforms/LoopIdiom/memcpy-intrinsic.ll new file mode 100644 index 0000000..bb0d68e --- /dev/null +++ b/llvm/test/Transforms/LoopIdiom/memcpy-intrinsic.ll @@ -0,0 +1,309 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -loop-idiom < %s -S | FileCheck %s + +%struct.S = type { i32, i32, i8 } + +; unsigned copy_noalias(S* __restrict a, S *b, int n) { +; for (int i = 0; i < n; i++) { +; a[i] = b[i]; +; } +; return sizeof(a[0]); +; } + +; Function Attrs: nofree nounwind uwtable mustprogress +define dso_local i32 @copy_noalias(%struct.S* noalias nocapture %a, %struct.S* nocapture readonly %b, i32 %n) local_unnamed_addr #0 { +; CHECK-LABEL: @copy_noalias( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A1:%.*]] = bitcast %struct.S* [[A:%.*]] to i8* +; CHECK-NEXT: [[B2:%.*]] = bitcast %struct.S* [[B:%.*]] to i8* +; CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: [[TMP1:%.*]] = mul nuw nsw i64 [[TMP0]], 12 +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[A1]], i8* align 4 [[B2]], i64 [[TMP1]], i1 false) +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup.loopexit: +; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret i32 12 +; CHECK: for.body: +; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; 
CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[I_08]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], %struct.S* [[B]], i64 [[IDXPROM]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[A]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP2:%.*]] = bitcast %struct.S* [[ARRAYIDX2]] to i8* +; CHECK-NEXT: [[TMP3:%.*]] = bitcast %struct.S* [[ARRAYIDX]] to i8* +; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[INC]], [[N]] +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] +; +entry: + %cmp7 = icmp sgt i32 %n, 0 + br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %for.body + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + ret i32 12 + +for.body: ; preds = %for.body.preheader, %for.body + %i.08 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] + %idxprom = zext i32 %i.08 to i64 + %arrayidx = getelementptr inbounds %struct.S, %struct.S* %b, i64 %idxprom + %arrayidx2 = getelementptr inbounds %struct.S, %struct.S* %a, i64 %idxprom + %0 = bitcast %struct.S* %arrayidx2 to i8* + %1 = bitcast %struct.S* %arrayidx to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 4 dereferenceable(12) %0, i8* nonnull align 4 dereferenceable(12) %1, i64 12, i1 false) + %inc = add nuw nsw i32 %i.08, 1 + %cmp = icmp slt i32 %inc, %n + br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit +} + +; unsigned copy_may_alias(S *a, S *b, int n) { +; for (int i = 0; i < n; i++) { +; a[i] = b[i]; +; } +; return sizeof(a[0]); +; } + +; Function Attrs: nofree nounwind uwtable mustprogress +define dso_local i32 @copy_may_alias(%struct.S* nocapture %a, %struct.S* nocapture readonly %b, i32 %n) local_unnamed_addr #0 { +; CHECK-LABEL: @copy_may_alias( +; CHECK-NEXT: entry: +; CHECK-NEXT: 
[[CMP7:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup.loopexit: +; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret i32 12 +; CHECK: for.body: +; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[I_08]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], %struct.S* [[B:%.*]], i64 [[IDXPROM]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[A:%.*]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP0:%.*]] = bitcast %struct.S* [[ARRAYIDX2]] to i8* +; CHECK-NEXT: [[TMP1:%.*]] = bitcast %struct.S* [[ARRAYIDX]] to i8* +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 4 dereferenceable(12) [[TMP0]], i8* nonnull align 4 dereferenceable(12) [[TMP1]], i64 12, i1 false) +; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[INC]], [[N]] +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] +; +entry: + %cmp7 = icmp sgt i32 %n, 0 + br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %for.body + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + ret i32 12 + +for.body: ; preds = %for.body.preheader, %for.body + %i.08 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] + %idxprom = zext i32 %i.08 to i64 + %arrayidx = getelementptr inbounds %struct.S, %struct.S* %b, i64 %idxprom + %arrayidx2 = getelementptr inbounds %struct.S, %struct.S* %a, i64 %idxprom + %0 = bitcast %struct.S* %arrayidx2 to i8* + %1 = bitcast %struct.S* %arrayidx to i8* + call void 
@llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 4 dereferenceable(12) %0, i8* nonnull align 4 dereferenceable(12) %1, i64 12, i1 false) + %inc = add nuw nsw i32 %i.08, 1 + %cmp = icmp slt i32 %inc, %n + br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit +} + +%struct.R = type <{ i8, i32, i8 }> + +; void copy_noalias_read(S* __restrict x, S* __restrict y, int n, int &s) { +; for (int i = 0; i < n; i++) { +; x[i] = y[i]; +; s += y[i].b; +; } +; } + +; Function Attrs: nofree nounwind uwtable mustprogress +define dso_local void @copy_noalias_read(%struct.R* noalias nocapture %x, %struct.R* noalias nocapture readonly %y, i32 %n, i32* nocapture nonnull align 4 dereferenceable(4) %s) local_unnamed_addr #0 { +; CHECK-LABEL: @copy_noalias_read( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[X1:%.*]] = bitcast %struct.R* [[X:%.*]] to i8* +; CHECK-NEXT: [[Y2:%.*]] = bitcast %struct.R* [[Y:%.*]] to i8* +; CHECK-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP11]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK: for.body.lr.ph: +; CHECK-NEXT: [[S_PROMOTED:%.*]] = load i32, i32* [[S:%.*]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: [[TMP1:%.*]] = mul nuw nsw i64 [[TMP0]], 6 +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 [[X1]], i8* align 1 [[Y2]], i64 [[TMP1]], i1 false) +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.for.cond.cleanup_crit_edge: +; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: store i32 [[ADD_LCSSA]], i32* [[S]], align 4 +; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; CHECK: for.body: +; CHECK-NEXT: [[ADD13:%.*]] = phi i32 [ [[S_PROMOTED]], [[FOR_BODY_LR_PH]] ], [ [[ADD]], [[FOR_BODY]] ] +; CHECK-NEXT: [[I_012:%.*]] = phi i32 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[I_012]] to i64 +; CHECK-NEXT: 
[[TMP2:%.*]] = getelementptr inbounds [[STRUCT_R:%.*]], %struct.R* [[X]], i64 [[IDXPROM]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_R]], %struct.R* [[Y]], i64 [[IDXPROM]], i32 0 +; CHECK-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_R]], %struct.R* [[Y]], i64 [[IDXPROM]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[B]], align 1 +; CHECK-NEXT: [[ADD]] = add nsw i32 [[ADD13]], [[TMP4]] +; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_012]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[INC]], [[N]] +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_FOR_COND_CLEANUP_CRIT_EDGE:%.*]] +; +entry: + %cmp11 = icmp sgt i32 %n, 0 + br i1 %cmp11, label %for.body.lr.ph, label %for.cond.cleanup + +for.body.lr.ph: ; preds = %entry + %s.promoted = load i32, i32* %s, align 4 + br label %for.body + +for.cond.for.cond.cleanup_crit_edge: ; preds = %for.body + %add.lcssa = phi i32 [ %add, %for.body ] + store i32 %add.lcssa, i32* %s, align 4 + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.for.cond.cleanup_crit_edge, %entry + ret void + +for.body: ; preds = %for.body.lr.ph, %for.body + %add13 = phi i32 [ %s.promoted, %for.body.lr.ph ], [ %add, %for.body ] + %i.012 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ] + %idxprom = zext i32 %i.012 to i64 + %0 = getelementptr inbounds %struct.R, %struct.R* %x, i64 %idxprom, i32 0 + %1 = getelementptr inbounds %struct.R, %struct.R* %y, i64 %idxprom, i32 0 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 1 dereferenceable(6) %0, i8* nonnull align 1 dereferenceable(6) %1, i64 6, i1 false) + %b = getelementptr inbounds %struct.R, %struct.R* %y, i64 %idxprom, i32 1 + %2 = load i32, i32* %b, align 1 + %add = add nsw i32 %add13, %2 + %inc = add nuw nsw i32 %i.012, 1 + %cmp = icmp slt i32 %inc, %n + br i1 %cmp, label %for.body, label %for.cond.for.cond.cleanup_crit_edge +} + +%struct.SPacked = type <{ i32, i32, i8 }> + +; Function Attrs: nofree nounwind uwtable 
mustprogress +define dso_local i32 @copy_noalias_packed(%struct.SPacked* noalias nocapture %a, %struct.SPacked* nocapture readonly %b, i32 %n) local_unnamed_addr #0 { +; CHECK-LABEL: @copy_noalias_packed( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A1:%.*]] = bitcast %struct.SPacked* [[A:%.*]] to i8* +; CHECK-NEXT: [[B2:%.*]] = bitcast %struct.SPacked* [[B:%.*]] to i8* +; CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: [[TMP1:%.*]] = mul nuw nsw i64 [[TMP0]], 9 +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 [[A1]], i8* align 1 [[B2]], i64 [[TMP1]], i1 false) +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup.loopexit: +; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret i32 9 +; CHECK: for.body: +; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[I_08]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_SPACKED:%.*]], %struct.SPacked* [[B]], i64 [[IDXPROM]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [[STRUCT_SPACKED]], %struct.SPacked* [[A]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP2:%.*]] = bitcast %struct.SPacked* [[ARRAYIDX2]] to i8* +; CHECK-NEXT: [[TMP3:%.*]] = bitcast %struct.SPacked* [[ARRAYIDX]] to i8* +; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[INC]], [[N]] +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] +; +entry: + %cmp7 = icmp sgt i32 %n, 0 + br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %for.body + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = 
%for.cond.cleanup.loopexit, %entry + ret i32 9 + +for.body: ; preds = %for.body.preheader, %for.body + %i.08 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] + %idxprom = zext i32 %i.08 to i64 + %arrayidx = getelementptr inbounds %struct.SPacked, %struct.SPacked* %b, i64 %idxprom + %arrayidx2 = getelementptr inbounds %struct.SPacked, %struct.SPacked* %a, i64 %idxprom + %0 = bitcast %struct.SPacked* %arrayidx2 to i8* + %1 = bitcast %struct.SPacked* %arrayidx to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 1 dereferenceable(9) %0, i8* nonnull align 1 dereferenceable(9) %1, i64 9, i1 false) + %inc = add nuw nsw i32 %i.08, 1 + %cmp = icmp slt i32 %inc, %n + br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit +} + +%struct.SAligned = type { i32, i32, i8, [7 x i8] } + +define dso_local i32 @copy_noalias_aligned(%struct.SAligned* noalias nocapture %a, %struct.SAligned* nocapture readonly %b, i32 %n) local_unnamed_addr #0 { +; CHECK-LABEL: @copy_noalias_aligned( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A1:%.*]] = bitcast %struct.SAligned* [[A:%.*]] to i8* +; CHECK-NEXT: [[B2:%.*]] = bitcast %struct.SAligned* [[B:%.*]] to i8* +; CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4 +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[A1]], i8* align 16 [[B2]], i64 [[TMP1]], i1 false) +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup.loopexit: +; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret i32 16 +; CHECK: for.body: +; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[I_08]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds 
[[STRUCT_SALIGNED:%.*]], %struct.SAligned* [[B]], i64 [[IDXPROM]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [[STRUCT_SALIGNED]], %struct.SAligned* [[A]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP2:%.*]] = bitcast %struct.SAligned* [[ARRAYIDX2]] to i8* +; CHECK-NEXT: [[TMP3:%.*]] = bitcast %struct.SAligned* [[ARRAYIDX]] to i8* +; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[INC]], [[N]] +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] +; +entry: + %cmp7 = icmp sgt i32 %n, 0 + br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %for.body + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + ret i32 16 + +for.body: ; preds = %for.body.preheader, %for.body + %i.08 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] + %idxprom = zext i32 %i.08 to i64 + %arrayidx = getelementptr inbounds %struct.SAligned, %struct.SAligned* %b, i64 %idxprom + %arrayidx2 = getelementptr inbounds %struct.SAligned, %struct.SAligned* %a, i64 %idxprom + %0 = bitcast %struct.SAligned* %arrayidx2 to i8* + %1 = bitcast %struct.SAligned* %arrayidx to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 16 dereferenceable(16) %0, i8* nonnull align 16 dereferenceable(16) %1, i64 16, i1 false) + %inc = add nuw nsw i32 %i.08, 1 + %cmp = icmp slt i32 %inc, %n + br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit +} + +; Function Attrs: argmemonly nofree nosync nounwind willreturn +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) #1 diff --git a/llvm/test/Transforms/LoopIdiom/memset-debugify-remarks.ll b/llvm/test/Transforms/LoopIdiom/memset-debugify-remarks.ll index 06e17fe..b7a866f 100644 --- a/llvm/test/Transforms/LoopIdiom/memset-debugify-remarks.ll +++ 
b/llvm/test/Transforms/LoopIdiom/memset-debugify-remarks.ll @@ -11,7 +11,7 @@ target triple = "x86_64-unknown-linux-gnu" ; *begin = value; ; } -; CHECK: remark: :4:1: Transformed loop-strided store into a call to llvm.memset.p0i8.i64() function +; CHECK: remark: :4:1: Transformed loop-strided store in _Z15my_basic_memsetPcS_c function into a call to llvm.memset.p0i8.i64() intrinsic define void @_Z15my_basic_memsetPcS_c(i8* %ptr, i8* %end, i8 %value) { ; CHECK-LABEL: @_Z15my_basic_memsetPcS_c( -- 2.7.4