From deb5095833a834e0ef5f784138da53e66febff05 Mon Sep 17 00:00:00 2001
From: Han Zhu <zhuhan@fb.com>
Date: Mon, 8 Feb 2021 17:24:25 -0800
Subject: [PATCH] [loop-idiom] Hoist loop memcpys to loop preheader

Summary:

Test Plan:

Reviewers:

Subscribers:

Tasks:

Tags:

Blame Revision:

Differential Revision: https://phabricator.intern.facebook.com/D26380397
---
 llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp  | 200 ++++++++++---
 .../LoopIdiom/memcpy-debugify-remarks.ll           |   2 +-
 llvm/test/Transforms/LoopIdiom/memcpy-intrinsic.ll | 309 +++++++++++++++++++++
 .../LoopIdiom/memset-debugify-remarks.ll           |   2 +-
 4 files changed, 468 insertions(+), 45 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopIdiom/memcpy-intrinsic.ll
diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 596caf5..9fabbd0 100644
--- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -205,6 +205,13 @@ private:
   enum class ForMemset { No, Yes };
   bool processLoopStores(SmallVectorImpl<StoreInst *> &SL, const SCEV *BECount,
                          ForMemset For);
+
+  template <typename MemInst>
+  bool processLoopMemIntrinsic(
+      BasicBlock *BB,
+      bool (LoopIdiomRecognize::*Processor)(MemInst *, const SCEV *),
+      const SCEV *BECount);
+  bool processLoopMemCpy(MemCpyInst *MCI, const SCEV *BECount);
   bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount);
 
   bool processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
@@ -214,6 +221,13 @@ private:
                                const SCEVAddRecExpr *Ev, const SCEV *BECount,
                                bool NegStride, bool IsLoopMemset = false);
   bool processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *BECount);
+  bool processLoopStoreOfLoopLoad(Value *DestPtr, Value *SourcePtr,
+                                  unsigned StoreSize, MaybeAlign StoreAlign,
+                                  MaybeAlign LoadAlign, Instruction *TheStore,
+                                  Instruction *TheLoad,
+                                  const SCEVAddRecExpr *StoreEv,
+                                  const SCEVAddRecExpr *LoadEv,
+                                  const SCEV *BECount);
   bool avoidLIRForMultiBlockLoop(bool IsMemset = false,
                                  bool IsLoopMemset = false);
 
@@ -628,22 +642,10 @@ bool LoopIdiomRecognize::runOnLoopBlock(
   for (auto &SI : StoreRefsForMemcpy)
     MadeChange |= processLoopStoreOfLoopLoad(SI, BECount);
 
-  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
-    Instruction *Inst = &*I++;
-    // Look for memset instructions, which may be optimized to a larger memset.
-    if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst)) {
-      WeakTrackingVH InstPtr(&*I);
-      if (!processLoopMemSet(MSI, BECount))
-        continue;
-      MadeChange = true;
-
-      // If processing the memset invalidated our iterator, start over from the
-      // top of the block.
-      if (!InstPtr)
-        I = BB->begin();
-      continue;
-    }
-  }
+  MadeChange |= processLoopMemIntrinsic<MemCpyInst>(
+      BB, &LoopIdiomRecognize::processLoopMemCpy, BECount);
+  MadeChange |= processLoopMemIntrinsic<MemSetInst>(
+      BB, &LoopIdiomRecognize::processLoopMemSet, BECount);
 
   return MadeChange;
 }
@@ -792,6 +794,80 @@ bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL,
   return Changed;
 }
 
+/// processLoopMemIntrinsic - Template function for calling different processor
+/// functions based on mem instrinsic type.
+template <typename MemInst>
+bool LoopIdiomRecognize::processLoopMemIntrinsic(
+    BasicBlock *BB,
+    bool (LoopIdiomRecognize::*Processor)(MemInst *, const SCEV *),
+    const SCEV *BECount) {
+  bool MadeChange = false;
+  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
+    Instruction *Inst = &*I++;
+    // Look for memory instructions, which may be optimized to a larger one.
+    if (MemInst *MI = dyn_cast<MemInst>(Inst)) {
+      WeakTrackingVH InstPtr(&*I);
+      if (!(this->*Processor)(MI, BECount))
+        continue;
+      MadeChange = true;
+
+      // If processing the instruction invalidated our iterator, start over from
+      // the top of the block.
+      if (!InstPtr)
+        I = BB->begin();
+      continue;
+    }
+  }
+  return MadeChange;
+}
+
+/// processLoopMemCpy - See if this memcpy can be promoted to a large memcpy
+bool LoopIdiomRecognize::processLoopMemCpy(MemCpyInst *MCI,
+                                           const SCEV *BECount) {
+  // We can only handle non-volatile memcpys with a constant size.
+  if (MCI->isVolatile() || !isa<ConstantInt>(MCI->getLength()))
+    return false;
+
+  // If we're not allowed to hack on memcpy, we fail.
+  if (!HasMemcpy || DisableLIRP::Memcpy)
+    return false;
+
+  Value *Dest = MCI->getDest();
+  Value *Source = MCI->getSource();
+  if (!Dest || !Source)
+    return false;
+
+  // See if the load and store pointer expressions are AddRec like {base,+,1} on
+  // the current loop, which indicates a strided load and store.  If we have
+  // something else, it's a random load or store we can't handle.
+  const SCEVAddRecExpr *StoreEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Dest));
+  if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
+    return false;
+  const SCEVAddRecExpr *LoadEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Source));
+  if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
+    return false;
+
+  // Reject memcpys that are so large that they overflow an unsigned.
+  uint64_t SizeInBytes = cast<ConstantInt>(MCI->getLength())->getZExtValue();
+  if ((SizeInBytes >> 32) != 0)
+    return false;
+
+  // Check if the stride matches the size of the memcpy. If so, then we know
+  // that every byte is touched in the loop.
+  const SCEVConstant *ConstStride =
+      dyn_cast<SCEVConstant>(StoreEv->getOperand(1));
+  if (!ConstStride)
+    return false;
+
+  APInt Stride = ConstStride->getAPInt();
+  if (SizeInBytes != Stride && SizeInBytes != -Stride)
+    return false;
+
+  return processLoopStoreOfLoopLoad(Dest, Source, (unsigned)SizeInBytes,
+                                    MCI->getDestAlign(), MCI->getSourceAlign(),
+                                    MCI, MCI, StoreEv, LoadEv, BECount);
+}
+
 /// processLoopMemSet - See if this memset can be promoted to a large memset.
 bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
                                            const SCEV *BECount) {
@@ -800,7 +876,7 @@ bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
     return false;
 
   // If we're not allowed to hack on memset, we fail.
-  if (!HasMemset)
+  if (!HasMemset || DisableLIRP::Memset)
     return false;
 
   Value *Pointer = MSI->getDest();
@@ -1040,9 +1116,11 @@ bool LoopIdiomRecognize::processLoopStridedStore(
   ORE.emit([&]() {
     return OptimizationRemark(DEBUG_TYPE, "ProcessLoopStridedStore",
                               NewCall->getDebugLoc(), Preheader)
-           << "Transformed loop-strided store into a call to "
+           << "Transformed loop-strided store in "
+           << ore::NV("Function", TheStore->getFunction())
+           << " function into a call to "
            << ore::NV("NewFunction", NewCall->getCalledFunction())
-           << "() function";
+           << "() intrinsic";
   });
 
   // Okay, the memset has been formed.  Zap the original store and anything that
@@ -1068,20 +1146,25 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
 
   Value *StorePtr = SI->getPointerOperand();
   const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
-  APInt Stride = getStoreStride(StoreEv);
   unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
-  bool NegStride = StoreSize == -Stride;
 
   // The store must be feeding a non-volatile load.
   LoadInst *LI = cast<LoadInst>(SI->getValueOperand());
   assert(LI->isUnordered() && "Expected only non-volatile non-ordered loads.");
 
-  // See if the pointer expression is an AddRec like {base,+,1} on the current
-  // loop, which indicates a strided load.  If we have something else, it's a
-  // random load we can't handle.
   const SCEVAddRecExpr *LoadEv =
       cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
+  Value *LoadPtr = LI->getPointerOperand();
+  return processLoopStoreOfLoopLoad(StorePtr, LoadPtr, StoreSize,
+                                    SI->getAlign(), LI->getAlign(), SI, LI,
+                                    StoreEv, LoadEv, BECount);
+}
 
+bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(
+    Value *DestPtr, Value *SourcePtr, unsigned StoreSize, MaybeAlign StoreAlign,
+    MaybeAlign LoadAlign, Instruction *TheStore, Instruction *TheLoad,
+    const SCEVAddRecExpr *StoreEv, const SCEVAddRecExpr *LoadEv,
+    const SCEV *BECount) {
   // The trip count of the loop and the base pointer of the addrec SCEV is
   // guaranteed to be loop invariant, which means that it should dominate the
   // header.  This allows us to insert code for it in the preheader.
@@ -1093,9 +1176,12 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
 
   bool Changed = false;
   const SCEV *StrStart = StoreEv->getStart();
-  unsigned StrAS = SI->getPointerAddressSpace();
+  unsigned StrAS = DestPtr->getType()->getPointerAddressSpace();
   Type *IntIdxTy = Builder.getIntNTy(DL->getIndexSizeInBits(StrAS));
 
+  APInt Stride = getStoreStride(StoreEv);
+  bool NegStride = StoreSize == -Stride;
+
   // Handle negative strided loops.
   if (NegStride)
     StrStart = getStartForNegStride(StrStart, BECount, IntIdxTy, StoreSize, SE);
@@ -1119,13 +1205,26 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
   Changed = true;
 
   SmallPtrSet<Instruction *, 1> Stores;
-  Stores.insert(SI);
+  Stores.insert(TheStore);
+
+  bool IsMemCpy = isa<MemCpyInst>(TheStore);
+  const std::string InstRemark = IsMemCpy ? "memcpy" : "load and store";
+
   if (mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop, BECount,
-                            StoreSize, *AA, Stores))
+                            StoreSize, *AA, Stores)) {
+    ORE.emit([&]() {
+      return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessStore",
+                                      TheStore)
+             << ore::NV("Inst", InstRemark) << " in "
+             << ore::NV("Function", TheStore->getFunction())
+             << " function will not be hoisted: "
+             << ore::NV("Reason", "The loop may access store location");
+    });
     return Changed;
+  }
 
   const SCEV *LdStart = LoadEv->getStart();
-  unsigned LdAS = LI->getPointerAddressSpace();
+  unsigned LdAS = SourcePtr->getType()->getPointerAddressSpace();
 
   // Handle negative strided loops.
   if (NegStride)
@@ -1136,9 +1235,21 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
   Value *LoadBasePtr = Expander.expandCodeFor(
       LdStart, Builder.getInt8PtrTy(LdAS), Preheader->getTerminator());
 
+  // If the store is a memcpy instruction, we must check if it will write to
+  // the load memory locations. So remove it from the ignored stores.
+  if (IsMemCpy)
+    Stores.erase(TheStore);
   if (mayLoopAccessLocation(LoadBasePtr, ModRefInfo::Mod, CurLoop, BECount,
-                            StoreSize, *AA, Stores))
+                            StoreSize, *AA, Stores)) {
+    ORE.emit([&]() {
+      return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessLoad", TheLoad)
+             << ore::NV("Inst", InstRemark) << " in "
+             << ore::NV("Function", TheStore->getFunction())
+             << " function will not be hoisted: "
+             << ore::NV("Reason", "The loop may access load location");
+    });
     return Changed;
+  }
 
   if (avoidLIRForMultiBlockLoop())
     return Changed;
@@ -1155,15 +1266,15 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
   // Check whether to generate an unordered atomic memcpy:
   //  If the load or store are atomic, then they must necessarily be unordered
   //  by previous checks.
-  if (!SI->isAtomic() && !LI->isAtomic())
-    NewCall = Builder.CreateMemCpy(StoreBasePtr, SI->getAlign(), LoadBasePtr,
-                                   LI->getAlign(), NumBytes);
+  if (!TheStore->isAtomic() && !TheLoad->isAtomic())
+    NewCall = Builder.CreateMemCpy(StoreBasePtr, StoreAlign, LoadBasePtr,
+                                   LoadAlign, NumBytes);
   else {
     // We cannot allow unaligned ops for unordered load/store, so reject
     // anything where the alignment isn't at least the element size.
-    const Align StoreAlign = SI->getAlign();
-    const Align LoadAlign = LI->getAlign();
-    if (StoreAlign < StoreSize || LoadAlign < StoreSize)
+    assert((StoreAlign.hasValue() && LoadAlign.hasValue()) &&
+           "Expect unordered load/store to have align.");
+    if (StoreAlign.getValue() < StoreSize || LoadAlign.getValue() < StoreSize)
       return Changed;
 
     // If the element.atomic memcpy is not lowered into explicit
@@ -1177,10 +1288,10 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
     // Note that unordered atomic loads/stores are *required* by the spec to
     // have an alignment but non-atomic loads/stores may not.
     NewCall = Builder.CreateElementUnorderedAtomicMemCpy(
-        StoreBasePtr, StoreAlign, LoadBasePtr, LoadAlign, NumBytes,
-        StoreSize);
+        StoreBasePtr, StoreAlign.getValue(), LoadBasePtr, LoadAlign.getValue(),
+        NumBytes, StoreSize);
   }
-  NewCall->setDebugLoc(SI->getDebugLoc());
+  NewCall->setDebugLoc(TheStore->getDebugLoc());
 
   if (MSSAU) {
     MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
@@ -1189,8 +1300,9 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
   }
 
   LLVM_DEBUG(dbgs() << "  Formed memcpy: " << *NewCall << "\n"
-                    << "    from load ptr=" << *LoadEv << " at: " << *LI << "\n"
-                    << "    from store ptr=" << *StoreEv << " at: " << *SI
+                    << "    from load ptr=" << *LoadEv << " at: " << *TheLoad
+                    << "\n"
+                    << "    from store ptr=" << *StoreEv << " at: " << *TheStore
                     << "\n");
 
   ORE.emit([&]() {
@@ -1198,14 +1310,16 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
                               NewCall->getDebugLoc(), Preheader)
            << "Formed a call to "
            << ore::NV("NewFunction", NewCall->getCalledFunction())
-           << "() function";
+           << "() intrinsic from " << ore::NV("Inst", InstRemark)
+           << " instruction in " << ore::NV("Function", TheStore->getFunction())
+           << " function";
   });
 
   // Okay, the memcpy has been formed.  Zap the original store and anything that
   // feeds into it.
   if (MSSAU)
-    MSSAU->removeMemoryAccess(SI, true);
-  deleteDeadInstruction(SI);
+    MSSAU->removeMemoryAccess(TheStore, true);
+  deleteDeadInstruction(TheStore);
   if (MSSAU && VerifyMemorySSA)
     MSSAU->getMemorySSA()->verifyMemorySSA();
   ++NumMemCpy;
diff --git a/llvm/test/Transforms/LoopIdiom/memcpy-debugify-remarks.ll b/llvm/test/Transforms/LoopIdiom/memcpy-debugify-remarks.ll
index 3578540..6f817f2 100644
--- a/llvm/test/Transforms/LoopIdiom/memcpy-debugify-remarks.ll
+++ b/llvm/test/Transforms/LoopIdiom/memcpy-debugify-remarks.ll
@@ -6,7 +6,7 @@ target triple = "x86_64-unknown-linux-gnu"
 
 ; Check that everything still works when debuginfo is present, and that it is reasonably propagated.
 
-; CHECK: remark: <stdin>:6:1: Formed a call to llvm.memcpy.p0i8.p0i8.i64() function
+; CHECK: remark: <stdin>:6:1: Formed a call to llvm.memcpy.p0i8.p0i8.i64() intrinsic from load and store instruction in test6_dest_align function
 
 define void @test6_dest_align(i32* noalias align 1 %Base, i32* noalias align 4 %Dest, i64 %Size) nounwind ssp {
 ; CHECK-LABEL: @test6_dest_align(
diff --git a/llvm/test/Transforms/LoopIdiom/memcpy-intrinsic.ll b/llvm/test/Transforms/LoopIdiom/memcpy-intrinsic.ll
new file mode 100644
index 0000000..bb0d68e
--- /dev/null
+++ b/llvm/test/Transforms/LoopIdiom/memcpy-intrinsic.ll
@@ -0,0 +1,309 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -loop-idiom < %s -S | FileCheck %s
+
+%struct.S = type { i32, i32, i8 }
+
+; unsigned copy_noalias(S* __restrict a, S *b, int n) {
+;   for (int i = 0; i < n; i++) {
+;     a[i] = b[i];
+;   }
+;   return sizeof(a[0]);
+; }
+
+; Function Attrs: nofree nounwind uwtable mustprogress
+define dso_local i32 @copy_noalias(%struct.S* noalias nocapture %a, %struct.S* nocapture readonly %b, i32 %n) local_unnamed_addr #0 {
+; CHECK-LABEL: @copy_noalias(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A1:%.*]] = bitcast %struct.S* [[A:%.*]] to i8*
+; CHECK-NEXT:    [[B2:%.*]] = bitcast %struct.S* [[B:%.*]] to i8*
+; CHECK-NEXT:    [[CMP7:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw nsw i64 [[TMP0]], 12
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[A1]], i8* align 4 [[B2]], i64 [[TMP1]], i1 false)
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret i32 12
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[IDXPROM:%.*]] = zext i32 [[I_08]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], %struct.S* [[B]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[A]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.S* [[ARRAYIDX2]] to i8*
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast %struct.S* [[ARRAYIDX]] to i8*
+; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_08]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[INC]], [[N]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
+;
+entry:
+  %cmp7 = icmp sgt i32 %n, 0
+  br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret i32 12
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %i.08 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %idxprom = zext i32 %i.08 to i64
+  %arrayidx = getelementptr inbounds %struct.S, %struct.S* %b, i64 %idxprom
+  %arrayidx2 = getelementptr inbounds %struct.S, %struct.S* %a, i64 %idxprom
+  %0 = bitcast %struct.S* %arrayidx2 to i8*
+  %1 = bitcast %struct.S* %arrayidx to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 4 dereferenceable(12) %0, i8* nonnull align 4 dereferenceable(12) %1, i64 12, i1 false)
+  %inc = add nuw nsw i32 %i.08, 1
+  %cmp = icmp slt i32 %inc, %n
+  br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit
+}
+
+; unsigned copy_may_alias(S *a, S *b, int n) {
+;   for (int i = 0; i < n; i++) {
+;     a[i] = b[i];
+;   }
+;   return sizeof(a[0]);
+; }
+
+; Function Attrs: nofree nounwind uwtable mustprogress
+define dso_local i32 @copy_may_alias(%struct.S* nocapture %a, %struct.S* nocapture readonly %b, i32 %n) local_unnamed_addr #0 {
+; CHECK-LABEL: @copy_may_alias(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP7:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret i32 12
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[IDXPROM:%.*]] = zext i32 [[I_08]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], %struct.S* [[B:%.*]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[A:%.*]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.S* [[ARRAYIDX2]] to i8*
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.S* [[ARRAYIDX]] to i8*
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 4 dereferenceable(12) [[TMP0]], i8* nonnull align 4 dereferenceable(12) [[TMP1]], i64 12, i1 false)
+; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_08]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[INC]], [[N]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
+;
+entry:
+  %cmp7 = icmp sgt i32 %n, 0
+  br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret i32 12
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %i.08 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %idxprom = zext i32 %i.08 to i64
+  %arrayidx = getelementptr inbounds %struct.S, %struct.S* %b, i64 %idxprom
+  %arrayidx2 = getelementptr inbounds %struct.S, %struct.S* %a, i64 %idxprom
+  %0 = bitcast %struct.S* %arrayidx2 to i8*
+  %1 = bitcast %struct.S* %arrayidx to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 4 dereferenceable(12) %0, i8* nonnull align 4 dereferenceable(12) %1, i64 12, i1 false)
+  %inc = add nuw nsw i32 %i.08, 1
+  %cmp = icmp slt i32 %inc, %n
+  br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit
+}
+
+%struct.R = type <{ i8, i32, i8 }>
+
+; void copy_noalias_read(S* __restrict x, S* __restrict y, int n, int &s) {
+;   for (int i = 0; i < n; i++) {
+;     x[i] = y[i];
+;     s += y[i].b;
+;   }
+; }
+
+; Function Attrs: nofree nounwind uwtable mustprogress
+define dso_local void @copy_noalias_read(%struct.R* noalias nocapture %x, %struct.R* noalias nocapture readonly %y, i32 %n, i32* nocapture nonnull align 4 dereferenceable(4) %s) local_unnamed_addr #0 {
+; CHECK-LABEL: @copy_noalias_read(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[X1:%.*]] = bitcast %struct.R* [[X:%.*]] to i8*
+; CHECK-NEXT:    [[Y2:%.*]] = bitcast %struct.R* [[Y:%.*]] to i8*
+; CHECK-NEXT:    [[CMP11:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP11]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.body.lr.ph:
+; CHECK-NEXT:    [[S_PROMOTED:%.*]] = load i32, i32* [[S:%.*]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw nsw i64 [[TMP0]], 6
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 [[X1]], i8* align 1 [[Y2]], i64 [[TMP1]], i1 false)
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.cond.for.cond.cleanup_crit_edge:
+; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    store i32 [[ADD_LCSSA]], i32* [[S]], align 4
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+; CHECK:       for.body:
+; CHECK-NEXT:    [[ADD13:%.*]] = phi i32 [ [[S_PROMOTED]], [[FOR_BODY_LR_PH]] ], [ [[ADD]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[I_012:%.*]] = phi i32 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[IDXPROM:%.*]] = zext i32 [[I_012]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_R:%.*]], %struct.R* [[X]], i64 [[IDXPROM]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_R]], %struct.R* [[Y]], i64 [[IDXPROM]], i32 0
+; CHECK-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT_R]], %struct.R* [[Y]], i64 [[IDXPROM]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[B]], align 1
+; CHECK-NEXT:    [[ADD]] = add nsw i32 [[ADD13]], [[TMP4]]
+; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_012]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[INC]], [[N]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_FOR_COND_CLEANUP_CRIT_EDGE:%.*]]
+;
+entry:
+  %cmp11 = icmp sgt i32 %n, 0
+  br i1 %cmp11, label %for.body.lr.ph, label %for.cond.cleanup
+
+for.body.lr.ph:                                   ; preds = %entry
+  %s.promoted = load i32, i32* %s, align 4
+  br label %for.body
+
+for.cond.for.cond.cleanup_crit_edge:              ; preds = %for.body
+  %add.lcssa = phi i32 [ %add, %for.body ]
+  store i32 %add.lcssa, i32* %s, align 4
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.for.cond.cleanup_crit_edge, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.lr.ph, %for.body
+  %add13 = phi i32 [ %s.promoted, %for.body.lr.ph ], [ %add, %for.body ]
+  %i.012 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+  %idxprom = zext i32 %i.012 to i64
+  %0 = getelementptr inbounds %struct.R, %struct.R* %x, i64 %idxprom, i32 0
+  %1 = getelementptr inbounds %struct.R, %struct.R* %y, i64 %idxprom, i32 0
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 1 dereferenceable(6) %0, i8* nonnull align 1 dereferenceable(6) %1, i64 6, i1 false)
+  %b = getelementptr inbounds %struct.R, %struct.R* %y, i64 %idxprom, i32 1
+  %2 = load i32, i32* %b, align 1
+  %add = add nsw i32 %add13, %2
+  %inc = add nuw nsw i32 %i.012, 1
+  %cmp = icmp slt i32 %inc, %n
+  br i1 %cmp, label %for.body, label %for.cond.for.cond.cleanup_crit_edge
+}
+
+%struct.SPacked = type <{ i32, i32, i8 }>
+
+; Function Attrs: nofree nounwind uwtable mustprogress
+define dso_local i32 @copy_noalias_packed(%struct.SPacked* noalias nocapture %a, %struct.SPacked* nocapture readonly %b, i32 %n) local_unnamed_addr #0 {
+; CHECK-LABEL: @copy_noalias_packed(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A1:%.*]] = bitcast %struct.SPacked* [[A:%.*]] to i8*
+; CHECK-NEXT:    [[B2:%.*]] = bitcast %struct.SPacked* [[B:%.*]] to i8*
+; CHECK-NEXT:    [[CMP7:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw nsw i64 [[TMP0]], 9
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 [[A1]], i8* align 1 [[B2]], i64 [[TMP1]], i1 false)
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret i32 9
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[IDXPROM:%.*]] = zext i32 [[I_08]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_SPACKED:%.*]], %struct.SPacked* [[B]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [[STRUCT_SPACKED]], %struct.SPacked* [[A]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.SPacked* [[ARRAYIDX2]] to i8*
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast %struct.SPacked* [[ARRAYIDX]] to i8*
+; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_08]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[INC]], [[N]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
+;
+entry:
+  %cmp7 = icmp sgt i32 %n, 0
+  br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret i32 9
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %i.08 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %idxprom = zext i32 %i.08 to i64
+  %arrayidx = getelementptr inbounds %struct.SPacked, %struct.SPacked* %b, i64 %idxprom
+  %arrayidx2 = getelementptr inbounds %struct.SPacked, %struct.SPacked* %a, i64 %idxprom
+  %0 = bitcast %struct.SPacked* %arrayidx2 to i8*
+  %1 = bitcast %struct.SPacked* %arrayidx to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 1 dereferenceable(9) %0, i8* nonnull align 1 dereferenceable(9) %1, i64 9, i1 false)
+  %inc = add nuw nsw i32 %i.08, 1
+  %cmp = icmp slt i32 %inc, %n
+  br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit
+}
+
+%struct.SAligned = type { i32, i32, i8, [7 x i8] }
+
+define dso_local i32 @copy_noalias_aligned(%struct.SAligned* noalias nocapture %a, %struct.SAligned* nocapture readonly %b, i32 %n) local_unnamed_addr #0 {
+; CHECK-LABEL: @copy_noalias_aligned(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A1:%.*]] = bitcast %struct.SAligned* [[A:%.*]] to i8*
+; CHECK-NEXT:    [[B2:%.*]] = bitcast %struct.SAligned* [[B:%.*]] to i8*
+; CHECK-NEXT:    [[CMP7:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[A1]], i8* align 16 [[B2]], i64 [[TMP1]], i1 false)
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret i32 16
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[IDXPROM:%.*]] = zext i32 [[I_08]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_SALIGNED:%.*]], %struct.SAligned* [[B]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [[STRUCT_SALIGNED]], %struct.SAligned* [[A]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.SAligned* [[ARRAYIDX2]] to i8*
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast %struct.SAligned* [[ARRAYIDX]] to i8*
+; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_08]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[INC]], [[N]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
+;
+entry:
+  %cmp7 = icmp sgt i32 %n, 0
+  br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret i32 16
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %i.08 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %idxprom = zext i32 %i.08 to i64
+  %arrayidx = getelementptr inbounds %struct.SAligned, %struct.SAligned* %b, i64 %idxprom
+  %arrayidx2 = getelementptr inbounds %struct.SAligned, %struct.SAligned* %a, i64 %idxprom
+  %0 = bitcast %struct.SAligned* %arrayidx2 to i8*
+  %1 = bitcast %struct.SAligned* %arrayidx to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 16 dereferenceable(16) %0, i8* nonnull align 16 dereferenceable(16) %1, i64 16, i1 false)
+  %inc = add nuw nsw i32 %i.08, 1
+  %cmp = icmp slt i32 %inc, %n
+  br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit
+}
+
+; Function Attrs: argmemonly nofree nosync nounwind willreturn
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) #1
diff --git a/llvm/test/Transforms/LoopIdiom/memset-debugify-remarks.ll b/llvm/test/Transforms/LoopIdiom/memset-debugify-remarks.ll
index 06e17fe..b7a866f 100644
--- a/llvm/test/Transforms/LoopIdiom/memset-debugify-remarks.ll
+++ b/llvm/test/Transforms/LoopIdiom/memset-debugify-remarks.ll
@@ -11,7 +11,7 @@ target triple = "x86_64-unknown-linux-gnu"
 ;     *begin = value;
 ; }
 
-; CHECK: remark: <stdin>:4:1: Transformed loop-strided store into a call to llvm.memset.p0i8.i64() function
+; CHECK: remark: <stdin>:4:1: Transformed loop-strided store in _Z15my_basic_memsetPcS_c function into a call to llvm.memset.p0i8.i64() intrinsic
 
 define void @_Z15my_basic_memsetPcS_c(i8* %ptr, i8* %end, i8 %value) {
 ; CHECK-LABEL: @_Z15my_basic_memsetPcS_c(
-- 
2.7.4