LowerMemIntrinsics: Handle inserting addrspacecast for memmove lowering

author Matt Arsenault <Matthew.Arsenault@amd.com>

Sat, 10 Jun 2023 16:20:43 +0000 (12:20 -0400)

committer Matt Arsenault <Matthew.Arsenault@amd.com>

Tue, 13 Jun 2023 01:10:30 +0000 (21:10 -0400)
author Matt Arsenault <Matthew.Arsenault@amd.com>
Sat, 10 Jun 2023 16:20:43 +0000 (12:20 -0400)
committer Matt Arsenault <Matthew.Arsenault@amd.com>
Tue, 13 Jun 2023 01:10:30 +0000 (21:10 -0400)
diff --git a/llvm/include/llvm/Transforms/Utils/LowerMemIntrinsics.h b/llvm/include/llvm/Transforms/Utils/LowerMemIntrinsics.h

index 4bf4df0..3144353 100644 (file)
--- a/llvm/include/llvm/Transforms/Utils/LowerMemIntrinsics.h
+++ b/llvm/include/llvm/Transforms/Utils/LowerMemIntrinsics.h
@@ -52,7 +52,7 @@ void expandMemCpyAsLoop(MemCpyInst *MemCpy, const TargetTransformInfo &TTI,
  
  /// Expand \p MemMove as a loop. \p MemMove is not deleted. Returns true if the
  /// memmove was lowered.
-bool expandMemMoveAsLoop(MemMoveInst *MemMove);
+bool expandMemMoveAsLoop(MemMoveInst *MemMove, const TargetTransformInfo &TTI);
  
  /// Expand \p MemSet as a loop. \p MemSet is not deleted.
  void expandMemSetAsLoop(MemSetInst *MemSet);
diff --git a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp

index 0940c34..3448c56 100644 (file)
--- a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
+++ b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
@@ -222,7 +222,7 @@ bool PreISelIntrinsicLowering::expandMemIntrinsicUses(Function &F) const {
              LookupLibInfo(*ParentFunc).has(LibFunc_memmove))
            break;
  
-        if (expandMemMoveAsLoop(Memmove)) {
+        if (expandMemMoveAsLoop(Memmove, TTI)) {
            Changed = true;
            Memmove->eraseFromParent();
          }
diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp

index f57c292..6ee4b16 100644 (file)
--- a/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
@@ -127,7 +127,7 @@ bool NVPTXLowerAggrCopies::runOnFunction(Function &F) {
      if (MemCpyInst *Memcpy = dyn_cast<MemCpyInst>(MemCall)) {
        expandMemCpyAsLoop(Memcpy, TTI);
      } else if (MemMoveInst *Memmove = dyn_cast<MemMoveInst>(MemCall)) {
-      expandMemMoveAsLoop(Memmove);
+      expandMemMoveAsLoop(Memmove, TTI);
      } else if (MemSetInst *Memset = dyn_cast<MemSetInst>(MemCall)) {
        expandMemSetAsLoop(Memset);
      }
diff --git a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp

index df15ec7..d06f7db 100644 (file)
--- a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
+++ b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
@@ -379,7 +379,8 @@ void llvm::createMemCpyLoopUnknownSize(
  static bool createMemMoveLoop(Instruction *InsertBefore, Value *SrcAddr,
                                Value *DstAddr, Value *CopyLen, Align SrcAlign,
                                Align DstAlign, bool SrcIsVolatile,
-                              bool DstIsVolatile) {
+                              bool DstIsVolatile,
+                              const TargetTransformInfo &TTI) {
    Type *TypeOfCopyLen = CopyLen->getType();
    BasicBlock *OrigBB = InsertBefore->getParent();
    Function *F = OrigBB->getParent();
@@ -389,14 +390,24 @@ static bool createMemMoveLoop(Instruction *InsertBefore, Value *SrcAddr,
    IRBuilder<> CastBuilder(InsertBefore);
    Type *EltTy = CastBuilder.getInt8Ty();
  
-  // FIXME: We don't know generically if it's legal to introduce an
-  // addrspacecast. We need to know either if it's legal to insert an
-  // addrspacecast, or if the address spaces cannot alias.
-  if (SrcAddr->getType()->getPointerAddressSpace() !=
-      DstAddr->getType()->getPointerAddressSpace()) {
-    LLVM_DEBUG(dbgs() << "Do not know how to expand memmove between different "
-                         "address spaces\n");
-    return false;
+  unsigned SrcAS = SrcAddr->getType()->getPointerAddressSpace();
+  unsigned DstAS = DstAddr->getType()->getPointerAddressSpace();
+  if (SrcAS != DstAS) {
+    if (TTI.isValidAddrSpaceCast(DstAS, SrcAS))
+      DstAddr = CastBuilder.CreateAddrSpaceCast(DstAddr, SrcAddr->getType());
+    else if (TTI.isValidAddrSpaceCast(SrcAS, DstAS))
+      SrcAddr = CastBuilder.CreateAddrSpaceCast(SrcAddr, DstAddr->getType());
+    else {
+      // Neither direction of addrspacecast is known to be valid here, so the
+      // pointers cannot be compared in a common address space. Expanding
+      // would require knowing that the address spaces cannot alias.
+      //
+      // TODO: Check if address spaces cannot alias and lower as memcpy.
+      LLVM_DEBUG(
+          dbgs() << "Do not know how to expand memmove between different "
+                    "address spaces\n");
+      return false;
+    }
    }
  
    // Create a comparison of src and dst, based on which we jump to either
@@ -562,7 +573,8 @@ void llvm::expandMemCpyAsLoop(MemCpyInst *Memcpy,
    }
  }
  
-bool llvm::expandMemMoveAsLoop(MemMoveInst *Memmove) {
+bool llvm::expandMemMoveAsLoop(MemMoveInst *Memmove,
+                               const TargetTransformInfo &TTI) {
    return createMemMoveLoop(
        /* InsertBefore */ Memmove,
        /* SrcAddr */ Memmove->getRawSource(),
@@ -571,7 +583,7 @@ bool llvm::expandMemMoveAsLoop(MemMoveInst *Memmove) {
        /* SrcAlign */ Memmove->getSourceAlign().valueOrOne(),
        /* DestAlign */ Memmove->getDestAlign().valueOrOne(),
        /* SrcIsVolatile */ Memmove->isVolatile(),
-      /* DstIsVolatile */ Memmove->isVolatile());
+      /* DstIsVolatile */ Memmove->isVolatile(), TTI);
  }
  
  void llvm::expandMemSetAsLoop(MemSetInst *Memset) {
diff --git a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll

index c2fbe2d..0573d1e 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
@@ -17,6 +17,9 @@ declare void @llvm.memmove.p5.p1.i64(ptr addrspace(5) nocapture writeonly, ptr a
  declare void @llvm.memmove.p1.p5.i64(ptr addrspace(1) nocapture writeonly, ptr addrspace(5) nocapture readonly, i64, i1 immarg) #1
  declare void @llvm.memmove.p0.p5.i64(ptr nocapture writeonly, ptr addrspace(5) nocapture readonly, i64, i1 immarg) #1
  declare void @llvm.memmove.p5.p0.i64(ptr addrspace(5) nocapture writeonly, ptr nocapture readonly, i64, i1 immarg) #1
+declare void @llvm.memmove.p1.p999.i64(ptr addrspace(1) nocapture writeonly, ptr addrspace(999) nocapture readonly, i64, i1 immarg) #1
+declare void @llvm.memmove.p999.p1.i64(ptr addrspace(999) nocapture writeonly, ptr addrspace(1) nocapture readonly, i64, i1 immarg) #1
+declare void @llvm.memmove.p999.p998.i64(ptr addrspace(999) nocapture writeonly, ptr addrspace(998) nocapture readonly, i64, i1 immarg) #1
  
  declare void @llvm.memset.p1.i64(ptr addrspace(1) nocapture, i8, i64, i1) #1
  
@@ -1314,36 +1317,156 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1(ptr addrspace(1)
  }
  
  define amdgpu_kernel void @memmove_flat_align1_global_align1(ptr %dst, ptr addrspace(1) %src) {
-; OPT-LABEL: @memmove_flat_align1_global_align1(
-; OPT-NEXT:    call void @llvm.memmove.p0.p1.i64(ptr [[DST:%.*]], ptr addrspace(1) [[SRC:%.*]], i64 256, i1 false)
-; OPT-NEXT:    ret void
+; MAX1024-LABEL: @memmove_flat_align1_global_align1(
+; MAX1024-NEXT:    call void @llvm.memmove.p0.p1.i64(ptr [[DST:%.*]], ptr addrspace(1) [[SRC:%.*]], i64 256, i1 false)
+; MAX1024-NEXT:    ret void
+;
+; ALL-LABEL: @memmove_flat_align1_global_align1(
+; ALL-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[SRC:%.*]] to ptr
+; ALL-NEXT:    [[COMPARE_SRC_DST:%.*]] = icmp ult ptr [[TMP1]], [[DST:%.*]]
+; ALL-NEXT:    [[COMPARE_N_TO_0:%.*]] = icmp eq i64 256, 0
+; ALL-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[COPY_BACKWARDS:%.*]], label [[COPY_FORWARD:%.*]]
+; ALL:       copy_backwards:
+; ALL-NEXT:    br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE:%.*]], label [[COPY_BACKWARDS_LOOP:%.*]]
+; ALL:       copy_backwards_loop:
+; ALL-NEXT:    [[TMP2:%.*]] = phi i64 [ [[INDEX_PTR:%.*]], [[COPY_BACKWARDS_LOOP]] ], [ 256, [[COPY_BACKWARDS]] ]
+; ALL-NEXT:    [[INDEX_PTR]] = sub i64 [[TMP2]], 1
+; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[INDEX_PTR]]
+; ALL-NEXT:    [[ELEMENT:%.*]] = load i8, ptr [[TMP3]], align 1
+; ALL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX_PTR]]
+; ALL-NEXT:    store i8 [[ELEMENT]], ptr [[TMP4]], align 1
+; ALL-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_PTR]], 0
+; ALL-NEXT:    br i1 [[TMP5]], label [[MEMMOVE_DONE]], label [[COPY_BACKWARDS_LOOP]]
+; ALL:       copy_forward:
+; ALL-NEXT:    br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP:%.*]]
+; ALL:       copy_forward_loop:
+; ALL-NEXT:    [[INDEX_PTR1:%.*]] = phi i64 [ [[INDEX_INCREMENT:%.*]], [[COPY_FORWARD_LOOP]] ], [ 0, [[COPY_FORWARD]] ]
+; ALL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[INDEX_PTR1]]
+; ALL-NEXT:    [[ELEMENT2:%.*]] = load i8, ptr [[TMP6]], align 1
+; ALL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX_PTR1]]
+; ALL-NEXT:    store i8 [[ELEMENT2]], ptr [[TMP7]], align 1
+; ALL-NEXT:    [[INDEX_INCREMENT]] = add i64 [[INDEX_PTR1]], 1
+; ALL-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_INCREMENT]], 256
+; ALL-NEXT:    br i1 [[TMP8]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP]]
+; ALL:       memmove_done:
+; ALL-NEXT:    ret void
  ;
    call void @llvm.memmove.p0.p1.i64(ptr %dst, ptr addrspace(1) %src, i64 256, i1 false)
    ret void
  }
  
  define amdgpu_kernel void @memmove_global_align1_flat_align1(ptr addrspace(1) %dst, ptr %src) {
-; OPT-LABEL: @memmove_global_align1_flat_align1(
-; OPT-NEXT:    call void @llvm.memmove.p1.p0.i64(ptr addrspace(1) [[DST:%.*]], ptr [[SRC:%.*]], i64 256, i1 false)
-; OPT-NEXT:    ret void
+; MAX1024-LABEL: @memmove_global_align1_flat_align1(
+; MAX1024-NEXT:    call void @llvm.memmove.p1.p0.i64(ptr addrspace(1) [[DST:%.*]], ptr [[SRC:%.*]], i64 256, i1 false)
+; MAX1024-NEXT:    ret void
+;
+; ALL-LABEL: @memmove_global_align1_flat_align1(
+; ALL-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[DST:%.*]] to ptr
+; ALL-NEXT:    [[COMPARE_SRC_DST:%.*]] = icmp ult ptr [[SRC:%.*]], [[TMP1]]
+; ALL-NEXT:    [[COMPARE_N_TO_0:%.*]] = icmp eq i64 256, 0
+; ALL-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[COPY_BACKWARDS:%.*]], label [[COPY_FORWARD:%.*]]
+; ALL:       copy_backwards:
+; ALL-NEXT:    br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE:%.*]], label [[COPY_BACKWARDS_LOOP:%.*]]
+; ALL:       copy_backwards_loop:
+; ALL-NEXT:    [[TMP2:%.*]] = phi i64 [ [[INDEX_PTR:%.*]], [[COPY_BACKWARDS_LOOP]] ], [ 256, [[COPY_BACKWARDS]] ]
+; ALL-NEXT:    [[INDEX_PTR]] = sub i64 [[TMP2]], 1
+; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX_PTR]]
+; ALL-NEXT:    [[ELEMENT:%.*]] = load i8, ptr [[TMP3]], align 1
+; ALL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[INDEX_PTR]]
+; ALL-NEXT:    store i8 [[ELEMENT]], ptr [[TMP4]], align 1
+; ALL-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_PTR]], 0
+; ALL-NEXT:    br i1 [[TMP5]], label [[MEMMOVE_DONE]], label [[COPY_BACKWARDS_LOOP]]
+; ALL:       copy_forward:
+; ALL-NEXT:    br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP:%.*]]
+; ALL:       copy_forward_loop:
+; ALL-NEXT:    [[INDEX_PTR1:%.*]] = phi i64 [ [[INDEX_INCREMENT:%.*]], [[COPY_FORWARD_LOOP]] ], [ 0, [[COPY_FORWARD]] ]
+; ALL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX_PTR1]]
+; ALL-NEXT:    [[ELEMENT2:%.*]] = load i8, ptr [[TMP6]], align 1
+; ALL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[INDEX_PTR1]]
+; ALL-NEXT:    store i8 [[ELEMENT2]], ptr [[TMP7]], align 1
+; ALL-NEXT:    [[INDEX_INCREMENT]] = add i64 [[INDEX_PTR1]], 1
+; ALL-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_INCREMENT]], 256
+; ALL-NEXT:    br i1 [[TMP8]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP]]
+; ALL:       memmove_done:
+; ALL-NEXT:    ret void
  ;
    call void @llvm.memmove.p1.p0.i64(ptr addrspace(1) %dst, ptr %src, i64 256, i1 false)
    ret void
  }
  
  define amdgpu_kernel void @memmove_flat_align1_private_align1(ptr %dst, ptr addrspace(5) %src) {
-; OPT-LABEL: @memmove_flat_align1_private_align1(
-; OPT-NEXT:    call void @llvm.memmove.p0.p5.i64(ptr [[DST:%.*]], ptr addrspace(5) [[SRC:%.*]], i64 256, i1 false)
-; OPT-NEXT:    ret void
+; MAX1024-LABEL: @memmove_flat_align1_private_align1(
+; MAX1024-NEXT:    call void @llvm.memmove.p0.p5.i64(ptr [[DST:%.*]], ptr addrspace(5) [[SRC:%.*]], i64 256, i1 false)
+; MAX1024-NEXT:    ret void
+;
+; ALL-LABEL: @memmove_flat_align1_private_align1(
+; ALL-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(5) [[SRC:%.*]] to ptr
+; ALL-NEXT:    [[COMPARE_SRC_DST:%.*]] = icmp ult ptr [[TMP1]], [[DST:%.*]]
+; ALL-NEXT:    [[COMPARE_N_TO_0:%.*]] = icmp eq i64 256, 0
+; ALL-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[COPY_BACKWARDS:%.*]], label [[COPY_FORWARD:%.*]]
+; ALL:       copy_backwards:
+; ALL-NEXT:    br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE:%.*]], label [[COPY_BACKWARDS_LOOP:%.*]]
+; ALL:       copy_backwards_loop:
+; ALL-NEXT:    [[TMP2:%.*]] = phi i64 [ [[INDEX_PTR:%.*]], [[COPY_BACKWARDS_LOOP]] ], [ 256, [[COPY_BACKWARDS]] ]
+; ALL-NEXT:    [[INDEX_PTR]] = sub i64 [[TMP2]], 1
+; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[INDEX_PTR]]
+; ALL-NEXT:    [[ELEMENT:%.*]] = load i8, ptr [[TMP3]], align 1
+; ALL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX_PTR]]
+; ALL-NEXT:    store i8 [[ELEMENT]], ptr [[TMP4]], align 1
+; ALL-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_PTR]], 0
+; ALL-NEXT:    br i1 [[TMP5]], label [[MEMMOVE_DONE]], label [[COPY_BACKWARDS_LOOP]]
+; ALL:       copy_forward:
+; ALL-NEXT:    br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP:%.*]]
+; ALL:       copy_forward_loop:
+; ALL-NEXT:    [[INDEX_PTR1:%.*]] = phi i64 [ [[INDEX_INCREMENT:%.*]], [[COPY_FORWARD_LOOP]] ], [ 0, [[COPY_FORWARD]] ]
+; ALL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[INDEX_PTR1]]
+; ALL-NEXT:    [[ELEMENT2:%.*]] = load i8, ptr [[TMP6]], align 1
+; ALL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX_PTR1]]
+; ALL-NEXT:    store i8 [[ELEMENT2]], ptr [[TMP7]], align 1
+; ALL-NEXT:    [[INDEX_INCREMENT]] = add i64 [[INDEX_PTR1]], 1
+; ALL-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_INCREMENT]], 256
+; ALL-NEXT:    br i1 [[TMP8]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP]]
+; ALL:       memmove_done:
+; ALL-NEXT:    ret void
  ;
    call void @llvm.memmove.p0.p5.i64(ptr %dst, ptr addrspace(5) %src, i64 256, i1 false)
    ret void
  }
  
  define amdgpu_kernel void @memmove_private_align1_flat_align1(ptr addrspace(5) %dst, ptr %src) {
-; OPT-LABEL: @memmove_private_align1_flat_align1(
-; OPT-NEXT:    call void @llvm.memmove.p5.p0.i64(ptr addrspace(5) [[DST:%.*]], ptr [[SRC:%.*]], i64 256, i1 false)
-; OPT-NEXT:    ret void
+; MAX1024-LABEL: @memmove_private_align1_flat_align1(
+; MAX1024-NEXT:    call void @llvm.memmove.p5.p0.i64(ptr addrspace(5) [[DST:%.*]], ptr [[SRC:%.*]], i64 256, i1 false)
+; MAX1024-NEXT:    ret void
+;
+; ALL-LABEL: @memmove_private_align1_flat_align1(
+; ALL-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(5) [[DST:%.*]] to ptr
+; ALL-NEXT:    [[COMPARE_SRC_DST:%.*]] = icmp ult ptr [[SRC:%.*]], [[TMP1]]
+; ALL-NEXT:    [[COMPARE_N_TO_0:%.*]] = icmp eq i64 256, 0
+; ALL-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[COPY_BACKWARDS:%.*]], label [[COPY_FORWARD:%.*]]
+; ALL:       copy_backwards:
+; ALL-NEXT:    br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE:%.*]], label [[COPY_BACKWARDS_LOOP:%.*]]
+; ALL:       copy_backwards_loop:
+; ALL-NEXT:    [[TMP2:%.*]] = phi i64 [ [[INDEX_PTR:%.*]], [[COPY_BACKWARDS_LOOP]] ], [ 256, [[COPY_BACKWARDS]] ]
+; ALL-NEXT:    [[INDEX_PTR]] = sub i64 [[TMP2]], 1
+; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX_PTR]]
+; ALL-NEXT:    [[ELEMENT:%.*]] = load i8, ptr [[TMP3]], align 1
+; ALL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[INDEX_PTR]]
+; ALL-NEXT:    store i8 [[ELEMENT]], ptr [[TMP4]], align 1
+; ALL-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_PTR]], 0
+; ALL-NEXT:    br i1 [[TMP5]], label [[MEMMOVE_DONE]], label [[COPY_BACKWARDS_LOOP]]
+; ALL:       copy_forward:
+; ALL-NEXT:    br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP:%.*]]
+; ALL:       copy_forward_loop:
+; ALL-NEXT:    [[INDEX_PTR1:%.*]] = phi i64 [ [[INDEX_INCREMENT:%.*]], [[COPY_FORWARD_LOOP]] ], [ 0, [[COPY_FORWARD]] ]
+; ALL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX_PTR1]]
+; ALL-NEXT:    [[ELEMENT2:%.*]] = load i8, ptr [[TMP6]], align 1
+; ALL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[INDEX_PTR1]]
+; ALL-NEXT:    store i8 [[ELEMENT2]], ptr [[TMP7]], align 1
+; ALL-NEXT:    [[INDEX_INCREMENT]] = add i64 [[INDEX_PTR1]], 1
+; ALL-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_INCREMENT]], 256
+; ALL-NEXT:    br i1 [[TMP8]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP]]
+; ALL:       memmove_done:
+; ALL-NEXT:    ret void
  ;
    call void @llvm.memmove.p5.p0.i64(ptr addrspace(5) %dst, ptr %src, i64 256, i1 false)
    ret void
@@ -1367,5 +1490,32 @@ define amdgpu_kernel void @memmove_global_align1_private_align1(ptr addrspace(1)
    ret void
  }
  
+define amdgpu_kernel void @memmove_global_align1_p999_align1(ptr addrspace(1) %dst, ptr addrspace(999) %src, i64 %size) {
+; OPT-LABEL: @memmove_global_align1_p999_align1(
+; OPT-NEXT:    call void @llvm.memmove.p1.p999.i64(ptr addrspace(1) [[DST:%.*]], ptr addrspace(999) [[SRC:%.*]], i64 [[SIZE:%.*]], i1 false)
+; OPT-NEXT:    ret void
+;
+  call void @llvm.memmove.p1.p999.i64(ptr addrspace(1) %dst, ptr addrspace(999) %src, i64 %size, i1 false)
+  ret void
+}
+
+define amdgpu_kernel void @memmove_p999_align1_p1_align1(ptr addrspace(999) %dst, ptr addrspace(1) %src, i64 %size) {
+; OPT-LABEL: @memmove_p999_align1_p1_align1(
+; OPT-NEXT:    call void @llvm.memmove.p999.p1.i64(ptr addrspace(999) [[DST:%.*]], ptr addrspace(1) [[SRC:%.*]], i64 [[SIZE:%.*]], i1 false)
+; OPT-NEXT:    ret void
+;
+  call void @llvm.memmove.p999.p1.i64(ptr addrspace(999) %dst, ptr addrspace(1) %src, i64 %size, i1 false)
+  ret void
+}
+
+define amdgpu_kernel void @memmove_p999_align1_p998_align1(ptr addrspace(999) %dst, ptr addrspace(998) %src, i64 %size) {
+; OPT-LABEL: @memmove_p999_align1_p998_align1(
+; OPT-NEXT:    call void @llvm.memmove.p999.p998.i64(ptr addrspace(999) [[DST:%.*]], ptr addrspace(998) [[SRC:%.*]], i64 [[SIZE:%.*]], i1 false)
+; OPT-NEXT:    ret void
+;
+  call void @llvm.memmove.p999.p998.i64(ptr addrspace(999) %dst, ptr addrspace(998) %src, i64 %size, i1 false)
+  ret void
+}
+
  attributes #0 = { nounwind }
  attributes #1 = { argmemonly nounwind }
author	Matt Arsenault <Matthew.Arsenault@amd.com>
	Sat, 10 Jun 2023 16:20:43 +0000 (12:20 -0400)
committer	Matt Arsenault <Matthew.Arsenault@amd.com>
	Tue, 13 Jun 2023 01:10:30 +0000 (21:10 -0400)
llvm/include/llvm/Transforms/Utils/LowerMemIntrinsics.h		patch \| blob \| history
llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp		patch \| blob \| history
llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp		patch \| blob \| history
llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp		patch \| blob \| history
llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll		patch \| blob \| history