From ee19fabc984747b0ce971d1d47662d89b63fa0ab Mon Sep 17 00:00:00 2001
From: Matt Arsenault
Date: Sat, 10 Jun 2023 12:20:43 -0400
Subject: [PATCH] LowerMemIntrinsics: Handle inserting addrspacecast for
 memmove lowering

If the target reports that an addrspacecast between the source and
destination address spaces is valid, cast one pointer into the other
pointer's address space so the copy-direction compare and the copy
loops operate in a single address space. We're still missing a trivial
non-AA way to check for non-aliasing address spaces, so other
mixed-address-space memmoves are left unexpanded.
---
 .../llvm/Transforms/Utils/LowerMemIntrinsics.h   |   2 +-
 llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp    |   2 +-
 llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp   |   2 +-
 llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp |  34 ++--
 llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll | 174 +++++++++++++++++++--
 5 files changed, 188 insertions(+), 26 deletions(-)

diff --git a/llvm/include/llvm/Transforms/Utils/LowerMemIntrinsics.h b/llvm/include/llvm/Transforms/Utils/LowerMemIntrinsics.h
index 4bf4df0..3144353 100644
--- a/llvm/include/llvm/Transforms/Utils/LowerMemIntrinsics.h
+++ b/llvm/include/llvm/Transforms/Utils/LowerMemIntrinsics.h
@@ -52,7 +52,7 @@ void expandMemCpyAsLoop(MemCpyInst *MemCpy, const TargetTransformInfo &TTI,
 
 /// Expand \p MemMove as a loop. \p MemMove is not deleted. Returns true if the
 /// memmove was lowered.
-bool expandMemMoveAsLoop(MemMoveInst *MemMove);
+bool expandMemMoveAsLoop(MemMoveInst *MemMove, const TargetTransformInfo &TTI);
 
 /// Expand \p MemSet as a loop. \p MemSet is not deleted.
 void expandMemSetAsLoop(MemSetInst *MemSet);
diff --git a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
index 0940c34..3448c56 100644
--- a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
+++ b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
@@ -222,7 +222,7 @@ bool PreISelIntrinsicLowering::expandMemIntrinsicUses(Function &F) const {
           LookupLibInfo(*ParentFunc).has(LibFunc_memmove))
         break;
 
-      if (expandMemMoveAsLoop(Memmove)) {
+      if (expandMemMoveAsLoop(Memmove, TTI)) {
         Changed = true;
         Memmove->eraseFromParent();
       }
diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
index f57c292..6ee4b16 100644
--- a/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
@@ -127,7 +127,7 @@ bool NVPTXLowerAggrCopies::runOnFunction(Function &F) {
     if (MemCpyInst *Memcpy = dyn_cast<MemCpyInst>(MemCall)) {
       expandMemCpyAsLoop(Memcpy, TTI);
     } else if (MemMoveInst *Memmove = dyn_cast<MemMoveInst>(MemCall)) {
-      expandMemMoveAsLoop(Memmove);
+      expandMemMoveAsLoop(Memmove, TTI);
    } else if (MemSetInst *Memset = dyn_cast<MemSetInst>(MemCall)) {
       expandMemSetAsLoop(Memset);
     }
diff --git a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
index df15ec7..d06f7db 100644
--- a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
+++ b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
@@ -379,7 +379,8 @@ void llvm::createMemCpyLoopUnknownSize(
 static bool createMemMoveLoop(Instruction *InsertBefore, Value *SrcAddr,
                               Value *DstAddr, Value *CopyLen, Align SrcAlign,
                               Align DstAlign, bool SrcIsVolatile,
-                              bool DstIsVolatile) {
+                              bool DstIsVolatile,
+                              const TargetTransformInfo &TTI) {
   Type *TypeOfCopyLen = CopyLen->getType();
   BasicBlock *OrigBB = InsertBefore->getParent();
   Function *F = OrigBB->getParent();
@@ -389,14 +390,24 @@ static bool createMemMoveLoop(Instruction *InsertBefore, Value *SrcAddr,
   IRBuilder<> CastBuilder(InsertBefore);
   Type *EltTy = CastBuilder.getInt8Ty();
 
-  // FIXME: We don't know generically if it's legal to introduce an
-  // addrspacecast. We need to know either if it's legal to insert an
-  // addrspacecast, or if the address spaces cannot alias.
-  if (SrcAddr->getType()->getPointerAddressSpace() !=
-      DstAddr->getType()->getPointerAddressSpace()) {
-    LLVM_DEBUG(dbgs() << "Do not know how to expand memmove between different "
-                         "address spaces\n");
-    return false;
+  unsigned SrcAS = SrcAddr->getType()->getPointerAddressSpace();
+  unsigned DstAS = DstAddr->getType()->getPointerAddressSpace();
+  if (SrcAS != DstAS) {
+    if (TTI.isValidAddrSpaceCast(DstAS, SrcAS))
+      DstAddr = CastBuilder.CreateAddrSpaceCast(DstAddr, SrcAddr->getType());
+    else if (TTI.isValidAddrSpaceCast(SrcAS, DstAS))
+      SrcAddr = CastBuilder.CreateAddrSpaceCast(SrcAddr, DstAddr->getType());
+    else {
+      // We don't know generically if it's legal to introduce an
+      // addrspacecast. We need to know either if it's legal to insert an
+      // addrspacecast, or if the address spaces cannot alias.
+      //
+      // TODO: Check if address spaces cannot alias and lower as memcpy.
+      LLVM_DEBUG(
+          dbgs() << "Do not know how to expand memmove between different "
+                    "address spaces\n");
+      return false;
+    }
   }
 
   // Create the a comparison of src and dst, based on which we jump to either
@@ -562,7 +573,8 @@ void llvm::expandMemCpyAsLoop(MemCpyInst *Memcpy,
   }
 }
 
-bool llvm::expandMemMoveAsLoop(MemMoveInst *Memmove) {
+bool llvm::expandMemMoveAsLoop(MemMoveInst *Memmove,
+                               const TargetTransformInfo &TTI) {
   return createMemMoveLoop(
       /* InsertBefore */ Memmove,
       /* SrcAddr */ Memmove->getRawSource(),
@@ -571,7 +583,7 @@ void llvm::expandMemCpyAsLoop(MemCpyInst *Memcpy,
       /* SrcAlign */ Memmove->getSourceAlign().valueOrOne(),
       /* DestAlign */ Memmove->getDestAlign().valueOrOne(),
       /* SrcIsVolatile */ Memmove->isVolatile(),
-      /* DstIsVolatile */ Memmove->isVolatile());
+      /* DstIsVolatile */ Memmove->isVolatile(), TTI);
 }
 
 void llvm::expandMemSetAsLoop(MemSetInst *Memset) {
diff --git a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
index c2fbe2d..0573d1e 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
@@ -17,6 +17,9 @@ declare void @llvm.memmove.p5.p1.i64(ptr addrspace(5) nocapture writeonly, ptr a
 declare void @llvm.memmove.p1.p5.i64(ptr addrspace(1) nocapture writeonly, ptr addrspace(5) nocapture readonly, i64, i1 immarg) #1
 declare void @llvm.memmove.p0.p5.i64(ptr nocapture writeonly, ptr addrspace(5) nocapture readonly, i64, i1 immarg) #1
 declare void @llvm.memmove.p5.p0.i64(ptr addrspace(5) nocapture writeonly, ptr nocapture readonly, i64, i1 immarg) #1
+declare void @llvm.memmove.p1.p999.i64(ptr addrspace(1) nocapture writeonly, ptr addrspace(999) nocapture readonly, i64, i1 immarg) #1
+declare void @llvm.memmove.p999.p1.i64(ptr addrspace(999) nocapture writeonly, ptr addrspace(1) nocapture readonly, i64, i1 immarg) #1
+declare void @llvm.memmove.p999.p998.i64(ptr addrspace(999) nocapture writeonly, ptr addrspace(998) nocapture readonly, i64, i1 immarg) #1
 
 declare void @llvm.memset.p1.i64(ptr addrspace(1) nocapture, i8, i64, i1) #1
 
@@ -1314,36 +1317,156 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1(ptr addrspace(1)
 }
 
 define amdgpu_kernel void @memmove_flat_align1_global_align1(ptr %dst, ptr addrspace(1) %src) {
-; OPT-LABEL: @memmove_flat_align1_global_align1(
-; OPT-NEXT: call void @llvm.memmove.p0.p1.i64(ptr [[DST:%.*]], ptr addrspace(1) [[SRC:%.*]], i64 256, i1 false)
-; OPT-NEXT: ret void
+; MAX1024-LABEL: @memmove_flat_align1_global_align1(
+; MAX1024-NEXT: call void @llvm.memmove.p0.p1.i64(ptr [[DST:%.*]], ptr addrspace(1) [[SRC:%.*]], i64 256, i1 false)
+; MAX1024-NEXT: ret void +; +; ALL-LABEL: @memmove_flat_align1_global_align1( +; ALL-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[SRC:%.*]] to ptr +; ALL-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr [[TMP1]], [[DST:%.*]] +; ALL-NEXT: [[COMPARE_N_TO_0:%.*]] = icmp eq i64 256, 0 +; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[COPY_BACKWARDS:%.*]], label [[COPY_FORWARD:%.*]] +; ALL: copy_backwards: +; ALL-NEXT: br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE:%.*]], label [[COPY_BACKWARDS_LOOP:%.*]] +; ALL: copy_backwards_loop: +; ALL-NEXT: [[TMP2:%.*]] = phi i64 [ [[INDEX_PTR:%.*]], [[COPY_BACKWARDS_LOOP]] ], [ 256, [[COPY_BACKWARDS]] ] +; ALL-NEXT: [[INDEX_PTR]] = sub i64 [[TMP2]], 1 +; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[INDEX_PTR]] +; ALL-NEXT: [[ELEMENT:%.*]] = load i8, ptr [[TMP3]], align 1 +; ALL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX_PTR]] +; ALL-NEXT: store i8 [[ELEMENT]], ptr [[TMP4]], align 1 +; ALL-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_PTR]], 0 +; ALL-NEXT: br i1 [[TMP5]], label [[MEMMOVE_DONE]], label [[COPY_BACKWARDS_LOOP]] +; ALL: copy_forward: +; ALL-NEXT: br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP:%.*]] +; ALL: copy_forward_loop: +; ALL-NEXT: [[INDEX_PTR1:%.*]] = phi i64 [ [[INDEX_INCREMENT:%.*]], [[COPY_FORWARD_LOOP]] ], [ 0, [[COPY_FORWARD]] ] +; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[INDEX_PTR1]] +; ALL-NEXT: [[ELEMENT2:%.*]] = load i8, ptr [[TMP6]], align 1 +; ALL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX_PTR1]] +; ALL-NEXT: store i8 [[ELEMENT2]], ptr [[TMP7]], align 1 +; ALL-NEXT: [[INDEX_INCREMENT]] = add i64 [[INDEX_PTR1]], 1 +; ALL-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_INCREMENT]], 256 +; ALL-NEXT: br i1 [[TMP8]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP]] +; ALL: memmove_done: +; ALL-NEXT: ret void ; call void @llvm.memmove.p0.p1.i64(ptr %dst, ptr addrspace(1) %src, i64 256, i1 false) ret void } define amdgpu_kernel void @memmove_global_align1_flat_align1(ptr addrspace(1) %dst, ptr %src) { -; OPT-LABEL: @memmove_global_align1_flat_align1( -; OPT-NEXT: call void @llvm.memmove.p1.p0.i64(ptr addrspace(1) [[DST:%.*]], ptr [[SRC:%.*]], i64 256, i1 false) -; OPT-NEXT: ret void +; MAX1024-LABEL: @memmove_global_align1_flat_align1( +; MAX1024-NEXT: call void @llvm.memmove.p1.p0.i64(ptr addrspace(1) [[DST:%.*]], ptr [[SRC:%.*]], i64 256, i1 false) +; MAX1024-NEXT: ret void +; +; ALL-LABEL: @memmove_global_align1_flat_align1( +; ALL-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[DST:%.*]] to ptr +; ALL-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr [[SRC:%.*]], [[TMP1]] +; ALL-NEXT: [[COMPARE_N_TO_0:%.*]] = icmp eq i64 256, 0 +; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[COPY_BACKWARDS:%.*]], label [[COPY_FORWARD:%.*]] +; ALL: copy_backwards: +; ALL-NEXT: br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE:%.*]], label [[COPY_BACKWARDS_LOOP:%.*]] +; ALL: copy_backwards_loop: +; ALL-NEXT: [[TMP2:%.*]] = phi i64 [ [[INDEX_PTR:%.*]], [[COPY_BACKWARDS_LOOP]] ], [ 256, [[COPY_BACKWARDS]] ] +; ALL-NEXT: [[INDEX_PTR]] = sub i64 [[TMP2]], 1 +; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX_PTR]] +; ALL-NEXT: [[ELEMENT:%.*]] = load i8, ptr [[TMP3]], align 1 +; ALL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[INDEX_PTR]] +; ALL-NEXT: store i8 [[ELEMENT]], ptr [[TMP4]], align 1 +; ALL-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_PTR]], 0 +; ALL-NEXT: 
br i1 [[TMP5]], label [[MEMMOVE_DONE]], label [[COPY_BACKWARDS_LOOP]] +; ALL: copy_forward: +; ALL-NEXT: br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP:%.*]] +; ALL: copy_forward_loop: +; ALL-NEXT: [[INDEX_PTR1:%.*]] = phi i64 [ [[INDEX_INCREMENT:%.*]], [[COPY_FORWARD_LOOP]] ], [ 0, [[COPY_FORWARD]] ] +; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX_PTR1]] +; ALL-NEXT: [[ELEMENT2:%.*]] = load i8, ptr [[TMP6]], align 1 +; ALL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[INDEX_PTR1]] +; ALL-NEXT: store i8 [[ELEMENT2]], ptr [[TMP7]], align 1 +; ALL-NEXT: [[INDEX_INCREMENT]] = add i64 [[INDEX_PTR1]], 1 +; ALL-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_INCREMENT]], 256 +; ALL-NEXT: br i1 [[TMP8]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP]] +; ALL: memmove_done: +; ALL-NEXT: ret void ; call void @llvm.memmove.p1.p0.i64(ptr addrspace(1) %dst, ptr %src, i64 256, i1 false) ret void } define amdgpu_kernel void @memmove_flat_align1_private_align1(ptr %dst, ptr addrspace(5) %src) { -; OPT-LABEL: @memmove_flat_align1_private_align1( -; OPT-NEXT: call void @llvm.memmove.p0.p5.i64(ptr [[DST:%.*]], ptr addrspace(5) [[SRC:%.*]], i64 256, i1 false) -; OPT-NEXT: ret void +; MAX1024-LABEL: @memmove_flat_align1_private_align1( +; MAX1024-NEXT: call void @llvm.memmove.p0.p5.i64(ptr [[DST:%.*]], ptr addrspace(5) [[SRC:%.*]], i64 256, i1 false) +; MAX1024-NEXT: ret void +; +; ALL-LABEL: @memmove_flat_align1_private_align1( +; ALL-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(5) [[SRC:%.*]] to ptr +; ALL-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr [[TMP1]], [[DST:%.*]] +; ALL-NEXT: [[COMPARE_N_TO_0:%.*]] = icmp eq i64 256, 0 +; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[COPY_BACKWARDS:%.*]], label [[COPY_FORWARD:%.*]] +; ALL: copy_backwards: +; ALL-NEXT: br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE:%.*]], label [[COPY_BACKWARDS_LOOP:%.*]] +; ALL: copy_backwards_loop: +; ALL-NEXT: [[TMP2:%.*]] = phi i64 [ [[INDEX_PTR:%.*]], [[COPY_BACKWARDS_LOOP]] ], [ 256, [[COPY_BACKWARDS]] ] +; ALL-NEXT: [[INDEX_PTR]] = sub i64 [[TMP2]], 1 +; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[INDEX_PTR]] +; ALL-NEXT: [[ELEMENT:%.*]] = load i8, ptr [[TMP3]], align 1 +; ALL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX_PTR]] +; ALL-NEXT: store i8 [[ELEMENT]], ptr [[TMP4]], align 1 +; ALL-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_PTR]], 0 +; ALL-NEXT: br i1 [[TMP5]], label [[MEMMOVE_DONE]], label [[COPY_BACKWARDS_LOOP]] +; ALL: copy_forward: +; ALL-NEXT: br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP:%.*]] +; ALL: copy_forward_loop: +; ALL-NEXT: [[INDEX_PTR1:%.*]] = phi i64 [ [[INDEX_INCREMENT:%.*]], [[COPY_FORWARD_LOOP]] ], [ 0, [[COPY_FORWARD]] ] +; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[INDEX_PTR1]] +; ALL-NEXT: [[ELEMENT2:%.*]] = load i8, ptr [[TMP6]], align 1 +; ALL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX_PTR1]] +; ALL-NEXT: store i8 [[ELEMENT2]], ptr [[TMP7]], align 1 +; ALL-NEXT: [[INDEX_INCREMENT]] = add i64 [[INDEX_PTR1]], 1 +; ALL-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_INCREMENT]], 256 +; ALL-NEXT: br i1 [[TMP8]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP]] +; ALL: memmove_done: +; ALL-NEXT: ret void ; call void @llvm.memmove.p0.p5.i64(ptr %dst, ptr addrspace(5) %src, i64 256, i1 false) ret void } define amdgpu_kernel void @memmove_private_align1_flat_align1(ptr addrspace(5) 
%dst, ptr %src) { -; OPT-LABEL: @memmove_private_align1_flat_align1( -; OPT-NEXT: call void @llvm.memmove.p5.p0.i64(ptr addrspace(5) [[DST:%.*]], ptr [[SRC:%.*]], i64 256, i1 false) -; OPT-NEXT: ret void +; MAX1024-LABEL: @memmove_private_align1_flat_align1( +; MAX1024-NEXT: call void @llvm.memmove.p5.p0.i64(ptr addrspace(5) [[DST:%.*]], ptr [[SRC:%.*]], i64 256, i1 false) +; MAX1024-NEXT: ret void +; +; ALL-LABEL: @memmove_private_align1_flat_align1( +; ALL-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(5) [[DST:%.*]] to ptr +; ALL-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr [[SRC:%.*]], [[TMP1]] +; ALL-NEXT: [[COMPARE_N_TO_0:%.*]] = icmp eq i64 256, 0 +; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[COPY_BACKWARDS:%.*]], label [[COPY_FORWARD:%.*]] +; ALL: copy_backwards: +; ALL-NEXT: br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE:%.*]], label [[COPY_BACKWARDS_LOOP:%.*]] +; ALL: copy_backwards_loop: +; ALL-NEXT: [[TMP2:%.*]] = phi i64 [ [[INDEX_PTR:%.*]], [[COPY_BACKWARDS_LOOP]] ], [ 256, [[COPY_BACKWARDS]] ] +; ALL-NEXT: [[INDEX_PTR]] = sub i64 [[TMP2]], 1 +; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX_PTR]] +; ALL-NEXT: [[ELEMENT:%.*]] = load i8, ptr [[TMP3]], align 1 +; ALL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[INDEX_PTR]] +; ALL-NEXT: store i8 [[ELEMENT]], ptr [[TMP4]], align 1 +; ALL-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_PTR]], 0 +; ALL-NEXT: br i1 [[TMP5]], label [[MEMMOVE_DONE]], label [[COPY_BACKWARDS_LOOP]] +; ALL: copy_forward: +; ALL-NEXT: br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP:%.*]] +; ALL: copy_forward_loop: +; ALL-NEXT: [[INDEX_PTR1:%.*]] = phi i64 [ [[INDEX_INCREMENT:%.*]], [[COPY_FORWARD_LOOP]] ], [ 0, [[COPY_FORWARD]] ] +; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX_PTR1]] +; ALL-NEXT: [[ELEMENT2:%.*]] = load i8, ptr [[TMP6]], align 1 +; ALL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[INDEX_PTR1]] +; ALL-NEXT: store i8 [[ELEMENT2]], ptr [[TMP7]], align 1 +; ALL-NEXT: [[INDEX_INCREMENT]] = add i64 [[INDEX_PTR1]], 1 +; ALL-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_INCREMENT]], 256 +; ALL-NEXT: br i1 [[TMP8]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP]] +; ALL: memmove_done: +; ALL-NEXT: ret void ; call void @llvm.memmove.p5.p0.i64(ptr addrspace(5) %dst, ptr %src, i64 256, i1 false) ret void @@ -1367,5 +1490,32 @@ define amdgpu_kernel void @memmove_global_align1_private_align1(ptr addrspace(1) ret void } +define amdgpu_kernel void @memmove_global_align1_p999_align1(ptr addrspace(1) %dst, ptr addrspace(999) %src, i64 %size) { +; OPT-LABEL: @memmove_global_align1_p999_align1( +; OPT-NEXT: call void @llvm.memmove.p1.p999.i64(ptr addrspace(1) [[DST:%.*]], ptr addrspace(999) [[SRC:%.*]], i64 [[SIZE:%.*]], i1 false) +; OPT-NEXT: ret void +; + call void @llvm.memmove.p1.p999.i64(ptr addrspace(1) %dst, ptr addrspace(999) %src, i64 %size, i1 false) + ret void +} + +define amdgpu_kernel void @memmove_p999_align1_p1_align1(ptr addrspace(999) %dst, ptr addrspace(1) %src, i64 %size) { +; OPT-LABEL: @memmove_p999_align1_p1_align1( +; OPT-NEXT: call void @llvm.memmove.p999.p1.i64(ptr addrspace(999) [[DST:%.*]], ptr addrspace(1) [[SRC:%.*]], i64 [[SIZE:%.*]], i1 false) +; OPT-NEXT: ret void +; + call void @llvm.memmove.p999.p1.i64(ptr addrspace(999) %dst, ptr addrspace(1) %src, i64 %size, i1 false) + ret void +} + +define amdgpu_kernel void @memmove_p999_align1_p998_align1(ptr addrspace(999) %dst, ptr addrspace(998) %src, 
i64 %size) {
+; OPT-LABEL: @memmove_p999_align1_p998_align1(
+; OPT-NEXT: call void @llvm.memmove.p999.p998.i64(ptr addrspace(999) [[DST:%.*]], ptr addrspace(998) [[SRC:%.*]], i64 [[SIZE:%.*]], i1 false)
+; OPT-NEXT: ret void
+;
+  call void @llvm.memmove.p999.p998.i64(ptr addrspace(999) %dst, ptr addrspace(998) %src, i64 %size, i1 false)
+  ret void
+}
+
 attributes #0 = { nounwind }
 attributes #1 = { argmemonly nounwind }
-- 
2.7.4
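
For readers skimming the diff, the new cast-direction selection in createMemMoveLoop reduces to the standalone sketch below. It is illustrative only: the enum, helper name, and std::function parameter are not LLVM API; the real code works directly on the pointer Values and queries TargetTransformInfo::isValidAddrSpaceCast exactly as shown in the hunk above.

#include <functional>
#include <optional>

// Which memmove operand should be addrspacecast so that both pointers end up
// in one address space. The order of checks mirrors the patch: try casting
// the destination into the source's space first, then the source into the
// destination's space, and otherwise give up.
enum class CastSide { None, CastDst, CastSrc };

std::optional<CastSide> pickAddrSpaceCast(
    unsigned SrcAS, unsigned DstAS,
    const std::function<bool(unsigned FromAS, unsigned ToAS)> &isValidCast) {
  if (SrcAS == DstAS)
    return CastSide::None;      // Already in one address space; no cast needed.
  if (isValidCast(DstAS, SrcAS))
    return CastSide::CastDst;   // Cast DstAddr into SrcAddr's address space.
  if (isValidCast(SrcAS, DstAS))
    return CastSide::CastSrc;   // Cast SrcAddr into DstAddr's address space.
  return std::nullopt;          // Neither direction is known valid: the
                                // lowering bails out (returns false).
}

With an AMDGPU-like predicate that reports casts from global (addrspace 1) or private (addrspace 5) into flat (addrspace 0) as valid, the mixed flat/global and flat/private tests above lower into the compare-and-branch byte-copy loops, while the addrspace 999/998 pair matches neither direction and the memmove call is left in place, as the new OPT checks expect.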