"mgpuMemFree",
llvmVoidType,
{llvmPointerType /* void *ptr */, llvmPointerType /* void *stream */}};
+ FunctionCallBuilder memcpyCallBuilder = {
+ "mgpuMemcpy",
+ llvmVoidType,
+ {llvmPointerType /* void *dst */, llvmPointerType /* void *src */,
+ llvmIntPtrType /* intptr_t sizeBytes */,
+ llvmPointerType /* void *stream */}};
};
/// A rewrite pattern to convert gpu.host_register operations into a GPU runtime
return success();
}
};
+
+/// A rewrite pattern to convert gpu.memcpy operations into a GPU runtime
+/// call. Currently it supports CUDA and ROCm (HIP).
+class ConvertMemcpyOpToGpuRuntimeCallPattern
+    : public ConvertOpToGpuRuntimeCallPattern<gpu::MemcpyOp> {
+public:
+  ConvertMemcpyOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
+      : ConvertOpToGpuRuntimeCallPattern<gpu::MemcpyOp>(typeConverter) {}
+
+private:
+  /// Lowers gpu.memcpy to a call to the mgpuMemcpy runtime wrapper.
+  /// Implementation is out-of-line, after the anonymous namespace.
+  LogicalResult
+  matchAndRewrite(gpu::MemcpyOp memcpyOp, ArrayRef<Value> operands,
+                  ConversionPatternRewriter &rewriter) const override;
+};
} // namespace
void GpuToLLVMConversionPass::runOnOperation() {
return success();
}
+LogicalResult ConvertMemcpyOpToGpuRuntimeCallPattern::matchAndRewrite(
+    gpu::MemcpyOp memcpyOp, ArrayRef<Value> operands,
+    ConversionPatternRewriter &rewriter) const {
+  auto memRefType = memcpyOp.src().getType().cast<MemRefType>();
+
+  // Only lower when all operands are already LLVM-typed, the memref type is
+  // supported by this lowering (see isSupportedMemRefType), and the op is
+  // async with exactly one dependency -- that dependency is the stream the
+  // copy is enqueued on.
+  if (failed(areAllLLVMTypes(memcpyOp, operands, rewriter)) ||
+      !isSupportedMemRefType(memRefType) ||
+      failed(isAsyncWithOneDependency(rewriter, memcpyOp)))
+    return failure();
+
+  auto loc = memcpyOp.getLoc();
+  auto adaptor = gpu::MemcpyOpAdaptor(operands, memcpyOp->getAttrDictionary());
+
+  MemRefDescriptor srcDesc(adaptor.src());
+
+  // Element count of the copy: a constant when the shape is static, otherwise
+  // computed at runtime from the source descriptor.
+  Value numElements =
+      memRefType.hasStaticShape()
+          ? createIndexConstant(rewriter, loc, memRefType.getNumElements())
+          // For identity layouts (verified above), the number of elements is
+          // stride[0] * size[0].
+          : rewriter.create<LLVM::MulOp>(loc, srcDesc.stride(rewriter, loc, 0),
+                                         srcDesc.size(rewriter, loc, 0));
+
+  // Compute the byte size via the null-pointer GEP idiom:
+  // ptrtoint(gep(null, numElements)) == numElements * sizeof(element), which
+  // avoids hard-coding the element size in the pass.
+  Type elementPtrType = getElementPtrType(memRefType);
+  Value nullPtr = rewriter.create<LLVM::NullOp>(loc, elementPtrType);
+  Value gepPtr = rewriter.create<LLVM::GEPOp>(
+      loc, elementPtrType, ArrayRef<Value>{nullPtr, numElements});
+  auto sizeBytes =
+      rewriter.create<LLVM::PtrToIntOp>(loc, getIndexType(), gepPtr);
+
+  // mgpuMemcpy takes untyped (void *) pointers, so bitcast the aligned data
+  // pointers of both memref descriptors.
+  auto src = rewriter.create<LLVM::BitcastOp>(
+      loc, llvmPointerType, srcDesc.alignedPtr(rewriter, loc));
+  auto dst = rewriter.create<LLVM::BitcastOp>(
+      loc, llvmPointerType,
+      MemRefDescriptor(adaptor.dst()).alignedPtr(rewriter, loc));
+
+  // Enqueue the copy on the single stream dependency.
+  auto stream = adaptor.asyncDependencies().front();
+  memcpyCallBuilder.create(loc, rewriter, {dst, src, sizeBytes, stream});
+
+  // The stream itself serves as the op's async token result, so later ops
+  // that depend on this copy wait on the same stream.
+  rewriter.replaceOp(memcpyOp, {stream});
+
+  return success();
+}
+
std::unique_ptr<mlir::OperationPass<mlir::ModuleOp>>
mlir::createGpuToLLVMConversionPass(StringRef gpuBinaryAnnotation) {
return std::make_unique<GpuToLLVMConversionPass>(gpuBinaryAnnotation);
patterns.insert<ConvertAllocOpToGpuRuntimeCallPattern,
ConvertDeallocOpToGpuRuntimeCallPattern,
ConvertHostRegisterOpToGpuRuntimeCallPattern,
+ ConvertMemcpyOpToGpuRuntimeCallPattern,
ConvertWaitAsyncOpToGpuRuntimeCallPattern,
ConvertWaitOpToGpuRuntimeCallPattern>(converter);
patterns.insert<ConvertLaunchFuncOpToGpuRuntimeCallPattern>(
--- /dev/null
+// RUN: mlir-opt -allow-unregistered-dialect %s --gpu-to-llvm | FileCheck %s
+
+module attributes {gpu.container_module} {
+
+  // Verifies that an async gpu.memcpy lowers to an mgpuMemcpy runtime call
+  // enqueued on the stream created by the preceding async gpu.wait.
+  // CHECK: func @foo
+  func @foo(%dst : memref<7xf32, 1>, %src : memref<7xf32>) {
+    // CHECK: %[[t0:.*]] = llvm.call @mgpuStreamCreate
+    %t0 = gpu.wait async
+    // Size-in-bytes computation and void* bitcasts precede the copy call.
+    // CHECK: %[[size_bytes:.*]] = llvm.ptrtoint
+    // CHECK: %[[src:.*]] = llvm.bitcast
+    // CHECK: %[[dst:.*]] = llvm.bitcast
+    // CHECK: llvm.call @mgpuMemcpy(%[[dst]], %[[src]], %[[size_bytes]], %[[t0]])
+    %t1 = gpu.memcpy async [%t0] %dst, %src : memref<7xf32, 1>, memref<7xf32>
+    // The synchronous wait on the token synchronizes and destroys the stream.
+    // CHECK: llvm.call @mgpuStreamSynchronize(%[[t0]])
+    // CHECK: llvm.call @mgpuStreamDestroy(%[[t0]])
+    gpu.wait [%t1]
+    return
+  }
+}