From 8f7c8a6ea765139225878e1dfe90bc1eb6f0067c Mon Sep 17 00:00:00 2001 From: max Date: Thu, 6 Apr 2023 15:07:12 -0500 Subject: [PATCH] Add gpu::HostUnregisterOp Without explicitly unregistering, you will get ``` 'cuMemHostRegister(ptr, sizeBytes, 0)' failed with 'CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED' ``` in CUDA (for example) after repeated runs (e.g., when benchmarking the same kernel). Reviewed By: ftynse Differential Revision: https://reviews.llvm.org/D147277 --- mlir/include/mlir/Dialect/GPU/IR/GPUOps.td | 13 +++++++ .../Conversion/GPUCommon/GPUToLLVMConversion.cpp | 43 ++++++++++++++++++++++ mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp | 16 ++++++++ mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp | 16 ++++++++ 4 files changed, 88 insertions(+) diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td index 32ab246..860e207 100644 --- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td +++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td @@ -929,6 +929,19 @@ def GPU_HostRegisterOp : GPU_Op<"host_register">, let assemblyFormat = "$value attr-dict `:` type($value)"; } +def GPU_HostUnregisterOp : GPU_Op<"host_unregister">, + Arguments<(ins AnyUnrankedMemRef:$value)> { + let summary = "Unregisters a memref for access from device."; + let description = [{ + This op unmaps the provided host buffer from the device address space. + + This operation may not be supported in every environment; there is not yet a + way to check at runtime whether this feature is supported. 
+ }]; + + let assemblyFormat = "$value attr-dict `:` type($value)"; +} + def GPU_WaitOp : GPU_Op<"wait", [GPU_AsyncOpInterface]> { let summary = "Wait for async gpu ops to complete."; let description = [{ diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp index 55a5e46..3687bd6 100644 --- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp +++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp @@ -161,6 +161,12 @@ protected: {llvmIntPtrType /* intptr_t rank */, llvmPointerType /* void *memrefDesc */, llvmIntPtrType /* intptr_t elementSizeBytes */}}; + FunctionCallBuilder hostUnregisterCallBuilder = { + "mgpuMemHostUnregisterMemRef", + llvmVoidType, + {llvmIntPtrType /* intptr_t rank */, + llvmPointerType /* void *memrefDesc */, + llvmIntPtrType /* intptr_t elementSizeBytes */}}; FunctionCallBuilder allocCallBuilder = { "mgpuMemAlloc", llvmPointerType /* void * */, @@ -202,6 +208,20 @@ private: ConversionPatternRewriter &rewriter) const override; }; +class ConvertHostUnregisterOpToGpuRuntimeCallPattern + : public ConvertOpToGpuRuntimeCallPattern<gpu::HostUnregisterOp> { +public: + ConvertHostUnregisterOpToGpuRuntimeCallPattern( + LLVMTypeConverter &typeConverter) + : ConvertOpToGpuRuntimeCallPattern<gpu::HostUnregisterOp>(typeConverter) { + } + +private: + LogicalResult + matchAndRewrite(gpu::HostUnregisterOp hostUnregisterOp, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override; +}; + /// A rewrite pattern to convert gpu.alloc operations into a GPU runtime /// call. Currently it supports CUDA and ROCm (HIP). 
class ConvertAllocOpToGpuRuntimeCallPattern @@ -446,6 +466,28 @@ LogicalResult ConvertHostRegisterOpToGpuRuntimeCallPattern::matchAndRewrite( return success(); } +LogicalResult ConvertHostUnregisterOpToGpuRuntimeCallPattern::matchAndRewrite( + gpu::HostUnregisterOp hostUnregisterOp, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const { + Operation *op = hostUnregisterOp.getOperation(); + if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter))) + return failure(); + + Location loc = op->getLoc(); + + auto memRefType = hostUnregisterOp.getValue().getType(); + auto elementType = memRefType.cast<UnrankedMemRefType>().getElementType(); + auto elementSize = getSizeInBytes(loc, elementType, rewriter); + + auto arguments = getTypeConverter()->promoteOperands( + loc, op->getOperands(), adaptor.getOperands(), rewriter); + arguments.push_back(elementSize); + hostUnregisterCallBuilder.create(loc, rewriter, arguments); + + rewriter.eraseOp(op); + return success(); +} + LogicalResult ConvertAllocOpToGpuRuntimeCallPattern::matchAndRewrite( gpu::AllocOp allocOp, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const { @@ -928,6 +970,7 @@ void mlir::populateGpuToLLVMConversionPatterns(LLVMTypeConverter &converter, patterns.add *descriptor, mgpuMemHostRegister(ptr, sizeBytes); } +// Unregisters a byte array with the CUDA runtime. +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuMemHostUnregister(void *ptr) { + ScopedContext scopedContext; + CUDA_REPORT_IF_ERROR(cuMemHostUnregister(ptr)); +} + +/// Unregisters a memref with the CUDA runtime. 
`descriptor` is a pointer to a +/// ranked memref descriptor struct of rank `rank` +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void +mgpuMemHostUnregisterMemRef(int64_t rank, + StridedMemRefType<char, 1> *descriptor, + int64_t elementSizeBytes) { + auto *ptr = descriptor->data + descriptor->offset * elementSizeBytes; + mgpuMemHostUnregister(ptr); +} + extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSetDefaultDevice(int32_t device) { defaultDevice = device; } diff --git a/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp index 43a7e3c..bd3868a 100644 --- a/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp +++ b/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp @@ -152,6 +152,22 @@ mgpuMemHostRegisterMemRef(int64_t rank, StridedMemRefType<char, 1> *descriptor, mgpuMemHostRegister(ptr, sizeBytes); } +// Unregisters a byte array with the ROCm runtime. Helpful until we have +// transfer functions implemented. +extern "C" void mgpuMemHostUnregister(void *ptr) { + HIP_REPORT_IF_ERROR(hipHostUnregister(ptr)); +} + +// Unregisters a MemRef with the ROCm runtime. Helpful until we have +// transfer functions implemented. +extern "C" void +mgpuMemHostUnregisterMemRef(int64_t rank, + StridedMemRefType<char, 1> *descriptor, + int64_t elementSizeBytes) { + auto ptr = descriptor->data + descriptor->offset * elementSizeBytes; + mgpuMemHostUnregister(ptr); +} + template <typename T> void mgpuMemGetDevicePointer(T *hostPtr, T **devicePtr) { HIP_REPORT_IF_ERROR(hipSetDevice(0)); -- 2.7.4