Add gpu::HostUnregisterOp

author max <maksim.levental@gmail.com>

Thu, 6 Apr 2023 20:07:12 +0000 (15:07 -0500)

committer max <maksim.levental@gmail.com>

Thu, 6 Apr 2023 20:07:12 +0000 (15:07 -0500)
author max <maksim.levental@gmail.com>
Thu, 6 Apr 2023 20:07:12 +0000 (15:07 -0500)
committer max <maksim.levental@gmail.com>
Thu, 6 Apr 2023 20:07:12 +0000 (15:07 -0500)
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td

index 32ab246..860e207 100644 (file)
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -929,6 +929,19 @@ def GPU_HostRegisterOp : GPU_Op<"host_register">,
    let assemblyFormat = "$value attr-dict `:` type($value)";
  }
  
+// Definition of the `host_unregister` op: it takes a single unranked memref
+// operand (`$value`) and declares no results.
+def GPU_HostUnregisterOp : GPU_Op<"host_unregister">,
+    Arguments<(ins AnyUnrankedMemRef:$value)> {
+  let summary = "Unregisters a memref for access from device.";
+  let description = [{
+      This op unmaps the provided host buffer from the device address space.
+
+      This operation may not be supported in every environment, there is not yet a
+          way to check at runtime whether this feature is supported.
+  }];
+
+  let assemblyFormat = "$value attr-dict `:` type($value)";
+}
+
  def GPU_WaitOp : GPU_Op<"wait", [GPU_AsyncOpInterface]> {
    let summary = "Wait for async gpu ops to complete.";
    let description = [{
diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp

index 55a5e46..3687bd6 100644 (file)
--- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
@@ -161,6 +161,12 @@ protected:
        {llvmIntPtrType /* intptr_t rank */,
         llvmPointerType /* void *memrefDesc */,
         llvmIntPtrType /* intptr_t elementSizeBytes */}};
+  // Describes the runtime entry point `mgpuMemHostUnregisterMemRef`: it returns
+  // void and takes (rank, memref descriptor pointer, element size in bytes).
+  FunctionCallBuilder hostUnregisterCallBuilder = {
+      "mgpuMemHostUnregisterMemRef",
+      llvmVoidType,
+      {llvmIntPtrType /* intptr_t rank */,
+       llvmPointerType /* void *memrefDesc */,
+       llvmIntPtrType /* intptr_t elementSizeBytes */}};
    FunctionCallBuilder allocCallBuilder = {
        "mgpuMemAlloc",
        llvmPointerType /* void * */,
@@ -202,6 +208,20 @@ private:
                    ConversionPatternRewriter &rewriter) const override;
  };
  
+/// A rewrite pattern to convert gpu.host_unregister operations into a GPU
+/// runtime call.
+class ConvertHostUnregisterOpToGpuRuntimeCallPattern
+    : public ConvertOpToGpuRuntimeCallPattern<gpu::HostUnregisterOp> {
+public:
+  ConvertHostUnregisterOpToGpuRuntimeCallPattern(
+      LLVMTypeConverter &typeConverter)
+      : ConvertOpToGpuRuntimeCallPattern<gpu::HostUnregisterOp>(typeConverter) {
+  }
+
+private:
+  LogicalResult
+  matchAndRewrite(gpu::HostUnregisterOp hostUnregisterOp, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override;
+};
+
  /// A rewrite pattern to convert gpu.alloc operations into a GPU runtime
  /// call. Currently it supports CUDA and ROCm (HIP).
  class ConvertAllocOpToGpuRuntimeCallPattern
@@ -446,6 +466,28 @@ LogicalResult ConvertHostRegisterOpToGpuRuntimeCallPattern::matchAndRewrite(
    return success();
  }
  
+/// Rewrites a gpu.host_unregister op into a call created through
+/// `hostUnregisterCallBuilder`. The call receives the promoted operands
+/// followed by the memref element size in bytes; the original op is then
+/// erased. Returns failure() when the areAllLLVMTypes check fails.
+LogicalResult ConvertHostUnregisterOpToGpuRuntimeCallPattern::matchAndRewrite(
+    gpu::HostUnregisterOp hostUnregisterOp, OpAdaptor adaptor,
+    ConversionPatternRewriter &rewriter) const {
+  Operation *op = hostUnregisterOp.getOperation();
+  // Give up early if the already-converted operands fail the type check.
+  if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)))
+    return failure();
+
+  Location loc = op->getLoc();
+
+  // The element type is read from the op's original (unconverted) operand type.
+  auto memRefType = hostUnregisterOp.getValue().getType();
+  auto elementType = memRefType.cast<UnrankedMemRefType>().getElementType();
+  auto elementSize = getSizeInBytes(loc, elementType, rewriter);
+
+  // NOTE(review): promoteOperands presumably yields the rank and descriptor
+  // pointer that the runtime signature expects — confirm against its
+  // definition. The element size is appended as the last argument.
+  auto arguments = getTypeConverter()->promoteOperands(
+      loc, op->getOperands(), adaptor.getOperands(), rewriter);
+  arguments.push_back(elementSize);
+  hostUnregisterCallBuilder.create(loc, rewriter, arguments);
+
+  // The runtime call has been emitted; drop the original op.
+  rewriter.eraseOp(op);
+  return success();
+}
+
  LogicalResult ConvertAllocOpToGpuRuntimeCallPattern::matchAndRewrite(
      gpu::AllocOp allocOp, OpAdaptor adaptor,
      ConversionPatternRewriter &rewriter) const {
@@ -928,6 +970,7 @@ void mlir::populateGpuToLLVMConversionPatterns(LLVMTypeConverter &converter,
    patterns.add<ConvertAllocOpToGpuRuntimeCallPattern,
                 ConvertDeallocOpToGpuRuntimeCallPattern,
                 ConvertHostRegisterOpToGpuRuntimeCallPattern,
+               ConvertHostUnregisterOpToGpuRuntimeCallPattern,
                 ConvertMemcpyOpToGpuRuntimeCallPattern,
                 ConvertMemsetOpToGpuRuntimeCallPattern,
                 ConvertSetDefaultDeviceOpToGpuRuntimeCallPattern,
diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp

index 44ed5b0..4065c65 100644 (file)
--- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
@@ -192,6 +192,22 @@ mgpuMemHostRegisterMemRef(int64_t rank, StridedMemRefType<char, 1> *descriptor,
    mgpuMemHostRegister(ptr, sizeBytes);
  }
  
+// Unregisters the host memory at `ptr` with the CUDA runtime by calling
+// cuMemHostUnregister; a failing call is routed through CUDA_REPORT_IF_ERROR.
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuMemHostUnregister(void *ptr) {
+  // NOTE(review): presumably makes a CUDA context current for the call below —
+  // confirm against the ScopedContext definition.
+  ScopedContext scopedContext;
+  CUDA_REPORT_IF_ERROR(cuMemHostUnregister(ptr));
+}
+
+/// Unregisters a memref with the CUDA runtime. `descriptor` is a pointer to a
+/// ranked memref descriptor struct of rank `rank`; `rank` itself is not read
+/// here. The address handed to mgpuMemHostUnregister is `descriptor->data`
+/// advanced by `descriptor->offset * elementSizeBytes`.
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
+mgpuMemHostUnregisterMemRef(int64_t rank,
+                            StridedMemRefType<char, 1> *descriptor,
+                            int64_t elementSizeBytes) {
+  auto *ptr = descriptor->data + descriptor->offset * elementSizeBytes;
+  mgpuMemHostUnregister(ptr);
+}
+
  extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSetDefaultDevice(int32_t device) {
    defaultDevice = device;
  }
diff --git a/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp

index 43a7e3c..bd3868a 100644 (file)
--- a/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp
@@ -152,6 +152,22 @@ mgpuMemHostRegisterMemRef(int64_t rank, StridedMemRefType<char, 1> *descriptor,
    mgpuMemHostRegister(ptr, sizeBytes);
  }
  
+// Unregisters a byte array with the ROCm runtime by calling hipHostUnregister;
+// a failing call is routed through HIP_REPORT_IF_ERROR. Helpful until we have
+// transfer functions implemented.
+extern "C" void mgpuMemHostUnregister(void *ptr) {
+  HIP_REPORT_IF_ERROR(hipHostUnregister(ptr));
+}
+
+// Unregisters a MemRef with the ROCm runtime. Helpful until we have transfer
+// functions implemented. `rank` is not read here; the address handed to
+// mgpuMemHostUnregister is `descriptor->data` advanced by
+// `descriptor->offset * elementSizeBytes`.
+extern "C" void
+mgpuMemHostUnregisterMemRef(int64_t rank,
+                            StridedMemRefType<char, 1> *descriptor,
+                            int64_t elementSizeBytes) {
+  auto ptr = descriptor->data + descriptor->offset * elementSizeBytes;
+  mgpuMemHostUnregister(ptr);
+}
+
  template <typename T>
  void mgpuMemGetDevicePointer(T *hostPtr, T **devicePtr) {
    HIP_REPORT_IF_ERROR(hipSetDevice(0));
author	max <maksim.levental@gmail.com>
	Thu, 6 Apr 2023 20:07:12 +0000 (15:07 -0500)
committer	max <maksim.levental@gmail.com>
	Thu, 6 Apr 2023 20:07:12 +0000 (15:07 -0500)
mlir/include/mlir/Dialect/GPU/IR/GPUOps.td		patch \| blob \| history
mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp		patch \| blob \| history
mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp		patch \| blob \| history
mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp		patch \| blob \| history