[libomptarget] Add support for target memory allocators to cuda RTL

author Joseph Huber <jhuber6@vols.utk.edu>

Thu, 6 May 2021 16:42:55 +0000 (12:42 -0400)

committer Huber, Joseph <huberjn@ornl.gov>

Fri, 7 May 2021 14:27:02 +0000 (10:27 -0400)
author Joseph Huber <jhuber6@vols.utk.edu>
Thu, 6 May 2021 16:42:55 +0000 (12:42 -0400)
committer Huber, Joseph <huberjn@ornl.gov>
Fri, 7 May 2021 14:27:02 +0000 (10:27 -0400)
diff --git a/openmp/libomptarget/plugins/common/MemoryManager/MemoryManager.h b/openmp/libomptarget/plugins/common/MemoryManager/MemoryManager.h

index 6e00728..f4b63f3 100644 (file)
--- a/openmp/libomptarget/plugins/common/MemoryManager/MemoryManager.h
+++ b/openmp/libomptarget/plugins/common/MemoryManager/MemoryManager.h
@@ -31,7 +31,7 @@ public:
  
    /// Allocate a memory of size \p Size . \p HstPtr is used to assist the
    /// allocation.
-  virtual void *allocate(size_t Size, void *HstPtr) = 0;
+  virtual void *allocate(size_t Size, void *HstPtr, TargetAllocTy Kind) = 0;
  
    /// Delete the pointer \p TgtPtr on the device
    virtual int free(void *TgtPtr) = 0;
@@ -133,7 +133,7 @@ class MemoryManagerTy {
  
    /// Request memory from target device
    void *allocateOnDevice(size_t Size, void *HstPtr) const {
-    return DeviceAllocator.allocate(Size, HstPtr);
+    return DeviceAllocator.allocate(Size, HstPtr, TARGET_ALLOC_DEVICE);
    }
  
    /// Deallocate data on device
diff --git a/openmp/libomptarget/plugins/cuda/src/rtl.cpp b/openmp/libomptarget/plugins/cuda/src/rtl.cpp

index 7834f77..e8fe637 100644 (file)
--- a/openmp/libomptarget/plugins/cuda/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/cuda/src/rtl.cpp
@@ -17,6 +17,7 @@
  #include <memory>
  #include <mutex>
  #include <string>
+#include <unordered_map>
  #include <vector>
  
  #include "Debug.h"
@@ -297,12 +298,13 @@ class DeviceRTLTy {
    class CUDADeviceAllocatorTy : public DeviceAllocatorTy {
      const int DeviceId;
      const std::vector<DeviceDataTy> &DeviceData;
+    std::unordered_map<void *, TargetAllocTy> HostPinnedAllocs;
  
    public:
      CUDADeviceAllocatorTy(int DeviceId, std::vector<DeviceDataTy> &DeviceData)
          : DeviceId(DeviceId), DeviceData(DeviceData) {}
  
-    void *allocate(size_t Size, void *) override {
+    void *allocate(size_t Size, void *, TargetAllocTy Kind) override {
        if (Size == 0)
          return nullptr;
  
@@ -310,12 +312,34 @@ class DeviceRTLTy {
        if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
          return nullptr;
  
-      CUdeviceptr DevicePtr;
-      Err = cuMemAlloc(&DevicePtr, Size);
-      if (!checkResult(Err, "Error returned from cuMemAlloc\n"))
-        return nullptr;
+      void *MemAlloc = nullptr;
+      switch (Kind) {
+      case TARGET_ALLOC_DEFAULT:
+      case TARGET_ALLOC_DEVICE:
+        CUdeviceptr DevicePtr;
+        Err = cuMemAlloc(&DevicePtr, Size);
+        MemAlloc = (void *)DevicePtr;
+        if (!checkResult(Err, "Error returned from cuMemAlloc\n"))
+          return nullptr;
+        break;
+      case TARGET_ALLOC_HOST:
+        void *HostPtr;
+        Err = cuMemAllocHost(&HostPtr, Size);
+        MemAlloc = HostPtr;
+        if (!checkResult(Err, "Error returned from cuMemAllocHost\n"))
+          return nullptr;
+        HostPinnedAllocs[MemAlloc] = Kind;
+        break;
+      case TARGET_ALLOC_SHARED:
+        CUdeviceptr SharedPtr;
+        Err = cuMemAllocManaged(&SharedPtr, Size, CU_MEM_ATTACH_GLOBAL);
+        MemAlloc = (void *)SharedPtr;
+        if (!checkResult(Err, "Error returned from cuMemAllocManaged\n"))
+          return nullptr;
+        break;
+      }
  
-      return (void *)DevicePtr;
+      return MemAlloc;
      }
  
      int free(void *TgtPtr) override {
@@ -323,9 +347,25 @@ class DeviceRTLTy {
        if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
          return OFFLOAD_FAIL;
  
-      Err = cuMemFree((CUdeviceptr)TgtPtr);
-      if (!checkResult(Err, "Error returned from cuMemFree\n"))
-        return OFFLOAD_FAIL;
+      // Host pinned memory must be freed differently.
+      TargetAllocTy Kind =
+          (HostPinnedAllocs.find(TgtPtr) == HostPinnedAllocs.end())
+              ? TARGET_ALLOC_DEFAULT
+              : TARGET_ALLOC_HOST;
+      switch (Kind) {
+      case TARGET_ALLOC_DEFAULT:
+      case TARGET_ALLOC_DEVICE:
+      case TARGET_ALLOC_SHARED:
+        Err = cuMemFree((CUdeviceptr)TgtPtr);
+        if (!checkResult(Err, "Error returned from cuMemFree\n"))
+          return OFFLOAD_FAIL;
+        break;
+      case TARGET_ALLOC_HOST:
+        Err = cuMemFreeHost(TgtPtr);
+        if (!checkResult(Err, "Error returned from cuMemFreeHost\n"))
+          return OFFLOAD_FAIL;
+        break;
+      }
  
        return OFFLOAD_SUCCESS;
      }
@@ -804,11 +844,24 @@ public:
      return getOffloadEntriesTable(DeviceId);
    }
  
-  void *dataAlloc(const int DeviceId, const int64_t Size) {
-    if (UseMemoryManager)
-      return MemoryManagers[DeviceId]->allocate(Size, nullptr);
+  void *dataAlloc(const int DeviceId, const int64_t Size,
+                  const TargetAllocTy Kind) {
+    switch (Kind) {
+    case TARGET_ALLOC_DEFAULT:
+    case TARGET_ALLOC_DEVICE:
+      if (UseMemoryManager)
+        return MemoryManagers[DeviceId]->allocate(Size, nullptr);
+      else
+        return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind);
+    case TARGET_ALLOC_HOST:
+    case TARGET_ALLOC_SHARED:
+      return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind);
+    }
  
-    return DeviceAllocators[DeviceId].allocate(Size, nullptr);
+    REPORT("Invalid target data allocation kind or requested allocator not "
+           "implemented yet\n");
+
+    return nullptr;
    }
  
    int dataSubmit(const int DeviceId, const void *TgtPtr, const void *HstPtr,
@@ -1097,13 +1150,7 @@ void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *,
                             int32_t kind) {
    assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  
-  if (kind != TARGET_ALLOC_DEFAULT) {
-    REPORT("Invalid target data allocation kind or requested allocator not "
-           "implemented yet\n");
-    return NULL;
-  }
-
-  return DeviceRTL.dataAlloc(device_id, size);
+  return DeviceRTL.dataAlloc(device_id, size, (TargetAllocTy)kind);
  }
  
  int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
diff --git a/openmp/libomptarget/test/api/omp_device_managed_memory.c b/openmp/libomptarget/test/api/omp_device_managed_memory.c

new file mode 100644 (file)

index 0000000..e3114d8
--- /dev/null
+++ b/openmp/libomptarget/test/api/omp_device_managed_memory.c
@@ -0,0 +1,29 @@
+// RUN: %libomptarget-compile-run-and-check-nvptx64-nvidia-cuda
+// REQUIRES: nvptx64-nvidia-cuda
+
+#include <omp.h>
+#include <stdio.h>
+
+void *llvm_omp_target_alloc_shared(size_t, int);
+
+int main() {
+  const int N = 64;
+  const int device = omp_get_default_device();
+
+  int *shared_ptr = llvm_omp_target_alloc_shared(N * sizeof(int), device);
+
+#pragma omp target teams distribute parallel for device(device) \
+            is_device_ptr(shared_ptr)
+  for (int i = 0; i < N; ++i) {
+    shared_ptr[i] = 1;
+  }
+
+  int sum = 0;
+  for (int i = 0; i < N; ++i)
+    sum += shared_ptr[i];
+
+  omp_target_free(shared_ptr, device);
+  // CHECK: PASS
+  if (sum == N)
+    printf ("PASS\n");
+}
diff --git a/openmp/libomptarget/test/api/omp_host_pinned_memory.c b/openmp/libomptarget/test/api/omp_host_pinned_memory.c

new file mode 100644 (file)

index 0000000..63f3118
--- /dev/null
+++ b/openmp/libomptarget/test/api/omp_host_pinned_memory.c
@@ -0,0 +1,33 @@
+// RUN: %libomptarget-compile-run-and-check-nvptx64-nvidia-cuda
+// REQUIRES: nvptx64-nvidia-cuda
+
+#include <omp.h>
+#include <stdio.h>
+
+// Allocate pinned memory on the host
+void *llvm_omp_target_alloc_host(size_t, int);
+
+int main() {
+  const int N = 64;
+  const int device = omp_get_default_device();
+  const int host = omp_get_initial_device();
+
+  int *hst_ptr = llvm_omp_target_alloc_host(N * sizeof(int), device);
+
+  for (int i = 0; i < N; ++i)
+    hst_ptr[i] = 2;
+
+#pragma omp target teams distribute parallel for device(device) \
+           map(tofrom:hst_ptr[0 : N])
+  for (int i = 0; i < N; ++i)
+    hst_ptr[i] -= 1;
+
+  int sum = 0;
+  for (int i = 0; i < N; ++i)
+    sum += hst_ptr[i];
+
+  omp_target_free(hst_ptr, device);
+  // CHECK: PASS
+  if (sum == N)
+    printf ("PASS\n");
+}
author	Joseph Huber <jhuber6@vols.utk.edu>
	Thu, 6 May 2021 16:42:55 +0000 (12:42 -0400)
committer	Huber, Joseph <huberjn@ornl.gov>
	Fri, 7 May 2021 14:27:02 +0000 (10:27 -0400)
openmp/libomptarget/plugins/common/MemoryManager/MemoryManager.h		patch \| blob \| history
openmp/libomptarget/plugins/cuda/src/rtl.cpp		patch \| blob \| history
openmp/libomptarget/test/api/omp_device_managed_memory.c	[new file with mode: 0644]	patch \| blob
openmp/libomptarget/test/api/omp_host_pinned_memory.c	[new file with mode: 0644]	patch \| blob