From 8e4836b2a296e4e78cc86f52014c48d9ad5aaf1a Mon Sep 17 00:00:00 2001 From: "Joel E. Denny" Date: Wed, 1 Sep 2021 16:24:14 -0400 Subject: [PATCH] [OpenMP] Use IsHostPtr where needed for targetDataEnd As discussed in D105990, without this patch, `targetDataEnd` determines whether to transfer data or delete a device mapping (as opposed to assuming it's in shared memory) using two different conditions, each of which is broken for some cases: 1. `!(UNIFIED_SHARED_MEMORY && TgtPtrBegin == HstPtrBegin)`: The broken case is rare: the device and host might happen to use the same address for their mapped allocations. I don't know how to write a test that's likely to reveal this case, but this patch does fix it, as discussed below. 2. `!UNIFIED_SHARED_MEMORY || HasCloseModifier`: There are at least two broken cases: 1. The `close` modifier might have been specified on an `omp target enter data` but not the corresponding `omp target exit data`, which thus might falsely assume a mapping is in shared memory. The test `unified_shared_memory/close_enter_exit.c` already has a missing deletion as a result, and this patch adds a check for that. This patch also adds the new test `close_member.c` to reveal a missing transfer and deletion. 2. Use of discrete memory might have been forced by `omp_target_associate_ptr`, as in the test `unified_shared_memory/api.c`. In the current `targetDataEnd` implementation, this condition turns out not be used for this case: because the reference count is infinite, a transfer is possible only with an `always` modifier, and this condition is never used in that case. To ensure it's never used for that case in the future, this patch adds the test `unified_shared_memory/associate_ptr.c`. Fortunately, `DeviceTy::getTgtPtrBegin` already has a solution: it reports whether the allocation was found in shared memory via the variable `IsHostPtr`. After this patch, `HasCloseModifier` is no longer used in `targetDataEnd`, and I wonder if the `close` modifier is ever useful on an `omp target data end`. Reviewed By: grokos Differential Revision: https://reviews.llvm.org/D107925 --- openmp/libomptarget/src/device.cpp | 5 +-- openmp/libomptarget/src/device.h | 10 ++++- openmp/libomptarget/src/omptarget.cpp | 23 ++++------- .../test/unified_shared_memory/associate_ptr.c | 36 ++++++++++++++++++ .../test/unified_shared_memory/close_enter_exit.c | 14 ++++++- .../test/unified_shared_memory/close_member.c | 44 ++++++++++++++++++++++ 6 files changed, 109 insertions(+), 23 deletions(-) create mode 100644 openmp/libomptarget/test/unified_shared_memory/associate_ptr.c create mode 100644 openmp/libomptarget/test/unified_shared_memory/close_member.c diff --git a/openmp/libomptarget/src/device.cpp b/openmp/libomptarget/src/device.cpp index ff5b288..c70d9e8 100644 --- a/openmp/libomptarget/src/device.cpp +++ b/openmp/libomptarget/src/device.cpp @@ -387,10 +387,7 @@ void *DeviceTy::getTgtPtrBegin(void *HstPtrBegin, int64_t Size) { } int DeviceTy::deallocTgtPtr(void *HstPtrBegin, int64_t Size, - bool HasCloseModifier, bool HasHoldModifier) { - if (PM->RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && - !HasCloseModifier) - return OFFLOAD_SUCCESS; + bool HasHoldModifier) { // Check if the pointer is contained in any sub-nodes. int rc; DataMapMtx.lock(); diff --git a/openmp/libomptarget/src/device.h b/openmp/libomptarget/src/device.h index 78a8e27..ea87a4b 100644 --- a/openmp/libomptarget/src/device.h +++ b/openmp/libomptarget/src/device.h @@ -283,8 +283,14 @@ struct DeviceTy { bool UpdateRefCount, bool UseHoldRefCount, bool &IsHostPtr, bool MustContain = false, bool ForceDelete = false); - int deallocTgtPtr(void *TgtPtrBegin, int64_t Size, bool HasCloseModifier, - bool HasHoldModifier); + /// For the map entry for \p HstPtrBegin, decrement the reference count + /// specified by \p HasHoldModifier and, if the the total reference count is + /// then zero, deallocate the corresponding device storage and remove the map + /// entry. Return \c OFFLOAD_SUCCESS if the map entry existed, and return + /// \c OFFLOAD_FAIL if not. It is the caller's responsibility to skip calling + /// this function if the map entry is not expected to exist because + /// \p HstPtrBegin uses shared memory. + int deallocTgtPtr(void *HstPtrBegin, int64_t Size, bool HasHoldModifier); int associatePtr(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size); int disassociatePtr(void *HstPtrBegin); diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp index 086503a..380e37e 100644 --- a/openmp/libomptarget/src/omptarget.cpp +++ b/openmp/libomptarget/src/omptarget.cpp @@ -609,15 +609,11 @@ struct DeallocTgtPtrInfo { void *HstPtrBegin; /// Size of the data int64_t DataSize; - /// Whether it has \p close modifier - bool HasCloseModifier; /// Whether it has \p ompx_hold modifier bool HasHoldModifier; - DeallocTgtPtrInfo(void *HstPtr, int64_t Size, bool HasCloseModifier, - bool HasHoldModifier) - : HstPtrBegin(HstPtr), DataSize(Size), HasCloseModifier(HasCloseModifier), - HasHoldModifier(HasHoldModifier) {} + DeallocTgtPtrInfo(void *HstPtr, int64_t Size, bool HasHoldModifier) + : HstPtrBegin(HstPtr), DataSize(Size), HasHoldModifier(HasHoldModifier) {} }; } // namespace @@ -682,7 +678,6 @@ int targetDataEnd(ident_t *loc, DeviceTy &Device, int32_t ArgNum, (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) && !(FromMapper && I == 0); bool ForceDelete = ArgTypes[I] & OMP_TGT_MAPTYPE_DELETE; - bool HasCloseModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_CLOSE; bool HasPresentModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_PRESENT; bool HasHoldModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_OMPX_HOLD; @@ -743,15 +738,12 @@ int targetDataEnd(ident_t *loc, DeviceTy &Device, int32_t ArgNum, if (ArgTypes[I] & OMP_TGT_MAPTYPE_FROM) { bool Always = ArgTypes[I] & OMP_TGT_MAPTYPE_ALWAYS; bool CopyMember = false; - if (!(PM->RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) || - HasCloseModifier) { + if (!IsHostPtr) { if (IsLast) CopyMember = true; } - if ((DelEntry || Always || CopyMember) && - !(PM->RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && - TgtPtrBegin == HstPtrBegin)) { + if ((DelEntry || Always || CopyMember) && !IsHostPtr) { DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n", DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin)); Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, DataSize, @@ -805,9 +797,8 @@ int targetDataEnd(ident_t *loc, DeviceTy &Device, int32_t ArgNum, Device.ShadowMtx.unlock(); // Add pointer to the buffer for later deallocation - if (DelEntry) - DeallocTgtPtrs.emplace_back(HstPtrBegin, DataSize, HasCloseModifier, - HasHoldModifier); + if (DelEntry && !IsHostPtr) + DeallocTgtPtrs.emplace_back(HstPtrBegin, DataSize, HasHoldModifier); } } @@ -824,7 +815,7 @@ int targetDataEnd(ident_t *loc, DeviceTy &Device, int32_t ArgNum, if (FromMapperBase && FromMapperBase == Info.HstPtrBegin) continue; Ret = Device.deallocTgtPtr(Info.HstPtrBegin, Info.DataSize, - Info.HasCloseModifier, Info.HasHoldModifier); + Info.HasHoldModifier); if (Ret != OFFLOAD_SUCCESS) { REPORT("Deallocating data from device failed.\n"); return OFFLOAD_FAIL; diff --git a/openmp/libomptarget/test/unified_shared_memory/associate_ptr.c b/openmp/libomptarget/test/unified_shared_memory/associate_ptr.c new file mode 100644 index 0000000..7911046 --- /dev/null +++ b/openmp/libomptarget/test/unified_shared_memory/associate_ptr.c @@ -0,0 +1,36 @@ +// RUN: %libomptarget-compile-run-and-check-generic + +// REQUIRES: unified_shared_memory +// UNSUPPORTED: clang-6, clang-7, clang-8, clang-9 + +// Fails on amdgcn with error: GPU Memory Error +// XFAIL: amdgcn-amd-amdhsa + +#include +#include +#include + +#pragma omp requires unified_shared_memory + +int main(int argc, char *argv[]) { + int dev = omp_get_default_device(); + int x = 10; + int *x_dev = (int *)omp_target_alloc(sizeof x, dev); + assert(x_dev && "expected omp_target_alloc to succeed"); + int rc = omp_target_associate_ptr(&x, x_dev, sizeof x, 0, dev); + assert(!rc && "expected omp_target_associate_ptr to succeed"); + + // To determine whether x needs to be transfered, the runtime cannot simply + // check whether unified shared memory is enabled and the 'close' modifier is + // specified. It must check whether x was previously placed in device memory + // by, for example, omp_target_associate_ptr. + #pragma omp target map(always, tofrom: x) + x = 20; + + // CHECK: x=20 + printf("x=%d\n", x); + // CHECK: present: 1 + printf("present: %d\n", omp_target_is_present(&x, dev)); + + return 0; +} diff --git a/openmp/libomptarget/test/unified_shared_memory/close_enter_exit.c b/openmp/libomptarget/test/unified_shared_memory/close_enter_exit.c index 006fd39..e159ed8 100644 --- a/openmp/libomptarget/test/unified_shared_memory/close_enter_exit.c +++ b/openmp/libomptarget/test/unified_shared_memory/close_enter_exit.c @@ -17,6 +17,7 @@ int main(int argc, char *argv[]) { int fails; void *host_alloc = 0, *device_alloc = 0; int *a = (int *)malloc(N * sizeof(int)); + int dev = omp_get_default_device(); // Init for (int i = 0; i < N; ++i) { @@ -79,14 +80,25 @@ int main(int argc, char *argv[]) { #pragma omp target enter data map(close, to : a[ : N]) #pragma omp target map(from : device_alloc) - { device_alloc = &a[0]; } + { + device_alloc = &a[0]; + a[0] = 99; + } + // 'close' is missing, so the runtime must check whether s is actually in + // shared memory in order to determine whether to transfer data and delete the + // allocation. #pragma omp target exit data map(from : a[ : N]) // CHECK: a has been mapped to the device. if (device_alloc != host_alloc) printf("a has been mapped to the device.\n"); + // CHECK: a[0]=99 + // CHECK: a is present: 0 + printf("a[0]=%d\n", a[0]); + printf("a is present: %d\n", omp_target_is_present(a, dev)); + free(a); // CHECK: Done! diff --git a/openmp/libomptarget/test/unified_shared_memory/close_member.c b/openmp/libomptarget/test/unified_shared_memory/close_member.c new file mode 100644 index 0000000..1b23c4a --- /dev/null +++ b/openmp/libomptarget/test/unified_shared_memory/close_member.c @@ -0,0 +1,44 @@ +// RUN: %libomptarget-compile-run-and-check-generic + +// REQUIRES: unified_shared_memory +// UNSUPPORTED: clang-6, clang-7, clang-8, clang-9 + +// Fails on amdgcn with error: GPU Memory Error +// XFAIL: amdgcn-amd-amdhsa + +#include +#include + +#pragma omp requires unified_shared_memory + +struct S { + int x; + int y; +}; + +int main(int argc, char *argv[]) { + int dev = omp_get_default_device(); + struct S s = {10, 20}; + + #pragma omp target enter data map(close, to: s) + #pragma omp target map(alloc: s) + { + s.x = 11; + s.y = 21; + } + // To determine whether x needs to be transfered or deleted, the runtime + // cannot simply check whether unified shared memory is enabled and the + // 'close' modifier is specified. It must check whether x was previously + // placed in device memory by, for example, a 'close' modifier that isn't + // specified here. The following struct member case checks a special code + // path in the runtime implementation where members are transferred before + // deletion of the struct. + #pragma omp target exit data map(from: s.x, s.y) + + // CHECK: s.x=11, s.y=21 + printf("s.x=%d, s.y=%d\n", s.x, s.y); + // CHECK: present: 0 + printf("present: %d\n", omp_target_is_present(&s, dev)); + + return 0; +} -- 2.7.4