From: Shilei Tian Date: Tue, 7 Apr 2020 18:51:56 +0000 (-0400) Subject: [OpenMP] Optimized stream selection by scheduling data mapping for the same target... X-Git-Tag: llvmorg-12-init~9773 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=32ed29271fd8c56abee8616e5a16a3c9e58f4741;p=platform%2Fupstream%2Fllvm.git [OpenMP] Optimized stream selection by scheduling data mapping for the same target region into a same stream Summary: This patch introduces two things for offloading: 1. Asynchronous data transfer: the corresponding functions are suffixed with `_async`. They take one more argument than their synchronous counterparts: `__tgt_async_info*`, a pointer to a new struct that has only one field, `void *Identifier`. This struct is for exchanging information between different asynchronous operations. It can be used for stream selection, as in this case, or for operation synchronization, which is also done here. We may expect more uses in the future. 2. Optimization of stream selection for data mapping. The previous implementation used asynchronous device memory transfers but synchronized after each one. However, if, say, kernel A needs four memory copies to the device and two memory copies back to the host, then we can schedule these seven operations (four H2D, two D2H, and one kernel launch) into the same stream and only need to synchronize after the memory copies from device to host. In this way, we avoid a large amount of overhead compared with synchronizing after each operation. 
Reviewers: jdoerfert, ye-luo Reviewed By: jdoerfert Subscribers: yaxunl, lildmh, guansong, openmp-commits Tags: #openmp Differential Revision: https://reviews.llvm.org/D77005 --- diff --git a/openmp/libomptarget/include/omptarget.h b/openmp/libomptarget/include/omptarget.h index 826d8ed..de3afc3 100644 --- a/openmp/libomptarget/include/omptarget.h +++ b/openmp/libomptarget/include/omptarget.h @@ -111,6 +111,15 @@ struct __tgt_target_table { *EntriesEnd; // End of the table with all the entries (non inclusive) }; +/// This struct contains information exchanged between different asynchronous +/// operations for device-dependent optimization and potential synchronization +struct __tgt_async_info { + // A pointer to a queue-like structure where offloading operations are issued. + // We assume to use this structure to do synchronization. In CUDA backend, it + // is CUstream. + void *Queue = nullptr; +}; + #ifdef __cplusplus extern "C" { #endif diff --git a/openmp/libomptarget/include/omptargetplugin.h b/openmp/libomptarget/include/omptargetplugin.h index e03416c..b330c19 100644 --- a/openmp/libomptarget/include/omptargetplugin.h +++ b/openmp/libomptarget/include/omptargetplugin.h @@ -58,15 +58,21 @@ __tgt_target_table *__tgt_rtl_load_binary(int32_t ID, // case an error occurred on the target device. void *__tgt_rtl_data_alloc(int32_t ID, int64_t Size, void *HostPtr); -// Pass the data content to the target device using the target address. -// In case of success, return zero. Otherwise, return an error code. +// Pass the data content to the target device using the target address. If +// AsyncInfoPtr is nullptr, it is synchronous; otherwise it is asynchronous. +// However, AsyncInfoPtr may be ignored on some platforms, like x86_64. In that +// case, it is synchronous. In case of success, return zero. Otherwise, return +// an error code. 
int32_t __tgt_rtl_data_submit(int32_t ID, void *TargetPtr, void *HostPtr, - int64_t Size); + int64_t Size, __tgt_async_info *AsyncInfoPtr); -// Retrieve the data content from the target device using its address. -// In case of success, return zero. Otherwise, return an error code. +// Retrieve the data content from the target device using its address. If +// AsyncInfoPtr is nullptr, it is synchronous; otherwise it is asynchronous. +// However, AsyncInfoPtr may be ignored on some platforms, like x86_64. In that +// case, it is synchronous. In case of success, return zero. Otherwise, return +// an error code. int32_t __tgt_rtl_data_retrieve(int32_t ID, void *HostPtr, void *TargetPtr, - int64_t Size); + int64_t Size, __tgt_async_info *AsyncInfoPtr); // De-allocate the data referenced by target ptr on the device. In case of // success, return zero. Otherwise, return an error code. @@ -75,17 +81,28 @@ int32_t __tgt_rtl_data_delete(int32_t ID, void *TargetPtr); // Transfer control to the offloaded entry Entry on the target device. // Args and Offsets are arrays of NumArgs size of target addresses and // offsets. An offset should be added to the target address before passing it -// to the outlined function on device side. In case of success, return zero. -// Otherwise, return an error code. +// to the outlined function on device side. If AsyncInfoPtr is nullptr, it is +// synchronous; otherwise it is asynchronous. However, AsyncInfoPtr may be +// ignored on some platforms, like x86_64. In that case, it is synchronous. In +// case of success, return zero. Otherwise, return an error code. int32_t __tgt_rtl_run_target_region(int32_t ID, void *Entry, void **Args, - ptrdiff_t *Offsets, int32_t NumArgs); + ptrdiff_t *Offsets, int32_t NumArgs, + __tgt_async_info *AsyncInfoPtr); // Similar to __tgt_rtl_run_target_region, but additionally specify the -// number of teams to be created and a number of threads in each team. 
+// number of teams to be created and a number of threads in each team. If +// AsyncInfoPtr is nullptr, it is synchronous; otherwise it is asynchronous. +// However, AsyncInfoPtr may be ignored on some platforms, like x86_64. In that +// case, it is synchronous. int32_t __tgt_rtl_run_target_team_region(int32_t ID, void *Entry, void **Args, ptrdiff_t *Offsets, int32_t NumArgs, int32_t NumTeams, int32_t ThreadLimit, - uint64_t loop_tripcount); + uint64_t loop_tripcount, + __tgt_async_info *AsyncInfoPtr); + +// Device synchronization. In case of success, return zero. Otherwise, return an +// error code. +int32_t __tgt_rtl_synchronize(int32_t ID, __tgt_async_info *AsyncInfoPtr); #ifdef __cplusplus } diff --git a/openmp/libomptarget/plugins/cuda/src/rtl.cpp b/openmp/libomptarget/plugins/cuda/src/rtl.cpp index 54248da..c0fb87b 100644 --- a/openmp/libomptarget/plugins/cuda/src/rtl.cpp +++ b/openmp/libomptarget/plugins/cuda/src/rtl.cpp @@ -309,6 +309,68 @@ public: static RTLDeviceInfoTy DeviceInfo; +namespace { +CUstream selectStream(int32_t Id, __tgt_async_info *AsyncInfo) { + if (!AsyncInfo) + return DeviceInfo.getNextStream(Id); + + if (!AsyncInfo->Queue) + AsyncInfo->Queue = DeviceInfo.getNextStream(Id); + + return reinterpret_cast(AsyncInfo->Queue); +} + +int32_t dataRetrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr, int64_t Size, + __tgt_async_info *AsyncInfoPtr) { + assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr"); + // Set the context we are using. + CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[DeviceId]); + if (err != CUDA_SUCCESS) { + DP("Error when setting CUDA context\n"); + CUDA_ERR_STRING(err); + return OFFLOAD_FAIL; + } + + CUstream Stream = selectStream(DeviceId, AsyncInfoPtr); + + err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream); + if (err != CUDA_SUCCESS) { + DP("Error when copying data from device to host. 
Pointers: host = " DPxMOD + ", device = " DPxMOD ", size = %" PRId64 "\n", + DPxPTR(HstPtr), DPxPTR(TgtPtr), Size); + CUDA_ERR_STRING(err); + return OFFLOAD_FAIL; + } + + return OFFLOAD_SUCCESS; +} + +int32_t dataSubmit(int32_t DeviceId, void *TgtPtr, void *HstPtr, int64_t Size, + __tgt_async_info *AsyncInfoPtr) { + assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr"); + // Set the context we are using. + CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[DeviceId]); + if (err != CUDA_SUCCESS) { + DP("Error when setting CUDA context\n"); + CUDA_ERR_STRING(err); + return OFFLOAD_FAIL; + } + + CUstream Stream = selectStream(DeviceId, AsyncInfoPtr); + + err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream); + if (err != CUDA_SUCCESS) { + DP("Error when copying data from host to device. Pointers: host = " DPxMOD + ", device = " DPxMOD ", size = %" PRId64 "\n", + DPxPTR(HstPtr), DPxPTR(TgtPtr), Size); + CUDA_ERR_STRING(err); + return OFFLOAD_FAIL; + } + + return OFFLOAD_SUCCESS; +} +} // namespace + #ifdef __cplusplus extern "C" { #endif @@ -663,69 +725,38 @@ void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *hst_ptr) { } int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr, - int64_t size) { - // Set the context we are using. - CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]); - if (err != CUDA_SUCCESS) { - DP("Error when setting CUDA context\n"); - CUDA_ERR_STRING(err); + int64_t size, __tgt_async_info *async_info_ptr) { + // The function dataSubmit is always asynchronous. Considering some data + // transfer must be synchronous, we assume if async_info_ptr is nullptr, the + // transfer will be synchronous by creating a temporary async info and then + // synchronizing after call dataSubmit; otherwise, it is asynchronous. 
+ if (async_info_ptr) + return dataSubmit(device_id, tgt_ptr, hst_ptr, size, async_info_ptr); + + __tgt_async_info async_info; + int32_t rc = dataSubmit(device_id, tgt_ptr, hst_ptr, size, &async_info); + if (rc != OFFLOAD_SUCCESS) return OFFLOAD_FAIL; - } - - CUstream &Stream = DeviceInfo.getNextStream(device_id); - - err = cuMemcpyHtoDAsync((CUdeviceptr)tgt_ptr, hst_ptr, size, Stream); - if (err != CUDA_SUCCESS) { - DP("Error when copying data from host to device. Pointers: host = " DPxMOD - ", device = " DPxMOD ", size = %" PRId64 "\n", - DPxPTR(hst_ptr), DPxPTR(tgt_ptr), size); - CUDA_ERR_STRING(err); - return OFFLOAD_FAIL; - } - err = cuStreamSynchronize(Stream); - if (err != CUDA_SUCCESS) { - DP("Error when synchronizing async data transfer from host to device. " - "Pointers: host = " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n", - DPxPTR(hst_ptr), DPxPTR(tgt_ptr), size); - CUDA_ERR_STRING(err); - return OFFLOAD_FAIL; - } - - return OFFLOAD_SUCCESS; + return __tgt_rtl_synchronize(device_id, &async_info); } int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr, - int64_t size) { - // Set the context we are using. - CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]); - if (err != CUDA_SUCCESS) { - DP("Error when setting CUDA context\n"); - CUDA_ERR_STRING(err); + int64_t size, + __tgt_async_info *async_info_ptr) { + // The function dataRetrieve is always asynchronous. Considering some data + // transfer must be synchronous, we assume if async_info_ptr is nullptr, the + // transfer will be synchronous by creating a temporary async info and then + // synchronizing after call dataRetrieve; otherwise, it is asynchronous. 
+ if (async_info_ptr) + return dataRetrieve(device_id, hst_ptr, tgt_ptr, size, async_info_ptr); + + __tgt_async_info async_info; + int32_t rc = dataRetrieve(device_id, hst_ptr, tgt_ptr, size, &async_info); + if (rc != OFFLOAD_SUCCESS) return OFFLOAD_FAIL; - } - CUstream &Stream = DeviceInfo.getNextStream(device_id); - - err = cuMemcpyDtoHAsync(hst_ptr, (CUdeviceptr)tgt_ptr, size, Stream); - if (err != CUDA_SUCCESS) { - DP("Error when copying data from device to host. Pointers: host = " DPxMOD - ", device = " DPxMOD ", size = %" PRId64 "\n", - DPxPTR(hst_ptr), DPxPTR(tgt_ptr), size); - CUDA_ERR_STRING(err); - return OFFLOAD_FAIL; - } - - err = cuStreamSynchronize(Stream); - if (err != CUDA_SUCCESS) { - DP("Error when synchronizing async data transfer from device to host. " - "Pointers: host = " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n", - DPxPTR(hst_ptr), DPxPTR(tgt_ptr), size); - CUDA_ERR_STRING(err); - return OFFLOAD_FAIL; - } - - return OFFLOAD_SUCCESS; + return __tgt_rtl_synchronize(device_id, &async_info); } int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) { @@ -747,8 +778,12 @@ int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) { } int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr, - void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num, - int32_t thread_limit, uint64_t loop_tripcount) { + void **tgt_args, + ptrdiff_t *tgt_offsets, + int32_t arg_num, int32_t team_num, + int32_t thread_limit, + uint64_t loop_tripcount, + __tgt_async_info *async_info) { // Set the context we are using. 
CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]); if (err != CUDA_SUCCESS) { @@ -844,8 +879,7 @@ int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr, DP("Launch kernel with %d blocks and %d threads\n", cudaBlocksPerGrid, cudaThreadsPerBlock); - CUstream &Stream = DeviceInfo.getNextStream(device_id); - + CUstream Stream = selectStream(device_id, async_info); err = cuLaunchKernel(KernelInfo->Func, cudaBlocksPerGrid, 1, 1, cudaThreadsPerBlock, 1, 1, 0 /*bytes of shared memory*/, Stream, &args[0], 0); @@ -858,25 +892,35 @@ int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr, DP("Launch of entry point at " DPxMOD " successful!\n", DPxPTR(tgt_entry_ptr)); - CUresult sync_err = cuStreamSynchronize(Stream); - if (sync_err != CUDA_SUCCESS) { - DP("Kernel execution error at " DPxMOD "!\n", DPxPTR(tgt_entry_ptr)); - CUDA_ERR_STRING(sync_err); - return OFFLOAD_FAIL; - } else { - DP("Kernel execution at " DPxMOD " successful!\n", DPxPTR(tgt_entry_ptr)); - } - return OFFLOAD_SUCCESS; } int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr, - void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num) { + void **tgt_args, ptrdiff_t *tgt_offsets, + int32_t arg_num, + __tgt_async_info *async_info) { // use one team and the default number of threads. const int32_t team_num = 1; const int32_t thread_limit = 0; return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args, - tgt_offsets, arg_num, team_num, thread_limit, 0); + tgt_offsets, arg_num, team_num, + thread_limit, 0, async_info); +} + +int32_t __tgt_rtl_synchronize(int32_t device_id, __tgt_async_info *async_info) { + assert(async_info && "async_info is nullptr"); + assert(async_info->Queue && "async_info->Queue is nullptr"); + + CUstream Stream = reinterpret_cast(async_info->Queue); + CUresult Err = cuStreamSynchronize(Stream); + if (Err != CUDA_SUCCESS) { + DP("Error when synchronizing stream. 
stream = " DPxMOD + ", async info ptr = " DPxMOD "\n", + DPxPTR(Stream), DPxPTR(async_info)); + CUDA_ERR_STRING(Err); + return OFFLOAD_FAIL; + } + return OFFLOAD_SUCCESS; } #ifdef __cplusplus diff --git a/openmp/libomptarget/plugins/exports b/openmp/libomptarget/plugins/exports index a14bedf..cbbad6d 100644 --- a/openmp/libomptarget/plugins/exports +++ b/openmp/libomptarget/plugins/exports @@ -11,6 +11,7 @@ VERS1.0 { __tgt_rtl_data_delete; __tgt_rtl_run_target_team_region; __tgt_rtl_run_target_region; + __tgt_rtl_synchronize; local: *; }; diff --git a/openmp/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp b/openmp/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp index 00e58d8..84875f5 100644 --- a/openmp/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp +++ b/openmp/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp @@ -277,13 +277,13 @@ void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *hst_ptr) { } int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr, - int64_t size) { + int64_t size, __tgt_async_info *) { memcpy(tgt_ptr, hst_ptr, size); return OFFLOAD_SUCCESS; } int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr, - int64_t size) { + int64_t size, __tgt_async_info *) { memcpy(hst_ptr, tgt_ptr, size); return OFFLOAD_SUCCESS; } @@ -293,9 +293,11 @@ int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) { return OFFLOAD_SUCCESS; } -int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr, - void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num, - int32_t thread_limit, uint64_t loop_tripcount /*not used*/) { +int32_t __tgt_rtl_run_target_team_region( + int32_t device_id, void *tgt_entry_ptr, void **tgt_args, + ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num, + int32_t thread_limit, uint64_t loop_tripcount /*not used*/, + __tgt_async_info *async_info /*not used*/) { // ignore team num and thread limit. 
// Use libffi to launch execution. @@ -328,10 +330,18 @@ int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr, } int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr, - void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num) { + void **tgt_args, ptrdiff_t *tgt_offsets, + int32_t arg_num, + __tgt_async_info *async_info_ptr) { // use one team and one thread. return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args, - tgt_offsets, arg_num, 1, 1, 0); + tgt_offsets, arg_num, 1, 1, 0, + async_info_ptr); +} + +int32_t __tgt_rtl_synchronize(int32_t device_id, + __tgt_async_info *async_info_ptr) { + return OFFLOAD_SUCCESS; } #ifdef __cplusplus diff --git a/openmp/libomptarget/src/api.cpp b/openmp/libomptarget/src/api.cpp index cabe639..3c7b709 100644 --- a/openmp/libomptarget/src/api.cpp +++ b/openmp/libomptarget/src/api.cpp @@ -161,19 +161,19 @@ EXTERN int omp_target_memcpy(void *dst, void *src, size_t length, } else if (src_device == omp_get_initial_device()) { DP("copy from host to device\n"); DeviceTy& DstDev = Devices[dst_device]; - rc = DstDev.data_submit(dstAddr, srcAddr, length); + rc = DstDev.data_submit(dstAddr, srcAddr, length, nullptr); } else if (dst_device == omp_get_initial_device()) { DP("copy from device to host\n"); DeviceTy& SrcDev = Devices[src_device]; - rc = SrcDev.data_retrieve(dstAddr, srcAddr, length); + rc = SrcDev.data_retrieve(dstAddr, srcAddr, length, nullptr); } else { DP("copy from device to device\n"); void *buffer = malloc(length); DeviceTy& SrcDev = Devices[src_device]; DeviceTy& DstDev = Devices[dst_device]; - rc = SrcDev.data_retrieve(buffer, srcAddr, length); + rc = SrcDev.data_retrieve(buffer, srcAddr, length, nullptr); if (rc == OFFLOAD_SUCCESS) - rc = DstDev.data_submit(dstAddr, buffer, length); + rc = DstDev.data_submit(dstAddr, buffer, length, nullptr); free(buffer); } diff --git a/openmp/libomptarget/src/device.cpp b/openmp/libomptarget/src/device.cpp index 
e215a5d..09ddcce 100644 --- a/openmp/libomptarget/src/device.cpp +++ b/openmp/libomptarget/src/device.cpp @@ -331,31 +331,38 @@ __tgt_target_table *DeviceTy::load_binary(void *Img) { return rc; } -// Submit data to device. +// Submit data to device int32_t DeviceTy::data_submit(void *TgtPtrBegin, void *HstPtrBegin, - int64_t Size) { - return RTL->data_submit(RTLDeviceID, TgtPtrBegin, HstPtrBegin, Size); + int64_t Size, __tgt_async_info *AsyncInfoPtr) { + + return RTL->data_submit(RTLDeviceID, TgtPtrBegin, HstPtrBegin, Size, + AsyncInfoPtr); } -// Retrieve data from device. +// Retrieve data from device int32_t DeviceTy::data_retrieve(void *HstPtrBegin, void *TgtPtrBegin, - int64_t Size) { - return RTL->data_retrieve(RTLDeviceID, HstPtrBegin, TgtPtrBegin, Size); + int64_t Size, __tgt_async_info *AsyncInfoPtr) { + return RTL->data_retrieve(RTLDeviceID, HstPtrBegin, TgtPtrBegin, Size, + AsyncInfoPtr); } // Run region on device int32_t DeviceTy::run_region(void *TgtEntryPtr, void **TgtVarsPtr, - ptrdiff_t *TgtOffsets, int32_t TgtVarsSize) { + ptrdiff_t *TgtOffsets, int32_t TgtVarsSize, + __tgt_async_info *AsyncInfo) { return RTL->run_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, TgtOffsets, - TgtVarsSize); + TgtVarsSize, AsyncInfo); } // Run team region on device. 
int32_t DeviceTy::run_team_region(void *TgtEntryPtr, void **TgtVarsPtr, - ptrdiff_t *TgtOffsets, int32_t TgtVarsSize, int32_t NumTeams, - int32_t ThreadLimit, uint64_t LoopTripCount) { + ptrdiff_t *TgtOffsets, int32_t TgtVarsSize, + int32_t NumTeams, int32_t ThreadLimit, + uint64_t LoopTripCount, + __tgt_async_info *AsyncInfo) { return RTL->run_team_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, TgtOffsets, - TgtVarsSize, NumTeams, ThreadLimit, LoopTripCount); + TgtVarsSize, NumTeams, ThreadLimit, LoopTripCount, + AsyncInfo); } /// Check whether a device has an associated RTL and initialize it if it's not diff --git a/openmp/libomptarget/src/device.h b/openmp/libomptarget/src/device.h index 8379f0c..e44adaf 100644 --- a/openmp/libomptarget/src/device.h +++ b/openmp/libomptarget/src/device.h @@ -24,6 +24,7 @@ struct RTLInfoTy; struct __tgt_bin_desc; struct __tgt_target_table; +struct __tgt_async_info; /// Map between host data and target data. struct HostDataToTargetTy { @@ -173,14 +174,20 @@ struct DeviceTy { int32_t initOnce(); __tgt_target_table *load_binary(void *Img); - int32_t data_submit(void *TgtPtrBegin, void *HstPtrBegin, int64_t Size); - int32_t data_retrieve(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size); + // Asynchronous data transfer. When AsyncInfoPtr is nullptr, the transfer will + // be synchronous. 
+ int32_t data_submit(void *TgtPtrBegin, void *HstPtrBegin, int64_t Size, + __tgt_async_info *AsyncInfoPtr); + int32_t data_retrieve(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size, + __tgt_async_info *AsyncInfoPtr); int32_t run_region(void *TgtEntryPtr, void **TgtVarsPtr, - ptrdiff_t *TgtOffsets, int32_t TgtVarsSize); + ptrdiff_t *TgtOffsets, int32_t TgtVarsSize, + __tgt_async_info *AsyncInfo); int32_t run_team_region(void *TgtEntryPtr, void **TgtVarsPtr, - ptrdiff_t *TgtOffsets, int32_t TgtVarsSize, int32_t NumTeams, - int32_t ThreadLimit, uint64_t LoopTripCount); + ptrdiff_t *TgtOffsets, int32_t TgtVarsSize, + int32_t NumTeams, int32_t ThreadLimit, + uint64_t LoopTripCount, __tgt_async_info *AsyncInfo); private: // Call to RTL diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp index b84cc88..4517a89 100644 --- a/openmp/libomptarget/src/omptarget.cpp +++ b/openmp/libomptarget/src/omptarget.cpp @@ -215,8 +215,9 @@ static int32_t member_of(int64_t type) { } /// Internal function to do the mapping and transfer the data to the device -int target_data_begin(DeviceTy &Device, int32_t arg_num, - void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) { +int target_data_begin(DeviceTy &Device, int32_t arg_num, void **args_base, + void **args, int64_t *arg_sizes, int64_t *arg_types, + __tgt_async_info *async_info_ptr) { // process each input. for (int32_t i = 0; i < arg_num; ++i) { // Ignore private variables and arrays - there is no mapping for them. 
@@ -316,8 +317,9 @@ int target_data_begin(DeviceTy &Device, int32_t arg_num, if (copy && !IsHostPtr) { DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n", - data_size, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin)); - int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, data_size); + data_size, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin)); + int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, data_size, + async_info_ptr); if (rt != OFFLOAD_SUCCESS) { DP("Copying data to device failed.\n"); return OFFLOAD_FAIL; @@ -331,7 +333,7 @@ int target_data_begin(DeviceTy &Device, int32_t arg_num, uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase; void *TgtPtrBase = (void *)((uint64_t)TgtPtrBegin - Delta); int rt = Device.data_submit(Pointer_TgtPtrBegin, &TgtPtrBase, - sizeof(void *)); + sizeof(void *), async_info_ptr); if (rt != OFFLOAD_SUCCESS) { DP("Copying data to device failed.\n"); return OFFLOAD_FAIL; @@ -349,7 +351,8 @@ int target_data_begin(DeviceTy &Device, int32_t arg_num, /// Internal function to undo the mapping and retrieve the data from the device. int target_data_end(DeviceTy &Device, int32_t arg_num, void **args_base, - void **args, int64_t *arg_sizes, int64_t *arg_types) { + void **args, int64_t *arg_sizes, int64_t *arg_types, + __tgt_async_info *async_info_ptr) { // process each input. for (int32_t i = arg_num - 1; i >= 0; --i) { // Ignore private variables and arrays - there is no mapping for them. 
@@ -419,8 +422,9 @@ int target_data_end(DeviceTy &Device, int32_t arg_num, void **args_base, !(RTLs->RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && TgtPtrBegin == HstPtrBegin)) { DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n", - data_size, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin)); - int rt = Device.data_retrieve(HstPtrBegin, TgtPtrBegin, data_size); + data_size, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin)); + int rt = Device.data_retrieve(HstPtrBegin, TgtPtrBegin, data_size, + async_info_ptr); if (rt != OFFLOAD_SUCCESS) { DP("Copying data from device failed.\n"); return OFFLOAD_FAIL; @@ -509,7 +513,7 @@ int target_data_update(DeviceTy &Device, int32_t arg_num, if (arg_types[i] & OMP_TGT_MAPTYPE_FROM) { DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n", arg_sizes[i], DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin)); - int rt = Device.data_retrieve(HstPtrBegin, TgtPtrBegin, MapSize); + int rt = Device.data_retrieve(HstPtrBegin, TgtPtrBegin, MapSize, nullptr); if (rt != OFFLOAD_SUCCESS) { DP("Copying data from device failed.\n"); return OFFLOAD_FAIL; @@ -536,7 +540,7 @@ int target_data_update(DeviceTy &Device, int32_t arg_num, if (arg_types[i] & OMP_TGT_MAPTYPE_TO) { DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n", arg_sizes[i], DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin)); - int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, MapSize); + int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, MapSize, nullptr); if (rt != OFFLOAD_SUCCESS) { DP("Copying data to device failed.\n"); return OFFLOAD_FAIL; @@ -556,7 +560,7 @@ int target_data_update(DeviceTy &Device, int32_t arg_num, "pointer " DPxMOD "\n", DPxPTR(it->second.TgtPtrVal), DPxPTR(it->second.TgtPtrAddr)); rt = Device.data_submit(it->second.TgtPtrAddr, - &it->second.TgtPtrVal, sizeof(void *)); + &it->second.TgtPtrVal, sizeof(void *), nullptr); if (rt != OFFLOAD_SUCCESS) { DP("Copying data to device failed.\n"); Device.ShadowMtx.unlock(); @@ -638,9 +642,11 @@ int 
target(int64_t device_id, void *host_ptr, int32_t arg_num, TrlTblMtx->unlock(); assert(TargetTable && "Global data has not been mapped\n"); + __tgt_async_info AsyncInfo; + // Move data to device. int rc = target_data_begin(Device, arg_num, args_base, args, arg_sizes, - arg_types); + arg_types, &AsyncInfo); if (rc != OFFLOAD_SUCCESS) { DP("Call to target_data_begin failed, abort target.\n"); return OFFLOAD_FAIL; @@ -691,7 +697,7 @@ int target(int64_t device_id, void *host_ptr, int32_t arg_num, DP("Update lambda reference (" DPxMOD ") -> [" DPxMOD "]\n", DPxPTR(Pointer_TgtPtrBegin), DPxPTR(TgtPtrBegin)); int rt = Device.data_submit(TgtPtrBegin, &Pointer_TgtPtrBegin, - sizeof(void *)); + sizeof(void *), &AsyncInfo); if (rt != OFFLOAD_SUCCESS) { DP("Copying data to device failed.\n"); return OFFLOAD_FAIL; @@ -732,9 +738,10 @@ int target(int64_t device_id, void *host_ptr, int32_t arg_num, #endif // If first-private, copy data from host if (arg_types[i] & OMP_TGT_MAPTYPE_TO) { - int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, arg_sizes[i]); + int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, arg_sizes[i], + &AsyncInfo); if (rt != OFFLOAD_SUCCESS) { - DP ("Copying data to device failed, failed.\n"); + DP("Copying data to device failed, failed.\n"); return OFFLOAD_FAIL; } } @@ -780,11 +787,12 @@ int target(int64_t device_id, void *host_ptr, int32_t arg_num, DPxPTR(TargetTable->EntriesBegin[TM->Index].addr), TM->Index); if (IsTeamConstruct) { rc = Device.run_team_region(TargetTable->EntriesBegin[TM->Index].addr, - &tgt_args[0], &tgt_offsets[0], tgt_args.size(), team_num, - thread_limit, ltc); + &tgt_args[0], &tgt_offsets[0], tgt_args.size(), + team_num, thread_limit, ltc, &AsyncInfo); } else { rc = Device.run_region(TargetTable->EntriesBegin[TM->Index].addr, - &tgt_args[0], &tgt_offsets[0], tgt_args.size()); + &tgt_args[0], &tgt_offsets[0], tgt_args.size(), + &AsyncInfo); } if (rc != OFFLOAD_SUCCESS) { DP ("Executing target region abort target.\n"); @@ -802,11 
+810,11 @@ int target(int64_t device_id, void *host_ptr, int32_t arg_num, // Move data from device. int rt = target_data_end(Device, arg_num, args_base, args, arg_sizes, - arg_types); + arg_types, &AsyncInfo); if (rt != OFFLOAD_SUCCESS) { DP("Call to target_data_end failed, abort targe.\n"); return OFFLOAD_FAIL; } - return OFFLOAD_SUCCESS; + return Device.RTL->synchronize(device_id, &AsyncInfo); } diff --git a/openmp/libomptarget/src/private.h b/openmp/libomptarget/src/private.h index 7aaa295..6e6b39f 100644 --- a/openmp/libomptarget/src/private.h +++ b/openmp/libomptarget/src/private.h @@ -18,10 +18,13 @@ #include extern int target_data_begin(DeviceTy &Device, int32_t arg_num, - void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types); + void **args_base, void **args, int64_t *arg_sizes, + int64_t *arg_types, + __tgt_async_info *async_info_ptr = nullptr); extern int target_data_end(DeviceTy &Device, int32_t arg_num, void **args_base, - void **args, int64_t *arg_sizes, int64_t *arg_types); + void **args, int64_t *arg_sizes, int64_t *arg_types, + __tgt_async_info *async_info_ptr = nullptr); extern int target_data_update(DeviceTy &Device, int32_t arg_num, void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types); diff --git a/openmp/libomptarget/src/rtl.cpp b/openmp/libomptarget/src/rtl.cpp index 3b9efd6..ed0be2c 100644 --- a/openmp/libomptarget/src/rtl.cpp +++ b/openmp/libomptarget/src/rtl.cpp @@ -126,6 +126,9 @@ void RTLsTy::LoadRTLs() { if (!(*((void**) &R.run_team_region) = dlsym( dynlib_handle, "__tgt_rtl_run_target_team_region"))) continue; + if (!(*((void**) &R.synchronize) = dlsym( + dynlib_handle, "__tgt_rtl_synchronize"))) + continue; // Optional functions *((void**) &R.init_requires) = dlsym( diff --git a/openmp/libomptarget/src/rtl.h b/openmp/libomptarget/src/rtl.h index 439efec..846c89b 100644 --- a/openmp/libomptarget/src/rtl.h +++ b/openmp/libomptarget/src/rtl.h @@ -30,14 +30,18 @@ struct RTLInfoTy { typedef 
int32_t(init_device_ty)(int32_t); typedef __tgt_target_table *(load_binary_ty)(int32_t, void *); typedef void *(data_alloc_ty)(int32_t, int64_t, void *); - typedef int32_t(data_submit_ty)(int32_t, void *, void *, int64_t); - typedef int32_t(data_retrieve_ty)(int32_t, void *, void *, int64_t); + typedef int32_t(data_submit_ty)(int32_t, void *, void *, int64_t, + __tgt_async_info *); + typedef int32_t(data_retrieve_ty)(int32_t, void *, void *, int64_t, + __tgt_async_info *); typedef int32_t(data_delete_ty)(int32_t, void *); - typedef int32_t(run_region_ty)(int32_t, void *, void **, ptrdiff_t *, - int32_t); + typedef int32_t(run_region_ty)(int32_t, void *, void **, ptrdiff_t *, int32_t, + __tgt_async_info *); typedef int32_t(run_team_region_ty)(int32_t, void *, void **, ptrdiff_t *, - int32_t, int32_t, int32_t, uint64_t); + int32_t, int32_t, int32_t, uint64_t, + __tgt_async_info *); typedef int64_t(init_requires_ty)(int64_t); + typedef int64_t(synchronize_ty)(int64_t, __tgt_async_info *); int32_t Idx = -1; // RTL index, index is the number of devices // of other RTLs that were registered before, @@ -63,6 +67,7 @@ struct RTLInfoTy { run_region_ty *run_region = nullptr; run_team_region_ty *run_team_region = nullptr; init_requires_ty *init_requires = nullptr; + synchronize_ty *synchronize = nullptr; // Are there images associated with this RTL. bool isUsed = false; @@ -95,6 +100,7 @@ struct RTLInfoTy { run_team_region = r.run_team_region; init_requires = r.init_requires; isUsed = r.isUsed; + synchronize = r.synchronize; } };