*EntriesEnd; // End of the table with all the entries (non inclusive)
};
+/// This struct contains information exchanged between different asynchronous
+/// operations for device-dependent optimization and potential synchronization
+struct __tgt_async_info {
+ // A pointer to a queue-like structure where offloading operations are issued.
+ // We assume to use this structure to do synchronization. In CUDA backend, it
+ // is CUstream.
+ void *Queue = nullptr;
+};
+
#ifdef __cplusplus
extern "C" {
#endif
// case an error occurred on the target device.
void *__tgt_rtl_data_alloc(int32_t ID, int64_t Size, void *HostPtr);
-// Pass the data content to the target device using the target address.
-// In case of success, return zero. Otherwise, return an error code.
+// Pass the data content to the target device using the target address. If
+// AsyncInfoPtr is nullptr, it is synchronous; otherwise it is asynchronous.
+// However, AsyncInfoPtr may be ignored on some platforms, like x86_64. In that
+// case, it is synchronous. In case of success, return zero. Otherwise, return
+// an error code.
int32_t __tgt_rtl_data_submit(int32_t ID, void *TargetPtr, void *HostPtr,
- int64_t Size);
+ int64_t Size, __tgt_async_info *AsyncInfoPtr);
-// Retrieve the data content from the target device using its address.
-// In case of success, return zero. Otherwise, return an error code.
+// Retrieve the data content from the target device using its address. If
+// AsyncInfoPtr is nullptr, it is synchronous; otherwise it is asynchronous.
+// However, AsyncInfoPtr may be ignored on some platforms, like x86_64. In that
+// case, it is synchronous. In case of success, return zero. Otherwise, return
+// an error code.
int32_t __tgt_rtl_data_retrieve(int32_t ID, void *HostPtr, void *TargetPtr,
- int64_t Size);
+ int64_t Size, __tgt_async_info *AsyncInfoPtr);
// De-allocate the data referenced by target ptr on the device. In case of
// success, return zero. Otherwise, return an error code.
// Transfer control to the offloaded entry Entry on the target device.
// Args and Offsets are arrays of NumArgs size of target addresses and
// offsets. An offset should be added to the target address before passing it
-// to the outlined function on device side. In case of success, return zero.
-// Otherwise, return an error code.
+// to the outlined function on device side. If AsyncInfoPtr is nullptr, it is
+// synchronous; otherwise it is asynchronous. However, AsyncInfoPtr may be
+// ignored on some platforms, like x86_64. In that case, it is synchronous. In
+// case of success, return zero. Otherwise, return an error code.
int32_t __tgt_rtl_run_target_region(int32_t ID, void *Entry, void **Args,
- ptrdiff_t *Offsets, int32_t NumArgs);
+ ptrdiff_t *Offsets, int32_t NumArgs,
+ __tgt_async_info *AsyncInfoPtr);
// Similar to __tgt_rtl_run_target_region, but additionally specify the
-// number of teams to be created and a number of threads in each team.
+// number of teams to be created and a number of threads in each team. If
+// AsyncInfoPtr is nullptr, it is synchronous; otherwise it is asynchronous.
+// However, AsyncInfoPtr may be ignored on some platforms, like x86_64. In
+// that case, it is synchronous. Return zero on success, or an error code.
int32_t __tgt_rtl_run_target_team_region(int32_t ID, void *Entry, void **Args,
ptrdiff_t *Offsets, int32_t NumArgs,
int32_t NumTeams, int32_t ThreadLimit,
- uint64_t loop_tripcount);
+ uint64_t loop_tripcount,
+ __tgt_async_info *AsyncInfoPtr);
+
+// Device synchronization. In case of success, return zero. Otherwise, return an
+// error code.
+int32_t __tgt_rtl_synchronize(int32_t ID, __tgt_async_info *AsyncInfoPtr);
#ifdef __cplusplus
}
static RTLDeviceInfoTy DeviceInfo;
+namespace {
+CUstream selectStream(int32_t Id, __tgt_async_info *AsyncInfo) {
+ if (!AsyncInfo)
+ return DeviceInfo.getNextStream(Id);
+
+ if (!AsyncInfo->Queue)
+ AsyncInfo->Queue = DeviceInfo.getNextStream(Id);
+
+ return reinterpret_cast<CUstream>(AsyncInfo->Queue);
+}
+
+int32_t dataRetrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr, int64_t Size,
+ __tgt_async_info *AsyncInfoPtr) {
+ assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");
+ // Set the context we are using.
+ CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[DeviceId]);
+ if (err != CUDA_SUCCESS) {
+ DP("Error when setting CUDA context\n");
+ CUDA_ERR_STRING(err);
+ return OFFLOAD_FAIL;
+ }
+
+ CUstream Stream = selectStream(DeviceId, AsyncInfoPtr);
+
+ err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream);
+ if (err != CUDA_SUCCESS) {
+ DP("Error when copying data from device to host. Pointers: host = " DPxMOD
+ ", device = " DPxMOD ", size = %" PRId64 "\n",
+ DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
+ CUDA_ERR_STRING(err);
+ return OFFLOAD_FAIL;
+ }
+
+ return OFFLOAD_SUCCESS;
+}
+
+int32_t dataSubmit(int32_t DeviceId, void *TgtPtr, void *HstPtr, int64_t Size,
+ __tgt_async_info *AsyncInfoPtr) {
+ assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");
+ // Set the context we are using.
+ CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[DeviceId]);
+ if (err != CUDA_SUCCESS) {
+ DP("Error when setting CUDA context\n");
+ CUDA_ERR_STRING(err);
+ return OFFLOAD_FAIL;
+ }
+
+ CUstream Stream = selectStream(DeviceId, AsyncInfoPtr);
+
+ err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream);
+ if (err != CUDA_SUCCESS) {
+ DP("Error when copying data from host to device. Pointers: host = " DPxMOD
+ ", device = " DPxMOD ", size = %" PRId64 "\n",
+ DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
+ CUDA_ERR_STRING(err);
+ return OFFLOAD_FAIL;
+ }
+
+ return OFFLOAD_SUCCESS;
+}
+} // namespace
+
#ifdef __cplusplus
extern "C" {
#endif
}
int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
- int64_t size) {
- // Set the context we are using.
- CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
- if (err != CUDA_SUCCESS) {
- DP("Error when setting CUDA context\n");
- CUDA_ERR_STRING(err);
+ int64_t size, __tgt_async_info *async_info_ptr) {
+ // The function dataSubmit is always asynchronous. Considering some data
+ // transfer must be synchronous, we assume if async_info_ptr is nullptr, the
+ // transfer will be synchronous by creating a temporary async info and then
+  // synchronizing after calling dataSubmit; otherwise, it is asynchronous.
+ if (async_info_ptr)
+ return dataSubmit(device_id, tgt_ptr, hst_ptr, size, async_info_ptr);
+
+ __tgt_async_info async_info;
+ int32_t rc = dataSubmit(device_id, tgt_ptr, hst_ptr, size, &async_info);
+ if (rc != OFFLOAD_SUCCESS)
return OFFLOAD_FAIL;
- }
-
- CUstream &Stream = DeviceInfo.getNextStream(device_id);
-
- err = cuMemcpyHtoDAsync((CUdeviceptr)tgt_ptr, hst_ptr, size, Stream);
- if (err != CUDA_SUCCESS) {
- DP("Error when copying data from host to device. Pointers: host = " DPxMOD
- ", device = " DPxMOD ", size = %" PRId64 "\n",
- DPxPTR(hst_ptr), DPxPTR(tgt_ptr), size);
- CUDA_ERR_STRING(err);
- return OFFLOAD_FAIL;
- }
- err = cuStreamSynchronize(Stream);
- if (err != CUDA_SUCCESS) {
- DP("Error when synchronizing async data transfer from host to device. "
- "Pointers: host = " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
- DPxPTR(hst_ptr), DPxPTR(tgt_ptr), size);
- CUDA_ERR_STRING(err);
- return OFFLOAD_FAIL;
- }
-
- return OFFLOAD_SUCCESS;
+ return __tgt_rtl_synchronize(device_id, &async_info);
}
int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr,
- int64_t size) {
- // Set the context we are using.
- CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
- if (err != CUDA_SUCCESS) {
- DP("Error when setting CUDA context\n");
- CUDA_ERR_STRING(err);
+ int64_t size,
+ __tgt_async_info *async_info_ptr) {
+ // The function dataRetrieve is always asynchronous. Considering some data
+ // transfer must be synchronous, we assume if async_info_ptr is nullptr, the
+ // transfer will be synchronous by creating a temporary async info and then
+  // synchronizing after calling dataRetrieve; otherwise, it is asynchronous.
+ if (async_info_ptr)
+ return dataRetrieve(device_id, hst_ptr, tgt_ptr, size, async_info_ptr);
+
+ __tgt_async_info async_info;
+ int32_t rc = dataRetrieve(device_id, hst_ptr, tgt_ptr, size, &async_info);
+ if (rc != OFFLOAD_SUCCESS)
return OFFLOAD_FAIL;
- }
- CUstream &Stream = DeviceInfo.getNextStream(device_id);
-
- err = cuMemcpyDtoHAsync(hst_ptr, (CUdeviceptr)tgt_ptr, size, Stream);
- if (err != CUDA_SUCCESS) {
- DP("Error when copying data from device to host. Pointers: host = " DPxMOD
- ", device = " DPxMOD ", size = %" PRId64 "\n",
- DPxPTR(hst_ptr), DPxPTR(tgt_ptr), size);
- CUDA_ERR_STRING(err);
- return OFFLOAD_FAIL;
- }
-
- err = cuStreamSynchronize(Stream);
- if (err != CUDA_SUCCESS) {
- DP("Error when synchronizing async data transfer from device to host. "
- "Pointers: host = " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
- DPxPTR(hst_ptr), DPxPTR(tgt_ptr), size);
- CUDA_ERR_STRING(err);
- return OFFLOAD_FAIL;
- }
-
- return OFFLOAD_SUCCESS;
+ return __tgt_rtl_synchronize(device_id, &async_info);
}
int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
}
int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
- void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,
- int32_t thread_limit, uint64_t loop_tripcount) {
+ void **tgt_args,
+ ptrdiff_t *tgt_offsets,
+ int32_t arg_num, int32_t team_num,
+ int32_t thread_limit,
+ uint64_t loop_tripcount,
+ __tgt_async_info *async_info) {
// Set the context we are using.
CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
if (err != CUDA_SUCCESS) {
DP("Launch kernel with %d blocks and %d threads\n", cudaBlocksPerGrid,
cudaThreadsPerBlock);
- CUstream &Stream = DeviceInfo.getNextStream(device_id);
-
+ CUstream Stream = selectStream(device_id, async_info);
err = cuLaunchKernel(KernelInfo->Func, cudaBlocksPerGrid, 1, 1,
cudaThreadsPerBlock, 1, 1, 0 /*bytes of shared memory*/,
Stream, &args[0], 0);
DP("Launch of entry point at " DPxMOD " successful!\n",
DPxPTR(tgt_entry_ptr));
-    CUresult sync_err = cuStreamSynchronize(Stream);
-    if (sync_err != CUDA_SUCCESS) {
-      DP("Kernel execution error at " DPxMOD "!\n", DPxPTR(tgt_entry_ptr));
-      CUDA_ERR_STRING(sync_err);
-      return OFFLOAD_FAIL;
-    } else {
-      DP("Kernel execution at " DPxMOD " successful!\n", DPxPTR(tgt_entry_ptr));
-    }
+  // The interface contract says a null AsyncInfoPtr means the operation is
+  // synchronous, so block on the stream here when no async info was provided;
+  // otherwise the caller is responsible for calling __tgt_rtl_synchronize.
+  if (!async_info) {
+    CUresult SyncErr = cuStreamSynchronize(Stream);
+    if (SyncErr != CUDA_SUCCESS) {
+      DP("Kernel execution error at " DPxMOD "!\n", DPxPTR(tgt_entry_ptr));
+      CUDA_ERR_STRING(SyncErr);
+      return OFFLOAD_FAIL;
+    }
+  }
return OFFLOAD_SUCCESS;
}
int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
- void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num) {
+ void **tgt_args, ptrdiff_t *tgt_offsets,
+ int32_t arg_num,
+ __tgt_async_info *async_info) {
// use one team and the default number of threads.
const int32_t team_num = 1;
const int32_t thread_limit = 0;
return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args,
- tgt_offsets, arg_num, team_num, thread_limit, 0);
+ tgt_offsets, arg_num, team_num,
+ thread_limit, 0, async_info);
+}
+
+int32_t __tgt_rtl_synchronize(int32_t device_id, __tgt_async_info *async_info) {
+ assert(async_info && "async_info is nullptr");
+ assert(async_info->Queue && "async_info->Queue is nullptr");
+
+ CUstream Stream = reinterpret_cast<CUstream>(async_info->Queue);
+ CUresult Err = cuStreamSynchronize(Stream);
+ if (Err != CUDA_SUCCESS) {
+ DP("Error when synchronizing stream. stream = " DPxMOD
+ ", async info ptr = " DPxMOD "\n",
+ DPxPTR(Stream), DPxPTR(async_info));
+ CUDA_ERR_STRING(Err);
+ return OFFLOAD_FAIL;
+ }
+ return OFFLOAD_SUCCESS;
}
#ifdef __cplusplus
__tgt_rtl_data_delete;
__tgt_rtl_run_target_team_region;
__tgt_rtl_run_target_region;
+ __tgt_rtl_synchronize;
local:
*;
};
}
int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
- int64_t size) {
+ int64_t size, __tgt_async_info *) {
memcpy(tgt_ptr, hst_ptr, size);
return OFFLOAD_SUCCESS;
}
int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr,
- int64_t size) {
+ int64_t size, __tgt_async_info *) {
memcpy(hst_ptr, tgt_ptr, size);
return OFFLOAD_SUCCESS;
}
return OFFLOAD_SUCCESS;
}
-int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
- void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,
- int32_t thread_limit, uint64_t loop_tripcount /*not used*/) {
+int32_t __tgt_rtl_run_target_team_region(
+ int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
+ ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,
+ int32_t thread_limit, uint64_t loop_tripcount /*not used*/,
+ __tgt_async_info *async_info /*not used*/) {
// ignore team num and thread limit.
// Use libffi to launch execution.
}
int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
- void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num) {
+ void **tgt_args, ptrdiff_t *tgt_offsets,
+ int32_t arg_num,
+ __tgt_async_info *async_info_ptr) {
// use one team and one thread.
return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args,
- tgt_offsets, arg_num, 1, 1, 0);
+ tgt_offsets, arg_num, 1, 1, 0,
+ async_info_ptr);
+}
+
+int32_t __tgt_rtl_synchronize(int32_t device_id,
+ __tgt_async_info *async_info_ptr) {
+ return OFFLOAD_SUCCESS;
}
#ifdef __cplusplus
} else if (src_device == omp_get_initial_device()) {
DP("copy from host to device\n");
DeviceTy& DstDev = Devices[dst_device];
- rc = DstDev.data_submit(dstAddr, srcAddr, length);
+ rc = DstDev.data_submit(dstAddr, srcAddr, length, nullptr);
} else if (dst_device == omp_get_initial_device()) {
DP("copy from device to host\n");
DeviceTy& SrcDev = Devices[src_device];
- rc = SrcDev.data_retrieve(dstAddr, srcAddr, length);
+ rc = SrcDev.data_retrieve(dstAddr, srcAddr, length, nullptr);
} else {
DP("copy from device to device\n");
void *buffer = malloc(length);
DeviceTy& SrcDev = Devices[src_device];
DeviceTy& DstDev = Devices[dst_device];
- rc = SrcDev.data_retrieve(buffer, srcAddr, length);
+ rc = SrcDev.data_retrieve(buffer, srcAddr, length, nullptr);
if (rc == OFFLOAD_SUCCESS)
- rc = DstDev.data_submit(dstAddr, buffer, length);
+ rc = DstDev.data_submit(dstAddr, buffer, length, nullptr);
free(buffer);
}
return rc;
}
-// Submit data to device.
+// Submit data to device.
int32_t DeviceTy::data_submit(void *TgtPtrBegin, void *HstPtrBegin,
- int64_t Size) {
- return RTL->data_submit(RTLDeviceID, TgtPtrBegin, HstPtrBegin, Size);
+ int64_t Size, __tgt_async_info *AsyncInfoPtr) {
+
+ return RTL->data_submit(RTLDeviceID, TgtPtrBegin, HstPtrBegin, Size,
+ AsyncInfoPtr);
}
-// Retrieve data from device.
+// Retrieve data from device.
int32_t DeviceTy::data_retrieve(void *HstPtrBegin, void *TgtPtrBegin,
- int64_t Size) {
- return RTL->data_retrieve(RTLDeviceID, HstPtrBegin, TgtPtrBegin, Size);
+ int64_t Size, __tgt_async_info *AsyncInfoPtr) {
+ return RTL->data_retrieve(RTLDeviceID, HstPtrBegin, TgtPtrBegin, Size,
+ AsyncInfoPtr);
}
// Run region on device
int32_t DeviceTy::run_region(void *TgtEntryPtr, void **TgtVarsPtr,
- ptrdiff_t *TgtOffsets, int32_t TgtVarsSize) {
+ ptrdiff_t *TgtOffsets, int32_t TgtVarsSize,
+ __tgt_async_info *AsyncInfo) {
return RTL->run_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, TgtOffsets,
- TgtVarsSize);
+ TgtVarsSize, AsyncInfo);
}
// Run team region on device.
int32_t DeviceTy::run_team_region(void *TgtEntryPtr, void **TgtVarsPtr,
- ptrdiff_t *TgtOffsets, int32_t TgtVarsSize, int32_t NumTeams,
- int32_t ThreadLimit, uint64_t LoopTripCount) {
+ ptrdiff_t *TgtOffsets, int32_t TgtVarsSize,
+ int32_t NumTeams, int32_t ThreadLimit,
+ uint64_t LoopTripCount,
+ __tgt_async_info *AsyncInfo) {
return RTL->run_team_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, TgtOffsets,
- TgtVarsSize, NumTeams, ThreadLimit, LoopTripCount);
+ TgtVarsSize, NumTeams, ThreadLimit, LoopTripCount,
+ AsyncInfo);
}
/// Check whether a device has an associated RTL and initialize it if it's not
struct RTLInfoTy;
struct __tgt_bin_desc;
struct __tgt_target_table;
+struct __tgt_async_info;
/// Map between host data and target data.
struct HostDataToTargetTy {
int32_t initOnce();
__tgt_target_table *load_binary(void *Img);
- int32_t data_submit(void *TgtPtrBegin, void *HstPtrBegin, int64_t Size);
- int32_t data_retrieve(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size);
+ // Asynchronous data transfer. When AsyncInfoPtr is nullptr, the transfer will
+ // be synchronous.
+ int32_t data_submit(void *TgtPtrBegin, void *HstPtrBegin, int64_t Size,
+ __tgt_async_info *AsyncInfoPtr);
+ int32_t data_retrieve(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size,
+ __tgt_async_info *AsyncInfoPtr);
int32_t run_region(void *TgtEntryPtr, void **TgtVarsPtr,
- ptrdiff_t *TgtOffsets, int32_t TgtVarsSize);
+ ptrdiff_t *TgtOffsets, int32_t TgtVarsSize,
+ __tgt_async_info *AsyncInfo);
int32_t run_team_region(void *TgtEntryPtr, void **TgtVarsPtr,
- ptrdiff_t *TgtOffsets, int32_t TgtVarsSize, int32_t NumTeams,
- int32_t ThreadLimit, uint64_t LoopTripCount);
+ ptrdiff_t *TgtOffsets, int32_t TgtVarsSize,
+ int32_t NumTeams, int32_t ThreadLimit,
+ uint64_t LoopTripCount, __tgt_async_info *AsyncInfo);
private:
// Call to RTL
}
/// Internal function to do the mapping and transfer the data to the device
-int target_data_begin(DeviceTy &Device, int32_t arg_num,
- void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) {
+int target_data_begin(DeviceTy &Device, int32_t arg_num, void **args_base,
+ void **args, int64_t *arg_sizes, int64_t *arg_types,
+ __tgt_async_info *async_info_ptr) {
// process each input.
for (int32_t i = 0; i < arg_num; ++i) {
// Ignore private variables and arrays - there is no mapping for them.
if (copy && !IsHostPtr) {
DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n",
- data_size, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin));
- int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, data_size);
+ data_size, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin));
+ int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, data_size,
+ async_info_ptr);
if (rt != OFFLOAD_SUCCESS) {
DP("Copying data to device failed.\n");
return OFFLOAD_FAIL;
uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
void *TgtPtrBase = (void *)((uint64_t)TgtPtrBegin - Delta);
int rt = Device.data_submit(Pointer_TgtPtrBegin, &TgtPtrBase,
- sizeof(void *));
+ sizeof(void *), async_info_ptr);
if (rt != OFFLOAD_SUCCESS) {
DP("Copying data to device failed.\n");
return OFFLOAD_FAIL;
/// Internal function to undo the mapping and retrieve the data from the device.
int target_data_end(DeviceTy &Device, int32_t arg_num, void **args_base,
- void **args, int64_t *arg_sizes, int64_t *arg_types) {
+ void **args, int64_t *arg_sizes, int64_t *arg_types,
+ __tgt_async_info *async_info_ptr) {
// process each input.
for (int32_t i = arg_num - 1; i >= 0; --i) {
// Ignore private variables and arrays - there is no mapping for them.
!(RTLs->RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY &&
TgtPtrBegin == HstPtrBegin)) {
DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
- data_size, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
- int rt = Device.data_retrieve(HstPtrBegin, TgtPtrBegin, data_size);
+ data_size, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
+ int rt = Device.data_retrieve(HstPtrBegin, TgtPtrBegin, data_size,
+ async_info_ptr);
if (rt != OFFLOAD_SUCCESS) {
DP("Copying data from device failed.\n");
return OFFLOAD_FAIL;
if (arg_types[i] & OMP_TGT_MAPTYPE_FROM) {
DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
arg_sizes[i], DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
- int rt = Device.data_retrieve(HstPtrBegin, TgtPtrBegin, MapSize);
+ int rt = Device.data_retrieve(HstPtrBegin, TgtPtrBegin, MapSize, nullptr);
if (rt != OFFLOAD_SUCCESS) {
DP("Copying data from device failed.\n");
return OFFLOAD_FAIL;
if (arg_types[i] & OMP_TGT_MAPTYPE_TO) {
DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n",
arg_sizes[i], DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin));
- int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, MapSize);
+ int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, MapSize, nullptr);
if (rt != OFFLOAD_SUCCESS) {
DP("Copying data to device failed.\n");
return OFFLOAD_FAIL;
"pointer " DPxMOD "\n", DPxPTR(it->second.TgtPtrVal),
DPxPTR(it->second.TgtPtrAddr));
rt = Device.data_submit(it->second.TgtPtrAddr,
- &it->second.TgtPtrVal, sizeof(void *));
+ &it->second.TgtPtrVal, sizeof(void *), nullptr);
if (rt != OFFLOAD_SUCCESS) {
DP("Copying data to device failed.\n");
Device.ShadowMtx.unlock();
TrlTblMtx->unlock();
assert(TargetTable && "Global data has not been mapped\n");
+ __tgt_async_info AsyncInfo;
+
// Move data to device.
int rc = target_data_begin(Device, arg_num, args_base, args, arg_sizes,
- arg_types);
+ arg_types, &AsyncInfo);
if (rc != OFFLOAD_SUCCESS) {
DP("Call to target_data_begin failed, abort target.\n");
return OFFLOAD_FAIL;
DP("Update lambda reference (" DPxMOD ") -> [" DPxMOD "]\n",
DPxPTR(Pointer_TgtPtrBegin), DPxPTR(TgtPtrBegin));
int rt = Device.data_submit(TgtPtrBegin, &Pointer_TgtPtrBegin,
- sizeof(void *));
+ sizeof(void *), &AsyncInfo);
if (rt != OFFLOAD_SUCCESS) {
DP("Copying data to device failed.\n");
return OFFLOAD_FAIL;
#endif
// If first-private, copy data from host
if (arg_types[i] & OMP_TGT_MAPTYPE_TO) {
- int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, arg_sizes[i]);
+ int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, arg_sizes[i],
+ &AsyncInfo);
if (rt != OFFLOAD_SUCCESS) {
- DP ("Copying data to device failed, failed.\n");
+ DP("Copying data to device failed, failed.\n");
return OFFLOAD_FAIL;
}
}
DPxPTR(TargetTable->EntriesBegin[TM->Index].addr), TM->Index);
if (IsTeamConstruct) {
rc = Device.run_team_region(TargetTable->EntriesBegin[TM->Index].addr,
- &tgt_args[0], &tgt_offsets[0], tgt_args.size(), team_num,
- thread_limit, ltc);
+ &tgt_args[0], &tgt_offsets[0], tgt_args.size(),
+ team_num, thread_limit, ltc, &AsyncInfo);
} else {
rc = Device.run_region(TargetTable->EntriesBegin[TM->Index].addr,
- &tgt_args[0], &tgt_offsets[0], tgt_args.size());
+ &tgt_args[0], &tgt_offsets[0], tgt_args.size(),
+ &AsyncInfo);
}
if (rc != OFFLOAD_SUCCESS) {
DP ("Executing target region abort target.\n");
// Move data from device.
int rt = target_data_end(Device, arg_num, args_base, args, arg_sizes,
- arg_types);
+ arg_types, &AsyncInfo);
if (rt != OFFLOAD_SUCCESS) {
DP("Call to target_data_end failed, abort targe.\n");
return OFFLOAD_FAIL;
}
- return OFFLOAD_SUCCESS;
+ return Device.RTL->synchronize(device_id, &AsyncInfo);
}
#include <cstdint>
extern int target_data_begin(DeviceTy &Device, int32_t arg_num,
- void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types);
+ void **args_base, void **args, int64_t *arg_sizes,
+ int64_t *arg_types,
+ __tgt_async_info *async_info_ptr = nullptr);
extern int target_data_end(DeviceTy &Device, int32_t arg_num, void **args_base,
- void **args, int64_t *arg_sizes, int64_t *arg_types);
+ void **args, int64_t *arg_sizes, int64_t *arg_types,
+ __tgt_async_info *async_info_ptr = nullptr);
extern int target_data_update(DeviceTy &Device, int32_t arg_num,
void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types);
if (!(*((void**) &R.run_team_region) = dlsym(
dynlib_handle, "__tgt_rtl_run_target_team_region")))
continue;
+ if (!(*((void**) &R.synchronize) = dlsym(
+ dynlib_handle, "__tgt_rtl_synchronize")))
+ continue;
// Optional functions
*((void**) &R.init_requires) = dlsym(
typedef int32_t(init_device_ty)(int32_t);
typedef __tgt_target_table *(load_binary_ty)(int32_t, void *);
typedef void *(data_alloc_ty)(int32_t, int64_t, void *);
- typedef int32_t(data_submit_ty)(int32_t, void *, void *, int64_t);
- typedef int32_t(data_retrieve_ty)(int32_t, void *, void *, int64_t);
+ typedef int32_t(data_submit_ty)(int32_t, void *, void *, int64_t,
+ __tgt_async_info *);
+ typedef int32_t(data_retrieve_ty)(int32_t, void *, void *, int64_t,
+ __tgt_async_info *);
typedef int32_t(data_delete_ty)(int32_t, void *);
- typedef int32_t(run_region_ty)(int32_t, void *, void **, ptrdiff_t *,
- int32_t);
+ typedef int32_t(run_region_ty)(int32_t, void *, void **, ptrdiff_t *, int32_t,
+ __tgt_async_info *);
typedef int32_t(run_team_region_ty)(int32_t, void *, void **, ptrdiff_t *,
- int32_t, int32_t, int32_t, uint64_t);
+ int32_t, int32_t, int32_t, uint64_t,
+ __tgt_async_info *);
typedef int64_t(init_requires_ty)(int64_t);
+  typedef int32_t(synchronize_ty)(int32_t, __tgt_async_info *);
int32_t Idx = -1; // RTL index, index is the number of devices
// of other RTLs that were registered before,
run_region_ty *run_region = nullptr;
run_team_region_ty *run_team_region = nullptr;
init_requires_ty *init_requires = nullptr;
+ synchronize_ty *synchronize = nullptr;
// Are there images associated with this RTL.
bool isUsed = false;
run_team_region = r.run_team_region;
init_requires = r.init_requires;
isUsed = r.isUsed;
+ synchronize = r.synchronize;
}
};