static cl::opt<bool> PrintOpenMPKernels("openmp-print-gpu-kernels",
cl::init(false), cl::Hidden);
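+// Enabled with, e.g.:
+//   opt -S -passes=openmpopt -openmp-hide-memory-transfer-latency in.ll
+// (mirrors the RUN line of the accompanying test; file name is illustrative).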
+static cl::opt<bool> HideMemoryTransferLatency(
+ "openmp-hide-memory-transfer-latency",
+ cl::desc("[WIP] Tries to hide the latency of host to device memory"
+ " transfers"),
+ cl::Hidden, cl::init(false));
+
STATISTIC(NumOpenMPRuntimeCallsDeduplicated,
"Number of OpenMP runtime calls deduplicated");
STATISTIC(NumOpenMPParallelRegionsDeleted,
Changed |= deduplicateRuntimeCalls();
Changed |= deleteParallelRegions();
+ if (HideMemoryTransferLatency)
+ Changed |= hideMemTransfersLatency();
return Changed;
}
return Changed;
}
+ /// Tries to hide the latency of runtime calls that involve host to
+ /// device memory transfers by splitting them into their "issue" and "wait"
+ /// versions. The "issue" is moved upwards as much as possible. The "wait" is
+ /// moved downwards as much as possible. The "issue" starts the memory
+ /// transfer asynchronously, returning a handle. The "wait" waits on that
+ /// handle for the memory transfer to finish.
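+ /// For example (a sketch; argument lists elided), the synchronous call
+ ///   call void @__tgt_target_data_begin_mapper(i64 -1, ...)
+ /// becomes
+ ///   %handle = call %struct.__tgt_async_info
+ ///       @__tgt_target_data_begin_mapper_issue(i64 -1, ...)
+ ///   ; ... code the transfer may be overlapped with ...
+ ///   call void @__tgt_target_data_begin_mapper_wait(
+ ///       i64 -1, %struct.__tgt_async_info %handle)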
+ bool hideMemTransfersLatency() {
+ auto &RFI = OMPInfoCache.RFIs[OMPRTL___tgt_target_data_begin_mapper];
+ bool Changed = false;
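+ // For every use of __tgt_target_data_begin_mapper in the SCC, try to
+ // split the (regular) call site into its "issue"/"wait" pair.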
+ auto SplitMemTransfers = [&](Use &U, Function &Decl) {
+ auto *RTCall = getCallIfRegularCall(U, &RFI);
+ if (!RTCall)
+ return false;
+
+ bool WasSplit = splitTargetDataBeginRTC(RTCall);
+ Changed |= WasSplit;
+ return WasSplit;
+ };
+ RFI.foreachUse(SCC, SplitMemTransfers);
+
+ return Changed;
+ }
+
+ /// Splits \p RuntimeCall into its "issue" and "wait" counterparts.
+ bool splitTargetDataBeginRTC(CallInst *RuntimeCall) {
+ auto &IRBuilder = OMPInfoCache.OMPBuilder;
+ // Add "issue" runtime call declaration:
+ // declare %struct.__tgt_async_info @__tgt_target_data_begin_mapper_issue(
+ //     i64, i32, i8**, i8**, i64*, i64*, i8**)
+ FunctionCallee IssueDecl = IRBuilder.getOrCreateRuntimeFunction(
+ M, OMPRTL___tgt_target_data_begin_mapper_issue);
+
+ // Replace the RuntimeCall call site with its asynchronous "issue" version,
+ // forwarding the original arguments.
+ SmallVector<Value *, 8> Args;
+ for (auto &Arg : RuntimeCall->args())
+ Args.push_back(Arg.get());
+
+ CallInst *IssueCallsite =
+ CallInst::Create(IssueDecl, Args, "handle", RuntimeCall);
+ RuntimeCall->eraseFromParent();
+
+ // Add "wait" runtime call declaration:
+ // declare void @__tgt_target_data_begin_mapper_wait(i64,
+ //     %struct.__tgt_async_info)
+ FunctionCallee WaitDecl = IRBuilder.getOrCreateRuntimeFunction(
+ M, OMPRTL___tgt_target_data_begin_mapper_wait);
+
+ // Add a call site to WaitDecl immediately after the "issue" call.
+ Value *WaitParams[2] = {
+ IssueCallsite->getArgOperand(0), // device_id.
+ IssueCallsite // returned handle.
+ };
+ CallInst::Create(WaitDecl, WaitParams, /*NameStr=*/"",
+ IssueCallsite->getNextNode());
+
+ return true;
+ }
+
static Value *combinedIdentStruct(Value *CurrentIdent, Value *NextIdent,
bool GlobalOnly, bool &SingleChoice) {
if (CurrentIdent == NextIdent)
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: -p --function-signature --scrub-attributes
-; RUN: opt -S -passes=openmpopt -aa-pipeline=basic-aa < %s | FileCheck %s
+; RUN: opt -S -passes=openmpopt -aa-pipeline=basic-aa -openmp-hide-memory-transfer-latency < %s | FileCheck %s
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
-; FIXME: This struct should be generated after splitting at least one of the runtime calls.
-; %struct.__tgt_async_info = type { i8* }
+; CHECK: %struct.__tgt_async_info = type { i8* }
%struct.ident_t = type { i32, i32, i32, i32, i8* }
%struct.__tgt_offload_entry = type { i8*, i8*, i64, i32, i32 }
; CHECK-NEXT: %3 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_ptrs, i64 0, i64 0
; CHECK-NEXT: %4 = bitcast [1 x i8*]* %.offload_ptrs to double**
; CHECK-NEXT: store double* %a, double** %4, align 8
-; CHECK-NEXT: call void @__tgt_target_data_begin_mapper(i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes, i64 0, i64 0), i8** null)
+
+; CHECK-NEXT: %handle = call %struct.__tgt_async_info @__tgt_target_data_begin_mapper_issue(i64 -1, i32 1, i8** %1, i8** %3, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes, i64 0, i64 0), i8** null)
+; CHECK-NEXT: call void @__tgt_target_data_begin_mapper_wait(i64 -1, %struct.__tgt_async_info %handle)
+
; CHECK-NEXT: %5 = bitcast double* %a to i64*
; CHECK-NEXT: %6 = load i64, i64* %5, align 8
; CHECK-NEXT: %7 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_baseptrs4, i64 0, i64 0
%3 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_ptrs, i64 0, i64 0
%4 = bitcast [1 x i8*]* %.offload_ptrs to double**
store double* %a, double** %4, align 8
- ; FIXME: This setup for the runtime call __tgt_target_data_begin_mapper should be
- ; split into its "issue" and "wait" counterpars and moved upwards
- ; and downwards, respectively.
- ; %handle = call i8* @__tgt_target_data_begin_mapper_issue(...)
- ; call void @__tgt_target_data_begin_wait(i64 -1, %struct.__tgt_async_info %handle)
call void @__tgt_target_data_begin_mapper(i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes, i64 0, i64 0), i8** null)
%5 = bitcast double* %a to i64*
; CHECK-NEXT: store i32* %size.addr, i32** %9, align 8
; CHECK-NEXT: %10 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 1
; CHECK-NEXT: store i64 4, i64* %10, align 8
-; CHECK-NEXT: call void @__tgt_target_data_begin_mapper(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0), i8** null)
+
+; CHECK-NEXT: %handle = call %struct.__tgt_async_info @__tgt_target_data_begin_mapper_issue(i64 -1, i32 2, i8** %1, i8** %3, i64* %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0), i8** null)
+; CHECK-NEXT: call void @__tgt_target_data_begin_mapper_wait(i64 -1, %struct.__tgt_async_info %handle)
+
; CHECK-NEXT: %11 = load i32, i32* %size.addr, align 4
; CHECK-NEXT: %size.casted = zext i32 %11 to i64
; CHECK-NEXT: %12 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs2, i64 0, i64 0
store i32* %size.addr, i32** %9, align 8
%10 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 1
store i64 4, i64* %10, align 8
- ; FIXME: This setup for the runtime call __tgt_target_data_begin_mapper should be
- ; split into its "issue" and "wait" counterpars and moved upwards
- ; and downwards, respectively. Here though, the "issue" cannot be moved upwards
- ; because it's not guaranteed that rand() won't modify *a.
- ; %handle = call i8* @__tgt_target_data_begin_mapper_issue(...)
- ; call void @__tgt_target_data_begin_wait(i64 -1, %struct.__tgt_async_info %handle)
call void @__tgt_target_data_begin_mapper(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0), i8** null)
%11 = load i32, i32* %size.addr, align 4
; CHECK-NEXT: store i32* %size.addr, i32** %9, align 8
; CHECK-NEXT: %10 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 1
; CHECK-NEXT: store i64 4, i64* %10, align 8
-; CHECK-NEXT: call void @__tgt_target_data_begin_mapper(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0), i8** null)
+
+; CHECK-NEXT: %handle = call %struct.__tgt_async_info @__tgt_target_data_begin_mapper_issue(i64 -1, i32 2, i8** %1, i8** %3, i64* %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0), i8** null)
+; CHECK-NEXT: call void @__tgt_target_data_begin_mapper_wait(i64 -1, %struct.__tgt_async_info %handle)
+
; CHECK-NEXT: %11 = load i32, i32* %size.addr, align 4
; CHECK-NEXT: %size.casted = zext i32 %11 to i64
; CHECK-NEXT: %12 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs2, i64 0, i64 0
store i32* %size.addr, i32** %9, align 8
%10 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 1
store i64 4, i64* %10, align 8
- ; FIXME: This setup for the runtime call __tgt_target_data_begin_mapper should be
- ; split into its "issue" and "wait" counterpars and moved upwards
- ; and downwards, respectively.
- ; %handle = call i8* @__tgt_target_data_begin_mapper_issue(...)
- ; call void @__tgt_target_data_begin_wait(i64 -1, %struct.__tgt_async_info %handle)
call void @__tgt_target_data_begin_mapper(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0), i8** null)
%11 = load i32, i32* %size.addr, align 4
; CHECK-NEXT: store double* %a, double** %4, align 8
; CHECK-NEXT: %5 = getelementptr inbounds [1 x i64], [1 x i64]* %.offload_sizes, i64 0, i64 0
; CHECK-NEXT: store i64 %0, i64* %5, align 8
-; CHECK-NEXT: call void @__tgt_target_data_begin_mapper(i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.5, i64 0, i64 0), i8** null)
+
+; CHECK-NEXT: %handle = call %struct.__tgt_async_info @__tgt_target_data_begin_mapper_issue(i64 -1, i32 1, i8** %1, i8** %3, i64* %5, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.5, i64 0, i64 0), i8** null)
+; CHECK-NEXT: call void @__tgt_target_data_begin_mapper_wait(i64 -1, %struct.__tgt_async_info %handle)
+
; CHECK-NEXT: %rem = urem i32 %call, %size
; CHECK-NEXT: call void @__tgt_target_data_end_mapper(i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.5, i64 0, i64 0), i8** null)
; CHECK-NEXT: ret i32 %rem
store double* %a, double** %4, align 8
%5 = getelementptr inbounds [1 x i64], [1 x i64]* %.offload_sizes, i64 0, i64 0
store i64 %0, i64* %5, align 8
- ; FIXME: This setup for the runtime call __tgt_target_data_begin_mapper should be
- ; split into its "issue" and "wait" counterpars and moved upwards
- ; and downwards, respectively. Here though, the "wait" cannot be moved downwards
- ; because it is not worthit. That is, there is no store nor call to be hoisted
- ; over.
- ; %handle = call i8* @__tgt_target_data_begin_mapper_issue(...)
- ; call void @__tgt_target_data_begin_wait(i64 -1, %struct.__tgt_async_info %handle)
call void @__tgt_target_data_begin_mapper(i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.5, i64 0, i64 0), i8** null)
%rem = urem i32 %call, %size
declare dso_local i32 @rand(...)
-; FIXME: These two function declarations must be generated after splitting the runtime function
-; __tgt_target_data_begin_mapper.
-; declare %struct.__tgt_async_info @__tgt_target_data_begin_mapper_issue(i64, i32, i8**, i8**, i64*, i64*, i8**)
-; declare void @__tgt_target_data_begin_mapper_wait(i64, %struct.__tgt_async_info)
+; CHECK: declare %struct.__tgt_async_info @__tgt_target_data_begin_mapper_issue(i64, i32, i8**, i8**, i64*, i64*, i8**)
+; CHECK: declare void @__tgt_target_data_begin_mapper_wait(i64, %struct.__tgt_async_info)