[OpenMPOpt][HideMemTransfersLatency] Split __tgt_target_data_begin_mapper into its...

author Hamilton Tobon Mosquera <Hamilton Tobon>

Tue, 18 Aug 2020 01:18:21 +0000 (20:18 -0500)

committer Hamilton Tobon Mosquera <Hamilton Tobon>

Tue, 18 Aug 2020 01:56:10 +0000 (20:56 -0500)
author Hamilton Tobon Mosquera <Hamilton Tobon>
Tue, 18 Aug 2020 01:18:21 +0000 (20:18 -0500)
committer Hamilton Tobon Mosquera <Hamilton Tobon>
Tue, 18 Aug 2020 01:56:10 +0000 (20:56 -0500)
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def

index 3fc87dc..9ad7eff 100644 (file)
--- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -198,6 +198,7 @@ __OMP_ARRAY_TYPE(KmpCriticalName, Int32, 8)
    OMP_STRUCT_TYPE(VarName, "struct." #Name, __VA_ARGS__)
  
  __OMP_STRUCT_TYPE(Ident, ident_t, Int32, Int32, Int32, Int32, Int8Ptr)
+__OMP_STRUCT_TYPE(AsyncInfo, __tgt_async_info, Int8Ptr)
  
  #undef __OMP_STRUCT_TYPE
  #undef OMP_STRUCT_TYPE
@@ -482,6 +483,9 @@ __OMP_RTL(__tgt_target_data_begin_mapper, false, Void, Int64, Int32, VoidPtrPtr,
            VoidPtrPtr, Int64Ptr, Int64Ptr, VoidPtrPtr)
  __OMP_RTL(__tgt_target_data_begin_nowait_mapper, false, Void, Int64, Int32,
            VoidPtrPtr, VoidPtrPtr, Int64Ptr, Int64Ptr, VoidPtrPtr)
+__OMP_RTL(__tgt_target_data_begin_mapper_issue, false, AsyncInfo, Int64, Int32,
+          VoidPtrPtr, VoidPtrPtr, Int64Ptr, Int64Ptr, VoidPtrPtr)
+__OMP_RTL(__tgt_target_data_begin_mapper_wait, false, Void, Int64, AsyncInfo)
  __OMP_RTL(__tgt_target_data_end_mapper, false, Void, Int64, Int32, VoidPtrPtr,
            VoidPtrPtr, Int64Ptr, Int64Ptr, VoidPtrPtr)
  __OMP_RTL(__tgt_target_data_end_nowait_mapper, false, Void, Int64, Int32,
diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp

index 93f1e53..ae7bafd 100644 (file)
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -42,6 +42,13 @@ static cl::opt<bool> PrintICVValues("openmp-print-icv-values", cl::init(false),
  static cl::opt<bool> PrintOpenMPKernels("openmp-print-gpu-kernels",
                                          cl::init(false), cl::Hidden);
  
+static cl::opt<bool> HideMemoryTransferLatency(
+    "openmp-hide-memory-transfer-latency",
+    cl::desc("[WIP] Tries to hide the latency of host to device memory"
+             " transfers"),
+    cl::Hidden, cl::init(false));
+
+
  STATISTIC(NumOpenMPRuntimeCallsDeduplicated,
            "Number of OpenMP runtime calls deduplicated");
  STATISTIC(NumOpenMPParallelRegionsDeleted,
@@ -508,6 +515,8 @@ struct OpenMPOpt {
  
      Changed |= deduplicateRuntimeCalls();
      Changed |= deleteParallelRegions();
+    if (HideMemoryTransferLatency)
+      Changed |= hideMemTransfersLatency();
  
      return Changed;
    }
@@ -666,6 +675,63 @@ private:
      return Changed;
    }
  
+  /// Tries to hide the latency of runtime calls that involve host to
+  /// device memory transfers by splitting them into their "issue" and "wait"
+  /// versions. The "issue" is moved upwards as much as possible. The "wait" is
+  /// moved downards as much as possible. The "issue" issues the memory transfer
+  /// asynchronously, returning a handle. The "wait" waits in the returned
+  /// handle for the memory transfer to finish.
+  bool hideMemTransfersLatency() {
+    auto &RFI = OMPInfoCache.RFIs[OMPRTL___tgt_target_data_begin_mapper];
+    bool Changed = false;
+    auto SplitMemTransfers = [&](Use &U, Function &Decl) {
+      auto *RTCall = getCallIfRegularCall(U, &RFI);
+      if (!RTCall)
+        return false;
+
+      bool WasSplit = splitTargetDataBeginRTC(RTCall);
+      Changed |= WasSplit;
+      return WasSplit;
+    };
+    RFI.foreachUse(SCC, SplitMemTransfers);
+
+    return Changed;
+  }
+
+  /// Splits \p RuntimeCall into its "issue" and "wait" counterparts.
+  bool splitTargetDataBeginRTC(CallInst *RuntimeCall) {
+    auto &IRBuilder = OMPInfoCache.OMPBuilder;
+    // Add "issue" runtime call declaration:
+    // declare %struct.tgt_async_info @__tgt_target_data_begin_issue(i64, i32,
+    //   i8**, i8**, i64*, i64*)
+    FunctionCallee IssueDecl = IRBuilder.getOrCreateRuntimeFunction(
+        M, OMPRTL___tgt_target_data_begin_mapper_issue);
+
+    // Change RuntimeCall call site for its asynchronous version.
+    SmallVector<Value *, 8> Args;
+    for (auto &Arg : RuntimeCall->args())
+      Args.push_back(Arg.get());
+
+    CallInst *IssueCallsite =
+        CallInst::Create(IssueDecl, Args, "handle", RuntimeCall);
+    RuntimeCall->eraseFromParent();
+
+    // Add "wait" runtime call declaration:
+    // declare void @__tgt_target_data_begin_wait(i64, %struct.__tgt_async_info)
+    FunctionCallee WaitDecl = IRBuilder.getOrCreateRuntimeFunction(
+        M, OMPRTL___tgt_target_data_begin_mapper_wait);
+
+    // Add call site to WaitDecl.
+    Value *WaitParams[2] = {
+        IssueCallsite->getArgOperand(0), // device_id.
+        IssueCallsite // returned handle.
+    };
+    CallInst::Create(WaitDecl, WaitParams, /*NameStr=*/"",
+                     IssueCallsite->getNextNode());
+
+    return true;
+  }
+
    static Value *combinedIdentStruct(Value *CurrentIdent, Value *NextIdent,
                                      bool GlobalOnly, bool &SingleChoice) {
      if (CurrentIdent == NextIdent)
diff --git a/llvm/test/Transforms/OpenMP/hide_mem_transfer_latency.ll b/llvm/test/Transforms/OpenMP/hide_mem_transfer_latency.ll

index daebe4b..7f55ad1 100644 (file)
--- a/llvm/test/Transforms/OpenMP/hide_mem_transfer_latency.ll
+++ b/llvm/test/Transforms/OpenMP/hide_mem_transfer_latency.ll
@@ -1,9 +1,8 @@
  ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: -p --function-signature --scrub-attributes
-; RUN: opt -S -passes=openmpopt -aa-pipeline=basic-aa < %s | FileCheck %s
+; RUN: opt -S -passes=openmpopt -aa-pipeline=basic-aa -openmp-hide-memory-transfer-latency < %s | FileCheck %s
  target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
  
-; FIXME: This struct should be generated after splitting at least one of the runtime calls.
-; %struct.__tgt_async_info = type { i8* }
+; CHECK: %struct.__tgt_async_info = type { i8* }
  
  %struct.ident_t = type { i32, i32, i32, i32, i8* }
  %struct.__tgt_offload_entry = type { i8*, i8*, i64, i32, i32 }
@@ -58,7 +57,10 @@ define dso_local double @heavyComputation1() {
  ; CHECK-NEXT:    %3 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_ptrs, i64 0, i64 0
  ; CHECK-NEXT:    %4 = bitcast [1 x i8*]* %.offload_ptrs to double**
  ; CHECK-NEXT:    store double* %a, double** %4, align 8
-; CHECK-NEXT:    call void @__tgt_target_data_begin_mapper(i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes, i64 0, i64 0), i8** null)
+
+; CHECK-NEXT:    %handle = call %struct.__tgt_async_info @__tgt_target_data_begin_mapper_issue(i64 -1, i32 1, i8** %1, i8** %3, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes, i64 0, i64 0), i8** null)
+; CHECK-NEXT:    call void @__tgt_target_data_begin_mapper_wait(i64 -1, %struct.__tgt_async_info %handle)
+
  ; CHECK-NEXT:    %5 = bitcast double* %a to i64*
  ; CHECK-NEXT:    %6 = load i64, i64* %5, align 8
  ; CHECK-NEXT:    %7 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_baseptrs4, i64 0, i64 0
@@ -102,11 +104,6 @@ entry:
    %3 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_ptrs, i64 0, i64 0
    %4 = bitcast [1 x i8*]* %.offload_ptrs to double**
    store double* %a, double** %4, align 8
-  ; FIXME: This setup for the runtime call __tgt_target_data_begin_mapper should be
-  ;        split into its "issue" and "wait" counterpars and moved upwards
-  ;        and downwards, respectively.
-  ; %handle = call i8* @__tgt_target_data_begin_mapper_issue(...)
-  ; call void @__tgt_target_data_begin_wait(i64 -1, %struct.__tgt_async_info %handle)
    call void @__tgt_target_data_begin_mapper(i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes, i64 0, i64 0), i8** null)
  
    %5 = bitcast double* %a to i64*
@@ -186,7 +183,10 @@ define dso_local i32 @heavyComputation2(double* %a, i32 %size) {
  ; CHECK-NEXT:    store i32* %size.addr, i32** %9, align 8
  ; CHECK-NEXT:    %10 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 1
  ; CHECK-NEXT:    store i64 4, i64* %10, align 8
-; CHECK-NEXT:    call void @__tgt_target_data_begin_mapper(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0), i8** null)
+
+; CHECK-NEXT:    %handle = call %struct.__tgt_async_info @__tgt_target_data_begin_mapper_issue(i64 -1, i32 2, i8** %1, i8** %3, i64* %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0), i8** null)
+; CHECK-NEXT:    call void @__tgt_target_data_begin_mapper_wait(i64 -1, %struct.__tgt_async_info %handle)
+
  ; CHECK-NEXT:    %11 = load i32, i32* %size.addr, align 4
  ; CHECK-NEXT:    %size.casted = zext i32 %11 to i64
  ; CHECK-NEXT:    %12 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs2, i64 0, i64 0
@@ -241,12 +241,6 @@ entry:
    store i32* %size.addr, i32** %9, align 8
    %10 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 1
    store i64 4, i64* %10, align 8
-  ; FIXME: This setup for the runtime call __tgt_target_data_begin_mapper should be
-  ;        split into its "issue" and "wait" counterpars and moved upwards
-  ;        and downwards, respectively. Here though, the "issue" cannot be moved upwards
-  ;        because it's not guaranteed that rand() won't modify *a.
-  ; %handle = call i8* @__tgt_target_data_begin_mapper_issue(...)
-  ; call void @__tgt_target_data_begin_wait(i64 -1, %struct.__tgt_async_info %handle)
    call void @__tgt_target_data_begin_mapper(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0), i8** null)
  
    %11 = load i32, i32* %size.addr, align 4
@@ -330,7 +324,10 @@ define dso_local i32 @heavyComputation3(double* noalias %a, i32 %size) {
  ; CHECK-NEXT:    store i32* %size.addr, i32** %9, align 8
  ; CHECK-NEXT:    %10 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 1
  ; CHECK-NEXT:    store i64 4, i64* %10, align 8
-; CHECK-NEXT:    call void @__tgt_target_data_begin_mapper(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0), i8** null)
+
+; CHECK-NEXT:    %handle = call %struct.__tgt_async_info @__tgt_target_data_begin_mapper_issue(i64 -1, i32 2, i8** %1, i8** %3, i64* %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0), i8** null)
+; CHECK-NEXT:    call void @__tgt_target_data_begin_mapper_wait(i64 -1, %struct.__tgt_async_info %handle)
+
  ; CHECK-NEXT:    %11 = load i32, i32* %size.addr, align 4
  ; CHECK-NEXT:    %size.casted = zext i32 %11 to i64
  ; CHECK-NEXT:    %12 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs2, i64 0, i64 0
@@ -386,11 +383,6 @@ entry:
    store i32* %size.addr, i32** %9, align 8
    %10 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 1
    store i64 4, i64* %10, align 8
-  ; FIXME: This setup for the runtime call __tgt_target_data_begin_mapper should be
-  ;        split into its "issue" and "wait" counterpars and moved upwards
-  ;        and downwards, respectively.
-  ; %handle = call i8* @__tgt_target_data_begin_mapper_issue(...)
-  ; call void @__tgt_target_data_begin_wait(i64 -1, %struct.__tgt_async_info %handle)
    call void @__tgt_target_data_begin_mapper(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0), i8** null)
  
    %11 = load i32, i32* %size.addr, align 4
@@ -459,7 +451,10 @@ define dso_local i32 @dataTransferOnly1(double* noalias %a, i32 %size) {
  ; CHECK-NEXT:    store double* %a, double** %4, align 8
  ; CHECK-NEXT:    %5 = getelementptr inbounds [1 x i64], [1 x i64]* %.offload_sizes, i64 0, i64 0
  ; CHECK-NEXT:    store i64 %0, i64* %5, align 8
-; CHECK-NEXT:    call void @__tgt_target_data_begin_mapper(i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.5, i64 0, i64 0), i8** null)
+
+; CHECK-NEXT:    %handle = call %struct.__tgt_async_info @__tgt_target_data_begin_mapper_issue(i64 -1, i32 1, i8** %1, i8** %3, i64* %5, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.5, i64 0, i64 0), i8** null)
+; CHECK-NEXT:    call void @__tgt_target_data_begin_mapper_wait(i64 -1, %struct.__tgt_async_info %handle)
+
  ; CHECK-NEXT:    %rem = urem i32 %call, %size
  ; CHECK-NEXT:    call void @__tgt_target_data_end_mapper(i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.5, i64 0, i64 0), i8** null)
  ; CHECK-NEXT:    ret i32 %rem
@@ -482,13 +477,6 @@ entry:
    store double* %a, double** %4, align 8
    %5 = getelementptr inbounds [1 x i64], [1 x i64]* %.offload_sizes, i64 0, i64 0
    store i64 %0, i64* %5, align 8
-  ; FIXME: This setup for the runtime call __tgt_target_data_begin_mapper should be
-  ;        split into its "issue" and "wait" counterpars and moved upwards
-  ;        and downwards, respectively. Here though, the "wait" cannot be moved downwards
-  ;        because it is not worthit. That is, there is no store nor call to be hoisted
-  ;        over.
-  ; %handle = call i8* @__tgt_target_data_begin_mapper_issue(...)
-  ; call void @__tgt_target_data_begin_wait(i64 -1, %struct.__tgt_async_info %handle)
    call void @__tgt_target_data_begin_mapper(i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.5, i64 0, i64 0), i8** null)
  
    %rem = urem i32 %call, %size
@@ -503,7 +491,5 @@ declare void @__tgt_target_data_end_mapper(i64, i32, i8**, i8**, i64*, i64*, i8*
  
  declare dso_local i32 @rand(...)
  
-; FIXME: These two function declarations must be generated after splitting the runtime function
-;        __tgt_target_data_begin_mapper.
-; declare %struct.__tgt_async_info @__tgt_target_data_begin_mapper_issue(i64, i32, i8**, i8**, i64*, i64*, i8**)
-; declare void @__tgt_target_data_begin_mapper_wait(i64, %struct.__tgt_async_info)
+; CHECK: declare %struct.__tgt_async_info @__tgt_target_data_begin_mapper_issue(i64, i32, i8**, i8**, i64*, i64*, i8**)
+; CHECK: declare void @__tgt_target_data_begin_mapper_wait(i64, %struct.__tgt_async_info)
author	Hamilton Tobon Mosquera <Hamilton Tobon>
	Tue, 18 Aug 2020 01:18:21 +0000 (20:18 -0500)
committer	Hamilton Tobon Mosquera <Hamilton Tobon>
	Tue, 18 Aug 2020 01:56:10 +0000 (20:56 -0500)
llvm/include/llvm/Frontend/OpenMP/OMPKinds.def		patch \| blob \| history
llvm/lib/Transforms/IPO/OpenMPOpt.cpp		patch \| blob \| history
llvm/test/Transforms/OpenMP/hide_mem_transfer_latency.ll		patch \| blob \| history