[GPGPU] Make sure managed arrays are prepared at the beginning of the scop

author Tobias Grosser <tobias@grosser.es>

Sun, 6 Aug 2017 11:10:38 +0000 (11:10 +0000)

committer Tobias Grosser <tobias@grosser.es>

Sun, 6 Aug 2017 11:10:38 +0000 (11:10 +0000)
author Tobias Grosser <tobias@grosser.es>
Sun, 6 Aug 2017 11:10:38 +0000 (11:10 +0000)
committer Tobias Grosser <tobias@grosser.es>
Sun, 6 Aug 2017 11:10:38 +0000 (11:10 +0000)
diff --git a/polly/lib/CodeGen/PPCGCodeGeneration.cpp b/polly/lib/CodeGen/PPCGCodeGeneration.cpp

index 5516fd9..efa6378 100644 (file)
--- a/polly/lib/CodeGen/PPCGCodeGeneration.cpp
+++ b/polly/lib/CodeGen/PPCGCodeGeneration.cpp
@@ -441,13 +441,10 @@ private:
    /// @returns A tuple with grid sizes for X and Y dimension
    std::tuple<Value *, Value *> getGridSizes(ppcg_kernel *Kernel);
  
-  /// Creates a array that can be sent to the kernel on the device using a
-  /// host pointer. This is required for managed memory, when we directly send
-  /// host pointers to the device.
+  /// Get the managed array pointer for sending host pointers to the device.
    /// \note
    /// This is to be used only with managed memory
-  Value *getOrCreateManagedDeviceArray(gpu_array_info *Array,
-                                       ScopArrayInfo *ArrayInfo);
+  Value *getManagedDeviceArray(gpu_array_info *Array, ScopArrayInfo *ArrayInfo);
  
    /// Compute the sizes of the thread blocks for a given kernel.
    ///
@@ -652,6 +649,9 @@ private:
    /// Create code that allocates memory to store arrays on device.
    void allocateDeviceArrays();
  
+  /// Create code to prepare the managed device pointers.
+  void prepareManagedDeviceArrays();
+
    /// Free all allocated device arrays.
    void freeDeviceArrays();
  
@@ -747,6 +747,8 @@ void GPUNodeBuilder::initializeAfterRTH() {
  
    if (!ManagedMemory)
      allocateDeviceArrays();
+  else
+    prepareManagedDeviceArrays();
  }
  
  void GPUNodeBuilder::finalize() {
@@ -783,6 +785,32 @@ void GPUNodeBuilder::allocateDeviceArrays() {
    isl_ast_build_free(Build);
  }
  
+void GPUNodeBuilder::prepareManagedDeviceArrays() {
+  assert(ManagedMemory &&
+         "Device array most only be prepared in managed-memory mode");
+  for (int i = 0; i < Prog->n_array; ++i) {
+    gpu_array_info *Array = &Prog->array[i];
+    ScopArrayInfo *ScopArray = (ScopArrayInfo *)Array->user;
+    Value *HostPtr;
+
+    if (gpu_array_is_scalar(Array))
+      HostPtr = BlockGen.getOrCreateAlloca(ScopArray);
+    else
+      HostPtr = ScopArray->getBasePtr();
+    HostPtr = getLatestValue(HostPtr);
+
+    Value *Offset = getArrayOffset(Array);
+    if (Offset) {
+      HostPtr = Builder.CreatePointerCast(
+          HostPtr, ScopArray->getElementType()->getPointerTo());
+      HostPtr = Builder.CreateGEP(HostPtr, Offset);
+    }
+
+    HostPtr = Builder.CreatePointerCast(HostPtr, Builder.getInt8PtrTy());
+    DeviceAllocations[ScopArray] = HostPtr;
+  }
+}
+
  void GPUNodeBuilder::addCUDAAnnotations(Module *M, Value *BlockDimX,
                                          Value *BlockDimY, Value *BlockDimZ) {
    auto AnnotationNode = M->getOrInsertNamedMetadata("nvvm.annotations");
@@ -1102,35 +1130,16 @@ Value *GPUNodeBuilder::getArrayOffset(gpu_array_info *Array) {
    return ExprBuilder.create(Result.release());
  }
  
-Value *GPUNodeBuilder::getOrCreateManagedDeviceArray(gpu_array_info *Array,
-                                                     ScopArrayInfo *ArrayInfo) {
-
+Value *GPUNodeBuilder::getManagedDeviceArray(gpu_array_info *Array,
+                                             ScopArrayInfo *ArrayInfo) {
    assert(ManagedMemory && "Only used when you wish to get a host "
                            "pointer for sending data to the kernel, "
                            "with managed memory");
    std::map<ScopArrayInfo *, Value *>::iterator it;
-  if ((it = DeviceAllocations.find(ArrayInfo)) != DeviceAllocations.end()) {
-    return it->second;
-  } else {
-    Value *HostPtr;
-
-    if (gpu_array_is_scalar(Array))
-      HostPtr = BlockGen.getOrCreateAlloca(ArrayInfo);
-    else
-      HostPtr = ArrayInfo->getBasePtr();
-    HostPtr = getLatestValue(HostPtr);
-
-    Value *Offset = getArrayOffset(Array);
-    if (Offset) {
-      HostPtr = Builder.CreatePointerCast(
-          HostPtr, ArrayInfo->getElementType()->getPointerTo());
-      HostPtr = Builder.CreateGEP(HostPtr, Offset);
-    }
-
-    HostPtr = Builder.CreatePointerCast(HostPtr, Builder.getInt8PtrTy());
-    DeviceAllocations[ArrayInfo] = HostPtr;
-    return HostPtr;
-  }
+  it = DeviceAllocations.find(ArrayInfo);
+  assert(it != DeviceAllocations.end() &&
+         "Device array expected to be available");
+  return it->second;
  }
  
  void GPUNodeBuilder::createDataTransfer(__isl_take isl_ast_node *TransferStmt,
@@ -1577,8 +1586,8 @@ GPUNodeBuilder::createLaunchParameters(ppcg_kernel *Kernel, Function *F,
  
      Value *DevArray = nullptr;
      if (ManagedMemory) {
-      DevArray = getOrCreateManagedDeviceArray(
-          &Prog->array[i], const_cast<ScopArrayInfo *>(SAI));
+      DevArray = getManagedDeviceArray(&Prog->array[i],
+                                       const_cast<ScopArrayInfo *>(SAI));
      } else {
        DevArray = DeviceAllocations[const_cast<ScopArrayInfo *>(SAI)];
        DevArray = createCallGetDevicePtr(DevArray);
diff --git a/polly/test/GPGPU/cuda-managed-memory-simple.ll b/polly/test/GPGPU/cuda-managed-memory-simple.ll

index cc6ec52..39a1218 100644 (file)
--- a/polly/test/GPGPU/cuda-managed-memory-simple.ll
+++ b/polly/test/GPGPU/cuda-managed-memory-simple.ll
@@ -37,13 +37,13 @@
  
  ; CHECK:       %13 = call i8* @polly_initContextCUDA()
  ; CHECK-NEXT:  %14 = bitcast i32* %A to i8*
-; CHECK-NEXT:  %15 = getelementptr [4 x i8*], [4 x i8*]* %polly_launch_0_params, i64 0, i64 0
+; CHECK-NEXT:  %15 = bitcast i32* %R to i8*
+; CHECK-NEXT:  %16 = getelementptr [4 x i8*], [4 x i8*]* %polly_launch_0_params, i64 0, i64 0
  ; CHECK-NEXT:  store i8* %14, i8** %polly_launch_0_param_0
-; CHECK-NEXT:  %16 = bitcast i8** %polly_launch_0_param_0 to i8*
-; CHECK-NEXT:  store i8* %16, i8** %15
-; CHECK-NEXT:  %17 = bitcast i32* %R to i8*
+; CHECK-NEXT:  %17 = bitcast i8** %polly_launch_0_param_0 to i8*
+; CHECK-NEXT:  store i8* %17, i8** %16
  ; CHECK-NEXT:  %18 = getelementptr [4 x i8*], [4 x i8*]* %polly_launch_0_params, i64 0, i64 1
-; CHECK-NEXT:  store i8* %17, i8** %polly_launch_0_param_1
+; CHECK-NEXT:  store i8* %15, i8** %polly_launch_0_param_1
  ; CHECK-NEXT:  %19 = bitcast i8** %polly_launch_0_param_1 to i8*
  ; CHECK-NEXT:  store i8* %19, i8** %18
  ; CHECK-NEXT:  store i32 4, i32* %polly_launch_0_param_size_0
diff --git a/polly/test/GPGPU/managed-pointers-preparation.ll b/polly/test/GPGPU/managed-pointers-preparation.ll

new file mode 100644 (file)

index 0000000..5a6c560
--- /dev/null
+++ b/polly/test/GPGPU/managed-pointers-preparation.ll
@@ -0,0 +1,105 @@
+; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \
+; RUN: -polly-invariant-load-hoisting \
+; RUN: -S -polly-acc-codegen-managed-memory < %s | FileCheck %s
+
+; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \
+; RUN: -polly-invariant-load-hoisting \
+; RUN: -S -polly-acc-codegen-managed-memory -disable-output \
+; RUN: -polly-acc-dump-code < %s | FileCheck %s -check-prefix=CODE
+
+; CHECK: @polly_launchKernel
+; CHECK: @polly_launchKernel
+; CHECK: @polly_launchKernel
+; CHECK: @polly_launchKernel
+; CHECK: @polly_launchKernel
+; CHECK-NOT: @polly_launchKernel
+
+
+; CODE:  if (p_0_loaded_from___data_runcontrol_MOD_lmulti_layer == 0) {
+; CODE-NEXT:    {
+; CODE-NEXT:      dim3 k0_dimBlock;
+; CODE-NEXT:      dim3 k0_dimGrid;
+; CODE-NEXT:      kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef__pn__phi, p_0_loaded_from___data_runcontrol_MOD_lmulti_layer);
+; CODE-NEXT:      cudaCheckKernel();
+; CODE-NEXT:    }
+
+; CODE:  } else {
+; CODE-NEXT:    {
+; CODE-NEXT:      dim3 k1_dimBlock;
+; CODE-NEXT:      dim3 k1_dimGrid;
+; CODE-NEXT:      kernel1 <<<k1_dimGrid, k1_dimBlock>>> (dev_MemRef__pn__phi, p_0_loaded_from___data_runcontrol_MOD_lmulti_layer);
+; CODE-NEXT:      cudaCheckKernel();
+; CODE-NEXT:    }
+
+; CHECK that this program is correctly code generated and does not result in
+; 'instruction does not dominate use' errors. At an earlier point, such errors
+; have been generated as the preparation of the managed memory pointers was
+; performed right before kernel0, which does not dominate all other kernels.
+; Now the preparation is performed at the very beginning of the scop.
+
+source_filename = "bugpoint-output-c78f41e.bc"
+target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-unknown-linux-gnu"
+
+@__data_radiation_MOD_rad_csalbw = external global [10 x double], align 32
+@__data_radiation_MOD_coai = external global [168 x double], align 32
+@__data_runcontrol_MOD_lmulti_layer = external global i32
+
+; Function Attrs: nounwind uwtable
+define void @__radiation_interface_MOD_radiation_init() #0 {
+entry:
+  br label %"94"
+
+"94":                                             ; preds = %"97", %entry
+  br label %"95"
+
+"95":                                             ; preds = %"95", %"94"
+  br i1 undef, label %"97", label %"95"
+
+"97":                                             ; preds = %"95"
+  br i1 undef, label %"99", label %"94"
+
+"99":                                             ; preds = %"97"
+  br label %"102"
+
+"102":                                            ; preds = %"102", %"99"
+  %indvars.iv17 = phi i64 [ %indvars.iv.next18, %"102" ], [ 1, %"99" ]
+  %0 = getelementptr [168 x double], [168 x double]* @__data_radiation_MOD_coai, i64 0, i64 0
+  store double 1.000000e+00, double* %0, align 8
+  %1 = icmp eq i64 %indvars.iv17, 3
+  %indvars.iv.next18 = add nuw nsw i64 %indvars.iv17, 1
+  br i1 %1, label %"110", label %"102"
+
+"110":                                            ; preds = %"102"
+  %2 = load i32, i32* @__data_runcontrol_MOD_lmulti_layer, align 4, !range !0
+  %3 = icmp eq i32 %2, 0
+  br i1 %3, label %"112", label %"111"
+
+"111":                                            ; preds = %"110"
+  br label %"115"
+
+"112":                                            ; preds = %"110"
+  br label %"115"
+
+"115":                                            ; preds = %"112", %"111"
+  %.pn = phi double [ undef, %"112" ], [ undef, %"111" ]
+  %4 = fdiv double 1.000000e+00, %.pn
+  br label %"116"
+
+"116":                                            ; preds = %"116", %"115"
+  %indvars.iv = phi i64 [ %indvars.iv.next, %"116" ], [ 1, %"115" ]
+  %5 = add nsw i64 %indvars.iv, -1
+  %6 = fmul double %4, undef
+  %7 = getelementptr [10 x double], [10 x double]* @__data_radiation_MOD_rad_csalbw, i64 0, i64 %5
+  store double %6, double* %7, align 8
+  %8 = icmp eq i64 %indvars.iv, 10
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  br i1 %8, label %return, label %"116"
+
+return:                                           ; preds = %"116"
+  ret void
+}
+
+attributes #0 = { nounwind uwtable }
+
+!0 = !{i32 0, i32 2}
author	Tobias Grosser <tobias@grosser.es>
	Sun, 6 Aug 2017 11:10:38 +0000 (11:10 +0000)
committer	Tobias Grosser <tobias@grosser.es>
	Sun, 6 Aug 2017 11:10:38 +0000 (11:10 +0000)
polly/lib/CodeGen/PPCGCodeGeneration.cpp		patch \| blob \| history
polly/test/GPGPU/cuda-managed-memory-simple.ll		patch \| blob \| history
polly/test/GPGPU/managed-pointers-preparation.ll	[new file with mode: 0644]	patch \| blob