GPGPU: Do not assume arrays start at 0

author Tobias Grosser <tobias@grosser.es>

Thu, 15 Sep 2016 14:05:58 +0000 (14:05 +0000)

committer Tobias Grosser <tobias@grosser.es>

Thu, 15 Sep 2016 14:05:58 +0000 (14:05 +0000)
author Tobias Grosser <tobias@grosser.es>
Thu, 15 Sep 2016 14:05:58 +0000 (14:05 +0000)
committer Tobias Grosser <tobias@grosser.es>
Thu, 15 Sep 2016 14:05:58 +0000 (14:05 +0000)
diff --git a/polly/lib/CodeGen/PPCGCodeGeneration.cpp b/polly/lib/CodeGen/PPCGCodeGeneration.cpp

index 076a94a..48444d6 100644 (file)
--- a/polly/lib/CodeGen/PPCGCodeGeneration.cpp
+++ b/polly/lib/CodeGen/PPCGCodeGeneration.cpp
@@ -284,6 +284,21 @@ private:
    /// @param Array The array for which to compute a size.
    Value *getArraySize(gpu_array_info *Array);
  
+  /// Generate code to compute the minimal offset at which an array is accessed.
+  ///
+  /// The offset of an array is the minimal array location accessed in a scop.
+  ///
+  /// Example:
+  ///
+  ///   for (long i = 0; i < 100; i++)
+  ///     A[i + 42] += ...
+  ///
+  ///   getArrayOffset(A) results in 42.
+  ///
+  /// @param Array The array for which to compute the offset.
+  /// @returns An llvm::Value that contains the offset of the array.
+  Value *getArrayOffset(gpu_array_info *Array);
+
    /// Prepare the kernel arguments for kernel code generation
    ///
    /// @param Kernel The kernel to generate code for.
@@ -468,6 +483,12 @@ void GPUNodeBuilder::allocateDeviceArrays() {
      DevArrayName.append(Array->name);
  
      Value *ArraySize = getArraySize(Array);
+    Value *Offset = getArrayOffset(Array);
+    if (Offset)
+      ArraySize = Builder.CreateSub(
+          ArraySize,
+          Builder.CreateMul(Offset,
+                            Builder.getInt64(ScopArray->getElemSizeInBytes())));
      Value *DevArray = createCallAllocateMemoryForDevice(ArraySize);
      DevArray->setName(DevArrayName);
      DeviceAllocations[ScopArray] = DevArray;
@@ -721,6 +742,48 @@ Value *GPUNodeBuilder::getArraySize(gpu_array_info *Array) {
    return ArraySize;
  }
  
+Value *GPUNodeBuilder::getArrayOffset(gpu_array_info *Array) {
+  if (gpu_array_is_scalar(Array))
+    return nullptr;
+
+  isl_ast_build *Build = isl_ast_build_from_context(S.getContext());
+
+  isl_set *Min = isl_set_lexmin(isl_set_copy(Array->extent));
+
+  isl_set *ZeroSet = isl_set_universe(isl_set_get_space(Min));
+
+  for (long i = 0; i < isl_set_dim(Min, isl_dim_set); i++)
+    ZeroSet = isl_set_fix_si(ZeroSet, isl_dim_set, i, 0);
+
+  if (isl_set_is_subset(Min, ZeroSet)) {
+    isl_set_free(Min);
+    isl_set_free(ZeroSet);
+    isl_ast_build_free(Build);
+    return nullptr;
+  }
+  isl_set_free(ZeroSet);
+
+  isl_ast_expr *Result =
+      isl_ast_expr_from_val(isl_val_int_from_si(isl_set_get_ctx(Min), 0));
+
+  for (long i = 0; i < isl_set_dim(Min, isl_dim_set); i++) {
+    if (i > 0) {
+      isl_pw_aff *Bound_I = isl_pw_aff_copy(Array->bound[i - 1]);
+      isl_ast_expr *BExpr = isl_ast_build_expr_from_pw_aff(Build, Bound_I);
+      Result = isl_ast_expr_mul(Result, BExpr);
+    }
+    isl_pw_aff *DimMin = isl_set_dim_min(isl_set_copy(Min), i);
+    isl_ast_expr *MExpr = isl_ast_build_expr_from_pw_aff(Build, DimMin);
+    Result = isl_ast_expr_add(Result, MExpr);
+  }
+
+  Value *ResultValue = ExprBuilder.create(Result);
+  isl_set_free(Min);
+  isl_ast_build_free(Build);
+
+  return ResultValue;
+}
+
  void GPUNodeBuilder::createDataTransfer(__isl_take isl_ast_node *TransferStmt,
                                          enum DataDirection Direction) {
    isl_ast_expr *Expr = isl_ast_node_user_get_expr(TransferStmt);
@@ -730,6 +793,7 @@ void GPUNodeBuilder::createDataTransfer(__isl_take isl_ast_node *TransferStmt,
    auto ScopArray = (ScopArrayInfo *)(Array->user);
  
    Value *Size = getArraySize(Array);
+  Value *Offset = getArrayOffset(Array);
    Value *DevPtr = DeviceAllocations[ScopArray];
  
    Value *HostPtr;
@@ -739,8 +803,20 @@ void GPUNodeBuilder::createDataTransfer(__isl_take isl_ast_node *TransferStmt,
    else
      HostPtr = ScopArray->getBasePtr();
  
+  if (Offset) {
+    HostPtr = Builder.CreatePointerCast(
+        HostPtr, ScopArray->getElementType()->getPointerTo());
+    HostPtr = Builder.CreateGEP(HostPtr, Offset);
+  }
+
    HostPtr = Builder.CreatePointerCast(HostPtr, Builder.getInt8PtrTy());
  
+  if (Offset) {
+    Size = Builder.CreateSub(
+        Size, Builder.CreateMul(
+                  Offset, Builder.getInt64(ScopArray->getElemSizeInBytes())));
+  }
+
    if (Direction == HOST_TO_DEVICE)
      createCallCopyFromHostToDevice(HostPtr, DevPtr, Size);
    else
@@ -1000,6 +1076,16 @@ GPUNodeBuilder::createLaunchParameters(ppcg_kernel *Kernel, Function *F,
  
      Value *DevArray = DeviceAllocations[const_cast<ScopArrayInfo *>(SAI)];
      DevArray = createCallGetDevicePtr(DevArray);
+
+    Value *Offset = getArrayOffset(&Prog->array[i]);
+
+    if (Offset) {
+      DevArray = Builder.CreatePointerCast(
+          DevArray, SAI->getElementType()->getPointerTo());
+      DevArray = Builder.CreateGEP(DevArray, Builder.CreateNeg(Offset));
+      DevArray = Builder.CreatePointerCast(DevArray, Builder.getInt8PtrTy());
+    }
+
      Instruction *Param = new AllocaInst(
          Builder.getInt8PtrTy(), Launch + "_param_" + std::to_string(Index),
          EntryBlock->getTerminator());
diff --git a/polly/test/GPGPU/non-zero-array-offset.ll b/polly/test/GPGPU/non-zero-array-offset.ll

new file mode 100644 (file)

index 0000000..b3c1d91
--- /dev/null
+++ b/polly/test/GPGPU/non-zero-array-offset.ll
@@ -0,0 +1,124 @@
+; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \
+; RUN: -disable-output < %s | \
+; RUN: FileCheck -check-prefix=CODE %s
+
+; RUN: opt %loadPolly -polly-codegen-ppcg -S < %s | \
+; RUN: FileCheck %s -check-prefix=IR
+;
+; REQUIRES: pollyacc
+
+; CODE: Code
+; CODE-NEXT: ====
+; CODE-NEXT: # host
+; CODE-NEXT: {
+; CODE-NEXT:   cudaCheckReturn(cudaMemcpy(dev_MemRef_B, MemRef_B, (16) * sizeof(float), cudaMemcpyHostToDevice));
+; CODE-NEXT:   cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (8) * sizeof(float), cudaMemcpyHostToDevice));
+; CODE-NEXT:   {
+; CODE-NEXT:     dim3 k0_dimBlock(8);
+; CODE-NEXT:     dim3 k0_dimGrid(1);
+; CODE-NEXT:     kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_B);
+; CODE-NEXT:     cudaCheckKernel();
+; CODE-NEXT:   }
+
+; CODE:   {
+; CODE-NEXT:     dim3 k1_dimBlock(8);
+; CODE-NEXT:     dim3 k1_dimGrid(1);
+; CODE-NEXT:     kernel1 <<<k1_dimGrid, k1_dimBlock>>> (dev_MemRef_A);
+; CODE-NEXT:     cudaCheckKernel();
+; CODE-NEXT:   }
+
+; CODE:   cudaCheckReturn(cudaMemcpy(MemRef_B, dev_MemRef_B, (16) * sizeof(float), cudaMemcpyDeviceToHost));
+; CODE-NEXT:   cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (8) * sizeof(float), cudaMemcpyDeviceToHost));
+; CODE-NEXT: }
+
+; CODE: # kernel0
+; CODE-NEXT: Stmt_bb3(t0);
+
+; CODE: # kernel1
+; CODE-NEXT: Stmt_bb11(t0);
+
+; IR:       %p_dev_array_MemRef_B = call i8* @polly_allocateMemoryForDevice(i64 32)
+; IR-NEXT:  %p_dev_array_MemRef_A = call i8* @polly_allocateMemoryForDevice(i64 32)
+; IR-NEXT:  [[REG0:%.+]] = getelementptr float, float* %B, i64 8
+; IR-NEXT:  [[REG1:%.+]] = bitcast float* [[REG0]] to i8*
+; IR-NEXT:  call void @polly_copyFromHostToDevice(i8* [[REG1]], i8* %p_dev_array_MemRef_B, i64 32)
+
+; IR:      [[REGA:%.+]] = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef_B)
+; IR-NEXT: [[REGB:%.+]]  = bitcast i8* [[REGA]] to float*
+; IR-NEXT: [[REGC:%.+]]  = getelementptr float, float* [[REGB]], i64 -8
+; IR-NEXT: [[REGD:%.+]]  = bitcast float* [[REGC]] to i8*
+
+;    void foo(float A[], float B[]) {
+;      for (long i = 0; i < 8; i++)
+;        B[i + 8] *= 4;
+;
+;      for (long i = 0; i < 8; i++)
+;        A[i] *= 12;
+;    }
+;
+;    #ifdef OUTPUT
+;    int main() {
+;      float A[16];
+;
+;      for (long i = 0; i < 16; i++) {
+;        __sync_synchronize();
+;        A[i] = i;
+;      }
+;
+;      foo(A, A);
+;
+;      float sum = 0;
+;      for (long i = 0; i < 16; i++) {
+;        __sync_synchronize();
+;        sum += A[i];
+;      }
+;
+;      printf("%f\n", sum);
+;    }
+;    #endif
+;
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @foo(float* %A, float* %B) {
+bb:
+  br label %bb2
+
+bb2:                                              ; preds = %bb7, %bb
+  %i.0 = phi i64 [ 0, %bb ], [ %tmp8, %bb7 ]
+  %exitcond1 = icmp ne i64 %i.0, 8
+  br i1 %exitcond1, label %bb3, label %bb9
+
+bb3:                                              ; preds = %bb2
+  %tmp = add nuw nsw i64 %i.0, 8
+  %tmp4 = getelementptr inbounds float, float* %B, i64 %tmp
+  %tmp5 = load float, float* %tmp4, align 4
+  %tmp6 = fmul float %tmp5, 4.000000e+00
+  store float %tmp6, float* %tmp4, align 4
+  br label %bb7
+
+bb7:                                              ; preds = %bb3
+  %tmp8 = add nuw nsw i64 %i.0, 1
+  br label %bb2
+
+bb9:                                              ; preds = %bb2
+  br label %bb10
+
+bb10:                                             ; preds = %bb15, %bb9
+  %i1.0 = phi i64 [ 0, %bb9 ], [ %tmp16, %bb15 ]
+  %exitcond = icmp ne i64 %i1.0, 8
+  br i1 %exitcond, label %bb11, label %bb17
+
+bb11:                                             ; preds = %bb10
+  %tmp12 = getelementptr inbounds float, float* %A, i64 %i1.0
+  %tmp13 = load float, float* %tmp12, align 4
+  %tmp14 = fmul float %tmp13, 1.200000e+01
+  store float %tmp14, float* %tmp12, align 4
+  br label %bb15
+
+bb15:                                             ; preds = %bb11
+  %tmp16 = add nuw nsw i64 %i1.0, 1
+  br label %bb10
+
+bb17:                                             ; preds = %bb10
+  ret void
+}
diff --git a/polly/test/GPGPU/phi-nodes-in-kernel.ll b/polly/test/GPGPU/phi-nodes-in-kernel.ll

index 5befa36..f367096 100644 (file)
--- a/polly/test/GPGPU/phi-nodes-in-kernel.ll
+++ b/polly/test/GPGPU/phi-nodes-in-kernel.ll
@@ -38,13 +38,13 @@ target triple = "x86_64-unknown-linux-gnu"
  ; CODE-NEXT:     Stmt_for_cond15_for_cond12_loopexit_crit_edge(0);
  ; CODE-NEXT: }
  
-; IR:      %1 = bitcast i32* %out_l.055.phiops to i8*
-; IR-NEXT: call void @polly_copyFromHostToDevice(i8* %1, i8* %p_dev_array_MemRef_out_l_055__phi, i64 4)
+; IR:      [[REGA:%.+]] = bitcast i32* %out_l.055.phiops to i8*
+; IR-NEXT: call void @polly_copyFromHostToDevice(i8* [[REGA]], i8* %p_dev_array_MemRef_out_l_055__phi, i64 4)
  
-; IR:      %14 = bitcast i32* %out_l.055.phiops to i8*
-; IR-NEXT: call void @polly_copyFromDeviceToHost(i8* %p_dev_array_MemRef_out_l_055__phi, i8* %14, i64 4)
-; IR-NEXT: %15 = bitcast i32* %out_l.055.s2a to i8*
-; IR-NEXT: call void @polly_copyFromDeviceToHost(i8* %p_dev_array_MemRef_out_l_055, i8* %15, i64 4)
+; IR:      [[REGB:%.+]] = bitcast i32* %out_l.055.phiops to i8*
+; IR-NEXT: call void @polly_copyFromDeviceToHost(i8* %p_dev_array_MemRef_out_l_055__phi, i8* [[REGB]], i64 4)
+; IR-NEXT: [[REGC:%.+]] = bitcast i32* %out_l.055.s2a to i8*
+; IR-NEXT: call void @polly_copyFromDeviceToHost(i8* %p_dev_array_MemRef_out_l_055, i8* [[REGC]], i64 4)
  
  ; KERNEL-IR: entry:
  ; KERNEL-IR-NEXT:   %out_l.055.s2a = alloca i32
author	Tobias Grosser <tobias@grosser.es>
	Thu, 15 Sep 2016 14:05:58 +0000 (14:05 +0000)
committer	Tobias Grosser <tobias@grosser.es>
	Thu, 15 Sep 2016 14:05:58 +0000 (14:05 +0000)
polly/lib/CodeGen/PPCGCodeGeneration.cpp		patch \| blob \| history
polly/test/GPGPU/non-zero-array-offset.ll	[new file with mode: 0644]	patch \| blob
polly/test/GPGPU/phi-nodes-in-kernel.ll		patch \| blob \| history