GPGPU: Store back non-read-only scalars

author Tobias Grosser <tobias@grosser.es>

Sat, 17 Sep 2016 19:22:31 +0000 (19:22 +0000)

committer Tobias Grosser <tobias@grosser.es>

Sat, 17 Sep 2016 19:22:31 +0000 (19:22 +0000)
author Tobias Grosser <tobias@grosser.es>
Sat, 17 Sep 2016 19:22:31 +0000 (19:22 +0000)
committer Tobias Grosser <tobias@grosser.es>
Sat, 17 Sep 2016 19:22:31 +0000 (19:22 +0000)
diff --git a/polly/lib/CodeGen/PPCGCodeGeneration.cpp b/polly/lib/CodeGen/PPCGCodeGeneration.cpp

index 3466b23..6b33db5 100644 (file)
--- a/polly/lib/CodeGen/PPCGCodeGeneration.cpp
+++ b/polly/lib/CodeGen/PPCGCodeGeneration.cpp
@@ -379,6 +379,15 @@ private:
    /// @returns The Assembly string of the kernel.
    std::string finalizeKernelFunction();
  
+  /// Finalize the generation of the kernel arguments.
+  ///
+  /// This function ensures that not-read-only scalars used in a kernel are
+  /// stored back to the global memory location they ared backed up with before
+  /// the kernel terminates.
+  ///
+  /// @params Kernel The kernel to finalize kernel arguments for.
+  void finalizeKernelArguments(ppcg_kernel *Kernel);
+
    /// Create code that allocates memory to store arrays on device.
    void allocateDeviceArrays();
  
@@ -1198,13 +1207,13 @@ void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) {
  
    create(isl_ast_node_copy(Kernel->tree));
  
+  finalizeKernelArguments(Kernel);
    Function *F = Builder.GetInsertBlock()->getParent();
    addCUDAAnnotations(F->getParent(), BlockDimX, BlockDimY, BlockDimZ);
    clearDominators(F);
    clearScalarEvolution(F);
    clearLoops(F);
  
-  Builder.SetInsertPoint(&HostInsertPoint);
    IDToValue = HostIDs;
  
    ValueMap = std::move(HostValueMap);
@@ -1217,9 +1226,10 @@ void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) {
      S.invalidateScopArrayInfo(BasePtr, ScopArrayInfo::MK_Array);
    LocalArrays.clear();
  
+  std::string ASMString = finalizeKernelFunction();
+  Builder.SetInsertPoint(&HostInsertPoint);
    Value *Parameters = createLaunchParameters(Kernel, F, SubtreeValues);
  
-  std::string ASMString = finalizeKernelFunction();
    std::string Name = "kernel_" + std::to_string(Kernel->id);
    Value *KernelString = Builder.CreateGlobalStringPtr(ASMString, Name);
    Value *NameString = Builder.CreateGlobalStringPtr(Name, Name + "_name");
@@ -1410,6 +1420,49 @@ void GPUNodeBuilder::prepareKernelArguments(ppcg_kernel *Kernel, Function *FN) {
    }
  }
  
+void GPUNodeBuilder::finalizeKernelArguments(ppcg_kernel *Kernel) {
+  auto *FN = Builder.GetInsertBlock()->getParent();
+  auto Arg = FN->arg_begin();
+
+  bool StoredScalar = false;
+  for (long i = 0; i < Kernel->n_array; i++) {
+    if (!ppcg_kernel_requires_array_argument(Kernel, i))
+      continue;
+
+    isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set);
+    const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(isl_id_copy(Id));
+    isl_id_free(Id);
+
+    if (SAI->getNumberOfDimensions() > 0) {
+      Arg++;
+      continue;
+    }
+
+    if (gpu_array_is_read_only_scalar(&Prog->array[i])) {
+      Arg++;
+      continue;
+    }
+
+    Value *Alloca = BlockGen.getOrCreateAlloca(SAI);
+    Value *ArgPtr = &*Arg;
+    Type *TypePtr = SAI->getElementType()->getPointerTo();
+    Value *TypedArgPtr = Builder.CreatePointerCast(ArgPtr, TypePtr);
+    Value *Val = Builder.CreateLoad(Alloca);
+    Builder.CreateStore(Val, TypedArgPtr);
+    StoredScalar = true;
+
+    Arg++;
+  }
+
+  if (StoredScalar)
+    /// In case more than one thread contains scalar stores, the generated
+    /// code might be incorrect, if we only store at the end of the kernel.
+    /// To support this case we need to store these scalars back at each
+    /// memory store or at least before each kernel barrier.
+    if (Kernel->n_block != 0 || Kernel->n_grid != 0)
+      BuildSuccessful = 0;
+}
+
  void GPUNodeBuilder::createKernelVariables(ppcg_kernel *Kernel, Function *FN) {
    Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  
diff --git a/polly/test/GPGPU/non-read-only-scalars.ll b/polly/test/GPGPU/non-read-only-scalars.ll

new file mode 100644 (file)

index 0000000..bebf021
--- /dev/null
+++ b/polly/test/GPGPU/non-read-only-scalars.ll
@@ -0,0 +1,176 @@
+; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \
+; RUN: -disable-output < %s | \
+; RUN: FileCheck -check-prefix=CODE %s
+
+; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \
+; RUN: -disable-output < %s | \
+; RUN: FileCheck %s -check-prefix=KERNEL-IR
+;
+; REQUIRES: pollyacc
+;
+; #include <stdio.h>
+;
+; float foo(float A[]) {
+;   float sum = 0;
+;
+;   for (long i = 0; i < 32; i++)
+;     A[i] = i;
+;
+;   for (long i = 0; i < 32; i++)
+;     A[i] += i;
+;
+;   for (long i = 0; i < 32; i++)
+;     sum += A[i];
+;
+;   return sum;
+; }
+;
+; int main() {
+;   float A[32];
+;   float sum = foo(A);
+;   printf("%f\n", sum);
+; }
+
+; CODE: Code
+; CODE-NEXT: ====
+; CODE-NEXT: # host
+; CODE-NEXT: {
+; CODE-NEXT:   {
+; CODE-NEXT:     dim3 k0_dimBlock(32);
+; CODE-NEXT:     dim3 k0_dimGrid(1);
+; CODE-NEXT:     kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A);
+; CODE-NEXT:     cudaCheckKernel();
+; CODE-NEXT:   }
+
+; CODE:   {
+; CODE-NEXT:     dim3 k1_dimBlock;
+; CODE-NEXT:     dim3 k1_dimGrid;
+; CODE-NEXT:     kernel1 <<<k1_dimGrid, k1_dimBlock>>> (dev_MemRef_sum_0__phi);
+; CODE-NEXT:     cudaCheckKernel();
+; CODE-NEXT:   }
+
+; CODE:   for (int c0 = 0; c0 <= 32; c0 += 1) {
+; CODE-NEXT:     {
+; CODE-NEXT:       dim3 k2_dimBlock;
+; CODE-NEXT:       dim3 k2_dimGrid;
+; CODE-NEXT:       kernel2 <<<k2_dimGrid, k2_dimBlock>>> (dev_MemRef_sum_0__phi, dev_MemRef_sum_0, c0);
+; CODE-NEXT:       cudaCheckKernel();
+; CODE-NEXT:     }
+
+; CODE:     if (c0 <= 31)
+; CODE-NEXT:       {
+; CODE-NEXT:         dim3 k3_dimBlock;
+; CODE-NEXT:         dim3 k3_dimGrid;
+; CODE-NEXT:         kernel3 <<<k3_dimGrid, k3_dimBlock>>> (dev_MemRef_A, dev_MemRef_sum_0__phi, dev_MemRef_sum_0, c0);
+; CODE-NEXT:         cudaCheckKernel();
+; CODE-NEXT:       }
+
+; CODE:   }
+; CODE-NEXT:   cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (32) * sizeof(float), cudaMemcpyDeviceToHost));
+; CODE-NEXT:   cudaCheckReturn(cudaMemcpy(&MemRef_sum_0__phi, dev_MemRef_sum_0__phi, sizeof(float), cudaMemcpyDeviceToHost));
+; CODE-NEXT:   cudaCheckReturn(cudaMemcpy(&MemRef_sum_0, dev_MemRef_sum_0, sizeof(float), cudaMemcpyDeviceToHost));
+; CODE-NEXT: }
+
+; CODE: # kernel0
+; CODE-NEXT: {
+; CODE-NEXT:   Stmt_bb4(t0);
+; CODE-NEXT:   Stmt_bb10(t0);
+; CODE-NEXT: }
+
+; CODE: # kernel1
+; CODE-NEXT: Stmt_bb17();
+
+; CODE: # kernel2
+; CODE-NEXT: Stmt_bb18(c0);
+
+; CODE: # kernel3
+; CODE-NEXT: Stmt_bb20(c0);
+
+; KERNEL-IR:       store float %p_tmp23, float* %sum.0.phiops
+; KERNEL-IR-NEXT:  [[REGA:%.+]] = bitcast i8* %MemRef_sum_0__phi to float*
+; KERNEL-IR-NEXT:  [[REGB:%.+]] = load float, float* %sum.0.phiops
+; KERNEL-IR-NEXT:  store float [[REGB]], float* [[REGA]]
+; KERNEL-IR-NEXT:  [[REGC:%.+]] = bitcast i8* %MemRef_sum_0 to float*
+; KERNEL-IR-NEXT:  [[REGD:%.+]] = load float, float* %sum.0.s2a
+; KERNEL-IR-NEXT:  store float [[REGD]], float* [[REGC]]
+; KERNEL-IR-NEXT:  ret void
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+@.str = private unnamed_addr constant [4 x i8] c"%f\0A\00", align 1
+
+define float @foo(float* %A) {
+bb:
+  br label %bb3
+
+bb3:                                              ; preds = %bb6, %bb
+  %i.0 = phi i64 [ 0, %bb ], [ %tmp7, %bb6 ]
+  %exitcond2 = icmp ne i64 %i.0, 32
+  br i1 %exitcond2, label %bb4, label %bb8
+
+bb4:                                              ; preds = %bb3
+  %tmp = sitofp i64 %i.0 to float
+  %tmp5 = getelementptr inbounds float, float* %A, i64 %i.0
+  store float %tmp, float* %tmp5, align 4
+  br label %bb6
+
+bb6:                                              ; preds = %bb4
+  %tmp7 = add nuw nsw i64 %i.0, 1
+  br label %bb3
+
+bb8:                                              ; preds = %bb3
+  br label %bb9
+
+bb9:                                              ; preds = %bb15, %bb8
+  %i1.0 = phi i64 [ 0, %bb8 ], [ %tmp16, %bb15 ]
+  %exitcond1 = icmp ne i64 %i1.0, 32
+  br i1 %exitcond1, label %bb10, label %bb17
+
+bb10:                                             ; preds = %bb9
+  %tmp11 = sitofp i64 %i1.0 to float
+  %tmp12 = getelementptr inbounds float, float* %A, i64 %i1.0
+  %tmp13 = load float, float* %tmp12, align 4
+  %tmp14 = fadd float %tmp13, %tmp11
+  store float %tmp14, float* %tmp12, align 4
+  br label %bb15
+
+bb15:                                             ; preds = %bb10
+  %tmp16 = add nuw nsw i64 %i1.0, 1
+  br label %bb9
+
+bb17:                                             ; preds = %bb9
+  br label %bb18
+
+bb18:                                             ; preds = %bb20, %bb17
+  %sum.0 = phi float [ 0.000000e+00, %bb17 ], [ %tmp23, %bb20 ]
+  %i2.0 = phi i64 [ 0, %bb17 ], [ %tmp24, %bb20 ]
+  %exitcond = icmp ne i64 %i2.0, 32
+  br i1 %exitcond, label %bb19, label %bb25
+
+bb19:                                             ; preds = %bb18
+  br label %bb20
+
+bb20:                                             ; preds = %bb19
+  %tmp21 = getelementptr inbounds float, float* %A, i64 %i2.0
+  %tmp22 = load float, float* %tmp21, align 4
+  %tmp23 = fadd float %sum.0, %tmp22
+  %tmp24 = add nuw nsw i64 %i2.0, 1
+  br label %bb18
+
+bb25:                                             ; preds = %bb18
+  %sum.0.lcssa = phi float [ %sum.0, %bb18 ]
+  ret float %sum.0.lcssa
+}
+
+define i32 @main() {
+bb:
+  %A = alloca [32 x float], align 16
+  %tmp = getelementptr inbounds [32 x float], [32 x float]* %A, i64 0, i64 0
+  %tmp1 = call float @foo(float* %tmp)
+  %tmp2 = fpext float %tmp1 to double
+  %tmp3 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), double %tmp2) #2
+  ret i32 0
+}
+
+declare i32 @printf(i8*, ...) #1
+
author	Tobias Grosser <tobias@grosser.es>
	Sat, 17 Sep 2016 19:22:31 +0000 (19:22 +0000)
committer	Tobias Grosser <tobias@grosser.es>
	Sat, 17 Sep 2016 19:22:31 +0000 (19:22 +0000)
polly/lib/CodeGen/PPCGCodeGeneration.cpp		patch \| blob \| history
polly/test/GPGPU/non-read-only-scalars.ll	[new file with mode: 0644]	patch \| blob