GPGPU: Sort dimension sizes of multi-dimensional shared memory arrays correctly

author Tobias Grosser <tobias@grosser.es>

Fri, 5 Aug 2016 08:27:24 +0000 (08:27 +0000)

committer Tobias Grosser <tobias@grosser.es>

Fri, 5 Aug 2016 08:27:24 +0000 (08:27 +0000)
author Tobias Grosser <tobias@grosser.es>
Fri, 5 Aug 2016 08:27:24 +0000 (08:27 +0000)
committer Tobias Grosser <tobias@grosser.es>
Fri, 5 Aug 2016 08:27:24 +0000 (08:27 +0000)
diff --git a/polly/lib/CodeGen/PPCGCodeGeneration.cpp b/polly/lib/CodeGen/PPCGCodeGeneration.cpp

index d418300..fa37f09 100644 (file)
--- a/polly/lib/CodeGen/PPCGCodeGeneration.cpp
+++ b/polly/lib/CodeGen/PPCGCodeGeneration.cpp
@@ -1292,11 +1292,17 @@ void GPUNodeBuilder::createKernelVariables(ppcg_kernel *Kernel, Function *FN) {
      Type *ArrayTy = EleTy;
      SmallVector<const SCEV *, 4> Sizes;
  
-    for (unsigned int j = 0; j < Var.array->n_index; ++j) {
+    for (unsigned int j = 1; j < Var.array->n_index; ++j) {
        isl_val *Val = isl_vec_get_element_val(Var.size, j);
        long Bound = isl_val_get_num_si(Val);
        isl_val_free(Val);
        Sizes.push_back(S.getSE()->getConstant(Builder.getInt64Ty(), Bound));
+    }
+
+    for (int j = Var.array->n_index - 1; j >= 0; --j) {
+      isl_val *Val = isl_vec_get_element_val(Var.size, j);
+      long Bound = isl_val_get_num_si(Val);
+      isl_val_free(Val);
        ArrayTy = ArrayType::get(ArrayTy, Bound);
      }
  
diff --git a/polly/test/GPGPU/shared-memory-two-dimensional.ll b/polly/test/GPGPU/shared-memory-two-dimensional.ll

new file mode 100644 (file)

index 0000000..4b4fcfe
--- /dev/null
+++ b/polly/test/GPGPU/shared-memory-two-dimensional.ll
@@ -0,0 +1,103 @@
+; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \
+; RUN: -polly-acc-use-shared \
+; RUN: -disable-output < %s | \
+; RUN: FileCheck -check-prefix=CODE %s
+
+; RUN: opt %loadPolly -polly-codegen-ppcg \
+; RUN: -polly-acc-use-shared \
+; RUN: -disable-output -polly-acc-dump-kernel-ir < %s | \
+; RUN: FileCheck -check-prefix=KERNEL %s
+
+; REQUIRES: pollyacc
+
+;    void foo(float A[], float b[][8]) {
+;      for (long i = 0; i < 32; i++)
+;        for (long j = 0; j < 16; j++)
+;          for (long k = 0; k < 8; k++)
+;            A[i] += j * k * b[j][k];
+;    }
+
+
+; CODE:      # kernel0
+; CODE-NEXT: {
+; CODE-NEXT:   if (t0 <= 7)
+; CODE-NEXT:     for (int c0 = 0; c0 <= 15; c0 += 1)
+; CODE-NEXT:       read(c0, t0);
+; CODE-NEXT:   read(t0);
+; CODE-NEXT:   sync0();
+; CODE-NEXT:   for (int c3 = 0; c3 <= 15; c3 += 1)
+; CODE-NEXT:     for (int c4 = 0; c4 <= 7; c4 += 1)
+; CODE-NEXT:       Stmt_bb8(t0, c3, c4);
+; CODE-NEXT:   sync1();
+; CODE-NEXT:   write(t0);
+; CODE-NEXT: }
+
+; KERNEL: @shared_MemRef_b = internal addrspace(3) global [16 x [8 x float]] zeroinitializer, align 4
+
+; KERNEL:        %polly.access.mul.MemRef_b = mul nsw i64 %polly.indvar, 8
+; KERNEL-NEXT:   %polly.access.add.MemRef_b = add nsw i64 %polly.access.mul.MemRef_b, %t0
+; KERNEL-NEXT:   %polly.access.MemRef_b = getelementptr float, float* %polly.access.cast.MemRef_b, i64 %polly.access.add.MemRef_b
+; KERNEL-NEXT:   %shared.read = load float, float* %polly.access.MemRef_b
+; KERNEL-NEXT:   store float %shared.read, float addrspace(3)* %polly.access.shared_MemRef_b
+
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @foo(float* %A, [8 x float]* %b) {
+bb:
+  br label %bb3
+
+bb3:                                              ; preds = %bb22, %bb
+  %i.0 = phi i64 [ 0, %bb ], [ %tmp23, %bb22 ]
+  %exitcond2 = icmp ne i64 %i.0, 32
+  br i1 %exitcond2, label %bb4, label %bb24
+
+bb4:                                              ; preds = %bb3
+  br label %bb5
+
+bb5:                                              ; preds = %bb19, %bb4
+  %j.0 = phi i64 [ 0, %bb4 ], [ %tmp20, %bb19 ]
+  %exitcond1 = icmp ne i64 %j.0, 16
+  br i1 %exitcond1, label %bb6, label %bb21
+
+bb6:                                              ; preds = %bb5
+  br label %bb7
+
+bb7:                                              ; preds = %bb16, %bb6
+  %k.0 = phi i64 [ 0, %bb6 ], [ %tmp17, %bb16 ]
+  %exitcond = icmp ne i64 %k.0, 8
+  br i1 %exitcond, label %bb8, label %bb18
+
+bb8:                                              ; preds = %bb7
+  %tmp = mul nuw nsw i64 %j.0, %k.0
+  %tmp9 = sitofp i64 %tmp to float
+  %tmp10 = getelementptr inbounds [8 x float], [8 x float]* %b, i64 %j.0, i64 %k.0
+  %tmp11 = load float, float* %tmp10, align 4
+  %tmp12 = fmul float %tmp9, %tmp11
+  %tmp13 = getelementptr inbounds float, float* %A, i64 %i.0
+  %tmp14 = load float, float* %tmp13, align 4
+  %tmp15 = fadd float %tmp14, %tmp12
+  store float %tmp15, float* %tmp13, align 4
+  br label %bb16
+
+bb16:                                             ; preds = %bb8
+  %tmp17 = add nuw nsw i64 %k.0, 1
+  br label %bb7
+
+bb18:                                             ; preds = %bb7
+  br label %bb19
+
+bb19:                                             ; preds = %bb18
+  %tmp20 = add nuw nsw i64 %j.0, 1
+  br label %bb5
+
+bb21:                                             ; preds = %bb5
+  br label %bb22
+
+bb22:                                             ; preds = %bb21
+  %tmp23 = add nuw nsw i64 %i.0, 1
+  br label %bb3
+
+bb24:                                             ; preds = %bb3
+  ret void
+}
author	Tobias Grosser <tobias@grosser.es>
	Fri, 5 Aug 2016 08:27:24 +0000 (08:27 +0000)
committer	Tobias Grosser <tobias@grosser.es>
	Fri, 5 Aug 2016 08:27:24 +0000 (08:27 +0000)
polly/lib/CodeGen/PPCGCodeGeneration.cpp		patch \| blob \| history
polly/test/GPGPU/shared-memory-two-dimensional.ll	[new file with mode: 0644]	patch \| blob