; REQUIRES: pollyacc
; CHECK: Stmt_bb5
-; CHECK: Domain :=
-; CHECK: { Stmt_bb5[i0, i1] : 0 <= i0 <= 1023 and 0 <= i1 <= 1023 };
-; CHECK: Schedule :=
-; CHECK: { Stmt_bb5[i0, i1] -> [i0, i1] };
-; CHECK: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
-; CHECK: { Stmt_bb5[i0, i1] -> MemRef_A[i0, i1] };
-; CHECK: MustWriteAccess := [Reduction Type: NONE] [Scalar: 0]
-; CHECK: { Stmt_bb5[i0, i1] -> MemRef_A[i0, i1] };
+; CHECK-NEXT: Domain :=
+; CHECK-NEXT: { Stmt_bb5[i0, i1] : 0 <= i0 <= 1023 and 0 <= i1 <= 1023 };
+; CHECK-NEXT: Schedule :=
+; CHECK-NEXT: { Stmt_bb5[i0, i1] -> [i0, i1] };
+; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
+; CHECK-NEXT: { Stmt_bb5[i0, i1] -> MemRef_A[i0, i1] };
+; CHECK-NEXT: MustWriteAccess := [Reduction Type: NONE] [Scalar: 0]
+; CHECK-NEXT: { Stmt_bb5[i0, i1] -> MemRef_A[i0, i1] };
; SCHED: domain: "{ Stmt_bb5[i0, i1] : 0 <= i0 <= 1023 and 0 <= i1 <= 1023 }"
-; SCHED: child:
-; SCHED: context: "{ [] }"
-; SCHED: child:
-; SCHED: extension: "{ }"
-; SCHED: child:
-; SCHED: sequence:
-; SCHED: - filter: "{ }"
-; SCHED: - filter: "{ Stmt_bb5[i0, i1] }"
-; SCHED: child:
-; SCHED: guard: "{ [] }"
-; SCHED: child:
-; SCHED: mark: "kernel"
-; SCHED: child:
-; SCHED: context: "[b0, b1, t0, t1] -> { [] : 0 <= b0 <= 31 and 0 <= b1 <= 31 and 0 <= t0 <= 31 and 0 <= t1 <= 15 }"
-; SCHED: child:
-; SCHED: filter: "[b0, b1] -> { Stmt_bb5[i0, i1] : -31 - 32b0 + i0 <= 8192*floor((i0)/8192) <= -32b0 + i0 and -31 - 32b1 + i1 <= 8192*floor((i1)/8192) <= -32b1 + i1 }"
-; SCHED: child:
-; SCHED: schedule: "[{ Stmt_bb5[i0, i1] -> [(floor((i0)/8192))] }, { Stmt_bb5[i0, i1] -> [(floor((i1)/8192))] }]"
-; SCHED: permutable: 1
-; SCHED: coincident: [ 1, 1 ]
-; SCHED: child:
-; SCHED: filter: "[t0, t1] -> { Stmt_bb5[i0, i1] : 32*floor((-t0 + i0)/32) = -t0 + i0 and 16*floor((-t1 + i1)/16) = -t1 + i1 and 0 <= t0 <= 31 and 0 <= t1 <= 15 }"
-; SCHED: child:
-; SCHED: schedule: "[{ Stmt_bb5[i0, i1] -> [(0)] }, { Stmt_bb5[i0, i1] -> [(floor((i1)/16) - 2*floor((i1)/32))] }]"
-; SCHED: permutable: 1
-; SCHED: coincident: [ 1, 1 ]
-; SCHED: - filter: "{ }"
+; SCHED-NEXT: child:
+; SCHED-NEXT: context: "{ [] }"
+; SCHED-NEXT: child:
+; SCHED-NEXT: extension: "{ }"
+; SCHED-NEXT: child:
+; SCHED-NEXT: sequence:
+; SCHED-NEXT: - filter: "{ }"
+; SCHED-NEXT: - filter: "{ Stmt_bb5[i0, i1] }"
+; SCHED-NEXT: child:
+; SCHED-NEXT: guard: "{ [] }"
+; SCHED-NEXT: child:
+; SCHED-NEXT: mark: "kernel"
+; SCHED-NEXT: child:
+; SCHED-NEXT: context: "[b0, b1, t0, t1] -> { [] : 0 <= b0 <= 31 and 0 <= b1 <= 31 and 0 <= t0 <= 31 and 0 <= t1 <= 15 }"
+; SCHED-NEXT: child:
+; SCHED-NEXT: filter: "[b0, b1] -> { Stmt_bb5[i0, i1] : -31 - 32b0 + i0 <= 8192*floor((i0)/8192) <= -32b0 + i0 and -31 - 32b1 + i1 <= 8192*floor((i1)/8192) <= -32b1 + i1 }"
+; SCHED-NEXT: child:
+; SCHED-NEXT: schedule: "[{ Stmt_bb5[i0, i1] -> [(floor((i0)/8192))] }, { Stmt_bb5[i0, i1] -> [(floor((i1)/8192))] }]"
+; SCHED-NEXT: permutable: 1
+; SCHED-NEXT: coincident: [ 1, 1 ]
+; SCHED-NEXT: child:
+; SCHED-NEXT: filter: "[t0, t1] -> { Stmt_bb5[i0, i1] : 32*floor((-t0 + i0)/32) = -t0 + i0 and 16*floor((-t1 + i1)/16) = -t1 + i1 and 0 <= t0 <= 31 and 0 <= t1 <= 15 }"
+; SCHED-NEXT: child:
+; SCHED-NEXT: schedule: "[{ Stmt_bb5[i0, i1] -> [(0)] }, { Stmt_bb5[i0, i1] -> [(floor((i1)/16) - 2*floor((i1)/32))] }]"
+; SCHED-NEXT: permutable: 1
+; SCHED-NEXT: coincident: [ 1, 1 ]
+; SCHED-NEXT: - filter: "{ }"
; CODE: Code
-; CODE: ====
-; CODE: # host
-; CODE: {
-; CODE: dim3 k0_dimBlock(16, 32);
-; CODE: dim3 k0_dimGrid(32, 32);
-; CODE: kernel0 <<<k0_dimGrid, k0_dimBlock>>> ();
-; CODE: }
+; CODE-NEXT: ====
+; CODE-NEXT: # host
+; CODE-NEXT: {
+; CODE-NEXT: dim3 k0_dimBlock(16, 32);
+; CODE-NEXT: dim3 k0_dimGrid(32, 32);
+; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> ();
+; CODE-NEXT: cudaCheckKernel();
+; CODE-NEXT: }
; CODE: # kernel0
-; CODE: for (int c3 = 0; c3 <= 1; c3 += 1)
-; CODE: Stmt_bb5(32 * b0 + t0, 32 * b1 + t1 + 16 * c3);
+; CODE-NEXT: for (int c3 = 0; c3 <= 1; c3 += 1)
+; CODE-NEXT: Stmt_bb5(32 * b0 + t0, 32 * b1 + t1 + 16 * c3);
; void double_parallel_loop(float A[][1024]) {