From 69b46751804a8ac162fd219188cadddc6721ca44 Mon Sep 17 00:00:00 2001 From: Tobias Grosser Date: Thu, 14 Jul 2016 15:51:37 +0000 Subject: [PATCH] GPGPU: Generate an AST for the GPU-mapped schedule For this we need to provide an explicit list of statements as they occur in the polly::Scop to ppcg. We also setup basic AST printing facilities to facilitate debugging. To allow code reuse some (minor) changes in ppcg have been necessary. llvm-svn: 275436 --- polly/lib/CodeGen/PPCGCodeGeneration.cpp | 131 ++++++++++++++++++++++++++++++- polly/lib/External/ppcg/cuda.c | 19 ++--- polly/lib/External/ppcg/cuda.h | 3 + polly/lib/External/ppcg/gpu.c | 2 +- polly/lib/External/ppcg/gpu.h | 2 + polly/test/GPGPU/double-parallel-loop.ll | 18 +++++ 6 files changed, 161 insertions(+), 14 deletions(-) diff --git a/polly/lib/CodeGen/PPCGCodeGeneration.cpp b/polly/lib/CodeGen/PPCGCodeGeneration.cpp index bb6ea8cd..24fe61b 100644 --- a/polly/lib/CodeGen/PPCGCodeGeneration.cpp +++ b/polly/lib/CodeGen/PPCGCodeGeneration.cpp @@ -26,8 +26,11 @@ #include "isl/union_map.h" extern "C" { +#include "cuda.h" #include "gpu.h" +#include "gpu_print.h" #include "ppcg.h" +#include "schedule.h" } #include "llvm/Support/Debug.h" @@ -41,6 +44,12 @@ static cl::opt DumpSchedule("polly-acc-dump-schedule", cl::desc("Dump the computed GPU Schedule"), cl::Hidden, cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory)); + +static cl::opt + DumpCode("polly-acc-dump-code", + cl::desc("Dump C code describing the GPU mapping"), cl::Hidden, + cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory)); + /// Create the ast expressions for a ScopStmt. /// /// This function is a callback for to generate the ast expressions for each @@ -256,6 +265,33 @@ public: return PPCGScop; } + /// Collect the list of GPU statements. + /// + /// Each statement has an id, a pointer to the underlying data structure, + /// as well as a list with all memory accesses. + /// + /// TODO: Initialize the list of memory accesses. 
+ /// + /// @returns A linked-list of statements. + gpu_stmt *getStatements() { + gpu_stmt *Stmts = isl_calloc_array(S->getIslCtx(), struct gpu_stmt, + std::distance(S->begin(), S->end())); + + int i = 0; + for (auto &Stmt : *S) { + gpu_stmt *GPUStmt = &Stmts[i]; + + GPUStmt->id = Stmt.getDomainId(); + + // We use the pet stmt pointer to keep track of the Polly statements. + GPUStmt->stmt = (pet_stmt *)&Stmt; + GPUStmt->accesses = nullptr; + i++; + } + + return Stmts; + } + /// Create a default-initialized PPCG GPU program. /// /// @returns A new gpu grogram description. @@ -278,14 +314,90 @@ public: PPCGProg->to_inner = nullptr; PPCGProg->any_to_outer = nullptr; PPCGProg->array_order = nullptr; - PPCGProg->n_stmts = 0; - PPCGProg->stmts = nullptr; + PPCGProg->n_stmts = std::distance(S->begin(), S->end()); + PPCGProg->stmts = getStatements(); PPCGProg->n_array = 0; PPCGProg->array = nullptr; return PPCGProg; } + struct PrintGPUUserData { + struct cuda_info *CudaInfo; + struct gpu_prog *PPCGProg; + std::vector Kernels; + }; + + /// Print a user statement node in the host code. + /// + /// We use ppcg's printing facilities to print the actual statement and + /// additionally build up a list of all kernels that are encountered in the + /// host ast. + /// + /// @param P The printer to print to + /// @param Options The printing options to use + /// @param Node The node to print + /// @param User A user pointer to carry additional data. This pointer is + /// expected to be of type PrintGPUUserData. + /// + /// @returns A printer to which the output has been printed. 
+ static __isl_give isl_printer * + printHostUser(__isl_take isl_printer *P, + __isl_take isl_ast_print_options *Options, + __isl_take isl_ast_node *Node, void *User) { + auto Data = (struct PrintGPUUserData *)User; + auto Id = isl_ast_node_get_annotation(Node); + + if (Id) { + auto Kernel = (struct ppcg_kernel *)isl_id_get_user(Id); + isl_id_free(Id); + Data->Kernels.push_back(Kernel); + } + + return print_host_user(P, Options, Node, User); + } + + /// Print C code corresponding to the control flow in @p Kernel. + /// + /// @param Kernel The kernel to print + void printKernel(ppcg_kernel *Kernel) { + auto *P = isl_printer_to_str(S->getIslCtx()); + P = isl_printer_set_output_format(P, ISL_FORMAT_C); + auto *Options = isl_ast_print_options_alloc(S->getIslCtx()); + P = isl_ast_node_print(Kernel->tree, P, Options); + char *String = isl_printer_get_str(P); + printf("%s\n", String); + free(String); + isl_printer_free(P); + } + + /// Print C code corresponding to the GPU code described by @p Tree. + /// + /// @param Tree An AST describing GPU code + /// @param PPCGProg The PPCG program from which @Tree has been constructed. + void printGPUTree(isl_ast_node *Tree, gpu_prog *PPCGProg) { + auto *P = isl_printer_to_str(S->getIslCtx()); + P = isl_printer_set_output_format(P, ISL_FORMAT_C); + + PrintGPUUserData Data; + Data.PPCGProg = PPCGProg; + + auto *Options = isl_ast_print_options_alloc(S->getIslCtx()); + Options = + isl_ast_print_options_set_print_user(Options, printHostUser, &Data); + P = isl_ast_node_print(Tree, P, Options); + char *String = isl_printer_get_str(P); + printf("# host\n"); + printf("%s\n", String); + free(String); + isl_printer_free(P); + + for (auto Kernel : Data.Kernels) { + printf("# kernel%d\n", Kernel->id); + printKernel(Kernel); + } + } + // Generate a GPU program using PPCG. 
// // GPU mapping consists of multiple steps: @@ -322,10 +434,12 @@ public: int has_permutable = has_any_permutable_node(Schedule); - if (!has_permutable || has_permutable < 0) + if (!has_permutable || has_permutable < 0) { Schedule = isl_schedule_free(Schedule); - else + } else { Schedule = map_to_device(PPCGGen, Schedule); + PPCGGen->tree = generate_code(PPCGGen, isl_schedule_copy(Schedule)); + } if (DumpSchedule) { isl_printer *P = isl_printer_to_str(S->getIslCtx()); @@ -341,6 +455,15 @@ public: isl_printer_free(P); } + if (DumpCode) { + printf("Code\n"); + printf("====\n"); + if (PPCGGen->tree) + printGPUTree(PPCGGen->tree, PPCGProg); + else + printf("No code generated\n"); + } + isl_schedule_free(Schedule); return PPCGGen; diff --git a/polly/lib/External/ppcg/cuda.c b/polly/lib/External/ppcg/cuda.c index 3063f6d..1b605f5 100644 --- a/polly/lib/External/ppcg/cuda.c +++ b/polly/lib/External/ppcg/cuda.c @@ -153,20 +153,20 @@ static __isl_give isl_printer *copy_array_from_device( return p; } -static void print_reverse_list(FILE *out, int len, int *list) +static isl_printer *print_reverse_list(isl_printer *p, int len, int *list) { int i; if (len == 0) - return; + return p; - fprintf(out, "("); + p = isl_printer_print_str(p, "("); for (i = 0; i < len; ++i) { if (i) - fprintf(out, ", "); - fprintf(out, "%d", list[len - 1 - i]); + p = isl_printer_print_str(p, ", "); + p = isl_printer_print_int(p, list[len - 1 - i]); } - fprintf(out, ")"); + return isl_printer_print_str(p, ")"); } /* Print the effective grid size as a list of the sizes in each @@ -534,7 +534,7 @@ struct print_host_user_data { * In case of a kernel launch, print a block of statements that * defines the grid and the block and then launches the kernel. 
*/ -static __isl_give isl_printer *print_host_user(__isl_take isl_printer *p, +__isl_give isl_printer *print_host_user(__isl_take isl_printer *p, __isl_take isl_ast_print_options *print_options, __isl_keep isl_ast_node *node, void *user) { @@ -569,8 +569,7 @@ static __isl_give isl_printer *print_host_user(__isl_take isl_printer *p, p = isl_printer_print_str(p, "dim3 k"); p = isl_printer_print_int(p, kernel->id); p = isl_printer_print_str(p, "_dimBlock"); - print_reverse_list(isl_printer_get_file(p), - kernel->n_block, kernel->block_dim); + p = print_reverse_list(p, kernel->n_block, kernel->block_dim); p = isl_printer_print_str(p, ";"); p = isl_printer_end_line(p); @@ -600,7 +599,9 @@ static __isl_give isl_printer *print_host_user(__isl_take isl_printer *p, p = isl_printer_start_line(p); p = isl_printer_end_line(p); +#if 0 print_kernel(data->prog, kernel, data->cuda); +#endif return p; } diff --git a/polly/lib/External/ppcg/cuda.h b/polly/lib/External/ppcg/cuda.h index 89175fd..bd8dd3d 100644 --- a/polly/lib/External/ppcg/cuda.h +++ b/polly/lib/External/ppcg/cuda.h @@ -6,5 +6,8 @@ int generate_cuda(isl_ctx *ctx, struct ppcg_options *options, const char *input); +__isl_give isl_printer *print_host_user(__isl_take isl_printer *p, + __isl_take isl_ast_print_options *print_options, + __isl_keep isl_ast_node *node, void *user); #endif diff --git a/polly/lib/External/ppcg/gpu.c b/polly/lib/External/ppcg/gpu.c index 7bda56a..218b918 100644 --- a/polly/lib/External/ppcg/gpu.c +++ b/polly/lib/External/ppcg/gpu.c @@ -2297,7 +2297,7 @@ static isl_bool update_depth(__isl_keep isl_schedule_node *node, void *user) * The ASTs for the device code are embedded in ppcg_kernel objects * attached to the leaf nodes that call "kernel". 
*/ -static __isl_give isl_ast_node *generate_code(struct gpu_gen *gen, +__isl_give isl_ast_node *generate_code(struct gpu_gen *gen, __isl_take isl_schedule *schedule) { struct ppcg_at_domain_data data; diff --git a/polly/lib/External/ppcg/gpu.h b/polly/lib/External/ppcg/gpu.h index 7038901..7d617de 100644 --- a/polly/lib/External/ppcg/gpu.h +++ b/polly/lib/External/ppcg/gpu.h @@ -367,4 +367,6 @@ __isl_give isl_schedule *get_schedule(struct gpu_gen *gen); int has_any_permutable_node(__isl_keep isl_schedule *schedule); __isl_give isl_schedule *map_to_device(struct gpu_gen *gen, __isl_take isl_schedule *schedule); +__isl_give isl_ast_node *generate_code(struct gpu_gen *gen, + __isl_take isl_schedule *schedule); #endif diff --git a/polly/test/GPGPU/double-parallel-loop.ll b/polly/test/GPGPU/double-parallel-loop.ll index 963a411..46aab52 100644 --- a/polly/test/GPGPU/double-parallel-loop.ll +++ b/polly/test/GPGPU/double-parallel-loop.ll @@ -3,6 +3,10 @@ ; RUN: -disable-output < %s | \ ; RUN: FileCheck -check-prefix=SCHED %s +; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \ +; RUN: -disable-output < %s | \ +; RUN: FileCheck -check-prefix=CODE %s + ; REQUIRES: pollyacc ; CHECK: Stmt_bb5 @@ -44,6 +48,20 @@ ; SCHED: coincident: [ 1, 1 ] ; SCHED: - filter: "{ }" +; CODE: Code +; CODE: ==== +; CODE: # host +; CODE: { +; CODE: dim3 k0_dimBlock(16, 32); +; CODE: dim3 k0_dimGrid(32, 32); +; CODE: kernel0 <<>> (); +; CODE: } + +; CODE: # kernel0 +; CODE: for (int c3 = 0; c3 <= 1; c3 += 1) +; CODE: Stmt_bb5(32 * b0 + t0, 32 * b1 + t1 + 16 * c3); + + ; void double_parallel_loop(float A[][1024]) { ; for (long i = 0; i < 1024; i++) ; for (long j = 0; j < 1024; j++) -- 2.7.4