From 69b46751804a8ac162fd219188cadddc6721ca44 Mon Sep 17 00:00:00 2001 From: Tobias Grosser Date: Thu, 14 Jul 2016 15:51:37 +0000 Subject: [PATCH] GPGPU: Generate an AST for the GPU-mapped schedule For this we need to provide an explicit list of statements as they occur in the polly::Scop to ppcg. We also setup basic AST printing facilities to facilitate debugging. To allow code reuse some (minor) changes in ppcg have been necessary. llvm-svn: 275436 --- polly/lib/CodeGen/PPCGCodeGeneration.cpp | 131 ++++++++++++++++++++++++++++++- polly/lib/External/ppcg/cuda.c | 19 ++--- polly/lib/External/ppcg/cuda.h | 3 + polly/lib/External/ppcg/gpu.c | 2 +- polly/lib/External/ppcg/gpu.h | 2 + polly/test/GPGPU/double-parallel-loop.ll | 18 +++++ 6 files changed, 161 insertions(+), 14 deletions(-) diff --git a/polly/lib/CodeGen/PPCGCodeGeneration.cpp b/polly/lib/CodeGen/PPCGCodeGeneration.cpp index bb6ea8cd..24fe61b 100644 --- a/polly/lib/CodeGen/PPCGCodeGeneration.cpp +++ b/polly/lib/CodeGen/PPCGCodeGeneration.cpp @@ -26,8 +26,11 @@ #include "isl/union_map.h" extern "C" { +#include "cuda.h" #include "gpu.h" +#include "gpu_print.h" #include "ppcg.h" +#include "schedule.h" } #include "llvm/Support/Debug.h" @@ -41,6 +44,12 @@ static cl::opt DumpSchedule("polly-acc-dump-schedule", cl::desc("Dump the computed GPU Schedule"), cl::Hidden, cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory)); + +static cl::opt + DumpCode("polly-acc-dump-code", + cl::desc("Dump C code describing the GPU mapping"), cl::Hidden, + cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory)); + /// Create the ast expressions for a ScopStmt. /// /// This function is a callback for to generate the ast expressions for each @@ -256,6 +265,33 @@ public: return PPCGScop; } + /// Collect the list of GPU statements. + /// + /// Each statement has an id, a pointer to the underlying data structure, + /// as well as a list with all memory accesses. + /// + /// TODO: Initialize the list of memory accesses. 
+ /// + /// @returns A linked-list of statements. + gpu_stmt *getStatements() { + gpu_stmt *Stmts = isl_calloc_array(S->getIslCtx(), struct gpu_stmt, + std::distance(S->begin(), S->end())); + + int i = 0; + for (auto &Stmt : *S) { + gpu_stmt *GPUStmt = &Stmts[i]; + + GPUStmt->id = Stmt.getDomainId(); + + // We use the pet stmt pointer to keep track of the Polly statements. + GPUStmt->stmt = (pet_stmt *)&Stmt; + GPUStmt->accesses = nullptr; + i++; + } + + return Stmts; + } + /// Create a default-initialized PPCG GPU program. /// /// @returns A new gpu grogram description. @@ -278,14 +314,90 @@ public: PPCGProg->to_inner = nullptr; PPCGProg->any_to_outer = nullptr; PPCGProg->array_order = nullptr; - PPCGProg->n_stmts = 0; - PPCGProg->stmts = nullptr; + PPCGProg->n_stmts = std::distance(S->begin(), S->end()); + PPCGProg->stmts = getStatements(); PPCGProg->n_array = 0; PPCGProg->array = nullptr; return PPCGProg; } + struct PrintGPUUserData { + struct cuda_info *CudaInfo; + struct gpu_prog *PPCGProg; + std::vector Kernels; + }; + + /// Print a user statement node in the host code. + /// + /// We use ppcg's printing facilities to print the actual statement and + /// additionally build up a list of all kernels that are encountered in the + /// host ast. + /// + /// @param P The printer to print to + /// @param Options The printing options to use + /// @param Node The node to print + /// @param User A user pointer to carry additional data. This pointer is + /// expected to be of type PrintGPUUserData. + /// + /// @returns A printer to which the output has been printed. 
+ static __isl_give isl_printer * + printHostUser(__isl_take isl_printer *P, + __isl_take isl_ast_print_options *Options, + __isl_take isl_ast_node *Node, void *User) { + auto Data = (struct PrintGPUUserData *)User; + auto Id = isl_ast_node_get_annotation(Node); + + if (Id) { + auto Kernel = (struct ppcg_kernel *)isl_id_get_user(Id); + isl_id_free(Id); + Data->Kernels.push_back(Kernel); + } + + return print_host_user(P, Options, Node, User); + } + + /// Print C code corresponding to the control flow in @p Kernel. + /// + /// @param Kernel The kernel to print + void printKernel(ppcg_kernel *Kernel) { + auto *P = isl_printer_to_str(S->getIslCtx()); + P = isl_printer_set_output_format(P, ISL_FORMAT_C); + auto *Options = isl_ast_print_options_alloc(S->getIslCtx()); + P = isl_ast_node_print(Kernel->tree, P, Options); + char *String = isl_printer_get_str(P); + printf("%s\n", String); + free(String); + isl_printer_free(P); + } + + /// Print C code corresponding to the GPU code described by @p Tree. + /// + /// @param Tree An AST describing GPU code + /// @param PPCGProg The PPCG program from which @Tree has been constructed. + void printGPUTree(isl_ast_node *Tree, gpu_prog *PPCGProg) { + auto *P = isl_printer_to_str(S->getIslCtx()); + P = isl_printer_set_output_format(P, ISL_FORMAT_C); + + PrintGPUUserData Data; + Data.PPCGProg = PPCGProg; + + auto *Options = isl_ast_print_options_alloc(S->getIslCtx()); + Options = + isl_ast_print_options_set_print_user(Options, printHostUser, &Data); + P = isl_ast_node_print(Tree, P, Options); + char *String = isl_printer_get_str(P); + printf("# host\n"); + printf("%s\n", String); + free(String); + isl_printer_free(P); + + for (auto Kernel : Data.Kernels) { + printf("# kernel%d\n", Kernel->id); + printKernel(Kernel); + } + } + // Generate a GPU program using PPCG. 
// // GPU mapping consists of multiple steps: @@ -322,10 +434,12 @@ public: int has_permutable = has_any_permutable_node(Schedule); - if (!has_permutable || has_permutable < 0) + if (!has_permutable || has_permutable < 0) { Schedule = isl_schedule_free(Schedule); - else + } else { Schedule = map_to_device(PPCGGen, Schedule); + PPCGGen->tree = generate_code(PPCGGen, isl_schedule_copy(Schedule)); + } if (DumpSchedule) { isl_printer *P = isl_printer_to_str(S->getIslCtx()); @@ -341,6 +455,15 @@ public: isl_printer_free(P); } + if (DumpCode) { + printf("Code\n"); + printf("====\n"); + if (PPCGGen->tree) + printGPUTree(PPCGGen->tree, PPCGProg); + else + printf("No code generated\n"); + } + isl_schedule_free(Schedule); return PPCGGen; diff --git a/polly/lib/External/ppcg/cuda.c b/polly/lib/External/ppcg/cuda.c index 3063f6d..1b605f5 100644 --- a/polly/lib/External/ppcg/cuda.c +++ b/polly/lib/External/ppcg/cuda.c @@ -153,20 +153,20 @@ static __isl_give isl_printer *copy_array_from_device( return p; } -static void print_reverse_list(FILE *out, int len, int *list) +static isl_printer *print_reverse_list(isl_printer *p, int len, int *list) { int i; if (len == 0) - return; + return p; - fprintf(out, "("); + p = isl_printer_print_str(p, "("); for (i = 0; i < len; ++i) { if (i) - fprintf(out, ", "); - fprintf(out, "%d", list[len - 1 - i]); + p = isl_printer_print_str(p, ", "); + p = isl_printer_print_int(p, list[len - 1 - i]); } - fprintf(out, ")"); + return isl_printer_print_str(p, ")"); } /* Print the effective grid size as a list of the sizes in each @@ -534,7 +534,7 @@ struct print_host_user_data { * In case of a kernel launch, print a block of statements that * defines the grid and the block and then launches the kernel. 
*/ -static __isl_give isl_printer *print_host_user(__isl_take isl_printer *p, +__isl_give isl_printer *print_host_user(__isl_take isl_printer *p, __isl_take isl_ast_print_options *print_options, __isl_keep isl_ast_node *node, void *user) { @@ -569,8 +569,7 @@ static __isl_give isl_printer *print_host_user(__isl_take isl_printer *p, p = isl_printer_print_str(p, "dim3 k"); p = isl_printer_print_int(p, kernel->id); p = isl_printer_print_str(p, "_dimBlock"); - print_reverse_list(isl_printer_get_file(p), - kernel->n_block, kernel->block_dim); + p = print_reverse_list(p, kernel->n_block, kernel->block_dim); p = isl_printer_print_str(p, ";"); p = isl_printer_end_line(p); @@ -600,7 +599,9 @@ static __isl_give isl_printer *print_host_user(__isl_take isl_printer *p, p = isl_printer_start_line(p); p = isl_printer_end_line(p); +#if 0 print_kernel(data->prog, kernel, data->cuda); +#endif return p; } diff --git a/polly/lib/External/ppcg/cuda.h b/polly/lib/External/ppcg/cuda.h index 89175fd..bd8dd3d 100644 --- a/polly/lib/External/ppcg/cuda.h +++ b/polly/lib/External/ppcg/cuda.h @@ -6,5 +6,8 @@ int generate_cuda(isl_ctx *ctx, struct ppcg_options *options, const char *input); +__isl_give isl_printer *print_host_user(__isl_take isl_printer *p, + __isl_take isl_ast_print_options *print_options, + __isl_keep isl_ast_node *node, void *user); #endif diff --git a/polly/lib/External/ppcg/gpu.c b/polly/lib/External/ppcg/gpu.c index 7bda56a..218b918 100644 --- a/polly/lib/External/ppcg/gpu.c +++ b/polly/lib/External/ppcg/gpu.c @@ -2297,7 +2297,7 @@ static isl_bool update_depth(__isl_keep isl_schedule_node *node, void *user) * The ASTs for the device code are embedded in ppcg_kernel objects * attached to the leaf nodes that call "kernel". 
*/ -static __isl_give isl_ast_node *generate_code(struct gpu_gen *gen, +__isl_give isl_ast_node *generate_code(struct gpu_gen *gen, __isl_take isl_schedule *schedule) { struct ppcg_at_domain_data data; diff --git a/polly/lib/External/ppcg/gpu.h b/polly/lib/External/ppcg/gpu.h index 7038901..7d617de 100644 --- a/polly/lib/External/ppcg/gpu.h +++ b/polly/lib/External/ppcg/gpu.h @@ -367,4 +367,6 @@ __isl_give isl_schedule *get_schedule(struct gpu_gen *gen); int has_any_permutable_node(__isl_keep isl_schedule *schedule); __isl_give isl_schedule *map_to_device(struct gpu_gen *gen, __isl_take isl_schedule *schedule); +__isl_give isl_ast_node *generate_code(struct gpu_gen *gen, + __isl_take isl_schedule *schedule); #endif diff --git a/polly/test/GPGPU/double-parallel-loop.ll b/polly/test/GPGPU/double-parallel-loop.ll index 963a411..46aab52 100644 --- a/polly/test/GPGPU/double-parallel-loop.ll +++ b/polly/test/GPGPU/double-parallel-loop.ll @@ -3,6 +3,10 @@ ; RUN: -disable-output < %s | \ ; RUN: FileCheck -check-prefix=SCHED %s +; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \ +; RUN: -disable-output < %s | \ +; RUN: FileCheck -check-prefix=CODE %s + ; REQUIRES: pollyacc ; CHECK: Stmt_bb5 @@ -44,6 +48,20 @@ ; SCHED: coincident: [ 1, 1 ] ; SCHED: - filter: "{ }" +; CODE: Code +; CODE: ==== +; CODE: # host +; CODE: { +; CODE: dim3 k0_dimBlock(16, 32); +; CODE: dim3 k0_dimGrid(32, 32); +; CODE: kernel0 <<>> (); +; CODE: } + +; CODE: # kernel0 +; CODE: for (int c3 = 0; c3 <= 1; c3 += 1) +; CODE: Stmt_bb5(32 * b0 + t0, 32 * b1 + t1 + 16 * c3); + + ; void double_parallel_loop(float A[][1024]) { ; for (long i = 0; i < 1024; i++) ; for (long j = 0; j < 1024; j++) -- 2.7.4