From 32837fe31311dff7e7d7f6960e142b7734eac60c Mon Sep 17 00:00:00 2001
From: Tobias Grosser
Date: Tue, 19 Jul 2016 07:32:38 +0000
Subject: [PATCH] GPGPU: create kernel function skeleton

Create for each kernel a separate LLVM-IR module containing a single function
marked as kernel function and taking one pointer for each array referenced by
this kernel. Add debugging output to verify the kernels are generated
correctly.

llvm-svn: 275952
---
 polly/lib/CodeGen/PPCGCodeGeneration.cpp           | 160 ++++++++++++++++++++-
 polly/test/GPGPU/kernel-params-only-some-arrays.ll |  76 ++++++++++
 2 files changed, 229 insertions(+), 7 deletions(-)
 create mode 100644 polly/test/GPGPU/kernel-params-only-some-arrays.ll

diff --git a/polly/lib/CodeGen/PPCGCodeGeneration.cpp b/polly/lib/CodeGen/PPCGCodeGeneration.cpp
index 33cf8c3..fbf9e25 100644
--- a/polly/lib/CodeGen/PPCGCodeGeneration.cpp
+++ b/polly/lib/CodeGen/PPCGCodeGeneration.cpp
@@ -51,6 +51,11 @@ static cl::opt<bool>
              cl::desc("Dump C code describing the GPU mapping"), cl::Hidden,
              cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));
 
+static cl::opt<bool> DumpKernelIR("polly-acc-dump-kernel-ir",
+                                  cl::desc("Dump the kernel LLVM-IR"),
+                                  cl::Hidden, cl::init(false), cl::ZeroOrMore,
+                                  cl::cat(PollyCategory));
+
 /// Create the ast expressions for a ScopStmt.
 ///
 /// This function is a callback for to generate the ast expressions for each
@@ -80,10 +85,18 @@ class GPUNodeBuilder : public IslNodeBuilder {
 public:
   GPUNodeBuilder(PollyIRBuilder &Builder, ScopAnnotator &Annotator, Pass *P,
                  const DataLayout &DL, LoopInfo &LI, ScalarEvolution &SE,
-                 DominatorTree &DT, Scop &S)
-      : IslNodeBuilder(Builder, Annotator, P, DL, LI, SE, DT, S) {}
+                 DominatorTree &DT, Scop &S, gpu_prog *Prog)
+      : IslNodeBuilder(Builder, Annotator, P, DL, LI, SE, DT, S), Prog(Prog) {}
 
 private:
+  /// A module containing GPU code.
+  ///
+  /// This pointer is only set in case we are currently generating GPU code.
+  std::unique_ptr<Module> GPUModule;
+
+  /// The GPU program we generate code for.
+  gpu_prog *Prog;
+
   /// Create code for user-defined AST nodes.
   ///
   /// These AST nodes can be of type:
@@ -94,13 +107,145 @@ private:
   ///
   /// @param UserStmt The ast node to generate code for.
   virtual void createUser(__isl_take isl_ast_node *UserStmt);
+
+  /// Create GPU kernel.
+  ///
+  /// Code generate the kernel described by @p KernelStmt.
+  ///
+  /// @param KernelStmt The ast node to generate kernel code for.
+  void createKernel(__isl_take isl_ast_node *KernelStmt);
+
+  /// Create kernel function.
+  ///
+  /// Create a kernel function located in a newly created module that can serve
+  /// as target for device code generation. Set the Builder to point to the
+  /// start block of this newly created function.
+  ///
+  /// @param Kernel The kernel to generate code for.
+  void createKernelFunction(ppcg_kernel *Kernel);
+
+  /// Create the declaration of a kernel function.
+  ///
+  /// The kernel function takes as arguments:
+  ///
+  ///   - One i8 pointer for each external array reference used in the kernel.
+  ///   - Host iterators (TODO)
+  ///   - Parameters (TODO)
+  ///   - Other LLVM Value references (TODO)
+  ///
+  /// @param Kernel The kernel to generate the function declaration for.
+  /// @returns The newly declared function.
+  Function *createKernelFunctionDecl(ppcg_kernel *Kernel);
+
+  /// Finalize the generation of the kernel function.
+  ///
+  /// Free the LLVM-IR module corresponding to the kernel and -- if requested --
+  /// dump its IR to stdout.
+  void finalizeKernelFunction();
 };
 
 void GPUNodeBuilder::createUser(__isl_take isl_ast_node *UserStmt) {
+  isl_ast_expr *Expr = isl_ast_node_user_get_expr(UserStmt);
+  isl_ast_expr *StmtExpr = isl_ast_expr_get_op_arg(Expr, 0);
+  isl_id *Id = isl_ast_expr_get_id(StmtExpr);
+  // The name belongs to the id: read it before dropping our reference.
+  const bool IsKernel = !strcmp(isl_id_get_name(Id), "kernel");
+  isl_id_free(Id);
+  isl_ast_expr_free(StmtExpr);
+  isl_ast_expr_free(Expr);
+
+  if (IsKernel) {
+    createKernel(UserStmt);
+    return;
+  }
+
   isl_ast_node_free(UserStmt);
   return;
 }
 
+void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) {
+  isl_id *Id = isl_ast_node_get_annotation(KernelStmt);
+  auto *Kernel = static_cast<ppcg_kernel *>(isl_id_get_user(Id));
+  isl_id_free(Id);
+  isl_ast_node_free(KernelStmt);
+
+  assert(Kernel->tree && "Device AST of kernel node is empty");
+
+  // Remember the host position: createKernelFunction() moves the Builder into
+  // the kernel function, and host code generation must resume from here.
+  Instruction &HostInsertPoint = *Builder.GetInsertPoint();
+  createKernelFunction(Kernel);
+  Builder.SetInsertPoint(&HostInsertPoint);
+
+  finalizeKernelFunction();
+}
+
+/// Compute the DataLayout string for the NVPTX backend.
+///
+/// @param is64Bit Are we looking for a 64 bit architecture?
+static std::string computeNVPTXDataLayout(bool is64Bit) {
+  std::string Ret = "e";
+
+  if (!is64Bit)
+    Ret += "-p:32:32";
+
+  Ret += "-i64:64-v16:16-v32:32-n16:32:64";
+
+  return Ret;
+}
+
+Function *GPUNodeBuilder::createKernelFunctionDecl(ppcg_kernel *Kernel) {
+  std::vector<Type *> Args;
+  std::string Identifier = "kernel_" + std::to_string(Kernel->id);
+
+  // One i8* parameter for every program array this kernel needs as argument.
+  for (long i = 0; i < Prog->n_array; i++) {
+    if (!ppcg_kernel_requires_array_argument(Kernel, i))
+      continue;
+    Args.push_back(Builder.getInt8PtrTy());
+  }
+
+  auto *FT = FunctionType::get(Builder.getVoidTy(), Args, false);
+  auto *FN = Function::Create(FT, Function::ExternalLinkage, Identifier,
+                              GPUModule.get());
+  FN->setCallingConv(CallingConv::PTX_Kernel);
+
+  // Name each argument after its array, filtering exactly as the loop above.
+  auto Arg = FN->arg_begin();
+  for (long i = 0; i < Prog->n_array; i++) {
+    if (!ppcg_kernel_requires_array_argument(Kernel, i))
+      continue;
+    Arg->setName(Prog->array[i].name);
+    ++Arg;
+  }
+
+  return FN;
+}
+
+void GPUNodeBuilder::createKernelFunction(ppcg_kernel *Kernel) {
+  // Each kernel lives in a fresh module that carries the kernel's name.
+  std::string Identifier = "kernel_" + std::to_string(Kernel->id);
+  GPUModule.reset(new Module(Identifier, Builder.getContext()));
+  GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-cuda"));
+  GPUModule->setDataLayout(computeNVPTXDataLayout(true /* is64Bit */));
+
+  Function *FN = createKernelFunctionDecl(Kernel);
+
+  auto EntryBlock = BasicBlock::Create(Builder.getContext(), "entry", FN);
+  // Emit the terminator first, then park the Builder in front of it.
+  Builder.SetInsertPoint(EntryBlock);
+  Builder.CreateRetVoid();
+  Builder.SetInsertPoint(EntryBlock, EntryBlock->begin());
+}
+
+void GPUNodeBuilder::finalizeKernelFunction() {
+  // outs() is LLVM's stdout stream, so the dump can be piped to other tools.
+  if (DumpKernelIR)
+    outs() << *GPUModule << "\n";
+  // Free the kernel module: reset() deletes it, release() would only leak it.
+  GPUModule.reset();
+}
+
 namespace {
 class PPCGCodeGeneration : public ScopPass {
 public:
@@ -693,8 +838,9 @@ public:
 
   /// Generate code for a given GPU AST described by @p Root.
   ///
-  /// @param An isl_ast_node pointing to the root of the GPU AST.
- void generateCode(__isl_take isl_ast_node *Root) { + /// @param Root An isl_ast_node pointing to the root of the GPU AST. + /// @param Prog The GPU Program to generate code for. + void generateCode(__isl_take isl_ast_node *Root, gpu_prog *Prog) { ScopAnnotator Annotator; Annotator.buildAliasScopes(*S); @@ -706,8 +852,8 @@ public: PollyIRBuilder Builder = createPollyIRBuilder(EnteringBB, Annotator); - GPUNodeBuilder NodeBuilder(Builder, Annotator, this, *DL, *LI, *SE, *DT, - *S); + GPUNodeBuilder NodeBuilder(Builder, Annotator, this, *DL, *LI, *SE, *DT, *S, + Prog); // Only build the run-time condition and parameters _after_ having // introduced the conditional branch. This is important as the conditional @@ -741,7 +887,7 @@ public: auto PPCGGen = generateGPU(PPCGScop, PPCGProg); if (PPCGGen->tree) - generateCode(isl_ast_node_copy(PPCGGen->tree)); + generateCode(isl_ast_node_copy(PPCGGen->tree), PPCGProg); freeOptions(PPCGScop); freePPCGGen(PPCGGen); diff --git a/polly/test/GPGPU/kernel-params-only-some-arrays.ll b/polly/test/GPGPU/kernel-params-only-some-arrays.ll new file mode 100644 index 0000000..3bd3f96 --- /dev/null +++ b/polly/test/GPGPU/kernel-params-only-some-arrays.ll @@ -0,0 +1,76 @@ +; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \ +; RUN: -disable-output < %s | \ +; RUN: FileCheck -check-prefix=KERNEL %s +; +; void kernel_params_only_some_arrays(float A[], float B[]) { +; for (long i = 0; i < 32; i++) +; A[i] += 42; +; +; for (long i = 0; i < 32; i++) +; B[i] += 42; +; } + +; KERNEL: ; ModuleID = 'kernel_0' +; KERNEL-NEXT: source_filename = "kernel_0" +; KERNEL-NEXT: target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64" +; KERNEL-NEXT: target triple = "nvptx64-nvidia-cuda" + +; KERNEL: define ptx_kernel void @kernel_0(i8* %MemRef_A) { +; KERNEL-NEXT: entry: +; KERNEL-NEXT: ret void +; KERNEL-NEXT: } + +; KERNEL: ; ModuleID = 'kernel_1' +; KERNEL-NEXT: source_filename = "kernel_1" +; KERNEL-NEXT: target datalayout = 
"e-i64:64-v16:16-v32:32-n16:32:64" +; KERNEL-NEXT: target triple = "nvptx64-nvidia-cuda" + +; KERNEL: define ptx_kernel void @kernel_1(i8* %MemRef_B) { +; KERNEL-NEXT: entry: +; KERNEL-NEXT: ret void +; KERNEL-NEXT: } + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @kernel_params_only_some_arrays(float* %A, float* %B) { +entry: + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.inc ] + %exitcond1 = icmp ne i64 %i.0, 32 + br i1 %exitcond1, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %arrayidx = getelementptr inbounds float, float* %A, i64 %i.0 + %tmp = load float, float* %arrayidx, align 4 + %add = fadd float %tmp, 4.200000e+01 + store float %add, float* %arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %inc = add nuw nsw i64 %i.0, 1 + br label %for.cond + +for.end: ; preds = %for.cond + br label %for.cond2 + +for.cond2: ; preds = %for.inc7, %for.end + %i1.0 = phi i64 [ 0, %for.end ], [ %inc8, %for.inc7 ] + %exitcond = icmp ne i64 %i1.0, 32 + br i1 %exitcond, label %for.body4, label %for.end9 + +for.body4: ; preds = %for.cond2 + %arrayidx5 = getelementptr inbounds float, float* %B, i64 %i1.0 + %tmp2 = load float, float* %arrayidx5, align 4 + %add6 = fadd float %tmp2, 4.200000e+01 + store float %add6, float* %arrayidx5, align 4 + br label %for.inc7 + +for.inc7: ; preds = %for.body4 + %inc8 = add nuw nsw i64 %i1.0, 1 + br label %for.cond2 + +for.end9: ; preds = %for.cond2 + ret void +} -- 2.7.4