cl::desc("Dump C code describing the GPU mapping"), cl::Hidden,
cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));
+static cl::opt<bool> DumpKernelIR("polly-acc-dump-kernel-ir",
+ cl::desc("Dump the kernel LLVM-IR"),
+ cl::Hidden, cl::init(false), cl::ZeroOrMore,
+ cl::cat(PollyCategory));
+
/// Create the ast expressions for a ScopStmt.
///
/// This function is a callback for to generate the ast expressions for each
public:
GPUNodeBuilder(PollyIRBuilder &Builder, ScopAnnotator &Annotator, Pass *P,
const DataLayout &DL, LoopInfo &LI, ScalarEvolution &SE,
- DominatorTree &DT, Scop &S)
- : IslNodeBuilder(Builder, Annotator, P, DL, LI, SE, DT, S) {}
+ DominatorTree &DT, Scop &S, gpu_prog *Prog)
+ : IslNodeBuilder(Builder, Annotator, P, DL, LI, SE, DT, S), Prog(Prog) {}
private:
+ /// A module containing GPU code.
+ ///
+ /// This pointer is only set in case we are currently generating GPU code.
+ std::unique_ptr<Module> GPUModule;
+
+ /// The GPU program we generate code for.
+ gpu_prog *Prog;
+
/// Create code for user-defined AST nodes.
///
/// These AST nodes can be of type:
///
/// @param UserStmt The ast node to generate code for.
virtual void createUser(__isl_take isl_ast_node *UserStmt);
+
+ /// Create GPU kernel.
+ ///
+ /// Code generate the kernel described by @p KernelStmt.
+ ///
+ /// @param KernelStmt The ast node to generate kernel code for.
+ void createKernel(__isl_take isl_ast_node *KernelStmt);
+
+ /// Create kernel function.
+ ///
+ /// Create a kernel function located in a newly created module that can serve
+ /// as target for device code generation. Set the Builder to point to the
+ /// start block of this newly created function.
+ ///
+ /// @param Kernel The kernel to generate code for.
+ void createKernelFunction(ppcg_kernel *Kernel);
+
+ /// Create the declaration of a kernel function.
+ ///
+ /// The kernel function takes as arguments:
+ ///
+ /// - One i8 pointer for each external array reference used in the kernel.
+ /// - Host iterators (TODO)
+ /// - Parameters (TODO)
+ /// - Other LLVM Value references (TODO)
+ ///
+ /// @param Kernel The kernel to generate the function declaration for.
+ /// @returns The newly declared function.
+ Function *createKernelFunctionDecl(ppcg_kernel *Kernel);
+
+ /// Finalize the generation of the kernel function.
+ ///
+ /// If requested through -polly-acc-dump-kernel-ir, print the LLVM-IR module of
+ /// the kernel to stdout. Afterwards GPUModule no longer refers to a module.
+ void finalizeKernelFunction();
};
void GPUNodeBuilder::createUser(__isl_take isl_ast_node *UserStmt) {
+  // A user node wraps a call-like expression whose first operand is an
+  // identifier naming the kind of statement to generate.
+  isl_ast_expr *Expr = isl_ast_node_user_get_expr(UserStmt);
+  isl_ast_expr *StmtExpr = isl_ast_expr_get_op_arg(Expr, 0);
+  isl_id *Id = isl_ast_expr_get_id(StmtExpr);
+
+  // Inspect the name while we still hold our own reference to Id. The string
+  // returned by isl_id_get_name() belongs to the id, so it must not be read
+  // after the id has been released. Unnamed ids yield a null name, which must
+  // not be handed to strcmp().
+  const char *Str = isl_id_get_name(Id);
+  const bool IsKernel = Str && !strcmp(Str, "kernel");
+
+  // Nothing below needs the temporary isl objects anymore.
+  isl_id_free(Id);
+  isl_ast_expr_free(StmtExpr);
+  isl_ast_expr_free(Expr);
+
+  if (IsKernel) {
+    // createKernel takes ownership of UserStmt.
+    createKernel(UserStmt);
+    return;
+  }
+
isl_ast_node_free(UserStmt);
return;
}
+void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) {
+  // The kernel description is attached to the AST node as the user pointer of
+  // its annotation. Fetch it, then drop the annotation and the node itself.
+  isl_id *Annotation = isl_ast_node_get_annotation(KernelStmt);
+  auto *Kernel = static_cast<ppcg_kernel *>(isl_id_get_user(Annotation));
+  isl_id_free(Annotation);
+  isl_ast_node_free(KernelStmt);
+
+  assert(Kernel->tree && "Device AST of kernel node is empty");
+
+  // Remember the current (host) position of the builder, as creating the
+  // kernel function moves the builder into the kernel module.
+  Instruction &HostIP = *Builder.GetInsertPoint();
+
+  createKernelFunction(Kernel);
+
+  // Continue host code generation where we left off.
+  Builder.SetInsertPoint(&HostIP);
+
+  finalizeKernelFunction();
+}
+
+/// Build the DataLayout string used for modules targeting the NVPTX backend.
+///
+/// The string always starts with "e" and ends with the integer/vector/native
+/// width specification; a 32 bit pointer specification is inserted in between
+/// when the target is not a 64 bit architecture.
+///
+/// @param is64Bit Are we looking for a 64 bit architecture?
+/// @returns The DataLayout string.
+static std::string computeNVPTXDataLayout(bool is64Bit) {
+  const char *PointerSpec = is64Bit ? "" : "-p:32:32";
+
+  std::string Layout("e");
+  Layout.append(PointerSpec);
+  Layout.append("-i64:64-v16:16-v32:32-n16:32:64");
+  return Layout;
+}
+
+Function *GPUNodeBuilder::createKernelFunctionDecl(ppcg_kernel *Kernel) {
+  std::string Identifier = "kernel_" + std::to_string(Kernel->id);
+
+  // Determine once which arrays of the program are passed to the kernel. The
+  // indices refer to Prog->array, hence Prog->n_array is the bound to use.
+  // Deriving both the parameter types and the parameter names from this single
+  // list guarantees that they can never get out of sync.
+  std::vector<long> ArgArrays;
+  for (long i = 0; i < Prog->n_array; i++)
+    if (ppcg_kernel_requires_array_argument(Kernel, i))
+      ArgArrays.push_back(i);
+
+  // Each array is passed as an i8 pointer.
+  std::vector<Type *> Args(ArgArrays.size(), Builder.getInt8PtrTy());
+
+  auto *FT = FunctionType::get(Builder.getVoidTy(), Args, false);
+  auto *FN = Function::Create(FT, Function::ExternalLinkage, Identifier,
+                              GPUModule.get());
+  FN->setCallingConv(CallingConv::PTX_Kernel);
+
+  // Name each parameter after the array it refers to.
+  auto Arg = FN->arg_begin();
+  for (long ArrayIdx : ArgArrays) {
+    Arg->setName(Prog->array[ArrayIdx].name);
+    ++Arg;
+  }
+
+  return FN;
+}
+
+void GPUNodeBuilder::createKernelFunction(ppcg_kernel *Kernel) {
+  // Each kernel is placed in a fresh module that is named after the kernel
+  // and set up for the 64 bit NVPTX target.
+  const std::string ModuleName = "kernel_" + std::to_string(Kernel->id);
+  auto &Context = Builder.getContext();
+
+  GPUModule.reset(new Module(ModuleName, Context));
+  GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-cuda"));
+  GPUModule->setDataLayout(computeNVPTXDataLayout(true /* is64Bit */));
+
+  Function *KernelFN = createKernelFunctionDecl(Kernel);
+  BasicBlock *Entry = BasicBlock::Create(Context, "entry", KernelFN);
+
+  // Terminate the entry block with a return, then move the builder to the
+  // beginning of the block so that later code is emitted before the return.
+  Builder.SetInsertPoint(Entry);
+  Builder.CreateRetVoid();
+  Builder.SetInsertPoint(Entry, Entry->begin());
+}
+
+void GPUNodeBuilder::finalizeKernelFunction() {
+  // Print the kernel module to stdout if -polly-acc-dump-kernel-ir was given.
+  if (DumpKernelIR)
+    outs() << *GPUModule << "\n";
+
+  // Destroy the kernel module. Note that unique_ptr::release() would merely
+  // give up ownership without deleting the module, leaking one module per
+  // generated kernel; reset() actually frees it.
+  GPUModule.reset();
+}
+
namespace {
class PPCGCodeGeneration : public ScopPass {
public:
/// Generate code for a given GPU AST described by @p Root.
///
- /// @param An isl_ast_node pointing to the root of the GPU AST.
- void generateCode(__isl_take isl_ast_node *Root) {
+ /// @param Root An isl_ast_node pointing to the root of the GPU AST.
+ /// @param Prog The GPU Program to generate code for.
+ void generateCode(__isl_take isl_ast_node *Root, gpu_prog *Prog) {
ScopAnnotator Annotator;
Annotator.buildAliasScopes(*S);
PollyIRBuilder Builder = createPollyIRBuilder(EnteringBB, Annotator);
- GPUNodeBuilder NodeBuilder(Builder, Annotator, this, *DL, *LI, *SE, *DT,
- *S);
+ GPUNodeBuilder NodeBuilder(Builder, Annotator, this, *DL, *LI, *SE, *DT, *S,
+ Prog);
// Only build the run-time condition and parameters _after_ having
// introduced the conditional branch. This is important as the conditional
auto PPCGGen = generateGPU(PPCGScop, PPCGProg);
if (PPCGGen->tree)
- generateCode(isl_ast_node_copy(PPCGGen->tree));
+ generateCode(isl_ast_node_copy(PPCGGen->tree), PPCGProg);
freeOptions(PPCGScop);
freePPCGGen(PPCGGen);
--- /dev/null
+; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \
+; RUN: -disable-output < %s | \
+; RUN: FileCheck -check-prefix=KERNEL %s
+;
+; void kernel_params_only_some_arrays(float A[], float B[]) {
+; for (long i = 0; i < 32; i++)
+; A[i] += 42;
+;
+; for (long i = 0; i < 32; i++)
+; B[i] += 42;
+; }
+
+; KERNEL: ; ModuleID = 'kernel_0'
+; KERNEL-NEXT: source_filename = "kernel_0"
+; KERNEL-NEXT: target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
+; KERNEL-NEXT: target triple = "nvptx64-nvidia-cuda"
+
+; KERNEL: define ptx_kernel void @kernel_0(i8* %MemRef_A) {
+; KERNEL-NEXT: entry:
+; KERNEL-NEXT: ret void
+; KERNEL-NEXT: }
+
+; KERNEL: ; ModuleID = 'kernel_1'
+; KERNEL-NEXT: source_filename = "kernel_1"
+; KERNEL-NEXT: target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
+; KERNEL-NEXT: target triple = "nvptx64-nvidia-cuda"
+
+; KERNEL: define ptx_kernel void @kernel_1(i8* %MemRef_B) {
+; KERNEL-NEXT: entry:
+; KERNEL-NEXT: ret void
+; KERNEL-NEXT: }
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @kernel_params_only_some_arrays(float* %A, float* %B) {
+entry:
+ br label %for.cond
+
+for.cond: ; preds = %for.inc, %entry
+ %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.inc ]
+ %exitcond1 = icmp ne i64 %i.0, 32
+ br i1 %exitcond1, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %arrayidx = getelementptr inbounds float, float* %A, i64 %i.0
+ %tmp = load float, float* %arrayidx, align 4
+ %add = fadd float %tmp, 4.200000e+01
+ store float %add, float* %arrayidx, align 4
+ br label %for.inc
+
+for.inc: ; preds = %for.body
+ %inc = add nuw nsw i64 %i.0, 1
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ br label %for.cond2
+
+for.cond2: ; preds = %for.inc7, %for.end
+ %i1.0 = phi i64 [ 0, %for.end ], [ %inc8, %for.inc7 ]
+ %exitcond = icmp ne i64 %i1.0, 32
+ br i1 %exitcond, label %for.body4, label %for.end9
+
+for.body4: ; preds = %for.cond2
+ %arrayidx5 = getelementptr inbounds float, float* %B, i64 %i1.0
+ %tmp2 = load float, float* %arrayidx5, align 4
+ %add6 = fadd float %tmp2, 4.200000e+01
+ store float %add6, float* %arrayidx5, align 4
+ br label %for.inc7
+
+for.inc7: ; preds = %for.body4
+ %inc8 = add nuw nsw i64 %i1.0, 1
+ br label %for.cond2
+
+for.end9: ; preds = %for.cond2
+ ret void
+}