GPGPU: generate code for ScopStatements

author Tobias Grosser <tobias@grosser.es>

Thu, 21 Jul 2016 13:15:59 +0000 (13:15 +0000)

committer Tobias Grosser <tobias@grosser.es>

Thu, 21 Jul 2016 13:15:59 +0000 (13:15 +0000)
author Tobias Grosser <tobias@grosser.es>
Thu, 21 Jul 2016 13:15:59 +0000 (13:15 +0000)
committer Tobias Grosser <tobias@grosser.es>
Thu, 21 Jul 2016 13:15:59 +0000 (13:15 +0000)
diff --git a/polly/lib/CodeGen/PPCGCodeGeneration.cpp b/polly/lib/CodeGen/PPCGCodeGeneration.cpp

index 1621252..8b4d222 100644 (file)
--- a/polly/lib/CodeGen/PPCGCodeGeneration.cpp
+++ b/polly/lib/CodeGen/PPCGCodeGeneration.cpp
@@ -18,6 +18,7 @@
  #include "polly/LinkAllPasses.h"
  #include "polly/Options.h"
  #include "polly/ScopInfo.h"
+#include "polly/Support/SCEVValidator.h"
  #include "llvm/Analysis/AliasAnalysis.h"
  #include "llvm/Analysis/BasicAliasAnalysis.h"
  #include "llvm/Analysis/GlobalsModRef.h"
@@ -61,17 +62,37 @@ static cl::opt<bool> DumpKernelIR("polly-acc-dump-kernel-ir",
  /// This function is a callback for to generate the ast expressions for each
  /// of the scheduled ScopStmts.
  static __isl_give isl_id_to_ast_expr *pollyBuildAstExprForStmt(
-    void *Stmt, isl_ast_build *Build,
+    void *StmtT, isl_ast_build *Build,
      isl_multi_pw_aff *(*FunctionIndex)(__isl_take isl_multi_pw_aff *MPA,
                                         isl_id *Id, void *User),
      void *UserIndex,
      isl_ast_expr *(*FunctionExpr)(isl_ast_expr *Expr, isl_id *Id, void *User),
-    void *User_expr) {
+    void *UserExpr) {
  
-  // TODO: Implement the AST expression generation. For now we just return a
-  // nullptr to ensure that we do not free uninitialized pointers.
+  ScopStmt *Stmt = (ScopStmt *)StmtT;
  
-  return nullptr;
+  isl_ctx *Ctx;
+
+  if (!Stmt || !Build)
+    return NULL;
+
+  Ctx = isl_ast_build_get_ctx(Build);
+  isl_id_to_ast_expr *RefToExpr = isl_id_to_ast_expr_alloc(Ctx, 0);
+
+  for (MemoryAccess *Acc : *Stmt) {
+    isl_map *AddrFunc = Acc->getAddressFunction();
+    AddrFunc = isl_map_intersect_domain(AddrFunc, Stmt->getDomain());
+    isl_id *RefId = Acc->getId();
+    isl_pw_multi_aff *PMA = isl_pw_multi_aff_from_map(AddrFunc);
+    isl_multi_pw_aff *MPA = isl_multi_pw_aff_from_pw_multi_aff(PMA);
+    MPA = isl_multi_pw_aff_coalesce(MPA);
+    MPA = FunctionIndex(MPA, RefId, UserIndex);
+    isl_ast_expr *Access = isl_ast_build_access_from_multi_pw_aff(Build, MPA);
+    Access = FunctionExpr(Access, RefId, UserExpr);
+    RefToExpr = isl_id_to_ast_expr_set(RefToExpr, RefId, Access);
+  }
+
+  return RefToExpr;
  }
  
  /// Generate code for a GPU specific isl AST.
@@ -86,7 +107,9 @@ public:
    GPUNodeBuilder(PollyIRBuilder &Builder, ScopAnnotator &Annotator, Pass *P,
                   const DataLayout &DL, LoopInfo &LI, ScalarEvolution &SE,
                   DominatorTree &DT, Scop &S, gpu_prog *Prog)
-      : IslNodeBuilder(Builder, Annotator, P, DL, LI, SE, DT, S), Prog(Prog) {}
+      : IslNodeBuilder(Builder, Annotator, P, DL, LI, SE, DT, S), Prog(Prog) {
+    getExprBuilder().setIDToSAI(&IDToSAI);
+  }
  
  private:
    /// A module containing GPU code.
@@ -108,6 +131,8 @@ private:
    /// By releasing this set all isl_ids will be freed.
    std::set<std::unique_ptr<isl_id, IslIdDeleter>> KernelIDs;
  
+  IslExprBuilder::IDToScopArrayInfoTy IDToSAI;
+
    /// Create code for user-defined AST nodes.
    ///
    /// These AST nodes can be of type:
@@ -121,6 +146,13 @@ private:
    /// @param UserStmt The ast node to generate code for.
    virtual void createUser(__isl_take isl_ast_node *UserStmt);
  
+  /// Find llvm::Values referenced in GPU kernel.
+  ///
+  /// @param Kernel The kernel to scan for llvm::Values
+  ///
+  /// @returns A set of values referenced by the kernel.
+  SetVector<Value *> getReferencesInKernel(ppcg_kernel *Kernel);
+
    /// Create GPU kernel.
    ///
    /// Code generate the kernel described by @p KernelStmt.
@@ -135,7 +167,9 @@ private:
    /// start block of this newly created function.
    ///
    /// @param Kernel The kernel to generate code for.
-  void createKernelFunction(ppcg_kernel *Kernel);
+  /// @param SubtreeValues The set of llvm::Values referenced by this kernel.
+  void createKernelFunction(ppcg_kernel *Kernel,
+                            SetVector<Value *> &SubtreeValues);
  
    /// Create the declaration of a kernel function.
    ///
@@ -147,14 +181,23 @@ private:
    ///   - Other LLVM Value references (TODO)
    ///
    /// @param Kernel The kernel to generate the function declaration for.
+  /// @param SubtreeValues The set of llvm::Values referenced by this kernel.
+  ///
    /// @returns The newly declared function.
-  Function *createKernelFunctionDecl(ppcg_kernel *Kernel);
+  Function *createKernelFunctionDecl(ppcg_kernel *Kernel,
+                                     SetVector<Value *> &SubtreeValues);
  
    /// Insert intrinsic functions to obtain thread and block ids.
    ///
    /// @param The kernel to generate the intrinsic functions for.
    void insertKernelIntrinsics(ppcg_kernel *Kernel);
  
+  /// Create code for a ScopStmt called in @p Expr.
+  ///
+  /// @param Expr The expression containing the call.
+  /// @param KernelStmt The kernel statement referenced in the call.
+  void createScopStmt(isl_ast_expr *Expr, ppcg_kernel_stmt *KernelStmt);
+
    /// Create an in-kernel synchronization call.
    void createKernelSync();
  
@@ -201,8 +244,7 @@ void GPUNodeBuilder::createUser(__isl_take isl_ast_node *UserStmt) {
  
    switch (KernelStmt->type) {
    case ppcg_kernel_domain:
-    // TODO Create kernel user stmt
-    isl_ast_expr_free(Expr);
+    createScopStmt(Expr, KernelStmt);
      isl_ast_node_free(UserStmt);
      return;
    case ppcg_kernel_copy:
@@ -222,30 +264,143 @@ void GPUNodeBuilder::createUser(__isl_take isl_ast_node *UserStmt) {
    return;
  }
  
+void GPUNodeBuilder::createScopStmt(isl_ast_expr *Expr,
+                                    ppcg_kernel_stmt *KernelStmt) {
+  auto Stmt = (ScopStmt *)KernelStmt->u.d.stmt->stmt;
+  isl_id_to_ast_expr *Indexes = KernelStmt->u.d.ref2expr;
+
+  LoopToScevMapT LTS;
+  LTS.insert(OutsideLoopIterations.begin(), OutsideLoopIterations.end());
+
+  createSubstitutions(Expr, Stmt, LTS);
+
+  if (Stmt->isBlockStmt())
+    BlockGen.copyStmt(*Stmt, LTS, Indexes);
+  else
+    assert(0 && "Region statement not supported\n");
+}
+
  void GPUNodeBuilder::createKernelSync() {
    Module *M = Builder.GetInsertBlock()->getParent()->getParent();
    auto *Sync = Intrinsic::getDeclaration(M, Intrinsic::nvvm_barrier0);
    Builder.CreateCall(Sync, {});
  }
  
+/// Collect llvm::Values referenced from @p Node
+///
+/// This function only applies to isl_ast_nodes that are user_nodes referring
+/// to a ScopStmt. All other node types are ignored.
+///
+/// @param Node The node to collect references for.
+/// @param User A user pointer used as storage for the data that is collected.
+///
+/// @returns isl_bool_true if data could be collected successfully.
+isl_bool collectReferencesInGPUStmt(__isl_keep isl_ast_node *Node, void *User) {
+  if (isl_ast_node_get_type(Node) != isl_ast_node_user)
+    return isl_bool_true;
+
+  isl_ast_expr *Expr = isl_ast_node_user_get_expr(Node);
+  isl_ast_expr *StmtExpr = isl_ast_expr_get_op_arg(Expr, 0);
+  isl_id *Id = isl_ast_expr_get_id(StmtExpr);
+  const char *Str = isl_id_get_name(Id);
+  isl_id_free(Id);
+  isl_ast_expr_free(StmtExpr);
+  isl_ast_expr_free(Expr);
+
+  if (!isPrefix(Str, "Stmt"))
+    return isl_bool_true;
+
+  Id = isl_ast_node_get_annotation(Node);
+  auto *KernelStmt = (ppcg_kernel_stmt *)isl_id_get_user(Id);
+  auto Stmt = (ScopStmt *)KernelStmt->u.d.stmt->stmt;
+  isl_id_free(Id);
+
+  addReferencesFromStmt(Stmt, User);
+
+  return isl_bool_true;
+}
+
+SetVector<Value *> GPUNodeBuilder::getReferencesInKernel(ppcg_kernel *Kernel) {
+  SetVector<Value *> SubtreeValues;
+  SetVector<const SCEV *> SCEVs;
+  SetVector<const Loop *> Loops;
+  SubtreeReferences References = {
+      LI, SE, S, ValueMap, SubtreeValues, SCEVs, getBlockGenerator()};
+
+  for (const auto &I : IDToValue)
+    SubtreeValues.insert(I.second);
+
+  isl_ast_node_foreach_descendant_top_down(
+      Kernel->tree, collectReferencesInGPUStmt, &References);
+
+  for (const SCEV *Expr : SCEVs)
+    findValues(Expr, SE, SubtreeValues);
+
+  for (auto &SAI : S.arrays())
+    SubtreeValues.remove(SAI.second->getBasePtr());
+
+  isl_space *Space = S.getParamSpace();
+  for (long i = 0; i < isl_space_dim(Space, isl_dim_param); i++) {
+    isl_id *Id = isl_space_get_dim_id(Space, isl_dim_param, i);
+    assert(IDToValue.count(Id));
+    Value *Val = IDToValue[Id];
+    SubtreeValues.remove(Val);
+    isl_id_free(Id);
+  }
+  isl_space_free(Space);
+
+  for (long i = 0; i < isl_space_dim(Kernel->space, isl_dim_set); i++) {
+    isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_set, i);
+    assert(IDToValue.count(Id));
+    Value *Val = IDToValue[Id];
+    SubtreeValues.remove(Val);
+    isl_id_free(Id);
+  }
+
+  return SubtreeValues;
+}
+
  void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) {
    isl_id *Id = isl_ast_node_get_annotation(KernelStmt);
    ppcg_kernel *Kernel = (ppcg_kernel *)isl_id_get_user(Id);
    isl_id_free(Id);
    isl_ast_node_free(KernelStmt);
  
+  SetVector<Value *> SubtreeValues = getReferencesInKernel(Kernel);
+
    assert(Kernel->tree && "Device AST of kernel node is empty");
  
    Instruction &HostInsertPoint = *Builder.GetInsertPoint();
    IslExprBuilder::IDToValueTy HostIDs = IDToValue;
+  ValueMapT HostValueMap = ValueMap;
+
+  SetVector<const Loop *> Loops;
+
+  // For each loop we depend on, create a value that contains the current loop
+  // iteration. These values are necessary to generate code for SCEVs that
+  // depend on such loops. As a result we need to pass them to the subfunction.
+  for (const Loop *L : Loops) {
+    const SCEV *OuterLIV = SE.getAddRecExpr(SE.getUnknown(Builder.getInt64(0)),
+                                            SE.getUnknown(Builder.getInt64(1)),
+                                            L, SCEV::FlagAnyWrap);
+    Value *V = generateSCEV(OuterLIV);
+    OutsideLoopIterations[L] = SE.getUnknown(V);
+    SubtreeValues.insert(V);
+  }
  
-  createKernelFunction(Kernel);
+  createKernelFunction(Kernel, SubtreeValues);
  
    create(isl_ast_node_copy(Kernel->tree));
  
    Builder.SetInsertPoint(&HostInsertPoint);
    IDToValue = HostIDs;
  
+  ValueMap = HostValueMap;
+  ScalarMap.clear();
+  PHIOpMap.clear();
+  EscapeMap.clear();
+  IDToSAI.clear();
+
    finalizeKernelFunction();
  }
  
@@ -263,7 +418,9 @@ static std::string computeNVPTXDataLayout(bool is64Bit) {
    return Ret;
  }
  
-Function *GPUNodeBuilder::createKernelFunctionDecl(ppcg_kernel *Kernel) {
+Function *
+GPUNodeBuilder::createKernelFunctionDecl(ppcg_kernel *Kernel,
+                                         SetVector<Value *> &SubtreeValues) {
    std::vector<Type *> Args;
    std::string Identifier = "kernel_" + std::to_string(Kernel->id);
  
@@ -284,6 +441,9 @@ Function *GPUNodeBuilder::createKernelFunctionDecl(ppcg_kernel *Kernel) {
    for (long i = 0; i < NumVars; i++)
      Args.push_back(Builder.getInt64Ty());
  
+  for (auto *V : SubtreeValues)
+    Args.push_back(V->getType());
+
    auto *FT = FunctionType::get(Builder.getVoidTy(), Args, false);
    auto *FN = Function::Create(FT, Function::ExternalLinkage, Identifier,
                                GPUModule.get());
@@ -294,7 +454,27 @@ Function *GPUNodeBuilder::createKernelFunctionDecl(ppcg_kernel *Kernel) {
      if (!ppcg_kernel_requires_array_argument(Kernel, i))
        continue;
  
-    Arg->setName(Prog->array[i].name);
+    Arg->setName(Kernel->array[i].array->name);
+
+    isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set);
+    const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(isl_id_copy(Id));
+    Type *EleTy = SAI->getElementType();
+    Value *Val = &*Arg;
+    SmallVector<const SCEV *, 4> Sizes;
+    isl_ast_build *Build =
+        isl_ast_build_from_context(isl_set_copy(Prog->context));
+    for (long j = 1; j < Kernel->array[i].array->n_index; j++) {
+      isl_ast_expr *DimSize = isl_ast_build_expr_from_pw_aff(
+          Build, isl_pw_aff_copy(Kernel->array[i].array->bound[j]));
+      auto V = ExprBuilder.create(DimSize);
+      Sizes.push_back(SE.getSCEV(V));
+    }
+    const ScopArrayInfo *SAIRep =
+        S.getOrCreateScopArrayInfo(Val, EleTy, Sizes, ScopArrayInfo::MK_Array);
+
+    isl_ast_build_free(Build);
+    isl_id_free(Id);
+    IDToSAI[Id] = SAIRep;
      Arg++;
    }
  
@@ -314,6 +494,12 @@ Function *GPUNodeBuilder::createKernelFunctionDecl(ppcg_kernel *Kernel) {
      Arg++;
    }
  
+  for (auto *V : SubtreeValues) {
+    Arg->setName(V->getName());
+    ValueMap[V] = &*Arg;
+    Arg++;
+  }
+
    return FN;
  }
  
@@ -346,14 +532,15 @@ void GPUNodeBuilder::insertKernelIntrinsics(ppcg_kernel *Kernel) {
    }
  }
  
-void GPUNodeBuilder::createKernelFunction(ppcg_kernel *Kernel) {
+void GPUNodeBuilder::createKernelFunction(ppcg_kernel *Kernel,
+                                          SetVector<Value *> &SubtreeValues) {
  
    std::string Identifier = "kernel_" + std::to_string(Kernel->id);
    GPUModule.reset(new Module(Identifier, Builder.getContext()));
    GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-cuda"));
    GPUModule->setDataLayout(computeNVPTXDataLayout(true /* is64Bit */));
  
-  Function *FN = createKernelFunctionDecl(Kernel);
+  Function *FN = createKernelFunctionDecl(Kernel, SubtreeValues);
  
    BasicBlock *PrevBlock = Builder.GetInsertBlock();
    auto EntryBlock = BasicBlock::Create(Builder.getContext(), "entry", FN);
diff --git a/polly/test/GPGPU/double-parallel-loop.ll b/polly/test/GPGPU/double-parallel-loop.ll

index f9b3f1f..2eee8aa 100644 (file)
--- a/polly/test/GPGPU/double-parallel-loop.ll
+++ b/polly/test/GPGPU/double-parallel-loop.ll
@@ -105,18 +105,52 @@
  ; KERNEL-IR-NEXT:   %t1 = zext i32 %3 to i64
  ; KERNEL-IR-NEXT:   br label %polly.loop_preheader
  
-; KERNEL-IR-LABEL: polly.loop_exit:
+; KERNEL-IR-LABEL: polly.loop_exit:                                  ; preds = %polly.stmt.bb5
  ; KERNEL-IR-NEXT:   ret void
  
-; KERNEL-IR-LABEL: polly.loop_header:
-; KERNEL-IR-NEXT:   %polly.indvar = phi i64 [ 0, %polly.loop_preheader ], [ %polly.indvar_next, %polly.loop_header ]
+; KERNEL-IR-LABEL: polly.loop_header:                                ; preds = %polly.stmt.bb5, %polly.loop_preheader
+; KERNEL-IR-NEXT:   %polly.indvar = phi i64 [ 0, %polly.loop_preheader ], [ %polly.indvar_next, %polly.stmt.bb5 ]
+; KERNEL-IR-NEXT:   %4 = mul nsw i64 32, %b0
+; KERNEL-IR-NEXT:   %5 = add nsw i64 %4, %t0
+; KERNEL-IR-NEXT:   %6 = mul nsw i64 32, %b1
+; KERNEL-IR-NEXT:   %7 = add nsw i64 %6, %t1
+; KERNEL-IR-NEXT:   %8 = mul nsw i64 16, %polly.indvar
+; KERNEL-IR-NEXT:   %9 = add nsw i64 %7, %8
+; KERNEL-IR-NEXT:   br label %polly.stmt.bb5
+
+; KERNEL-IR-LABEL: polly.stmt.bb5:                                   ; preds = %polly.loop_header
+; KERNEL-IR-NEXT:   %10 = mul i64 %9, %5
+; KERNEL-IR-NEXT:   %p_tmp6 = sitofp i64 %10 to float
+; KERNEL-IR-NEXT:   %polly.access.cast.MemRef_A = bitcast i8* %MemRef_A to float*
+; KERNEL-IR-NEXT:   %11 = mul nsw i64 32, %b0
+; KERNEL-IR-NEXT:   %12 = add nsw i64 %11, %t0
+; KERNEL-IR-NEXT:   %polly.access.mul.MemRef_A = mul nsw i64 %12, 1024
+; KERNEL-IR-NEXT:   %13 = mul nsw i64 32, %b1
+; KERNEL-IR-NEXT:   %14 = add nsw i64 %13, %t1
+; KERNEL-IR-NEXT:   %15 = mul nsw i64 16, %polly.indvar
+; KERNEL-IR-NEXT:   %16 = add nsw i64 %14, %15
+; KERNEL-IR-NEXT:   %polly.access.add.MemRef_A = add nsw i64 %polly.access.mul.MemRef_A, %16
+; KERNEL-IR-NEXT:   %polly.access.MemRef_A = getelementptr float, float* %polly.access.cast.MemRef_A, i64 %polly.access.add.MemRef_A
+; KERNEL-IR-NEXT:   %tmp8_p_scalar_ = load float, float* %polly.access.MemRef_A, align 4
+; KERNEL-IR-NEXT:   %p_tmp9 = fadd float %tmp8_p_scalar_, %p_tmp6
+; KERNEL-IR-NEXT:   %polly.access.cast.MemRef_A1 = bitcast i8* %MemRef_A to float*
+; KERNEL-IR-NEXT:   %17 = mul nsw i64 32, %b0
+; KERNEL-IR-NEXT:   %18 = add nsw i64 %17, %t0
+; KERNEL-IR-NEXT:   %polly.access.mul.MemRef_A2 = mul nsw i64 %18, 1024
+; KERNEL-IR-NEXT:   %19 = mul nsw i64 32, %b1
+; KERNEL-IR-NEXT:   %20 = add nsw i64 %19, %t1
+; KERNEL-IR-NEXT:   %21 = mul nsw i64 16, %polly.indvar
+; KERNEL-IR-NEXT:   %22 = add nsw i64 %20, %21
+; KERNEL-IR-NEXT:   %polly.access.add.MemRef_A3 = add nsw i64 %polly.access.mul.MemRef_A2, %22
+; KERNEL-IR-NEXT:   %polly.access.MemRef_A4 = getelementptr float, float* %polly.access.cast.MemRef_A1, i64 %polly.access.add.MemRef_A3
+; KERNEL-IR-NEXT:   store float %p_tmp9, float* %polly.access.MemRef_A4, align 4
  ; KERNEL-IR-NEXT:   %polly.indvar_next = add nsw i64 %polly.indvar, 1
  ; KERNEL-IR-NEXT:   %polly.loop_cond = icmp sle i64 %polly.indvar, 0
  ; KERNEL-IR-NEXT:   br i1 %polly.loop_cond, label %polly.loop_header, label %polly.loop_exit
  
-; KERNEL-IR-LABEL: polly.loop_preheader:
+; KERNEL-IR-LABEL: polly.loop_preheader:                             ; preds = %entry
  ; KERNEL-IR-NEXT:   br label %polly.loop_header
-; KERNEL-IR-NEXT: }
+
  
  ;    void double_parallel_loop(float A[][1024]) {
  ;      for (long i = 0; i < 1024; i++)
diff --git a/polly/test/GPGPU/host-control-flow.ll b/polly/test/GPGPU/host-control-flow.ll

index d3ec4bb..911d5cb 100644 (file)
--- a/polly/test/GPGPU/host-control-flow.ll
+++ b/polly/test/GPGPU/host-control-flow.ll
@@ -34,27 +34,82 @@
  ; IR-NEXT:   %polly.loop_cond = icmp sle i64 %polly.indvar, 98
  ; IR-NEXT:   br i1 %polly.loop_cond, label %polly.loop_header, label %polly.loop_exit
  
-; KERNEL-IR-LABEL: define ptx_kernel void @kernel_0(i8* %MemRef_A, i64 %c0) {
-; KERNEL-IR-NEXT: entry:
+; KERNEL-IR: define ptx_kernel void @kernel_0(i8* %MemRef_A, i64 %c0) {
+; KERNEL-IR-LABEL: entry:
  ; KERNEL-IR-NEXT:   %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
  ; KERNEL-IR-NEXT:   %b0 = zext i32 %0 to i64
  ; KERNEL-IR-NEXT:   %1 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
  ; KERNEL-IR-NEXT:   %t0 = zext i32 %1 to i64
  ; KERNEL-IR-NEXT:   br label %polly.cond
  
-; KERNEL-IR-LABEL: polly.cond:
+; KERNEL-IR-LABEL: polly.cond:                                       ; preds = %entry
  ; KERNEL-IR-NEXT:   %2 = mul nsw i64 32, %b0
  ; KERNEL-IR-NEXT:   %3 = add nsw i64 %2, %t0
  ; KERNEL-IR-NEXT:   %4 = icmp sle i64 %3, 97
  ; KERNEL-IR-NEXT:   br i1 %4, label %polly.then, label %polly.else
  
-; KERNEL-IR-LABEL: polly.merge:
+; KERNEL-IR-LABEL: polly.merge:                                      ; preds = %polly.else, %polly.stmt.for.body3
  ; KERNEL-IR-NEXT:   ret void
  
-; KERNEL-IR-LABEL: polly.then:
+; KERNEL-IR-LABEL: polly.then:                                       ; preds = %polly.cond
+; KERNEL-IR-NEXT:   %5 = mul nsw i64 32, %b0
+; KERNEL-IR-NEXT:   %6 = add nsw i64 %5, %t0
+; KERNEL-IR-NEXT:   br label %polly.stmt.for.body3
+
+; KERNEL-IR-LABEL: polly.stmt.for.body3:                             ; preds = %polly.then
+; KERNEL-IR-NEXT:   %polly.access.cast.MemRef_A = bitcast i8* %MemRef_A to float*
+; KERNEL-IR-NEXT:   %pexp.pdiv_r = urem i64 %c0, 2
+; KERNEL-IR-NEXT:   %polly.access.mul.MemRef_A = mul nsw i64 %pexp.pdiv_r, 100
+; KERNEL-IR-NEXT:   %7 = mul nsw i64 32, %b0
+; KERNEL-IR-NEXT:   %8 = add nsw i64 %7, %t0
+; KERNEL-IR-NEXT:   %polly.access.add.MemRef_A = add nsw i64 %polly.access.mul.MemRef_A, %8
+; KERNEL-IR-NEXT:   %polly.access.MemRef_A = getelementptr float, float* %polly.access.cast.MemRef_A, i64 %polly.access.add.MemRef_A
+; KERNEL-IR-NEXT:   %tmp_p_scalar_ = load float, float* %polly.access.MemRef_A, align 4
+; KERNEL-IR-NEXT:   %9 = add i64 %6, 1
+; KERNEL-IR-NEXT:   %polly.access.cast.MemRef_A1 = bitcast i8* %MemRef_A to float*
+; KERNEL-IR-NEXT:   %pexp.pdiv_r2 = urem i64 %c0, 2
+; KERNEL-IR-NEXT:   %polly.access.mul.MemRef_A3 = mul nsw i64 %pexp.pdiv_r2, 100
+; KERNEL-IR-NEXT:   %10 = mul nsw i64 32, %b0
+; KERNEL-IR-NEXT:   %11 = add nsw i64 %10, %t0
+; KERNEL-IR-NEXT:   %12 = add nsw i64 %11, 1
+; KERNEL-IR-NEXT:   %polly.access.add.MemRef_A4 = add nsw i64 %polly.access.mul.MemRef_A3, %12
+; KERNEL-IR-NEXT:   %polly.access.MemRef_A5 = getelementptr float, float* %polly.access.cast.MemRef_A1, i64 %polly.access.add.MemRef_A4
+; KERNEL-IR-NEXT:   %tmp2_p_scalar_ = load float, float* %polly.access.MemRef_A5, align 4
+; KERNEL-IR-NEXT:   %p_add = fadd float %tmp_p_scalar_, %tmp2_p_scalar_
+; KERNEL-IR-NEXT:   %polly.access.cast.MemRef_A6 = bitcast i8* %MemRef_A to float*
+; KERNEL-IR-NEXT:   %pexp.pdiv_r7 = urem i64 %c0, 2
+; KERNEL-IR-NEXT:   %polly.access.mul.MemRef_A8 = mul nsw i64 %pexp.pdiv_r7, 100
+; KERNEL-IR-NEXT:   %13 = mul nsw i64 32, %b0
+; KERNEL-IR-NEXT:   %14 = add nsw i64 %13, %t0
+; KERNEL-IR-NEXT:   %15 = add nsw i64 %14, 2
+; KERNEL-IR-NEXT:   %polly.access.add.MemRef_A9 = add nsw i64 %polly.access.mul.MemRef_A8, %15
+; KERNEL-IR-NEXT:   %polly.access.MemRef_A10 = getelementptr float, float* %polly.access.cast.MemRef_A6, i64 %polly.access.add.MemRef_A9
+; KERNEL-IR-NEXT:   %tmp3_p_scalar_ = load float, float* %polly.access.MemRef_A10, align 4
+; KERNEL-IR-NEXT:   %p_add12 = fadd float %p_add, %tmp3_p_scalar_
+; KERNEL-IR-NEXT:   %polly.access.cast.MemRef_A11 = bitcast i8* %MemRef_A to float*
+; KERNEL-IR-NEXT:   %16 = add nsw i64 %c0, 1
+; KERNEL-IR-NEXT:   %pexp.pdiv_r12 = urem i64 %16, 2
+; KERNEL-IR-NEXT:   %polly.access.mul.MemRef_A13 = mul nsw i64 %pexp.pdiv_r12, 100
+; KERNEL-IR-NEXT:   %17 = mul nsw i64 32, %b0
+; KERNEL-IR-NEXT:   %18 = add nsw i64 %17, %t0
+; KERNEL-IR-NEXT:   %19 = add nsw i64 %18, 1
+; KERNEL-IR-NEXT:   %polly.access.add.MemRef_A14 = add nsw i64 %polly.access.mul.MemRef_A13, %19
+; KERNEL-IR-NEXT:   %polly.access.MemRef_A15 = getelementptr float, float* %polly.access.cast.MemRef_A11, i64 %polly.access.add.MemRef_A14
+; KERNEL-IR-NEXT:   %tmp4_p_scalar_ = load float, float* %polly.access.MemRef_A15, align 4
+; KERNEL-IR-NEXT:   %p_add17 = fadd float %tmp4_p_scalar_, %p_add12
+; KERNEL-IR-NEXT:   %polly.access.cast.MemRef_A16 = bitcast i8* %MemRef_A to float*
+; KERNEL-IR-NEXT:   %20 = add nsw i64 %c0, 1
+; KERNEL-IR-NEXT:   %pexp.pdiv_r17 = urem i64 %20, 2
+; KERNEL-IR-NEXT:   %polly.access.mul.MemRef_A18 = mul nsw i64 %pexp.pdiv_r17, 100
+; KERNEL-IR-NEXT:   %21 = mul nsw i64 32, %b0
+; KERNEL-IR-NEXT:   %22 = add nsw i64 %21, %t0
+; KERNEL-IR-NEXT:   %23 = add nsw i64 %22, 1
+; KERNEL-IR-NEXT:   %polly.access.add.MemRef_A19 = add nsw i64 %polly.access.mul.MemRef_A18, %23
+; KERNEL-IR-NEXT:   %polly.access.MemRef_A20 = getelementptr float, float* %polly.access.cast.MemRef_A16, i64 %polly.access.add.MemRef_A19
+; KERNEL-IR-NEXT:   store float %p_add17, float* %polly.access.MemRef_A20, align 4
  ; KERNEL-IR-NEXT:   br label %polly.merge
  
-; KERNEL-IR-LABEL: polly.else:
+; KERNEL-IR-LABEL: polly.else:                                       ; preds = %polly.cond
  ; KERNEL-IR-NEXT:   br label %polly.merge
  ; KERNEL-IR-NEXT: }
  
diff --git a/polly/test/GPGPU/kernel-params-only-some-arrays.ll b/polly/test/GPGPU/kernel-params-only-some-arrays.ll

index b7eafb7..0d36c54 100644 (file)
--- a/polly/test/GPGPU/kernel-params-only-some-arrays.ll
+++ b/polly/test/GPGPU/kernel-params-only-some-arrays.ll
@@ -23,7 +23,8 @@
  ; KERNEL-NEXT:     %b0 = zext i32 %0 to i64
  ; KERNEL-NEXT:     %1 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
  ; KERNEL-NEXT:     %t0 = zext i32 %1 to i64
-; KERNEL-NEXT:     ret void
+
+; KERNEL:     ret void
  ; KERNEL-NEXT: }
  
  ; KERNEL: ; ModuleID = 'kernel_1'
@@ -37,7 +38,8 @@
  ; KERNEL-NEXT:     %b0 = zext i32 %0 to i64
  ; KERNEL-NEXT:     %1 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
  ; KERNEL-NEXT:     %t0 = zext i32 %1 to i64
-; KERNEL-NEXT:     ret void
+
+; KERNEL:     ret void
  ; KERNEL-NEXT: }
  
  target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
author	Tobias Grosser <tobias@grosser.es>
	Thu, 21 Jul 2016 13:15:59 +0000 (13:15 +0000)
committer	Tobias Grosser <tobias@grosser.es>
	Thu, 21 Jul 2016 13:15:59 +0000 (13:15 +0000)
polly/lib/CodeGen/PPCGCodeGeneration.cpp		patch \| blob \| history
polly/test/GPGPU/double-parallel-loop.ll		patch \| blob \| history
polly/test/GPGPU/host-control-flow.ll		patch \| blob \| history
polly/test/GPGPU/kernel-params-only-some-arrays.ll		patch \| blob \| history