[OpenMP][AMDGCN] Support OpenMP offloading for AMDGCN architecture - Part 3

author Saiyedul Islam <Saiyedul.Islam@amd.com>

Mon, 3 Aug 2020 05:29:48 +0000 (05:29 +0000)

committer Saiyedul Islam <Saiyedul.Islam@amd.com>

Mon, 3 Aug 2020 05:38:39 +0000 (05:38 +0000)
author Saiyedul Islam <Saiyedul.Islam@amd.com>
Mon, 3 Aug 2020 05:29:48 +0000 (05:29 +0000)
committer Saiyedul Islam <Saiyedul.Islam@amd.com>
Mon, 3 Aug 2020 05:38:39 +0000 (05:38 +0000)
diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp

index ab29e32929cebf74e3254520a923fef4d777da89..5d74d91065f56e33200ae5908b394e175142f17e 100644 (file)
--- a/clang/lib/CodeGen/CGExpr.cpp
+++ b/clang/lib/CodeGen/CGExpr.cpp
@@ -125,8 +125,13 @@ Address CodeGenFunction::CreateDefaultAlignTempAlloca(llvm::Type *Ty,
  }
  
  void CodeGenFunction::InitTempAlloca(Address Var, llvm::Value *Init) {
-  assert(isa<llvm::AllocaInst>(Var.getPointer()));
-  auto *Store = new llvm::StoreInst(Init, Var.getPointer(), /*volatile*/ false,
+  auto *Alloca = Var.getPointer();
+  assert(isa<llvm::AllocaInst>(Alloca) ||
+         (isa<llvm::AddrSpaceCastInst>(Alloca) &&
+          isa<llvm::AllocaInst>(
+              cast<llvm::AddrSpaceCastInst>(Alloca)->getPointerOperand())));
+
+  auto *Store = new llvm::StoreInst(Init, Alloca, /*volatile*/ false,
                                      Var.getAlignment().getAsAlign());
    llvm::BasicBlock *Block = AllocaInsertPt->getParent();
    Block->getInstList().insertAfter(AllocaInsertPt->getIterator(), Store);
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp

new file mode 100644 (file)

index 0000000..ccffdf4
--- /dev/null
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp
@@ -0,0 +1,61 @@
+//===-- CGOpenMPRuntimeAMDGCN.cpp - Interface to OpenMP AMDGCN Runtimes --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This provides a class for OpenMP runtime code generation specialized to
+// AMDGCN targets from generalized CGOpenMPRuntimeGPU class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CGOpenMPRuntimeAMDGCN.h"
+#include "CGOpenMPRuntimeGPU.h"
+#include "CodeGenFunction.h"
+#include "clang/AST/Attr.h"
+#include "clang/AST/DeclOpenMP.h"
+#include "clang/AST/StmtOpenMP.h"
+#include "clang/AST/StmtVisitor.h"
+#include "clang/Basic/Cuda.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+
+using namespace clang;
+using namespace CodeGen;
+using namespace llvm::omp;
+
+CGOpenMPRuntimeAMDGCN::CGOpenMPRuntimeAMDGCN(CodeGenModule &CGM)
+    : CGOpenMPRuntimeGPU(CGM) {
+  if (!CGM.getLangOpts().OpenMPIsDevice)
+    llvm_unreachable("OpenMP AMDGCN can only handle device code.");
+}
+
+llvm::Value *CGOpenMPRuntimeAMDGCN::getGPUWarpSize(CodeGenFunction &CGF) {
+  CGBuilderTy &Bld = CGF.Builder;
+  // return constant compile-time target-specific warp size
+  unsigned WarpSize = CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Size);
+  return Bld.getInt32(WarpSize);
+}
+
+llvm::Value *CGOpenMPRuntimeAMDGCN::getGPUThreadID(CodeGenFunction &CGF) {
+  CGBuilderTy &Bld = CGF.Builder;
+  llvm::Function *F =
+      CGF.CGM.getIntrinsic(llvm::Intrinsic::amdgcn_workitem_id_x);
+  return Bld.CreateCall(F, llvm::None, "nvptx_tid");
+}
+
+llvm::Value *CGOpenMPRuntimeAMDGCN::getGPUNumThreads(CodeGenFunction &CGF) {
+  CGBuilderTy &Bld = CGF.Builder;
+  llvm::Module *M = &CGF.CGM.getModule();
+  const char *LocSize = "__ockl_get_local_size";
+  llvm::Function *F = M->getFunction(LocSize);
+  if (!F) {
+    F = llvm::Function::Create(
+        llvm::FunctionType::get(CGF.Int64Ty, {CGF.Int32Ty}, false),
+        llvm::GlobalVariable::ExternalLinkage, LocSize, &CGF.CGM.getModule());
+  }
+  return Bld.CreateTrunc(
+      Bld.CreateCall(F, {Bld.getInt32(0)}, "nvptx_num_threads"), CGF.Int32Ty);
+}
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.h b/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.h

new file mode 100644 (file)

index 0000000..c142126
--- /dev/null
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.h
@@ -0,0 +1,43 @@
+//===--- CGOpenMPRuntimeAMDGCN.h - Interface to OpenMP AMDGCN Runtimes ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This provides a class for OpenMP runtime code generation specialized to
+// AMDGCN targets from generalized CGOpenMPRuntimeGPU class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMEAMDGCN_H
+#define LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMEAMDGCN_H
+
+#include "CGOpenMPRuntime.h"
+#include "CGOpenMPRuntimeGPU.h"
+#include "CodeGenFunction.h"
+#include "clang/AST/StmtOpenMP.h"
+
+namespace clang {
+namespace CodeGen {
+
+class CGOpenMPRuntimeAMDGCN final : public CGOpenMPRuntimeGPU {
+
+public:
+  explicit CGOpenMPRuntimeAMDGCN(CodeGenModule &CGM);
+
+  /// Get the GPU warp size.
+  llvm::Value *getGPUWarpSize(CodeGenFunction &CGF) override;
+
+  /// Get the id of the current thread on the GPU.
+  llvm::Value *getGPUThreadID(CodeGenFunction &CGF) override;
+
+  /// Get the maximum number of threads in a block of the GPU.
+  llvm::Value *getGPUNumThreads(CodeGenFunction &CGF) override;
+};
+
+} // namespace CodeGen
+} // namespace clang
+
+#endif // LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMEAMDGCN_H
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp

index 1cd89c540f4784369e0bdac4210f1ff5ba6f37b2..452eb15eb8d1664991a82791b206917d5cf2bdee 100644 (file)
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -7,7 +7,7 @@
  //===----------------------------------------------------------------------===//
  //
  // This provides a generalized class for OpenMP runtime code generation
-// specialized by GPU target NVPTX.
+// specialized by GPU targets NVPTX and AMDGCN.
  //
  //===----------------------------------------------------------------------===//
  
@@ -621,14 +621,6 @@ public:
  };
  } // anonymous namespace
  
-/// Get the id of the current thread on the GPU.
-static llvm::Value *getNVPTXThreadID(CodeGenFunction &CGF) {
-  return CGF.EmitRuntimeCall(
-      llvm::Intrinsic::getDeclaration(
-          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x),
-      "nvptx_tid");
-}
-
  /// Get the id of the warp in the block.
  /// We assume that the warp size is 32, which is always the case
  /// on the NVPTX device, to generate more efficient code.
@@ -636,7 +628,8 @@ static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {
    CGBuilderTy &Bld = CGF.Builder;
    unsigned LaneIDBits =
        CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Size_Log2);
-  return Bld.CreateAShr(getNVPTXThreadID(CGF), LaneIDBits, "nvptx_warp_id");
+  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
+  return Bld.CreateAShr(RT.getGPUThreadID(CGF), LaneIDBits, "nvptx_warp_id");
  }
  
  /// Get the id of the current lane in the Warp.
@@ -646,18 +639,11 @@ static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
    CGBuilderTy &Bld = CGF.Builder;
    unsigned LaneIDMask = CGF.getContext().getTargetInfo().getGridValue(
        llvm::omp::GV_Warp_Size_Log2_Mask);
-  return Bld.CreateAnd(getNVPTXThreadID(CGF), Bld.getInt32(LaneIDMask),
+  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
+  return Bld.CreateAnd(RT.getGPUThreadID(CGF), Bld.getInt32(LaneIDMask),
                         "nvptx_lane_id");
  }
  
-/// Get the maximum number of threads in a block of the GPU.
-static llvm::Value *getNVPTXNumThreads(CodeGenFunction &CGF) {
-  return CGF.EmitRuntimeCall(
-      llvm::Intrinsic::getDeclaration(
-          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x),
-      "nvptx_num_threads");
-}
-
  /// Get the value of the thread_limit clause in the teams directive.
  /// For the 'generic' execution mode, the runtime encodes thread_limit in
  /// the launch parameters, always starting thread_limit+warpSize threads per
@@ -668,9 +654,9 @@ static llvm::Value *getThreadLimit(CodeGenFunction &CGF,
    CGBuilderTy &Bld = CGF.Builder;
    auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
    return IsInSPMDExecutionMode
-             ? getNVPTXNumThreads(CGF)
-             : Bld.CreateNUWSub(getNVPTXNumThreads(CGF), RT.getGPUWarpSize(CGF),
-                                "thread_limit");
+             ? RT.getGPUNumThreads(CGF)
+             : Bld.CreateNUWSub(RT.getGPUNumThreads(CGF),
+                                RT.getGPUWarpSize(CGF), "thread_limit");
  }
  
  /// Get the thread id of the OMP master thread.
@@ -682,8 +668,8 @@ static llvm::Value *getThreadLimit(CodeGenFunction &CGF,
  ///      If NumThreads is 1024, master id is 992.
  static llvm::Value *getMasterThreadID(CodeGenFunction &CGF) {
    CGBuilderTy &Bld = CGF.Builder;
-  llvm::Value *NumThreads = getNVPTXNumThreads(CGF);
    auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
+  llvm::Value *NumThreads = RT.getGPUNumThreads(CGF);
    // We assume that the warp size is a power of 2.
    llvm::Value *Mask = Bld.CreateNUWSub(RT.getGPUWarpSize(CGF), Bld.getInt32(1));
  
@@ -1235,8 +1221,9 @@ void CGOpenMPRuntimeGPU::emitNonSPMDEntryHeader(CodeGenFunction &CGF,
    llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master");
    EST.ExitBB = CGF.createBasicBlock(".exit");
  
+  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
    llvm::Value *IsWorker =
-      Bld.CreateICmpULT(getNVPTXThreadID(CGF), getThreadLimit(CGF));
+      Bld.CreateICmpULT(RT.getGPUThreadID(CGF), getThreadLimit(CGF));
    Bld.CreateCondBr(IsWorker, WorkerBB, MasterCheckBB);
  
    CGF.EmitBlock(WorkerBB);
@@ -1245,7 +1232,7 @@ void CGOpenMPRuntimeGPU::emitNonSPMDEntryHeader(CodeGenFunction &CGF,
  
    CGF.EmitBlock(MasterCheckBB);
    llvm::Value *IsMaster =
-      Bld.CreateICmpEQ(getNVPTXThreadID(CGF), getMasterThreadID(CGF));
+      Bld.CreateICmpEQ(RT.getGPUThreadID(CGF), getMasterThreadID(CGF));
    Bld.CreateCondBr(IsMaster, MasterBB, EST.ExitBB);
  
    CGF.EmitBlock(MasterBB);
@@ -2780,14 +2767,16 @@ void CGOpenMPRuntimeGPU::emitCriticalRegion(
    llvm::BasicBlock *BodyBB = CGF.createBasicBlock("omp.critical.body");
    llvm::BasicBlock *ExitBB = CGF.createBasicBlock("omp.critical.exit");
  
+  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
+
    // Get the mask of active threads in the warp.
    llvm::Value *Mask = CGF.EmitRuntimeCall(
        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_warp_active_thread_mask));
    // Fetch team-local id of the thread.
-  llvm::Value *ThreadID = getNVPTXThreadID(CGF);
+  llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
  
    // Get the width of the team.
-  llvm::Value *TeamWidth = getNVPTXNumThreads(CGF);
+  llvm::Value *TeamWidth = RT.getGPUNumThreads(CGF);
  
    // Initialize the counter variable for the loop.
    QualType Int32Ty =
@@ -3250,8 +3239,9 @@ static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
      CGM.addCompilerUsedGlobal(TransferMedium);
    }
  
+  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
    // Get the CUDA thread id of the current OpenMP thread on the GPU.
-  llvm::Value *ThreadID = getNVPTXThreadID(CGF);
+  llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
    // nvptx_lane_id = nvptx_id % warpsize
    llvm::Value *LaneID = getNVPTXLaneID(CGF);
    // nvptx_warp_id = nvptx_id / warpsize
@@ -4844,9 +4834,11 @@ void CGOpenMPRuntimeGPU::getDefaultDistScheduleAndChunk(
      CodeGenFunction &CGF, const OMPLoopDirective &S,
      OpenMPDistScheduleClauseKind &ScheduleKind,
      llvm::Value *&Chunk) const {
+  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
    if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
      ScheduleKind = OMPC_DIST_SCHEDULE_static;
-    Chunk = CGF.EmitScalarConversion(getNVPTXNumThreads(CGF),
+    Chunk = CGF.EmitScalarConversion(
+        RT.getGPUNumThreads(CGF),
          CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
          S.getIterationVariable()->getType(), S.getBeginLoc());
      return;
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h

index 316333072c5bc44aefa85810b8d28e1de6c1458a..7267511ca672ab0f938552c3826cc76576697996 100644 (file)
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
@@ -7,7 +7,7 @@
  //===----------------------------------------------------------------------===//
  //
  // This provides a generalized class for OpenMP runtime code generation
-// specialized by GPU target NVPTX.
+// specialized by GPU targets NVPTX and AMDGCN.
  //
  //===----------------------------------------------------------------------===//
  
@@ -199,9 +199,18 @@ public:
    void clear() override;
  
    /// Declare generalized virtual functions which need to be defined
-  /// by all specializations of OpenMPGPURuntime Targets.
+  /// by all specializations of OpenMPGPURuntime Targets like AMDGCN
+  /// and NVPTX.
+
+  /// Get the GPU warp size.
    virtual llvm::Value *getGPUWarpSize(CodeGenFunction &CGF) = 0;
  
+  /// Get the id of the current thread on the GPU.
+  virtual llvm::Value *getGPUThreadID(CodeGenFunction &CGF) = 0;
+
+  /// Get the maximum number of threads in a block of the GPU.
+  virtual llvm::Value *getGPUNumThreads(CodeGenFunction &CGF) = 0;
+
    /// Emit call to void __kmpc_push_proc_bind(ident_t *loc, kmp_int32
    /// global_tid, int proc_bind) to generate code for 'proc_bind' clause.
    virtual void emitProcBindClause(CodeGenFunction &CGF,
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp

index 5fefc95ee41301706d116d2d09770775ede99105..1688d07b90b6e873f3bc5e54178fe64f5667da29 100644 (file)
--- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
@@ -32,10 +32,25 @@ CGOpenMPRuntimeNVPTX::CGOpenMPRuntimeNVPTX(CodeGenModule &CGM)
      llvm_unreachable("OpenMP NVPTX can only handle device code.");
  }
  
-/// Get the GPU warp size.
  llvm::Value *CGOpenMPRuntimeNVPTX::getGPUWarpSize(CodeGenFunction &CGF) {
    return CGF.EmitRuntimeCall(
        llvm::Intrinsic::getDeclaration(
            &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_warpsize),
        "nvptx_warp_size");
  }
+
+llvm::Value *CGOpenMPRuntimeNVPTX::getGPUThreadID(CodeGenFunction &CGF) {
+  CGBuilderTy &Bld = CGF.Builder;
+  llvm::Function *F;
+  F = llvm::Intrinsic::getDeclaration(
+      &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x);
+  return Bld.CreateCall(F, llvm::None, "nvptx_tid");
+}
+
+llvm::Value *CGOpenMPRuntimeNVPTX::getGPUNumThreads(CodeGenFunction &CGF) {
+  CGBuilderTy &Bld = CGF.Builder;
+  llvm::Function *F;
+  F = llvm::Intrinsic::getDeclaration(
+      &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x);
+  return Bld.CreateCall(F, llvm::None, "nvptx_num_threads");
+}
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h

index 6dab79e6e20aeb44ff8d93053f3fec90a04aa374..5f16029592665be6ebcdb5075a50bddb0ee9b126 100644 (file)
--- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
@@ -22,11 +22,19 @@
  namespace clang {
  namespace CodeGen {
  
-class CGOpenMPRuntimeNVPTX : public CGOpenMPRuntimeGPU {
+class CGOpenMPRuntimeNVPTX final : public CGOpenMPRuntimeGPU {
  
  public:
    explicit CGOpenMPRuntimeNVPTX(CodeGenModule &CGM);
+
+  /// Get the GPU warp size.
    llvm::Value *getGPUWarpSize(CodeGenFunction &CGF) override;
+
+  /// Get the id of the current thread on the GPU.
+  llvm::Value *getGPUThreadID(CodeGenFunction &CGF) override;
+
+  /// Get the maximum number of threads in a block of the GPU.
+  llvm::Value *getGPUNumThreads(CodeGenFunction &CGF) override;
  };
  
  } // CodeGen namespace.
diff --git a/clang/lib/CodeGen/CMakeLists.txt b/clang/lib/CodeGen/CMakeLists.txt

index 88647a2007fb026aab83242575daa142121dea33..f47ecd9bf8465be043707a50cea22f3dc37d6940 100644 (file)
--- a/clang/lib/CodeGen/CMakeLists.txt
+++ b/clang/lib/CodeGen/CMakeLists.txt
@@ -58,6 +58,7 @@ add_clang_library(clangCodeGen
    CGObjCRuntime.cpp
    CGOpenCLRuntime.cpp
    CGOpenMPRuntime.cpp
+  CGOpenMPRuntimeAMDGCN.cpp
    CGOpenMPRuntimeGPU.cpp
    CGOpenMPRuntimeNVPTX.cpp
    CGRecordLayoutBuilder.cpp
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp

index 48a1dddfb3315b91e97221516f25d7ee86b56fa7..f3712ea1f541d8e24da732a0cdd0973354325dbf 100644 (file)
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -19,6 +19,7 @@
  #include "CGObjCRuntime.h"
  #include "CGOpenCLRuntime.h"
  #include "CGOpenMPRuntime.h"
+#include "CGOpenMPRuntimeAMDGCN.h"
  #include "CGOpenMPRuntimeNVPTX.h"
  #include "CodeGenFunction.h"
  #include "CodeGenPGO.h"
@@ -215,6 +216,11 @@ void CodeGenModule::createOpenMPRuntime() {
             "OpenMP NVPTX is only prepared to deal with device code.");
      OpenMPRuntime.reset(new CGOpenMPRuntimeNVPTX(*this));
      break;
+  case llvm::Triple::amdgcn:
+    assert(getLangOpts().OpenMPIsDevice &&
+           "OpenMP AMDGCN is only prepared to deal with device code.");
+    OpenMPRuntime.reset(new CGOpenMPRuntimeAMDGCN(*this));
+    break;
    default:
      if (LangOpts.OpenMPSimd)
        OpenMPRuntime.reset(new CGOpenMPSIMDRuntime(*this));
diff --git a/clang/test/OpenMP/amdgcn_target_codegen.cpp b/clang/test/OpenMP/amdgcn_target_codegen.cpp

new file mode 100644 (file)

index 0000000..0b6f2d4
--- /dev/null
+++ b/clang/test/OpenMP/amdgcn_target_codegen.cpp
@@ -0,0 +1,43 @@
+// REQUIRES: amdgpu-registered-target
+
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+#define N 1000
+
+int test_amdgcn_target_tid_threads() {
+// CHECK-LABEL: define weak void @{{.*}}test_amdgcn_target_tid_threads
+
+  int arr[N];
+
+// CHECK: [[NUM_THREADS:%.+]] = call i64 @__ockl_get_local_size(i32 0)
+// CHECK-NEXT: [[VAR:%.+]] = trunc i64 [[NUM_THREADS]] to i32
+// CHECK-NEXT: sub nuw i32 [[VAR]], 64
+// CHECK: call i32 @llvm.amdgcn.workitem.id.x()
+#pragma omp target
+  for (int i = 0; i < N; i++) {
+    arr[i] = 1;
+  }
+
+  return arr[0];
+}
+
+int test_amdgcn_target_tid_threads_simd() {
+// CHECK-LABEL: define weak void @{{.*}}test_amdgcn_target_tid_threads_simd
+
+  int arr[N];
+
+// CHECK: [[NUM_THREADS:%.+]] = call i64 @__ockl_get_local_size(i32 0)
+// CHECK-NEXT: [[VAR:%.+]] = trunc i64 [[NUM_THREADS]] to i32
+// CHECK-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[VAR]], i16 0, i16 0)
+#pragma omp target simd
+  for (int i = 0; i < N; i++) {
+    arr[i] = 1;
+  }
+  return arr[0];
+}
+
+#endif
diff --git a/clang/test/OpenMP/amdgcn_target_init_temp_alloca.cpp b/clang/test/OpenMP/amdgcn_target_init_temp_alloca.cpp

new file mode 100644 (file)

index 0000000..4ed953a
--- /dev/null
+++ b/clang/test/OpenMP/amdgcn_target_init_temp_alloca.cpp
@@ -0,0 +1,24 @@
+// REQUIRES: amdgpu-registered-target
+
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s
+// expected-no-diagnostics
+
+#define N 100
+
+int test_amdgcn_target_temp_alloca() {
+  // CHECK-LABEL: test_amdgcn_target_temp_alloca
+
+  int arr[N];
+
+  // CHECK:      [[VAR_ADDR:%.+]] = alloca [100 x i32]*, align 8, addrspace(5)
+  // CHECK-NEXT: [[VAR_ADDR_CAST:%.+]] = addrspacecast [100 x i32]* addrspace(5)* [[VAR_ADDR]] to [100 x i32]**
+  // CHECK:  store [100 x i32]* [[VAR:%.+]], [100 x i32]** [[VAR_ADDR_CAST]], align 8
+
+#pragma omp target
+  for (int i = 0; i < N; i++) {
+    arr[i] = 1;
+  }
+
+  return arr[0];
+}
author	Saiyedul Islam <Saiyedul.Islam@amd.com>
	Mon, 3 Aug 2020 05:29:48 +0000 (05:29 +0000)
committer	Saiyedul Islam <Saiyedul.Islam@amd.com>
	Mon, 3 Aug 2020 05:38:39 +0000 (05:38 +0000)
clang/lib/CodeGen/CGExpr.cpp		patch \| blob \| history
clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp	[new file with mode: 0644]	patch \| blob
clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.h	[new file with mode: 0644]	patch \| blob
clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp		patch \| blob \| history
clang/lib/CodeGen/CGOpenMPRuntimeGPU.h		patch \| blob \| history
clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp		patch \| blob \| history
clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h		patch \| blob \| history
clang/lib/CodeGen/CMakeLists.txt		patch \| blob \| history
clang/lib/CodeGen/CodeGenModule.cpp		patch \| blob \| history
clang/test/OpenMP/amdgcn_target_codegen.cpp	[new file with mode: 0644]	patch \| blob
clang/test/OpenMP/amdgcn_target_init_temp_alloca.cpp	[new file with mode: 0644]	patch \| blob