}
void CodeGenFunction::InitTempAlloca(Address Var, llvm::Value *Init) {
- assert(isa<llvm::AllocaInst>(Var.getPointer()));
- auto *Store = new llvm::StoreInst(Init, Var.getPointer(), /*volatile*/ false,
+ auto *Alloca = Var.getPointer();
+ assert(isa<llvm::AllocaInst>(Alloca) ||
+ (isa<llvm::AddrSpaceCastInst>(Alloca) &&
+ isa<llvm::AllocaInst>(
+ cast<llvm::AddrSpaceCastInst>(Alloca)->getPointerOperand())));
+
+ auto *Store = new llvm::StoreInst(Init, Alloca, /*volatile*/ false,
Var.getAlignment().getAsAlign());
llvm::BasicBlock *Block = AllocaInsertPt->getParent();
Block->getInstList().insertAfter(AllocaInsertPt->getIterator(), Store);
--- /dev/null
+//===-- CGOpenMPRuntimeAMDGCN.cpp - Interface to OpenMP AMDGCN Runtimes --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This provides a class for OpenMP runtime code generation specialized to
+// AMDGCN targets from generalized CGOpenMPRuntimeGPU class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CGOpenMPRuntimeAMDGCN.h"
+#include "CGOpenMPRuntimeGPU.h"
+#include "CodeGenFunction.h"
+#include "clang/AST/Attr.h"
+#include "clang/AST/DeclOpenMP.h"
+#include "clang/AST/StmtOpenMP.h"
+#include "clang/AST/StmtVisitor.h"
+#include "clang/Basic/Cuda.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+
+using namespace clang;
+using namespace CodeGen;
+using namespace llvm::omp;
+
+CGOpenMPRuntimeAMDGCN::CGOpenMPRuntimeAMDGCN(CodeGenModule &CGM)
+ : CGOpenMPRuntimeGPU(CGM) {
+ if (!CGM.getLangOpts().OpenMPIsDevice)
+ llvm_unreachable("OpenMP AMDGCN can only handle device code.");
+}
+
+llvm::Value *CGOpenMPRuntimeAMDGCN::getGPUWarpSize(CodeGenFunction &CGF) {
+ CGBuilderTy &Bld = CGF.Builder;
+ // return constant compile-time target-specific warp size
+ unsigned WarpSize = CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Size);
+ return Bld.getInt32(WarpSize);
+}
+
+llvm::Value *CGOpenMPRuntimeAMDGCN::getGPUThreadID(CodeGenFunction &CGF) {
+ CGBuilderTy &Bld = CGF.Builder;
+ llvm::Function *F =
+ CGF.CGM.getIntrinsic(llvm::Intrinsic::amdgcn_workitem_id_x);
+ return Bld.CreateCall(F, llvm::None, "nvptx_tid");
+}
+
+llvm::Value *CGOpenMPRuntimeAMDGCN::getGPUNumThreads(CodeGenFunction &CGF) {
+ CGBuilderTy &Bld = CGF.Builder;
+ llvm::Module *M = &CGF.CGM.getModule();
+ const char *LocSize = "__ockl_get_local_size";
+ llvm::Function *F = M->getFunction(LocSize);
+ if (!F) {
+ F = llvm::Function::Create(
+ llvm::FunctionType::get(CGF.Int64Ty, {CGF.Int32Ty}, false),
+ llvm::GlobalVariable::ExternalLinkage, LocSize, &CGF.CGM.getModule());
+ }
+ return Bld.CreateTrunc(
+ Bld.CreateCall(F, {Bld.getInt32(0)}, "nvptx_num_threads"), CGF.Int32Ty);
+}
--- /dev/null
+//===--- CGOpenMPRuntimeAMDGCN.h - Interface to OpenMP AMDGCN Runtimes ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This provides a class for OpenMP runtime code generation specialized to
+// AMDGCN targets from generalized CGOpenMPRuntimeGPU class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMEAMDGCN_H
+#define LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMEAMDGCN_H
+
+#include "CGOpenMPRuntime.h"
+#include "CGOpenMPRuntimeGPU.h"
+#include "CodeGenFunction.h"
+#include "clang/AST/StmtOpenMP.h"
+
+namespace clang {
+namespace CodeGen {
+
+class CGOpenMPRuntimeAMDGCN final : public CGOpenMPRuntimeGPU {
+
+public:
+ explicit CGOpenMPRuntimeAMDGCN(CodeGenModule &CGM);
+
+ /// Get the GPU warp size.
+ llvm::Value *getGPUWarpSize(CodeGenFunction &CGF) override;
+
+ /// Get the id of the current thread on the GPU.
+ llvm::Value *getGPUThreadID(CodeGenFunction &CGF) override;
+
+ /// Get the maximum number of threads in a block of the GPU.
+ llvm::Value *getGPUNumThreads(CodeGenFunction &CGF) override;
+};
+
+} // namespace CodeGen
+} // namespace clang
+
+#endif // LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMEAMDGCN_H
//===----------------------------------------------------------------------===//
//
// This provides a generalized class for OpenMP runtime code generation
-// specialized by GPU target NVPTX.
+// specialized by GPU targets NVPTX and AMDGCN.
//
//===----------------------------------------------------------------------===//
};
} // anonymous namespace
-/// Get the id of the current thread on the GPU.
-static llvm::Value *getNVPTXThreadID(CodeGenFunction &CGF) {
- return CGF.EmitRuntimeCall(
- llvm::Intrinsic::getDeclaration(
- &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x),
- "nvptx_tid");
-}
-
/// Get the id of the warp in the block.
/// We assume that the warp size is 32, which is always the case
/// on the NVPTX device, to generate more efficient code.
CGBuilderTy &Bld = CGF.Builder;
unsigned LaneIDBits =
CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Size_Log2);
- return Bld.CreateAShr(getNVPTXThreadID(CGF), LaneIDBits, "nvptx_warp_id");
+ auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
+ return Bld.CreateAShr(RT.getGPUThreadID(CGF), LaneIDBits, "nvptx_warp_id");
}
/// Get the id of the current lane in the Warp.
CGBuilderTy &Bld = CGF.Builder;
unsigned LaneIDMask = CGF.getContext().getTargetInfo().getGridValue(
llvm::omp::GV_Warp_Size_Log2_Mask);
- return Bld.CreateAnd(getNVPTXThreadID(CGF), Bld.getInt32(LaneIDMask),
+ auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
+ return Bld.CreateAnd(RT.getGPUThreadID(CGF), Bld.getInt32(LaneIDMask),
"nvptx_lane_id");
}
-/// Get the maximum number of threads in a block of the GPU.
-static llvm::Value *getNVPTXNumThreads(CodeGenFunction &CGF) {
- return CGF.EmitRuntimeCall(
- llvm::Intrinsic::getDeclaration(
- &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x),
- "nvptx_num_threads");
-}
-
/// Get the value of the thread_limit clause in the teams directive.
/// For the 'generic' execution mode, the runtime encodes thread_limit in
/// the launch parameters, always starting thread_limit+warpSize threads per
CGBuilderTy &Bld = CGF.Builder;
auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
return IsInSPMDExecutionMode
- ? getNVPTXNumThreads(CGF)
- : Bld.CreateNUWSub(getNVPTXNumThreads(CGF), RT.getGPUWarpSize(CGF),
- "thread_limit");
+ ? RT.getGPUNumThreads(CGF)
+ : Bld.CreateNUWSub(RT.getGPUNumThreads(CGF),
+ RT.getGPUWarpSize(CGF), "thread_limit");
}
/// Get the thread id of the OMP master thread.
/// If NumThreads is 1024, master id is 992.
static llvm::Value *getMasterThreadID(CodeGenFunction &CGF) {
CGBuilderTy &Bld = CGF.Builder;
- llvm::Value *NumThreads = getNVPTXNumThreads(CGF);
auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
+ llvm::Value *NumThreads = RT.getGPUNumThreads(CGF);
// We assume that the warp size is a power of 2.
llvm::Value *Mask = Bld.CreateNUWSub(RT.getGPUWarpSize(CGF), Bld.getInt32(1));
llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master");
EST.ExitBB = CGF.createBasicBlock(".exit");
+ auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
llvm::Value *IsWorker =
- Bld.CreateICmpULT(getNVPTXThreadID(CGF), getThreadLimit(CGF));
+ Bld.CreateICmpULT(RT.getGPUThreadID(CGF), getThreadLimit(CGF));
Bld.CreateCondBr(IsWorker, WorkerBB, MasterCheckBB);
CGF.EmitBlock(WorkerBB);
CGF.EmitBlock(MasterCheckBB);
llvm::Value *IsMaster =
- Bld.CreateICmpEQ(getNVPTXThreadID(CGF), getMasterThreadID(CGF));
+ Bld.CreateICmpEQ(RT.getGPUThreadID(CGF), getMasterThreadID(CGF));
Bld.CreateCondBr(IsMaster, MasterBB, EST.ExitBB);
CGF.EmitBlock(MasterBB);
llvm::BasicBlock *BodyBB = CGF.createBasicBlock("omp.critical.body");
llvm::BasicBlock *ExitBB = CGF.createBasicBlock("omp.critical.exit");
+ auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
+
// Get the mask of active threads in the warp.
llvm::Value *Mask = CGF.EmitRuntimeCall(
createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_warp_active_thread_mask));
// Fetch team-local id of the thread.
- llvm::Value *ThreadID = getNVPTXThreadID(CGF);
+ llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
// Get the width of the team.
- llvm::Value *TeamWidth = getNVPTXNumThreads(CGF);
+ llvm::Value *TeamWidth = RT.getGPUNumThreads(CGF);
// Initialize the counter variable for the loop.
QualType Int32Ty =
CGM.addCompilerUsedGlobal(TransferMedium);
}
+ auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
// Get the CUDA thread id of the current OpenMP thread on the GPU.
- llvm::Value *ThreadID = getNVPTXThreadID(CGF);
+ llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
// nvptx_lane_id = nvptx_id % warpsize
llvm::Value *LaneID = getNVPTXLaneID(CGF);
// nvptx_warp_id = nvptx_id / warpsize
CodeGenFunction &CGF, const OMPLoopDirective &S,
OpenMPDistScheduleClauseKind &ScheduleKind,
llvm::Value *&Chunk) const {
+ auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
ScheduleKind = OMPC_DIST_SCHEDULE_static;
- Chunk = CGF.EmitScalarConversion(getNVPTXNumThreads(CGF),
+ Chunk = CGF.EmitScalarConversion(
+ RT.getGPUNumThreads(CGF),
CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
S.getIterationVariable()->getType(), S.getBeginLoc());
return;
//===----------------------------------------------------------------------===//
//
// This provides a generalized class for OpenMP runtime code generation
-// specialized by GPU target NVPTX.
+// specialized by GPU targets NVPTX and AMDGCN.
//
//===----------------------------------------------------------------------===//
void clear() override;
/// Declare generalized virtual functions which need to be defined
- /// by all specializations of OpenMPGPURuntime Targets.
+ /// by all specializations of OpenMPGPURuntime Targets like AMDGCN
+ /// and NVPTX.
+
+ /// Get the GPU warp size.
virtual llvm::Value *getGPUWarpSize(CodeGenFunction &CGF) = 0;
+ /// Get the id of the current thread on the GPU.
+ virtual llvm::Value *getGPUThreadID(CodeGenFunction &CGF) = 0;
+
+ /// Get the maximum number of threads in a block of the GPU.
+ virtual llvm::Value *getGPUNumThreads(CodeGenFunction &CGF) = 0;
+
/// Emit call to void __kmpc_push_proc_bind(ident_t *loc, kmp_int32
/// global_tid, int proc_bind) to generate code for 'proc_bind' clause.
virtual void emitProcBindClause(CodeGenFunction &CGF,
llvm_unreachable("OpenMP NVPTX can only handle device code.");
}
-/// Get the GPU warp size.
llvm::Value *CGOpenMPRuntimeNVPTX::getGPUWarpSize(CodeGenFunction &CGF) {
return CGF.EmitRuntimeCall(
llvm::Intrinsic::getDeclaration(
&CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_warpsize),
"nvptx_warp_size");
}
+
+llvm::Value *CGOpenMPRuntimeNVPTX::getGPUThreadID(CodeGenFunction &CGF) {
+ CGBuilderTy &Bld = CGF.Builder;
+ llvm::Function *F;
+ F = llvm::Intrinsic::getDeclaration(
+ &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x);
+ return Bld.CreateCall(F, llvm::None, "nvptx_tid");
+}
+
+llvm::Value *CGOpenMPRuntimeNVPTX::getGPUNumThreads(CodeGenFunction &CGF) {
+ CGBuilderTy &Bld = CGF.Builder;
+ llvm::Function *F;
+ F = llvm::Intrinsic::getDeclaration(
+ &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x);
+ return Bld.CreateCall(F, llvm::None, "nvptx_num_threads");
+}
namespace clang {
namespace CodeGen {
-class CGOpenMPRuntimeNVPTX : public CGOpenMPRuntimeGPU {
+class CGOpenMPRuntimeNVPTX final : public CGOpenMPRuntimeGPU {
public:
explicit CGOpenMPRuntimeNVPTX(CodeGenModule &CGM);
+
+ /// Get the GPU warp size.
llvm::Value *getGPUWarpSize(CodeGenFunction &CGF) override;
+
+ /// Get the id of the current thread on the GPU.
+ llvm::Value *getGPUThreadID(CodeGenFunction &CGF) override;
+
+ /// Get the maximum number of threads in a block of the GPU.
+ llvm::Value *getGPUNumThreads(CodeGenFunction &CGF) override;
};
} // CodeGen namespace.
CGObjCRuntime.cpp
CGOpenCLRuntime.cpp
CGOpenMPRuntime.cpp
+ CGOpenMPRuntimeAMDGCN.cpp
CGOpenMPRuntimeGPU.cpp
CGOpenMPRuntimeNVPTX.cpp
CGRecordLayoutBuilder.cpp
#include "CGObjCRuntime.h"
#include "CGOpenCLRuntime.h"
#include "CGOpenMPRuntime.h"
+#include "CGOpenMPRuntimeAMDGCN.h"
#include "CGOpenMPRuntimeNVPTX.h"
#include "CodeGenFunction.h"
#include "CodeGenPGO.h"
"OpenMP NVPTX is only prepared to deal with device code.");
OpenMPRuntime.reset(new CGOpenMPRuntimeNVPTX(*this));
break;
+ case llvm::Triple::amdgcn:
+ assert(getLangOpts().OpenMPIsDevice &&
+ "OpenMP AMDGCN is only prepared to deal with device code.");
+ OpenMPRuntime.reset(new CGOpenMPRuntimeAMDGCN(*this));
+ break;
default:
if (LangOpts.OpenMPSimd)
OpenMPRuntime.reset(new CGOpenMPSIMDRuntime(*this));
--- /dev/null
+// REQUIRES: amdgpu-registered-target
+
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+#define N 1000
+
+int test_amdgcn_target_tid_threads() {
+// CHECK-LABEL: define weak void @{{.*}}test_amdgcn_target_tid_threads
+
+ int arr[N];
+
+// CHECK: [[NUM_THREADS:%.+]] = call i64 @__ockl_get_local_size(i32 0)
+// CHECK-NEXT: [[VAR:%.+]] = trunc i64 [[NUM_THREADS]] to i32
+// CHECK-NEXT: sub nuw i32 [[VAR]], 64
+// CHECK: call i32 @llvm.amdgcn.workitem.id.x()
+#pragma omp target
+ for (int i = 0; i < N; i++) {
+ arr[i] = 1;
+ }
+
+ return arr[0];
+}
+
+int test_amdgcn_target_tid_threads_simd() {
+// CHECK-LABEL: define weak void @{{.*}}test_amdgcn_target_tid_threads_simd
+
+ int arr[N];
+
+// CHECK: [[NUM_THREADS:%.+]] = call i64 @__ockl_get_local_size(i32 0)
+// CHECK-NEXT: [[VAR:%.+]] = trunc i64 [[NUM_THREADS]] to i32
+// CHECK-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[VAR]], i16 0, i16 0)
+#pragma omp target simd
+ for (int i = 0; i < N; i++) {
+ arr[i] = 1;
+ }
+ return arr[0];
+}
+
+#endif
--- /dev/null
+// REQUIRES: amdgpu-registered-target
+
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s
+// expected-no-diagnostics
+
+#define N 100
+
+int test_amdgcn_target_temp_alloca() {
+ // CHECK-LABEL: test_amdgcn_target_temp_alloca
+
+ int arr[N];
+
+ // CHECK: [[VAR_ADDR:%.+]] = alloca [100 x i32]*, align 8, addrspace(5)
+ // CHECK-NEXT: [[VAR_ADDR_CAST:%.+]] = addrspacecast [100 x i32]* addrspace(5)* [[VAR_ADDR]] to [100 x i32]**
+ // CHECK: store [100 x i32]* [[VAR:%.+]], [100 x i32]** [[VAR_ADDR_CAST]], align 8
+
+#pragma omp target
+ for (int i = 0; i < N; i++) {
+ arr[i] = 1;
+ }
+
+ return arr[0];
+}