From 4f680db257733fe073e10f5e23838825b4b4cf47 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Tue, 19 Mar 2019 16:41:16 +0000 Subject: [PATCH] [OPENMP] Codegen for local variables with the allocate pragma. Added initial codegen for the local variables with the #pragma omp allocate directive. Instead of allocating the variables on the stack, __kmpc_alloc|__kmpc_free functions are used for memory (de-)allocation. llvm-svn: 356472 --- clang/lib/CodeGen/CGOpenMPRuntime.cpp | 97 ++++++++++++++++++++ clang/lib/CodeGen/CGOpenMPRuntime.h | 6 ++ clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp | 3 + clang/test/OpenMP/allocate_allocator_ast_print.cpp | 20 ++-- clang/test/OpenMP/allocate_codegen.cpp | 102 +++++++++++++++++++++ 5 files changed, 220 insertions(+), 8 deletions(-) create mode 100644 clang/test/OpenMP/allocate_codegen.cpp diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index 4e6cfa6..12e2a52 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -667,6 +667,10 @@ enum OpenMPRTLFunction { // Call to void *__kmpc_task_reduction_get_th_data(int gtid, void *tg, void // *d); OMPRTL__kmpc_task_reduction_get_th_data, + // Call to void *__kmpc_alloc(int gtid, size_t sz, const omp_allocator_t *al); + OMPRTL__kmpc_alloc, + // Call to void __kmpc_free(int gtid, void *ptr, const omp_allocator_t *al); + OMPRTL__kmpc_free, // // Offloading related calls @@ -2195,6 +2199,26 @@ llvm::FunctionCallee CGOpenMPRuntime::createRuntimeFunction(unsigned Function) { FnTy, /*Name=*/"__kmpc_task_reduction_get_th_data"); break; } + case OMPRTL__kmpc_alloc: { + // Build to void *__kmpc_alloc(int gtid, size_t sz, const omp_allocator_t + // *al); + // omp_allocator_t type is void *. + llvm::Type *TypeParams[] = {CGM.IntTy, CGM.SizeTy, CGM.VoidPtrPtrTy}; + auto *FnTy = + llvm::FunctionType::get(CGM.VoidPtrTy, TypeParams, /*isVarArg=*/false); + RTLFn = CGM.CreateRuntimeFunction(FnTy, /*Name=*/"__kmpc_alloc"); + break; + } + case OMPRTL__kmpc_free: { + // Build to void __kmpc_free(int gtid, void *ptr, const omp_allocator_t + // *al); + // omp_allocator_t type is void *. + llvm::Type *TypeParams[] = {CGM.IntTy, CGM.VoidPtrTy, CGM.VoidPtrPtrTy}; + auto *FnTy = + llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false); + RTLFn = CGM.CreateRuntimeFunction(FnTy, /*Name=*/"__kmpc_free"); + break; + } case OMPRTL__kmpc_push_target_tripcount: { // Build void __kmpc_push_target_tripcount(int64_t device_id, kmp_uint64 // size); @@ -9669,8 +9693,81 @@ Address CGOpenMPRuntime::getParameterAddress(CodeGenFunction &CGF, return CGF.GetAddrOfLocalVar(NativeParam); } +namespace { +/// Cleanup action for allocate support. +class OMPAllocateCleanupTy final : public EHScopeStack::Cleanup { +public: + static const int CleanupArgs = 3; + +private: + llvm::FunctionCallee RTLFn; + llvm::Value *Args[CleanupArgs]; + +public: + OMPAllocateCleanupTy(llvm::FunctionCallee RTLFn, + ArrayRef CallArgs) + : RTLFn(RTLFn) { + assert(CallArgs.size() == CleanupArgs && + "Size of arguments does not match."); + std::copy(CallArgs.begin(), CallArgs.end(), std::begin(Args)); + } + void Emit(CodeGenFunction &CGF, Flags /*flags*/) override { + if (!CGF.HaveInsertPoint()) + return; + CGF.EmitRuntimeCall(RTLFn, Args); + } +}; +} // namespace + Address CGOpenMPRuntime::getAddressOfLocalVariable(CodeGenFunction &CGF, const VarDecl *VD) { + const VarDecl *CVD = VD->getCanonicalDecl(); + if (!CVD->hasAttr()) + return Address::invalid(); + for (const Attr *A: CVD->getAttrs()) { + if (const auto *AA = dyn_cast(A)) { + auto &Elem = OpenMPLocThreadIDMap.FindAndConstruct(CGF.CurFn); + if (!Elem.second.ServiceInsertPt) + setLocThreadIdInsertPt(CGF); + CGBuilderTy::InsertPointGuard IPG(CGF.Builder); + CGF.Builder.SetInsertPoint(Elem.second.ServiceInsertPt); + llvm::Value *Size; + CharUnits Align = CGM.getContext().getDeclAlign(CVD); + if (CVD->getType()->isVariablyModifiedType()) { + Size = CGF.getTypeSize(CVD->getType()); + Align = CGM.getContext().getTypeAlignInChars(CVD->getType()); + } else { + CharUnits Sz = CGM.getContext().getTypeSizeInChars(CVD->getType()); + Align = CGM.getContext().getDeclAlign(CVD); + Size = CGM.getSize(Sz.alignTo(Align)); + } + llvm::Value *ThreadID = getThreadID(CGF, CVD->getBeginLoc()); + llvm::Value *Allocator; + if (const Expr *AllocExpr = AA->getAllocator()) { + Allocator = CGF.EmitScalarExpr(AA->getAllocator()); + } else { + // Default allocator in libomp is nullptr. + Allocator = llvm::ConstantPointerNull::get(CGM.VoidPtrPtrTy); + } + llvm::Value *Args[] = {ThreadID, Size, Allocator}; + + llvm::Value *Addr = + CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_alloc), Args, + CVD->getName() + ".void.addr"); + llvm::Value *FiniArgs[OMPAllocateCleanupTy::CleanupArgs] = { + ThreadID, Addr, Allocator}; + llvm::FunctionCallee FiniRTLFn = createRuntimeFunction(OMPRTL__kmpc_free); + + CGF.EHStack.pushCleanup( + NormalAndEHCleanup, FiniRTLFn, llvm::makeArrayRef(FiniArgs)); + Addr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( + Addr, + CGF.ConvertTypeForMem( + CGM.getContext().getPointerType(CVD->getType())), + CVD->getName() + ".addr"); + return Address(Addr, Align); + } + } return Address::invalid(); } diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.h b/clang/lib/CodeGen/CGOpenMPRuntime.h index 45d4d84..9d371c4 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.h +++ b/clang/lib/CodeGen/CGOpenMPRuntime.h @@ -2163,6 +2163,12 @@ public: /// \param TargetParam Corresponding target-specific parameter. Address getParameterAddress(CodeGenFunction &CGF, const VarDecl *NativeParam, const VarDecl *TargetParam) const override; + + /// Gets the OpenMP-specific address of the local variable. + Address getAddressOfLocalVariable(CodeGenFunction &CGF, + const VarDecl *VD) override { + return Address::invalid(); + } }; } // namespace CodeGen diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp index ba23c49..d35f56e 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp @@ -4746,6 +4746,9 @@ Address CGOpenMPRuntimeNVPTX::getAddressOfLocalVariable(CodeGenFunction &CGF, return VDI->second.PrivateAddr; } } + // TODO: replace it with return + // CGOpenMPRuntime::getAddressOfLocalVariable(CGF, VD); when NVPTX libomp + // supports __kmpc_alloc|__kmpc_free. return Address::invalid(); } diff --git a/clang/test/OpenMP/allocate_allocator_ast_print.cpp b/clang/test/OpenMP/allocate_allocator_ast_print.cpp index 7a42dda..a9e838d 100644 --- a/clang/test/OpenMP/allocate_allocator_ast_print.cpp +++ b/clang/test/OpenMP/allocate_allocator_ast_print.cpp @@ -1,16 +1,16 @@ -// RUN: %clang_cc1 -verify -fopenmp -triple x86_64-apple-darwin10.6.0 -ast-print %s | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp -triple x86_64-apple-darwin10.6.0 -ast-print %s -o - | FileCheck %s // RUN: %clang_cc1 -fopenmp -triple x86_64-apple-darwin10.6.0 -x c++ -std=c++11 -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp -triple x86_64-apple-darwin10.6.0 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -// RUN: %clang_cc1 -verify -fopenmp -triple x86_64-unknown-linux-gnu -ast-print %s | FileCheck %s +// RUN: %clang_cc1 -fopenmp -triple x86_64-apple-darwin10.6.0 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp -triple x86_64-unknown-linux-gnu -ast-print %s -o - | FileCheck %s // RUN: %clang_cc1 -fopenmp -fnoopenmp-use-tls -triple x86_64-unknown-linux-gnu -x c++ -std=c++11 -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp -fnoopenmp-use-tls -triple x86_64-unknown-linux-gnu -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print +// RUN: %clang_cc1 -fopenmp -fnoopenmp-use-tls -triple x86_64-unknown-linux-gnu -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -o - | FileCheck %s -// RUN: %clang_cc1 -verify -fopenmp-simd -triple x86_64-apple-darwin10.6.0 -ast-print %s | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp-simd -triple x86_64-apple-darwin10.6.0 -ast-print %s -o - | FileCheck %s // RUN: %clang_cc1 -fopenmp-simd -triple x86_64-apple-darwin10.6.0 -x c++ -std=c++11 -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp-simd -triple x86_64-apple-darwin10.6.0 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -// RUN: %clang_cc1 -verify -fopenmp-simd -triple x86_64-unknown-linux-gnu -ast-print %s | FileCheck %s +// RUN: %clang_cc1 -fopenmp-simd -triple x86_64-apple-darwin10.6.0 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp-simd -triple x86_64-unknown-linux-gnu -ast-print %s -o - | FileCheck %s // RUN: %clang_cc1 -fopenmp-simd -fnoopenmp-use-tls -triple x86_64-unknown-linux-gnu -x c++ -std=c++11 -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp-simd -fnoopenmp-use-tls -triple x86_64-unknown-linux-gnu -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print +// RUN: %clang_cc1 -fopenmp-simd -fnoopenmp-use-tls -triple x86_64-unknown-linux-gnu -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -o - | FileCheck %s // expected-no-diagnostics #ifndef HEADER @@ -82,6 +82,10 @@ int main () { #pragma omp allocate(a) allocator(omp_thread_mem_alloc) // CHECK-NEXT: #pragma omp allocate(a) allocator(omp_thread_mem_alloc) a=2; + int b = 3; +// CHECK: int b = 3; +#pragma omp allocate(b) +// CHECK-NEXT: #pragma omp allocate(b) return (foo()); } diff --git a/clang/test/OpenMP/allocate_codegen.cpp b/clang/test/OpenMP/allocate_codegen.cpp new file mode 100644 index 0000000..eac2b9f --- /dev/null +++ b/clang/test/OpenMP/allocate_codegen.cpp @@ -0,0 +1,102 @@ +// RUN: %clang_cc1 -verify -fopenmp -triple x86_64-apple-darwin10.6.0 -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -fopenmp -triple x86_64-apple-darwin10.6.0 -x c++ -std=c++11 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -triple x86_64-apple-darwin10.6.0 -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp -triple x86_64-unknown-linux-gnu -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -fopenmp -fnoopenmp-use-tls -triple x86_64-unknown-linux-gnu -x c++ -std=c++11 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -fnoopenmp-use-tls -triple x86_64-unknown-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s + +// RUN: %clang_cc1 -verify -fopenmp-simd -triple x86_64-apple-darwin10.6.0 -emit-llvm -o - %s | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -fopenmp-simd -triple x86_64-apple-darwin10.6.0 -x c++ -std=c++11 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -triple x86_64-apple-darwin10.6.0 -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -verify -fopenmp-simd -triple x86_64-unknown-linux-gnu -emit-llvm -o - %s | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -fopenmp-simd -fnoopenmp-use-tls -triple x86_64-unknown-linux-gnu -x c++ -std=c++11 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -fnoopenmp-use-tls -triple x86_64-unknown-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// SIMD-ONLY0-NOT: {{__kmpc|__tgt}} +// expected-no-diagnostics + +#ifndef HEADER +#define HEADER + +typedef void **omp_allocator_handle_t; +extern const omp_allocator_handle_t omp_default_mem_alloc; +extern const omp_allocator_handle_t omp_large_cap_mem_alloc; +extern const omp_allocator_handle_t omp_const_mem_alloc; +extern const omp_allocator_handle_t omp_high_bw_mem_alloc; +extern const omp_allocator_handle_t omp_low_lat_mem_alloc; +extern const omp_allocator_handle_t omp_cgroup_mem_alloc; +extern const omp_allocator_handle_t omp_pteam_mem_alloc; +extern const omp_allocator_handle_t omp_thread_mem_alloc; + +struct St{ + int a; +}; + +struct St1{ + int a; + static int b; +#pragma omp allocate(b) allocator(omp_default_mem_alloc) +} d; + +int a, b; +#pragma omp allocate(a) allocator(omp_large_cap_mem_alloc) +#pragma omp allocate(a) allocator(omp_const_mem_alloc) +#pragma omp allocate(d, b) allocator(omp_high_bw_mem_alloc) + +template +struct ST { + static T m; + #pragma omp allocate(m) allocator(omp_low_lat_mem_alloc) +}; + +template T foo() { + T v; + #pragma omp allocate(v) allocator(omp_cgroup_mem_alloc) + v = ST::m; + return v; +} + +namespace ns{ + int a; +} +#pragma omp allocate(ns::a) allocator(omp_pteam_mem_alloc) + +// CHECK-NOT: call {{.+}} {{__kmpc_alloc|__kmpc_free}} + +// CHECK-LABEL: @main +int main () { + static int a; +#pragma omp allocate(a) allocator(omp_thread_mem_alloc) + a=2; + // CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @{{.+}}) + // CHECK-NEXT: [[B_VOID_ADDR:%.+]] = call i8* @__kmpc_alloc(i32 [[GTID]], i64 8, i8** null) + // CHECK-NEXT: [[B_ADDR:%.+]] = bitcast i8* [[B_VOID_ADDR]] to double* + // CHECK-NOT: {{__kmpc_alloc|__kmpc_free}} + // CHECK: store double 3.000000e+00, double* [[B_ADDR]], + // CHECK: [[RES:%.+]] = call i32 [[FOO:@.+]]() + // CHECK: store i32 [[RES]], i32* [[RET:%.+]], + // CHECK-NEXT: call void @__kmpc_free(i32 [[GTID]], i8* [[B_VOID_ADDR]], i8** null) + // CHECK-NOT: {{__kmpc_alloc|__kmpc_free}} + double b = 3; +#pragma omp allocate(b) + // CHECK: [[RETVAL:%.+]] = load i32, i32* [[RET]], + // CHECK: ret i32 [[RETVAL]] + return (foo()); +} + +// CHECK-NOT: call {{.+}} {{__kmpc_alloc|__kmpc_free}} + +// CHECK: define {{.*}}i32 [[FOO]]() +// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @{{.+}}) +// CHECK-NEXT: [[OMP_CGROUP_MEM_ALLOC:%.+]] = load i8**, i8*** @omp_cgroup_mem_alloc, +// CHECK-NEXT: [[V_VOID_ADDR:%.+]] = call i8* @__kmpc_alloc(i32 [[GTID]], i64 4, i8** [[OMP_CGROUP_MEM_ALLOC]]) +// CHECK-NEXT: [[V_ADDR:%.+]] = bitcast i8* [[V_VOID_ADDR]] to i32* +// CHECK-NOT: {{__kmpc_alloc|__kmpc_free}} +// CHECK: store i32 %{{.+}}, i32* [[V_ADDR]], +// CHECK-NEXT: [[V_VAL:%.+]] = load i32, i32* [[V_ADDR]], +// CHECK-NEXT: call void @__kmpc_free(i32 [[GTID]], i8* [[V_VOID_ADDR]], i8** [[OMP_CGROUP_MEM_ALLOC]]) +// CHECK-NOT: {{__kmpc_alloc|__kmpc_free}} +// CHECK: ret i32 [[V_VAL]] + +// CHECK-NOT: call {{.+}} {{__kmpc_alloc|__kmpc_free}} +extern template int ST::m; +#endif -- 2.7.4