From fac26cf4ca4abd9142a4ed8939c167a6856f656e Mon Sep 17 00:00:00 2001
From: Alexey Bataev
Date: Wed, 2 May 2018 20:03:27 +0000
Subject: [PATCH] [OPENMP] Add support for reductions on simd directives in
 target regions.

Added codegen for `simd reduction()` constructs in target directives.

llvm-svn: 331393
---
 clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp      | 58 ++++++++++++++++++++-----
 clang/test/OpenMP/nvptx_target_simd_codegen.cpp | 34 +++++++++++----
 2 files changed, 73 insertions(+), 19 deletions(-)

diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
index 22e1e48..82be31f 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
@@ -61,6 +61,12 @@ enum OpenMPRTLFunctionNVPTX {
   /// lane_offset, int16_t shortCircuit),
   /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num));
   OMPRTL_NVPTX__kmpc_parallel_reduce_nowait,
+  /// \brief Call to __kmpc_nvptx_simd_reduce_nowait(kmp_int32
+  /// global_tid, kmp_int32 num_vars, size_t reduce_size, void* reduce_data,
+  /// void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
+  /// lane_offset, int16_t shortCircuit),
+  /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num));
+  OMPRTL_NVPTX__kmpc_simd_reduce_nowait,
   /// \brief Call to __kmpc_nvptx_teams_reduce_nowait(int32_t global_tid,
   /// int32_t num_vars, size_t reduce_size, void *reduce_data,
   /// void (*kmp_ShuffleReductFctPtr)(void *rhs, int16_t lane_id, int16_t
@@ -1028,6 +1034,33 @@ CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) {
         FnTy, /*Name=*/"__kmpc_nvptx_parallel_reduce_nowait");
     break;
   }
+  case OMPRTL_NVPTX__kmpc_simd_reduce_nowait: {
+    // Build int32_t kmpc_nvptx_simd_reduce_nowait(kmp_int32 global_tid,
+    // kmp_int32 num_vars, size_t reduce_size, void* reduce_data,
+    // void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
+    // lane_offset, int16_t Algorithm Version),
+    // void (*kmp_InterWarpCopyFctPtr)(void* src, int warp_num));
+    llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty,
+                                             CGM.Int16Ty, CGM.Int16Ty};
+    auto *ShuffleReduceFnTy =
+        llvm::FunctionType::get(CGM.VoidTy, ShuffleReduceTypeParams,
+                                /*isVarArg=*/false);
+    llvm::Type *InterWarpCopyTypeParams[] = {CGM.VoidPtrTy, CGM.Int32Ty};
+    auto *InterWarpCopyFnTy =
+        llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams,
+                                /*isVarArg=*/false);
+    llvm::Type *TypeParams[] = {CGM.Int32Ty,
+                                CGM.Int32Ty,
+                                CGM.SizeTy,
+                                CGM.VoidPtrTy,
+                                ShuffleReduceFnTy->getPointerTo(),
+                                InterWarpCopyFnTy->getPointerTo()};
+    auto *FnTy =
+        llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
+    RTLFn = CGM.CreateRuntimeFunction(
+        FnTy, /*Name=*/"__kmpc_nvptx_simd_reduce_nowait");
+    break;
+  }
   case OMPRTL_NVPTX__kmpc_teams_reduce_nowait: {
     // Build int32_t __kmpc_nvptx_teams_reduce_nowait(int32_t global_tid,
     // int32_t num_vars, size_t reduce_size, void *reduce_data,
@@ -2703,8 +2736,8 @@ void CGOpenMPRuntimeNVPTX::emitReduction(
 
   bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
   bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
-  // FIXME: Add support for simd reduction.
-  assert((TeamsReduction || ParallelReduction) &&
+  bool SimdReduction = isOpenMPSimdDirective(Options.ReductionKind);
+  assert((TeamsReduction || ParallelReduction || SimdReduction) &&
          "Invalid reduction selection in emitReduction.");
 
   ASTContext &C = CGM.getContext();
@@ -2764,19 +2797,22 @@ void CGOpenMPRuntimeNVPTX::emitReduction(
   llvm::Value *InterWarpCopyFn =
       emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc);
 
-  llvm::Value *Res = nullptr;
-  if (ParallelReduction) {
-    llvm::Value *Args[] = {ThreadId,
-                           CGF.Builder.getInt32(RHSExprs.size()),
-                           ReductionArrayTySize,
-                           RL,
-                           ShuffleAndReduceFn,
-                           InterWarpCopyFn};
+  llvm::Value *Args[] = {ThreadId,
+                         CGF.Builder.getInt32(RHSExprs.size()),
+                         ReductionArrayTySize,
+                         RL,
+                         ShuffleAndReduceFn,
+                         InterWarpCopyFn};
 
+  llvm::Value *Res = nullptr;
+  if (ParallelReduction)
     Res = CGF.EmitRuntimeCall(
         createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_parallel_reduce_nowait),
         Args);
-  }
+  else if (SimdReduction)
+    Res = CGF.EmitRuntimeCall(
+        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_simd_reduce_nowait),
+        Args);
 
   if (TeamsReduction) {
     llvm::Value *ScratchPadCopyFn =
diff --git a/clang/test/OpenMP/nvptx_target_simd_codegen.cpp b/clang/test/OpenMP/nvptx_target_simd_codegen.cpp
index 9bb7617..001eb68 100644
--- a/clang/test/OpenMP/nvptx_target_simd_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_target_simd_codegen.cpp
@@ -9,9 +9,10 @@
 #define HEADER
 
 // Check that the execution mode of all 2 target regions on the gpu is set to SPMD Mode.
-// CHECK-DAG: {{@__omp_offloading_.+l24}}_exec_mode = weak constant i8 0
-// CHECK-DAG: {{@__omp_offloading_.+l29}}_exec_mode = weak constant i8 0
-// CHECK-DAG: {{@__omp_offloading_.+l34}}_exec_mode = weak constant i8 0
+// CHECK-DAG: {{@__omp_offloading_.+l25}}_exec_mode = weak constant i8 0
+// CHECK-DAG: {{@__omp_offloading_.+l30}}_exec_mode = weak constant i8 0
+// CHECK-DAG: {{@__omp_offloading_.+l35}}_exec_mode = weak constant i8 0
+// CHECK-DAG: {{@__omp_offloading_.+l40}}_exec_mode = weak constant i8 0
 
 #define N 1000
 
@@ -20,14 +21,14 @@ tx ftemplate(int n) {
   tx a[N];
   short aa[N];
   tx b[10];
-  
+
   #pragma omp target simd
   for(int i = 0; i < n; i++) {
     a[i] = 1;
   }
 
   #pragma omp target simd
-  for(int i = 0; i < n; i++) {
+  for (int i = 0; i < n; i++) {
     aa[i] += 1;
   }
 
@@ -36,6 +37,11 @@ tx ftemplate(int n) {
     b[i] += 1;
   }
 
+  #pragma omp target simd reduction(+:n)
+  for(int i = 0; i < 10; i++) {
+    b[i] += 1;
+  }
+
   return a[0];
 }
 
@@ -47,7 +53,7 @@ int bar(int n){
   return a;
 }
 
-// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+l24}}(
+// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+l25}}(
 // CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
 // CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]],
 // CHECK-NOT: call void @__kmpc_for_static_init
@@ -55,7 +61,7 @@ int bar(int n){
 // CHECK: call void @__kmpc_spmd_kernel_deinit()
 // CHECK: ret void
 
-// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+l29}}(
+// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+l30}}(
 // CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
 // CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]],
 // CHECK-NOT: call void @__kmpc_for_static_init
@@ -63,7 +69,7 @@ int bar(int n){
 // CHECK: call void @__kmpc_spmd_kernel_deinit()
 // CHECK: ret void
 
-// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+l34}}(
+// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+l35}}(
 // CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
 // CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]],
 // CHECK-NOT: call void @__kmpc_for_static_init
@@ -71,4 +77,16 @@ int bar(int n){
 // CHECK: call void @__kmpc_spmd_kernel_deinit()
 // CHECK: ret void
 
+// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+l40}}(
+// CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]],
+// CHECK-NOT: call void @__kmpc_for_static_init
+// CHECK-NOT: call void @__kmpc_for_static_fini
+// CHECK: [[RES:%.+]] = call i32 @__kmpc_nvptx_simd_reduce_nowait(i32 %{{.+}}, i32 1, i{{64|32}} {{8|4}}, i8* %{{.+}}, void (i8*, i16, i16, i16)* @{{.+}}, void (i8*, i32)* @{{.+}})
+// CHECK: switch i32 [[RES]]
+// CHECK: call void @__kmpc_nvptx_end_reduce_nowait(i32 %{{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_deinit()
+// CHECK: ret void
+
+
 #endif
-- 
2.7.4
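Illustrative usage (not part of the commit itself): a target simd reduction like the sketch below is the kind of construct this patch now lowers, in SPMD mode, to a call to __kmpc_nvptx_simd_reduce_nowait followed by __kmpc_nvptx_end_reduce_nowait, mirroring the new test case above. Function and variable names are hypothetical; compile with something like `clang++ -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda`.

    // Hypothetical example of a simd reduction inside a target region.
    int sum_array(const int *a, int n) {
      int sum = 0;
      // Each SIMD lane accumulates into its private copy of 'sum'; on the
      // NVPTX device the partial values are combined by the shuffle and
      // inter-warp copy helpers passed to __kmpc_nvptx_simd_reduce_nowait.
      #pragma omp target simd reduction(+:sum) map(to: a[0:n])
      for (int i = 0; i < n; i++)
        sum += a[i];
      return sum;
    }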