From fac26cf4ca4abd9142a4ed8939c167a6856f656e Mon Sep 17 00:00:00 2001
From: Alexey Bataev
Date: Wed, 2 May 2018 20:03:27 +0000
Subject: [PATCH] [OPENMP] Add support for reductions on simd directives in
 target regions.

Added codegen for `simd reduction()` constructs in target directives.

llvm-svn: 331393
---
 clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp      | 58 ++++++++++++++++++++-----
 clang/test/OpenMP/nvptx_target_simd_codegen.cpp | 34 +++++++++++----
 2 files changed, 73 insertions(+), 19 deletions(-)

diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
index 22e1e48..82be31f 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
@@ -61,6 +61,12 @@ enum OpenMPRTLFunctionNVPTX {
   /// lane_offset, int16_t shortCircuit),
   /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num));
   OMPRTL_NVPTX__kmpc_parallel_reduce_nowait,
+  /// \brief Call to __kmpc_nvptx_simd_reduce_nowait(kmp_int32
+  /// global_tid, kmp_int32 num_vars, size_t reduce_size, void* reduce_data,
+  /// void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
+  /// lane_offset, int16_t shortCircuit),
+  /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num));
+  OMPRTL_NVPTX__kmpc_simd_reduce_nowait,
   /// \brief Call to __kmpc_nvptx_teams_reduce_nowait(int32_t global_tid,
   /// int32_t num_vars, size_t reduce_size, void *reduce_data,
   /// void (*kmp_ShuffleReductFctPtr)(void *rhs, int16_t lane_id, int16_t
@@ -1028,6 +1034,33 @@ CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) {
         FnTy, /*Name=*/"__kmpc_nvptx_parallel_reduce_nowait");
     break;
   }
+  case OMPRTL_NVPTX__kmpc_simd_reduce_nowait: {
+    // Build int32_t kmpc_nvptx_simd_reduce_nowait(kmp_int32 global_tid,
+    // kmp_int32 num_vars, size_t reduce_size, void* reduce_data,
+    // void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
+    // lane_offset, int16_t Algorithm Version),
+    // void (*kmp_InterWarpCopyFctPtr)(void* src, int warp_num));
+    llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty,
+                                             CGM.Int16Ty, CGM.Int16Ty};
+    auto *ShuffleReduceFnTy =
+        llvm::FunctionType::get(CGM.VoidTy, ShuffleReduceTypeParams,
+                                /*isVarArg=*/false);
+    llvm::Type *InterWarpCopyTypeParams[] = {CGM.VoidPtrTy, CGM.Int32Ty};
+    auto *InterWarpCopyFnTy =
+        llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams,
+                                /*isVarArg=*/false);
+    llvm::Type *TypeParams[] = {CGM.Int32Ty,
+                                CGM.Int32Ty,
+                                CGM.SizeTy,
+                                CGM.VoidPtrTy,
+                                ShuffleReduceFnTy->getPointerTo(),
+                                InterWarpCopyFnTy->getPointerTo()};
+    auto *FnTy =
+        llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
+    RTLFn = CGM.CreateRuntimeFunction(
+        FnTy, /*Name=*/"__kmpc_nvptx_simd_reduce_nowait");
+    break;
+  }
   case OMPRTL_NVPTX__kmpc_teams_reduce_nowait: {
     // Build int32_t __kmpc_nvptx_teams_reduce_nowait(int32_t global_tid,
     // int32_t num_vars, size_t reduce_size, void *reduce_data,
@@ -2703,8 +2736,8 @@ void CGOpenMPRuntimeNVPTX::emitReduction(
 
   bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
   bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
-  // FIXME: Add support for simd reduction.
-  assert((TeamsReduction || ParallelReduction) &&
+  bool SimdReduction = isOpenMPSimdDirective(Options.ReductionKind);
+  assert((TeamsReduction || ParallelReduction || SimdReduction) &&
          "Invalid reduction selection in emitReduction.");
 
   ASTContext &C = CGM.getContext();
@@ -2764,19 +2797,22 @@ void CGOpenMPRuntimeNVPTX::emitReduction(
   llvm::Value *InterWarpCopyFn =
       emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc);
 
-  llvm::Value *Res = nullptr;
-  if (ParallelReduction) {
-    llvm::Value *Args[] = {ThreadId,
-                           CGF.Builder.getInt32(RHSExprs.size()),
-                           ReductionArrayTySize,
-                           RL,
-                           ShuffleAndReduceFn,
-                           InterWarpCopyFn};
+  llvm::Value *Args[] = {ThreadId,
+                         CGF.Builder.getInt32(RHSExprs.size()),
+                         ReductionArrayTySize,
+                         RL,
+                         ShuffleAndReduceFn,
+                         InterWarpCopyFn};
 
+  llvm::Value *Res = nullptr;
+  if (ParallelReduction)
     Res = CGF.EmitRuntimeCall(
         createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_parallel_reduce_nowait),
         Args);
-  }
+  else if (SimdReduction)
+    Res = CGF.EmitRuntimeCall(
+        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_simd_reduce_nowait),
+        Args);
 
   if (TeamsReduction) {
     llvm::Value *ScratchPadCopyFn =
diff --git a/clang/test/OpenMP/nvptx_target_simd_codegen.cpp b/clang/test/OpenMP/nvptx_target_simd_codegen.cpp
index 9bb7617..001eb68 100644
--- a/clang/test/OpenMP/nvptx_target_simd_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_target_simd_codegen.cpp
@@ -9,9 +9,10 @@
 #define HEADER
 
 // Check that the execution mode of all 2 target regions on the gpu is set to SPMD Mode.
-// CHECK-DAG: {{@__omp_offloading_.+l24}}_exec_mode = weak constant i8 0
-// CHECK-DAG: {{@__omp_offloading_.+l29}}_exec_mode = weak constant i8 0
-// CHECK-DAG: {{@__omp_offloading_.+l34}}_exec_mode = weak constant i8 0
+// CHECK-DAG: {{@__omp_offloading_.+l25}}_exec_mode = weak constant i8 0
+// CHECK-DAG: {{@__omp_offloading_.+l30}}_exec_mode = weak constant i8 0
+// CHECK-DAG: {{@__omp_offloading_.+l35}}_exec_mode = weak constant i8 0
+// CHECK-DAG: {{@__omp_offloading_.+l40}}_exec_mode = weak constant i8 0
 
 #define N 1000
 
@@ -20,14 +21,14 @@ tx ftemplate(int n) {
   tx a[N];
   short aa[N];
   tx b[10];
-  
+
   #pragma omp target simd
   for(int i = 0; i < n; i++) {
     a[i] = 1;
   }
 
   #pragma omp target simd
-  for(int i = 0; i < n; i++) {
+  for (int i = 0; i < n; i++) {
     aa[i] += 1;
   }
 
@@ -36,6 +37,11 @@ tx ftemplate(int n) {
     b[i] += 1;
   }
 
+  #pragma omp target simd reduction(+:n)
+  for(int i = 0; i < 10; i++) {
+    b[i] += 1;
+  }
+
   return a[0];
 }
 
@@ -47,7 +53,7 @@ int bar(int n){
   return a;
 }
 
-// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+l24}}(
+// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+l25}}(
 // CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
 // CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]],
 // CHECK-NOT: call void @__kmpc_for_static_init
@@ -55,7 +61,7 @@ int bar(int n){
 // CHECK: call void @__kmpc_spmd_kernel_deinit()
 // CHECK: ret void
 
-// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+l29}}(
+// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+l30}}(
 // CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
 // CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]],
 // CHECK-NOT: call void @__kmpc_for_static_init
@@ -63,7 +69,7 @@ int bar(int n){
 // CHECK: call void @__kmpc_spmd_kernel_deinit()
 // CHECK: ret void
 
-// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+l34}}(
+// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+l35}}(
 // CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
 // CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]],
 // CHECK-NOT: call void @__kmpc_for_static_init
@@ -71,4 +77,16 @@ int bar(int n){
 // CHECK: call void @__kmpc_spmd_kernel_deinit()
 // CHECK: ret void
 
+// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+l40}}(
+// CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]],
+// CHECK-NOT: call void @__kmpc_for_static_init
+// CHECK-NOT: call void @__kmpc_for_static_fini
+// CHECK: [[RES:%.+]] = call i32 @__kmpc_nvptx_simd_reduce_nowait(i32 %{{.+}}, i32 1, i{{64|32}} {{8|4}}, i8* %{{.+}}, void (i8*, i16, i16, i16)* @{{.+}}, void (i8*, i32)* @{{.+}})
+// CHECK: switch i32 [[RES]]
+// CHECK: call void @__kmpc_nvptx_end_reduce_nowait(i32 %{{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_deinit()
+// CHECK: ret void
+
+
 #endif
-- 
2.7.4
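Illustrative usage (not part of the commit itself): a target simd reduction like the sketch below is the kind of construct this patch now lowers, in SPMD mode, to a call to __kmpc_nvptx_simd_reduce_nowait followed by __kmpc_nvptx_end_reduce_nowait, mirroring the new test case above. Function and variable names are hypothetical; compile with something like `clang++ -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda`.

    // Hypothetical example of a simd reduction inside a target region.
    int sum_array(const int *a, int n) {
      int sum = 0;
      // Each SIMD lane accumulates into its private copy of 'sum'; on the
      // NVPTX device the partial values are combined by the shuffle and
      // inter-warp copy helpers passed to __kmpc_nvptx_simd_reduce_nowait.
      #pragma omp target simd reduction(+:sum) map(to: a[0:n])
      for (int i = 0; i < n; i++)
        sum += a[i];
      return sum;
    }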