From 4ac58d1a4b5969ea9067755154b83238463eab76 Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev@hotmail.com>
Date: Fri, 12 Oct 2018 20:19:59 +0000
Subject: [PATCH] [OPENMP][NVPTX]Reduce memory usage in target region.

Additional reduction of the global memory usage in the target regions
without parallel regions.

llvm-svn: 344413
---
 clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp   | 29 ++++++++++++++++------------
 clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h     |  3 +++
 clang/test/OpenMP/nvptx_data_sharing.cpp     | 12 +++---------
 clang/test/OpenMP/nvptx_parallel_codegen.cpp |  7 ++-----
 clang/test/OpenMP/nvptx_teams_codegen.cpp    | 28 ++++++++-------------------
 5 files changed, 33 insertions(+), 46 deletions(-)
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
index 7ae8377..4d5a580 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
@@ -371,11 +371,11 @@ class CheckVarsEscapingDeclContext final
     }
   }
 
-  void buildRecordForGlobalizedVars(bool IsInTargetMasterThreadRegion) {
+  void buildRecordForGlobalizedVars(bool IsInTTDRegion) {
     assert(!GlobalizedRD &&
            "Record for globalized variables is built already.");
     ArrayRef<const ValueDecl *> EscapedDeclsForParallel, EscapedDeclsForTeams;
-    if (IsInTargetMasterThreadRegion)
+    if (IsInTTDRegion)
       EscapedDeclsForTeams = EscapedDecls.getArrayRef();
     else
       EscapedDeclsForParallel = EscapedDecls.getArrayRef();
@@ -527,9 +527,9 @@ public:
 
   /// Returns the record that handles all the escaped local variables and used
   /// instead of their original storage.
-  const RecordDecl *getGlobalizedRecord(bool IsInTargetMasterThreadRegion) {
+  const RecordDecl *getGlobalizedRecord(bool IsInTTDRegion) {
     if (!GlobalizedRD)
-      buildRecordForGlobalizedVars(IsInTargetMasterThreadRegion);
+      buildRecordForGlobalizedVars(IsInTTDRegion);
     return GlobalizedRD;
   }
 
@@ -1132,8 +1132,10 @@ void CGOpenMPRuntimeNVPTX::emitNonSPMDKernel(const OMPExecutableDirective &D,
     }
   } Action(EST, WST);
   CodeGen.setAction(Action);
+  IsInTTDRegion = true;
   emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                    IsOffloadEntry, CodeGen);
+  IsInTTDRegion = false;
 
   // Now change the name of the worker function to correspond to this target
   // region's entry function.
@@ -1246,8 +1248,10 @@ void CGOpenMPRuntimeNVPTX::emitSPMDKernel(const OMPExecutableDirective &D,
     }
   } Action(*this, EST, D);
   CodeGen.setAction(Action);
+  IsInTTDRegion = true;
   emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                    IsOffloadEntry, CodeGen);
+  IsInTTDRegion = false;
 }
 
 static void
@@ -1851,12 +1855,15 @@ llvm::Value *CGOpenMPRuntimeNVPTX::emitParallelOutlinedFunction(
     }
   } Action(IsInParallelRegion);
   CodeGen.setAction(Action);
+  bool PrevIsInTTDRegion = IsInTTDRegion;
+  IsInTTDRegion = false;
   bool PrevIsInTargetMasterThreadRegion = IsInTargetMasterThreadRegion;
   IsInTargetMasterThreadRegion = false;
   auto *OutlinedFun =
       cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction(
           D, ThreadIDVar, InnermostKind, CodeGen));
   IsInTargetMasterThreadRegion = PrevIsInTargetMasterThreadRegion;
+  IsInTTDRegion = PrevIsInTTDRegion;
   if (getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD &&
       !IsInParallelRegion) {
     llvm::Function *WrapperFun =
@@ -4142,7 +4149,7 @@ void CGOpenMPRuntimeNVPTX::emitFunctionProlog(CodeGenFunction &CGF,
   CheckVarsEscapingDeclContext VarChecker(CGF);
   VarChecker.Visit(Body);
   const RecordDecl *GlobalizedVarsRecord =
-      VarChecker.getGlobalizedRecord(IsInTargetMasterThreadRegion);
+      VarChecker.getGlobalizedRecord(IsInTTDRegion);
   ArrayRef<const ValueDecl *> EscapedVariableLengthDecls =
       VarChecker.getEscapedVariableLengthDecls();
   if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty())
@@ -4160,22 +4167,20 @@ void CGOpenMPRuntimeNVPTX::emitFunctionProlog(CodeGenFunction &CGF,
   for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
     assert(VD->isCanonicalDecl() && "Expected canonical declaration");
     const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD);
-    Data.insert(
-        std::make_pair(VD, MappedVarData(FD, IsInTargetMasterThreadRegion)));
+    Data.insert(std::make_pair(VD, MappedVarData(FD, IsInTTDRegion)));
   }
-  if (!IsInTargetMasterThreadRegion && !NeedToDelayGlobalization &&
-      !IsInParallelRegion) {
+  if (!IsInTTDRegion && !NeedToDelayGlobalization && !IsInParallelRegion) {
     CheckVarsEscapingDeclContext VarChecker(CGF);
     VarChecker.Visit(Body);
     I->getSecond().SecondaryGlobalRecord =
-        VarChecker.getGlobalizedRecord(/*IsInTargetMasterThreadRegion=*/true);
+        VarChecker.getGlobalizedRecord(/*IsInTTDRegion=*/true);
     I->getSecond().SecondaryLocalVarData.emplace();
     DeclToAddrMapTy &Data = I->getSecond().SecondaryLocalVarData.getValue();
     for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
       assert(VD->isCanonicalDecl() && "Expected canonical declaration");
       const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD);
-      Data.insert(std::make_pair(
-          VD, MappedVarData(FD, /*IsInTargetMasterThreadRegion=*/true)));
+      Data.insert(
+          std::make_pair(VD, MappedVarData(FD, /*IsInTTDRegion=*/true)));
     }
   }
   if (!NeedToDelayGlobalization) {
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
index 53ef13b..d40e6cf 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
@@ -360,6 +360,9 @@ private:
   /// true if we're emitting the code for the target region and next parallel
   /// region is L0 for sure.
   bool IsInTargetMasterThreadRegion = false;
+  /// true if currently emitting code for target/teams/distribute region, false
+  /// - otherwise.
+  bool IsInTTDRegion = false;
   /// true if we're definitely in the parallel region.
   bool IsInParallelRegion = false;
 
diff --git a/clang/test/OpenMP/nvptx_data_sharing.cpp b/clang/test/OpenMP/nvptx_data_sharing.cpp
index 8320676..0acb119 100644
--- a/clang/test/OpenMP/nvptx_data_sharing.cpp
+++ b/clang/test/OpenMP/nvptx_data_sharing.cpp
@@ -39,16 +39,10 @@ void test_ds(){
 // CK1: [[SHAREDARGS2:%.+]] = alloca i8**
 // CK1: call void @__kmpc_kernel_init
 // CK1: call void @__kmpc_data_sharing_init_stack
-// CK1: [[GLOBALSTACK:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i64 256, i16 0)
+// CK1: [[GLOBALSTACK:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i64 8, i16 0)
 // CK1: [[GLOBALSTACK2:%.+]] = bitcast i8* [[GLOBALSTACK]] to %struct._globalized_locals_ty*
-// CK1: [[A_ARR:%.+]] = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[GLOBALSTACK2]], i32 0, i32 0
-// CK1: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-// CK1: [[LID:%.+]] = and i32 [[TID]], 31
-// CK1: [[A:%.+]] = getelementptr inbounds [32 x i32], [32 x i32]* [[A_ARR]], i32 0, i32 [[LID]]
-// CK1: [[B_ARR:%.+]] = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[GLOBALSTACK2]], i32 0, i32 1
-// CK1: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-// CK1: [[LID:%.+]] = and i32 [[TID]], 31
-// CK1: [[B:%.+]] = getelementptr inbounds [32 x i32], [32 x i32]* [[B_ARR]], i32 0, i32 [[LID]]
+// CK1: [[A:%.+]] = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[GLOBALSTACK2]], i32 0, i32 0
+// CK1: [[B:%.+]] = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[GLOBALSTACK2]], i32 0, i32 1
 // CK1: store i32 10, i32* [[A]]
 // CK1: call void @__kmpc_kernel_prepare_parallel({{.*}}, i16 1)
 // CK1: call void @__kmpc_begin_sharing_variables(i8*** [[SHAREDARGS1]], i64 1)
diff --git a/clang/test/OpenMP/nvptx_parallel_codegen.cpp b/clang/test/OpenMP/nvptx_parallel_codegen.cpp
index 978804d..d1a3104 100644
--- a/clang/test/OpenMP/nvptx_parallel_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_parallel_codegen.cpp
@@ -318,14 +318,11 @@ int bar(int n){
 // CHECK-32: [[A_ADDR:%.+]] = alloca i32,
 // CHECK-64: [[A_ADDR:%.+]] = alloca i64,
 // CHECK-64: [[CONV:%.+]] = bitcast i64* [[A_ADDR]] to i32*
-// CHECK: [[STACK:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{64|32}} 128, i16 0)
+// CHECK: [[STACK:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{64|32}} 4, i16 0)
 // CHECK: [[BC:%.+]] = bitcast i8* [[STACK]] to %struct._globalized_locals_ty*
 // CHECK-32: [[A:%.+]] = load i32, i32* [[A_ADDR]],
 // CHECK-64: [[A:%.+]] = load i32, i32* [[CONV]],
-// CHECK: [[GLOBAL_A_ADDR_ARR:%.+]] = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[BC]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
-// CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-// CHECK: [[LID:%.+]] = and i32 [[TID]], 31
-// CHECK: [[GLOBAL_A_ADDR:%.+]] = getelementptr inbounds [32 x i32], [32 x i32]* [[GLOBAL_A_ADDR_ARR]], i32 0, i32 [[LID]]
+// CHECK: [[GLOBAL_A_ADDR:%.+]] = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[BC]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
 // CHECK: store i32 [[A]], i32* [[GLOBAL_A_ADDR]],
 
 // CHECK-LABEL: define internal void @{{.+}}(i32* noalias %{{.+}}, i32* noalias %{{.+}}, i32* dereferenceable{{.*}})
diff --git a/clang/test/OpenMP/nvptx_teams_codegen.cpp b/clang/test/OpenMP/nvptx_teams_codegen.cpp
index 91b372c..4e3f267 100644
--- a/clang/test/OpenMP/nvptx_teams_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_teams_codegen.cpp
@@ -36,13 +36,10 @@ int main (int argc, char **argv) {
 // CK1:  store {{.+}} 0, {{.+}},
 // CK1:  store i{{[0-9]+}} [[ARGC]], i{{[0-9]+}}* [[ARGCADDR]],
 // CK1-64:  [[CONV:%.+]] = bitcast i{{[0-9]+}}* [[ARGCADDR]] to i{{[0-9]+}}*
-// CK1:  call i8* @__kmpc_data_sharing_push_stack(i{{[0-9]+}} 128, i16 0)
+// CK1:  call i8* @__kmpc_data_sharing_push_stack(i{{[0-9]+}} 4, i16 0)
 // CK1-64:  [[ARG:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[CONV]]
 // CK1-32:  [[ARG:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[ARGCADDR]]
-// CK1:  [[ARGCADDR_ARR:%.+]] = getelementptr inbounds %struct.{{.*}}, %struct.{{.*}}* %{{.*}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0
-// CK1:  [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-// CK1:  [[LID:%.+]] = and i32 [[TID]], 31
-// CK1:  [[ARGCADDR:%.+]] = getelementptr inbounds [32 x i32], [32 x i32]* [[ARGCADDR_ARR]], i32 0, i32 [[LID]]
+// CK1:  [[ARGCADDR:%.+]] = getelementptr inbounds %struct.{{.*}}, %struct.{{.*}}* %{{.*}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0
 // CK1:  store i{{[0-9]+}} [[ARG]], i{{[0-9]+}}* [[ARGCADDR]],
 // CK1:  store i{{[0-9]+}}* [[ARGCADDR]], i{{[0-9]+}}** [[ARGCADDR_PTR]],
 // CK1:  [[ARGCADDR_PTR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[ARGCADDR_PTR]],
@@ -56,12 +53,9 @@ int main (int argc, char **argv) {
 // CK1: [[ARGCADDR_PTR:%.+]] = alloca i{{.+}}***,
 // CK1: [[ARGCADDR:%.+]] = alloca i{{.+}}**,
 // CK1: store i{{.+}}** [[ARGC]], i{{.+}}*** [[ARGCADDR]]
-// CK1: call i8* @__kmpc_data_sharing_push_stack(i{{[0-9]+}} {{128|256}}, i16 0)
+// CK1: call i8* @__kmpc_data_sharing_push_stack(i{{[0-9]+}} {{4|8}}, i16 0)
 // CK1: [[ARG:%.+]] = load i{{[0-9]+}}**, i{{[0-9]+}}*** [[ARGCADDR]]
-// CK1: [[ARGCADDR_ARR:%.+]] = getelementptr inbounds %struct.{{.*}}, %struct.{{.*}}* %{{.*}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0
-// CK1: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-// CK1: [[LID:%.+]] = and i32 [[TID]], 31
-// CK1: [[ARGCADDR:%.+]] = getelementptr inbounds [32 x i8**], [32 x i8**]* [[ARGCADDR_ARR]], i32 0, i32 [[LID]]
+// CK1: [[ARGCADDR:%.+]] = getelementptr inbounds %struct.{{.*}}, %struct.{{.*}}* %{{.*}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0
 // CK1: store i{{[0-9]+}}** [[ARG]], i{{[0-9]+}}*** [[ARGCADDR]],
 // CK1: store i8*** [[ARGCADDR]], i8**** [[ARGCADDR_PTR]],
 // CK1: [[ARGCADDR_PTR_REF:%.+]] = load i{{.+}}**, i{{.+}}*** [[ARGCADDR_PTR]],
@@ -117,13 +111,10 @@ int main (int argc, char **argv) {
 // CK2-64: [[ACONV:%.+]] = bitcast i64* [[AADDR]] to i32*
 // CK2-64: [[BCONV:%.+]] = bitcast i64* [[BADDR]] to i32*
 // CK2-64: [[CONV:%.+]] = bitcast i64* [[ARGCADDR]] to i32*
-// CK2:  call i8* @__kmpc_data_sharing_push_stack(i{{[0-9]+}} 128, i16 0)
+// CK2:  call i8* @__kmpc_data_sharing_push_stack(i{{[0-9]+}} 4, i16 0)
 // CK2-64:  [[ARG:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[CONV]]
 // CK2-32:  [[ARG:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[ARGCADDR]]
-// CK2:  [[ARGCADDR_ARR:%.+]] = getelementptr inbounds %struct.{{.*}}, %struct.{{.*}}* %{{.*}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0
-// CK2:  [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-// CK2:  [[LID:%.+]] = and i32 [[TID]], 31
-// CK2:  [[ARGCADDR:%.+]] = getelementptr inbounds [32 x i32], [32 x i32]* [[ARGCADDR_ARR]], i32 0, i32 [[LID]]
+// CK2:  [[ARGCADDR:%.+]] = getelementptr inbounds %struct.{{.*}}, %struct.{{.*}}* %{{.*}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0
 // CK2:  store i{{[0-9]+}} [[ARG]], i{{[0-9]+}}* [[ARGCADDR]],
 // CK2:  {{%.+}} = call i32 @__kmpc_global_thread_num(
 // CK2:  store i{{[0-9]+}}* [[ARGCADDR]], i{{[0-9]+}}** [[ARGCADDR_PTR]],
@@ -141,12 +132,9 @@ int main (int argc, char **argv) {
 // CK2: store i{{[0-9]+}} [[A_IN]], i{{[0-9]+}}* [[AADDR]],
 // CK2: store i{{[0-9]+}} [[B_IN]], i{{[0-9]+}}* [[BADDR]],
 // CK2: store i{{[0-9]+}}** [[ARGC]], i{{[0-9]+}}*** [[ARGCADDR]],
-// CK2: call i8* @__kmpc_data_sharing_push_stack(i{{[0-9]+}} {{128|256}}, i16 0)
+// CK2: call i8* @__kmpc_data_sharing_push_stack(i{{[0-9]+}} {{4|8}}, i16 0)
 // CK2: [[ARG:%.+]] = load i{{[0-9]+}}**, i{{[0-9]+}}*** [[ARGCADDR]]
-// CK2: [[ARGCADDR_ARR:%.+]] = getelementptr inbounds %struct.{{.*}}, %struct.{{.*}}* %{{.*}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0
-// CK2: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-// CK2: [[LID:%.+]] = and i32 [[TID]], 31
-// CK2: [[ARGCADDR:%.+]] = getelementptr inbounds [32 x i8**], [32 x i8**]* [[ARGCADDR_ARR]], i32 0, i32 [[LID]]
+// CK2: [[ARGCADDR:%.+]] = getelementptr inbounds %struct.{{.*}}, %struct.{{.*}}* %{{.*}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0
 // CK2: store i{{[0-9]+}}** [[ARG]], i{{[0-9]+}}*** [[ARGCADDR]],
 // CK2: {{%.+}} = call i32 @__kmpc_global_thread_num(
 // CK2: store i{{[0-9]+}}*** [[ARGCADDR]], i{{[0-9]+}}**** [[ARGCADDR_PTR]],
-- 
2.7.4