[OPENMP][NVPTX] Reduce memory usage in target region.

author Alexey Bataev <a.bataev@hotmail.com>

Fri, 12 Oct 2018 20:19:59 +0000 (20:19 +0000)

committer Alexey Bataev <a.bataev@hotmail.com>

Fri, 12 Oct 2018 20:19:59 +0000 (20:19 +0000)
author Alexey Bataev <a.bataev@hotmail.com>
Fri, 12 Oct 2018 20:19:59 +0000 (20:19 +0000)
committer Alexey Bataev <a.bataev@hotmail.com>
Fri, 12 Oct 2018 20:19:59 +0000 (20:19 +0000)
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp

index 7ae8377..4d5a580 100644 (file)
--- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
@@ -371,11 +371,11 @@ class CheckVarsEscapingDeclContext final
      }
    }
  
-  void buildRecordForGlobalizedVars(bool IsInTargetMasterThreadRegion) {
+  void buildRecordForGlobalizedVars(bool IsInTTDRegion) {
      assert(!GlobalizedRD &&
             "Record for globalized variables is built already.");
      ArrayRef<const ValueDecl *> EscapedDeclsForParallel, EscapedDeclsForTeams;
-    if (IsInTargetMasterThreadRegion)
+    if (IsInTTDRegion)
        EscapedDeclsForTeams = EscapedDecls.getArrayRef();
      else
        EscapedDeclsForParallel = EscapedDecls.getArrayRef();
@@ -527,9 +527,9 @@ public:
  
    /// Returns the record that handles all the escaped local variables and used
    /// instead of their original storage.
-  const RecordDecl *getGlobalizedRecord(bool IsInTargetMasterThreadRegion) {
+  const RecordDecl *getGlobalizedRecord(bool IsInTTDRegion) {
      if (!GlobalizedRD)
-      buildRecordForGlobalizedVars(IsInTargetMasterThreadRegion);
+      buildRecordForGlobalizedVars(IsInTTDRegion);
      return GlobalizedRD;
    }
  
@@ -1132,8 +1132,10 @@ void CGOpenMPRuntimeNVPTX::emitNonSPMDKernel(const OMPExecutableDirective &D,
      }
    } Action(EST, WST);
    CodeGen.setAction(Action);
+  IsInTTDRegion = true;
    emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                     IsOffloadEntry, CodeGen);
+  IsInTTDRegion = false;
  
    // Now change the name of the worker function to correspond to this target
    // region's entry function.
@@ -1246,8 +1248,10 @@ void CGOpenMPRuntimeNVPTX::emitSPMDKernel(const OMPExecutableDirective &D,
      }
    } Action(*this, EST, D);
    CodeGen.setAction(Action);
+  IsInTTDRegion = true;
    emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                     IsOffloadEntry, CodeGen);
+  IsInTTDRegion = false;
  }
  
  static void
@@ -1851,12 +1855,15 @@ llvm::Value *CGOpenMPRuntimeNVPTX::emitParallelOutlinedFunction(
      }
    } Action(IsInParallelRegion);
    CodeGen.setAction(Action);
+  bool PrevIsInTTDRegion = IsInTTDRegion;
+  IsInTTDRegion = false;
    bool PrevIsInTargetMasterThreadRegion = IsInTargetMasterThreadRegion;
    IsInTargetMasterThreadRegion = false;
    auto *OutlinedFun =
        cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction(
            D, ThreadIDVar, InnermostKind, CodeGen));
    IsInTargetMasterThreadRegion = PrevIsInTargetMasterThreadRegion;
+  IsInTTDRegion = PrevIsInTTDRegion;
    if (getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD &&
        !IsInParallelRegion) {
      llvm::Function *WrapperFun =
@@ -4142,7 +4149,7 @@ void CGOpenMPRuntimeNVPTX::emitFunctionProlog(CodeGenFunction &CGF,
    CheckVarsEscapingDeclContext VarChecker(CGF);
    VarChecker.Visit(Body);
    const RecordDecl *GlobalizedVarsRecord =
-      VarChecker.getGlobalizedRecord(IsInTargetMasterThreadRegion);
+      VarChecker.getGlobalizedRecord(IsInTTDRegion);
    ArrayRef<const ValueDecl *> EscapedVariableLengthDecls =
        VarChecker.getEscapedVariableLengthDecls();
    if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty())
@@ -4160,22 +4167,20 @@ void CGOpenMPRuntimeNVPTX::emitFunctionProlog(CodeGenFunction &CGF,
    for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
      assert(VD->isCanonicalDecl() && "Expected canonical declaration");
      const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD);
-    Data.insert(
-        std::make_pair(VD, MappedVarData(FD, IsInTargetMasterThreadRegion)));
+    Data.insert(std::make_pair(VD, MappedVarData(FD, IsInTTDRegion)));
    }
-  if (!IsInTargetMasterThreadRegion && !NeedToDelayGlobalization &&
-      !IsInParallelRegion) {
+  if (!IsInTTDRegion && !NeedToDelayGlobalization && !IsInParallelRegion) {
      CheckVarsEscapingDeclContext VarChecker(CGF);
      VarChecker.Visit(Body);
      I->getSecond().SecondaryGlobalRecord =
-        VarChecker.getGlobalizedRecord(/*IsInTargetMasterThreadRegion=*/true);
+        VarChecker.getGlobalizedRecord(/*IsInTTDRegion=*/true);
      I->getSecond().SecondaryLocalVarData.emplace();
      DeclToAddrMapTy &Data = I->getSecond().SecondaryLocalVarData.getValue();
      for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
        assert(VD->isCanonicalDecl() && "Expected canonical declaration");
        const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD);
-      Data.insert(std::make_pair(
-          VD, MappedVarData(FD, /*IsInTargetMasterThreadRegion=*/true)));
+      Data.insert(
+          std::make_pair(VD, MappedVarData(FD, /*IsInTTDRegion=*/true)));
      }
    }
    if (!NeedToDelayGlobalization) {
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h

index 53ef13b..d40e6cf 100644 (file)
--- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
@@ -360,6 +360,9 @@ private:
    /// true if we're emitting the code for the target region and next parallel
    /// region is L0 for sure.
    bool IsInTargetMasterThreadRegion = false;
+  /// true if we're currently emitting code for a target/teams/distribute
+  /// region, false otherwise.
+  bool IsInTTDRegion = false;
    /// true if we're definitely in the parallel region.
    bool IsInParallelRegion = false;
  
diff --git a/clang/test/OpenMP/nvptx_data_sharing.cpp b/clang/test/OpenMP/nvptx_data_sharing.cpp

index 8320676..0acb119 100644 (file)
--- a/clang/test/OpenMP/nvptx_data_sharing.cpp
+++ b/clang/test/OpenMP/nvptx_data_sharing.cpp
@@ -39,16 +39,10 @@ void test_ds(){
  // CK1: [[SHAREDARGS2:%.+]] = alloca i8**
  // CK1: call void @__kmpc_kernel_init
  // CK1: call void @__kmpc_data_sharing_init_stack
-// CK1: [[GLOBALSTACK:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i64 256, i16 0)
+// CK1: [[GLOBALSTACK:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i64 8, i16 0)
  // CK1: [[GLOBALSTACK2:%.+]] = bitcast i8* [[GLOBALSTACK]] to %struct._globalized_locals_ty*
-// CK1: [[A_ARR:%.+]] = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[GLOBALSTACK2]], i32 0, i32 0
-// CK1: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-// CK1: [[LID:%.+]] = and i32 [[TID]], 31
-// CK1: [[A:%.+]] = getelementptr inbounds [32 x i32], [32 x i32]* [[A_ARR]], i32 0, i32 [[LID]]
-// CK1: [[B_ARR:%.+]] = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[GLOBALSTACK2]], i32 0, i32 1
-// CK1: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-// CK1: [[LID:%.+]] = and i32 [[TID]], 31
-// CK1: [[B:%.+]] = getelementptr inbounds [32 x i32], [32 x i32]* [[B_ARR]], i32 0, i32 [[LID]]
+// CK1: [[A:%.+]] = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[GLOBALSTACK2]], i32 0, i32 0
+// CK1: [[B:%.+]] = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[GLOBALSTACK2]], i32 0, i32 1
  // CK1: store i32 10, i32* [[A]]
  // CK1: call void @__kmpc_kernel_prepare_parallel({{.*}}, i16 1)
  // CK1: call void @__kmpc_begin_sharing_variables(i8*** [[SHAREDARGS1]], i64 1)
diff --git a/clang/test/OpenMP/nvptx_parallel_codegen.cpp b/clang/test/OpenMP/nvptx_parallel_codegen.cpp

index 978804d..d1a3104 100644 (file)
--- a/clang/test/OpenMP/nvptx_parallel_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_parallel_codegen.cpp
@@ -318,14 +318,11 @@ int bar(int n){
  // CHECK-32: [[A_ADDR:%.+]] = alloca i32,
  // CHECK-64: [[A_ADDR:%.+]] = alloca i64,
  // CHECK-64: [[CONV:%.+]] = bitcast i64* [[A_ADDR]] to i32*
-// CHECK: [[STACK:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{64|32}} 128, i16 0)
+// CHECK: [[STACK:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{64|32}} 4, i16 0)
  // CHECK: [[BC:%.+]] = bitcast i8* [[STACK]] to %struct._globalized_locals_ty*
  // CHECK-32: [[A:%.+]] = load i32, i32* [[A_ADDR]],
  // CHECK-64: [[A:%.+]] = load i32, i32* [[CONV]],
-// CHECK: [[GLOBAL_A_ADDR_ARR:%.+]] = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[BC]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
-// CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-// CHECK: [[LID:%.+]] = and i32 [[TID]], 31
-// CHECK: [[GLOBAL_A_ADDR:%.+]] = getelementptr inbounds [32 x i32], [32 x i32]* [[GLOBAL_A_ADDR_ARR]], i32 0, i32 [[LID]]
+// CHECK: [[GLOBAL_A_ADDR:%.+]] = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[BC]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
  // CHECK: store i32 [[A]], i32* [[GLOBAL_A_ADDR]],
  
  // CHECK-LABEL: define internal void @{{.+}}(i32* noalias %{{.+}}, i32* noalias %{{.+}}, i32* dereferenceable{{.*}})
diff --git a/clang/test/OpenMP/nvptx_teams_codegen.cpp b/clang/test/OpenMP/nvptx_teams_codegen.cpp

index 91b372c..4e3f267 100644 (file)
--- a/clang/test/OpenMP/nvptx_teams_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_teams_codegen.cpp
@@ -36,13 +36,10 @@ int main (int argc, char **argv) {
  // CK1:  store {{.+}} 0, {{.+}},
  // CK1:  store i{{[0-9]+}} [[ARGC]], i{{[0-9]+}}* [[ARGCADDR]],
  // CK1-64:  [[CONV:%.+]] = bitcast i{{[0-9]+}}* [[ARGCADDR]] to i{{[0-9]+}}*
-// CK1:  call i8* @__kmpc_data_sharing_push_stack(i{{[0-9]+}} 128, i16 0)
+// CK1:  call i8* @__kmpc_data_sharing_push_stack(i{{[0-9]+}} 4, i16 0)
  // CK1-64:  [[ARG:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[CONV]]
  // CK1-32:  [[ARG:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[ARGCADDR]]
-// CK1:  [[ARGCADDR_ARR:%.+]] = getelementptr inbounds %struct.{{.*}}, %struct.{{.*}}* %{{.*}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0
-// CK1:  [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-// CK1:  [[LID:%.+]] = and i32 [[TID]], 31
-// CK1:  [[ARGCADDR:%.+]] = getelementptr inbounds [32 x i32], [32 x i32]* [[ARGCADDR_ARR]], i32 0, i32 [[LID]]
+// CK1:  [[ARGCADDR:%.+]] = getelementptr inbounds %struct.{{.*}}, %struct.{{.*}}* %{{.*}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0
  // CK1:  store i{{[0-9]+}} [[ARG]], i{{[0-9]+}}* [[ARGCADDR]],
  // CK1:  store i{{[0-9]+}}* [[ARGCADDR]], i{{[0-9]+}}** [[ARGCADDR_PTR]],
  // CK1:  [[ARGCADDR_PTR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[ARGCADDR_PTR]],
@@ -56,12 +53,9 @@ int main (int argc, char **argv) {
  // CK1: [[ARGCADDR_PTR:%.+]] = alloca i{{.+}}***,
  // CK1: [[ARGCADDR:%.+]] = alloca i{{.+}}**,
  // CK1: store i{{.+}}** [[ARGC]], i{{.+}}*** [[ARGCADDR]]
-// CK1: call i8* @__kmpc_data_sharing_push_stack(i{{[0-9]+}} {{128|256}}, i16 0)
+// CK1: call i8* @__kmpc_data_sharing_push_stack(i{{[0-9]+}} {{4|8}}, i16 0)
  // CK1: [[ARG:%.+]] = load i{{[0-9]+}}**, i{{[0-9]+}}*** [[ARGCADDR]]
-// CK1: [[ARGCADDR_ARR:%.+]] = getelementptr inbounds %struct.{{.*}}, %struct.{{.*}}* %{{.*}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0
-// CK1: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-// CK1: [[LID:%.+]] = and i32 [[TID]], 31
-// CK1: [[ARGCADDR:%.+]] = getelementptr inbounds [32 x i8**], [32 x i8**]* [[ARGCADDR_ARR]], i32 0, i32 [[LID]]
+// CK1: [[ARGCADDR:%.+]] = getelementptr inbounds %struct.{{.*}}, %struct.{{.*}}* %{{.*}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0
  // CK1: store i{{[0-9]+}}** [[ARG]], i{{[0-9]+}}*** [[ARGCADDR]],
  // CK1: store i8*** [[ARGCADDR]], i8**** [[ARGCADDR_PTR]],
  // CK1: [[ARGCADDR_PTR_REF:%.+]] = load i{{.+}}**, i{{.+}}*** [[ARGCADDR_PTR]],
@@ -117,13 +111,10 @@ int main (int argc, char **argv) {
  // CK2-64: [[ACONV:%.+]] = bitcast i64* [[AADDR]] to i32*
  // CK2-64: [[BCONV:%.+]] = bitcast i64* [[BADDR]] to i32*
  // CK2-64: [[CONV:%.+]] = bitcast i64* [[ARGCADDR]] to i32*
-// CK2:  call i8* @__kmpc_data_sharing_push_stack(i{{[0-9]+}} 128, i16 0)
+// CK2:  call i8* @__kmpc_data_sharing_push_stack(i{{[0-9]+}} 4, i16 0)
  // CK2-64:  [[ARG:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[CONV]]
  // CK2-32:  [[ARG:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[ARGCADDR]]
-// CK2:  [[ARGCADDR_ARR:%.+]] = getelementptr inbounds %struct.{{.*}}, %struct.{{.*}}* %{{.*}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0
-// CK2:  [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-// CK2:  [[LID:%.+]] = and i32 [[TID]], 31
-// CK2:  [[ARGCADDR:%.+]] = getelementptr inbounds [32 x i32], [32 x i32]* [[ARGCADDR_ARR]], i32 0, i32 [[LID]]
+// CK2:  [[ARGCADDR:%.+]] = getelementptr inbounds %struct.{{.*}}, %struct.{{.*}}* %{{.*}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0
  // CK2:  store i{{[0-9]+}} [[ARG]], i{{[0-9]+}}* [[ARGCADDR]],
  // CK2:  {{%.+}} = call i32 @__kmpc_global_thread_num(
  // CK2:  store i{{[0-9]+}}* [[ARGCADDR]], i{{[0-9]+}}** [[ARGCADDR_PTR]],
@@ -141,12 +132,9 @@ int main (int argc, char **argv) {
  // CK2: store i{{[0-9]+}} [[A_IN]], i{{[0-9]+}}* [[AADDR]],
  // CK2: store i{{[0-9]+}} [[B_IN]], i{{[0-9]+}}* [[BADDR]],
  // CK2: store i{{[0-9]+}}** [[ARGC]], i{{[0-9]+}}*** [[ARGCADDR]],
-// CK2: call i8* @__kmpc_data_sharing_push_stack(i{{[0-9]+}} {{128|256}}, i16 0)
+// CK2: call i8* @__kmpc_data_sharing_push_stack(i{{[0-9]+}} {{4|8}}, i16 0)
  // CK2: [[ARG:%.+]] = load i{{[0-9]+}}**, i{{[0-9]+}}*** [[ARGCADDR]]
-// CK2: [[ARGCADDR_ARR:%.+]] = getelementptr inbounds %struct.{{.*}}, %struct.{{.*}}* %{{.*}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0
-// CK2: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-// CK2: [[LID:%.+]] = and i32 [[TID]], 31
-// CK2: [[ARGCADDR:%.+]] = getelementptr inbounds [32 x i8**], [32 x i8**]* [[ARGCADDR_ARR]], i32 0, i32 [[LID]]
+// CK2: [[ARGCADDR:%.+]] = getelementptr inbounds %struct.{{.*}}, %struct.{{.*}}* %{{.*}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0
  // CK2: store i{{[0-9]+}}** [[ARG]], i{{[0-9]+}}*** [[ARGCADDR]],
  // CK2: {{%.+}} = call i32 @__kmpc_global_thread_num(
  // CK2: store i{{[0-9]+}}*** [[ARGCADDR]], i{{[0-9]+}}**** [[ARGCADDR_PTR]],
author	Alexey Bataev <a.bataev@hotmail.com>
	Fri, 12 Oct 2018 20:19:59 +0000 (20:19 +0000)
committer	Alexey Bataev <a.bataev@hotmail.com>
	Fri, 12 Oct 2018 20:19:59 +0000 (20:19 +0000)
clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp		patch \| blob \| history
clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h		patch \| blob \| history
clang/test/OpenMP/nvptx_data_sharing.cpp		patch \| blob \| history
clang/test/OpenMP/nvptx_parallel_codegen.cpp		patch \| blob \| history
clang/test/OpenMP/nvptx_teams_codegen.cpp		patch \| blob \| history