From 07766f4070301072840b92c02c215391c7b5a870 Mon Sep 17 00:00:00 2001
From: Johannes Doerfert
Date: Fri, 1 Jul 2022 14:35:10 -0500
Subject: [PATCH] [Attributor] Move heap2stack allocas to the entry block if
 possible

If the allocation site is provably not inside a loop, we can emit the
heap2stack allocas directly in the function entry block. This helps to
get rid of them (via SROA) and avoids stacksave/restore intrinsics when
the function is inlined.
---
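A minimal sketch of the effect (illustrative IR; @f, @use, and %p are
made up for this note and do not appear in the tests below), assuming
heap-to-stack already fires for the call, i.e. the pointer does not
escape and the free is matched:

  ; Illustrative only; these names are invented for this example.
  declare noalias i8* @malloc(i64)
  declare void @free(i8*)
  declare void @use(i8* nocapture)

  define void @f(i1 %c) {
  entry:
    br i1 %c, label %then, label %join
  then:
    %p = call noalias i8* @malloc(i64 4)
    call void @use(i8* nocapture %p)
    call void @free(i8* %p)
    br label %join
  join:
    ret void
  }

Previously the rewritten `%p = alloca i8, i64 4` was emitted in %then,
in place of the malloc call. With this patch it is emitted at the top
of %entry, because the allocation size is known and %then is provably
not part of a loop (no irreducible control flow and LoopInfo reports no
enclosing loop). Allocations of unknown size, or in blocks that are (or
might be) inside a loop, keep the old placement next to the replaced
call.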
 llvm/lib/Transforms/IPO/AttributorAttributes.cpp   | 18 +++++++++++++-
 .../Transforms/Attributor/heap_to_stack_gpu.ll     | 29 ++++++++++++++++++++++
 .../Attributor/value-simplify-pointer-info.ll      | 10 ++++----
 llvm/test/Transforms/OpenMP/spmdization.ll         |  8 +++---
 4 files changed, 55 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
index ead2666..4d99ce7 100644
--- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
+++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
@@ -34,6 +34,7 @@
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/Argument.h"
 #include "llvm/IR/Assumptions.h"
+#include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DerivedTypes.h"
@@ -6241,6 +6242,17 @@ struct AAHeapToStackFunction final : public AAHeapToStack {
     Function *F = getAnchorScope();
     const auto *TLI = A.getInfoCache().getTargetLibraryInfoForFunction(*F);
 
+    LoopInfo *LI =
+        A.getInfoCache().getAnalysisResultForFunction<LoopAnalysis>(*F);
+    Optional<bool> MayContainIrreducibleControl;
+    auto IsInLoop = [&](BasicBlock &BB) {
+      if (!MayContainIrreducibleControl.has_value())
+        MayContainIrreducibleControl = mayContainIrreducibleControl(*F, LI);
+      if (MayContainIrreducibleControl.value())
+        return true;
+      return LI->getLoopFor(&BB) != nullptr;
+    };
+
     for (auto &It : AllocationInfos) {
       AllocationInfo &AI = *It.second;
       if (AI.Status == AllocationInfo::INVALID)
@@ -6282,6 +6294,10 @@ struct AAHeapToStackFunction final : public AAHeapToStack {
         Size = SizeOffsetPair.first;
       }
 
+      Instruction *IP = (!SizeAPI.has_value() || IsInLoop(*AI.CB->getParent()))
+                            ? AI.CB
+                            : &F->getEntryBlock().front();
+
       Align Alignment(1);
       if (MaybeAlign RetAlign = AI.CB->getRetAlign())
         Alignment = std::max(Alignment, *RetAlign);
@@ -6296,7 +6312,7 @@ struct AAHeapToStackFunction final : public AAHeapToStack {
       // TODO: Hoist the alloca towards the function entry.
       unsigned AS = DL.getAllocaAddrSpace();
       Instruction *Alloca = new AllocaInst(Type::getInt8Ty(F->getContext()), AS,
-                                           Size, Alignment, "", AI.CB);
+                                           Size, Alignment, "", IP);
 
       if (Alloca->getType() != AI.CB->getType())
         Alloca = BitCastInst::CreatePointerBitCastOrAddrSpaceCast(
diff --git a/llvm/test/Transforms/Attributor/heap_to_stack_gpu.ll b/llvm/test/Transforms/Attributor/heap_to_stack_gpu.ll
index ea94ae7..4851806 100644
--- a/llvm/test/Transforms/Attributor/heap_to_stack_gpu.ll
+++ b/llvm/test/Transforms/Attributor/heap_to_stack_gpu.ll
@@ -704,6 +704,35 @@ define void @test17b() {
   ret void
 }
 
+define void @move_alloca() {
+; IS________OPM-LABEL: define {{[^@]+}}@move_alloca() {
+; IS________OPM-NEXT:  entry:
+; IS________OPM-NEXT:    br label [[NOT_ENTRY:%.*]]
+; IS________OPM:       not_entry:
+; IS________OPM-NEXT:    [[TMP0:%.*]] = tail call noalias i8* @__kmpc_alloc_shared(i64 noundef 4)
+; IS________OPM-NEXT:    tail call void @usei8(i8* noalias nocapture nofree [[TMP0]]) #[[ATTR6]]
+; IS________OPM-NEXT:    tail call void @__kmpc_free_shared(i8* noalias nocapture [[TMP0]], i64 noundef 4)
+; IS________OPM-NEXT:    ret void
+;
+; IS________NPM-LABEL: define {{[^@]+}}@move_alloca() {
+; IS________NPM-NEXT:  entry:
+; IS________NPM-NEXT:    [[TMP0:%.*]] = alloca i8, i64 4, align 1, addrspace(5)
+; IS________NPM-NEXT:    br label [[NOT_ENTRY:%.*]]
+; IS________NPM:       not_entry:
+; IS________NPM-NEXT:    [[MALLOC_CAST:%.*]] = addrspacecast i8 addrspace(5)* [[TMP0]] to i8*
+; IS________NPM-NEXT:    tail call void @usei8(i8* noalias nocapture nofree [[MALLOC_CAST]]) #[[ATTR6]]
+; IS________NPM-NEXT:    ret void
+;
+entry:
+  br label %not_entry
+
+not_entry:
+  %0 = tail call noalias i8* @__kmpc_alloc_shared(i64 4)
+  tail call void @usei8(i8* nocapture nofree %0) willreturn nounwind nosync
+  tail call void @__kmpc_free_shared(i8* %0, i64 4)
+  ret void
+}
+
 ;.
 ; CHECK: attributes #[[ATTR0:[0-9]+]] = { nounwind willreturn }
diff --git a/llvm/test/Transforms/Attributor/value-simplify-pointer-info.ll b/llvm/test/Transforms/Attributor/value-simplify-pointer-info.ll
index ae3c60f..401bbc0 100644
--- a/llvm/test/Transforms/Attributor/value-simplify-pointer-info.ll
+++ b/llvm/test/Transforms/Attributor/value-simplify-pointer-info.ll
@@ -3978,16 +3978,16 @@ define dso_local void @test_nested_memory(float* %dst, double* %src) {
 ; IS__TUNIT_NPM-LABEL: define {{[^@]+}}@test_nested_memory
 ; IS__TUNIT_NPM-SAME: (float* nocapture nofree writeonly [[DST:%.*]], double* nocapture nofree readonly [[SRC:%.*]]) {
 ; IS__TUNIT_NPM-NEXT:  entry:
+; IS__TUNIT_NPM-NEXT:    [[TMP0:%.*]] = alloca i8, i64 24, align 1
 ; IS__TUNIT_NPM-NEXT:    [[LOCAL:%.*]] = alloca [[STRUCT_STY:%.*]], align 8
-; IS__TUNIT_NPM-NEXT:    [[TMP0:%.*]] = bitcast %struct.STy* [[LOCAL]] to i8*
+; IS__TUNIT_NPM-NEXT:    [[TMP1:%.*]] = bitcast %struct.STy* [[LOCAL]] to i8*
 ; IS__TUNIT_NPM-NEXT:    [[INNER:%.*]] = getelementptr inbounds [[STRUCT_STY]], %struct.STy* [[LOCAL]], i64 0, i32 2
-; IS__TUNIT_NPM-NEXT:    [[TMP1:%.*]] = alloca i8, i64 24, align 1
-; IS__TUNIT_NPM-NEXT:    [[DST1:%.*]] = bitcast i8* [[TMP1]] to float**
+; IS__TUNIT_NPM-NEXT:    [[DST1:%.*]] = bitcast i8* [[TMP0]] to float**
 ; IS__TUNIT_NPM-NEXT:    store float* [[DST]], float** [[DST1]], align 8
-; IS__TUNIT_NPM-NEXT:    [[SRC2:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i64 8
+; IS__TUNIT_NPM-NEXT:    [[SRC2:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 8
 ; IS__TUNIT_NPM-NEXT:    [[TMP2:%.*]] = bitcast i8* [[SRC2]] to double**
 ; IS__TUNIT_NPM-NEXT:    store double* [[SRC]], double** [[TMP2]], align 8
-; IS__TUNIT_NPM-NEXT:    store i8* [[TMP1]], i8** bitcast (%struct.STy** getelementptr inbounds ([[STRUCT_STY]], %struct.STy* @global, i64 0, i32 2) to i8**), align 8
+; IS__TUNIT_NPM-NEXT:    store i8* [[TMP0]], i8** bitcast (%struct.STy** getelementptr inbounds ([[STRUCT_STY]], %struct.STy* @global, i64 0, i32 2) to i8**), align 8
 ; IS__TUNIT_NPM-NEXT:    call fastcc void @nested_memory_callee() #[[ATTR15:[0-9]+]]
 ; IS__TUNIT_NPM-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/OpenMP/spmdization.ll b/llvm/test/Transforms/OpenMP/spmdization.ll
index 0feabb7..9afed08 100644
--- a/llvm/test/Transforms/OpenMP/spmdization.ll
+++ b/llvm/test/Transforms/OpenMP/spmdization.ll
@@ -702,8 +702,8 @@ define internal void @__omp_outlined__2(i32* noalias %.global_tid., i32* noalias
 ; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__2
 ; AMDGPU-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
 ; AMDGPU-NEXT:  entry:
-; AMDGPU-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
 ; AMDGPU-NEXT:    [[TMP0:%.*]] = alloca i8, i64 4, align 4, addrspace(5)
+; AMDGPU-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
 ; AMDGPU-NEXT:    [[MALLOC_CAST:%.*]] = addrspacecast i8 addrspace(5)* [[TMP0]] to i8*
 ; AMDGPU-NEXT:    [[X_ON_STACK:%.*]] = bitcast i8* [[MALLOC_CAST]] to i32*
 ; AMDGPU-NEXT:    call void @use(i32* nocapture [[X_ON_STACK]]) #[[ATTR7]]
@@ -725,8 +725,8 @@ define internal void @__omp_outlined__2(i32* noalias %.global_tid., i32* noalias
 ; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__2
 ; NVPTX-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
 ; NVPTX-NEXT:  entry:
-; NVPTX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
 ; NVPTX-NEXT:    [[TMP0:%.*]] = alloca i8, i64 4, align 4
+; NVPTX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
 ; NVPTX-NEXT:    [[X_ON_STACK:%.*]] = bitcast i8* [[TMP0]] to i32*
 ; NVPTX-NEXT:    call void @use(i32* nocapture [[X_ON_STACK]]) #[[ATTR7]]
 ; NVPTX-NEXT:    br label [[FOR_COND:%.*]]
@@ -747,8 +747,8 @@ define internal void @__omp_outlined__2(i32* noalias %.global_tid., i32* noalias
 ; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__2
 ; AMDGPU-DISABLED-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
 ; AMDGPU-DISABLED-NEXT:  entry:
-; AMDGPU-DISABLED-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
 ; AMDGPU-DISABLED-NEXT:    [[TMP0:%.*]] = alloca i8, i64 4, align 4, addrspace(5)
+; AMDGPU-DISABLED-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
 ; AMDGPU-DISABLED-NEXT:    [[MALLOC_CAST:%.*]] = addrspacecast i8 addrspace(5)* [[TMP0]] to i8*
 ; AMDGPU-DISABLED-NEXT:    [[X_ON_STACK:%.*]] = bitcast i8* [[MALLOC_CAST]] to i32*
 ; AMDGPU-DISABLED-NEXT:    call void @use(i32* nocapture [[X_ON_STACK]]) #[[ATTR7]]
@@ -770,8 +770,8 @@ define internal void @__omp_outlined__2(i32* noalias %.global_tid., i32* noalias
 ; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__2
 ; NVPTX-DISABLED-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
 ; NVPTX-DISABLED-NEXT:  entry:
-; NVPTX-DISABLED-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
 ; NVPTX-DISABLED-NEXT:    [[TMP0:%.*]] = alloca i8, i64 4, align 4
+; NVPTX-DISABLED-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
 ; NVPTX-DISABLED-NEXT:    [[X_ON_STACK:%.*]] = bitcast i8* [[TMP0]] to i32*
 ; NVPTX-DISABLED-NEXT:    call void @use(i32* nocapture [[X_ON_STACK]]) #[[ATTR7]]
 ; NVPTX-DISABLED-NEXT:    br label [[FOR_COND:%.*]]
--
2.7.4