From 07766f4070301072840b92c02c215391c7b5a870 Mon Sep 17 00:00:00 2001
From: Johannes Doerfert
Date: Fri, 1 Jul 2022 14:35:10 -0500
Subject: [PATCH] [Attributor] Move heap2stack allocas to the entry block if
 possible

If the allocation site is provably not inside a loop, we can emit the
heap2stack allocas directly in the function entry block. This helps to
get rid of them (via SROA) and avoids stacksave/restore intrinsics when
the function is inlined.
---
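A minimal sketch of the effect (illustrative IR; @f, @use, and %p are
made up for this note and do not appear in the tests below), assuming
heap-to-stack already fires for the call, i.e. the pointer does not
escape and the free is matched:

  ; Illustrative only; these names are invented for this example.
  declare noalias i8* @malloc(i64)
  declare void @free(i8*)
  declare void @use(i8* nocapture)

  define void @f(i1 %c) {
  entry:
    br i1 %c, label %then, label %join
  then:
    %p = call noalias i8* @malloc(i64 4)
    call void @use(i8* nocapture %p)
    call void @free(i8* %p)
    br label %join
  join:
    ret void
  }

Previously the rewritten `%p = alloca i8, i64 4` was emitted in %then,
in place of the malloc call. With this patch it is emitted at the top
of %entry, because the allocation size is known and %then is provably
not part of a loop (no irreducible control flow and LoopInfo reports no
enclosing loop). Allocations of unknown size, or in blocks that are (or
might be) inside a loop, keep the old placement next to the replaced
call.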
 llvm/lib/Transforms/IPO/AttributorAttributes.cpp   | 18 +++++++++++++-
 .../Transforms/Attributor/heap_to_stack_gpu.ll     | 29 ++++++++++++++++++++++
 .../Attributor/value-simplify-pointer-info.ll      | 10 ++++----
 llvm/test/Transforms/OpenMP/spmdization.ll         |  8 +++---
 4 files changed, 55 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
index ead2666..4d99ce7 100644
--- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
+++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
@@ -34,6 +34,7 @@
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/Argument.h"
 #include "llvm/IR/Assumptions.h"
+#include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DerivedTypes.h"
@@ -6241,6 +6242,17 @@ struct AAHeapToStackFunction final : public AAHeapToStack {
     Function *F = getAnchorScope();
     const auto *TLI = A.getInfoCache().getTargetLibraryInfoForFunction(*F);
 
+    LoopInfo *LI =
+        A.getInfoCache().getAnalysisResultForFunction<LoopAnalysis>(*F);
+    Optional<bool> MayContainIrreducibleControl;
+    auto IsInLoop = [&](BasicBlock &BB) {
+      if (!MayContainIrreducibleControl.has_value())
+        MayContainIrreducibleControl = mayContainIrreducibleControl(*F, LI);
+      if (MayContainIrreducibleControl.value())
+        return true;
+      return LI->getLoopFor(&BB) != nullptr;
+    };
+
     for (auto &It : AllocationInfos) {
       AllocationInfo &AI = *It.second;
       if (AI.Status == AllocationInfo::INVALID)
@@ -6282,6 +6294,10 @@ struct AAHeapToStackFunction final : public AAHeapToStack {
         Size = SizeOffsetPair.first;
       }
 
+      Instruction *IP = (!SizeAPI.has_value() || IsInLoop(*AI.CB->getParent()))
+                            ? AI.CB
+                            : &F->getEntryBlock().front();
+
       Align Alignment(1);
       if (MaybeAlign RetAlign = AI.CB->getRetAlign())
         Alignment = std::max(Alignment, *RetAlign);
@@ -6296,7 +6312,7 @@ struct AAHeapToStackFunction final : public AAHeapToStack {
       // TODO: Hoist the alloca towards the function entry.
       unsigned AS = DL.getAllocaAddrSpace();
       Instruction *Alloca = new AllocaInst(Type::getInt8Ty(F->getContext()), AS,
-                                           Size, Alignment, "", AI.CB);
+                                           Size, Alignment, "", IP);
 
       if (Alloca->getType() != AI.CB->getType())
         Alloca = BitCastInst::CreatePointerBitCastOrAddrSpaceCast(
diff --git a/llvm/test/Transforms/Attributor/heap_to_stack_gpu.ll b/llvm/test/Transforms/Attributor/heap_to_stack_gpu.ll
index ea94ae7..4851806 100644
--- a/llvm/test/Transforms/Attributor/heap_to_stack_gpu.ll
+++ b/llvm/test/Transforms/Attributor/heap_to_stack_gpu.ll
@@ -704,6 +704,35 @@ define void @test17b() {
   ret void
 }
 
+define void @move_alloca() {
+; IS________OPM-LABEL: define {{[^@]+}}@move_alloca() {
+; IS________OPM-NEXT:  entry:
+; IS________OPM-NEXT:    br label [[NOT_ENTRY:%.*]]
+; IS________OPM:       not_entry:
+; IS________OPM-NEXT:    [[TMP0:%.*]] = tail call noalias i8* @__kmpc_alloc_shared(i64 noundef 4)
+; IS________OPM-NEXT:    tail call void @usei8(i8* noalias nocapture nofree [[TMP0]]) #[[ATTR6]]
+; IS________OPM-NEXT:    tail call void @__kmpc_free_shared(i8* noalias nocapture [[TMP0]], i64 noundef 4)
+; IS________OPM-NEXT:    ret void
+;
+; IS________NPM-LABEL: define {{[^@]+}}@move_alloca() {
+; IS________NPM-NEXT:  entry:
+; IS________NPM-NEXT:    [[TMP0:%.*]] = alloca i8, i64 4, align 1, addrspace(5)
+; IS________NPM-NEXT:    br label [[NOT_ENTRY:%.*]]
+; IS________NPM:       not_entry:
+; IS________NPM-NEXT:    [[MALLOC_CAST:%.*]] = addrspacecast i8 addrspace(5)* [[TMP0]] to i8*
+; IS________NPM-NEXT:    tail call void @usei8(i8* noalias nocapture nofree [[MALLOC_CAST]]) #[[ATTR6]]
+; IS________NPM-NEXT:    ret void
+;
+entry:
+  br label %not_entry
+
+not_entry:
+  %0 = tail call noalias i8* @__kmpc_alloc_shared(i64 4)
+  tail call void @usei8(i8* nocapture nofree %0) willreturn nounwind nosync
+  tail call void @__kmpc_free_shared(i8* %0, i64 4)
+  ret void
+}
+
 ;.
 ; CHECK: attributes #[[ATTR0:[0-9]+]] = { nounwind willreturn }
diff --git a/llvm/test/Transforms/Attributor/value-simplify-pointer-info.ll b/llvm/test/Transforms/Attributor/value-simplify-pointer-info.ll
index ae3c60f..401bbc0 100644
--- a/llvm/test/Transforms/Attributor/value-simplify-pointer-info.ll
+++ b/llvm/test/Transforms/Attributor/value-simplify-pointer-info.ll
@@ -3978,16 +3978,16 @@ define dso_local void @test_nested_memory(float* %dst, double* %src) {
 ; IS__TUNIT_NPM-LABEL: define {{[^@]+}}@test_nested_memory
 ; IS__TUNIT_NPM-SAME: (float* nocapture nofree writeonly [[DST:%.*]], double* nocapture nofree readonly [[SRC:%.*]]) {
 ; IS__TUNIT_NPM-NEXT:  entry:
+; IS__TUNIT_NPM-NEXT:    [[TMP0:%.*]] = alloca i8, i64 24, align 1
 ; IS__TUNIT_NPM-NEXT:    [[LOCAL:%.*]] = alloca [[STRUCT_STY:%.*]], align 8
-; IS__TUNIT_NPM-NEXT:    [[TMP0:%.*]] = bitcast %struct.STy* [[LOCAL]] to i8*
+; IS__TUNIT_NPM-NEXT:    [[TMP1:%.*]] = bitcast %struct.STy* [[LOCAL]] to i8*
 ; IS__TUNIT_NPM-NEXT:    [[INNER:%.*]] = getelementptr inbounds [[STRUCT_STY]], %struct.STy* [[LOCAL]], i64 0, i32 2
-; IS__TUNIT_NPM-NEXT:    [[TMP1:%.*]] = alloca i8, i64 24, align 1
-; IS__TUNIT_NPM-NEXT:    [[DST1:%.*]] = bitcast i8* [[TMP1]] to float**
+; IS__TUNIT_NPM-NEXT:    [[DST1:%.*]] = bitcast i8* [[TMP0]] to float**
 ; IS__TUNIT_NPM-NEXT:    store float* [[DST]], float** [[DST1]], align 8
-; IS__TUNIT_NPM-NEXT:    [[SRC2:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i64 8
+; IS__TUNIT_NPM-NEXT:    [[SRC2:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 8
 ; IS__TUNIT_NPM-NEXT:    [[TMP2:%.*]] = bitcast i8* [[SRC2]] to double**
 ; IS__TUNIT_NPM-NEXT:    store double* [[SRC]], double** [[TMP2]], align 8
-; IS__TUNIT_NPM-NEXT:    store i8* [[TMP1]], i8** bitcast (%struct.STy** getelementptr inbounds ([[STRUCT_STY]], %struct.STy* @global, i64 0, i32 2) to i8**), align 8
+; IS__TUNIT_NPM-NEXT:    store i8* [[TMP0]], i8** bitcast (%struct.STy** getelementptr inbounds ([[STRUCT_STY]], %struct.STy* @global, i64 0, i32 2) to i8**), align 8
 ; IS__TUNIT_NPM-NEXT:    call fastcc void @nested_memory_callee() #[[ATTR15:[0-9]+]]
 ; IS__TUNIT_NPM-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/OpenMP/spmdization.ll b/llvm/test/Transforms/OpenMP/spmdization.ll
index 0feabb7..9afed08 100644
--- a/llvm/test/Transforms/OpenMP/spmdization.ll
+++ b/llvm/test/Transforms/OpenMP/spmdization.ll
@@ -702,8 +702,8 @@ define internal void @__omp_outlined__2(i32* noalias %.global_tid., i32* noalias
 ; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__2
 ; AMDGPU-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
 ; AMDGPU-NEXT:  entry:
-; AMDGPU-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
 ; AMDGPU-NEXT:    [[TMP0:%.*]] = alloca i8, i64 4, align 4, addrspace(5)
+; AMDGPU-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
 ; AMDGPU-NEXT:    [[MALLOC_CAST:%.*]] = addrspacecast i8 addrspace(5)* [[TMP0]] to i8*
 ; AMDGPU-NEXT:    [[X_ON_STACK:%.*]] = bitcast i8* [[MALLOC_CAST]] to i32*
 ; AMDGPU-NEXT:    call void @use(i32* nocapture [[X_ON_STACK]]) #[[ATTR7]]
@@ -725,8 +725,8 @@ define internal void @__omp_outlined__2(i32* noalias %.global_tid., i32* noalias
 ; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__2
 ; NVPTX-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
 ; NVPTX-NEXT:  entry:
-; NVPTX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
 ; NVPTX-NEXT:    [[TMP0:%.*]] = alloca i8, i64 4, align 4
+; NVPTX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
 ; NVPTX-NEXT:    [[X_ON_STACK:%.*]] = bitcast i8* [[TMP0]] to i32*
 ; NVPTX-NEXT:    call void @use(i32* nocapture [[X_ON_STACK]]) #[[ATTR7]]
 ; NVPTX-NEXT:    br label [[FOR_COND:%.*]]
@@ -747,8 +747,8 @@ define internal void @__omp_outlined__2(i32* noalias %.global_tid., i32* noalias
 ; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__2
 ; AMDGPU-DISABLED-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
 ; AMDGPU-DISABLED-NEXT:  entry:
-; AMDGPU-DISABLED-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
 ; AMDGPU-DISABLED-NEXT:    [[TMP0:%.*]] = alloca i8, i64 4, align 4, addrspace(5)
+; AMDGPU-DISABLED-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
 ; AMDGPU-DISABLED-NEXT:    [[MALLOC_CAST:%.*]] = addrspacecast i8 addrspace(5)* [[TMP0]] to i8*
 ; AMDGPU-DISABLED-NEXT:    [[X_ON_STACK:%.*]] = bitcast i8* [[MALLOC_CAST]] to i32*
 ; AMDGPU-DISABLED-NEXT:    call void @use(i32* nocapture [[X_ON_STACK]]) #[[ATTR7]]
@@ -770,8 +770,8 @@ define internal void @__omp_outlined__2(i32* noalias %.global_tid., i32* noalias
 ; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__2
 ; NVPTX-DISABLED-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
 ; NVPTX-DISABLED-NEXT:  entry:
-; NVPTX-DISABLED-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
 ; NVPTX-DISABLED-NEXT:    [[TMP0:%.*]] = alloca i8, i64 4, align 4
+; NVPTX-DISABLED-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
 ; NVPTX-DISABLED-NEXT:    [[X_ON_STACK:%.*]] = bitcast i8* [[TMP0]] to i32*
 ; NVPTX-DISABLED-NEXT:    call void @use(i32* nocapture [[X_ON_STACK]]) #[[ATTR7]]
 ; NVPTX-DISABLED-NEXT:    br label [[FOR_COND:%.*]]
--
2.7.4