From: Stanislav Mekhanoshin
Date: Thu, 23 Sep 2021 23:03:48 +0000 (-0700)
Subject: [AMDGPU] Limit promote alloca max size in functions
X-Git-Tag: upstream/15.0.7~30550
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=cf74ef134c9a8089d8997144d54628952c6d4552;p=platform%2Fupstream%2Fllvm.git

[AMDGPU] Limit promote alloca max size in functions

Non-entry functions have only 32 caller-saved VGPRs available. If we
promote an alloca to consume more registers than that, we will have to
spill CSRs. There is no reason to eliminate a scratch access only to get
another scratch access instead.

Differential Revision: https://reviews.llvm.org/D110372
---

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 737713b..de1af43 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -21,6 +21,7 @@
 #include "llvm/IR/IntrinsicsR600.h"
 #include "llvm/Pass.h"
 #include "llvm/Target/TargetMachine.h"
+#include "Utils/AMDGPUBaseInfo.h"
 
 #define DEBUG_TYPE "amdgpu-promote-alloca"
 
@@ -176,6 +177,10 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F) {
   if (IsAMDGCN) {
     const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
     MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first);
+    // A non-entry function has only 32 caller preserved registers.
+    // Do not promote alloca which will force spilling.
+    if (!AMDGPU::isEntryFunctionCC(F.getCallingConv()))
+      MaxVGPRs = std::min(MaxVGPRs, 32u);
   } else {
     MaxVGPRs = 128;
   }
@@ -1107,6 +1112,10 @@ bool promoteAllocasToVector(Function &F, TargetMachine &TM) {
   if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
     const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
     MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first);
+    // A non-entry function has only 32 caller preserved registers.
+    // Do not promote alloca which will force spilling.
+    if (!AMDGPU::isEntryFunctionCC(F.getCallingConv()))
+      MaxVGPRs = std::min(MaxVGPRs, 32u);
   } else {
     MaxVGPRs = 128;
   }
diff --git a/llvm/test/CodeGen/AMDGPU/vector-alloca-limits.ll b/llvm/test/CodeGen/AMDGPU/vector-alloca-limits.ll
index 6235b9e..c1cf6cf 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-alloca-limits.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-alloca-limits.ll
@@ -131,6 +131,38 @@ entry:
   ret void
 }
 
+; OPT-LABEL: @alloca_9xi64_max256(
+; OPT-NOT: alloca
+; OPT: <9 x i64>
+; LIMIT32: alloca
+; LIMIT32-NOT: <9 x i64>
+define amdgpu_kernel void @alloca_9xi64_max256(i64 addrspace(1)* %out, i32 %index) #2 {
+entry:
+  %tmp = alloca [9 x i64], addrspace(5)
+  %x = getelementptr [9 x i64], [9 x i64] addrspace(5)* %tmp, i32 0, i32 0
+  store i64 0, i64 addrspace(5)* %x
+  %tmp1 = getelementptr [9 x i64], [9 x i64] addrspace(5)* %tmp, i32 0, i32 %index
+  %tmp2 = load i64, i64 addrspace(5)* %tmp1
+  store i64 %tmp2, i64 addrspace(1)* %out
+  ret void
+}
+
+; OPT-LABEL: @func_alloca_9xi64_max256(
+; OPT: alloca
+; OPT-NOT: <9 x i64>
+; LIMIT32: alloca
+; LIMIT32-NOT: <9 x i64>
+define void @func_alloca_9xi64_max256(i64 addrspace(1)* %out, i32 %index) #2 {
+entry:
+  %tmp = alloca [9 x i64], addrspace(5)
+  %x = getelementptr [9 x i64], [9 x i64] addrspace(5)* %tmp, i32 0, i32 0
+  store i64 0, i64 addrspace(5)* %x
+  %tmp1 = getelementptr [9 x i64], [9 x i64] addrspace(5)* %tmp, i32 0, i32 %index
+  %tmp2 = load i64, i64 addrspace(5)* %tmp1
+  store i64 %tmp2, i64 addrspace(1)* %out
+  ret void
+}
+
 attributes #0 = { "amdgpu-flat-work-group-size"="1,1024" }
 attributes #1 = { "amdgpu-flat-work-group-size"="1,512" }
 attributes #2 = { "amdgpu-flat-work-group-size"="1,256" }
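
---

For illustration only, not part of the patch: the heart of the change is a clamp on the
VGPR budget before an alloca is vectorized. Below is a minimal standalone C++ sketch of
that clamping logic; isEntryFunction() and promoteAllocaVGPRBudget() are hypothetical
stand-ins for AMDGPU::isEntryFunctionCC(F.getCallingConv()) and the in-pass computation,
and the subtarget query is replaced by a plain parameter.

  // Standalone sketch, not LLVM code: mirrors the clamp the patch adds in
  // AMDGPUPromoteAllocaImpl::run() and promoteAllocasToVector().
  #include <algorithm>
  #include <cstdio>

  // Stand-in for AMDGPU::isEntryFunctionCC(F.getCallingConv()).
  static bool isEntryFunction(bool IsKernel) { return IsKernel; }

  // Returns the VGPR budget promote-alloca may spend on a vectorized alloca.
  static unsigned promoteAllocaVGPRBudget(unsigned MaxVGPRs, bool IsKernel) {
    // A non-entry function has only 32 caller-saved VGPRs; promoting an
    // alloca past that budget would trade scratch accesses for CSR spills.
    if (!isEntryFunction(IsKernel))
      MaxVGPRs = std::min(MaxVGPRs, 32u);
    return MaxVGPRs;
  }

  int main() {
    std::printf("kernel budget: %u\n", promoteAllocaVGPRBudget(256, true));  // 256
    std::printf("func budget:   %u\n", promoteAllocaVGPRBudget(256, false)); // 32
  }

The new tests exercise exactly this split: the same [9 x i64] alloca is promoted to a
<9 x i64> vector in the amdgpu_kernel entry point but left as scratch in the non-entry
function.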