[AMDGPU] Limit promote alloca max size in functions

author Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>

Thu, 23 Sep 2021 23:03:48 +0000 (16:03 -0700)

committer Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>

Fri, 24 Sep 2021 20:38:39 +0000 (13:38 -0700)
author Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
Thu, 23 Sep 2021 23:03:48 +0000 (16:03 -0700)
committer Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
Fri, 24 Sep 2021 20:38:39 +0000 (13:38 -0700)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp

index 737713b..de1af43 100644 (file)
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -21,6 +21,7 @@
  #include "llvm/IR/IntrinsicsR600.h"
  #include "llvm/Pass.h"
  #include "llvm/Target/TargetMachine.h"
+#include "Utils/AMDGPUBaseInfo.h"
  
  #define DEBUG_TYPE "amdgpu-promote-alloca"
  
@@ -176,6 +177,10 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F) {
    if (IsAMDGCN) {
      const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
      MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first);
+    // A non-entry function has only 32 caller-preserved registers.
+    // Do not promote an alloca if doing so would force spilling.
+    if (!AMDGPU::isEntryFunctionCC(F.getCallingConv()))
+      MaxVGPRs = std::min(MaxVGPRs, 32u);
    } else {
      MaxVGPRs = 128;
    }
@@ -1107,6 +1112,10 @@ bool promoteAllocasToVector(Function &F, TargetMachine &TM) {
    if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
      const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
      MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first);
+    // A non-entry function has only 32 caller-preserved registers.
+    // Do not promote an alloca if doing so would force spilling.
+    if (!AMDGPU::isEntryFunctionCC(F.getCallingConv()))
+      MaxVGPRs = std::min(MaxVGPRs, 32u);
    } else {
      MaxVGPRs = 128;
    }
diff --git a/llvm/test/CodeGen/AMDGPU/vector-alloca-limits.ll b/llvm/test/CodeGen/AMDGPU/vector-alloca-limits.ll

index 6235b9e..c1cf6cf 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/vector-alloca-limits.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-alloca-limits.ll
@@ -131,6 +131,38 @@ entry:
    ret void
  }
  
+; OPT-LABEL: @alloca_9xi64_max256(
+; OPT-NOT: alloca
+; OPT: <9 x i64>
+; LIMIT32: alloca
+; LIMIT32-NOT: <9 x i64>
+define amdgpu_kernel void @alloca_9xi64_max256(i64 addrspace(1)* %out, i32 %index) #2 { ; amdgpu_kernel variant of the test; #2 sets "amdgpu-flat-work-group-size"="1,256"
+entry:
+  %tmp = alloca [9 x i64], addrspace(5) ; the alloca under test: 9 x i64; OPT checks expect it to be replaced by a <9 x i64> vector
+  %x = getelementptr [9 x i64], [9 x i64] addrspace(5)* %tmp, i32 0, i32 0 ; address of element 0
+  store i64 0, i64 addrspace(5)* %x ; write 0 to element 0
+  %tmp1 = getelementptr [9 x i64], [9 x i64] addrspace(5)* %tmp, i32 0, i32 %index ; address of element %index (index not known at compile time)
+  %tmp2 = load i64, i64 addrspace(5)* %tmp1 ; read the dynamically indexed element
+  store i64 %tmp2, i64 addrspace(1)* %out ; store the loaded value through %out
+  ret void
+}
+
+; OPT-LABEL: @func_alloca_9xi64_max256(
+; OPT: alloca
+; OPT-NOT: <9 x i64>
+; LIMIT32: alloca
+; LIMIT32-NOT: <9 x i64>
+define void @func_alloca_9xi64_max256(i64 addrspace(1)* %out, i32 %index) #2 { ; same body as the kernel test but with no amdgpu_kernel calling convention; #2 sets "amdgpu-flat-work-group-size"="1,256"
+entry:
+  %tmp = alloca [9 x i64], addrspace(5) ; the alloca under test: 9 x i64; both OPT and LIMIT32 checks expect it to remain an alloca
+  %x = getelementptr [9 x i64], [9 x i64] addrspace(5)* %tmp, i32 0, i32 0 ; address of element 0
+  store i64 0, i64 addrspace(5)* %x ; write 0 to element 0
+  %tmp1 = getelementptr [9 x i64], [9 x i64] addrspace(5)* %tmp, i32 0, i32 %index ; address of element %index (index not known at compile time)
+  %tmp2 = load i64, i64 addrspace(5)* %tmp1 ; read the dynamically indexed element
+  store i64 %tmp2, i64 addrspace(1)* %out ; store the loaded value through %out
+  ret void
+}
+
  attributes #0 = { "amdgpu-flat-work-group-size"="1,1024" }
  attributes #1 = { "amdgpu-flat-work-group-size"="1,512" }
  attributes #2 = { "amdgpu-flat-work-group-size"="1,256" }
author	Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
	Thu, 23 Sep 2021 23:03:48 +0000 (16:03 -0700)
committer	Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
	Fri, 24 Sep 2021 20:38:39 +0000 (13:38 -0700)
llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp		patch \| blob \| history
llvm/test/CodeGen/AMDGPU/vector-alloca-limits.ll		patch \| blob \| history