From d04147789ff0b838f9dd6c592207d5f70bc0d025 Mon Sep 17 00:00:00 2001
From: Carl Ritson
Date: Sat, 30 May 2020 11:15:39 +0900
Subject: [PATCH] [AMDGPU] Remove assertion on S1024 SGPR to VGPR spill

Summary:
Replace an assertion that blocks S1024 SGPR to VGPR spill.
The assertion pre-dates S1024 and is not wave-size dependent.

Reviewers: arsenm, sameerds, rampitec

Reviewed By: arsenm

Subscribers: qcolombet, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, kerbowa, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D80783
---
 llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp |  11 +-
 llvm/test/CodeGen/AMDGPU/spill-wide-sgpr.ll      | 244 +++++++++++++++++++++--
 2 files changed, 238 insertions(+), 17 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 2a3ba52..5515e15 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -287,16 +287,19 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF,
   SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
 
   unsigned Size = FrameInfo.getObjectSize(FI);
-  assert(Size >= 4 && Size <= 64 && "invalid sgpr spill size");
-  assert(TRI->spillSGPRToVGPR() && "not spilling SGPRs to VGPRs");
+  unsigned NumLanes = Size / 4;
 
-  int NumLanes = Size / 4;
+  if (NumLanes > WaveSize)
+    return false;
+
+  assert(Size >= 4 && "invalid sgpr spill size");
+  assert(TRI->spillSGPRToVGPR() && "not spilling SGPRs to VGPRs");
 
   const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
 
   // Make sure to handle the case where a wide SGPR spill may span between two
   // VGPRs.
-  for (int I = 0; I < NumLanes; ++I, ++NumVGPRSpillLanes) {
+  for (unsigned I = 0; I < NumLanes; ++I, ++NumVGPRSpillLanes) {
     Register LaneVGPR;
     unsigned VGPRIndex = (NumVGPRSpillLanes % WaveSize);
 
diff --git a/llvm/test/CodeGen/AMDGPU/spill-wide-sgpr.ll b/llvm/test/CodeGen/AMDGPU/spill-wide-sgpr.ll
index c262f35..f1085cf 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-wide-sgpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-wide-sgpr.ll
@@ -193,18 +193,236 @@
 ret:
   ret void
 }
 
-; FIXME: x16 inlineasm seems broken
-; define amdgpu_kernel void @spill_sgpr_x16(i32 addrspace(1)* %out, i32 %in) #0 {
-;   %wide.sgpr = call <16 x i32> asm sideeffect "; def $0", "=s" () #0
-;   %cmp = icmp eq i32 %in, 0
-;   br i1 %cmp, label %bb0, label %ret
-
-; bb0:
-;   call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr) #0
-;   br label %ret
-
-; ret:
-;   ret void
-; }
+; ALL-LABEL: {{^}}spill_sgpr_x16:
+
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 0
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 1
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 2
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 3
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 4
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 5
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 6
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 7
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 8
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 9
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 10
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 11
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 12
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 13
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 14
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 15
+; VGPR: s_cbranch_scc1
+
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 0
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 1
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 2
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 3
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 4
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 5
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 6
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 7
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 8
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 9
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 10
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 11
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 12
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 13
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 14
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 15
+
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: s_cbranch_scc1
+
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+define amdgpu_kernel void @spill_sgpr_x16(i32 addrspace(1)* %out, i32 %in) #0 {
+  %wide.sgpr = call <16 x i32> asm sideeffect "; def $0", "=s" () #0
+  %cmp = icmp eq i32 %in, 0
+  br i1 %cmp, label %bb0, label %ret
+
+bb0:
+  call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr) #0
+  br label %ret
+
+ret:
+  ret void
+}
+
+; ALL-LABEL: {{^}}spill_sgpr_x32:
+
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 0
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 1
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 2
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 3
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 4
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 5
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 6
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 7
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 8
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 9
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 10
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 11
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 12
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 13
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 14
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 15
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 16
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 17
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 18
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 19
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 20
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 21
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 22
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 23
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 24
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 25
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 26
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 27
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 28
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 29
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 30
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 31
+; VGPR: s_cbranch_scc1
+
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 0
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 1
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 2
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 3
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 4
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 5
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 6
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 7
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 8
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 9
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 10
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 11
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 12
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 13
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 14
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 15
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 16
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 17
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 18
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 19
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 20
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 21
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 22
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 23
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 24
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 25
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 26
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 27
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 28
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 29
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 30
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 31
+
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: s_cbranch_scc1
+
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+define amdgpu_kernel void @spill_sgpr_x32(i32 addrspace(1)* %out, i32 %in) #0 {
+  %wide.sgpr = call <32 x i32> asm sideeffect "; def $0", "=s" () #0
+  %cmp = icmp eq i32 %in, 0
+  br i1 %cmp, label %bb0, label %ret
+
+bb0:
+  call void asm sideeffect "; use $0", "s"(<32 x i32> %wide.sgpr) #0
+  br label %ret
+
+ret:
+  ret void
+}
+
 attributes #0 = { nounwind }
-- 
2.7.4