From: Matt Arsenault
Date: Wed, 2 Aug 2017 01:52:45 +0000 (+0000)
Subject: AMDGPU: Fix clobbering CSR VGPRs when spilling SGPR to it
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=8e8f8f43b043b1839973fcc28694ca8d220a2137;p=platform%2Fupstream%2Fllvm.git

AMDGPU: Fix clobbering CSR VGPRs when spilling SGPR to it

When SGPRs are spilled to lanes of a VGPR in a function that makes
calls, the VGPR chosen to hold the spill lanes may itself be a
callee-saved register; using it without preserving it clobbers the
caller's value. Allocate a stack slot for any such CSR VGPR, store it
in the prologue with storeRegToStackSlot, and reload it in the
epilogue with loadRegFromStackSlot.
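The test updates below follow from the size of one save slot. Each
evicted CSR VGPR gets a 4-byte, 4-byte-aligned per-thread stack object
(CreateStackObject(4, 4, false)), while the s_add_u32/s_sub_u32
adjustments of s32 are expressed in bytes of wave scratch
(NumBytes * wavefront size). With the 64-lane wavefront of the targets
these tests run on, one slot costs:

    per thread:    4 bytes               (ScratchSize: 2120 -> 2124)
    wave scratch:  4 * 64 = 0x100 bytes  (s_add_u32: 0xa00 -> 0xb00)

Functions that previously had no stack use at all now gain a small
frame solely to save the VGPR used for SGPR spilling (see
callee_no_stack_with_call and nested-calls.ll below).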
llvm-svn: 309783
---

diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index 79bae0a..f7e5cb0 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -454,6 +454,15 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
       .addImm(NumBytes * ST.getWavefrontSize())
       .setMIFlag(MachineInstr::FrameSetup);
   }
+
+  for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
+         : FuncInfo->getSGPRSpillVGPRs()) {
+    if (!Reg.FI.hasValue())
+      continue;
+    TII->storeRegToStackSlot(MBB, MBBI, Reg.VGPR, true,
+                             Reg.FI.getValue(), &AMDGPU::VGPR_32RegClass,
+                             &TII->getRegisterInfo());
+  }
 }
 
 void SIFrameLowering::emitEpilogue(MachineFunction &MF,
@@ -462,6 +471,19 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
   if (FuncInfo->isEntryFunction())
     return;
 
+  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
+
+  for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
+         : FuncInfo->getSGPRSpillVGPRs()) {
+    if (!Reg.FI.hasValue())
+      continue;
+    TII->loadRegFromStackSlot(MBB, MBBI, Reg.VGPR,
+                              Reg.FI.getValue(), &AMDGPU::VGPR_32RegClass,
+                              &TII->getRegisterInfo());
+  }
+
   unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg();
   if (StackPtrReg == AMDGPU::NoRegister)
     return;
@@ -469,9 +491,6 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
   const MachineFrameInfo &MFI = MF.getFrameInfo();
   uint32_t NumBytes = MFI.getStackSize();
 
-  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
-  const SIInstrInfo *TII = ST.getInstrInfo();
-  MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
   DebugLoc DL;
 
   // FIXME: Clarify distinction between no set SP and SP. For callee functions,
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 04e57be..cfc9fe5 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -237,6 +237,15 @@ unsigned SIMachineFunctionInfo::addImplicitBufferPtr(const SIRegisterInfo &TRI)
   return ImplicitBufferPtrUserSGPR;
 }
 
+static bool isCalleeSavedReg(const MCPhysReg *CSRegs, MCPhysReg Reg) {
+  for (unsigned I = 0; CSRegs[I]; ++I) {
+    if (CSRegs[I] == Reg)
+      return true;
+  }
+
+  return false;
+}
+
 /// Reserve a slice of a VGPR to support spilling for FrameIndex \p FI.
 bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF,
                                                     int FI) {
@@ -258,6 +267,8 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF,
 
   int NumLanes = Size / 4;
 
+  const MCPhysReg *CSRegs = TRI->getCalleeSavedRegs(&MF);
+
   // Make sure to handle the case where a wide SGPR spill may span between two
   // VGPRs.
   for (int I = 0; I < NumLanes; ++I, ++NumVGPRSpillLanes) {
@@ -274,14 +285,21 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF,
         return false;
       }
 
-      SpillVGPRs.push_back(LaneVGPR);
+      Optional<int> CSRSpillFI;
+      if (FrameInfo.hasCalls() && CSRegs && isCalleeSavedReg(CSRegs, LaneVGPR)) {
+        // TODO: Should this be a CreateSpillStackObject? This is technically a
+        // weird CSR spill.
+        CSRSpillFI = FrameInfo.CreateStackObject(4, 4, false);
+      }
+
+      SpillVGPRs.push_back(SGPRSpillVGPRCSR(LaneVGPR, CSRSpillFI));
 
       // Add this register as live-in to all blocks to avoid machine verifier
       // complaining about use of an undefined physical register.
       for (MachineBasicBlock &BB : MF)
         BB.addLiveIn(LaneVGPR);
     } else {
-      LaneVGPR = SpillVGPRs.back();
+      LaneVGPR = SpillVGPRs.back().VGPR;
     }
 
     SpillLanes.push_back(SpilledReg(LaneVGPR, VGPRIndex));
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 8511403..94145c4 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -211,6 +211,19 @@ public:
     bool hasReg() { return VGPR != AMDGPU::NoRegister;}
   };
 
+  struct SGPRSpillVGPRCSR {
+    // VGPR used for SGPR spills
+    unsigned VGPR;
+
+    // If the VGPR is a CSR, the stack slot used to save/restore it in the
+    // prolog/epilog.
+    Optional<int> FI;
+
+    SGPRSpillVGPRCSR(unsigned V, Optional<int> F) :
+      VGPR(V),
+      FI(F) {}
+  };
+
 private:
   // SGPR->VGPR spilling support.
   typedef std::pair<unsigned, unsigned> SpillRegMask;
@@ -219,7 +232,7 @@ private:
   // Track VGPR + wave index for each subregister of the SGPR spilled to
   // frameindex key.
   DenseMap<int, std::vector<SpilledReg>> SGPRToVGPRSpills;
   unsigned NumVGPRSpillLanes = 0;
-  SmallVector<unsigned, 2> SpillVGPRs;
+  SmallVector<SGPRSpillVGPRCSR, 2> SpillVGPRs;
 
 public:
@@ -231,6 +244,10 @@ public:
       ArrayRef<SpilledReg>() : makeArrayRef(I->second);
   }
 
+  ArrayRef<SGPRSpillVGPRCSR> getSGPRSpillVGPRs() const {
+    return SpillVGPRs;
+  }
+
   bool allocateSGPRSpillToVGPR(MachineFunction &MF, int FI);
   void removeSGPRToVGPRFrameIndices(MachineFrameInfo &MFI);
 
diff --git a/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll
index 44648df..6ae5aab 100644
--- a/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll
+++ b/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll
@@ -30,10 +30,11 @@ entry:
 
 ; GCN-LABEL: {{^}}void_func_byval_struct_non_leaf:
 ; GCN: s_mov_b32 s5, s32
-; GCN: buffer_store_dword v32
+; GCN-DAG: buffer_store_dword v32
+; GCN-DAG: buffer_store_dword v33
 ; GCN: v_writelane_b32
 
-; GCN-DAG: s_add_u32 s32, s32, 0xa00{{$}}
+; GCN-DAG: s_add_u32 s32, s32, 0xb00{{$}}
 ; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s5 offset:4{{$}}
 ; GCN: v_add_i32_e32 [[ADD0:v[0-9]+]], vcc, 1, [[LOAD0]]
@@ -48,7 +49,8 @@ entry:
 
 ; GCN: v_readlane_b32
 ; GCN: buffer_load_dword v32,
-; GCN: s_sub_u32 s32, s32, 0xa00{{$}}
+; GCN: buffer_load_dword v33,
+; GCN: s_sub_u32 s32, s32, 0xb00{{$}}
 ; GCN: s_setpc_b64
 define void @void_func_byval_struct_non_leaf(%struct.ByValStruct* byval noalias nocapture align 4 %arg0, %struct.ByValStruct* byval noalias nocapture align 4 %arg1) #1 {
 entry:
@@ -67,7 +69,7 @@ entry:
 
 ; GCN-LABEL: {{^}}call_void_func_byval_struct_func:
 ; GCN: s_mov_b32 s5, s32
-; GCN: s_add_u32 s32, s32, 0xa00{{$}}
+; GCN: s_add_u32 s32, s32, 0xc00{{$}}
 ; GCN: v_writelane_b32
 
 ; GCN-DAG: s_add_u32 s32, s32, 0x800{{$}}
@@ -103,7 +105,7 @@ entry:
 
 ; GCN: v_readlane_b32
 
-; GCN: s_sub_u32 s32, s32, 0xa00{{$}}
+; GCN: s_sub_u32 s32, s32, 0xc00{{$}}
 ; GCN-NEXT: s_waitcnt
 ; GCN-NEXT: s_setpc_b64
 define void @call_void_func_byval_struct_func() #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
index 6061663..7c39831 100644
--- a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
@@ -146,7 +146,7 @@ define void @use_stack1() #1 {
 }
 
 ; GCN-LABEL: {{^}}indirect_use_stack:
-; GCN: ScratchSize: 2120
+; GCN: ScratchSize: 2124
 define void @indirect_use_stack() #1 {
   %alloca = alloca [16 x i32], align 4
   call void asm sideeffect "; use $0", "v"([16 x i32]* %alloca) #0
@@ -156,7 +156,7 @@ define void @indirect_use_stack() #1 {
 
 ; GCN-LABEL: {{^}}indirect_2_level_use_stack:
 ; GCN: is_dynamic_callstack = 0
-; GCN: ScratchSize: 2120
+; GCN: ScratchSize: 2124
 define amdgpu_kernel void @indirect_2_level_use_stack() #0 {
   call void @indirect_use_stack()
   ret void
@@ -199,7 +199,7 @@ define amdgpu_kernel void @usage_external_recurse() #0 {
 }
 
 ; GCN-LABEL: {{^}}direct_recursion_use_stack:
-; GCN: ScratchSize: 2052
+; GCN: ScratchSize: 2056
 define void @direct_recursion_use_stack(i32 %val) #2 {
   %alloca = alloca [512 x i32], align 4
   call void asm sideeffect "; use $0", "v"([512 x i32]* %alloca) #0
@@ -218,7 +218,7 @@ ret:
 ; GCN-LABEL: {{^}}usage_direct_recursion:
 ; GCN: is_ptr64 = 1
 ; GCN: is_dynamic_callstack = 1
-; GCN: workitem_private_segment_byte_size = 2052
+; GCN: workitem_private_segment_byte_size = 2056
 define amdgpu_kernel void @usage_direct_recursion(i32 %n) #0 {
   call void @direct_recursion_use_stack(i32 %n)
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
index a07199c..d0edcf8 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
@@ -36,14 +36,15 @@ define void @callee_with_stack() #0 {
 
 ; GCN-LABEL: {{^}}callee_with_stack_and_call:
 ; GCN: ; BB#0:
 ; GCN-NEXT: s_waitcnt
+; GCN: s_mov_b32 s5, s32
+; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:8
 
-; GCN-DAG: s_mov_b32 s5, s32
 ; GCN-DAG: v_writelane_b32 v32, s33,
 ; GCN-DAG: v_writelane_b32 v32, s34,
 ; GCN-DAG: v_writelane_b32 v32, s35,
-; GCN-DAG: buffer_store_dword v0, off, s[0:3], s5 offset:4{{$}}
-; GCN-DAG: s_add_u32 s32, s32, 0x200{{$}}
+; GCN-DAG: s_add_u32 s32, s32, 0x300{{$}}
 ; GCN-DAG: v_mov_b32_e32 v0, 0{{$}}
+; GCN: buffer_store_dword v0, off, s[0:3], s5 offset:4{{$}}
 
 ; GCN-DAG: s_mov_b32 s33, s5
@@ -52,6 +53,7 @@ define void @callee_with_stack() #0 {
 ; GCN-DAG: v_readlane_b32 s35,
 ; GCN-DAG: v_readlane_b32 s34,
 ; GCN-DAG: v_readlane_b32 s33,
+; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:8
 ; GCN: s_waitcnt
 ; GCN-NEXT: s_setpc_b64
 define void @callee_with_stack_and_call() #0 {
@@ -64,13 +66,24 @@ define void @callee_with_stack() #0 {
 
 ; Should be able to copy incoming stack pointer directly to inner
 ; call's stack pointer argument.
+; There is stack usage only because of the need to evict a VGPR for
+; spilling CSR SGPRs.
+
 ; GCN-LABEL: {{^}}callee_no_stack_with_call:
 ; GCN: s_waitcnt
-; GCN-NOT: s32
+; GCN: s_mov_b32 s5, s32
+; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:4
+; GCN-DAG: v_writelane_b32 v32, s33, 0
+; GCN-DAG: v_writelane_b32 v32, s34, 1
 ; GCN: s_mov_b32 s33, s5
 ; GCN: s_swappc_b64
 ; GCN: s_mov_b32 s5, s33
-; GCN-NOT: s32
+
+; GCN-DAG: v_readlane_b32 s34, v32, 1
+; GCN-DAG: v_readlane_b32 s33, v32, 0
+; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4
+; GCN: s_sub_u32 s32, s32, 0x200
+
 ; GCN: s_setpc_b64
 define void @callee_no_stack_with_call() #0 {
   call void @external_void_func_void()
diff --git a/llvm/test/CodeGen/AMDGPU/nested-calls.ll b/llvm/test/CodeGen/AMDGPU/nested-calls.ll
index 2d8d666..f8ce818 100644
--- a/llvm/test/CodeGen/AMDGPU/nested-calls.ll
+++ b/llvm/test/CodeGen/AMDGPU/nested-calls.ll
@@ -9,9 +9,21 @@ declare void @external_void_func_i32(i32) #0
 
 ; GCN-LABEL: {{^}}test_func_call_external_void_func_i32_imm:
 ; GCN: s_waitcnt
-; GCN-NOT: s32
+; GCN: s_mov_b32 s5, s32
+; Spill CSR VGPR used for SGPR spilling
+; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:4
+; GCN-DAG: s_add_u32 s32, s32, 0x200
+; GCN-DAG: v_writelane_b32 v32, s33, 0
+; GCN-DAG: v_writelane_b32 v32, s34, 1
+; GCN-DAG: v_writelane_b32 v32, s35, 2
+
 ; GCN: s_swappc_b64
-; GCN-NOT: s32
+
+; GCN: v_readlane_b32 s35, v32, 2
+; GCN: v_readlane_b32 s34, v32, 1
+; GCN: v_readlane_b32 s33, v32, 0
+; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4
+; GCN: s_sub_u32 s32, s32, 0x200
 ; GCN: s_setpc_b64
 define void @test_func_call_external_void_func_i32_imm() #0 {
   call void @external_void_func_i32(i32 42)
@@ -21,10 +33,10 @@ define void @test_func_call_external_void_func_i32_imm() #0 {
 ; GCN-LABEL: {{^}}test_func_call_external_void_func_i32_imm_stack_use:
 ; GCN: s_waitcnt
 ; GCN: s_mov_b32 s5, s32
-; GCN: s_add_u32 s32, s32, 0x1100{{$}}
+; GCN: s_add_u32 s32, s32, 0x1200{{$}}
 ; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s5 offset
 ; GCN: s_swappc_b64
-; GCN: s_sub_u32 s32, s32, 0x1100{{$}}
+; GCN: s_sub_u32 s32, s32, 0x1200{{$}}
 ; GCN: s_setpc_b64
 define void @test_func_call_external_void_func_i32_imm_stack_use() #0 {
   %alloca = alloca [16 x i32], align 4