From: Tom Stellard Date: Thu, 14 Apr 2016 16:27:03 +0000 (+0000) Subject: AMDGPU/SI: Use the correct scratch wave offset register for shaders. X-Git-Tag: llvmorg-3.9.0-rc1~9087 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=f110f8f9f7f46c668e03f4808e03aa54c2157269;p=platform%2Fupstream%2Fllvm.git AMDGPU/SI: Use the correct scratch wave offset register for shaders. Summary: The code previously always used s1 as it was using the user + system SGPR information for compute kernels. This is incorrect for Mesa shaders though, The register should be the next SGPR after all user and system SGPR's. We use that Mesa adds arguments for all input and system SGPR's and take the next available SGPR for the scratch wave offset register. Signed-off-by: Bas Nieuwenhuizen Reviewers: mareko, arsenm, nhaehnle, tstellarAMD Subscribers: qcolombet, arsenm, llvm-commits Differential Revision: http://reviews.llvm.org/D18941 Patch By: Bas Nieuwenhuizen llvm-svn: 266336 --- diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 9aaa31c..ee7ad32 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -37,6 +37,16 @@ using namespace llvm; +static unsigned findFirstFreeSGPR(CCState &CCInfo) { + unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs(); + for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) { + if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) { + return AMDGPU::SGPR0 + Reg; + } + } + llvm_unreachable("Cannot allocate sgpr"); +} + SITargetLowering::SITargetLowering(TargetMachine &TM, const AMDGPUSubtarget &STI) : AMDGPUTargetLowering(TM, STI) { @@ -712,6 +722,15 @@ SDValue SITargetLowering::LowerFormalArguments( if (!AMDGPU::isShader(CallConv)) { getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins, Splits); + + assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX()); + } else { + assert(!Info->hasPrivateSegmentBuffer() && !Info->hasDispatchPtr() && + !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() && + !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() && + !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() && + !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() && + !Info->hasWorkItemIDZ()); } // FIXME: How should these inputs interact with inreg / custom SGPR inputs? @@ -834,8 +853,7 @@ SDValue SITargetLowering::LowerFormalArguments( unsigned Reg = Info->addWorkGroupIDX(); MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass); CCInfo.AllocateReg(Reg); - } else - llvm_unreachable("work group id x is always enabled"); + } if (Info->hasWorkGroupIDY()) { unsigned Reg = Info->addWorkGroupIDY(); @@ -857,8 +875,13 @@ SDValue SITargetLowering::LowerFormalArguments( if (Info->hasPrivateSegmentWaveByteOffset()) { // Scratch wave offset passed in system SGPR. - unsigned PrivateSegmentWaveByteOffsetReg - = Info->addPrivateSegmentWaveByteOffset(); + unsigned PrivateSegmentWaveByteOffsetReg; + + if (AMDGPU::isShader(CallConv)) { + PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo); + Info->setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg); + } else + PrivateSegmentWaveByteOffsetReg = Info->addPrivateSegmentWaveByteOffset(); MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass); CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg); @@ -923,8 +946,7 @@ SDValue SITargetLowering::LowerFormalArguments( unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X); MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); CCInfo.AllocateReg(Reg); - } else - llvm_unreachable("workitem id x should always be enabled"); + } if (Info->hasWorkItemIDY()) { unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y); diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index c56286e..ef413cb 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -65,12 +65,12 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) GridWorkgroupCountX(false), GridWorkgroupCountY(false), GridWorkgroupCountZ(false), - WorkGroupIDX(true), + WorkGroupIDX(false), WorkGroupIDY(false), WorkGroupIDZ(false), WorkGroupInfo(false), PrivateSegmentWaveByteOffset(false), - WorkItemIDX(true), + WorkItemIDX(false), WorkItemIDY(false), WorkItemIDZ(false) { const AMDGPUSubtarget &ST = MF.getSubtarget(); @@ -80,8 +80,11 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) const MachineFrameInfo *FrameInfo = MF.getFrameInfo(); - if (!AMDGPU::isShader(F->getCallingConv())) + if (!AMDGPU::isShader(F->getCallingConv())) { KernargSegmentPtr = true; + WorkGroupIDX = true; + WorkItemIDX = true; + } if (F->hasFnAttribute("amdgpu-work-group-id-y")) WorkGroupIDY = true; diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h index 6de944a..ac3497c 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -164,6 +164,10 @@ public: return PrivateSegmentWaveByteOffsetSystemSGPR; } + void setPrivateSegmentWaveByteOffset(unsigned Reg) { + PrivateSegmentWaveByteOffsetSystemSGPR = Reg; + } + bool hasPrivateSegmentBuffer() const { return PrivateSegmentBuffer; } diff --git a/llvm/test/CodeGen/AMDGPU/large-alloca-graphics.ll b/llvm/test/CodeGen/AMDGPU/large-alloca-graphics.ll index cec1625..eb704c3 100644 --- a/llvm/test/CodeGen/AMDGPU/large-alloca-graphics.ll +++ b/llvm/test/CodeGen/AMDGPU/large-alloca-graphics.ll @@ -8,8 +8,8 @@ ; CI: s_mov_b32 s11, 0x98f000 ; VI: s_mov_b32 s11, 0x980000 -; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen -; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen +; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s0 offen +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s0 offen ; ALL: ; ScratchSize: 32772 define amdgpu_ps void @large_alloca_pixel_shader(i32 %x, i32 %y) #0 { @@ -29,8 +29,8 @@ define amdgpu_ps void @large_alloca_pixel_shader(i32 %x, i32 %y) #0 { ; CI: s_mov_b32 s11, 0x98f000 ; VI: s_mov_b32 s11, 0x980000 -; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen -; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen +; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s2 offen +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s2 offen ; ALL: ; ScratchSize: 32772 define amdgpu_ps void @large_alloca_pixel_shader_inreg(i32 inreg %x, i32 inreg %y) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll index 28a86df..b755b78 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll @@ -11,6 +11,7 @@ ; GCN-LABEL: {{^}}main: +; GCN: s_mov_b32 s11, s12 ; GCN: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN-NEXT: s_mov_b32 s14, -1