From: Tom Stellard <thomas.stellard@amd.com>
Date: Thu, 14 Apr 2016 16:27:03 +0000 (+0000)
Subject: AMDGPU/SI: Use the correct scratch wave offset register for shaders.
X-Git-Tag: llvmorg-3.9.0-rc1~9087
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=f110f8f9f7f46c668e03f4808e03aa54c2157269;p=platform%2Fupstream%2Fllvm.git

AMDGPU/SI: Use the correct scratch wave offset register for shaders.

Summary:
The code previously always used s1 as it was using the user + system SGPR
information for compute kernels. This is incorrect for Mesa shaders though,

The register should be the next SGPR after all user and system SGPR's.
We use that Mesa adds arguments for all input and system SGPR's and
take the next available SGPR for the scratch wave offset register.

Signed-off-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>

Reviewers: mareko, arsenm, nhaehnle, tstellarAMD

Subscribers: qcolombet, arsenm, llvm-commits

Differential Revision: http://reviews.llvm.org/D18941

Patch By: Bas Nieuwenhuizen

llvm-svn: 266336
---

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 9aaa31c..ee7ad32 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -37,6 +37,16 @@
 
 using namespace llvm;
 
+static unsigned findFirstFreeSGPR(CCState &CCInfo) {
+  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
+  for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
+    if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
+      return AMDGPU::SGPR0 + Reg;
+    }
+  }
+  llvm_unreachable("Cannot allocate sgpr");
+}
+
 SITargetLowering::SITargetLowering(TargetMachine &TM,
                                    const AMDGPUSubtarget &STI)
     : AMDGPUTargetLowering(TM, STI) {
@@ -712,6 +722,15 @@ SDValue SITargetLowering::LowerFormalArguments(
   if (!AMDGPU::isShader(CallConv)) {
     getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins,
                             Splits);
+
+    assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
+  } else {
+    assert(!Info->hasPrivateSegmentBuffer() && !Info->hasDispatchPtr() &&
+           !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() &&
+           !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
+           !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
+           !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
+           !Info->hasWorkItemIDZ());
   }
 
   // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
@@ -834,8 +853,7 @@ SDValue SITargetLowering::LowerFormalArguments(
     unsigned Reg = Info->addWorkGroupIDX();
     MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
     CCInfo.AllocateReg(Reg);
-  } else
-    llvm_unreachable("work group id x is always enabled");
+  }
 
   if (Info->hasWorkGroupIDY()) {
     unsigned Reg = Info->addWorkGroupIDY();
@@ -857,8 +875,13 @@ SDValue SITargetLowering::LowerFormalArguments(
 
   if (Info->hasPrivateSegmentWaveByteOffset()) {
     // Scratch wave offset passed in system SGPR.
-    unsigned PrivateSegmentWaveByteOffsetReg
-      = Info->addPrivateSegmentWaveByteOffset();
+    unsigned PrivateSegmentWaveByteOffsetReg;
+
+    if (AMDGPU::isShader(CallConv)) {
+      PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
+      Info->setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
+    } else
+      PrivateSegmentWaveByteOffsetReg = Info->addPrivateSegmentWaveByteOffset();
 
     MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
     CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
@@ -923,8 +946,7 @@ SDValue SITargetLowering::LowerFormalArguments(
     unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X);
     MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
     CCInfo.AllocateReg(Reg);
-  } else
-    llvm_unreachable("workitem id x should always be enabled");
+  }
 
   if (Info->hasWorkItemIDY()) {
     unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y);
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index c56286e..ef413cb 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -65,12 +65,12 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
     GridWorkgroupCountX(false),
     GridWorkgroupCountY(false),
     GridWorkgroupCountZ(false),
-    WorkGroupIDX(true),
+    WorkGroupIDX(false),
     WorkGroupIDY(false),
     WorkGroupIDZ(false),
     WorkGroupInfo(false),
     PrivateSegmentWaveByteOffset(false),
-    WorkItemIDX(true),
+    WorkItemIDX(false),
     WorkItemIDY(false),
     WorkItemIDZ(false) {
   const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
@@ -80,8 +80,11 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
 
   const MachineFrameInfo *FrameInfo = MF.getFrameInfo();
 
-  if (!AMDGPU::isShader(F->getCallingConv()))
+  if (!AMDGPU::isShader(F->getCallingConv())) {
     KernargSegmentPtr = true;
+    WorkGroupIDX = true;
+    WorkItemIDX = true;
+  }
 
   if (F->hasFnAttribute("amdgpu-work-group-id-y"))
     WorkGroupIDY = true;
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 6de944a..ac3497c 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -164,6 +164,10 @@ public:
     return PrivateSegmentWaveByteOffsetSystemSGPR;
   }
 
+  void setPrivateSegmentWaveByteOffset(unsigned Reg) {
+    PrivateSegmentWaveByteOffsetSystemSGPR = Reg;
+  }
+
   bool hasPrivateSegmentBuffer() const {
     return PrivateSegmentBuffer;
   }
diff --git a/llvm/test/CodeGen/AMDGPU/large-alloca-graphics.ll b/llvm/test/CodeGen/AMDGPU/large-alloca-graphics.ll
index cec1625..eb704c3 100644
--- a/llvm/test/CodeGen/AMDGPU/large-alloca-graphics.ll
+++ b/llvm/test/CodeGen/AMDGPU/large-alloca-graphics.ll
@@ -8,8 +8,8 @@
 ; CI: s_mov_b32 s11, 0x98f000
 ; VI: s_mov_b32 s11, 0x980000
 
-; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen
-; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen
+; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s0 offen
+; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s0 offen
 
 ; ALL: ; ScratchSize: 32772
 define amdgpu_ps void @large_alloca_pixel_shader(i32 %x, i32 %y) #0 {
@@ -29,8 +29,8 @@ define amdgpu_ps void @large_alloca_pixel_shader(i32 %x, i32 %y) #0 {
 ; CI: s_mov_b32 s11, 0x98f000
 ; VI: s_mov_b32 s11, 0x980000
 
-; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen
-; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen
+; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s2 offen
+; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s2 offen
 
 ; ALL: ; ScratchSize: 32772
 define amdgpu_ps void @large_alloca_pixel_shader_inreg(i32 inreg %x, i32 inreg %y) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
index 28a86df..b755b78 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
@@ -11,6 +11,7 @@
 
 ; GCN-LABEL: {{^}}main:
 
+; GCN: s_mov_b32 s11, s12
 ; GCN: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
 ; GCN-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GCN-NEXT: s_mov_b32 s14, -1