AMDGPU/SI: Use the correct scratch wave offset register for shaders.

author Tom Stellard <thomas.stellard@amd.com>

Thu, 14 Apr 2016 16:27:03 +0000 (16:27 +0000)

committer Tom Stellard <thomas.stellard@amd.com>

Thu, 14 Apr 2016 16:27:03 +0000 (16:27 +0000)
author Tom Stellard <thomas.stellard@amd.com>
Thu, 14 Apr 2016 16:27:03 +0000 (16:27 +0000)
committer Tom Stellard <thomas.stellard@amd.com>
Thu, 14 Apr 2016 16:27:03 +0000 (16:27 +0000)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp

index 9aaa31c..ee7ad32 100644 (file)
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -37,6 +37,16 @@
  
  using namespace llvm;
  
+static unsigned findFirstFreeSGPR(CCState &CCInfo) {
+  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
+  for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
+    if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
+      return AMDGPU::SGPR0 + Reg;
+    }
+  }
+  llvm_unreachable("Cannot allocate sgpr");
+}
+
  SITargetLowering::SITargetLowering(TargetMachine &TM,
                                     const AMDGPUSubtarget &STI)
      : AMDGPUTargetLowering(TM, STI) {
@@ -712,6 +722,15 @@ SDValue SITargetLowering::LowerFormalArguments(
    if (!AMDGPU::isShader(CallConv)) {
      getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins,
                              Splits);
+
+    assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
+  } else {
+    assert(!Info->hasPrivateSegmentBuffer() && !Info->hasDispatchPtr() &&
+           !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() &&
+           !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
+           !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
+           !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
+           !Info->hasWorkItemIDZ());
    }
  
    // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
@@ -834,8 +853,7 @@ SDValue SITargetLowering::LowerFormalArguments(
      unsigned Reg = Info->addWorkGroupIDX();
      MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
      CCInfo.AllocateReg(Reg);
-  } else
-    llvm_unreachable("work group id x is always enabled");
+  }
  
    if (Info->hasWorkGroupIDY()) {
      unsigned Reg = Info->addWorkGroupIDY();
@@ -857,8 +875,13 @@ SDValue SITargetLowering::LowerFormalArguments(
  
    if (Info->hasPrivateSegmentWaveByteOffset()) {
      // Scratch wave offset passed in system SGPR.
-    unsigned PrivateSegmentWaveByteOffsetReg
-      = Info->addPrivateSegmentWaveByteOffset();
+    unsigned PrivateSegmentWaveByteOffsetReg;
+
+    if (AMDGPU::isShader(CallConv)) {
+      PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
+      Info->setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
+    } else
+      PrivateSegmentWaveByteOffsetReg = Info->addPrivateSegmentWaveByteOffset();
  
      MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
      CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
@@ -923,8 +946,7 @@ SDValue SITargetLowering::LowerFormalArguments(
      unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X);
      MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
      CCInfo.AllocateReg(Reg);
-  } else
-    llvm_unreachable("workitem id x should always be enabled");
+  }
  
    if (Info->hasWorkItemIDY()) {
      unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y);
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp

index c56286e..ef413cb 100644 (file)
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -65,12 +65,12 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
      GridWorkgroupCountX(false),
      GridWorkgroupCountY(false),
      GridWorkgroupCountZ(false),
-    WorkGroupIDX(true),
+    WorkGroupIDX(false),
      WorkGroupIDY(false),
      WorkGroupIDZ(false),
      WorkGroupInfo(false),
      PrivateSegmentWaveByteOffset(false),
-    WorkItemIDX(true),
+    WorkItemIDX(false),
      WorkItemIDY(false),
      WorkItemIDZ(false) {
    const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
@@ -80,8 +80,11 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
  
    const MachineFrameInfo *FrameInfo = MF.getFrameInfo();
  
-  if (!AMDGPU::isShader(F->getCallingConv()))
+  if (!AMDGPU::isShader(F->getCallingConv())) {
      KernargSegmentPtr = true;
+    WorkGroupIDX = true;
+    WorkItemIDX = true;
+  }
  
    if (F->hasFnAttribute("amdgpu-work-group-id-y"))
      WorkGroupIDY = true;
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h

index 6de944a..ac3497c 100644 (file)
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -164,6 +164,10 @@ public:
      return PrivateSegmentWaveByteOffsetSystemSGPR;
    }
  
+  void setPrivateSegmentWaveByteOffset(unsigned Reg) {
+    PrivateSegmentWaveByteOffsetSystemSGPR = Reg;
+  }
+
    bool hasPrivateSegmentBuffer() const {
      return PrivateSegmentBuffer;
    }
diff --git a/llvm/test/CodeGen/AMDGPU/large-alloca-graphics.ll b/llvm/test/CodeGen/AMDGPU/large-alloca-graphics.ll

index cec1625..eb704c3 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/large-alloca-graphics.ll
+++ b/llvm/test/CodeGen/AMDGPU/large-alloca-graphics.ll
@@ -8,8 +8,8 @@
  ; CI: s_mov_b32 s11, 0x98f000
  ; VI: s_mov_b32 s11, 0x980000
  
-; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen
-; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen
+; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s0 offen
+; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s0 offen
  
  ; ALL: ; ScratchSize: 32772
  define amdgpu_ps void @large_alloca_pixel_shader(i32 %x, i32 %y) #0 {
@@ -29,8 +29,8 @@ define amdgpu_ps void @large_alloca_pixel_shader(i32 %x, i32 %y) #0 {
  ; CI: s_mov_b32 s11, 0x98f000
  ; VI: s_mov_b32 s11, 0x980000
  
-; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen
-; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen
+; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s2 offen
+; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s2 offen
  
  ; ALL: ; ScratchSize: 32772
  define amdgpu_ps void @large_alloca_pixel_shader_inreg(i32 inreg %x, i32 inreg %y) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll

index 28a86df..b755b78 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
@@ -11,6 +11,7 @@
  
  ; GCN-LABEL: {{^}}main:
  
+; GCN: s_mov_b32 s11, s12
  ; GCN: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
  ; GCN-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
  ; GCN-NEXT: s_mov_b32 s14, -1
author	Tom Stellard <thomas.stellard@amd.com>
	Thu, 14 Apr 2016 16:27:03 +0000 (16:27 +0000)
committer	Tom Stellard <thomas.stellard@amd.com>
	Thu, 14 Apr 2016 16:27:03 +0000 (16:27 +0000)
llvm/lib/Target/AMDGPU/SIISelLowering.cpp		patch | blob | history
llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp		patch | blob | history
llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h		patch | blob | history
llvm/test/CodeGen/AMDGPU/large-alloca-graphics.ll		patch | blob | history
llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll		patch | blob | history