AMDGPU: Add gfx11 feature to force initializing 16 input SGPRs

author Matt Arsenault <Matthew.Arsenault@amd.com>

Mon, 24 Jan 2022 16:35:34 +0000 (11:35 -0500)

committer Matt Arsenault <Matthew.Arsenault@amd.com>

Wed, 29 Jun 2022 18:52:19 +0000 (14:52 -0400)
author Matt Arsenault <Matthew.Arsenault@amd.com>
Mon, 24 Jan 2022 16:35:34 +0000 (11:35 -0500)
committer Matt Arsenault <Matthew.Arsenault@amd.com>
Wed, 29 Jun 2022 18:52:19 +0000 (14:52 -0400)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td

index 86077e0..48b5814 100644 (file)
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -177,6 +177,12 @@ def FeatureSGPRInitBug : SubtargetFeature<"sgpr-init-bug",
    "VI SGPR initialization bug requiring a fixed SGPR allocation size"
  >;
  
+def FeatureUserSGPRInit16Bug : SubtargetFeature<"user-sgpr-init16-bug",
+  "UserSGPRInit16Bug",
+  "true",
+  "Bug requiring at least 16 user+system SGPRs to be enabled"
+>;
+
  def FeatureLdsMisalignedBug : SubtargetFeature<"lds-misaligned-bug",
    "LDSMisalignedBug",
    "true",
@@ -1262,11 +1268,11 @@ def FeatureISAVersion11_Common : FeatureSet<
  // Features for GFX 11.0.0 and 11.0.1
  def FeatureISAVersion11_0 : FeatureSet<
    !listconcat(FeatureISAVersion11_Common.Features,
-    [])>;
+    [FeatureUserSGPRInit16Bug])>;
  
  def FeatureISAVersion11_0_2 : FeatureSet<
    !listconcat(FeatureISAVersion11_Common.Features,
-    [])>;
+    [FeatureUserSGPRInit16Bug])>;
  
  //===----------------------------------------------------------------------===//
  
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp

index 959d1d7..57a4660 100644 (file)
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -900,8 +900,14 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
    else if (MFI->hasWorkItemIDY())
      TIDIGCompCnt = 1;
  
+  // The private segment wave byte offset is the last of the system SGPRs. We
+  // initially assumed it was allocated, and may have used it. It shouldn't harm
+  // anything to disable it if we know the stack isn't used here. We may still
+  // have emitted code reading it to initialize scratch, but if the scratch is
+  // unused, reading garbage should be OK.
+  const bool EnablePrivateSegment = ProgInfo.ScratchBlocks > 0;
    ProgInfo.ComputePGMRSrc2 =
-      S_00B84C_SCRATCH_EN(ProgInfo.ScratchBlocks > 0) |
+      S_00B84C_SCRATCH_EN(EnablePrivateSegment) |
        S_00B84C_USER_SGPR(MFI->getNumUserSGPRs()) |
        // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP.
        S_00B84C_TRAP_HANDLER(STM.isAmdHsaOS() ? 0 : STM.isTrapHandlerEnabled()) |
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h

index 4f54e76..d269d09 100644 (file)
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -107,6 +107,7 @@ protected:
    bool GFX10_3Insts = false;
    bool GFX7GFX8GFX9Insts = false;
    bool SGPRInitBug = false;
+  bool UserSGPRInit16Bug = false;
    bool NegativeScratchOffsetBug = false;
    bool NegativeUnalignedScratchOffsetBug = false;
    bool HasSMemRealTime = false;
@@ -928,6 +929,10 @@ public:
      return SGPRInitBug;
    }
  
+  bool hasUserSGPRInit16Bug() const {
+    return UserSGPRInit16Bug;
+  }
+
    bool hasNegativeScratchOffsetBug() const { return NegativeScratchOffsetBug; }
  
    bool hasNegativeUnalignedScratchOffsetBug() const {
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp

index 946a54b..a452273 100644 (file)
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2099,6 +2099,24 @@ void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo,
                                             SIMachineFunctionInfo &Info,
                                             CallingConv::ID CallConv,
                                             bool IsShader) const {
+  if (Subtarget->hasUserSGPRInit16Bug()) {
+    // Pad up the used user SGPRs with dead inputs.
+    unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
+
+    // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
+    // rely on it to reach 16 since if we end up having no stack usage, it will
+    // not really be added.
+    unsigned NumRequiredSystemSGPRs = Info.hasWorkGroupIDX() +
+                                      Info.hasWorkGroupIDY() +
+                                      Info.hasWorkGroupIDZ() +
+                                      Info.hasWorkGroupInfo();
+    for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
+      Register Reg = Info.addReservedUserSGPR();
+      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
+      CCInfo.AllocateReg(Reg);
+    }
+  }
+
    if (Info.hasWorkGroupIDX()) {
      Register Reg = Info.addWorkGroupIDX();
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
@@ -2143,6 +2161,8 @@ void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo,
      MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
      CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
    }
+
+  assert(!Subtarget->hasUserSGPRInit16Bug() || Info.getNumPreloadedSGPRs() >= 16);
  }
  
  static void reservePrivateMemoryRegs(const TargetMachine &TM,
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h

index 8a63002..bebb13c 100644 (file)
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -609,6 +609,13 @@ public:
    Register addFlatScratchInit(const SIRegisterInfo &TRI);
    Register addImplicitBufferPtr(const SIRegisterInfo &TRI);
  
+  /// Increment user SGPRs used for padding the argument list only.
+  Register addReservedUserSGPR() {
+    Register Next = getNextUserSGPR();
+    ++NumUserSGPRs;
+    return Next;
+  }
+
    // Add system SGPRs.
    Register addWorkGroupIDX() {
      ArgInfo.WorkGroupIDX = ArgDescriptor::createRegister(getNextSystemSGPR());
diff --git a/llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll b/llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll

new file mode 100644 (file)

index 0000000..2dcdd31
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll
@@ -0,0 +1,162 @@
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1102 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1102 < %s | FileCheck -check-prefix=GCN %s
+
+; There aren't any stack objects, so the private segment is not
+; enabled. The user SGPRs are padded with dead inputs so that the
+; workgroup ID, the only enabled system SGPR, is in s15.
+
+; 15 padding user SGPRs + workgroup_id_x = 16 input SGPRs
+
+; GCN-LABEL: {{^}}minimal_kernel_inputs:
+; GCN: v_mov_b32_e32 [[V:v[0-9]+]], s15
+; GCN-NEXT: global_store_b32 v{{\[[0-9]+:[0-9]+\]}}, [[V]], off
+
+; GCN: .amdhsa_kernel minimal_kernel_inputs
+; GCN: .amdhsa_user_sgpr_count 15
+; GCN-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0
+; GCN-NEXT: .amdhsa_user_sgpr_queue_ptr 0
+; GCN-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0
+; GCN-NEXT: .amdhsa_user_sgpr_dispatch_id 0
+; GCN-NEXT: .amdhsa_user_sgpr_private_segment_size 0
+; GCN-NEXT: .amdhsa_wavefront_size32 1
+; GCN-NEXT: .amdhsa_enable_private_segment 0
+; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
+; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0
+; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0
+; GCN-NEXT: .amdhsa_system_sgpr_workgroup_info 0
+; GCN-NEXT: .amdhsa_system_vgpr_workitem_id 0
+; GCN: ; COMPUTE_PGM_RSRC2:USER_SGPR: 15
+define amdgpu_kernel void @minimal_kernel_inputs() {
+  %id = call i32 @llvm.amdgcn.workgroup.id.x()
+  store volatile i32 %id, i32 addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}minimal_kernel_inputs_with_stack:
+; GCN: v_mov_b32_e32 [[V:v[0-9]+]], s15
+; GCN: global_store_b32 v{{\[[0-9]+:[0-9]+\]}}, [[V]], off
+
+; GCN: .amdhsa_kernel minimal_kernel_inputs_with_stack
+; GCN: .amdhsa_user_sgpr_count 15
+; GCN-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0
+; GCN-NEXT: .amdhsa_user_sgpr_queue_ptr 0
+; GCN-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0
+; GCN-NEXT: .amdhsa_user_sgpr_dispatch_id 0
+; GCN-NEXT: .amdhsa_user_sgpr_private_segment_size 0
+; GCN-NEXT: .amdhsa_wavefront_size32 1
+; GCN-NEXT: .amdhsa_enable_private_segment 1
+; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
+; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0
+; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0
+; GCN-NEXT: .amdhsa_system_sgpr_workgroup_info 0
+; GCN-NEXT: .amdhsa_system_vgpr_workitem_id 0
+; GCN: ; COMPUTE_PGM_RSRC2:USER_SGPR: 15
+define amdgpu_kernel void @minimal_kernel_inputs_with_stack() {
+  %alloca = alloca i32, addrspace(5)
+  %id = call i32 @llvm.amdgcn.workgroup.id.x()
+  store volatile i32 %id, i32 addrspace(1)* undef
+  store volatile i32 0, i32 addrspace(5)* %alloca
+  ret void
+}
+
+; GCN-LABEL: {{^}}queue_ptr:
+; GCN: global_load_u8 v{{[0-9]+}}, v{{[0-9]+}}, s[0:1]
+
+; GCN: v_mov_b32_e32 [[V:v[0-9]+]], s15
+; GCN-NEXT: global_store_b32 v{{\[[0-9]+:[0-9]+\]}}, [[V]], off
+
+; GCN: .amdhsa_kernel queue_ptr
+; GCN: .amdhsa_user_sgpr_count 15
+; GCN-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0
+; GCN-NEXT: .amdhsa_user_sgpr_queue_ptr 1
+; GCN-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0
+; GCN-NEXT: .amdhsa_user_sgpr_dispatch_id 0
+; GCN-NEXT: .amdhsa_user_sgpr_private_segment_size 0
+; GCN-NEXT: .amdhsa_wavefront_size32 1
+; GCN-NEXT: .amdhsa_enable_private_segment 0
+; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
+; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0
+; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0
+; GCN-NEXT: .amdhsa_system_sgpr_workgroup_info 0
+; GCN-NEXT: .amdhsa_system_vgpr_workitem_id 0
+; GCN: ; COMPUTE_PGM_RSRC2:USER_SGPR: 15
+define amdgpu_kernel void @queue_ptr() {
+  %queue.ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.queue.ptr() #0
+  %load = load volatile i8, i8 addrspace(4)* %queue.ptr
+  %id = call i32 @llvm.amdgcn.workgroup.id.x()
+  store volatile i32 %id, i32 addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}all_inputs:
+; GCN: v_mov_b32_e32 [[V_X:v[0-9]+]], s13
+; GCN: v_mov_b32_e32 [[V_Y:v[0-9]+]], s14
+; GCN: v_mov_b32_e32 [[V_Z:v[0-9]+]], s15
+
+; GCN: global_load_u8 v{{[0-9]+}}, v{{[0-9]+}}, s[0:1]
+; GCN: global_load_u8 v{{[0-9]+}}, v{{[0-9]+}}, s[2:3]
+; GCN: global_load_u8 v{{[0-9]+}}, v{{[0-9]+}}, s[4:5]
+
+; GCN-DAG: v_mov_b32_e32 v[[DISPATCH_LO:[0-9]+]], s6
+; GCN-DAG: v_mov_b32_e32 v[[DISPATCH_HI:[0-9]+]], s7
+
+; GCN: global_store_b32 v{{\[[0-9]+:[0-9]+\]}}, [[V_X]], off
+; GCN: global_store_b32 v{{\[[0-9]+:[0-9]+\]}}, [[V_Y]], off
+; GCN: global_store_b32 v{{\[[0-9]+:[0-9]+\]}}, [[V_Z]], off
+; GCN: global_store_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[DISPATCH_LO]]:[[DISPATCH_HI]]{{\]}}, off
+
+; GCN: .amdhsa_kernel all_inputs
+; GCN: .amdhsa_user_sgpr_count 13
+; GCN-NEXT: .amdhsa_user_sgpr_dispatch_ptr 1
+; GCN-NEXT: .amdhsa_user_sgpr_queue_ptr 1
+; GCN-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1
+; GCN-NEXT: .amdhsa_user_sgpr_dispatch_id 1
+; GCN-NEXT: .amdhsa_user_sgpr_private_segment_size 0
+; GCN-NEXT: .amdhsa_wavefront_size32 1
+; GCN-NEXT: .amdhsa_enable_private_segment 1
+; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
+; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_y 1
+; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1
+; GCN-NEXT: .amdhsa_system_sgpr_workgroup_info 0
+; GCN-NEXT: .amdhsa_system_vgpr_workitem_id 0
+; GCN: ; COMPUTE_PGM_RSRC2:USER_SGPR: 13
+define amdgpu_kernel void @all_inputs() {
+  %alloca = alloca i32, addrspace(5)
+  store volatile i32 0, i32 addrspace(5)* %alloca
+
+  %dispatch.ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
+  %load.dispatch = load volatile i8, i8 addrspace(4)* %dispatch.ptr
+
+  %queue.ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.queue.ptr()
+  %load.queue = load volatile i8, i8 addrspace(4)* %queue.ptr
+
+  %implicitarg.ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+  %load.implicitarg = load volatile i8, i8 addrspace(4)* %implicitarg.ptr
+
+  %id.x = call i32 @llvm.amdgcn.workgroup.id.x()
+  store volatile i32 %id.x, i32 addrspace(1)* undef
+
+  %id.y = call i32 @llvm.amdgcn.workgroup.id.y()
+  store volatile i32 %id.y, i32 addrspace(1)* undef
+
+  %id.z = call i32 @llvm.amdgcn.workgroup.id.z()
+  store volatile i32 %id.z, i32 addrspace(1)* undef
+
+  %dispatch.id = call i64 @llvm.amdgcn.dispatch.id()
+  store volatile i64 %dispatch.id, i64 addrspace(1)* undef
+
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workgroup.id.x() #0
+declare i32 @llvm.amdgcn.workgroup.id.y() #0
+declare i32 @llvm.amdgcn.workgroup.id.z() #0
+declare align 4 i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() #0
+declare align 4 i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0
+declare align 4 i8 addrspace(4)* @llvm.amdgcn.queue.ptr() #0
+declare align 4 i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #0
+declare i64 @llvm.amdgcn.dispatch.id() #0
+
+attributes #0 = { nounwind readnone speculatable willreturn }
author	Matt Arsenault <Matthew.Arsenault@amd.com>
	Mon, 24 Jan 2022 16:35:34 +0000 (11:35 -0500)
committer	Matt Arsenault <Matthew.Arsenault@amd.com>
	Wed, 29 Jun 2022 18:52:19 +0000 (14:52 -0400)
llvm/lib/Target/AMDGPU/AMDGPU.td		patch \| blob \| history
llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp		patch \| blob \| history
llvm/lib/Target/AMDGPU/GCNSubtarget.h		patch \| blob \| history
llvm/lib/Target/AMDGPU/SIISelLowering.cpp		patch \| blob \| history
llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h		patch \| blob \| history
llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll	[new file with mode: 0644]	patch \| blob