From 88aced1e454195e038560abb3a0732d020aa4295 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Mon, 2 Mar 2020 09:43:06 -0500
Subject: [PATCH] AMDGPU: Fix computation for getOccupancyWithLocalMemSize

The computation here didn't really make sense to me, and reported
wildly different results depending on the flat work group size
attribute.

I think this should really report a range derived from the possible
work group size bounds, and only allow an occupancy that is a multiple
of the group size.
---
 llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp   | 41 ++++++++++---
 llvm/test/CodeGen/AMDGPU/occupancy-levels.ll | 89 ++++++++++++++++++++++++++++
 2 files changed, 121 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index fb488d2..248c3cb 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -328,18 +328,41 @@ unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
   return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
 }
 
+// FIXME: Should return a min,max range; this only uses the max group size.
 unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
   const Function &F) const {
-  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
-  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
-  if (!WorkGroupsPerCu)
+  const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
+  const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
+  if (!MaxWorkGroupsPerCu)
     return 0;
-  unsigned MaxWaves = getMaxWavesPerEU();
-  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
-  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
-  NumWaves = std::min(NumWaves, MaxWaves);
-  NumWaves = std::max(NumWaves, 1u);
-  return NumWaves;
+
+  const unsigned WaveSize = getWavefrontSize();
+
+  // Restriction from LDS usage: how many groups using Bytes of LDS fit at once.
+  // FIXME: Do we need to account for the alignment requirement of LDS rounding
+  // the size up?
+  unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);
+
+  // This can be queried with more LDS than is possible, so just assume the
+  // worst and report the minimum occupancy of 1.
+  if (NumGroups == 0)
+    return 1;
+
+  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);
+
+  // Round the maximum group size up to a whole number of waves.
+  const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
+  unsigned MaxWaves = NumGroups * MaxGroupNumWaves;
+
+  // Clamp to the maximum possible number of waves.
+  MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());
+
+  // FIXME: Needs to be a multiple of the group size?
+  //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);
+
+  assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
+         "computed invalid occupancy");
+  return MaxWaves;
 }
 
 unsigned
diff --git a/llvm/test/CodeGen/AMDGPU/occupancy-levels.ll b/llvm/test/CodeGen/AMDGPU/occupancy-levels.ll
index eae3f11..db70c3d 100644
--- a/llvm/test/CodeGen/AMDGPU/occupancy-levels.ll
+++ b/llvm/test/CodeGen/AMDGPU/occupancy-levels.ll
@@ -283,6 +283,95 @@ define amdgpu_kernel void @used_lds_13112() {
   ret void
 }
 
+; GCN-LABEL: {{^}}used_lds_8252_max_group_size_64:
+; GFX9:       ; Occupancy: 7{{$}}
+; GFX1010W64:    ; Occupancy: 7{{$}}
+; GFX1010W32:    ; Occupancy: 14{{$}}
+@lds8252 = internal addrspace(3) global [8252 x i8] undef, align 4
+define amdgpu_kernel void @used_lds_8252_max_group_size_64() #3 {
+  %p = bitcast [8252 x i8] addrspace(3)* @lds8252 to i8 addrspace(3)*
+  store volatile i8 1, i8 addrspace(3)* %p
+  ret void
+}
+
+; GCN-LABEL: {{^}}used_lds_8252_max_group_size_96:
+; GFX9:       ; Occupancy: 10{{$}}
+; GFX1010W64:    ; Occupancy: 14{{$}}
+; GFX1010W32:    ; Occupancy: 20{{$}}
+define amdgpu_kernel void @used_lds_8252_max_group_size_96() #4 {
+  %p = bitcast [8252 x i8] addrspace(3)* @lds8252 to i8 addrspace(3)*
+  store volatile i8 1, i8 addrspace(3)* %p
+  ret void
+}
+
+; GCN-LABEL: {{^}}used_lds_8252_max_group_size_128:
+; GFX9:       ; Occupancy: 10{{$}}
+; GFX1010W64:    ; Occupancy: 14{{$}}
+; GFX1010W32:    ; Occupancy: 20{{$}}
+define amdgpu_kernel void @used_lds_8252_max_group_size_128() #5 {
+  %p = bitcast [8252 x i8] addrspace(3)* @lds8252 to i8 addrspace(3)*
+  store volatile i8 1, i8 addrspace(3)* %p
+  ret void
+}
+
+; GCN-LABEL: {{^}}used_lds_8252_max_group_size_192:
+; GFX9:       ; Occupancy: 10{{$}}
+; GFX1010W64:    ; Occupancy: 20{{$}}
+; GFX1010W32:    ; Occupancy: 20{{$}}
+define amdgpu_kernel void @used_lds_8252_max_group_size_192() #6 {
+  %p = bitcast [8252 x i8] addrspace(3)* @lds8252 to i8 addrspace(3)*
+  store volatile i8 1, i8 addrspace(3)* %p
+  ret void
+}
+
+; GCN-LABEL: {{^}}used_lds_8252_max_group_size_256:
+; GFX9:       ; Occupancy: 10{{$}}
+; GFX1010W64:    ; Occupancy: 20{{$}}
+; GFX1010W32:    ; Occupancy: 20{{$}}
+define amdgpu_kernel void @used_lds_8252_max_group_size_256() #7 {
+  %p = bitcast [8252 x i8] addrspace(3)* @lds8252 to i8 addrspace(3)*
+  store volatile i8 1, i8 addrspace(3)* %p
+  ret void
+}
+
+; GCN-LABEL: {{^}}used_lds_8252_max_group_size_512:
+; GFX9:       ; Occupancy: 10{{$}}
+; GFX1010W64:    ; Occupancy: 20{{$}}
+; GFX1010W32:    ; Occupancy: 20{{$}}
+define amdgpu_kernel void @used_lds_8252_max_group_size_512() #8 {
+  %p = bitcast [8252 x i8] addrspace(3)* @lds8252 to i8 addrspace(3)*
+  store volatile i8 1, i8 addrspace(3)* %p
+  ret void
+}
+
+; GCN-LABEL: {{^}}used_lds_8252_max_group_size_1024:
+; GFX9:       ; Occupancy: 10{{$}}
+; GFX1010W64:    ; Occupancy: 20{{$}}
+; GFX1010W32:    ; Occupancy: 20{{$}}
+define amdgpu_kernel void @used_lds_8252_max_group_size_1024() #9 {
+  %p = bitcast [8252 x i8] addrspace(3)* @lds8252 to i8 addrspace(3)*
+  store volatile i8 1, i8 addrspace(3)* %p
+  ret void
+}
+
+; GCN-LABEL: {{^}}used_lds_8252_max_group_size_32:
+; GFX9:       ; Occupancy: 7{{$}}
+; GFX1010W64:    ; Occupancy: 7{{$}}
+; GFX1010W32:    ; Occupancy: 7{{$}}
+define amdgpu_kernel void @used_lds_8252_max_group_size_32() #10 {
+  %p = bitcast [8252 x i8] addrspace(3)* @lds8252 to i8 addrspace(3)*
+  store volatile i8 1, i8 addrspace(3)* %p
+  ret void
+}
+
 attributes #0 = { "amdgpu-waves-per-eu"="2,3" }
 attributes #1 = { "amdgpu-waves-per-eu"="18,18" }
 attributes #2 = { "amdgpu-waves-per-eu"="19,19" }
+attributes #3 = { "amdgpu-flat-work-group-size"="1,64" }
+attributes #4 = { "amdgpu-flat-work-group-size"="1,96" }
+attributes #5 = { "amdgpu-flat-work-group-size"="1,128" }
+attributes #6 = { "amdgpu-flat-work-group-size"="1,192" }
+attributes #7 = { "amdgpu-flat-work-group-size"="1,256" }
+attributes #8 = { "amdgpu-flat-work-group-size"="1,512" }
+attributes #9 = { "amdgpu-flat-work-group-size"="1,1024" }
+attributes #10 = { "amdgpu-flat-work-group-size"="1,32" }
-- 
2.7.4