From 88aced1e454195e038560abb3a0732d020aa4295 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 2 Mar 2020 09:43:06 -0500 Subject: [PATCH] AMDGPU: Fix computation for getOccupancyWithLocalMemSize The computation here didn't really make sense to me, and reported wildly different results depending on the flat work group size attribute. I think this should really report a range derived from the possible work group size bounds, and only allow an occupancy that is a multiple of the group size. --- llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 41 ++++++++++--- llvm/test/CodeGen/AMDGPU/occupancy-levels.ll | 89 ++++++++++++++++++++++++++++ 2 files changed, 121 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index fb488d2..248c3cb 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -328,18 +328,41 @@ unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves, return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves; } +// FIXME: Should return min,max range. unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes, const Function &F) const { - unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second; - unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize); - if (!WorkGroupsPerCu) + const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second; + const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize); + if (!MaxWorkGroupsPerCu) return 0; - unsigned MaxWaves = getMaxWavesPerEU(); - unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu; - unsigned NumWaves = Limit / (Bytes ? Bytes : 1u); - NumWaves = std::min(NumWaves, MaxWaves); - NumWaves = std::max(NumWaves, 1u); - return NumWaves; + + const unsigned WaveSize = getWavefrontSize(); + + // FIXME: Do we need to account for alignment requirement of LDS rounding the + // size up? 
+ // Compute restriction based on LDS usage + unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u); + + // This can be queried with more LDS than is possible, so just assume the + // worst. + if (NumGroups == 0) + return 1; + + NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups); + + // Round to the number of waves. + const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize; + unsigned MaxWaves = NumGroups * MaxGroupNumWaves; + + // Clamp to the maximum possible number of waves. + MaxWaves = std::min(MaxWaves, getMaxWavesPerEU()); + + // FIXME: Needs to be a multiple of the group size? + //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves); + + assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() && + "computed invalid occupancy"); + return MaxWaves; } unsigned diff --git a/llvm/test/CodeGen/AMDGPU/occupancy-levels.ll b/llvm/test/CodeGen/AMDGPU/occupancy-levels.ll index eae3f11..db70c3d 100644 --- a/llvm/test/CodeGen/AMDGPU/occupancy-levels.ll +++ b/llvm/test/CodeGen/AMDGPU/occupancy-levels.ll @@ -283,6 +283,95 @@ define amdgpu_kernel void @used_lds_13112() { ret void } +; GCN-LABEL: {{^}}used_lds_8252_max_group_size_64: +; GFX9: ; Occupancy: 7{{$}} +; GFX1010W64: ; Occupancy: 7{{$}} +; GFX1010W32: ; Occupancy: 14{{$}} +@lds8252 = internal addrspace(3) global [8252 x i8] undef, align 4 +define amdgpu_kernel void @used_lds_8252_max_group_size_64() #3 { + %p = bitcast [8252 x i8] addrspace(3)* @lds8252 to i8 addrspace(3)* + store volatile i8 1, i8 addrspace(3)* %p + ret void +} + +; GCN-LABEL: {{^}}used_lds_8252_max_group_size_96: +; GFX9: ; Occupancy: 10{{$}} +; GFX1010W64: ; Occupancy: 14{{$}} +; GFX1010W32: ; Occupancy: 20{{$}} +define amdgpu_kernel void @used_lds_8252_max_group_size_96() #4 { + %p = bitcast [8252 x i8] addrspace(3)* @lds8252 to i8 addrspace(3)* + store volatile i8 1, i8 addrspace(3)* %p + ret void +} + +; GCN-LABEL: {{^}}used_lds_8252_max_group_size_128: +; GFX9: ; Occupancy: 10{{$}} +; GFX1010W64: ; 
Occupancy: 14{{$}} +; GFX1010W32: ; Occupancy: 20{{$}} +define amdgpu_kernel void @used_lds_8252_max_group_size_128() #5 { + %p = bitcast [8252 x i8] addrspace(3)* @lds8252 to i8 addrspace(3)* + store volatile i8 1, i8 addrspace(3)* %p + ret void +} + +; GCN-LABEL: {{^}}used_lds_8252_max_group_size_192: +; GFX9: ; Occupancy: 10{{$}} +; GFX1010W64: ; Occupancy: 20{{$}} +; GFX1010W32: ; Occupancy: 20{{$}} +define amdgpu_kernel void @used_lds_8252_max_group_size_192() #6 { + %p = bitcast [8252 x i8] addrspace(3)* @lds8252 to i8 addrspace(3)* + store volatile i8 1, i8 addrspace(3)* %p + ret void +} + +; GCN-LABEL: {{^}}used_lds_8252_max_group_size_256: +; GFX9: ; Occupancy: 10{{$}} +; GFX1010W64: ; Occupancy: 20{{$}} +; GFX1010W32: ; Occupancy: 20{{$}} +define amdgpu_kernel void @used_lds_8252_max_group_size_256() #7 { + %p = bitcast [8252 x i8] addrspace(3)* @lds8252 to i8 addrspace(3)* + store volatile i8 1, i8 addrspace(3)* %p + ret void +} + +; GCN-LABEL: {{^}}used_lds_8252_max_group_size_512: +; GFX9: ; Occupancy: 10{{$}} +; GFX1010W64: ; Occupancy: 20{{$}} +; GFX1010W32: ; Occupancy: 20{{$}} +define amdgpu_kernel void @used_lds_8252_max_group_size_512() #8 { + %p = bitcast [8252 x i8] addrspace(3)* @lds8252 to i8 addrspace(3)* + store volatile i8 1, i8 addrspace(3)* %p + ret void +} + +; GCN-LABEL: {{^}}used_lds_8252_max_group_size_1024: +; GFX9: ; Occupancy: 10{{$}} +; GFX1010W64: ; Occupancy: 20{{$}} +; GFX1010W32: ; Occupancy: 20{{$}} +define amdgpu_kernel void @used_lds_8252_max_group_size_1024() #9 { + %p = bitcast [8252 x i8] addrspace(3)* @lds8252 to i8 addrspace(3)* + store volatile i8 1, i8 addrspace(3)* %p + ret void +} + +; GCN-LABEL: {{^}}used_lds_8252_max_group_size_32: +; GFX9: ; Occupancy: 7{{$}} +; GFX1010W64: ; Occupancy: 7{{$}} +; GFX1010W32: ; Occupancy: 7{{$}} +define amdgpu_kernel void @used_lds_8252_max_group_size_32() #10 { + %p = bitcast [8252 x i8] addrspace(3)* @lds8252 to i8 addrspace(3)* + store volatile i8 1, i8 addrspace(3)* %p + ret 
void +} + attributes #0 = { "amdgpu-waves-per-eu"="2,3" } attributes #1 = { "amdgpu-waves-per-eu"="18,18" } attributes #2 = { "amdgpu-waves-per-eu"="19,19" } +attributes #3 = { "amdgpu-flat-work-group-size"="1,64" } +attributes #4 = { "amdgpu-flat-work-group-size"="1,96" } +attributes #5 = { "amdgpu-flat-work-group-size"="1,128" } +attributes #6 = { "amdgpu-flat-work-group-size"="1,192" } +attributes #7 = { "amdgpu-flat-work-group-size"="1,256" } +attributes #8 = { "amdgpu-flat-work-group-size"="1,512" } +attributes #9 = { "amdgpu-flat-work-group-size"="1,1024" } +attributes #10 = { "amdgpu-flat-work-group-size"="1,32" } -- 2.7.4