From 2594fa85932a68a904cdb5445dbf7aa231c66e9b Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Wed, 31 Jul 2019 01:07:10 +0000 Subject: [PATCH] [AMDGPU] Fix high occupancy calculation and print it We had a couple of places which still returned 10 as the maximum occupancy. Fixed. Also print a comment about the occupancy as the compiler sees it. Differential Revision: https://reviews.llvm.org/D65423 llvm-svn: 367381 --- llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 8 + llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 18 +- llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h | 21 +- llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp | 3 +- llvm/lib/Target/AMDGPU/SIProgramInfo.h | 3 + .../AMDGPU/hsa-metadata-kernel-code-props-v3.ll | 5 +- llvm/test/CodeGen/AMDGPU/nsa-reassign.ll | 4 +- llvm/test/CodeGen/AMDGPU/occupancy-levels.ll | 288 +++++++++++++++++++++ llvm/test/CodeGen/AMDGPU/wave32.ll | 2 +- 9 files changed, 335 insertions(+), 17 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/occupancy-levels.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 743ac64..a429c7c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -507,6 +507,10 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { Twine(CurrentProgramInfo.NumVGPRsForWavesPerEU), false); OutStreamer->emitRawComment( + " Occupancy: " + + Twine(CurrentProgramInfo.Occupancy), false); + + OutStreamer->emitRawComment( " WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false); OutStreamer->emitRawComment( @@ -1057,6 +1061,10 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, // For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP. S_00B84C_LDS_SIZE(STM.isAmdHsaOS() ?
0 : ProgInfo.LDSBlocks) | S_00B84C_EXCP_EN(0); + + ProgInfo.Occupancy = STM.computeOccupancy(MF, ProgInfo.LDSSize, + ProgInfo.NumSGPRsForWavesPerEU, + ProgInfo.NumVGPRsForWavesPerEU); } static unsigned getRsrcReg(CallingConv::ID CallConv) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 716c387..f9a9679 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -175,6 +175,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) : HasFminFmaxLegacy(true), EnablePromoteAlloca(false), HasTrigReducedRange(false), + MaxWavesPerEU(10), LocalMemorySize(0), WavefrontSize(0) { } @@ -278,6 +279,7 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)), TLInfo(TM, *this), FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) { + MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this); CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering())); Legalizer.reset(new AMDGPULegalizerInfo(*this, TM)); RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo())); @@ -566,7 +568,7 @@ bool GCNSubtarget::hasMadF16() const { unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const { if (getGeneration() >= AMDGPUSubtarget::GFX10) - return 10; + return getMaxWavesPerEU(); if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { if (SGPRs <= 80) @@ -616,6 +618,20 @@ unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const { return 2; // VCC. 
} +unsigned GCNSubtarget::computeOccupancy(const MachineFunction &MF, + unsigned LDSSize, + unsigned NumSGPRs, + unsigned NumVGPRs) const { + unsigned Occupancy = + std::min(getMaxWavesPerEU(), + getOccupancyWithLocalMemSize(LDSSize, MF.getFunction())); + if (NumSGPRs) + Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs)); + if (NumVGPRs) + Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs)); + return Occupancy; +} + unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const { const Function &F = MF.getFunction(); const SIMachineFunctionInfo &MFI = *MF.getInfo(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h index 1f30d76..bc10091 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -75,6 +75,7 @@ protected: bool HasFminFmaxLegacy; bool EnablePromoteAlloca; bool HasTrigReducedRange; + unsigned MaxWavesPerEU; int LocalMemorySize; unsigned WavefrontSize; @@ -223,7 +224,9 @@ public: /// subtarget. virtual unsigned getMinWavesPerEU() const = 0; - unsigned getMaxWavesPerEU() const { return 10; } + /// \returns Maximum number of waves per execution unit supported by the + /// subtarget without any kind of limitation. + unsigned getMaxWavesPerEU() const { return MaxWavesPerEU; } /// Creates value range metadata on an workitemid.* inrinsic call or load. bool makeLIDRangeMetadata(Instruction *I) const; @@ -245,6 +248,9 @@ public: class GCNSubtarget : public AMDGPUGenSubtargetInfo, public AMDGPUSubtarget { + + using AMDGPUSubtarget::getMaxWavesPerEU; + public: enum TrapHandlerAbi { TrapHandlerAbiNone = 0, @@ -881,12 +887,6 @@ public: return AMDGPU::IsaInfo::getMaxWavesPerCU(this, FlatWorkGroupSize); } - /// \returns Maximum number of waves per execution unit supported by the - /// subtarget without any kind of limitation. 
- unsigned getMaxWavesPerEU() const { - return AMDGPU::IsaInfo::getMaxWavesPerEU(this); - } - /// \returns Number of waves per work group supported by the subtarget and /// limited by given \p FlatWorkGroupSize. unsigned getWavesPerWorkGroup(unsigned FlatWorkGroupSize) const { @@ -1036,6 +1036,13 @@ public: /// VGPRs unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const; + /// Return occupancy for the given function. Used LDS and a number of + /// registers if provided. + /// Note, occupancy can be affected by the scratch allocation as well, but + /// we do not have enough information to compute it. + unsigned computeOccupancy(const MachineFunction &MF, unsigned LDSSize = 0, + unsigned NumSGPRs = 0, unsigned NumVGPRs = 0) const; + /// \returns true if the flat_scratch register should be initialized with the /// pointer to the wave's scratch memory rather than a size and offset. bool flatScratchIsPointer() const { diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 46da974..d9068d6 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -53,8 +53,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F); WavesPerEU = ST.getWavesPerEU(F); - Occupancy = getMaxWavesPerEU(); - limitOccupancy(MF); + Occupancy = ST.computeOccupancy(MF, getLDSSize()); CallingConv::ID CC = F.getCallingConv(); if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL) { diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.h b/llvm/lib/Target/AMDGPU/SIProgramInfo.h index 168f05f..94ebe693 100644 --- a/llvm/lib/Target/AMDGPU/SIProgramInfo.h +++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.h @@ -51,6 +51,9 @@ struct SIProgramInfo { // Number of VGPRs that meets number of waves per execution unit request. uint32_t NumVGPRsForWavesPerEU = 0; + // Final occupancy. 
+ uint32_t Occupancy = 0; + // Whether there is recursion, dynamic allocas, indirect calls or some other // reason there may be statically unknown stack usage. bool DynamicCallStack = false; diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll index 2d2f9ce..0eed325 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll @@ -74,10 +74,7 @@ entry: ; CHECK: .name: num_spilled_vgprs ; CHECK: .symbol: num_spilled_vgprs.kd -; GFX700: .vgpr_spill_count: 14 -; GFX803: .vgpr_spill_count: 14 -; GFX900: .vgpr_spill_count: 14 -; GFX1010: .vgpr_spill_count: 0 +; CHECK: .vgpr_spill_count: 14 define amdgpu_kernel void @num_spilled_vgprs() #1 { %val0 = load volatile float, float addrspace(1)* @var %val1 = load volatile float, float addrspace(1)* @var diff --git a/llvm/test/CodeGen/AMDGPU/nsa-reassign.ll b/llvm/test/CodeGen/AMDGPU/nsa-reassign.ll index 011aef8..a668a19 100644 --- a/llvm/test/CodeGen/AMDGPU/nsa-reassign.ll +++ b/llvm/test/CodeGen/AMDGPU/nsa-reassign.ll @@ -21,8 +21,8 @@ main_body: } ; GCN-LABEL: {{^}}sample_contig_nsa_10vgprs: -; GCN-DAG: image_sample_c_l v{{[0-9]+}}, v[{{[0-9:]+}}], -; GCN-DAG: image_sample v{{[0-9]+}}, v[{{[0-9:]+}}], +; GCN-DAG: image_sample_c_l v{{[0-9]+}}, [{{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}}], +; GCN-DAG: image_sample v{{[0-9]+}}, [{{v[0-9]+, v[0-9]+, v[0-9]+}}], define amdgpu_ps <2 x float> @sample_contig_nsa_10vgprs(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s1, float %t1, float %r1, float %lod, float %r2, float %s2, float %t2) #0 { main_body: %zcompare.1 = fadd float %zcompare, 1.0 diff --git a/llvm/test/CodeGen/AMDGPU/occupancy-levels.ll b/llvm/test/CodeGen/AMDGPU/occupancy-levels.ll new file mode 100644 index 0000000..4f509c0 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/occupancy-levels.ll @@ -0,0 +1,288 @@ +; RUN: llc 
-march=amdgcn -mcpu=gfx900 < %s | FileCheck --check-prefixes=GCN,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GCN,GFX1010,GFX1010W32 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck --check-prefixes=GCN,GFX1010,GFX1010W64 %s + +; GCN-LABEL: {{^}}max_occupancy: +; GFX9: ; Occupancy: 10 +; GFX1010: ; Occupancy: 20 +define amdgpu_kernel void @max_occupancy() { + ret void +} + +; GCN-LABEL: {{^}}limited_occupancy_3: +; GFX9: ; Occupancy: 3 +; GFX1010W64: ; Occupancy: 3 +; GFX1010W32: ; Occupancy: 4 +define amdgpu_kernel void @limited_occupancy_3() #0 { + ret void +} + +; GCN-LABEL: {{^}}limited_occupancy_18: +; GFX9: ; Occupancy: 10 +; GFX1010: ; Occupancy: 18 +define amdgpu_kernel void @limited_occupancy_18() #1 { + ret void +} + +; GCN-LABEL: {{^}}limited_occupancy_19: +; GFX9: ; Occupancy: 10 +; GFX1010: ; Occupancy: 18 +define amdgpu_kernel void @limited_occupancy_19() #2 { + ret void +} + +; GCN-LABEL: {{^}}used_24_vgprs: +; GFX9: ; Occupancy: 10 +; GFX1010: ; Occupancy: 20 +define amdgpu_kernel void @used_24_vgprs() { + call void asm sideeffect "", "~{v23}" () + ret void +} + +; GCN-LABEL: {{^}}used_28_vgprs: +; GFX9: ; Occupancy: 9 +; GFX1010W64: ; Occupancy: 18 +; GFX1010W32: ; Occupancy: 20 +define amdgpu_kernel void @used_28_vgprs() { + call void asm sideeffect "", "~{v27}" () + ret void +} + +; GCN-LABEL: {{^}}used_32_vgprs: +; GFX9: ; Occupancy: 8 +; GFX1010W64: ; Occupancy: 16 +; GFX1010W32: ; Occupancy: 20 +define amdgpu_kernel void @used_32_vgprs() { + call void asm sideeffect "", "~{v31}" () + ret void +} + +; GCN-LABEL: {{^}}used_36_vgprs: +; GFX9: ; Occupancy: 7 +; GFX1010W64: ; Occupancy: 14 +; GFX1010W32: ; Occupancy: 20 +define amdgpu_kernel void @used_36_vgprs() { + call void asm sideeffect "", "~{v35}" () + ret void +} + +; GCN-LABEL: {{^}}used_40_vgprs: +; GFX9: ; Occupancy: 6 +; GFX1010W64: ; Occupancy: 12 +; GFX1010W32: ; Occupancy: 20 +define amdgpu_kernel void 
@used_40_vgprs() { + call void asm sideeffect "", "~{v39}" () + ret void +} + +; GCN-LABEL: {{^}}used_44_vgprs: +; GFX9: ; Occupancy: 5 +; GFX1010W64: ; Occupancy: 11 +; GFX1010W32: ; Occupancy: 20 +define amdgpu_kernel void @used_44_vgprs() { + call void asm sideeffect "", "~{v43}" () + ret void +} + +; GCN-LABEL: {{^}}used_48_vgprs: +; GFX9: ; Occupancy: 5 +; GFX1010W64: ; Occupancy: 10 +; GFX1010W32: ; Occupancy: 20 +define amdgpu_kernel void @used_48_vgprs() { + call void asm sideeffect "", "~{v47}" () + ret void +} + +; GCN-LABEL: {{^}}used_56_vgprs: +; GFX9: ; Occupancy: 4 +; GFX1010W64: ; Occupancy: 9 +; GFX1010W32: ; Occupancy: 18 +define amdgpu_kernel void @used_56_vgprs() { + call void asm sideeffect "", "~{v55}" () + ret void +} + +; GCN-LABEL: {{^}}used_64_vgprs: +; GFX9: ; Occupancy: 4 +; GFX1010W64: ; Occupancy: 8 +; GFX1010W32: ; Occupancy: 16 +define amdgpu_kernel void @used_64_vgprs() { + call void asm sideeffect "", "~{v63}" () + ret void +} + +; GCN-LABEL: {{^}}used_72_vgprs: +; GFX9: ; Occupancy: 3 +; GFX1010W64: ; Occupancy: 7 +; GFX1010W32: ; Occupancy: 14 +define amdgpu_kernel void @used_72_vgprs() { + call void asm sideeffect "", "~{v71}" () + ret void +} + +; GCN-LABEL: {{^}}used_80_vgprs: +; GFX9: ; Occupancy: 3 +; GFX1010W64: ; Occupancy: 6 +; GFX1010W32: ; Occupancy: 12 +define amdgpu_kernel void @used_80_vgprs() { + call void asm sideeffect "", "~{v79}" () + ret void +} + +; GCN-LABEL: {{^}}used_84_vgprs: +; GFX9: ; Occupancy: 3 +; GFX1010W64: ; Occupancy: 6 +; GFX1010W32: ; Occupancy: 11 +define amdgpu_kernel void @used_84_vgprs() { + call void asm sideeffect "", "~{v83}" () + ret void +} + +; GCN-LABEL: {{^}}used_88_vgprs: +; GFX9: ; Occupancy: 2 +; GFX1010W64: ; Occupancy: 5 +; GFX1010W32: ; Occupancy: 11 +define amdgpu_kernel void @used_88_vgprs() { + call void asm sideeffect "", "~{v87}" () + ret void +} + +; GCN-LABEL: {{^}}used_96_vgprs: +; GFX9: ; Occupancy: 2 +; GFX1010W64: ; Occupancy: 5 +; GFX1010W32: ; Occupancy: 10 +define 
amdgpu_kernel void @used_96_vgprs() { + call void asm sideeffect "", "~{v95}" () + ret void +} + +; GCN-LABEL: {{^}}used_100_vgprs: +; GFX9: ; Occupancy: 2 +; GFX1010W64: ; Occupancy: 5 +; GFX1010W32: ; Occupancy: 9 +define amdgpu_kernel void @used_100_vgprs() { + call void asm sideeffect "", "~{v99}" () + ret void +} + +; GCN-LABEL: {{^}}used_112_vgprs: +; GFX9: ; Occupancy: 2 +; GFX1010W64: ; Occupancy: 4 +; GFX1010W32: ; Occupancy: 9 +define amdgpu_kernel void @used_112_vgprs() { + call void asm sideeffect "", "~{v111}" () + ret void +} + +; GCN-LABEL: {{^}}used_128_vgprs: +; GFX9: ; Occupancy: 2 +; GFX1010W64: ; Occupancy: 4 +; GFX1010W32: ; Occupancy: 8 +define amdgpu_kernel void @used_128_vgprs() { + call void asm sideeffect "", "~{v127}" () + ret void +} + +; GCN-LABEL: {{^}}used_144_vgprs: +; GFX9: ; Occupancy: 1 +; GFX1010W64: ; Occupancy: 3 +; GFX1010W32: ; Occupancy: 7 +define amdgpu_kernel void @used_144_vgprs() { + call void asm sideeffect "", "~{v143}" () + ret void +} + +; GCN-LABEL: {{^}}used_168_vgprs: +; GFX9: ; Occupancy: 1 +; GFX1010W64: ; Occupancy: 3 +; GFX1010W32: ; Occupancy: 6 +define amdgpu_kernel void @used_168_vgprs() { + call void asm sideeffect "", "~{v167}" () + ret void +} + +; GCN-LABEL: {{^}}used_200_vgprs: +; GFX9: ; Occupancy: 1 +; GFX1010W64: ; Occupancy: 2 +; GFX1010W32: ; Occupancy: 5 +define amdgpu_kernel void @used_200_vgprs() { + call void asm sideeffect "", "~{v199}" () + ret void +} + +; GCN-LABEL: {{^}}used_256_vgprs: +; GFX9: ; Occupancy: 1 +; GFX1010W64: ; Occupancy: 2 +; GFX1010W32: ; Occupancy: 4 +define amdgpu_kernel void @used_256_vgprs() { + call void asm sideeffect "", "~{v255}" () + ret void +} + +; GCN-LABEL: {{^}}used_80_sgprs: +; GFX9: ; Occupancy: 10 +; GFX1010: ; Occupancy: 20 +define amdgpu_kernel void @used_80_sgprs() { + call void asm sideeffect "", "~{s79}" () + ret void +} + +; GCN-LABEL: {{^}}used_88_sgprs: +; GFX9: ; Occupancy: 9 +; GFX1010: ; Occupancy: 20 +define amdgpu_kernel void @used_88_sgprs() 
{ + call void asm sideeffect "", "~{s87}" () + ret void +} + +; GCN-LABEL: {{^}}used_100_sgprs: +; GFX9: ; Occupancy: 8 +; GFX1010: ; Occupancy: 20 +define amdgpu_kernel void @used_100_sgprs() { + call void asm sideeffect "", "~{s99}" () + ret void +} + +; GCN-LABEL: {{^}}used_101_sgprs: +; GFX9: ; Occupancy: 7 +; GFX1010: ; Occupancy: 20 +define amdgpu_kernel void @used_101_sgprs() { + call void asm sideeffect "", "~{s100}" () + ret void +} + +; GCN-LABEL: {{^}}used_lds_6552: +; GFX9: ; Occupancy: 10 +; GFX1010: ; Occupancy: 20 +@lds6552 = internal addrspace(3) global [6552 x i8] undef, align 4 +define amdgpu_kernel void @used_lds_6552() { + %p = bitcast [6552 x i8] addrspace(3)* @lds6552 to i8 addrspace(3)* + store volatile i8 1, i8 addrspace(3)* %p + ret void +} + +; GCN-LABEL: {{^}}used_lds_6556: +; GFX9: ; Occupancy: 9 +; GFX1010W64: ; Occupancy: 19 +; GFX1010W32: ; Occupancy: 20 +@lds6556 = internal addrspace(3) global [6556 x i8] undef, align 4 +define amdgpu_kernel void @used_lds_6556() { + %p = bitcast [6556 x i8] addrspace(3)* @lds6556 to i8 addrspace(3)* + store volatile i8 1, i8 addrspace(3)* %p + ret void +} + +; GCN-LABEL: {{^}}used_lds_13112: +; GFX9: ; Occupancy: 4 +; GFX1010W64: ; Occupancy: 9 +; GFX1010W32: ; Occupancy: 19 +@lds13112 = internal addrspace(3) global [13112 x i8] undef, align 4 +define amdgpu_kernel void @used_lds_13112() { + %p = bitcast [13112 x i8] addrspace(3)* @lds13112 to i8 addrspace(3)* + store volatile i8 1, i8 addrspace(3)* %p + ret void +} + +attributes #0 = { "amdgpu-waves-per-eu"="2,3" } +attributes #1 = { "amdgpu-waves-per-eu"="18,18" } +attributes #2 = { "amdgpu-waves-per-eu"="19,19" } diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll index f4715a2..a71ca5d 100644 --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -920,7 +920,7 @@ main_body: ; GCN-LABEL: {{^}}test_vgprblocks_w64_attr: ; Test that the wave size can be overridden in function attributes 
and that the block size is correct as a result -; GFX10DEFWAVE: ; VGPRBlocks: 11 +; GFX10DEFWAVE: ; VGPRBlocks: 2 define amdgpu_gs float @test_vgprblocks_w64_attr(float %a, float %b, float %c, float %d, float %e, float %f, float %g, float %h, float %i, float %j, float %k, float %l) #4 { main_body: -- 2.7.4