From e7f080b3598d7d73456954554d173146b2744953 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin
Date: Wed, 18 Jan 2023 11:58:50 -0800
Subject: [PATCH] [AMDGPU] Introduce separate register limit bias in scheduler

The current implementation abuses ErrorMargin to apply an additional
bias to the VGPR and SGPR limits under high register pressure. The
ErrorMargin exists to account for inaccuracies of the RP tracker, not
to tackle excess pressure. Introduce a separate bias for this purpose
and make it distinct for SGPRs and VGPRs, as we may want to use
different values in the future. This is supposed to be NFC; however,
there is a subtle difference when subtracting a margin overflows the
limit. Doing two subtractions makes this less probable, although it
manifests only in MIR tests with an artificially small register budget.

Differential Revision: https://reviews.llvm.org/D142051
---
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp                | 13 ++++++++++---
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.h                  | 12 +++++++++---
 .../AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll |  4 ++--
 llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll        |  2 +-
 4 files changed, 22 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 5e55186..ae119e0 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -73,12 +73,18 @@ void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) {
   VGPRCriticalLimit =
       std::min(ST.getMaxNumVGPRs(TargetOccupancy), VGPRExcessLimit);
 
-  // Subtract error margin from register limits and avoid overflow.
+  // Subtract error margin and bias from register limits and avoid overflow.
+  SGPRCriticalLimit =
+      std::min(SGPRCriticalLimit - SGPRLimitBias, SGPRCriticalLimit);
   SGPRCriticalLimit =
       std::min(SGPRCriticalLimit - ErrorMargin, SGPRCriticalLimit);
   VGPRCriticalLimit =
+      std::min(VGPRCriticalLimit - VGPRLimitBias, VGPRCriticalLimit);
+  VGPRCriticalLimit =
       std::min(VGPRCriticalLimit - ErrorMargin, VGPRCriticalLimit);
+  SGPRExcessLimit = std::min(SGPRExcessLimit - SGPRLimitBias, SGPRExcessLimit);
   SGPRExcessLimit = std::min(SGPRExcessLimit - ErrorMargin, SGPRExcessLimit);
+  VGPRExcessLimit = std::min(VGPRExcessLimit - VGPRLimitBias, VGPRExcessLimit);
   VGPRExcessLimit = std::min(VGPRExcessLimit - ErrorMargin, VGPRExcessLimit);
 }
 
@@ -670,7 +676,8 @@ bool UnclusteredHighRPStage::initGCNSchedStage() {
   InitialOccupancy = DAG.MinOccupancy;
   // Aggressivly try to reduce register pressure in the unclustered high RP
   // stage. Temporarily increase occupancy target in the region.
-  S.ErrorMargin = S.HighRPErrorMargin;
+  S.SGPRLimitBias = S.HighRPSGPRBias;
+  S.VGPRLimitBias = S.HighRPVGPRBias;
   if (MFI.getMaxWavesPerEU() > DAG.MinOccupancy)
     MFI.increaseOccupancy(MF, ++DAG.MinOccupancy);
 
@@ -735,7 +742,7 @@ void GCNSchedStage::finalizeGCNSchedStage() {
 
 void UnclusteredHighRPStage::finalizeGCNSchedStage() {
   SavedMutations.swap(DAG.Mutations);
-  S.ErrorMargin = S.DefaultErrorMargin;
+  S.SGPRLimitBias = S.VGPRLimitBias = 0;
   if (DAG.MinOccupancy > InitialOccupancy) {
     for (unsigned IDX = 0; IDX < DAG.Pressure.size(); ++IDX)
       DAG.RegionsWithMinOcc[IDX] =
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index d7b161c..30a62f61 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -78,16 +78,22 @@ public:
   // An error margin is necessary because of poor performance of the generic RP
   // tracker and can be adjusted up for tuning heuristics to try and more
   // aggressively reduce register pressure.
-  const unsigned DefaultErrorMargin = 3;
+  unsigned ErrorMargin = 3;
 
-  const unsigned HighRPErrorMargin = 10;
+  // Bias for SGPR limits under a high register pressure.
+  const unsigned HighRPSGPRBias = 7;
 
-  unsigned ErrorMargin = DefaultErrorMargin;
+  // Bias for VGPR limits under a high register pressure.
+  const unsigned HighRPVGPRBias = 7;
 
   unsigned SGPRCriticalLimit;
 
   unsigned VGPRCriticalLimit;
 
+  unsigned SGPRLimitBias = 0;
+
+  unsigned VGPRLimitBias = 0;
+
   GCNSchedStrategy(const MachineSchedContext *C);
 
   SUnit *pickNode(bool &IsTopNode) override;
diff --git a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
index 29876a2..785f8ce 100644
--- a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
@@ -18,8 +18,8 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
   ; REGALLOC-GFX908-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY [[COPY]]
   ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef %14:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
   ; REGALLOC-GFX908-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4)
-  ; REGALLOC-GFX908-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
   ; REGALLOC-GFX908-NEXT: [[COPY2:%[0-9]+]]:areg_128 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3
+  ; REGALLOC-GFX908-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
   ; REGALLOC-GFX908-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec
   ; REGALLOC-GFX908-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128 = V_MFMA_I32_4X4X4I8_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY2]], 0, 0, 0, implicit $mode, implicit $exec
   ; REGALLOC-GFX908-NEXT: [[SI_SPILL_V64_RESTORE:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5)
@@ -43,8 +43,8 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
   ; PEI-GFX908-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed renamable $agpr0_agpr1_agpr2_agpr3, implicit $exec
   ; PEI-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
   ; PEI-GFX908-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4)
-  ; PEI-GFX908-NEXT: renamable $vgpr0 = V_MOV_B32_e32 1, implicit $exec
   ; PEI-GFX908-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec
+  ; PEI-GFX908-NEXT: renamable $vgpr0 = V_MOV_B32_e32 1, implicit $exec
   ; PEI-GFX908-NEXT: renamable $vgpr1 = V_MOV_B32_e32 2, implicit $exec
   ; PEI-GFX908-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_I32_4X4X4I8_e64 killed $vgpr0, killed $vgpr1, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
   ; PEI-GFX908-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr8_sgpr9_sgpr10_sgpr11, 0, 4, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5)
diff --git a/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll b/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll
index e0e8dc8..7939798 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll
@@ -8,9 +8,9 @@ define amdgpu_kernel void @test_spill_av_class(<4 x i32> %arg) #0 {
   ; GCN-NEXT: liveins: $sgpr4_sgpr5
   ; GCN-NEXT: {{ $}}
   ; GCN-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4)
+  ; GCN-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3
   ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
   ; GCN-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec
-  ; GCN-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3
   ; GCN-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128 = V_MFMA_I32_4X4X4I8_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY]], 0, 0, 0, implicit $mode, implicit $exec
   ; GCN-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 1769482 /* regdef:VGPR_32 */, def undef %22.sub0
   ; GCN-NEXT: undef %24.sub0:av_64 = COPY %22.sub0
-- 
2.7.4
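
Side note, as a standalone sketch of the overflow subtlety mentioned in the
commit message (not part of the patch; the helper names below are made up and
only the clamped-subtraction arithmetic mirrors GCNSchedStrategy.cpp). In the
high-RP stage the new bias (7) plus the default margin (3) equals the old
HighRPErrorMargin (10), so the limits normally come out the same. They differ
only when a subtraction would wrap the unsigned limit: two clamped
subtractions still apply whatever part of the reduction fits, while a single
combined subtraction clamps back to the original limit.

  #include <algorithm>
  #include <cstdio>

  // Hypothetical helpers illustrating the std::min clamping pattern.
  static unsigned applySeparately(unsigned Limit, unsigned Bias,
                                  unsigned Margin) {
    // Each step clamps to the pre-subtraction value if the unsigned
    // subtraction would wrap (Bias or Margin larger than Limit).
    Limit = std::min(Limit - Bias, Limit);
    Limit = std::min(Limit - Margin, Limit);
    return Limit;
  }

  static unsigned applyCombined(unsigned Limit, unsigned Bias,
                                unsigned Margin) {
    // The old behavior folded the bias into one margin: the single
    // subtraction wraps, and therefore clamps back to Limit, as soon as
    // Bias + Margin > Limit.
    return std::min(Limit - (Bias + Margin), Limit);
  }

  int main() {
    // Comfortable budget: both forms print 94, i.e. the change is NFC.
    std::printf("%u %u\n", applySeparately(104, 7, 3), applyCombined(104, 7, 3));
    // Artificially small budget (as in some MIR tests): prints 1 vs. 8.
    std::printf("%u %u\n", applySeparately(8, 7, 3), applyCombined(8, 7, 3));
    return 0;
  }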