From 08140d165e3d5b2257b26099469f7ae10c284dc7 Mon Sep 17 00:00:00 2001 From: Christudasan Devadasan Date: Thu, 11 May 2023 21:43:18 +0530 Subject: [PATCH] [AMDGPU] Add optional tied-op for wwm-register's epilog spill restore The COPY inserted in the epilog block before return instruction as part of ABI lowering, can get optimized during machine copy propagation if the same register is used earlier in a wwm operation that demands the prolog/epilog wwm-spill store/restore to preserve its inactive lanes. With the spill restore in the epilog, the preceding COPY appears to be dead during machine-cp. To avoid it, mark the same register as a tied-op in the spill restore instruction to ensure a usage for the COPY. Reviewed By: arsenm Differential Revision: https://reviews.llvm.org/D150381 --- llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 27 ++++ .../CodeGen/AMDGPU/preserve-only-inactive-lane.mir | 2 +- .../tied-op-for-wwm-scratch-reg-spill-restore.mir | 142 +++++++++++++++++++++ 3 files changed, 170 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/AMDGPU/tied-op-for-wwm-scratch-reg-spill-restore.mir diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 5d7a99a..a3f89c0 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -1644,6 +1644,33 @@ void SIRegisterInfo::buildSpillLoadStore( if (NeedSuperRegImpOperand && (IsFirstSubReg || IsLastSubReg)) MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState); + + // The epilog restore of a wwm-scratch register can cause undesired + // optimization during machine-cp post PrologEpilogInserter if the same + // register was assigned for return value ABI lowering with a COPY + // instruction. As given below, with the epilog reload, the earlier COPY + // appeared to be dead during machine-cp. + // ... + // v0 in WWM operation, needs the WWM spill at prolog/epilog. + // $vgpr0 = V_WRITELANE_B32 $sgpr20, 0, $vgpr0 + // ... + // Epilog block: + // $vgpr0 = COPY $vgpr1 // outgoing value moved to v0 + // ... + // WWM spill restore to preserve the inactive lanes of v0. + // $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1 + // $vgpr0 = BUFFER_LOAD $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0 + // $exec = S_MOV_B64 killed $sgpr4_sgpr5 + // ... + // SI_RETURN implicit $vgpr0 + // ... + // To fix it, mark the same reg as a tied op for such restore instructions + // so that it marks a usage for the preceding COPY. + if (!IsStore && MI != MBB.end() && MI->isReturn() && + MI->readsRegister(SubReg, this)) { + MIB.addReg(SubReg, RegState::Implicit); + MIB->tieOperands(0, MIB->getNumOperands() - 1); + } } if (ScratchOffsetRegDelta != 0) { diff --git a/llvm/test/CodeGen/AMDGPU/preserve-only-inactive-lane.mir b/llvm/test/CodeGen/AMDGPU/preserve-only-inactive-lane.mir index f06133c..2f36bba 100644 --- a/llvm/test/CodeGen/AMDGPU/preserve-only-inactive-lane.mir +++ b/llvm/test/CodeGen/AMDGPU/preserve-only-inactive-lane.mir @@ -28,7 +28,7 @@ body: | ; GCN-NEXT: $sgpr35 = V_READLANE_B32 $vgpr0, 0 ; GCN-NEXT: renamable $vgpr0 = V_MOV_B32_e32 10, implicit $exec ; GCN-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; GCN-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) + ; GCN-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit $vgpr0(tied-def 0) :: (load (s32) from %stack.0, addrspace 5) ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 ; GCN-NEXT: S_SETPC_B64_return killed renamable $sgpr30_sgpr31, implicit $vgpr0 renamable $vgpr0 = V_WRITELANE_B32 $sgpr35, 0, killed $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/tied-op-for-wwm-scratch-reg-spill-restore.mir b/llvm/test/CodeGen/AMDGPU/tied-op-for-wwm-scratch-reg-spill-restore.mir new file mode 100644 index 0000000..c4fde5d --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/tied-op-for-wwm-scratch-reg-spill-restore.mir @@ -0,0 +1,142 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -run-pass=prologepilog,machine-cp -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s + +# The COPY that moves the return value to VGPR0 should not be removed during machine-cp. The spill restore of the same register that follows, +# meant to only reload its inactive lanes. By marking the reg itself as the tied-op in the spill reload prevents the undesired optimization. + +--- +name: wwm_scratch_reg_spill_reload_of_outgoing_reg +tracksRegLiveness: true +machineFunctionInfo: + wwmReservedRegs: ['$vgpr0'] + isEntryFunction: false + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + stackPtrOffsetReg: '$sgpr32' + frameOffsetReg: '$sgpr33' +body: | + bb.0: + liveins: $sgpr20, $vgpr1 + ; GCN-LABEL: name: wwm_scratch_reg_spill_reload_of_outgoing_reg + ; GCN: liveins: $sgpr20, $vgpr0, $vgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) + ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 + ; GCN-NEXT: $vgpr0 = IMPLICIT_DEF + ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr20, 0, $vgpr0 + ; GCN-NEXT: $vgpr0 = COPY killed renamable $vgpr1, implicit $exec + ; GCN-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit $vgpr0(tied-def 0) :: (load (s32) from %stack.0, addrspace 5) + ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 + ; GCN-NEXT: SI_RETURN implicit $vgpr0 + $vgpr0 = IMPLICIT_DEF + $vgpr0 = V_WRITELANE_B32 killed $sgpr20, 0, $vgpr0 + $vgpr0 = COPY killed renamable $vgpr1, implicit $exec + SI_RETURN implicit $vgpr0 +... + +# The reload of vgpr0 require the tied-op as it is a subreg in the outgoing tuple register vgpr0_vgpr1. +# The vgpr2 doesn't need the tied-op in the reload as it isn't holding any return value. +--- +name: wwm_scratch_reg_spill_reload_of_outgoing_tuple_subreg +tracksRegLiveness: true +machineFunctionInfo: + wwmReservedRegs: ['$vgpr0', '$vgpr2'] + isEntryFunction: false + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + stackPtrOffsetReg: '$sgpr32' + frameOffsetReg: '$sgpr33' +body: | + bb.0: + liveins: $sgpr20, $sgpr21, $vgpr1 + ; GCN-LABEL: name: wwm_scratch_reg_spill_reload_of_outgoing_tuple_subreg + ; GCN: liveins: $sgpr20, $sgpr21, $vgpr0, $vgpr1, $vgpr2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) + ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) + ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 + ; GCN-NEXT: $vgpr0 = IMPLICIT_DEF + ; GCN-NEXT: $vgpr2 = IMPLICIT_DEF + ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr20, 0, $vgpr0 + ; GCN-NEXT: $vgpr2 = V_WRITELANE_B32 killed $sgpr21, 0, $vgpr2 + ; GCN-NEXT: $vgpr0 = COPY $vgpr1, implicit $exec + ; GCN-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit $vgpr0(tied-def 0) :: (load (s32) from %stack.0, addrspace 5) + ; GCN-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) + ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 + ; GCN-NEXT: SI_RETURN implicit $vgpr0_vgpr1 + $vgpr0 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $vgpr0 = V_WRITELANE_B32 killed $sgpr20, 0, $vgpr0 + $vgpr2 = V_WRITELANE_B32 killed $sgpr21, 0, $vgpr2 + $vgpr0 = COPY $vgpr1, implicit $exec + SI_RETURN implicit $vgpr0_vgpr1 +... + +# Tied op not required in the spill reload of vgpr2. + +--- +name: wwm_scratch_reg_spill_reload_different_outgoing_reg +tracksRegLiveness: true +machineFunctionInfo: + wwmReservedRegs: ['$vgpr2'] + isEntryFunction: false + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + stackPtrOffsetReg: '$sgpr32' + frameOffsetReg: '$sgpr33' +body: | + bb.0: + liveins: $sgpr20, $vgpr1 + ; GCN-LABEL: name: wwm_scratch_reg_spill_reload_different_outgoing_reg + ; GCN: liveins: $sgpr20, $vgpr1, $vgpr2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) + ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 + ; GCN-NEXT: $vgpr2 = IMPLICIT_DEF + ; GCN-NEXT: $vgpr2 = V_WRITELANE_B32 killed $sgpr20, 0, $vgpr2 + ; GCN-NEXT: $vgpr0 = COPY $vgpr1, implicit $exec + ; GCN-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) + ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 + ; GCN-NEXT: SI_RETURN implicit $vgpr0_vgpr1 + $vgpr2 = IMPLICIT_DEF + $vgpr2 = V_WRITELANE_B32 killed $sgpr20, 0, $vgpr2 + $vgpr0 = COPY $vgpr1, implicit $exec + SI_RETURN implicit $vgpr0_vgpr1 +... + +# Tied op not required in the spill reload of vgpr40 which is in the CSR range. +--- +name: wwm_csr_spill_reload +tracksRegLiveness: true +machineFunctionInfo: + wwmReservedRegs: ['$vgpr40'] + isEntryFunction: false + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + stackPtrOffsetReg: '$sgpr32' + frameOffsetReg: '$sgpr33' +body: | + bb.0: + liveins: $sgpr20, $vgpr1 + ; GCN-LABEL: name: wwm_csr_spill_reload + ; GCN: liveins: $sgpr20, $vgpr1, $vgpr40 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) + ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 + ; GCN-NEXT: $vgpr40 = IMPLICIT_DEF + ; GCN-NEXT: $vgpr40 = V_WRITELANE_B32 killed $sgpr20, 0, $vgpr40 + ; GCN-NEXT: $sgpr20 = V_READLANE_B32 $vgpr40, 0, implicit $exec + ; GCN-NEXT: $vgpr0 = COPY killed $vgpr1, implicit $exec + ; GCN-NEXT: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: $vgpr40 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) + ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 + ; GCN-NEXT: SI_RETURN implicit $vgpr0 + $vgpr40 = IMPLICIT_DEF + $vgpr40 = V_WRITELANE_B32 killed $sgpr20, 0, $vgpr40 + $sgpr20 = V_READLANE_B32 $vgpr40, 0, implicit $exec + $vgpr0 = COPY killed $vgpr1, implicit $exec + SI_RETURN implicit $vgpr0 +... -- 2.7.4