From 8382ce5f1b099e4cf8b1e15fe9efb6963740b6cc Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 12 Sep 2019 23:46:46 +0000 Subject: [PATCH] AMDGPU: Inline constant when materializing FI with add on gfx9 This was relying on the SGPR usable for the carry out clobber to also be used for the input. There was no carry out on gfx9. With no carry out clobber to worry about, the literal can just be directly used with a VOP2 add. llvm-svn: 371791 --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 2 +- llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 7 +++- .../test/CodeGen/AMDGPU/frame-index-elimination.ll | 8 ++-- .../test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir | 43 ++++++++++++++++++++++ 4 files changed, 53 insertions(+), 7 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index fe57435..3386f80 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -6110,7 +6110,7 @@ MachineInstrBuilder SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB, Register DestReg, RegScavenger &RS) const { if (ST.hasAddNoCarry()) - return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg); + return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e32), DestReg); Register UnusedCarry = RS.scavengeRegister(RI.getBoolRC(), I, 0, false); // TODO: Users need to deal with this. diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 235de00..7967d9c 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -1285,12 +1285,15 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, .addImm(ST.getWavefrontSizeLog2()) .addReg(DiffReg, RegState::Kill); + const bool IsVOP2 = MIB->getOpcode() == AMDGPU::V_ADD_U32_e32; + // TODO: Fold if use instruction is another add of a constant. 
- if (AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) { + if (IsVOP2 || AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) { // FIXME: This can fail MIB.addImm(Offset); MIB.addReg(ScaledReg, RegState::Kill); - MIB.addImm(0); // clamp bit + if (!IsVOP2) + MIB.addImm(0); // clamp bit } else { Register ConstOffsetReg = RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MIB, 0, false); diff --git a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll index 0d8e745..b25a40f 100644 --- a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll +++ b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll @@ -176,13 +176,13 @@ ret: ; Added offset can't be used with VOP3 add ; GCN-LABEL: {{^}}func_other_fi_user_non_inline_imm_offset_i32: ; GCN: s_sub_u32 [[SUB:s[0-9]+|vcc_lo|vcc_hi]], s32, s33 -; GCN-DAG: s_movk_i32 [[K:s[0-9]+|vcc_lo|vcc_hi]], 0x200 +; CI-DAG: s_movk_i32 [[K:s[0-9]+|vcc_lo|vcc_hi]], 0x200 ; CI-DAG: v_lshr_b32_e64 [[SCALED:v[0-9]+]], [[SUB]], 6 ; CI: v_add_i32_e32 [[VZ:v[0-9]+]], vcc, [[K]], [[SCALED]] ; GFX9-DAG: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, [[SUB]] -; GFX9: v_add_u32_e32 [[VZ:v[0-9]+]], [[K]], [[SCALED]] +; GFX9: v_add_u32_e32 [[VZ:v[0-9]+]], 0x200, [[SCALED]] ; GCN: v_mul_u32_u24_e32 [[VZ]], 9, [[VZ]] ; GCN: ds_write_b32 v0, [[VZ]] @@ -200,13 +200,13 @@ define void @func_other_fi_user_non_inline_imm_offset_i32() #0 { ; GCN-LABEL: {{^}}func_other_fi_user_non_inline_imm_offset_i32_vcc_live: ; GCN: s_sub_u32 [[DIFF:s[0-9]+]], s32, s33 -; GCN-DAG: s_movk_i32 [[OFFSET:s[0-9]+]], 0x200 +; CI-DAG: s_movk_i32 [[OFFSET:s[0-9]+]], 0x200 ; CI-DAG: v_lshr_b32_e64 [[SCALED:v[0-9]+]], [[DIFF]], 6 ; CI: v_add_i32_e64 [[VZ:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, [[OFFSET]], [[SCALED]] ; GFX9-DAG: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, [[DIFF]] -; GFX9: v_add_u32_e32 [[VZ:v[0-9]+]], [[OFFSET]], [[SCALED]] +; GFX9: v_add_u32_e32 [[VZ:v[0-9]+]], 0x200, [[SCALED]] ; GCN: v_mul_u32_u24_e32 
[[VZ]], 9, [[VZ]] ; GCN: ds_write_b32 v0, [[VZ]] diff --git a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir new file mode 100644 index 0000000..90afb18 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir @@ -0,0 +1,43 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck %s + +# Test what happens when an SGPR is unavailable for the unused add. The non-inline constant needs to be folded into the add instruction and not materialized in a register. + +--- +name: scavenge_sgpr_pei_no_sgprs +tracksRegLiveness: true + +stack: + - { id: 0, type: default, offset: 0, size: 4, alignment: 8192 } + - { id: 1, type: default, offset: 0, size: 4, alignment: 8192 } + +machineFunctionInfo: + isEntryFunction: false + scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 + scratchWaveOffsetReg: $sgpr34 + frameOffsetReg: $sgpr33 + stackPtrOffsetReg: $sgpr32 + +body: | + bb.0: + liveins: $vgpr1 + + ; CHECK-LABEL: name: scavenge_sgpr_pei_no_sgprs + ; CHECK: liveins: $vgpr1 + ; CHECK: $sgpr27 = frame-setup COPY $sgpr33 + ; CHECK: $sgpr4 = frame-setup S_ADD_U32 $sgpr32, 524224, implicit-def $scc + ; CHECK: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294443008, implicit-def $scc + ; CHECK: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 1572864, implicit-def $scc + ; CHECK: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, 
implicit-def $sgpr17, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc + ; CHECK: $sgpr33 = S_SUB_U32 $sgpr33, $sgpr34, implicit-def $scc + ; CHECK: $vgpr3 = V_LSHRREV_B32_e64 6, killed $sgpr33, implicit $exec + ; CHECK: $vgpr2 = V_ADD_U32_e32 8192, killed $vgpr3, implicit $exec + ; CHECK: $sgpr33 = S_ADD_U32 $sgpr33, $sgpr34, implicit-def $scc + ; CHECK: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 + ; CHECK: $sgpr32 = frame-destroy S_SUB_U32 $sgpr32, 1572864, implicit-def $scc + ; CHECK: $sgpr33 = frame-setup COPY $sgpr27 + ; CHECK: S_ENDPGM 0, implicit $vcc + S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc + $vgpr0 = V_OR_B32_e32 %stack.1, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit 
$sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 + S_ENDPGM 0, implicit $vcc +... -- 2.7.4