From 5db8d6fd2bd6db3cdc6290b538f054ca0833d763 Mon Sep 17 00:00:00 2001
From: Ivan Kosarev
Date: Mon, 5 Sep 2022 14:13:12 +0100
Subject: [PATCH] [AMDGPU][CodeGen] Support (base | offset) SMEM loads.

Prevents generation of unnecessary s_or_b32 instructions.
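
The OR can be treated as an ADD whenever the constant's set bits are known
to be zero in the base, so the constant can be folded into the immediate
operand of the SMEM load instead of being materialised with s_or_b32. A
sketch of the pattern, abridged from the updated test below (the
s.buffer.load call and the value names are illustrative):

  %shift = shl i32 %i, 7    ; the low 7 bits of %shift are known to be zero
  %off = or i32 %shift, 5   ; hence equivalent to add i32 %shift, 5
  %val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 %off, i32 0)

With this change both the SelectionDAG and GlobalISel paths select
S_BUFFER_LOAD_DWORD_SGPR_IMM with %shift as the SGPR offset and 5 as the
immediate offset.
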
Reviewed By: foad

Differential Revision: https://reviews.llvm.org/D132552
---
 llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp     |  9 ++++++++-
 llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h       |  4 +++-
 llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp        |  3 ++-
 .../lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp  | 21 ++++++++++-----------
 .../CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll    |  7 ++-----
 5 files changed, 25 insertions(+), 19 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
index 1bbdc39..4b71d11 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
@@ -8,6 +8,7 @@
 
 #include "AMDGPUGlobalISelUtils.h"
 #include "GCNSubtarget.h"
+#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/Support/LowLevelTypeImpl.h"
@@ -16,7 +17,8 @@ using namespace llvm;
 using namespace MIPatternMatch;
 
 std::pair<Register, unsigned>
-AMDGPU::getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) {
+AMDGPU::getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg,
+                                  GISelKnownBits *KnownBits) {
   MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
   if (!Def)
     return std::make_pair(Reg, 0);
@@ -43,6 +45,11 @@ AMDGPU::getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) {
     return std::make_pair(Def->getOperand(1).getReg(), Offset);
   }
 
+  Register Base;
+  if (KnownBits && mi_match(Reg, MRI, m_GOr(m_Reg(Base), m_ICst(Offset))) &&
+      KnownBits->maskedValueIsZero(Base, APInt(32, Offset)))
+    return std::make_pair(Base, Offset);
+
   // Handle G_PTRTOINT (G_PTR_ADD base, const) case
   if (Def->getOpcode() == TargetOpcode::G_PTRTOINT) {
     MachineInstr *Base;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h
index 5c600d0..9f7c00b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h
@@ -17,13 +17,15 @@ namespace llvm {
 
 class MachineRegisterInfo;
 class GCNSubtarget;
+class GISelKnownBits;
 class LLT;
 
 namespace AMDGPU {
 
 /// Returns base register and constant offset.
 std::pair<Register, unsigned>
-getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg);
+getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg,
+                          GISelKnownBits *KnownBits = nullptr);
 
 bool isLegalVOP3PShuffleMask(ArrayRef<int> Mask);
 bool hasAtomicFaddRtnForTy(const GCNSubtarget &Subtarget, const LLT &Ty);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 5972e90..8425948 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1987,7 +1987,8 @@ bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase,
 
   // A 32-bit (address + offset) should not cause unsigned 32-bit integer
   // wraparound, because s_load instructions perform the addition in 64 bits.
-  if (Addr.getValueType() == MVT::i32 && !Addr->getFlags().hasNoUnsignedWrap())
+  if (Addr.getValueType() == MVT::i32 && Addr.getOpcode() == ISD::ADD &&
+      !Addr->getFlags().hasNoUnsignedWrap())
     return false;
 
   SDValue N0, N1;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index d85bf107f..1fa7596 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1438,7 +1438,7 @@ bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
         .addImm(0);
   } else {
     std::tie(BaseOffset, ImmOffset) =
-        AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset);
+        AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset, KnownBits);
 
     if (Readfirstlane) {
       // We have the constant offset now, so put the readfirstlane back on the
@@ -2654,15 +2654,14 @@ bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
 /// Return the register to use for the index value, and the subregister to use
 /// for the indirectly accessed register.
 static std::pair<Register, unsigned>
-computeIndirectRegIndex(MachineRegisterInfo &MRI,
-                        const SIRegisterInfo &TRI,
-                        const TargetRegisterClass *SuperRC,
-                        Register IdxReg,
-                        unsigned EltSize) {
+computeIndirectRegIndex(MachineRegisterInfo &MRI, const SIRegisterInfo &TRI,
+                        const TargetRegisterClass *SuperRC, Register IdxReg,
+                        unsigned EltSize, GISelKnownBits &KnownBits) {
   Register IdxBaseReg;
   int Offset;
 
-  std::tie(IdxBaseReg, Offset) = AMDGPU::getBaseWithConstantOffset(MRI, IdxReg);
+  std::tie(IdxBaseReg, Offset) =
+      AMDGPU::getBaseWithConstantOffset(MRI, IdxReg, &KnownBits);
   if (IdxBaseReg == AMDGPU::NoRegister) {
     // This will happen if the index is a known constant. This should ordinarily
     // be legalized out, but handle it as a register just in case.
@@ -2713,8 +2712,8 @@ bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
   const bool Is64 = DstTy.getSizeInBits() == 64;
 
   unsigned SubReg;
-  std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, SrcRC, IdxReg,
-                                                     DstTy.getSizeInBits() / 8);
+  std::tie(IdxReg, SubReg) = computeIndirectRegIndex(
+      *MRI, TRI, SrcRC, IdxReg, DstTy.getSizeInBits() / 8, *KnownBits);
 
   if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
     if (DstTy.getSizeInBits() != 32 && !Is64)
@@ -2795,7 +2794,7 @@ bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
 
   unsigned SubReg;
   std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg,
-                                                     ValSize / 8);
+                                                     ValSize / 8, *KnownBits);
 
   const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
                          STI.useVGPRIndexMode();
@@ -4918,7 +4917,7 @@ AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const {
   Register SOffset;
   unsigned Offset;
   std::tie(SOffset, Offset) =
-      AMDGPU::getBaseWithConstantOffset(*MRI, Root.getReg());
+      AMDGPU::getBaseWithConstantOffset(*MRI, Root.getReg(), KnownBits);
   if (!SOffset)
     return None;
 
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll b/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll
index b10980f..2d04e10 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll
@@ -110,7 +110,6 @@ define amdgpu_cs void @test_buffer_load_sgpr_plus_imm_offset(<4 x i32> inreg %ba
   ret void
 }
 
-; TODO: Select S_BUFFER_LOAD_DWORD_SGPR_IMM here.
 ; GCN-LABEL: name: test_buffer_load_sgpr_or_imm_offset
 ; SDAG-DAG: %[[BASE0:.*]]:sgpr_32 = COPY $sgpr0
 ; SDAG-DAG: %[[BASE1:.*]]:sgpr_32 = COPY $sgpr1
@@ -118,18 +117,16 @@ define amdgpu_cs void @test_buffer_load_sgpr_plus_imm_offset(<4 x i32> inreg %ba
 ; SDAG-DAG: %[[BASE3:.*]]:sgpr_32 = COPY $sgpr3
 ; SDAG-DAG: %[[INDEX:.*]]:sgpr_32 = COPY $sgpr4
 ; SDAG-DAG: %[[SHIFT:.*]]:sreg_32 = S_LSHL_B32 %[[INDEX]],
-; SDAG-DAG: %[[OR:.*]]:sreg_32 = S_OR_B32 killed %[[SHIFT]],
 ; SDAG-DAG: %[[BASE:.*]]:sgpr_128 = REG_SEQUENCE %[[BASE0]], %subreg.sub0, %[[BASE1]], %subreg.sub1, %[[BASE2]], %subreg.sub2, %[[BASE3]], %subreg.sub3
-; SDAG: S_BUFFER_LOAD_DWORD_SGPR killed %[[BASE]], killed %[[OR]], 0
+; SDAG: S_BUFFER_LOAD_DWORD_SGPR_IMM killed %[[BASE]], killed %[[SHIFT]], 5,
 ; GISEL-DAG: %[[BASE0:.*]]:sreg_32 = COPY $sgpr0
 ; GISEL-DAG: %[[BASE1:.*]]:sreg_32 = COPY $sgpr1
 ; GISEL-DAG: %[[BASE2:.*]]:sreg_32 = COPY $sgpr2
 ; GISEL-DAG: %[[BASE3:.*]]:sreg_32 = COPY $sgpr3
 ; GISEL-DAG: %[[INDEX:.*]]:sreg_32 = COPY $sgpr4
 ; GISEL-DAG: %[[SHIFT:.*]]:sreg_32 = S_LSHL_B32 %[[INDEX]],
-; GISEL-DAG: %[[OR:.*]]:sreg_32 = S_OR_B32 %[[SHIFT]],
 ; GISEL-DAG: %[[BASE:.*]]:sgpr_128 = REG_SEQUENCE %[[BASE0]], %subreg.sub0, %[[BASE1]], %subreg.sub1, %[[BASE2]], %subreg.sub2, %[[BASE3]], %subreg.sub3
-; GISEL: S_BUFFER_LOAD_DWORD_SGPR_IMM %[[BASE]], %[[OR]], 0,
+; GISEL: S_BUFFER_LOAD_DWORD_SGPR_IMM %[[BASE]], %[[SHIFT]], 5,
 define amdgpu_cs void @test_buffer_load_sgpr_or_imm_offset(<4 x i32> inreg %base, i32 inreg %i, i32 addrspace(1)* inreg %out) {
   %shift = shl i32 %i, 7
   %off = or i32 %shift, 5
-- 
2.7.4