From: Marek Olsak
Date: Thu, 9 Nov 2017 01:52:17 +0000 (+0000)
Subject: AMDGPU: Fold immediate offset into BUFFER_LOAD_DWORD lowered from SMEM
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=ffadcb744bc3d8ce0ca516bab09445643c1061a5;p=platform%2Fupstream%2Fllvm.git

AMDGPU: Fold immediate offset into BUFFER_LOAD_DWORD lowered from SMEM

Summary: -5.3% code size in affected shaders.

Changed stats only:

48486 shaders in 30489 tests
Totals:
SGPRS: 2086406 -> 2072430 (-0.67 %)
VGPRS: 1626872 -> 1627960 (0.07 %)
Spilled SGPRs: 7865 -> 7912 (0.60 %)
Code Size: 60978060 -> 60188764 (-1.29 %) bytes
Max Waves: 374530 -> 374342 (-0.05 %)

Totals from affected shaders:
SGPRS: 299664 -> 285688 (-4.66 %)
VGPRS: 233844 -> 234932 (0.47 %)
Spilled SGPRs: 3959 -> 4006 (1.19 %)
Code Size: 14905272 -> 14115976 (-5.30 %) bytes
Max Waves: 46202 -> 46014 (-0.41 %)

Reviewers: arsenm, nhaehnle

Subscribers: kzhuravl, wdng, yaxunl, dstuttard, tpr, llvm-commits, t-tye

Differential Revision: https://reviews.llvm.org/D38915

llvm-svn: 317750
---

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index f04efd71..d256261 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -983,14 +983,6 @@ bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
   return true;
 }
 
-static bool isLegalMUBUFImmOffset(unsigned Imm) {
-  return isUInt<12>(Imm);
-}
-
-static bool isLegalMUBUFImmOffset(const ConstantSDNode *Imm) {
-  return isLegalMUBUFImmOffset(Imm->getZExtValue());
-}
-
 bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
                                      SDValue &VAddr, SDValue &SOffset,
                                      SDValue &Offset, SDValue &Offen,
@@ -1032,7 +1024,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
       Ptr = N0;
     }
 
-    if (isLegalMUBUFImmOffset(C1)) {
+    if (SIInstrInfo::isLegalMUBUFImmOffset(C1->getZExtValue())) {
       Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
       return true;
     }
@@ -1142,7 +1134,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
   if (ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
     unsigned Imm = CAddr->getZExtValue();
 
-    assert(!isLegalMUBUFImmOffset(Imm) &&
+    assert(!SIInstrInfo::isLegalMUBUFImmOffset(Imm) &&
            "should have been selected by other pattern");
 
     SDValue HighBits = CurDAG->getTargetConstant(Imm & ~4095, DL, MVT::i32);
@@ -1169,7 +1161,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
 
     // Offsets in vaddr must be positive.
     ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
-    if (isLegalMUBUFImmOffset(C1)) {
+    if (SIInstrInfo::isLegalMUBUFImmOffset(C1->getZExtValue())) {
       std::tie(VAddr, SOffset) = foldFrameIndex(N0);
       ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
       return true;
@@ -1188,7 +1180,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
                                                   SDValue &SOffset,
                                                   SDValue &Offset) const {
   ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr);
-  if (!CAddr || !isLegalMUBUFImmOffset(CAddr))
+  if (!CAddr || !SIInstrInfo::isLegalMUBUFImmOffset(CAddr->getZExtValue()))
     return false;
 
   SDLoc DL(Addr);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 542bfda..5b85174 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -3712,13 +3712,43 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
     case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR: {
       unsigned VDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+      const MachineOperand *VAddr = getNamedOperand(Inst, AMDGPU::OpName::soff);
+      auto Add = MRI.getUniqueVRegDef(VAddr->getReg());
+      unsigned Offset = 0;
+
+      // See if we can extract an immediate offset by recognizing one of these:
+      //   V_ADD_I32_e32 dst, imm, src1
+      //   V_ADD_I32_e32 dst, (S_MOV_B32 imm), src1
+      // V_ADD will be removed by "Remove dead machine instructions".
+      if (Add && Add->getOpcode() == AMDGPU::V_ADD_I32_e32) {
+        const MachineOperand *Src =
+            getNamedOperand(*Add, AMDGPU::OpName::src0);
+
+        if (Src && Src->isReg()) {
+          auto Mov = MRI.getUniqueVRegDef(Src->getReg());
+          if (Mov && Mov->getOpcode() == AMDGPU::S_MOV_B32)
+            Src = &Mov->getOperand(1);
+        }
+
+        if (Src) {
+          if (Src->isImm())
+            Offset = Src->getImm();
+          else if (Src->isCImm())
+            Offset = Src->getCImm()->getZExtValue();
+        }
+
+        if (Offset && isLegalMUBUFImmOffset(Offset))
+          VAddr = getNamedOperand(*Add, AMDGPU::OpName::src1);
+        else
+          Offset = 0;
+      }
 
       BuildMI(*MBB, Inst, Inst.getDebugLoc(),
               get(AMDGPU::BUFFER_LOAD_DWORD_OFFEN), VDst)
-          .add(*getNamedOperand(Inst, AMDGPU::OpName::soff)) // vaddr
+          .add(*VAddr) // vaddr
           .add(*getNamedOperand(Inst, AMDGPU::OpName::sbase)) // srsrc
           .addImm(0) // soffset
-          .addImm(0) // offset
+          .addImm(Offset) // offset
           .addImm(getNamedOperand(Inst, AMDGPU::OpName::glc)->getImm())
           .addImm(0) // slc
           .addImm(0) // tfe
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index f8de0efc5..5e84e0c 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -860,6 +860,10 @@ public:
 
   static bool isKillTerminator(unsigned Opcode);
   const MCInstrDesc &getKillTerminatorFromPseudo(unsigned Opcode) const;
+
+  static bool isLegalMUBUFImmOffset(unsigned Imm) {
+    return isUInt<12>(Imm);
+  }
 };
 
 namespace AMDGPU {
diff --git a/llvm/test/CodeGen/AMDGPU/smrd.ll b/llvm/test/CodeGen/AMDGPU/smrd.ll
index cc2f6ce..ab7e742 100644
--- a/llvm/test/CodeGen/AMDGPU/smrd.ll
+++ b/llvm/test/CodeGen/AMDGPU/smrd.ll
@@ -191,6 +191,27 @@ main_body:
   ret float %r
 }
 
+; GCN-LABEL: {{^}}smrd_vgpr_offset_imm:
+; GCN-NEXT: BB#
+; GCN-NEXT: buffer_load_dword v{{[0-9]}}, v0, s[0:3], 0 offen offset:4095 ;
+define amdgpu_ps float @smrd_vgpr_offset_imm(<4 x i32> inreg %desc, i32 %offset) #0 {
+main_body:
+  %off = add i32 %offset, 4095
+  %r = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 %off)
+  ret float %r
+}
+
+; GCN-LABEL: {{^}}smrd_vgpr_offset_imm_too_large:
+; GCN-NEXT: BB#
+; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x1000, v0
+; GCN-NEXT: buffer_load_dword v{{[0-9]}}, v0, s[0:3], 0 offen ;
+define amdgpu_ps float @smrd_vgpr_offset_imm_too_large(<4 x i32> inreg %desc, i32 %offset) #0 {
+main_body:
+  %off = add i32 %offset, 4096
+  %r = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 %off)
+  ret float %r
+}
+
 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
 declare float @llvm.SI.load.const.v4i32(<4 x i32>, i32) #1
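
For reference, below is a minimal standalone sketch (not part of the patch) of the 12-bit MUBUF offset rule that the new SIInstrInfo::isLegalMUBUFImmOffset helper encodes and that the two added tests exercise at the 4095/4096 boundary. The function name and file name are local to this sketch, not LLVM APIs.

// sketch.cpp: the MUBUF "offset" field holds an unsigned 12-bit immediate
// (0..4095); the predicate below is equivalent to llvm::isUInt<12>(Imm).
#include <cstdio>

static bool isLegalMUBUFImmOffsetSketch(unsigned Imm) {
  return Imm < (1u << 12);
}

int main() {
  // 4095 fits the field, so moveToVALU folds it into "offen offset:4095"
  // and the feeding v_add_i32_e32 becomes dead.
  std::printf("4095: %s\n",
              isLegalMUBUFImmOffsetSketch(4095) ? "foldable" : "kept in v_add");
  // 4096 does not fit, so the v_add_i32_e32 with 0x1000 must remain.
  std::printf("4096: %s\n",
              isLegalMUBUFImmOffsetSketch(4096) ? "foldable" : "kept in v_add");
  return 0;
}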