From 0045786f146e78afee49eee053dc29ebc842fee1 Mon Sep 17 00:00:00 2001
From: Piotr Sobczak
Date: Wed, 4 Mar 2020 15:13:08 +0100
Subject: [PATCH] [AMDGPU] Select s_cselect

Summary:
Add patterns to select s_cselect in the isel.

Handle more cases of implicit SCC accesses in si-fix-sgpr-copies
to allow new patterns to work.

Subscribers: arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, asbirlea, kerbowa, llvm-commits

Tags: #llvm

Re-commit D81925 with a bugfix D82370.

Differential Revision: https://reviews.llvm.org/D81925
Differential Revision: https://reviews.llvm.org/D82370
---
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp             |  128 +-
 llvm/lib/Target/AMDGPU/SIInstrInfo.h               |    3 +
 llvm/lib/Target/AMDGPU/SOPInstructions.td          |   12 +-
 .../CodeGen/AMDGPU/32-bit-local-address-space.ll   |    4 +-
 llvm/test/CodeGen/AMDGPU/addrspacecast.ll          |   16 +-
 .../CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll   | 1262 ++++++++++----------
 llvm/test/CodeGen/AMDGPU/ctlz.ll                   |   66 +-
 llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll        |    7 +-
 llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll        |    4 +-
 llvm/test/CodeGen/AMDGPU/dagcombine-select.ll      |   56 +-
 llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll  |   10 +-
 llvm/test/CodeGen/AMDGPU/extractelt-to-trunc.ll    |   15 +-
 llvm/test/CodeGen/AMDGPU/fceil64.ll                |    2 +-
 llvm/test/CodeGen/AMDGPU/fix-sgpr-copies.mir       |   18 +-
 llvm/test/CodeGen/AMDGPU/fshl.ll                   |  109 +-
 llvm/test/CodeGen/AMDGPU/fshr.ll                   |   66 +-
 llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll   |   22 +-
 llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll      |  394 +++---
 llvm/test/CodeGen/AMDGPU/mad_uint24.ll             |    2 +-
 llvm/test/CodeGen/AMDGPU/sad.ll                    |   14 +-
 llvm/test/CodeGen/AMDGPU/sdiv.ll                   |   12 +-
 llvm/test/CodeGen/AMDGPU/sdiv64.ll                 |  966 ++++++++-------
 llvm/test/CodeGen/AMDGPU/select-i1.ll              |    2 +-
 llvm/test/CodeGen/AMDGPU/select-opt.ll             |   24 +-
 llvm/test/CodeGen/AMDGPU/select-vectors.ll         |   12 +-
 llvm/test/CodeGen/AMDGPU/select64.ll               |   12 +-
 llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll         |    5 +-
 llvm/test/CodeGen/AMDGPU/srem64.ll                 | 1250 ++++++++++---------
 llvm/test/CodeGen/AMDGPU/trunc.ll                  |    3 +-
 llvm/test/CodeGen/AMDGPU/udiv64.ll                 |  807 ++++++-------
 llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll         |    5 +-
 llvm/test/CodeGen/AMDGPU/urem64.ll                 |  663 +++++-----
 llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll  |   21 +-
 llvm/test/CodeGen/AMDGPU/vselect.ll                |   16 +-
 llvm/test/CodeGen/AMDGPU/wave32.ll                 |    6 +-
 35 files changed, 3063 insertions(+), 2951 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 64dfd4a..ffb4073 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -602,6 +602,13 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
   }
 
   if (RC == &AMDGPU::SReg_64RegClass) {
+    if (SrcReg == AMDGPU::SCC) {
+      BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B64), DestReg)
+          .addImm(1)
+          .addImm(0);
+      return;
+    }
+
     if (DestReg == AMDGPU::VCC) {
       if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
         BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
@@ -628,10 +635,18 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
   }
 
   if (DestReg == AMDGPU::SCC) {
+    // Copying 64-bit or 32-bit sources to SCC barely makes sense,
+    // but SelectionDAG emits such copies for i1 sources.
+    // TODO: Use S_BITCMP0_B32 instead and only consider the 0th bit.
+    if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
+      SrcReg = RI.getSubReg(SrcReg, AMDGPU::sub0);
+    }
     assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
+
     BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
-      .addReg(SrcReg, getKillRegState(KillSrc))
-      .addImm(0);
+        .addReg(SrcReg, getKillRegState(KillSrc))
+        .addImm(0);
+
     return;
   }
 
@@ -5368,6 +5383,12 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst,
         Inst.eraseFromParent();
       }
       continue;
+
+    case AMDGPU::S_CSELECT_B32:
+    case AMDGPU::S_CSELECT_B64:
+      lowerSelect(Worklist, Inst, MDT);
+      Inst.eraseFromParent();
+      continue;
     }
 
     if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
@@ -5508,6 +5529,78 @@ bool SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst,
   return false;
 }
 
+void SIInstrInfo::lowerSelect(SetVectorType &Worklist, MachineInstr &Inst,
+                              MachineDominatorTree *MDT) const {
+
+  MachineBasicBlock &MBB = *Inst.getParent();
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+  MachineBasicBlock::iterator MII = Inst;
+  DebugLoc DL = Inst.getDebugLoc();
+
+  MachineOperand &Dest = Inst.getOperand(0);
+  MachineOperand &Src0 = Inst.getOperand(1);
+  MachineOperand &Src1 = Inst.getOperand(2);
+  MachineOperand &Cond = Inst.getOperand(3);
+
+  Register SCCSource = Cond.getReg();
+  // Find SCC def, and if that is a copy (SCC = COPY reg) then use reg instead.
+  if (!Cond.isUndef()) {
+    for (MachineInstr &CandI :
+         make_range(std::next(MachineBasicBlock::reverse_iterator(Inst)),
+                    Inst.getParent()->rend())) {
+      if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, false, false, &RI) !=
+          -1) {
+        if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) {
+          SCCSource = CandI.getOperand(1).getReg();
+        }
+        break;
+      }
+    }
+  }
+
+  // If this is a trivial select where the condition is effectively not SCC
+  // (SCCSource is a source of copy to SCC), then the select is semantically
+  // equivalent to copying SCCSource. Hence, there is no need to create
+  // V_CNDMASK, we can just use that and bail out.
+  if ((SCCSource != AMDGPU::SCC) && Src0.isImm() && (Src0.getImm() == -1) &&
+      Src1.isImm() && (Src1.getImm() == 0)) {
+    MRI.replaceRegWith(Dest.getReg(), SCCSource);
+    return;
+  }
+
+  const TargetRegisterClass *TC = ST.getWavefrontSize() == 64
+                                      ? &AMDGPU::SReg_64_XEXECRegClass
+                                      : &AMDGPU::SReg_32_XM0_XEXECRegClass;
+  Register CopySCC = MRI.createVirtualRegister(TC);
+
+  if (SCCSource == AMDGPU::SCC) {
+    // Insert a trivial select instead of creating a copy, because a copy from
+    // SCC would semantically mean just copying a single bit, but we may need
+    // the result to be a vector condition mask that needs preserving.
+    unsigned Opcode = (ST.getWavefrontSize() == 64) ? AMDGPU::S_CSELECT_B64
+                                                    : AMDGPU::S_CSELECT_B32;
+    auto NewSelect =
+        BuildMI(MBB, MII, DL, get(Opcode), CopySCC).addImm(-1).addImm(0);
+    NewSelect->getOperand(3).setIsUndef(Cond.isUndef());
+  } else {
+    BuildMI(MBB, MII, DL, get(AMDGPU::COPY), CopySCC).addReg(SCCSource);
+  }
+
+  Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+  auto UpdatedInst =
+      BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), ResultReg)
+          .addImm(0)
+          .add(Src1) // False
+          .addImm(0)
+          .add(Src0) // True
+          .addReg(CopySCC);
+
+  MRI.replaceRegWith(Dest.getReg(), ResultReg);
+  legalizeOperands(*UpdatedInst, MDT);
+  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
+}
+
 void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist,
                                  MachineInstr &Inst) const {
   MachineBasicBlock &MBB = *Inst.getParent();
@@ -6089,6 +6182,8 @@ void SIInstrInfo::movePackToVALU(SetVectorType &Worklist,
 void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op,
                                                MachineInstr &SCCDefInst,
                                                SetVectorType &Worklist) const {
+  bool SCCUsedImplicitly = false;
+
   // Ensure that def inst defines SCC, which is still live.
   assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() &&
          !Op.isDead() && Op.getParent() == &SCCDefInst);
@@ -6103,19 +6198,32 @@ void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op,
       if (MI.isCopy()) {
         MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
         unsigned DestReg = MI.getOperand(0).getReg();
-        SmallVector<MachineInstr *, 4> Users;
+
         for (auto &User : MRI.use_nodbg_instructions(DestReg)) {
           if ((User.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO) ||
               (User.getOpcode() == AMDGPU::S_SUB_CO_PSEUDO)) {
-            Users.push_back(&User);
+            User.getOperand(4).setReg(RI.getVCC());
             Worklist.insert(&User);
+          } else if (User.getOpcode() == AMDGPU::V_CNDMASK_B32_e64) {
+            User.getOperand(5).setReg(RI.getVCC());
+            // No need to add to Worklist.
           }
         }
-        for (auto &U : Users)
-          U->getOperand(4).setReg(RI.getVCC());
         CopyToDelete.push_back(&MI);
-      } else
+      } else {
+        if (MI.getOpcode() == AMDGPU::S_CSELECT_B32 ||
+            MI.getOpcode() == AMDGPU::S_CSELECT_B64) {
+          // This is an implicit use of SCC, and the SCC users are expected
+          // to handle it.
+          // We cannot preserve the edge to the user so add the explicit
+          // copy: SCC = COPY VCC.
+          // The copy will be cleaned up during the processing of the user
+          // in lowerSelect.
+          SCCUsedImplicitly = true;
+        }
+
         Worklist.insert(&MI);
+      }
     }
     // Exit if we find another SCC def.
     if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, false, false, &RI) != -1)
@@ -6123,6 +6231,12 @@ void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op,
       break;
   }
   for (auto &Copy : CopyToDelete)
     Copy->eraseFromParent();
+
+  if (SCCUsedImplicitly) {
+    BuildMI(*SCCDefInst.getParent(), std::next(SCCDefInst.getIterator()),
+            SCCDefInst.getDebugLoc(), get(AMDGPU::COPY), AMDGPU::SCC)
+        .addReg(RI.getVCC());
+  }
 }
 const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 0f0e842..53e2ffb 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -84,6 +84,9 @@ private:
 
   bool moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst,
                         MachineDominatorTree *MDT = nullptr) const;
 
+  void lowerSelect(SetVectorType &Worklist, MachineInstr &Inst,
+                   MachineDominatorTree *MDT = nullptr) const;
+
   void lowerScalarAbs(SetVectorType &Worklist, MachineInstr &Inst) const;
 
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index dd6363b..9d7b25d 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -465,9 +465,19 @@ def S_MAX_U32 : SOP2_32 <"s_max_u32",
   } // End isCommutable = 1
 } // End Defs = [SCC]
 
+class SelectPat<SDPatternOperator select> : PatFrag <
+  (ops node:$src1, node:$src2),
+  (select SCC, $src1, $src2),
+  [{ return N->getOperand(0)->hasOneUse() && !N->isDivergent(); }]
+>;
+
 let Uses = [SCC] in {
-  def S_CSELECT_B32 : SOP2_32 <"s_cselect_b32">;
+  let AddedComplexity = 20 in {
+    def S_CSELECT_B32 : SOP2_32 <"s_cselect_b32",
+      [(set i32:$sdst, (SelectPat