From b434051dc83d77c8e8e349ab1992dcb0c795a7ea Mon Sep 17 00:00:00 2001 From: skc7 Date: Tue, 28 Mar 2023 23:35:02 +0530 Subject: [PATCH] [AMDGPU] Introduce SIInstrWorklist to process instructions in moveToVALU Reviewed By: arsenm Differential Revision: https://reviews.llvm.org/D147168 --- llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp | 13 +- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 794 +++++++++---------- llvm/lib/Target/AMDGPU/SIInstrInfo.h | 91 ++- llvm/test/CodeGen/AMDGPU/add3.ll | 2 +- .../CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll | 110 +-- llvm/test/CodeGen/AMDGPU/carryout-selection.ll | 12 +- .../CodeGen/AMDGPU/control-flow-fastregalloc.ll | 16 +- llvm/test/CodeGen/AMDGPU/ds_read2.ll | 2 +- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll | 2 +- .../test/CodeGen/AMDGPU/mubuf-legalize-operands.ll | 18 - .../CodeGen/AMDGPU/mubuf-legalize-operands.mir | 75 +- llvm/test/CodeGen/AMDGPU/mul.ll | 4 +- llvm/test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir | 6 +- llvm/test/CodeGen/AMDGPU/sdiv.ll | 140 ++-- llvm/test/CodeGen/AMDGPU/sdiv64.ll | 16 +- llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll | 4 +- .../CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll | 2 - llvm/test/CodeGen/AMDGPU/shl.ll | 8 +- .../CodeGen/AMDGPU/splitkit-getsubrangeformask.ll | 4 +- llvm/test/CodeGen/AMDGPU/sra.ll | 6 +- llvm/test/CodeGen/AMDGPU/srem64.ll | 18 +- llvm/test/CodeGen/AMDGPU/srl.ll | 8 +- llvm/test/CodeGen/AMDGPU/sub.ll | 841 ++++++++++++++++++--- llvm/test/CodeGen/AMDGPU/udiv.ll | 88 +-- llvm/test/CodeGen/AMDGPU/udiv64.ll | 28 +- llvm/test/CodeGen/AMDGPU/udivrem.ll | 16 +- llvm/test/CodeGen/AMDGPU/urem64.ll | 16 +- llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll | 34 +- llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll | 240 +++--- 29 files changed, 1595 insertions(+), 1019 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index 74ede65..db32346 100644 --- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -869,7 +869,9 @@ bool SIFixSGPRCopies::lowerSpecialCase(MachineInstr &MI, return true; } if (!SrcReg.isVirtual() || TRI->isAGPR(*MRI, SrcReg)) { - TII->moveToVALU(MI, MDT); + SIInstrWorklist worklist; + worklist.insert(&MI); + TII->moveToVALU(worklist, MDT); return true; } @@ -991,6 +993,10 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) { LoweringWorklist.push_back(C.second.ID); } + // Store all the V2S copy instructions that need to be moved to VALU + // in the Copies worklist. 
+ SIInstrWorklist Copies; + while (!LoweringWorklist.empty()) { unsigned CurID = LoweringWorklist.pop_back_val(); auto CurInfoIt = V2SCopies.find(CurID); @@ -1013,10 +1019,13 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) { LLVM_DEBUG(dbgs() << "V2S copy " << *C.Copy << " is being turned to VALU\n"); V2SCopies.erase(C.ID); - TII->moveToVALU(*C.Copy, MDT); + Copies.insert(C.Copy); } } + TII->moveToVALU(Copies, MDT); + Copies.clear(); + // Now do actual lowering for (auto C : V2SCopies) { MachineInstr *MI = C.second.Copy; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 018f671..7ffcd1b 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -6161,424 +6161,427 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI, return CreatedBB; } -MachineBasicBlock *SIInstrInfo::moveToVALU(MachineInstr &TopInst, - MachineDominatorTree *MDT) const { - SetVectorType Worklist; - Worklist.insert(&TopInst); - MachineBasicBlock *CreatedBB = nullptr; - MachineBasicBlock *CreatedBBTmp = nullptr; - - while (!Worklist.empty()) { - MachineInstr &Inst = *Worklist.pop_back_val(); - MachineBasicBlock *MBB = Inst.getParent(); - MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); - - unsigned Opcode = Inst.getOpcode(); - unsigned NewOpcode = getVALUOp(Inst); - - // Handle some special cases - switch (Opcode) { - default: - break; - case AMDGPU::S_ADD_U64_PSEUDO: - case AMDGPU::S_SUB_U64_PSEUDO: - splitScalar64BitAddSub(Worklist, Inst, MDT); - Inst.eraseFromParent(); - continue; - case AMDGPU::S_ADD_I32: - case AMDGPU::S_SUB_I32: { - // FIXME: The u32 versions currently selected use the carry. - bool Changed; - std::tie(Changed, CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT); - if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp) - CreatedBB = CreatedBBTmp; - if (Changed) - continue; - - // Default handling - break; - } - case AMDGPU::S_AND_B64: - splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT); - Inst.eraseFromParent(); - continue; - - case AMDGPU::S_OR_B64: - splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT); - Inst.eraseFromParent(); - continue; - - case AMDGPU::S_XOR_B64: - splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT); - Inst.eraseFromParent(); - continue; - - case AMDGPU::S_NAND_B64: - splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT); - Inst.eraseFromParent(); - continue; - - case AMDGPU::S_NOR_B64: - splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT); - Inst.eraseFromParent(); - continue; +void SIInstrWorklist::insert(MachineInstr *MI) { + InstrList.insert(MI); + // Add MBUF instructiosn to deferred list. 
+ int RsrcIdx = + AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc); + if (RsrcIdx != -1) { + DeferredList.insert(MI); + } +} - case AMDGPU::S_XNOR_B64: - if (ST.hasDLInsts()) - splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT); - else - splitScalar64BitXnor(Worklist, Inst, MDT); - Inst.eraseFromParent(); - continue; +bool SIInstrWorklist::isDeferred(MachineInstr *MI) { + return DeferredList.contains(MI); +} - case AMDGPU::S_ANDN2_B64: - splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT); - Inst.eraseFromParent(); - continue; +void SIInstrInfo::moveToVALU(SIInstrWorklist &Worklist, + MachineDominatorTree *MDT) const { - case AMDGPU::S_ORN2_B64: - splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT); - Inst.eraseFromParent(); + while (!Worklist.empty()) { + MachineInstr &Inst = *Worklist.top(); + Worklist.erase_top(); + // Skip MachineInstr in the deferred list. + if (Worklist.isDeferred(&Inst)) continue; + moveToVALUImpl(Worklist, MDT, Inst); + } - case AMDGPU::S_BREV_B64: - splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_BREV_B32, true); - Inst.eraseFromParent(); - continue; + // Deferred list of instructions will be processed once + // all the MachineInstr in the worklist are done. + for (MachineInstr *Inst : Worklist.getDeferredList()) { + moveToVALUImpl(Worklist, MDT, *Inst); + assert(Worklist.empty() && + "Deferred MachineInstr are not supposed to re-populate worklist"); + } +} - case AMDGPU::S_NOT_B64: - splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32); - Inst.eraseFromParent(); - continue; +void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, + MachineDominatorTree *MDT, + MachineInstr &Inst) const { - case AMDGPU::S_BCNT1_I32_B64: - splitScalar64BitBCNT(Worklist, Inst); - Inst.eraseFromParent(); - continue; + MachineBasicBlock *MBB = Inst.getParent(); + if (!MBB) + return; + MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + unsigned Opcode = Inst.getOpcode(); + unsigned NewOpcode = getVALUOp(Inst); + // Handle some special cases + switch (Opcode) { + default: + break; + case AMDGPU::S_ADD_U64_PSEUDO: + case AMDGPU::S_SUB_U64_PSEUDO: + splitScalar64BitAddSub(Worklist, Inst, MDT); + Inst.eraseFromParent(); + return; + case AMDGPU::S_ADD_I32: + case AMDGPU::S_SUB_I32: { + // FIXME: The u32 versions currently selected use the carry. 
+ bool Changed; + MachineBasicBlock *CreatedBBTmp = nullptr; + std::tie(Changed, CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT); + if (Changed) + return; - case AMDGPU::S_BFE_I64: - splitScalar64BitBFE(Worklist, Inst); - Inst.eraseFromParent(); - continue; + // Default handling + break; + } + case AMDGPU::S_AND_B64: + splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT); + Inst.eraseFromParent(); + return; - case AMDGPU::S_LSHL_B32: - if (ST.hasOnlyRevVALUShifts()) { - NewOpcode = AMDGPU::V_LSHLREV_B32_e64; - swapOperands(Inst); - } - break; - case AMDGPU::S_ASHR_I32: - if (ST.hasOnlyRevVALUShifts()) { - NewOpcode = AMDGPU::V_ASHRREV_I32_e64; - swapOperands(Inst); - } - break; - case AMDGPU::S_LSHR_B32: - if (ST.hasOnlyRevVALUShifts()) { - NewOpcode = AMDGPU::V_LSHRREV_B32_e64; - swapOperands(Inst); - } - break; - case AMDGPU::S_LSHL_B64: - if (ST.hasOnlyRevVALUShifts()) { - NewOpcode = AMDGPU::V_LSHLREV_B64_e64; - swapOperands(Inst); - } - break; - case AMDGPU::S_ASHR_I64: - if (ST.hasOnlyRevVALUShifts()) { - NewOpcode = AMDGPU::V_ASHRREV_I64_e64; - swapOperands(Inst); - } - break; - case AMDGPU::S_LSHR_B64: - if (ST.hasOnlyRevVALUShifts()) { - NewOpcode = AMDGPU::V_LSHRREV_B64_e64; - swapOperands(Inst); - } - break; + case AMDGPU::S_OR_B64: + splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT); + Inst.eraseFromParent(); + return; - case AMDGPU::S_ABS_I32: - lowerScalarAbs(Worklist, Inst); - Inst.eraseFromParent(); - continue; + case AMDGPU::S_XOR_B64: + splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT); + Inst.eraseFromParent(); + return; - case AMDGPU::S_CBRANCH_SCC0: - case AMDGPU::S_CBRANCH_SCC1: { - // Clear unused bits of vcc - Register CondReg = Inst.getOperand(1).getReg(); - bool IsSCC = CondReg == AMDGPU::SCC; - Register VCC = RI.getVCC(); - Register EXEC = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; - unsigned Opc = ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64; - BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(Opc), VCC) - .addReg(EXEC) - .addReg(IsSCC ? 
VCC : CondReg); - Inst.removeOperand(1); - } - break; + case AMDGPU::S_NAND_B64: + splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT); + Inst.eraseFromParent(); + return; - case AMDGPU::S_BFE_U64: - case AMDGPU::S_BFM_B64: - llvm_unreachable("Moving this op to VALU not implemented"); + case AMDGPU::S_NOR_B64: + splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT); + Inst.eraseFromParent(); + return; - case AMDGPU::S_PACK_LL_B32_B16: - case AMDGPU::S_PACK_LH_B32_B16: - case AMDGPU::S_PACK_HL_B32_B16: - case AMDGPU::S_PACK_HH_B32_B16: - movePackToVALU(Worklist, MRI, Inst); - Inst.eraseFromParent(); - continue; + case AMDGPU::S_XNOR_B64: + if (ST.hasDLInsts()) + splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT); + else + splitScalar64BitXnor(Worklist, Inst, MDT); + Inst.eraseFromParent(); + return; - case AMDGPU::S_XNOR_B32: - lowerScalarXnor(Worklist, Inst); - Inst.eraseFromParent(); - continue; + case AMDGPU::S_ANDN2_B64: + splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT); + Inst.eraseFromParent(); + return; - case AMDGPU::S_NAND_B32: - splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32); - Inst.eraseFromParent(); - continue; + case AMDGPU::S_ORN2_B64: + splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT); + Inst.eraseFromParent(); + return; - case AMDGPU::S_NOR_B32: - splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32); - Inst.eraseFromParent(); - continue; + case AMDGPU::S_BREV_B64: + splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_BREV_B32, true); + Inst.eraseFromParent(); + return; - case AMDGPU::S_ANDN2_B32: - splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32); - Inst.eraseFromParent(); - continue; + case AMDGPU::S_NOT_B64: + splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32); + Inst.eraseFromParent(); + return; - case AMDGPU::S_ORN2_B32: - splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32); - Inst.eraseFromParent(); - continue; + case AMDGPU::S_BCNT1_I32_B64: + splitScalar64BitBCNT(Worklist, Inst); + Inst.eraseFromParent(); + return; - // TODO: remove as soon as everything is ready - // to replace VGPR to SGPR copy with V_READFIRSTLANEs. - // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO - // can only be selected from the uniform SDNode. - case AMDGPU::S_ADD_CO_PSEUDO: - case AMDGPU::S_SUB_CO_PSEUDO: { - unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO) - ? 
AMDGPU::V_ADDC_U32_e64 - : AMDGPU::V_SUBB_U32_e64; - const auto *CarryRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID); - - Register CarryInReg = Inst.getOperand(4).getReg(); - if (!MRI.constrainRegClass(CarryInReg, CarryRC)) { - Register NewCarryReg = MRI.createVirtualRegister(CarryRC); - BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg) - .addReg(CarryInReg); - } + case AMDGPU::S_BFE_I64: + splitScalar64BitBFE(Worklist, Inst); + Inst.eraseFromParent(); + return; - Register CarryOutReg = Inst.getOperand(1).getReg(); - - Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass( - MRI.getRegClass(Inst.getOperand(0).getReg()))); - MachineInstr *CarryOp = - BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg) - .addReg(CarryOutReg, RegState::Define) - .add(Inst.getOperand(2)) - .add(Inst.getOperand(3)) - .addReg(CarryInReg) - .addImm(0); - CreatedBBTmp = legalizeOperands(*CarryOp); - if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp) - CreatedBB = CreatedBBTmp; - MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg); - addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist); - Inst.eraseFromParent(); + case AMDGPU::S_LSHL_B32: + if (ST.hasOnlyRevVALUShifts()) { + NewOpcode = AMDGPU::V_LSHLREV_B32_e64; + swapOperands(Inst); } - continue; - case AMDGPU::S_UADDO_PSEUDO: - case AMDGPU::S_USUBO_PSEUDO: { - const DebugLoc &DL = Inst.getDebugLoc(); - MachineOperand &Dest0 = Inst.getOperand(0); - MachineOperand &Dest1 = Inst.getOperand(1); - MachineOperand &Src0 = Inst.getOperand(2); - MachineOperand &Src1 = Inst.getOperand(3); - - unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO) - ? AMDGPU::V_ADD_CO_U32_e64 - : AMDGPU::V_SUB_CO_U32_e64; - const TargetRegisterClass *NewRC = - RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg())); - Register DestReg = MRI.createVirtualRegister(NewRC); - MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg) - .addReg(Dest1.getReg(), RegState::Define) - .add(Src0) - .add(Src1) - .addImm(0); // clamp bit - - CreatedBBTmp = legalizeOperands(*NewInstr, MDT); - if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp) - CreatedBB = CreatedBBTmp; - - MRI.replaceRegWith(Dest0.getReg(), DestReg); - addUsersToMoveToVALUWorklist(NewInstr->getOperand(0).getReg(), MRI, - Worklist); - Inst.eraseFromParent(); + break; + case AMDGPU::S_ASHR_I32: + if (ST.hasOnlyRevVALUShifts()) { + NewOpcode = AMDGPU::V_ASHRREV_I32_e64; + swapOperands(Inst); } - continue; - - case AMDGPU::S_CSELECT_B32: - case AMDGPU::S_CSELECT_B64: - lowerSelect(Worklist, Inst, MDT); - Inst.eraseFromParent(); - continue; - case AMDGPU::S_CMP_EQ_I32: - case AMDGPU::S_CMP_LG_I32: - case AMDGPU::S_CMP_GT_I32: - case AMDGPU::S_CMP_GE_I32: - case AMDGPU::S_CMP_LT_I32: - case AMDGPU::S_CMP_LE_I32: - case AMDGPU::S_CMP_EQ_U32: - case AMDGPU::S_CMP_LG_U32: - case AMDGPU::S_CMP_GT_U32: - case AMDGPU::S_CMP_GE_U32: - case AMDGPU::S_CMP_LT_U32: - case AMDGPU::S_CMP_LE_U32: - case AMDGPU::S_CMP_EQ_U64: - case AMDGPU::S_CMP_LG_U64: { - const MCInstrDesc &NewDesc = get(NewOpcode); - Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass()); - MachineInstr *NewInstr = - BuildMI(*MBB, Inst, Inst.getDebugLoc(), NewDesc, CondReg) - .add(Inst.getOperand(0)) - .add(Inst.getOperand(1)); - legalizeOperands(*NewInstr, MDT); - int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC); - MachineOperand SCCOp = Inst.getOperand(SCCIdx); - addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg); - Inst.eraseFromParent(); - } - continue; + break; 
+ case AMDGPU::S_LSHR_B32: + if (ST.hasOnlyRevVALUShifts()) { + NewOpcode = AMDGPU::V_LSHRREV_B32_e64; + swapOperands(Inst); } - - if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) { - // We cannot move this instruction to the VALU, so we should try to - // legalize its operands instead. - CreatedBBTmp = legalizeOperands(Inst, MDT); - if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp) - CreatedBB = CreatedBBTmp; - continue; + break; + case AMDGPU::S_LSHL_B64: + if (ST.hasOnlyRevVALUShifts()) { + NewOpcode = AMDGPU::V_LSHLREV_B64_e64; + swapOperands(Inst); } - - // Handle converting generic instructions like COPY-to-SGPR into - // COPY-to-VGPR. - if (NewOpcode == Opcode) { - Register DstReg = Inst.getOperand(0).getReg(); - const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst); - - if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() && - NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) { - // Instead of creating a copy where src and dst are the same register - // class, we just replace all uses of dst with src. These kinds of - // copies interfere with the heuristics MachineSink uses to decide - // whether or not to split a critical edge. Since the pass assumes - // that copies will end up as machine instructions and not be - // eliminated. - addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist); - MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg()); - MRI.clearKillFlags(Inst.getOperand(1).getReg()); - Inst.getOperand(0).setReg(DstReg); - - // Make sure we don't leave around a dead VGPR->SGPR copy. Normally - // these are deleted later, but at -O0 it would leave a suspicious - // looking illegal copy of an undef register. - for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I) - Inst.removeOperand(I); - Inst.setDesc(get(AMDGPU::IMPLICIT_DEF)); - continue; - } - - Register NewDstReg = MRI.createVirtualRegister(NewDstRC); - MRI.replaceRegWith(DstReg, NewDstReg); - legalizeOperands(Inst, MDT); - addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist); - continue; + break; + case AMDGPU::S_ASHR_I64: + if (ST.hasOnlyRevVALUShifts()) { + NewOpcode = AMDGPU::V_ASHRREV_I64_e64; + swapOperands(Inst); } - - // Use the new VALU Opcode. - auto NewInstr = BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode)) - .setMIFlags(Inst.getFlags()); - for (const MachineOperand &Op : Inst.explicit_operands()) - NewInstr->addOperand(Op); - - // Remove any references to SCC. Vector instructions can't read from it, and - // We're just about to add the implicit use / defs of VCC, and we don't want - // both. - for (MachineOperand &Op : Inst.implicit_operands()) { - if (Op.getReg() == AMDGPU::SCC) { - // Only propagate through live-def of SCC. - if (Op.isDef() && !Op.isDead()) - addSCCDefUsersToVALUWorklist(Op, Inst, Worklist); - if (Op.isUse()) - addSCCDefsToVALUWorklist(NewInstr, Worklist); - } + break; + case AMDGPU::S_LSHR_B64: + if (ST.hasOnlyRevVALUShifts()) { + NewOpcode = AMDGPU::V_LSHRREV_B64_e64; + swapOperands(Inst); } + break; + case AMDGPU::S_ABS_I32: + lowerScalarAbs(Worklist, Inst); Inst.eraseFromParent(); + return; - Register NewDstReg; - if (NewInstr->getOperand(0).isReg() && NewInstr->getOperand(0).isDef()) { - Register DstReg = NewInstr->getOperand(0).getReg(); - assert(DstReg.isVirtual()); + case AMDGPU::S_CBRANCH_SCC0: + case AMDGPU::S_CBRANCH_SCC1: { + // Clear unused bits of vcc + Register CondReg = Inst.getOperand(1).getReg(); + bool IsSCC = CondReg == AMDGPU::SCC; + Register VCC = RI.getVCC(); + Register EXEC = ST.isWave32() ? 
AMDGPU::EXEC_LO : AMDGPU::EXEC; + unsigned Opc = ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64; + BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(Opc), VCC) + .addReg(EXEC) + .addReg(IsSCC ? VCC : CondReg); + Inst.removeOperand(1); + } break; + + case AMDGPU::S_BFE_U64: + case AMDGPU::S_BFM_B64: + llvm_unreachable("Moving this op to VALU not implemented"); + + case AMDGPU::S_PACK_LL_B32_B16: + case AMDGPU::S_PACK_LH_B32_B16: + case AMDGPU::S_PACK_HL_B32_B16: + case AMDGPU::S_PACK_HH_B32_B16: + movePackToVALU(Worklist, MRI, Inst); + Inst.eraseFromParent(); + return; - // Update the destination register class. - const TargetRegisterClass *NewDstRC = - getDestEquivalentVGPRClass(*NewInstr); - assert(NewDstRC); + case AMDGPU::S_XNOR_B32: + lowerScalarXnor(Worklist, Inst); + Inst.eraseFromParent(); + return; - NewDstReg = MRI.createVirtualRegister(NewDstRC); - MRI.replaceRegWith(DstReg, NewDstReg); - } + case AMDGPU::S_NAND_B32: + splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32); + Inst.eraseFromParent(); + return; - if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) { - // We are converting these to a BFE, so we need to add the missing - // operands for the size and offset. - unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16; - NewInstr.addImm(0); - NewInstr.addImm(Size); - } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) { - // The VALU version adds the second operand to the result, so insert an - // extra 0 operand. - NewInstr.addImm(0); - } + case AMDGPU::S_NOR_B32: + splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32); + Inst.eraseFromParent(); + return; - if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) { - const MachineOperand &OffsetWidthOp = NewInstr->getOperand(2); - // If we need to move this to VGPRs, we need to unpack the second operand - // back into the 2 separate ones for bit offset and width. - assert(OffsetWidthOp.isImm() && - "Scalar BFE is only implemented for constant width and offset"); - uint32_t Imm = OffsetWidthOp.getImm(); + case AMDGPU::S_ANDN2_B32: + splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32); + Inst.eraseFromParent(); + return; - uint32_t Offset = Imm & 0x3f; // Extract bits [5:0]. - uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16]. - NewInstr->removeOperand(2); - NewInstr.addImm(Offset); - NewInstr.addImm(BitWidth); - } + case AMDGPU::S_ORN2_B32: + splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32); + Inst.eraseFromParent(); + return; - fixImplicitOperands(*NewInstr); + // TODO: remove as soon as everything is ready + // to replace VGPR to SGPR copy with V_READFIRSTLANEs. + // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO + // can only be selected from the uniform SDNode. + case AMDGPU::S_ADD_CO_PSEUDO: + case AMDGPU::S_SUB_CO_PSEUDO: { + unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO) + ? 
AMDGPU::V_ADDC_U32_e64 + : AMDGPU::V_SUBB_U32_e64; + const auto *CarryRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID); + + Register CarryInReg = Inst.getOperand(4).getReg(); + if (!MRI.constrainRegClass(CarryInReg, CarryRC)) { + Register NewCarryReg = MRI.createVirtualRegister(CarryRC); + BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg) + .addReg(CarryInReg); + } + + Register CarryOutReg = Inst.getOperand(1).getReg(); + + Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass( + MRI.getRegClass(Inst.getOperand(0).getReg()))); + MachineInstr *CarryOp = + BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg) + .addReg(CarryOutReg, RegState::Define) + .add(Inst.getOperand(2)) + .add(Inst.getOperand(3)) + .addReg(CarryInReg) + .addImm(0); + legalizeOperands(*CarryOp); + MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg); + addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist); + Inst.eraseFromParent(); + } + return; + case AMDGPU::S_UADDO_PSEUDO: + case AMDGPU::S_USUBO_PSEUDO: { + const DebugLoc &DL = Inst.getDebugLoc(); + MachineOperand &Dest0 = Inst.getOperand(0); + MachineOperand &Dest1 = Inst.getOperand(1); + MachineOperand &Src0 = Inst.getOperand(2); + MachineOperand &Src1 = Inst.getOperand(3); + + unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO) + ? AMDGPU::V_ADD_CO_U32_e64 + : AMDGPU::V_SUB_CO_U32_e64; + const TargetRegisterClass *NewRC = + RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg())); + Register DestReg = MRI.createVirtualRegister(NewRC); + MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg) + .addReg(Dest1.getReg(), RegState::Define) + .add(Src0) + .add(Src1) + .addImm(0); // clamp bit + + legalizeOperands(*NewInstr, MDT); + MRI.replaceRegWith(Dest0.getReg(), DestReg); + addUsersToMoveToVALUWorklist(NewInstr->getOperand(0).getReg(), MRI, + Worklist); + Inst.eraseFromParent(); + } + return; - // Legalize the operands - CreatedBBTmp = legalizeOperands(*NewInstr, MDT); - if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp) - CreatedBB = CreatedBBTmp; + case AMDGPU::S_CSELECT_B32: + case AMDGPU::S_CSELECT_B64: + lowerSelect(Worklist, Inst, MDT); + Inst.eraseFromParent(); + return; + case AMDGPU::S_CMP_EQ_I32: + case AMDGPU::S_CMP_LG_I32: + case AMDGPU::S_CMP_GT_I32: + case AMDGPU::S_CMP_GE_I32: + case AMDGPU::S_CMP_LT_I32: + case AMDGPU::S_CMP_LE_I32: + case AMDGPU::S_CMP_EQ_U32: + case AMDGPU::S_CMP_LG_U32: + case AMDGPU::S_CMP_GT_U32: + case AMDGPU::S_CMP_GE_U32: + case AMDGPU::S_CMP_LT_U32: + case AMDGPU::S_CMP_LE_U32: + case AMDGPU::S_CMP_EQ_U64: + case AMDGPU::S_CMP_LG_U64: { + const MCInstrDesc &NewDesc = get(NewOpcode); + Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass()); + MachineInstr *NewInstr = + BuildMI(*MBB, Inst, Inst.getDebugLoc(), NewDesc, CondReg) + .add(Inst.getOperand(0)) + .add(Inst.getOperand(1)); + legalizeOperands(*NewInstr, MDT); + int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC); + MachineOperand SCCOp = Inst.getOperand(SCCIdx); + addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg); + Inst.eraseFromParent(); + } + return; + } - if (NewDstReg) - addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist); + if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) { + // We cannot move this instruction to the VALU, so we should try to + // legalize its operands instead. + legalizeOperands(Inst, MDT); + return; } - return CreatedBB; + // Handle converting generic instructions like COPY-to-SGPR into + // COPY-to-VGPR. 
+ if (NewOpcode == Opcode) { + Register DstReg = Inst.getOperand(0).getReg(); + const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst); + + if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() && + NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) { + // Instead of creating a copy where src and dst are the same register + // class, we just replace all uses of dst with src. These kinds of + // copies interfere with the heuristics MachineSink uses to decide + // whether or not to split a critical edge. Since the pass assumes + // that copies will end up as machine instructions and not be + // eliminated. + addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist); + MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg()); + MRI.clearKillFlags(Inst.getOperand(1).getReg()); + Inst.getOperand(0).setReg(DstReg); + // Make sure we don't leave around a dead VGPR->SGPR copy. Normally + // these are deleted later, but at -O0 it would leave a suspicious + // looking illegal copy of an undef register. + for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I) + Inst.removeOperand(I); + Inst.setDesc(get(AMDGPU::IMPLICIT_DEF)); + return; + } + Register NewDstReg = MRI.createVirtualRegister(NewDstRC); + MRI.replaceRegWith(DstReg, NewDstReg); + legalizeOperands(Inst, MDT); + addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist); + return; + } + + // Use the new VALU Opcode. + auto NewInstr = BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode)) + .setMIFlags(Inst.getFlags()); + for (const MachineOperand &Op : Inst.explicit_operands()) + NewInstr->addOperand(Op); + // Remove any references to SCC. Vector instructions can't read from it, and + // We're just about to add the implicit use / defs of VCC, and we don't want + // both. + for (MachineOperand &Op : Inst.implicit_operands()) { + if (Op.getReg() == AMDGPU::SCC) { + // Only propagate through live-def of SCC. + if (Op.isDef() && !Op.isDead()) + addSCCDefUsersToVALUWorklist(Op, Inst, Worklist); + if (Op.isUse()) + addSCCDefsToVALUWorklist(NewInstr, Worklist); + } + } + Inst.eraseFromParent(); + Register NewDstReg; + if (NewInstr->getOperand(0).isReg() && NewInstr->getOperand(0).isDef()) { + Register DstReg = NewInstr->getOperand(0).getReg(); + assert(DstReg.isVirtual()); + // Update the destination register class. + const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*NewInstr); + assert(NewDstRC); + NewDstReg = MRI.createVirtualRegister(NewDstRC); + MRI.replaceRegWith(DstReg, NewDstReg); + } + if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) { + // We are converting these to a BFE, so we need to add the missing + // operands for the size and offset. + unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16; + NewInstr.addImm(0); + NewInstr.addImm(Size); + } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) { + // The VALU version adds the second operand to the result, so insert an + // extra 0 operand. + NewInstr.addImm(0); + } + if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) { + const MachineOperand &OffsetWidthOp = NewInstr->getOperand(2); + // If we need to move this to VGPRs, we need to unpack the second operand + // back into the 2 separate ones for bit offset and width. + assert(OffsetWidthOp.isImm() && + "Scalar BFE is only implemented for constant width and offset"); + uint32_t Imm = OffsetWidthOp.getImm(); + uint32_t Offset = Imm & 0x3f; // Extract bits [5:0]. + uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16]. 
+ NewInstr->removeOperand(2); + NewInstr.addImm(Offset); + NewInstr.addImm(BitWidth); + } + fixImplicitOperands(*NewInstr); + // Legalize the operands + legalizeOperands(*NewInstr, MDT); + if (NewDstReg) + addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist); + return; } // Add/sub require special handling to deal with carry outs. std::pair -SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst, +SIInstrInfo::moveScalarAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst, MachineDominatorTree *MDT) const { if (ST.hasAddNoCarry()) { // Assume there is no user of scc since we don't select this in that case. @@ -6613,7 +6616,7 @@ SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst, return std::pair(false, nullptr); } -void SIInstrInfo::lowerSelect(SetVectorType &Worklist, MachineInstr &Inst, +void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst, MachineDominatorTree *MDT) const { MachineBasicBlock &MBB = *Inst.getParent(); @@ -6689,7 +6692,7 @@ void SIInstrInfo::lowerSelect(SetVectorType &Worklist, MachineInstr &Inst, addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); } -void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist, +void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist, MachineInstr &Inst) const { MachineBasicBlock &MBB = *Inst.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); @@ -6716,7 +6719,7 @@ void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist, addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); } -void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist, +void SIInstrInfo::lowerScalarXnor(SIInstrWorklist &Worklist, MachineInstr &Inst) const { MachineBasicBlock &MBB = *Inst.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); @@ -6781,7 +6784,7 @@ void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist, } } -void SIInstrInfo::splitScalarNotBinop(SetVectorType &Worklist, +void SIInstrInfo::splitScalarNotBinop(SIInstrWorklist &Worklist, MachineInstr &Inst, unsigned Opcode) const { MachineBasicBlock &MBB = *Inst.getParent(); @@ -6810,7 +6813,7 @@ void SIInstrInfo::splitScalarNotBinop(SetVectorType &Worklist, addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist); } -void SIInstrInfo::splitScalarBinOpN2(SetVectorType& Worklist, +void SIInstrInfo::splitScalarBinOpN2(SIInstrWorklist &Worklist, MachineInstr &Inst, unsigned Opcode) const { MachineBasicBlock &MBB = *Inst.getParent(); @@ -6839,9 +6842,9 @@ void SIInstrInfo::splitScalarBinOpN2(SetVectorType& Worklist, addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist); } -void SIInstrInfo::splitScalar64BitUnaryOp( - SetVectorType &Worklist, MachineInstr &Inst, - unsigned Opcode, bool Swap) const { +void SIInstrInfo::splitScalar64BitUnaryOp(SIInstrWorklist &Worklist, + MachineInstr &Inst, unsigned Opcode, + bool Swap) const { MachineBasicBlock &MBB = *Inst.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); @@ -6898,7 +6901,7 @@ void SIInstrInfo::splitScalar64BitUnaryOp( addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); } -void SIInstrInfo::splitScalar64BitAddSub(SetVectorType &Worklist, +void SIInstrInfo::splitScalar64BitAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst, MachineDominatorTree *MDT) const { bool IsAdd = (Inst.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO); @@ -6972,7 +6975,7 @@ void SIInstrInfo::splitScalar64BitAddSub(SetVectorType &Worklist, addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); } -void SIInstrInfo::splitScalar64BitBinaryOp(SetVectorType &Worklist, 
+void SIInstrInfo::splitScalar64BitBinaryOp(SIInstrWorklist &Worklist, MachineInstr &Inst, unsigned Opcode, MachineDominatorTree *MDT) const { MachineBasicBlock &MBB = *Inst.getParent(); @@ -7039,7 +7042,7 @@ void SIInstrInfo::splitScalar64BitBinaryOp(SetVectorType &Worklist, addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); } -void SIInstrInfo::splitScalar64BitXnor(SetVectorType &Worklist, +void SIInstrInfo::splitScalar64BitXnor(SIInstrWorklist &Worklist, MachineInstr &Inst, MachineDominatorTree *MDT) const { MachineBasicBlock &MBB = *Inst.getParent(); @@ -7081,8 +7084,8 @@ void SIInstrInfo::splitScalar64BitXnor(SetVectorType &Worklist, Worklist.insert(&Xor); } -void SIInstrInfo::splitScalar64BitBCNT( - SetVectorType &Worklist, MachineInstr &Inst) const { +void SIInstrInfo::splitScalar64BitBCNT(SIInstrWorklist &Worklist, + MachineInstr &Inst) const { MachineBasicBlock &MBB = *Inst.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); @@ -7119,7 +7122,7 @@ void SIInstrInfo::splitScalar64BitBCNT( addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); } -void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist, +void SIInstrInfo::splitScalar64BitBFE(SIInstrWorklist &Worklist, MachineInstr &Inst) const { MachineBasicBlock &MBB = *Inst.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); @@ -7181,9 +7184,8 @@ void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist, } void SIInstrInfo::addUsersToMoveToVALUWorklist( - Register DstReg, - MachineRegisterInfo &MRI, - SetVectorType &Worklist) const { + Register DstReg, MachineRegisterInfo &MRI, + SIInstrWorklist &Worklist) const { for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg), E = MRI.use_end(); I != E;) { MachineInstr &UseMI = *I->getParent(); @@ -7217,7 +7219,7 @@ void SIInstrInfo::addUsersToMoveToVALUWorklist( } } -void SIInstrInfo::movePackToVALU(SetVectorType &Worklist, +void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist, MachineRegisterInfo &MRI, MachineInstr &Inst) const { Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); @@ -7292,7 +7294,7 @@ void SIInstrInfo::movePackToVALU(SetVectorType &Worklist, void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op, MachineInstr &SCCDefInst, - SetVectorType &Worklist, + SIInstrWorklist &Worklist, Register NewCond) const { // Ensure that def inst defines SCC, which is still live. @@ -7335,7 +7337,7 @@ void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op, // sure that the instruction that defines SCC is added to the moveToVALU // worklist. void SIInstrInfo::addSCCDefsToVALUWorklist(MachineInstr *SCCUseInst, - SetVectorType &Worklist) const { + SIInstrWorklist &Worklist) const { // Look for a preceding instruction that either defines VCC or SCC. If VCC // then there is nothing to do because the defining instruction has been // converted to a VALU already. If SCC then that instruction needs to be diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index be1bc0d..10fed54 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -41,6 +41,41 @@ class ScheduleHazardRecognizer; static const MachineMemOperand::Flags MONoClobber = MachineMemOperand::MOTargetFlag1; +/// Utility to store machine instructions worklist. 
+struct SIInstrWorklist {
+  SIInstrWorklist() : InstrList() {}
+
+  void insert(MachineInstr *MI);
+
+  MachineInstr *top() const {
+    auto iter = InstrList.begin();
+    return *iter;
+  }
+
+  void erase_top() {
+    auto iter = InstrList.begin();
+    InstrList.erase(iter);
+  }
+
+  bool empty() const { return InstrList.empty(); }
+
+  void clear() {
+    InstrList.clear();
+    DeferredList.clear();
+  }
+
+  bool isDeferred(MachineInstr *MI);
+
+  SetVector<MachineInstr *> &getDeferredList() { return DeferredList; }
+
+private:
+  /// InstrList contains the MachineInstrs.
+  SetVector<MachineInstr *> InstrList;
+  /// Deferred instructions are specific MachineInstr
+  /// that will be added by insert method.
+  SetVector<MachineInstr *> DeferredList;
+};
+
 class SIInstrInfo final : public AMDGPUGenInstrInfo {
 private:
   const SIRegisterInfo RI;
@@ -81,57 +116,50 @@ private:
   void swapOperands(MachineInstr &Inst) const;
 
   std::pair<bool, MachineBasicBlock *>
-  moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst,
+  moveScalarAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst,
                    MachineDominatorTree *MDT = nullptr) const;
 
-  void lowerSelect(SetVectorType &Worklist, MachineInstr &Inst,
+  void lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
                    MachineDominatorTree *MDT = nullptr) const;
 
-  void lowerScalarAbs(SetVectorType &Worklist,
-                      MachineInstr &Inst) const;
+  void lowerScalarAbs(SIInstrWorklist &Worklist, MachineInstr &Inst) const;
 
-  void lowerScalarXnor(SetVectorType &Worklist,
-                       MachineInstr &Inst) const;
+  void lowerScalarXnor(SIInstrWorklist &Worklist, MachineInstr &Inst) const;
 
-  void splitScalarNotBinop(SetVectorType &Worklist,
-                           MachineInstr &Inst,
+  void splitScalarNotBinop(SIInstrWorklist &Worklist, MachineInstr &Inst,
                            unsigned Opcode) const;
 
-  void splitScalarBinOpN2(SetVectorType &Worklist,
-                          MachineInstr &Inst,
+  void splitScalarBinOpN2(SIInstrWorklist &Worklist, MachineInstr &Inst,
                           unsigned Opcode) const;
 
-  void splitScalar64BitUnaryOp(SetVectorType &Worklist,
-                               MachineInstr &Inst, unsigned Opcode,
-                               bool Swap = false) const;
+  void splitScalar64BitUnaryOp(SIInstrWorklist &Worklist, MachineInstr &Inst,
+                               unsigned Opcode, bool Swap = false) const;
 
-  void splitScalar64BitAddSub(SetVectorType &Worklist, MachineInstr &Inst,
+  void splitScalar64BitAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst,
                               MachineDominatorTree *MDT = nullptr) const;
 
-  void splitScalar64BitBinaryOp(SetVectorType &Worklist, MachineInstr &Inst,
+  void splitScalar64BitBinaryOp(SIInstrWorklist &Worklist, MachineInstr &Inst,
                                 unsigned Opcode,
                                 MachineDominatorTree *MDT = nullptr) const;
 
-  void splitScalar64BitXnor(SetVectorType &Worklist, MachineInstr &Inst,
-                        MachineDominatorTree *MDT = nullptr) const;
+  void splitScalar64BitXnor(SIInstrWorklist &Worklist, MachineInstr &Inst,
+                            MachineDominatorTree *MDT = nullptr) const;
 
-  void splitScalar64BitBCNT(SetVectorType &Worklist,
+  void splitScalar64BitBCNT(SIInstrWorklist &Worklist,
                             MachineInstr &Inst) const;
 
-  void splitScalar64BitBFE(SetVectorType &Worklist,
-                           MachineInstr &Inst) const;
-  void movePackToVALU(SetVectorType &Worklist,
-                      MachineRegisterInfo &MRI,
+  void splitScalar64BitBFE(SIInstrWorklist &Worklist, MachineInstr &Inst) const;
+  void movePackToVALU(SIInstrWorklist &Worklist, MachineRegisterInfo &MRI,
                       MachineInstr &Inst) const;
 
   void addUsersToMoveToVALUWorklist(Register Reg, MachineRegisterInfo &MRI,
-                                    SetVectorType &Worklist) const;
+                                    SIInstrWorklist &Worklist) const;
 
   void addSCCDefUsersToVALUWorklist(MachineOperand &Op,
                                     MachineInstr &SCCDefInst,
-                                    SetVectorType &Worklist,
+                                    SIInstrWorklist &Worklist,
                                     Register NewCond = Register()) const;
 
   void addSCCDefsToVALUWorklist(MachineInstr *SCCUseInst,
-                                SetVectorType &Worklist) const;
+                                SIInstrWorklist &Worklist) const;
 
   const TargetRegisterClass *
   getDestEquivalentVGPRClass(const MachineInstr &Inst) const;
@@ -1008,11 +1036,14 @@ public:
   /// was moved to VGPR. \returns true if succeeded.
   bool moveFlatAddrToVGPR(MachineInstr &Inst) const;
 
-  /// Replace this instruction's opcode with the equivalent VALU
-  /// opcode. This function will also move the users of \p MI to the
-  /// VALU if necessary. If present, \p MDT is updated.
-  MachineBasicBlock *moveToVALU(MachineInstr &MI,
-                                MachineDominatorTree *MDT = nullptr) const;
+  /// Replace each instruction's opcode with the equivalent VALU
+  /// opcode. This function will also move the users of the MachineInstrs
+  /// in \p Worklist to the VALU if necessary. If present, \p MDT is
+  /// updated.
+  void moveToVALU(SIInstrWorklist &Worklist, MachineDominatorTree *MDT) const;
+
+  void moveToVALUImpl(SIInstrWorklist &Worklist, MachineDominatorTree *MDT,
+                      MachineInstr &Inst) const;
 
   void insertNoop(MachineBasicBlock &MBB,
                   MachineBasicBlock::iterator MI) const override;
diff --git a/llvm/test/CodeGen/AMDGPU/add3.ll b/llvm/test/CodeGen/AMDGPU/add3.ll
index 3c9f232..d3f9c2d 100644
--- a/llvm/test/CodeGen/AMDGPU/add3.ll
+++ b/llvm/test/CodeGen/AMDGPU/add3.ll
@@ -222,7 +222,7 @@ define amdgpu_ps float @add3_uniform_vgpr(float inreg %a, float inreg %b, float
 ; VI-NEXT:    v_add_f32_e64 v1, s3, 2.0
 ; VI-NEXT:    v_mov_b32_e32 v2, 0x40400000
 ; VI-NEXT:    v_add_f32_e32 v2, s4, v2
-; VI-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; VI-NEXT:    ; return to shader part epilog
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
index cd38bf1..edab417 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --force-update
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: opt -S -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-codegenprepare -amdgpu-bypass-slow-div=0 %s | FileCheck %s
 ; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GFX6 %s
 ; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GFX9 %s
@@ -150,7 +150,7 @@ define amdgpu_kernel void @urem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
 ; GFX6-NEXT:    v_mul_lo_u32 v1, s4, v0
 ; GFX6-NEXT:    s_mov_b32 s4, s0
 ; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s2, v0
 ; GFX6-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX6-NEXT:    s_mul_i32 s0, s0, s3
@@ -261,7 +261,7 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
 ; GFX6-NEXT:    s_mov_b32 s4, s0
 ; GFX6-NEXT:    s_xor_b32 s0, s9, s8
 ; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s2, v0
 ; GFX6-NEXT:    v_readfirstlane_b32 s1, v0
 ; GFX6-NEXT:    s_mul_i32 s1, s1, s3
@@ -1226,7 +1226,7 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
 ; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[2:3]
-; GFX6-NEXT:
v_add_i32_e32 v1, vcc, v3, v1 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GFX6-NEXT: v_mul_hi_u32 v1, s9, v1 ; GFX6-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v4 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 @@ -1266,7 +1266,7 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX6-NEXT: v_add_i32_e32 v6, vcc, 1, v3 ; GFX6-NEXT: v_mul_hi_u32 v7, v5, v7 ; GFX6-NEXT: v_cndmask_b32_e64 v2, v3, v6, s[6:7] -; GFX6-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; GFX6-NEXT: v_mul_hi_u32 v5, s11, v5 ; GFX6-NEXT: v_readfirstlane_b32 s0, v5 ; GFX6-NEXT: s_mul_i32 s0, s0, s15 @@ -1538,7 +1538,7 @@ define amdgpu_kernel void @urem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX6-NEXT: v_mul_lo_u32 v0, s2, v1 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 @@ -1556,7 +1556,7 @@ define amdgpu_kernel void @urem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX6-NEXT: v_mul_lo_u32 v0, s2, v1 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 @@ -1871,7 +1871,7 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX6-NEXT: s_xor_b32 s5, s5, s4 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: s_xor_b32 s8, s4, s2 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 ; GFX6-NEXT: v_readfirstlane_b32 s2, v0 ; GFX6-NEXT: s_mul_i32 s2, s2, s3 @@ -1901,7 +1901,7 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX6-NEXT: s_xor_b32 s7, s7, s6 ; GFX6-NEXT: v_mul_hi_u32 v3, v2, v3 ; GFX6-NEXT: s_xor_b32 s9, s6, s4 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_mul_hi_u32 v2, s7, v2 ; GFX6-NEXT: v_readfirstlane_b32 s4, v2 ; GFX6-NEXT: s_mul_i32 s4, s4, s5 @@ -1963,7 +1963,7 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX6-NEXT: s_xor_b32 s1, s1, s0 ; GFX6-NEXT: v_mul_hi_u32 v2, v3, v2 ; GFX6-NEXT: s_xor_b32 s0, s0, s10 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_mul_hi_u32 v3, s1, v2 ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s12, v4 ; GFX6-NEXT: v_readfirstlane_b32 s2, v3 @@ -2318,7 +2318,7 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX6-NEXT: s_add_i32 s5, s5, s8 ; GFX6-NEXT: s_xor_b32 s5, s5, s8 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 ; GFX6-NEXT: v_readfirstlane_b32 s2, v0 ; GFX6-NEXT: s_mul_i32 s2, s2, s4 @@ -2344,7 +2344,7 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX6-NEXT: s_xor_b32 s5, s5, s4 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: s_sub_i32 s6, s2, s8 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 ; GFX6-NEXT: v_readfirstlane_b32 s2, v0 ; GFX6-NEXT: s_mul_i32 s2, s2, s3 @@ -5345,7 +5345,7 @@ define 
amdgpu_kernel void @srem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, |v6| ; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GFX6-NEXT: v_mul_lo_u32 v5, v5, s9 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, v3 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s7, v5 ; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v2 @@ -5754,7 +5754,7 @@ define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 ; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[2:3] -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1 ; GFX6-NEXT: v_readfirstlane_b32 s0, v1 ; GFX6-NEXT: s_mul_i32 s0, s0, s6 @@ -6090,7 +6090,7 @@ define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 ; GFX6-NEXT: v_readfirstlane_b32 s7, v0 ; GFX6-NEXT: s_mul_i32 s7, s7, s6 @@ -6178,7 +6178,7 @@ define amdgpu_kernel void @sdiv_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) { ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 31, v0 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 20, v0 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -6269,7 +6269,7 @@ define amdgpu_kernel void @sdiv_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x ; GFX6-NEXT: v_mul_lo_u32 v1, s4, v0 ; GFX6-NEXT: s_mov_b32 s4, s0 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0 ; GFX6-NEXT: v_readfirstlane_b32 s0, v0 ; GFX6-NEXT: s_mul_i32 s0, s0, s3 @@ -6416,7 +6416,7 @@ define amdgpu_kernel void @ssdiv_v2i32_mixed_pow2k_denom(ptr addrspace(1) %out, ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 31, v0 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 11, v0 ; GFX6-NEXT: s_mov_b32 s5, s1 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v0 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v0, v1 ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm @@ -6553,7 +6553,7 @@ define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_xor_b32 s4, s4, s6 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: s_xor_b32 s6, s6, s3 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX6-NEXT: v_readfirstlane_b32 s3, v0 ; GFX6-NEXT: s_mul_i32 s3, s3, s2 @@ -6586,7 +6586,7 @@ define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_xor_b32 s5, s5, s8 ; GFX6-NEXT: v_mul_hi_u32 v1, v2, v3 ; GFX6-NEXT: s_xor_b32 s4, s8, s4 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1 ; GFX6-NEXT: v_readfirstlane_b32 s6, v1 ; GFX6-NEXT: s_mul_i32 s6, s6, s7 @@ -6797,7 +6797,7 @@ define amdgpu_kernel void @srem_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x ; GFX6-NEXT: v_mul_lo_u32 v1, s3, v0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GFX6-NEXT: 
v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 ; GFX6-NEXT: v_readfirstlane_b32 s7, v0 ; GFX6-NEXT: s_mul_i32 s7, s7, s4 @@ -7043,7 +7043,7 @@ define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_xor_b32 s5, s2, s8 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 ; GFX6-NEXT: v_readfirstlane_b32 s7, v0 ; GFX6-NEXT: s_mul_i32 s7, s7, s4 @@ -7221,9 +7221,9 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) { ; GFX6-NEXT: v_mul_hi_u32 v3, v0, s9 ; GFX6-NEXT: v_mul_lo_u32 v4, v1, s9 ; GFX6-NEXT: v_mov_b32_e32 v5, 0x11f -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_mul_lo_u32 v3, v0, s9 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s3, v2 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s2, v3 ; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc @@ -7571,7 +7571,7 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(ptr addrspace(1) %out, < ; GFX6-NEXT: v_mul_lo_u32 v4, v0, s6 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v0 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_mul_lo_u32 v3, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v5, v0, v4 ; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 @@ -7907,8 +7907,8 @@ define amdgpu_kernel void @urem_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) { ; GFX6-NEXT: v_mul_hi_u32 v3, v0, s12 ; GFX6-NEXT: v_mul_lo_u32 v1, v1, s12 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s12 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s7, v1 ; GFX6-NEXT: v_mov_b32_e32 v3, 0x11f ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 @@ -8295,7 +8295,7 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) { ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_mov_b32 s1, s5 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v0 ; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4 ; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 @@ -8314,9 +8314,9 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) { ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc ; GFX6-NEXT: v_mul_lo_u32 v2, v1, s8 ; GFX6-NEXT: v_mul_hi_u32 v3, v0, s8 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_mul_lo_u32 v3, v0, s8 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v0 ; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3 ; GFX6-NEXT: v_mul_hi_u32 v8, v0, v2 @@ -8359,7 +8359,7 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) { ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc ; GFX6-NEXT: v_add_i32_e32 v6, vcc, 2, v0 ; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; GFX6-NEXT: v_mov_b32_e32 v5, s7 ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s6, v8 ; GFX6-NEXT: v_subb_u32_e32 v4, vcc, v5, v4, vcc @@ -8607,7 +8607,7 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; 
GFX6-NEXT: v_mul_lo_u32 v4, s4, v0 ; GFX6-NEXT: s_addc_u32 s3, s3, s12 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4 ; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 @@ -8667,9 +8667,9 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: v_mul_hi_u32 v3, s10, v0 ; GFX6-NEXT: v_mul_lo_u32 v4, s11, v0 ; GFX6-NEXT: v_mov_b32_e32 v5, s11 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_mul_lo_u32 v3, s10, v0 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s3, v2 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s2, v3 ; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc @@ -8964,7 +8964,7 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(ptr addrspace(1) %out, ; GFX6-NEXT: s_ashr_i64 s[0:1], s[0:1], 12 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_mul_lo_u32 v3, v0, s6 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v0 ; GFX6-NEXT: v_mul_lo_u32 v4, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v5, v0, v3 ; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 @@ -8988,9 +8988,9 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(ptr addrspace(1) %out, ; GFX6-NEXT: s_mov_b32 s9, s8 ; GFX6-NEXT: s_addc_u32 s3, s3, s8 ; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[8:9] -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_mul_lo_u32 v3, v0, s6 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v0 ; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3 ; GFX6-NEXT: v_mul_hi_u32 v8, v0, v2 @@ -9030,7 +9030,7 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(ptr addrspace(1) %out, ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc ; GFX6-NEXT: v_add_i32_e32 v6, vcc, 2, v0 ; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; GFX6-NEXT: v_mov_b32_e32 v5, s3 ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s2, v8 ; GFX6-NEXT: v_subb_u32_e32 v4, vcc, v5, v4, vcc @@ -9251,7 +9251,7 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_mul_lo_u32 v5, s11, v0 ; GFX6-NEXT: v_mul_lo_u32 v4, s10, v0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4 ; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v7, v0, v2 @@ -9272,9 +9272,9 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_mul_hi_u32 v3, s10, v0 ; GFX6-NEXT: v_mul_lo_u32 v4, s11, v0 ; GFX6-NEXT: s_mov_b32 s11, 0xf000 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_mul_lo_u32 v3, s10, v0 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3 ; GFX6-NEXT: v_mul_hi_u32 v8, v0, v2 @@ -9366,7 +9366,7 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_xor_b32_e32 v0, s0, v0 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GFX6-NEXT: v_mul_lo_u32 v5, s12, v2 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; GFX6-NEXT: v_add_i32_e32 
v4, vcc, v4, v6 ; GFX6-NEXT: v_mul_lo_u32 v6, v2, v4 ; GFX6-NEXT: v_mul_hi_u32 v7, v2, v5 ; GFX6-NEXT: v_mul_hi_u32 v8, v2, v4 @@ -9387,9 +9387,9 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_mul_lo_u32 v4, s12, v3 ; GFX6-NEXT: v_mul_hi_u32 v5, s12, v2 ; GFX6-NEXT: v_mul_lo_u32 v6, s13, v2 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; GFX6-NEXT: v_mul_lo_u32 v5, s12, v2 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; GFX6-NEXT: v_mul_lo_u32 v8, v2, v4 ; GFX6-NEXT: v_mul_hi_u32 v9, v2, v5 ; GFX6-NEXT: v_mul_hi_u32 v10, v2, v4 @@ -9818,7 +9818,7 @@ define amdgpu_kernel void @srem_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) { ; GFX6-NEXT: v_mul_hi_u32 v3, v0, s8 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_mul_lo_u32 v3, v0, s8 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v0 ; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3 ; GFX6-NEXT: v_mul_hi_u32 v8, v0, v2 @@ -9861,7 +9861,7 @@ define amdgpu_kernel void @srem_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) { ; GFX6-NEXT: v_mul_hi_u32 v2, v0, s4 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s4 ; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; GFX6-NEXT: v_mov_b32_e32 v2, s7 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 ; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc @@ -10111,7 +10111,7 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: v_mul_lo_u32 v4, s4, v0 ; GFX6-NEXT: s_addc_u32 s3, s3, s10 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4 ; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 @@ -10133,9 +10133,9 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: v_mul_hi_u32 v3, s4, v0 ; GFX6-NEXT: v_mul_lo_u32 v4, s5, v0 ; GFX6-NEXT: s_mov_b32 s5, s1 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_mul_lo_u32 v3, s4, v0 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3 ; GFX6-NEXT: v_mul_hi_u32 v8, v0, v2 @@ -10485,7 +10485,7 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_mul_lo_u32 v4, s2, v0 ; GFX6-NEXT: s_mov_b32 s10, -1 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4 ; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v7, v0, v2 @@ -10505,9 +10505,9 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 ; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 ; GFX6-NEXT: v_mul_lo_u32 v4, s3, v0 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_mul_lo_u32 v3, s2, v0 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3 ; GFX6-NEXT: v_mul_hi_u32 v8, v0, v2 @@ -10596,7 +10596,7 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_ashr_i32 s14, 
s7, 31 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; GFX6-NEXT: v_mul_lo_u32 v5, s0, v2 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; GFX6-NEXT: v_mul_lo_u32 v6, v2, v3 ; GFX6-NEXT: v_mul_hi_u32 v7, v2, v5 ; GFX6-NEXT: v_mul_hi_u32 v8, v2, v3 @@ -10619,9 +10619,9 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_mul_hi_u32 v5, s0, v2 ; GFX6-NEXT: v_mul_lo_u32 v6, s1, v2 ; GFX6-NEXT: v_xor_b32_e32 v1, s12, v1 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; GFX6-NEXT: v_mul_lo_u32 v5, s0, v2 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; GFX6-NEXT: v_mul_lo_u32 v8, v2, v4 ; GFX6-NEXT: v_mul_hi_u32 v9, v2, v5 ; GFX6-NEXT: v_mul_hi_u32 v10, v2, v4 diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll index 69e695c..6837ebce 100644 --- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll +++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll @@ -1801,7 +1801,7 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; CISI-NEXT: v_mul_lo_u32 v5, s1, v0 ; CISI-NEXT: v_mul_lo_u32 v4, s0, v0 ; CISI-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; CISI-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; CISI-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; CISI-NEXT: v_mul_hi_u32 v3, v0, v4 ; CISI-NEXT: v_mul_lo_u32 v5, v0, v2 ; CISI-NEXT: v_mul_hi_u32 v7, v0, v2 @@ -1858,9 +1858,9 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; CISI-NEXT: v_mul_hi_u32 v3, s2, v0 ; CISI-NEXT: v_mul_lo_u32 v4, s3, v0 ; CISI-NEXT: v_mov_b32_e32 v5, s3 -; CISI-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; CISI-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CISI-NEXT: v_mul_lo_u32 v3, s2, v0 -; CISI-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; CISI-NEXT: v_add_i32_e32 v2, vcc, v4, v2 ; CISI-NEXT: v_sub_i32_e32 v4, vcc, s7, v2 ; CISI-NEXT: v_sub_i32_e32 v3, vcc, s6, v3 ; CISI-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc @@ -1950,7 +1950,7 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v5, 0 ; VI-NEXT: v_mul_lo_u32 v3, s9, v5 ; VI-NEXT: v_add_u32_e32 v1, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v1, v3 ; VI-NEXT: v_mul_hi_u32 v6, v5, v0 ; VI-NEXT: v_mad_u64_u32 v[1:2], s[0:1], v5, v3, 0 ; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v1 @@ -1969,8 +1969,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; VI-NEXT: v_mul_lo_u32 v5, s9, v6 ; VI-NEXT: v_mul_hi_u32 v8, v6, v0 ; VI-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v7, v0, 0 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v4 -; VI-NEXT: v_add_u32_e32 v1, vcc, v5, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v4, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v5 ; VI-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v6, v1, 0 ; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v7, v1, 0 ; VI-NEXT: v_add_u32_e32 v4, vcc, v8, v4 diff --git a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll index 44a612e..f3b746f 100644 --- a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll @@ -43,7 +43,7 @@ ; Spill val register -; GCN: v_add_i32_e32 [[VAL:v[0-9]+]], vcc, [[LOAD1]], [[RELOAD_LOAD0]] +; GCN: v_add_i32_e32 [[VAL:v[0-9]+]], vcc, [[RELOAD_LOAD0]], [[LOAD1]] ; GCN: buffer_store_dword [[VAL]], off, s[0:3], 0 offset:[[VAL_OFFSET:[0-9]+]] ; 
4-byte Folded Spill ; VMEM: [[ENDIF]]: @@ -110,15 +110,15 @@ endif: ; GCN: [[LOOP:.LBB[0-9]+_[0-9]+]]: ; GCN: buffer_load_dword v[[VAL_LOOP_RELOAD:[0-9]+]], off, s[0:3], 0 offset:[[LOAD0_OFFSET]] ; 4-byte Folded Reload -; GCN: v_subrev_i32_e32 [[VAL_LOOP:v[0-9]+]], vcc, v{{[0-9]+}}, v[[VAL_LOOP_RELOAD]] +; GCN: v_sub_i32_e32 v[[VAL_LOOP_RELOAD]], vcc, v[[VAL_LOOP_RELOAD]], v{{[0-9]+}} ; GCN: s_cmp_lg_u32 ; VMEM: buffer_store_dword ; VMEM: buffer_store_dword ; VMEM: buffer_store_dword -; GCN: buffer_store_dword [[VAL_LOOP]], off, s[0:3], 0 offset:{{[0-9]+}} ; 4-byte Folded Spill +; GCN: buffer_store_dword v[[VAL_LOOP_RELOAD]], off, s[0:3], 0 offset:{{[0-9]+}} ; 4-byte Folded Spill ; GCN-NEXT: s_cbranch_scc1 [[LOOP]] -; GCN: buffer_store_dword [[VAL_LOOP]], off, s[0:3], 0 offset:[[VAL_SUB_OFFSET:[0-9]+]] ; 4-byte Folded Spill +; GCN: buffer_store_dword v[[VAL_LOOP_RELOAD]], off, s[0:3], 0 offset:[[VAL_SUB_OFFSET:[0-9]+]] ; 4-byte Folded Spill ; GCN: [[END]]: ; GCN: buffer_load_dword v[[VAL_END:[0-9]+]], off, s[0:3], 0 offset:[[VAL_SUB_OFFSET]] ; 4-byte Folded Reload @@ -219,14 +219,14 @@ end: ; GCN: ; %bb.{{[0-9]+}}: ; %if ; GCN: buffer_load_dword v[[LOAD0_RELOAD:[0-9]+]], off, s[0:3], 0 offset:[[LOAD0_OFFSET]] ; 4-byte Folded Reload ; GCN: ds_read_b32 -; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, v{{[0-9]+}}, v[[LOAD0_RELOAD]] -; GCN: buffer_store_dword [[ADD]], off, s[0:3], 0 offset:[[RESULT_OFFSET]] ; 4-byte Folded Spill +; GCN: v_add_i32_e32 v[[LOAD0_RELOAD]], vcc, v[[LOAD0_RELOAD]], [[ADD:v[0-9]+]] +; GCN: buffer_store_dword v[[LOAD0_RELOAD]], off, s[0:3], 0 offset:[[RESULT_OFFSET]] ; 4-byte Folded Spill ; GCN-NEXT: s_branch [[ENDIF:.LBB[0-9]+_[0-9]+]] ; GCN: [[ELSE]]: ; %else ; GCN: buffer_load_dword v[[LOAD0_RELOAD:[0-9]+]], off, s[0:3], 0 offset:[[LOAD0_OFFSET]] ; 4-byte Folded Reload -; GCN: v_subrev_i32_e32 [[SUB:v[0-9]+]], vcc, v{{[0-9]+}}, v[[LOAD0_RELOAD]] -; GCN: buffer_store_dword [[ADD]], off, s[0:3], 0 offset:[[FLOW_RESULT_OFFSET:[0-9]+]] ; 4-byte Folded Spill +; GCN: v_sub_i32_e32 v[[LOAD0_RELOAD]], vcc, v[[LOAD0_RELOAD]], v{{[0-9]+}} +; GCN: buffer_store_dword v[[LOAD0_RELOAD]], off, s[0:3], 0 offset:[[FLOW_RESULT_OFFSET:[0-9]+]] ; 4-byte Folded Spill ; GCN-NEXT: s_branch [[FLOW]] ; GCN: [[ENDIF]]: diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll index 8fd43c5..9ec9414 100644 --- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll @@ -942,7 +942,7 @@ define amdgpu_kernel void @load_constant_adjacent_offsets(ptr addrspace(1) %out) ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll index d15a60e..458a929 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll @@ -396,7 +396,7 @@ define amdgpu_kernel void @bfe_sext_in_reg_i24(ptr addrspace(1) %out, ptr addrsp ; GCN: buffer_load_dword [[LOAD:v[0-9]+]] ; GCN: v_bfe_i32 [[BFE:v[0-9]+]], [[LOAD]], 1, 16 ; GCN: v_lshrrev_b32_e32 [[TMP0:v[0-9]+]], 31, [[BFE]] -; GCN: v_add_{{[iu]}}32_e32 [[TMP1:v[0-9]+]], vcc, [[TMP0]], [[BFE]] +; GCN: v_add_{{[iu]}}32_e32 [[TMP1:v[0-9]+]], vcc, [[BFE]], [[TMP0]] ; GCN: v_ashrrev_i32_e32 [[TMP2:v[0-9]+]], 1, [[TMP1]] ; GCN: buffer_store_dword [[TMP2]] define amdgpu_kernel void 
@simplify_demanded_bfe_sdiv(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll index eb1b4f0..3df5344 100644 --- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll +++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll @@ -154,9 +154,6 @@ define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 { ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec -; W64-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 killed $exec -; W64-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $exec ; W64-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec ; W64-O0-NEXT: v_mov_b32_e32 v1, v7 ; W64-O0-NEXT: v_mov_b32_e32 v2, v6 @@ -500,9 +497,6 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $exec -; W64-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $exec -; W64-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $exec ; W64-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15_vgpr16_vgpr17 killed $exec ; W64-O0-NEXT: v_mov_b32_e32 v15, v5 ; W64-O0-NEXT: s_waitcnt vmcnt(3) @@ -518,9 +512,6 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec -; W64-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 killed $exec -; W64-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 killed $exec ; W64-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec ; W64-O0-NEXT: v_mov_b32_e32 v3, v8 ; W64-O0-NEXT: v_mov_b32_e32 v4, v7 @@ -532,7 +523,6 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt ; W64-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 killed $exec ; W64-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; W64-O0-NEXT: v_mov_b32_e32 v2, v12 ; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill @@ -540,7 +530,6 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt ; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 killed $exec ; W64-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; W64-O0-NEXT: v_mov_b32_e32 v1, v10 ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill @@ -1007,9 +996,6 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec -; W64-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 killed $exec -; W64-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 killed $exec ; W64-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec ; 
W64-O0-NEXT: v_mov_b32_e32 v1, v11 ; W64-O0-NEXT: v_mov_b32_e32 v2, v10 @@ -1018,9 +1004,6 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 killed $exec -; W64-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 killed $exec -; W64-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $exec ; W64-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14_vgpr15_vgpr16 killed $exec ; W64-O0-NEXT: v_mov_b32_e32 v14, v7 ; W64-O0-NEXT: v_mov_b32_e32 v15, v6 @@ -1032,7 +1015,6 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad ; W64-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 killed $exec ; W64-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec ; W64-O0-NEXT: v_mov_b32_e32 v5, v12 ; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir index a0d0c05..6d3f1f1 100644 --- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir +++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir @@ -40,10 +40,7 @@ body: | ; W64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; W64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; W64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; W64-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec - ; W64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec - ; W64-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec - ; W64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, [[COPY3]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 + ; W64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 ; W64-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; W64-NEXT: {{ $}} ; W64-NEXT: .1: @@ -87,10 +84,7 @@ body: | ; W32-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; W32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; W32-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; W32-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec - ; W32-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec - ; W32-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec - ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, [[COPY3]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 + ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 ; W32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; W32-NEXT: {{ $}} ; W32-NEXT: .1: @@ -160,10 +154,7 @@ body: | ; W64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; W64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; W64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; W64-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec - ; W64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec - ; W64-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit 
$exec - ; W64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, killed [[COPY6]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 + ; W64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 ; W64-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; W64-NEXT: {{ $}} ; W64-NEXT: .1: @@ -207,10 +198,7 @@ body: | ; W32-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; W32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; W32-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; W32-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec - ; W32-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec - ; W32-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec - ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, killed [[COPY6]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 + ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 ; W32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; W32-NEXT: {{ $}} ; W32-NEXT: .1: @@ -280,10 +268,7 @@ body: | ; W64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; W64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; W64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; W64-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec - ; W64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec - ; W64-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec - ; W64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, [[COPY2]], %subreg.sub3 + ; W64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 ; W64-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; W64-NEXT: {{ $}} ; W64-NEXT: .1: @@ -327,10 +312,7 @@ body: | ; W32-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; W32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; W32-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; W32-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec - ; W32-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec - ; W32-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec - ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, [[COPY2]], %subreg.sub3 + ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 ; W32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; W32-NEXT: {{ $}} ; W32-NEXT: .1: @@ -399,18 +381,15 @@ body: | ; ADDR64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; ADDR64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; ADDR64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; ADDR64-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec - ; ADDR64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec - ; ADDR64-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec - ; ADDR64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed 
[[COPY7]], %subreg.sub1, [[COPY3]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 - ; ADDR64-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; ADDR64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 + ; ADDR64-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; ADDR64-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; ADDR64-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0 ; ADDR64-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 61440 ; ADDR64-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[S_MOV_B32_]], %subreg.sub2, [[S_MOV_B32_1]], %subreg.sub3 - ; ADDR64-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY9]].sub0, [[COPY1]].sub0, 0, implicit $exec - ; ADDR64-NEXT: %17:vgpr_32, dead %20:sreg_64_xexec = V_ADDC_U32_e64 [[COPY9]].sub1, [[COPY1]].sub1, killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; ADDR64-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %17, %subreg.sub1 + ; ADDR64-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY6]].sub0, [[COPY1]].sub0, 0, implicit $exec + ; ADDR64-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY6]].sub1, [[COPY1]].sub1, killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; ADDR64-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; ADDR64-NEXT: [[BUFFER_LOAD_FORMAT_X_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_ADDR64 [[REG_SEQUENCE2]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec ; ADDR64-NEXT: $sgpr30_sgpr31 = COPY [[COPY]] ; ADDR64-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_ADDR64_]] @@ -428,18 +407,15 @@ body: | ; W32-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; W32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; W32-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; W32-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec - ; W32-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec - ; W32-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec - ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, [[COPY3]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 - ; W32-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 + ; W32-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; W32-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; W32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0 ; W32-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 822173696 ; W32-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[S_MOV_B32_]], %subreg.sub2, [[S_MOV_B32_1]], %subreg.sub3 - ; W32-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY9]].sub0, [[COPY1]].sub0, 0, implicit $exec - ; W32-NEXT: %17:vgpr_32, dead %20:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY9]].sub1, [[COPY1]].sub1, killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; W32-NEXT: 
[[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %17, %subreg.sub1 + ; W32-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY6]].sub0, [[COPY1]].sub0, 0, implicit $exec + ; W32-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY6]].sub1, [[COPY1]].sub1, killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; W32-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; W32-NEXT: [[BUFFER_LOAD_FORMAT_X_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_ADDR64 [[REG_SEQUENCE2]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec ; W32-NEXT: $sgpr30_sgpr31 = COPY [[COPY]] ; W32-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_ADDR64_]] @@ -484,16 +460,13 @@ body: | ; ADDR64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; ADDR64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; ADDR64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; ADDR64-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec - ; ADDR64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec - ; ADDR64-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec - ; ADDR64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, [[COPY3]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 - ; ADDR64-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; ADDR64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 + ; ADDR64-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; ADDR64-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; ADDR64-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0 ; ADDR64-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 61440 ; ADDR64-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[S_MOV_B32_]], %subreg.sub2, [[S_MOV_B32_1]], %subreg.sub3 - ; ADDR64-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]].sub0, %subreg.sub0, [[COPY9]].sub1, %subreg.sub1 + ; ADDR64-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]].sub0, %subreg.sub0, [[COPY6]].sub1, %subreg.sub1 ; ADDR64-NEXT: [[BUFFER_LOAD_FORMAT_X_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_ADDR64 [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec ; ADDR64-NEXT: $sgpr30_sgpr31 = COPY [[COPY]] ; ADDR64-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_ADDR64_]] @@ -512,10 +485,7 @@ body: | ; W64-NO-ADDR64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; W64-NO-ADDR64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; W64-NO-ADDR64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; W64-NO-ADDR64-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec - ; W64-NO-ADDR64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec - ; W64-NO-ADDR64-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec - ; W64-NO-ADDR64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, [[COPY3]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 + ; W64-NO-ADDR64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 ; W64-NO-ADDR64-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = 
S_MOV_B64 $exec ; W64-NO-ADDR64-NEXT: {{ $}} ; W64-NO-ADDR64-NEXT: .1: @@ -559,10 +529,7 @@ body: | ; W32-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; W32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; W32-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; W32-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec - ; W32-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec - ; W32-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec - ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, [[COPY3]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 + ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 ; W32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; W32-NEXT: {{ $}} ; W32-NEXT: .1: diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll index 648f44c..f214e9a 100644 --- a/llvm/test/CodeGen/AMDGPU/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/mul.ll @@ -1204,7 +1204,7 @@ define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap ; SI-NEXT: v_mul_lo_u32 v3, v3, v0 ; SI-NEXT: v_mul_lo_u32 v0, v2, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -1230,7 +1230,7 @@ define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap ; VI-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v2, v0, 0 ; VI-NEXT: v_mul_lo_u32 v0, v3, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, v4, v2 -; VI-NEXT: v_add_u32_e32 v2, vcc, v0, v2 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir b/llvm/test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir index c845453..df49ff2 100644 --- a/llvm/test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir +++ b/llvm/test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir @@ -97,8 +97,7 @@ body: | ; GCN-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_LOAD_DWORDX2_IMM1]].sub0, [[REG_SEQUENCE]].sub0, implicit-def $vcc, implicit $exec ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub1 ; GCN-NEXT: [[V_ADDC_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADDC_U32_e32 0, [[COPY3]], implicit-def $vcc, implicit $vcc, implicit $exec - ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY killed [[V_ADD_CO_U32_e32_]], implicit $exec - ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[COPY4]], %subreg.sub0, killed [[V_ADDC_U32_e32_]], %subreg.sub1 + ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[V_ADD_CO_U32_e32_]], %subreg.sub0, killed [[V_ADDC_U32_e32_]], %subreg.sub1 ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 0 ; GCN-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 1048576 ; GCN-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1 @@ -374,8 +373,7 @@ body: | ; GCN-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_LOAD_DWORDX2_IMM1]].sub0, [[REG_SEQUENCE]].sub0, implicit-def $vcc, implicit $exec ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub1 ; GCN-NEXT: [[V_ADDC_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADDC_U32_e32 0, [[COPY3]], implicit-def $vcc, implicit $vcc, implicit $exec - ; 
GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY killed [[V_ADD_CO_U32_e32_]], implicit $exec - ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[COPY4]], %subreg.sub0, killed [[V_ADDC_U32_e32_]], %subreg.sub1 + ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[V_ADD_CO_U32_e32_]], %subreg.sub0, killed [[V_ADDC_U32_e32_]], %subreg.sub1 ; GCN-NEXT: [[V_CMP_LT_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_U32_e64 killed [[REG_SEQUENCE1]].sub0, 12, implicit $exec ; GCN-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_LT_U32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GCN-NEXT: S_BRANCH %bb.1 diff --git a/llvm/test/CodeGen/AMDGPU/sdiv.ll b/llvm/test/CodeGen/AMDGPU/sdiv.ll index c9a393d..974cb71 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv.ll @@ -29,13 +29,13 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_ashrrev_i32_e32 v2, 31, v1 -; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; GCN-NEXT: v_xor_b32_e32 v1, v1, v2 ; GCN-NEXT: v_cvt_f32_u32_e32 v3, v1 ; GCN-NEXT: v_sub_i32_e32 v4, vcc, 0, v1 ; GCN-NEXT: v_ashrrev_i32_e32 v5, 31, v0 ; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GCN-NEXT: v_add_i32_e32 v0, vcc, v5, v0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v5 ; GCN-NEXT: v_xor_b32_e32 v0, v0, v5 ; GCN-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3 @@ -47,7 +47,7 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GCN-NEXT: v_mul_lo_u32 v4, v3, v1 ; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v3 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 -; GCN-NEXT: v_subrev_i32_e32 v4, vcc, v1, v0 +; GCN-NEXT: v_sub_i32_e32 v4, vcc, v0, v1 ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 ; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc @@ -55,7 +55,7 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 ; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc ; GCN-NEXT: v_xor_b32_e32 v0, v0, v2 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v2, v0 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; @@ -74,13 +74,13 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; TONGA-NEXT: s_mov_b32 s5, s1 ; TONGA-NEXT: s_waitcnt vmcnt(0) ; TONGA-NEXT: v_ashrrev_i32_e32 v2, 31, v1 -; TONGA-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; TONGA-NEXT: v_add_u32_e32 v1, vcc, v1, v2 ; TONGA-NEXT: v_xor_b32_e32 v1, v1, v2 ; TONGA-NEXT: v_cvt_f32_u32_e32 v3, v1 ; TONGA-NEXT: v_sub_u32_e32 v4, vcc, 0, v1 ; TONGA-NEXT: v_ashrrev_i32_e32 v5, 31, v0 ; TONGA-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; TONGA-NEXT: v_add_u32_e32 v0, vcc, v5, v0 +; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v5 ; TONGA-NEXT: v_xor_b32_e32 v0, v0, v5 ; TONGA-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; TONGA-NEXT: v_cvt_u32_f32_e32 v3, v3 @@ -92,7 +92,7 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; TONGA-NEXT: v_mul_lo_u32 v4, v3, v1 ; TONGA-NEXT: v_add_u32_e32 v5, vcc, 1, v3 ; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v4 -; TONGA-NEXT: v_subrev_u32_e32 v4, vcc, v1, v0 +; TONGA-NEXT: v_sub_u32_e32 v4, vcc, v0, v1 ; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 ; TONGA-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; TONGA-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc @@ -100,7 
+100,7 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 ; TONGA-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc ; TONGA-NEXT: v_xor_b32_e32 v0, v0, v2 -; TONGA-NEXT: v_subrev_u32_e32 v0, vcc, v2, v0 +; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v2 ; TONGA-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; TONGA-NEXT: s_endpgm ; @@ -315,7 +315,7 @@ define amdgpu_kernel void @slow_sdiv_i32_3435(ptr addrspace(1) %out, ptr addrspa ; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 31, v0 ; GCN-NEXT: v_ashrrev_i32_e32 v0, 11, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; @@ -338,7 +338,7 @@ define amdgpu_kernel void @slow_sdiv_i32_3435(ptr addrspace(1) %out, ptr addrspa ; TONGA-NEXT: v_add_u32_e32 v0, vcc, v1, v0 ; TONGA-NEXT: v_lshrrev_b32_e32 v1, 31, v0 ; TONGA-NEXT: v_ashrrev_i32_e32 v0, 11, v0 -; TONGA-NEXT: v_add_u32_e32 v0, vcc, v1, v0 +; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; TONGA-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; TONGA-NEXT: s_endpgm ; @@ -410,8 +410,8 @@ define amdgpu_kernel void @sdiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_ashrrev_i32_e32 v5, 31, v2 ; GCN-NEXT: v_ashrrev_i32_e32 v7, 31, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v7, v3 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v7 ; GCN-NEXT: v_ashrrev_i32_e32 v4, 31, v0 ; GCN-NEXT: v_ashrrev_i32_e32 v6, 31, v1 ; GCN-NEXT: v_xor_b32_e32 v2, v2, v5 @@ -428,16 +428,16 @@ define amdgpu_kernel void @sdiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7 ; GCN-NEXT: v_cvt_u32_f32_e32 v5, v5 ; GCN-NEXT: v_cvt_u32_f32_e32 v7, v7 -; GCN-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v4 ; GCN-NEXT: v_mul_lo_u32 v10, v10, v5 ; GCN-NEXT: v_mul_lo_u32 v11, v11, v7 -; GCN-NEXT: v_add_i32_e32 v1, vcc, v6, v1 +; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v6 ; GCN-NEXT: v_xor_b32_e32 v0, v0, v4 ; GCN-NEXT: v_mul_hi_u32 v4, v5, v10 ; GCN-NEXT: v_xor_b32_e32 v1, v1, v6 ; GCN-NEXT: v_mul_hi_u32 v6, v7, v11 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v7 +; GCN-NEXT: v_add_i32_e32 v5, vcc, v7, v6 ; GCN-NEXT: v_mul_hi_u32 v4, v0, v4 ; GCN-NEXT: v_mul_hi_u32 v5, v1, v5 ; GCN-NEXT: v_mul_lo_u32 v6, v4, v2 @@ -448,9 +448,9 @@ define amdgpu_kernel void @sdiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_add_i32_e32 v11, vcc, 1, v5 ; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v2 ; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v3 -; GCN-NEXT: v_subrev_i32_e32 v6, vcc, v2, v0 +; GCN-NEXT: v_sub_i32_e32 v6, vcc, v0, v2 ; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[0:1] -; GCN-NEXT: v_subrev_i32_e32 v7, vcc, v3, v1 +; GCN-NEXT: v_sub_i32_e32 v7, vcc, v1, v3 ; GCN-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[2:3] ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[0:1] ; GCN-NEXT: v_add_i32_e32 v6, vcc, 1, v4 @@ -462,8 +462,8 @@ define amdgpu_kernel void @sdiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc ; GCN-NEXT: v_xor_b32_e32 v0, v0, v8 ; GCN-NEXT: v_xor_b32_e32 v1, v1, v9 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v8, v0 -; GCN-NEXT: v_subrev_i32_e32 v1, vcc, v9, v1 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 +; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v9 ; GCN-NEXT: 
buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; @@ -483,8 +483,8 @@ define amdgpu_kernel void @sdiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: s_waitcnt vmcnt(0) ; TONGA-NEXT: v_ashrrev_i32_e32 v5, 31, v2 ; TONGA-NEXT: v_ashrrev_i32_e32 v7, 31, v3 -; TONGA-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; TONGA-NEXT: v_add_u32_e32 v3, vcc, v7, v3 +; TONGA-NEXT: v_add_u32_e32 v2, vcc, v2, v5 +; TONGA-NEXT: v_add_u32_e32 v3, vcc, v3, v7 ; TONGA-NEXT: v_ashrrev_i32_e32 v4, 31, v0 ; TONGA-NEXT: v_ashrrev_i32_e32 v6, 31, v1 ; TONGA-NEXT: v_xor_b32_e32 v2, v2, v5 @@ -501,16 +501,16 @@ define amdgpu_kernel void @sdiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7 ; TONGA-NEXT: v_cvt_u32_f32_e32 v5, v5 ; TONGA-NEXT: v_cvt_u32_f32_e32 v7, v7 -; TONGA-NEXT: v_add_u32_e32 v0, vcc, v4, v0 +; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v4 ; TONGA-NEXT: v_mul_lo_u32 v10, v10, v5 ; TONGA-NEXT: v_mul_lo_u32 v11, v11, v7 -; TONGA-NEXT: v_add_u32_e32 v1, vcc, v6, v1 +; TONGA-NEXT: v_add_u32_e32 v1, vcc, v1, v6 ; TONGA-NEXT: v_xor_b32_e32 v0, v0, v4 ; TONGA-NEXT: v_mul_hi_u32 v4, v5, v10 ; TONGA-NEXT: v_xor_b32_e32 v1, v1, v6 ; TONGA-NEXT: v_mul_hi_u32 v6, v7, v11 ; TONGA-NEXT: v_add_u32_e32 v4, vcc, v5, v4 -; TONGA-NEXT: v_add_u32_e32 v5, vcc, v6, v7 +; TONGA-NEXT: v_add_u32_e32 v5, vcc, v7, v6 ; TONGA-NEXT: v_mul_hi_u32 v4, v0, v4 ; TONGA-NEXT: v_mul_hi_u32 v5, v1, v5 ; TONGA-NEXT: v_mul_lo_u32 v6, v4, v2 @@ -521,9 +521,9 @@ define amdgpu_kernel void @sdiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_add_u32_e32 v11, vcc, 1, v5 ; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v2 ; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v3 -; TONGA-NEXT: v_subrev_u32_e32 v6, vcc, v2, v0 +; TONGA-NEXT: v_sub_u32_e32 v6, vcc, v0, v2 ; TONGA-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[0:1] -; TONGA-NEXT: v_subrev_u32_e32 v7, vcc, v3, v1 +; TONGA-NEXT: v_sub_u32_e32 v7, vcc, v1, v3 ; TONGA-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[2:3] ; TONGA-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[0:1] ; TONGA-NEXT: v_add_u32_e32 v6, vcc, 1, v4 @@ -535,8 +535,8 @@ define amdgpu_kernel void @sdiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc ; TONGA-NEXT: v_xor_b32_e32 v0, v0, v8 ; TONGA-NEXT: v_xor_b32_e32 v1, v1, v9 -; TONGA-NEXT: v_subrev_u32_e32 v0, vcc, v8, v0 -; TONGA-NEXT: v_subrev_u32_e32 v1, vcc, v9, v1 +; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v8 +; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v1, v9 ; TONGA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; TONGA-NEXT: s_endpgm ; @@ -824,22 +824,22 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_ashrrev_i32_e32 v9, 31, v4 ; GCN-NEXT: v_ashrrev_i32_e32 v11, 31, v5 ; GCN-NEXT: v_ashrrev_i32_e32 v10, 31, v1 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v9, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v11, v5 +; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v11 ; GCN-NEXT: v_ashrrev_i32_e32 v13, 31, v6 -; GCN-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, v10, v1 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v8 +; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v10 ; GCN-NEXT: v_xor_b32_e32 v4, v4, v9 ; GCN-NEXT: v_xor_b32_e32 v5, v5, v11 ; GCN-NEXT: v_ashrrev_i32_e32 v12, 31, v2 ; GCN-NEXT: v_xor_b32_e32 v15, v8, v9 ; GCN-NEXT: v_xor_b32_e32 v16, v10, v11 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v13, v6 +; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v13 ; GCN-NEXT: v_xor_b32_e32 v0, v0, v8 ; GCN-NEXT: 
v_xor_b32_e32 v1, v1, v10 ; GCN-NEXT: v_cvt_f32_u32_e32 v8, v4 ; GCN-NEXT: v_cvt_f32_u32_e32 v10, v5 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v12, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v12 ; GCN-NEXT: v_xor_b32_e32 v6, v6, v13 ; GCN-NEXT: v_xor_b32_e32 v17, v12, v13 ; GCN-NEXT: v_xor_b32_e32 v2, v2, v12 @@ -862,12 +862,12 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_mul_hi_u32 v9, v8, v9 ; GCN-NEXT: v_mul_hi_u32 v11, v10, v11 ; GCN-NEXT: v_ashrrev_i32_e32 v14, 31, v7 -; GCN-NEXT: v_add_i32_e32 v7, vcc, v14, v7 +; GCN-NEXT: v_add_i32_e32 v7, vcc, v7, v14 ; GCN-NEXT: v_mul_hi_u32 v13, v12, v13 ; GCN-NEXT: v_xor_b32_e32 v7, v7, v14 ; GCN-NEXT: v_cvt_f32_u32_e32 v18, v7 ; GCN-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; GCN-NEXT: v_add_i32_e32 v9, vcc, v11, v10 +; GCN-NEXT: v_add_i32_e32 v9, vcc, v10, v11 ; GCN-NEXT: v_mul_hi_u32 v8, v0, v8 ; GCN-NEXT: v_mul_hi_u32 v9, v1, v9 ; GCN-NEXT: v_add_i32_e32 v10, vcc, v12, v13 @@ -885,13 +885,13 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4 ; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v5 ; GCN-NEXT: v_sub_i32_e32 v2, vcc, v2, v21 -; GCN-NEXT: v_subrev_i32_e32 v11, vcc, v4, v0 +; GCN-NEXT: v_sub_i32_e32 v11, vcc, v0, v4 ; GCN-NEXT: v_cndmask_b32_e64 v8, v8, v12, s[0:1] -; GCN-NEXT: v_subrev_i32_e32 v12, vcc, v5, v1 +; GCN-NEXT: v_sub_i32_e32 v12, vcc, v1, v5 ; GCN-NEXT: v_cndmask_b32_e64 v9, v9, v20, s[2:3] ; GCN-NEXT: v_sub_i32_e32 v19, vcc, 0, v7 ; GCN-NEXT: v_add_i32_e32 v22, vcc, 1, v10 -; GCN-NEXT: v_subrev_i32_e32 v13, vcc, v6, v2 +; GCN-NEXT: v_sub_i32_e32 v13, vcc, v2, v6 ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v11, s[0:1] ; GCN-NEXT: v_add_i32_e32 v11, vcc, 1, v8 ; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v12, s[2:3] @@ -903,27 +903,27 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_ashrrev_i32_e32 v8, 31, v3 ; GCN-NEXT: v_mul_hi_u32 v4, v18, v4 ; GCN-NEXT: v_cndmask_b32_e32 v1, v9, v12, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v8, v3 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v8 ; GCN-NEXT: v_xor_b32_e32 v3, v3, v8 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v18 +; GCN-NEXT: v_add_i32_e32 v4, vcc, v18, v4 ; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 ; GCN-NEXT: v_mul_hi_u32 v4, v3, v4 ; GCN-NEXT: v_cndmask_b32_e64 v10, v10, v22, s[4:5] ; GCN-NEXT: v_xor_b32_e32 v0, v0, v15 ; GCN-NEXT: v_xor_b32_e32 v1, v1, v16 ; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v13, s[4:5] -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v15, v0 -; GCN-NEXT: v_subrev_i32_e32 v1, vcc, v16, v1 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v15 +; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v16 ; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v10 ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 ; GCN-NEXT: v_cndmask_b32_e32 v2, v10, v5, vcc ; GCN-NEXT: v_mul_lo_u32 v5, v4, v7 ; GCN-NEXT: v_xor_b32_e32 v2, v2, v17 -; GCN-NEXT: v_subrev_i32_e32 v2, vcc, v17, v2 +; GCN-NEXT: v_sub_i32_e32 v2, vcc, v2, v17 ; GCN-NEXT: v_sub_i32_e32 v3, vcc, v3, v5 ; GCN-NEXT: v_xor_b32_e32 v6, v8, v14 ; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v4 -; GCN-NEXT: v_subrev_i32_e32 v8, vcc, v7, v3 +; GCN-NEXT: v_sub_i32_e32 v8, vcc, v3, v7 ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v3, v7 ; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc ; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc @@ -931,7 +931,7 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v3, v7 ; GCN-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; GCN-NEXT: v_xor_b32_e32 v3, v3, v6 
-; GCN-NEXT: v_subrev_i32_e32 v3, vcc, v6, v3 +; GCN-NEXT: v_sub_i32_e32 v3, vcc, v3, v6 ; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GCN-NEXT: s_endpgm ; @@ -955,22 +955,22 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_ashrrev_i32_e32 v9, 31, v4 ; TONGA-NEXT: v_ashrrev_i32_e32 v11, 31, v5 ; TONGA-NEXT: v_ashrrev_i32_e32 v10, 31, v1 -; TONGA-NEXT: v_add_u32_e32 v4, vcc, v9, v4 -; TONGA-NEXT: v_add_u32_e32 v5, vcc, v11, v5 +; TONGA-NEXT: v_add_u32_e32 v4, vcc, v4, v9 +; TONGA-NEXT: v_add_u32_e32 v5, vcc, v5, v11 ; TONGA-NEXT: v_ashrrev_i32_e32 v13, 31, v6 -; TONGA-NEXT: v_add_u32_e32 v0, vcc, v8, v0 -; TONGA-NEXT: v_add_u32_e32 v1, vcc, v10, v1 +; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v8 +; TONGA-NEXT: v_add_u32_e32 v1, vcc, v1, v10 ; TONGA-NEXT: v_xor_b32_e32 v4, v4, v9 ; TONGA-NEXT: v_xor_b32_e32 v5, v5, v11 ; TONGA-NEXT: v_ashrrev_i32_e32 v12, 31, v2 ; TONGA-NEXT: v_xor_b32_e32 v15, v8, v9 ; TONGA-NEXT: v_xor_b32_e32 v16, v10, v11 -; TONGA-NEXT: v_add_u32_e32 v6, vcc, v13, v6 +; TONGA-NEXT: v_add_u32_e32 v6, vcc, v6, v13 ; TONGA-NEXT: v_xor_b32_e32 v0, v0, v8 ; TONGA-NEXT: v_xor_b32_e32 v1, v1, v10 ; TONGA-NEXT: v_cvt_f32_u32_e32 v8, v4 ; TONGA-NEXT: v_cvt_f32_u32_e32 v10, v5 -; TONGA-NEXT: v_add_u32_e32 v2, vcc, v12, v2 +; TONGA-NEXT: v_add_u32_e32 v2, vcc, v2, v12 ; TONGA-NEXT: v_xor_b32_e32 v6, v6, v13 ; TONGA-NEXT: v_xor_b32_e32 v17, v12, v13 ; TONGA-NEXT: v_xor_b32_e32 v2, v2, v12 @@ -993,12 +993,12 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_mul_hi_u32 v9, v8, v9 ; TONGA-NEXT: v_mul_hi_u32 v11, v10, v11 ; TONGA-NEXT: v_ashrrev_i32_e32 v14, 31, v7 -; TONGA-NEXT: v_add_u32_e32 v7, vcc, v14, v7 +; TONGA-NEXT: v_add_u32_e32 v7, vcc, v7, v14 ; TONGA-NEXT: v_mul_hi_u32 v13, v12, v13 ; TONGA-NEXT: v_xor_b32_e32 v7, v7, v14 ; TONGA-NEXT: v_cvt_f32_u32_e32 v18, v7 ; TONGA-NEXT: v_add_u32_e32 v8, vcc, v8, v9 -; TONGA-NEXT: v_add_u32_e32 v9, vcc, v11, v10 +; TONGA-NEXT: v_add_u32_e32 v9, vcc, v10, v11 ; TONGA-NEXT: v_mul_hi_u32 v8, v0, v8 ; TONGA-NEXT: v_mul_hi_u32 v9, v1, v9 ; TONGA-NEXT: v_add_u32_e32 v10, vcc, v12, v13 @@ -1016,13 +1016,13 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4 ; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v5 ; TONGA-NEXT: v_sub_u32_e32 v2, vcc, v2, v21 -; TONGA-NEXT: v_subrev_u32_e32 v11, vcc, v4, v0 +; TONGA-NEXT: v_sub_u32_e32 v11, vcc, v0, v4 ; TONGA-NEXT: v_cndmask_b32_e64 v8, v8, v12, s[0:1] -; TONGA-NEXT: v_subrev_u32_e32 v12, vcc, v5, v1 +; TONGA-NEXT: v_sub_u32_e32 v12, vcc, v1, v5 ; TONGA-NEXT: v_cndmask_b32_e64 v9, v9, v20, s[2:3] ; TONGA-NEXT: v_sub_u32_e32 v19, vcc, 0, v7 ; TONGA-NEXT: v_add_u32_e32 v22, vcc, 1, v10 -; TONGA-NEXT: v_subrev_u32_e32 v13, vcc, v6, v2 +; TONGA-NEXT: v_sub_u32_e32 v13, vcc, v2, v6 ; TONGA-NEXT: v_cndmask_b32_e64 v0, v0, v11, s[0:1] ; TONGA-NEXT: v_add_u32_e32 v11, vcc, 1, v8 ; TONGA-NEXT: v_cndmask_b32_e64 v1, v1, v12, s[2:3] @@ -1034,27 +1034,27 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_ashrrev_i32_e32 v8, 31, v3 ; TONGA-NEXT: v_mul_hi_u32 v4, v18, v4 ; TONGA-NEXT: v_cndmask_b32_e32 v1, v9, v12, vcc -; TONGA-NEXT: v_add_u32_e32 v3, vcc, v8, v3 +; TONGA-NEXT: v_add_u32_e32 v3, vcc, v3, v8 ; TONGA-NEXT: v_xor_b32_e32 v3, v3, v8 -; TONGA-NEXT: v_add_u32_e32 v4, vcc, v4, v18 +; TONGA-NEXT: v_add_u32_e32 v4, vcc, v18, v4 ; TONGA-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 ; 
TONGA-NEXT: v_mul_hi_u32 v4, v3, v4 ; TONGA-NEXT: v_cndmask_b32_e64 v10, v10, v22, s[4:5] ; TONGA-NEXT: v_xor_b32_e32 v0, v0, v15 ; TONGA-NEXT: v_xor_b32_e32 v1, v1, v16 ; TONGA-NEXT: v_cndmask_b32_e64 v2, v2, v13, s[4:5] -; TONGA-NEXT: v_subrev_u32_e32 v0, vcc, v15, v0 -; TONGA-NEXT: v_subrev_u32_e32 v1, vcc, v16, v1 +; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v15 +; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v1, v16 ; TONGA-NEXT: v_add_u32_e32 v5, vcc, 1, v10 ; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 ; TONGA-NEXT: v_cndmask_b32_e32 v2, v10, v5, vcc ; TONGA-NEXT: v_mul_lo_u32 v5, v4, v7 ; TONGA-NEXT: v_xor_b32_e32 v2, v2, v17 -; TONGA-NEXT: v_subrev_u32_e32 v2, vcc, v17, v2 +; TONGA-NEXT: v_sub_u32_e32 v2, vcc, v2, v17 ; TONGA-NEXT: v_sub_u32_e32 v3, vcc, v3, v5 ; TONGA-NEXT: v_xor_b32_e32 v6, v8, v14 ; TONGA-NEXT: v_add_u32_e32 v5, vcc, 1, v4 -; TONGA-NEXT: v_subrev_u32_e32 v8, vcc, v7, v3 +; TONGA-NEXT: v_sub_u32_e32 v8, vcc, v3, v7 ; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v3, v7 ; TONGA-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc ; TONGA-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc @@ -1062,7 +1062,7 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v3, v7 ; TONGA-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; TONGA-NEXT: v_xor_b32_e32 v3, v3, v6 -; TONGA-NEXT: v_subrev_u32_e32 v3, vcc, v6, v3 +; TONGA-NEXT: v_sub_u32_e32 v3, vcc, v3, v6 ; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; TONGA-NEXT: s_endpgm ; @@ -2030,7 +2030,7 @@ define amdgpu_kernel void @v_sdiv_i25(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_mul_lo_u32 v1, v3, v2 ; GCN-NEXT: v_add_i32_e32 v4, vcc, 1, v3 ; GCN-NEXT: v_sub_i32_e32 v1, vcc, v5, v1 -; GCN-NEXT: v_subrev_i32_e32 v5, vcc, v2, v1 +; GCN-NEXT: v_sub_i32_e32 v5, vcc, v1, v2 ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 ; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc @@ -2038,7 +2038,7 @@ define amdgpu_kernel void @v_sdiv_i25(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 ; GCN-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GCN-NEXT: v_xor_b32_e32 v1, v1, v0 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v0, v1 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, v1, v0 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 25 ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_endpgm @@ -2078,7 +2078,7 @@ define amdgpu_kernel void @v_sdiv_i25(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_mul_lo_u32 v1, v3, v2 ; TONGA-NEXT: v_add_u32_e32 v4, vcc, 1, v3 ; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v5, v1 -; TONGA-NEXT: v_subrev_u32_e32 v5, vcc, v2, v1 +; TONGA-NEXT: v_sub_u32_e32 v5, vcc, v1, v2 ; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 ; TONGA-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; TONGA-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc @@ -2086,7 +2086,7 @@ define amdgpu_kernel void @v_sdiv_i25(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 ; TONGA-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; TONGA-NEXT: v_xor_b32_e32 v1, v1, v0 -; TONGA-NEXT: v_subrev_u32_e32 v0, vcc, v0, v1 +; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v1, v0 ; TONGA-NEXT: v_bfe_i32 v0, v0, 0, 25 ; TONGA-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; TONGA-NEXT: s_endpgm @@ -2254,7 +2254,7 @@ define amdgpu_kernel void @scalarize_mulhs_4xi32(ptr addrspace(1) nocapture read ; GCN-NEXT: v_ashrrev_i32_e32 v3, 12, v3 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v4 ; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v5 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v6, v2 +; GCN-NEXT: v_add_i32_e32 
v2, vcc, v2, v6 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v7 ; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GCN-NEXT: s_endpgm @@ -2286,7 +2286,7 @@ define amdgpu_kernel void @scalarize_mulhs_4xi32(ptr addrspace(1) nocapture read ; TONGA-NEXT: v_ashrrev_i32_e32 v3, 12, v3 ; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v4 ; TONGA-NEXT: v_add_u32_e32 v1, vcc, v1, v5 -; TONGA-NEXT: v_add_u32_e32 v2, vcc, v6, v2 +; TONGA-NEXT: v_add_u32_e32 v2, vcc, v2, v6 ; TONGA-NEXT: v_add_u32_e32 v3, vcc, v3, v7 ; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; TONGA-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll index db22f2e..0f58c6a 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -37,7 +37,7 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: v_mul_lo_u32 v4, s4, v0 ; GCN-NEXT: s_addc_u32 s3, s3, s12 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GCN-NEXT: v_mul_hi_u32 v3, v0, v4 ; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 ; GCN-NEXT: v_mul_hi_u32 v7, v0, v2 @@ -97,9 +97,9 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: v_mul_hi_u32 v3, s10, v0 ; GCN-NEXT: v_mul_lo_u32 v4, s11, v0 ; GCN-NEXT: v_mov_b32_e32 v5, s11 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GCN-NEXT: v_mul_lo_u32 v3, s10, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 ; GCN-NEXT: v_sub_i32_e32 v4, vcc, s3, v2 ; GCN-NEXT: v_sub_i32_e32 v3, vcc, s2, v3 ; GCN-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc @@ -961,7 +961,7 @@ define amdgpu_kernel void @s_test_sdiv24_48(ptr addrspace(1) %out, i48 %x, i48 % ; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v1| ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -1106,7 +1106,7 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: v_mul_lo_u32 v5, s5, v0 ; GCN-NEXT: v_mul_lo_u32 v4, s4, v0 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GCN-NEXT: v_mul_hi_u32 v3, v0, v4 ; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 ; GCN-NEXT: v_mul_hi_u32 v7, v0, v2 @@ -1127,9 +1127,9 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: v_mul_hi_u32 v3, s4, v0 ; GCN-NEXT: v_mul_lo_u32 v4, s5, v0 ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GCN-NEXT: v_mul_lo_u32 v3, s4, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GCN-NEXT: v_mul_lo_u32 v6, v0, v2 ; GCN-NEXT: v_mul_hi_u32 v7, v0, v3 ; GCN-NEXT: v_mul_hi_u32 v8, v0, v2 @@ -1155,7 +1155,7 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v1, vcc ; GCN-NEXT: v_mul_lo_u32 v1, s3, v0 ; GCN-NEXT: v_mul_hi_u32 v2, s2, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; GCN-NEXT: v_mul_lo_u32 v2, s2, v0 ; GCN-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 ; GCN-NEXT: v_sub_i32_e32 v2, vcc, 24, v2 diff --git 
a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll index d5a1d8f..fb857e4 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll @@ -74,7 +74,7 @@ define amdgpu_kernel void @sub_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; NOSDWA-NEXT: v_mov_b32_e32 v1, s1 ; NOSDWA-NEXT: s_waitcnt vmcnt(0) ; NOSDWA-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; NOSDWA-NEXT: v_subrev_u32_e32 v2, vcc, v2, v3 +; NOSDWA-NEXT: v_sub_u32_e32 v2, vcc, v3, v2 ; NOSDWA-NEXT: flat_store_dword v[0:1], v2 ; NOSDWA-NEXT: s_endpgm ; @@ -88,7 +88,7 @@ define amdgpu_kernel void @sub_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX89-NEXT: v_mov_b32_e32 v0, s0 ; GFX89-NEXT: v_mov_b32_e32 v1, s1 ; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: v_subrev_u32_sdwa v2, vcc, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX89-NEXT: v_sub_u32_sdwa v2, vcc, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX89-NEXT: flat_store_dword v[0:1], v2 ; GFX89-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll index fc28fd7..855a44b 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll @@ -1283,12 +1283,10 @@ define void @spill_sgpr_no_free_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %in ; GCN-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NEXT: ; implicit-def: $sgpr4 ; GCN-NEXT: ; implicit-def: $sgpr4 -; GCN-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $exec ; GCN-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GCN-NEXT: v_mov_b32_e32 v1, v3 ; GCN-NEXT: ; implicit-def: $sgpr4 ; GCN-NEXT: ; implicit-def: $sgpr4 -; GCN-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $exec ; GCN-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GCN-NEXT: v_mov_b32_e32 v3, v5 ; GCN-NEXT: ; implicit-def: $sgpr4_sgpr5 diff --git a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll index 20c9544..4f94d21 100644 --- a/llvm/test/CodeGen/AMDGPU/shl.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.ll @@ -83,9 +83,9 @@ define amdgpu_kernel void @shl_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, v7, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, v5, v1 +; SI-NEXT: v_lshl_b32_e32 v3, v3, v7 +; SI-NEXT: v_lshl_b32_e32 v2, v2, v6 +; SI-NEXT: v_lshl_b32_e32 v1, v1, v5 ; SI-NEXT: v_lshl_b32_e32 v0, v0, v4 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm @@ -153,7 +153,7 @@ define amdgpu_kernel void @shl_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshl_b32_e32 v0, v0, v1 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll index c70b019..6b625ef 100644 --- a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll +++ b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll @@ -375,8 +375,8 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[S_ADD_I32_24:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM8]], -594, 
implicit-def dead $scc ; CHECK-NEXT: [[V_OR_B32_e64_67:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_24]], [[V_OR_B32_e64_66]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 0, [[V_OR_B32_e64_67]], implicit $exec - ; CHECK-NEXT: undef %693.sub3:vreg_128 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[V_CMP_EQ_U32_e64_]], implicit $exec - ; CHECK-NEXT: IMAGE_STORE_V4_V2_gfx10 %693, undef %578:vreg_64, [[S_LOAD_DWORDX8_IMM]], 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128), addrspace 7) + ; CHECK-NEXT: undef %624.sub3:vreg_128 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[V_CMP_EQ_U32_e64_]], implicit $exec + ; CHECK-NEXT: IMAGE_STORE_V4_V2_gfx10 %624, undef %578:vreg_64, [[S_LOAD_DWORDX8_IMM]], 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128), addrspace 7) ; CHECK-NEXT: S_ENDPGM 0 .expVert: %0 = extractelement <31 x i32> %userData, i64 2 diff --git a/llvm/test/CodeGen/AMDGPU/sra.ll b/llvm/test/CodeGen/AMDGPU/sra.ll index 7097f58..8ebdb29 100644 --- a/llvm/test/CodeGen/AMDGPU/sra.ll +++ b/llvm/test/CodeGen/AMDGPU/sra.ll @@ -85,9 +85,9 @@ define amdgpu_kernel void @ashr_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_ashrrev_i32_e32 v3, v7, v3 -; SI-NEXT: v_ashrrev_i32_e32 v2, v6, v2 -; SI-NEXT: v_ashrrev_i32_e32 v1, v5, v1 +; SI-NEXT: v_ashr_i32_e32 v3, v3, v7 +; SI-NEXT: v_ashr_i32_e32 v2, v2, v6 +; SI-NEXT: v_ashr_i32_e32 v1, v1, v5 ; SI-NEXT: v_ashr_i32_e32 v0, v0, v4 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll index 27b551a..46befaa 100644 --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -29,7 +29,7 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: v_mul_lo_u32 v5, s1, v0 ; GCN-NEXT: v_mul_lo_u32 v4, s0, v0 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GCN-NEXT: v_mul_hi_u32 v3, v0, v4 ; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 ; GCN-NEXT: v_mul_hi_u32 v7, v0, v2 @@ -86,8 +86,8 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: v_mul_hi_u32 v2, s12, v0 ; GCN-NEXT: v_mul_lo_u32 v3, s13, v0 ; GCN-NEXT: v_mul_lo_u32 v0, s12, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 -; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; GCN-NEXT: v_add_i32_e32 v1, vcc, v3, v1 ; GCN-NEXT: v_sub_i32_e32 v2, vcc, s11, v1 ; GCN-NEXT: v_mov_b32_e32 v3, s13 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, s10, v0 @@ -910,7 +910,7 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: v_mul_lo_u32 v5, s1, v0 ; GCN-NEXT: v_mul_lo_u32 v4, s0, v0 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GCN-NEXT: v_mul_hi_u32 v3, v0, v4 ; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 ; GCN-NEXT: v_mul_hi_u32 v6, v0, v2 @@ -970,8 +970,8 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: v_mul_hi_u32 v2, s12, v0 ; GCN-NEXT: v_mul_lo_u32 v3, s13, v0 ; GCN-NEXT: v_mul_lo_u32 v0, s12, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 -; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; GCN-NEXT: v_add_i32_e32 v1, vcc, v3, v1 ; 
GCN-NEXT: v_sub_i32_e32 v2, vcc, s15, v1 ; GCN-NEXT: v_mov_b32_e32 v3, s13 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, s14, v0 @@ -1145,7 +1145,7 @@ define amdgpu_kernel void @s_test_srem24_48(ptr addrspace(1) %out, i48 %x, i48 % ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v1| ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc ; GCN-NEXT: s_mov_b32 s0, s4 -; GCN-NEXT: v_add_i32_e32 v1, vcc, v4, v1 +; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v4 ; GCN-NEXT: v_mul_lo_u32 v0, v1, v0 ; GCN-NEXT: s_mov_b32 s1, s5 ; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v0, v2 @@ -1304,7 +1304,7 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: v_mul_lo_u32 v5, s3, v0 ; GCN-NEXT: v_mul_lo_u32 v4, s2, v0 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GCN-NEXT: v_mul_hi_u32 v3, v0, v4 ; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 ; GCN-NEXT: v_mul_hi_u32 v7, v0, v2 @@ -1352,7 +1352,7 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: v_mul_lo_u32 v1, s9, v0 ; GCN-NEXT: v_mul_hi_u32 v2, s8, v0 ; GCN-NEXT: v_mul_lo_u32 v0, s8, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; GCN-NEXT: v_sub_i32_e32 v2, vcc, 0, v1 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, 24, v0 ; GCN-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc diff --git a/llvm/test/CodeGen/AMDGPU/srl.ll b/llvm/test/CodeGen/AMDGPU/srl.ll index 5d21043..7bc13f4 100644 --- a/llvm/test/CodeGen/AMDGPU/srl.ll +++ b/llvm/test/CodeGen/AMDGPU/srl.ll @@ -20,7 +20,7 @@ define amdgpu_kernel void @lshr_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshr_b32_e32 v0, v0, v1 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -138,9 +138,9 @@ define amdgpu_kernel void @lshr_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshr_b32_e32 v3, v3, v7 -; SI-NEXT: v_lshrrev_b32_e32 v2, v6, v2 -; SI-NEXT: v_lshrrev_b32_e32 v1, v5, v1 -; SI-NEXT: v_lshrrev_b32_e32 v0, v4, v0 +; SI-NEXT: v_lshr_b32_e32 v2, v2, v6 +; SI-NEXT: v_lshr_b32_e32 v1, v1, v5 +; SI-NEXT: v_lshr_b32_e32 v0, v0, v4 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/sub.ll b/llvm/test/CodeGen/AMDGPU/sub.ll index 67369b0..48d0936 100644 --- a/llvm/test/CodeGen/AMDGPU/sub.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.ll @@ -1,31 +1,132 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GFX89 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN1 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN2 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn 
-mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN3 %s declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone speculatable -; GCN-LABEL: {{^}}s_sub_i32: -; GCN: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}] -; GCN: s_sub_i32 s{{[0-9]+}}, s[[#LOAD + 2]], s[[#LOAD + 3]] define amdgpu_kernel void @s_sub_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { +; GCN1-LABEL: s_sub_i32: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s4, s0 +; GCN1-NEXT: s_sub_i32 s0, s2, s3 +; GCN1-NEXT: s_mov_b32 s5, s1 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN1-NEXT: s_endpgm +; +; GCN2-LABEL: s_sub_i32: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_sub_i32 s2, s2, s3 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: flat_store_dword v[0:1], v2 +; GCN2-NEXT: s_endpgm +; +; GCN3-LABEL: s_sub_i32: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN3-NEXT: v_mov_b32_e32 v0, 0 +; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: s_sub_i32 s2, s2, s3 +; GCN3-NEXT: v_mov_b32_e32 v1, s2 +; GCN3-NEXT: global_store_dword v0, v1, s[0:1] +; GCN3-NEXT: s_endpgm %result = sub i32 %a, %b store i32 %result, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}s_sub_imm_i32: -; GCN: s_load_dword [[A:s[0-9]+]] -; GCN: s_sub_i32 s{{[0-9]+}}, 0x4d2, [[A]] define amdgpu_kernel void @s_sub_imm_i32(ptr addrspace(1) %out, i32 %a) { +; GCN1-LABEL: s_sub_imm_i32: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN1-NEXT: s_mov_b32 s3, 0xf000 +; GCN1-NEXT: s_mov_b32 s2, -1 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_sub_i32 s4, 0x4d2, s4 +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN1-NEXT: s_endpgm +; +; GCN2-LABEL: s_sub_imm_i32: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_load_dword s2, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_sub_i32 s2, 0x4d2, s2 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: flat_store_dword v[0:1], v2 +; GCN2-NEXT: s_endpgm +; +; GCN3-LABEL: s_sub_imm_i32: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: v_mov_b32_e32 v0, 0 +; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: s_sub_i32 s0, 0x4d2, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s0 +; GCN3-NEXT: global_store_dword v0, v1, s[2:3] +; GCN3-NEXT: s_endpgm %result = sub i32 1234, %a store i32 %result, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}test_sub_i32: -; SI: v_subrev_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} -; GFX9: v_sub_u32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} define amdgpu_kernel void @test_sub_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; GCN1-LABEL: test_sub_i32: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s10, s6 +; GCN1-NEXT: s_mov_b32 s11, s7 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s8, s2 +; GCN1-NEXT: s_mov_b32 s9, s3 +; GCN1-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; GCN1-NEXT: s_mov_b32 s4, s0 +; 
GCN1-NEXT: s_mov_b32 s5, s1 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN1-NEXT: s_endpgm +; +; GCN2-LABEL: test_sub_i32: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_sub_u32_e32 v0, vcc, v0, v1 +; GCN2-NEXT: flat_store_dword v[2:3], v0 +; GCN2-NEXT: s_endpgm +; +; GCN3-LABEL: test_sub_i32: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN3-NEXT: v_mov_b32_e32 v2, 0 +; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_sub_u32_e32 v0, v0, v1 +; GCN3-NEXT: global_store_dword v2, v0, s[0:1] +; GCN3-NEXT: s_endpgm %b_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1 %a = load i32, ptr addrspace(1) %in %b = load i32, ptr addrspace(1) %b_ptr @@ -34,23 +135,101 @@ define amdgpu_kernel void @test_sub_i32(ptr addrspace(1) %out, ptr addrspace(1) ret void } -; GCN-LABEL: {{^}}test_sub_imm_i32: -; SI: v_sub_i32_e32 v{{[0-9]+}}, vcc, 0x7b, v{{[0-9]+}} -; GFX9: v_sub_u32_e32 v{{[0-9]+}}, 0x7b, v{{[0-9]+}} define amdgpu_kernel void @test_sub_imm_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; GCN1-LABEL: test_sub_imm_i32: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s10, s6 +; GCN1-NEXT: s_mov_b32 s11, s7 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s8, s2 +; GCN1-NEXT: s_mov_b32 s9, s3 +; GCN1-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GCN1-NEXT: s_mov_b32 s4, s0 +; GCN1-NEXT: s_mov_b32 s5, s1 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_sub_i32_e32 v0, vcc, 0x7b, v0 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN1-NEXT: s_endpgm +; +; GCN2-LABEL: test_sub_imm_i32: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: flat_load_dword v2, v[0:1] +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_sub_u32_e32 v2, vcc, 0x7b, v2 +; GCN2-NEXT: flat_store_dword v[0:1], v2 +; GCN2-NEXT: s_endpgm +; +; GCN3-LABEL: test_sub_imm_i32: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN3-NEXT: v_mov_b32_e32 v0, 0 +; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: global_load_dword v1, v0, s[2:3] +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_sub_u32_e32 v1, 0x7b, v1 +; GCN3-NEXT: global_store_dword v0, v1, s[0:1] +; GCN3-NEXT: s_endpgm %a = load i32, ptr addrspace(1) %in %result = sub i32 123, %a store i32 %result, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}test_sub_v2i32: -; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} -; SI: v_subrev_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} - -; GFX9: v_sub_u32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; GFX9: v_sub_u32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} define amdgpu_kernel void @test_sub_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; GCN1-LABEL: test_sub_v2i32: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: 
s_mov_b32 s10, s6 +; GCN1-NEXT: s_mov_b32 s11, s7 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s8, s2 +; GCN1-NEXT: s_mov_b32 s9, s3 +; GCN1-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; GCN1-NEXT: s_mov_b32 s4, s0 +; GCN1-NEXT: s_mov_b32 s5, s1 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_sub_i32_e32 v1, vcc, v1, v3 +; GCN1-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN1-NEXT: s_endpgm +; +; GCN2-LABEL: test_sub_v2i32: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCN2-NEXT: v_mov_b32_e32 v4, s0 +; GCN2-NEXT: v_mov_b32_e32 v5, s1 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_sub_u32_e32 v1, vcc, v1, v3 +; GCN2-NEXT: v_sub_u32_e32 v0, vcc, v0, v2 +; GCN2-NEXT: flat_store_dwordx2 v[4:5], v[0:1] +; GCN2-NEXT: s_endpgm +; +; GCN3-LABEL: test_sub_v2i32: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN3-NEXT: v_mov_b32_e32 v4, 0 +; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_sub_u32_e32 v1, v1, v3 +; GCN3-NEXT: v_sub_u32_e32 v0, v0, v2 +; GCN3-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; GCN3-NEXT: s_endpgm %b_ptr = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 1 %a = load <2 x i32>, ptr addrspace(1) %in %b = load <2 x i32>, ptr addrspace(1) %b_ptr @@ -59,17 +238,65 @@ define amdgpu_kernel void @test_sub_v2i32(ptr addrspace(1) %out, ptr addrspace(1 ret void } -; GCN-LABEL: {{^}}test_sub_v4i32: -; SI: v_subrev_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} -; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} -; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} -; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} - -; GFX9: v_sub_u32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; GFX9: v_sub_u32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; GFX9: v_sub_u32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; GFX9: v_sub_u32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} define amdgpu_kernel void @test_sub_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; GCN1-LABEL: test_sub_v4i32: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s10, s6 +; GCN1-NEXT: s_mov_b32 s11, s7 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s8, s2 +; GCN1-NEXT: s_mov_b32 s9, s3 +; GCN1-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; GCN1-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; GCN1-NEXT: s_mov_b32 s4, s0 +; GCN1-NEXT: s_mov_b32 s5, s1 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_sub_i32_e32 v3, vcc, v3, v7 +; GCN1-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 +; GCN1-NEXT: v_sub_i32_e32 v1, vcc, v1, v5 +; GCN1-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 +; GCN1-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GCN1-NEXT: s_endpgm +; +; GCN2-LABEL: test_sub_v4i32: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: s_add_u32 s2, s2, 16 +; GCN2-NEXT: s_addc_u32 s3, s3, 0 +; GCN2-NEXT: v_mov_b32_e32 v5, s3 +; GCN2-NEXT: v_mov_b32_e32 v4, s2 +; GCN2-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCN2-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; GCN2-NEXT: v_mov_b32_e32 v8, s0 +; GCN2-NEXT: v_mov_b32_e32 v9, s1 +; GCN2-NEXT: 
s_waitcnt vmcnt(0) +; GCN2-NEXT: v_sub_u32_e32 v3, vcc, v3, v7 +; GCN2-NEXT: v_sub_u32_e32 v2, vcc, v2, v6 +; GCN2-NEXT: v_sub_u32_e32 v1, vcc, v1, v5 +; GCN2-NEXT: v_sub_u32_e32 v0, vcc, v0, v4 +; GCN2-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GCN2-NEXT: s_endpgm +; +; GCN3-LABEL: test_sub_v4i32: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN3-NEXT: v_mov_b32_e32 v8, 0 +; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] offset:16 +; GCN3-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_sub_u32_e32 v3, v7, v3 +; GCN3-NEXT: v_sub_u32_e32 v2, v6, v2 +; GCN3-NEXT: v_sub_u32_e32 v1, v5, v1 +; GCN3-NEXT: v_sub_u32_e32 v0, v4, v0 +; GCN3-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GCN3-NEXT: s_endpgm %b_ptr = getelementptr <4 x i32>, ptr addrspace(1) %in, i32 1 %a = load <4 x i32>, ptr addrspace(1) %in %b = load <4 x i32>, ptr addrspace(1) %b_ptr @@ -78,10 +305,61 @@ define amdgpu_kernel void @test_sub_v4i32(ptr addrspace(1) %out, ptr addrspace(1 ret void } -; GCN-LABEL: {{^}}test_sub_i16: -; SI: v_sub_i32_e32 v{{[0-9]+}}, vcc, -; GFX89: v_sub_u16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} define amdgpu_kernel void @test_sub_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; GCN1-LABEL: test_sub_i16: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s10, 0 +; GCN1-NEXT: s_mov_b32 s11, s7 +; GCN1-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[8:9], s[2:3] +; GCN1-NEXT: v_mov_b32_e32 v1, 0 +; GCN1-NEXT: buffer_load_ushort v2, v[0:1], s[8:11], 0 addr64 glc +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64 offset:2 glc +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s4, s0 +; GCN1-NEXT: s_mov_b32 s5, s1 +; GCN1-NEXT: v_sub_i32_e32 v0, vcc, v2, v0 +; GCN1-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GCN1-NEXT: s_endpgm +; +; GCN2-LABEL: test_sub_i16: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: v_add_u32_e32 v2, vcc, 2, v0 +; GCN2-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_ushort v4, v[0:1] glc +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: flat_load_ushort v2, v[2:3] glc +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_sub_u16_e32 v2, v4, v2 +; GCN2-NEXT: flat_store_short v[0:1], v2 +; GCN2-NEXT: s_endpgm +; +; GCN3-LABEL: test_sub_i16: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN3-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: global_load_ushort v1, v0, s[2:3] glc +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v0, 0 +; GCN3-NEXT: v_sub_u16_e32 v1, v1, v2 +; GCN3-NEXT: global_store_short v0, v1, s[0:1] +; GCN3-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr i16, ptr addrspace(1) %in, i32 %tid %b_ptr = getelementptr i16, ptr addrspace(1) %gep, i32 1 @@ -92,12 +370,61 @@ define amdgpu_kernel void @test_sub_i16(ptr addrspace(1) %out, ptr addrspace(1) ret 
void } -; GCN-LABEL: {{^}}test_sub_v2i16: -; VI: v_sub_u16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; VI: v_sub_u16_sdwa v{{[0-9]+, v[0-9]+, v[0-9]+}} - -; GFX9: v_pk_sub_i16 define amdgpu_kernel void @test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; GCN1-LABEL: test_sub_v2i16: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s10, 0 +; GCN1-NEXT: s_mov_b32 s11, s7 +; GCN1-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[8:9], s[2:3] +; GCN1-NEXT: v_mov_b32_e32 v1, 0 +; GCN1-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[8:11], 0 addr64 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s4, s0 +; GCN1-NEXT: s_mov_b32 s5, s1 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GCN1-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GCN1-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; GCN1-NEXT: v_sub_i32_e32 v1, vcc, v2, v3 +; GCN1-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN1-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN1-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN1-NEXT: s_endpgm +; +; GCN2-LABEL: test_sub_v2i16: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_sub_u16_e32 v4, v0, v1 +; GCN2-NEXT: v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GCN2-NEXT: v_or_b32_e32 v0, v4, v0 +; GCN2-NEXT: flat_store_dword v[2:3], v0 +; GCN2-NEXT: s_endpgm +; +; GCN3-LABEL: test_sub_v2i16: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN3-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN3-NEXT: v_mov_b32_e32 v2, 0 +; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_pk_sub_i16 v0, v0, v1 +; GCN3-NEXT: global_store_dword v2, v0, s[0:1] +; GCN3-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <2 x i16>, ptr addrspace(1) %in, i32 %tid %b_ptr = getelementptr <2 x i16>, ptr addrspace(1) %gep, i16 1 @@ -108,15 +435,72 @@ define amdgpu_kernel void @test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace(1 ret void } -; GCN-LABEL: {{^}}test_sub_v4i16: -; VI: v_sub_u16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; VI: v_sub_u16_sdwa v{{[0-9]+, v[0-9]+, v[0-9]+}} -; VI: v_sub_u16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; VI: v_sub_u16_sdwa v{{[0-9]+, v[0-9]+, v[0-9]+}} - -; GFX9: v_pk_sub_i16 -; GFX9: v_pk_sub_i16 define amdgpu_kernel void @test_sub_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; GCN1-LABEL: test_sub_v4i16: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s10, 0 +; GCN1-NEXT: s_mov_b32 s11, s7 +; GCN1-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[8:9], s[2:3] +; GCN1-NEXT: v_mov_b32_e32 v1, 0 +; GCN1-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s4, s0 +; GCN1-NEXT: s_mov_b32 s5, s1 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GCN1-NEXT: 
v_lshrrev_b32_e32 v5, 16, v1 +; GCN1-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GCN1-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GCN1-NEXT: v_sub_i32_e32 v1, vcc, v1, v3 +; GCN1-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; GCN1-NEXT: v_sub_i32_e32 v2, vcc, v5, v7 +; GCN1-NEXT: v_sub_i32_e32 v3, vcc, v4, v6 +; GCN1-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN1-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN1-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN1-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN1-NEXT: v_or_b32_e32 v1, v1, v2 +; GCN1-NEXT: v_or_b32_e32 v0, v0, v3 +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN1-NEXT: s_endpgm +; +; GCN2-LABEL: test_sub_v4i16: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCN2-NEXT: v_mov_b32_e32 v4, s0 +; GCN2-NEXT: v_mov_b32_e32 v5, s1 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_sub_u16_e32 v6, v1, v3 +; GCN2-NEXT: v_sub_u16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GCN2-NEXT: v_sub_u16_e32 v3, v0, v2 +; GCN2-NEXT: v_sub_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GCN2-NEXT: v_or_b32_e32 v1, v6, v1 +; GCN2-NEXT: v_or_b32_e32 v0, v3, v0 +; GCN2-NEXT: flat_store_dwordx2 v[4:5], v[0:1] +; GCN2-NEXT: s_endpgm +; +; GCN3-LABEL: test_sub_v4i16: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN3-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GCN3-NEXT: v_mov_b32_e32 v4, 0 +; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_pk_sub_i16 v1, v1, v3 +; GCN3-NEXT: v_pk_sub_i16 v0, v0, v2 +; GCN3-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; GCN3-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <4 x i16>, ptr addrspace(1) %in, i32 %tid %b_ptr = getelementptr <4 x i16>, ptr addrspace(1) %gep, i16 1 @@ -127,25 +511,112 @@ define amdgpu_kernel void @test_sub_v4i16(ptr addrspace(1) %out, ptr addrspace(1 ret void } -; GCN-LABEL: {{^}}s_sub_i64: -; GCN: s_sub_u32 -; GCN: s_subb_u32 define amdgpu_kernel void @s_sub_i64(ptr addrspace(1) noalias %out, i64 %a, i64 %b) nounwind { +; GCN1-LABEL: s_sub_i64: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN1-NEXT: s_mov_b32 s3, 0xf000 +; GCN1-NEXT: s_mov_b32 s2, -1 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_sub_u32 s4, s4, s6 +; GCN1-NEXT: s_subb_u32 s5, s5, s7 +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN1-NEXT: s_endpgm +; +; GCN2-LABEL: s_sub_i64: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_sub_u32 s2, s4, s6 +; GCN2-NEXT: s_subb_u32 s3, s5, s7 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GCN2-NEXT: s_endpgm +; +; GCN3-LABEL: s_sub_i64: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: v_mov_b32_e32 v2, 0 +; GCN3-NEXT: s_waitcnt 
lgkmcnt(0) +; GCN3-NEXT: s_sub_u32 s0, s4, s6 +; GCN3-NEXT: s_subb_u32 s1, s5, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GCN3-NEXT: s_endpgm %result = sub i64 %a, %b store i64 %result, ptr addrspace(1) %out, align 8 ret void } -; GCN-LABEL: {{^}}v_sub_i64: -; SI: v_sub_i32_e32 -; SI: v_subb_u32_e32 - -; VI: v_sub_u32_e32 -; VI: v_subb_u32_e32 - -; GFX9: v_sub_co_u32_e32 -; GFX9: v_subb_co_u32_e32 define amdgpu_kernel void @v_sub_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %inA, ptr addrspace(1) noalias %inB) nounwind { +; GCN1-LABEL: v_sub_i64: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GCN1-NEXT: s_mov_b32 s11, 0xf000 +; GCN1-NEXT: s_mov_b32 s14, 0 +; GCN1-NEXT: s_mov_b32 s15, s11 +; GCN1-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GCN1-NEXT: v_mov_b32_e32 v1, 0 +; GCN1-NEXT: s_mov_b64 s[2:3], s[14:15] +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[12:13], s[6:7] +; GCN1-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 +; GCN1-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[12:15], 0 addr64 +; GCN1-NEXT: s_mov_b32 s10, -1 +; GCN1-NEXT: s_mov_b32 s8, s4 +; GCN1-NEXT: s_mov_b32 s9, s5 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; GCN1-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GCN1-NEXT: s_endpgm +; +; GCN2-LABEL: v_sub_i64: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GCN2-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v1, s7 +; GCN2-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: v_mov_b32_e32 v3, s1 +; GCN2-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; GCN2-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[2:3] +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_sub_u32_e32 v0, vcc, v0, v2 +; GCN2-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN2-NEXT: s_endpgm +; +; GCN3-LABEL: v_sub_i64: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] +; GCN3-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] +; GCN3-NEXT: v_mov_b32_e32 v4, 0 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2 +; GCN3-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc +; GCN3-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] +; GCN3-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone %a_ptr = getelementptr i64, ptr addrspace(1) %inA, i32 %tid %b_ptr = getelementptr i64, ptr addrspace(1) %inB, i32 %tid @@ -156,22 +627,72 @@ define amdgpu_kernel void @v_sub_i64(ptr addrspace(1) noalias %out, ptr addrspac ret void } -; GCN-LABEL: {{^}}v_test_sub_v2i64: -; SI: v_sub_i32_e32 v{{[0-9]+}}, vcc, -; SI: v_subb_u32_e32 v{{[0-9]+}}, vcc, -; SI: v_sub_i32_e32 v{{[0-9]+}}, vcc, -; SI: v_subb_u32_e32 v{{[0-9]+}}, vcc, - -; VI: v_sub_u32_e32 v{{[0-9]+}}, vcc, -; VI: v_subb_u32_e32 v{{[0-9]+}}, vcc, -; VI: v_sub_u32_e32 v{{[0-9]+}}, vcc, -; VI: v_subb_u32_e32 
v{{[0-9]+}}, vcc, - -; GFX9: v_sub_co_u32_e32 v{{[0-9]+}}, vcc, -; GFX9: v_subb_co_u32_e32 v{{[0-9]+}}, vcc, -; GFX9: v_sub_co_u32_e32 v{{[0-9]+}}, vcc, -; GFX9: v_subb_co_u32_e32 v{{[0-9]+}}, vcc, define amdgpu_kernel void @v_test_sub_v2i64(ptr addrspace(1) %out, ptr addrspace(1) noalias %inA, ptr addrspace(1) noalias %inB) { +; GCN1-LABEL: v_test_sub_v2i64: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GCN1-NEXT: s_mov_b32 s11, 0xf000 +; GCN1-NEXT: s_mov_b32 s14, 0 +; GCN1-NEXT: s_mov_b32 s15, s11 +; GCN1-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GCN1-NEXT: v_mov_b32_e32 v5, 0 +; GCN1-NEXT: s_mov_b64 s[2:3], s[14:15] +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[12:13], s[6:7] +; GCN1-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[0:3], 0 addr64 +; GCN1-NEXT: buffer_load_dwordx4 v[4:7], v[4:5], s[12:15], 0 addr64 +; GCN1-NEXT: s_mov_b32 s10, -1 +; GCN1-NEXT: s_mov_b32 s8, s4 +; GCN1-NEXT: s_mov_b32 s9, s5 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_sub_i32_e32 v2, vcc, v6, v2 +; GCN1-NEXT: v_subb_u32_e32 v3, vcc, v7, v3, vcc +; GCN1-NEXT: v_sub_i32_e32 v0, vcc, v4, v0 +; GCN1-NEXT: v_subb_u32_e32 v1, vcc, v5, v1, vcc +; GCN1-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; GCN1-NEXT: s_endpgm +; +; GCN2-LABEL: v_test_sub_v2i64: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GCN2-NEXT: v_lshlrev_b32_e32 v2, 4, v0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v1, s7 +; GCN2-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: v_mov_b32_e32 v3, s1 +; GCN2-NEXT: v_add_u32_e32 v4, vcc, s0, v2 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc +; GCN2-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCN2-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_sub_u32_e32 v2, vcc, v2, v6 +; GCN2-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc +; GCN2-NEXT: v_sub_u32_e32 v0, vcc, v0, v4 +; GCN2-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc +; GCN2-NEXT: v_mov_b32_e32 v4, s4 +; GCN2-NEXT: v_mov_b32_e32 v5, s5 +; GCN2-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN2-NEXT: s_endpgm +; +; GCN3-LABEL: v_test_sub_v2i64: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: v_lshlrev_b32_e32 v8, 4, v0 +; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7] +; GCN3-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] +; GCN3-NEXT: v_mov_b32_e32 v8, 0 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v6 +; GCN3-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v7, vcc +; GCN3-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v4 +; GCN3-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v5, vcc +; GCN3-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5] +; GCN3-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone %a_ptr = getelementptr <2 x i64>, ptr addrspace(1) %inA, i32 %tid %b_ptr = getelementptr <2 x i64>, ptr addrspace(1) %inB, i32 %tid @@ -182,34 +703,104 @@ define amdgpu_kernel void @v_test_sub_v2i64(ptr addrspace(1) %out, ptr addrspace ret void } -; GCN-LABEL: {{^}}v_test_sub_v4i64: -; SI: v_sub_i32_e32 v{{[0-9]+}}, vcc, -; SI: v_subb_u32_e32 v{{[0-9]+}}, vcc, -; SI: v_sub_i32_e32 v{{[0-9]+}}, vcc, -; SI: v_subb_u32_e32 v{{[0-9]+}}, vcc, -; SI: v_sub_i32_e32 v{{[0-9]+}}, vcc, -; SI: v_subb_u32_e32 v{{[0-9]+}}, vcc, -; SI: v_sub_i32_e32 v{{[0-9]+}}, vcc, -; SI: 
v_subb_u32_e32 v{{[0-9]+}}, vcc, - -; VI: v_sub_u32_e32 v{{[0-9]+}}, vcc, -; VI: v_subb_u32_e32 v{{[0-9]+}}, vcc, -; VI: v_sub_u32_e32 v{{[0-9]+}}, vcc, -; VI: v_subb_u32_e32 v{{[0-9]+}}, vcc, -; VI: v_sub_u32_e32 v{{[0-9]+}}, vcc, -; VI: v_subb_u32_e32 v{{[0-9]+}}, vcc, -; VI: v_sub_u32_e32 v{{[0-9]+}}, vcc, -; VI: v_subb_u32_e32 v{{[0-9]+}}, vcc, - -; GFX9: v_sub_co_u32_e32 v{{[0-9]+}}, vcc, -; GFX9: v_subb_co_u32_e32 v{{[0-9]+}}, vcc, -; GFX9: v_sub_co_u32_e32 v{{[0-9]+}}, vcc, -; GFX9: v_subb_co_u32_e32 v{{[0-9]+}}, vcc, -; GFX9: v_sub_co_u32_e32 v{{[0-9]+}}, vcc, -; GFX9: v_subb_co_u32_e32 v{{[0-9]+}}, vcc, -; GFX9: v_sub_co_u32_e32 v{{[0-9]+}}, vcc, -; GFX9: v_subb_co_u32_e32 v{{[0-9]+}}, vcc, define amdgpu_kernel void @v_test_sub_v4i64(ptr addrspace(1) %out, ptr addrspace(1) noalias %inA, ptr addrspace(1) noalias %inB) { +; GCN1-LABEL: v_test_sub_v4i64: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GCN1-NEXT: s_mov_b32 s11, 0xf000 +; GCN1-NEXT: s_mov_b32 s14, 0 +; GCN1-NEXT: s_mov_b32 s15, s11 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[12:13], s[6:7] +; GCN1-NEXT: v_lshlrev_b32_e32 v12, 5, v0 +; GCN1-NEXT: v_mov_b32_e32 v13, 0 +; GCN1-NEXT: s_mov_b64 s[2:3], s[14:15] +; GCN1-NEXT: buffer_load_dwordx4 v[0:3], v[12:13], s[12:15], 0 addr64 +; GCN1-NEXT: buffer_load_dwordx4 v[4:7], v[12:13], s[0:3], 0 addr64 +; GCN1-NEXT: buffer_load_dwordx4 v[8:11], v[12:13], s[0:3], 0 addr64 offset:16 +; GCN1-NEXT: buffer_load_dwordx4 v[12:15], v[12:13], s[12:15], 0 addr64 offset:16 +; GCN1-NEXT: s_mov_b32 s10, -1 +; GCN1-NEXT: s_mov_b32 s8, s4 +; GCN1-NEXT: s_mov_b32 s9, s5 +; GCN1-NEXT: s_waitcnt vmcnt(2) +; GCN1-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 +; GCN1-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc +; GCN1-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 +; GCN1-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_sub_i32_e32 v6, vcc, v14, v10 +; GCN1-NEXT: v_subb_u32_e32 v7, vcc, v15, v11, vcc +; GCN1-NEXT: v_sub_i32_e32 v4, vcc, v12, v8 +; GCN1-NEXT: v_subb_u32_e32 v5, vcc, v13, v9, vcc +; GCN1-NEXT: buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; GCN1-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; GCN1-NEXT: s_endpgm +; +; GCN2-LABEL: v_test_sub_v4i64: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GCN2-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v1, s7 +; GCN2-NEXT: v_add_u32_e32 v8, vcc, s6, v0 +; GCN2-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_add_u32_e32 v12, vcc, s0, v0 +; GCN2-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dwordx4 v[0:3], v[8:9] +; GCN2-NEXT: flat_load_dwordx4 v[4:7], v[12:13] +; GCN2-NEXT: v_add_u32_e32 v8, vcc, 16, v8 +; GCN2-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; GCN2-NEXT: v_add_u32_e32 v12, vcc, 16, v12 +; GCN2-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; GCN2-NEXT: flat_load_dwordx4 v[8:11], v[8:9] +; GCN2-NEXT: flat_load_dwordx4 v[12:15], v[12:13] +; GCN2-NEXT: v_mov_b32_e32 v17, s5 +; GCN2-NEXT: v_mov_b32_e32 v16, s4 +; GCN2-NEXT: s_add_u32 s0, s4, 16 +; GCN2-NEXT: s_addc_u32 s1, s5, 0 +; GCN2-NEXT: s_waitcnt vmcnt(2) +; GCN2-NEXT: v_sub_u32_e32 v2, vcc, v2, v6 +; GCN2-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc +; GCN2-NEXT: v_sub_u32_e32 v0, vcc, v0, v4 +; GCN2-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc +; GCN2-NEXT: flat_store_dwordx4 
v[16:17], v[0:3] +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_sub_u32_e32 v6, vcc, v10, v14 +; GCN2-NEXT: v_subb_u32_e32 v7, vcc, v11, v15, vcc +; GCN2-NEXT: v_sub_u32_e32 v4, vcc, v8, v12 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_subb_u32_e32 v5, vcc, v9, v13, vcc +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: flat_store_dwordx4 v[0:1], v[4:7] +; GCN2-NEXT: s_endpgm +; +; GCN3-LABEL: v_test_sub_v4i64: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: v_lshlrev_b32_e32 v16, 5, v0 +; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7] +; GCN3-NEXT: global_load_dwordx4 v[4:7], v16, s[2:3] +; GCN3-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:16 +; GCN3-NEXT: global_load_dwordx4 v[12:15], v16, s[2:3] offset:16 +; GCN3-NEXT: v_mov_b32_e32 v16, 0 +; GCN3-NEXT: s_waitcnt vmcnt(2) +; GCN3-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v6 +; GCN3-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v7, vcc +; GCN3-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v4 +; GCN3-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v5, vcc +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_sub_co_u32_e32 v6, vcc, v10, v14 +; GCN3-NEXT: v_subb_co_u32_e32 v7, vcc, v11, v15, vcc +; GCN3-NEXT: v_sub_co_u32_e32 v4, vcc, v8, v12 +; GCN3-NEXT: v_subb_co_u32_e32 v5, vcc, v9, v13, vcc +; GCN3-NEXT: global_store_dwordx4 v16, v[4:7], s[4:5] offset:16 +; GCN3-NEXT: global_store_dwordx4 v16, v[0:3], s[4:5] +; GCN3-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone %a_ptr = getelementptr <4 x i64>, ptr addrspace(1) %inA, i32 %tid %b_ptr = getelementptr <4 x i64>, ptr addrspace(1) %inB, i32 %tid @@ -220,18 +811,44 @@ define amdgpu_kernel void @v_test_sub_v4i64(ptr addrspace(1) %out, ptr addrspace ret void } -; Make sure the VOP3 form of sub is initially selected. 
Otherwise pair -; of opies from/to VCC would be necessary - -; GCN-LABEL: {{^}}sub_select_vop3: -; SI: v_subrev_i32_e64 v0, s[0:1], s0, v0 -; VI: v_subrev_u32_e64 v0, s[0:1], s0, v0 -; GFX9: v_subrev_u32_e32 v0, s0, v0 - -; GCN: ; def vcc -; GCN: ds_write_b32 -; GCN: ; use vcc define amdgpu_ps void @sub_select_vop3(i32 inreg %s, i32 %v) { +; GCN1-LABEL: sub_select_vop3: +; GCN1: ; %bb.0: +; GCN1-NEXT: v_subrev_i32_e64 v0, s[0:1], s0, v0 +; GCN1-NEXT: s_mov_b32 m0, -1 +; GCN1-NEXT: ;;#ASMSTART +; GCN1-NEXT: ; def vcc +; GCN1-NEXT: ;;#ASMEND +; GCN1-NEXT: ds_write_b32 v0, v0 +; GCN1-NEXT: ;;#ASMSTART +; GCN1-NEXT: ; use vcc +; GCN1-NEXT: ;;#ASMEND +; GCN1-NEXT: s_endpgm +; +; GCN2-LABEL: sub_select_vop3: +; GCN2: ; %bb.0: +; GCN2-NEXT: v_subrev_u32_e64 v0, s[0:1], s0, v0 +; GCN2-NEXT: s_mov_b32 m0, -1 +; GCN2-NEXT: ;;#ASMSTART +; GCN2-NEXT: ; def vcc +; GCN2-NEXT: ;;#ASMEND +; GCN2-NEXT: ds_write_b32 v0, v0 +; GCN2-NEXT: ;;#ASMSTART +; GCN2-NEXT: ; use vcc +; GCN2-NEXT: ;;#ASMEND +; GCN2-NEXT: s_endpgm +; +; GCN3-LABEL: sub_select_vop3: +; GCN3: ; %bb.0: +; GCN3-NEXT: v_subrev_u32_e32 v0, s0, v0 +; GCN3-NEXT: ;;#ASMSTART +; GCN3-NEXT: ; def vcc +; GCN3-NEXT: ;;#ASMEND +; GCN3-NEXT: ds_write_b32 v0, v0 +; GCN3-NEXT: ;;#ASMSTART +; GCN3-NEXT: ; use vcc +; GCN3-NEXT: ;;#ASMEND +; GCN3-NEXT: s_endpgm %vcc = call i64 asm sideeffect "; def vcc", "={vcc}"() %sub = sub i32 %v, %s store i32 %sub, ptr addrspace(3) undef diff --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll index a5b1fa8..21038e7 100644 --- a/llvm/test/CodeGen/AMDGPU/udiv.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv.ll @@ -32,7 +32,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; SI-NEXT: v_mul_lo_u32 v3, v2, v1 ; SI-NEXT: v_add_i32_e32 v4, vcc, 1, v2 ; SI-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 -; SI-NEXT: v_subrev_i32_e32 v3, vcc, v1, v0 +; SI-NEXT: v_sub_i32_e32 v3, vcc, v0, v1 ; SI-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 ; SI-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc @@ -68,7 +68,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; VI-NEXT: v_mul_lo_u32 v3, v2, v1 ; VI-NEXT: v_add_u32_e32 v4, vcc, 1, v2 ; VI-NEXT: v_sub_u32_e32 v0, vcc, v0, v3 -; VI-NEXT: v_subrev_u32_e32 v3, vcc, v1, v0 +; VI-NEXT: v_sub_u32_e32 v3, vcc, v0, v1 ; VI-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc @@ -100,7 +100,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GCN-NEXT: v_mul_lo_u32 v5, v4, v1 ; GCN-NEXT: v_add_u32_e32 v6, vcc, 1, v4 ; GCN-NEXT: v_sub_u32_e32 v0, vcc, v0, v5 -; GCN-NEXT: v_subrev_u32_e32 v5, vcc, v1, v0 +; GCN-NEXT: v_sub_u32_e32 v5, vcc, v0, v1 ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 ; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc @@ -198,7 +198,7 @@ define amdgpu_kernel void @s_udiv_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; SI-NEXT: v_mul_lo_u32 v1, s4, v0 ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: v_mul_hi_u32 v1, v0, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; SI-NEXT: v_mul_hi_u32 v0, s2, v0 ; SI-NEXT: v_readfirstlane_b32 s0, v0 ; SI-NEXT: s_mul_i32 s0, s0, s3 @@ -231,7 +231,7 @@ define amdgpu_kernel void @s_udiv_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; VI-NEXT: v_mul_lo_u32 v1, s4, v0 ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: v_mul_hi_u32 v1, v0, v1 -; VI-NEXT: v_add_u32_e32 v0, vcc, v1, v0 +; 
VI-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; VI-NEXT: v_mul_hi_u32 v0, s2, v0 ; VI-NEXT: v_readfirstlane_b32 s0, v0 ; VI-NEXT: s_mul_i32 s0, s0, s3 @@ -260,7 +260,7 @@ define amdgpu_kernel void @s_udiv_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_mul_lo_u32 v1, s4, v0 ; GCN-NEXT: v_mul_hi_u32 v1, v0, v1 -; GCN-NEXT: v_add_u32_e32 v0, vcc, v1, v0 +; GCN-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GCN-NEXT: v_mul_hi_u32 v0, s2, v0 ; GCN-NEXT: v_readfirstlane_b32 s4, v0 ; GCN-NEXT: s_mul_i32 s4, s4, s3 @@ -372,7 +372,7 @@ define amdgpu_kernel void @udiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: v_mul_lo_u32 v7, v7, v5 ; SI-NEXT: v_mul_hi_u32 v6, v4, v6 ; SI-NEXT: v_mul_hi_u32 v7, v5, v7 -; SI-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; SI-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; SI-NEXT: v_mul_hi_u32 v4, v0, v4 ; SI-NEXT: v_mul_hi_u32 v5, v1, v5 @@ -384,9 +384,9 @@ define amdgpu_kernel void @udiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: v_add_i32_e32 v9, vcc, 1, v5 ; SI-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v2 ; SI-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v3 -; SI-NEXT: v_subrev_i32_e32 v6, vcc, v2, v0 +; SI-NEXT: v_sub_i32_e32 v6, vcc, v0, v2 ; SI-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[0:1] -; SI-NEXT: v_subrev_i32_e32 v7, vcc, v3, v1 +; SI-NEXT: v_sub_i32_e32 v7, vcc, v1, v3 ; SI-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[2:3] ; SI-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[0:1] ; SI-NEXT: v_add_i32_e32 v6, vcc, 1, v4 @@ -427,7 +427,7 @@ define amdgpu_kernel void @udiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_mul_lo_u32 v7, v7, v5 ; VI-NEXT: v_mul_hi_u32 v6, v4, v6 ; VI-NEXT: v_mul_hi_u32 v7, v5, v7 -; VI-NEXT: v_add_u32_e32 v4, vcc, v6, v4 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v6 ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v7 ; VI-NEXT: v_mul_hi_u32 v4, v0, v4 ; VI-NEXT: v_mul_hi_u32 v5, v1, v5 @@ -439,9 +439,9 @@ define amdgpu_kernel void @udiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_add_u32_e32 v9, vcc, 1, v5 ; VI-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v2 ; VI-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v3 -; VI-NEXT: v_subrev_u32_e32 v6, vcc, v2, v0 +; VI-NEXT: v_sub_u32_e32 v6, vcc, v0, v2 ; VI-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[0:1] -; VI-NEXT: v_subrev_u32_e32 v7, vcc, v3, v1 +; VI-NEXT: v_sub_u32_e32 v7, vcc, v1, v3 ; VI-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[2:3] ; VI-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[0:1] ; VI-NEXT: v_add_u32_e32 v6, vcc, 1, v4 @@ -478,7 +478,7 @@ define amdgpu_kernel void @udiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NEXT: v_mul_hi_u32 v8, v7, v8 -; GCN-NEXT: v_add_u32_e32 v6, vcc, v9, v6 +; GCN-NEXT: v_add_u32_e32 v6, vcc, v6, v9 ; GCN-NEXT: v_mul_hi_u32 v6, v0, v6 ; GCN-NEXT: v_add_u32_e32 v7, vcc, v7, v8 ; GCN-NEXT: v_mul_hi_u32 v7, v1, v7 @@ -490,9 +490,9 @@ define amdgpu_kernel void @udiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_sub_u32_e32 v1, vcc, v1, v10 ; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v2 ; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v3 -; GCN-NEXT: v_subrev_u32_e32 v8, vcc, v2, v0 +; GCN-NEXT: v_sub_u32_e32 v8, vcc, v0, v2 ; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[0:1] -; GCN-NEXT: v_subrev_u32_e32 v9, vcc, v3, v1 +; GCN-NEXT: v_sub_u32_e32 v9, vcc, v1, v3 ; GCN-NEXT: v_cndmask_b32_e64 v7, v7, v11, s[2:3] ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[0:1] ; GCN-NEXT: v_add_u32_e32 v8, vcc, 1, v6 @@ -661,9 +661,9 @@ define 
amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: v_mul_hi_u32 v13, v12, v13 ; SI-NEXT: v_mul_hi_u32 v15, v14, v15 ; SI-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, v11, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, v10, v11 ; SI-NEXT: v_add_i32_e32 v10, vcc, v12, v13 -; SI-NEXT: v_add_i32_e32 v11, vcc, v15, v14 +; SI-NEXT: v_add_i32_e32 v11, vcc, v14, v15 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_hi_u32 v8, v4, v8 ; SI-NEXT: v_mul_hi_u32 v9, v5, v9 @@ -685,13 +685,13 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: v_cmp_ge_u32_e64 s[2:3], v5, v1 ; SI-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v2 ; SI-NEXT: v_cmp_ge_u32_e64 s[6:7], v7, v3 -; SI-NEXT: v_subrev_i32_e32 v12, vcc, v0, v4 +; SI-NEXT: v_sub_i32_e32 v12, vcc, v4, v0 ; SI-NEXT: v_cndmask_b32_e64 v8, v8, v13, s[0:1] -; SI-NEXT: v_subrev_i32_e32 v13, vcc, v1, v5 +; SI-NEXT: v_sub_i32_e32 v13, vcc, v5, v1 ; SI-NEXT: v_cndmask_b32_e64 v9, v9, v15, s[2:3] -; SI-NEXT: v_subrev_i32_e32 v14, vcc, v2, v6 +; SI-NEXT: v_sub_i32_e32 v14, vcc, v6, v2 ; SI-NEXT: v_cndmask_b32_e64 v10, v10, v17, s[4:5] -; SI-NEXT: v_subrev_i32_e32 v15, vcc, v3, v7 +; SI-NEXT: v_sub_i32_e32 v15, vcc, v7, v3 ; SI-NEXT: v_cndmask_b32_e64 v11, v11, v19, s[6:7] ; SI-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[0:1] ; SI-NEXT: v_add_i32_e32 v12, vcc, 1, v8 @@ -756,9 +756,9 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_mul_hi_u32 v13, v12, v13 ; VI-NEXT: v_mul_hi_u32 v15, v14, v15 ; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v9 -; VI-NEXT: v_add_u32_e32 v9, vcc, v11, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, v10, v11 ; VI-NEXT: v_add_u32_e32 v10, vcc, v12, v13 -; VI-NEXT: v_add_u32_e32 v11, vcc, v15, v14 +; VI-NEXT: v_add_u32_e32 v11, vcc, v14, v15 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mul_hi_u32 v8, v4, v8 ; VI-NEXT: v_mul_hi_u32 v9, v5, v9 @@ -780,13 +780,13 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cmp_ge_u32_e64 s[2:3], v5, v1 ; VI-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v2 ; VI-NEXT: v_cmp_ge_u32_e64 s[6:7], v7, v3 -; VI-NEXT: v_subrev_u32_e32 v12, vcc, v0, v4 +; VI-NEXT: v_sub_u32_e32 v12, vcc, v4, v0 ; VI-NEXT: v_cndmask_b32_e64 v8, v8, v13, s[0:1] -; VI-NEXT: v_subrev_u32_e32 v13, vcc, v1, v5 +; VI-NEXT: v_sub_u32_e32 v13, vcc, v5, v1 ; VI-NEXT: v_cndmask_b32_e64 v9, v9, v15, s[2:3] -; VI-NEXT: v_subrev_u32_e32 v14, vcc, v2, v6 +; VI-NEXT: v_sub_u32_e32 v14, vcc, v6, v2 ; VI-NEXT: v_cndmask_b32_e64 v10, v10, v17, s[4:5] -; VI-NEXT: v_subrev_u32_e32 v15, vcc, v3, v7 +; VI-NEXT: v_sub_u32_e32 v15, vcc, v7, v3 ; VI-NEXT: v_cndmask_b32_e64 v11, v11, v19, s[6:7] ; VI-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[0:1] ; VI-NEXT: v_add_u32_e32 v12, vcc, 1, v8 @@ -851,9 +851,9 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_mul_hi_u32 v15, v14, v15 ; GCN-NEXT: v_mul_hi_u32 v17, v16, v17 ; GCN-NEXT: v_add_u32_e32 v10, vcc, v10, v11 -; GCN-NEXT: v_add_u32_e32 v11, vcc, v13, v12 +; GCN-NEXT: v_add_u32_e32 v11, vcc, v12, v13 ; GCN-NEXT: v_add_u32_e32 v12, vcc, v14, v15 -; GCN-NEXT: v_add_u32_e32 v13, vcc, v17, v16 +; GCN-NEXT: v_add_u32_e32 v13, vcc, v16, v17 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_hi_u32 v10, v4, v10 ; GCN-NEXT: v_mul_hi_u32 v11, v5, v11 @@ -875,13 +875,13 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v5, v1 ; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], 
v6, v2 ; GCN-NEXT: v_cmp_ge_u32_e64 s[6:7], v7, v3 -; GCN-NEXT: v_subrev_u32_e32 v14, vcc, v0, v4 +; GCN-NEXT: v_sub_u32_e32 v14, vcc, v4, v0 ; GCN-NEXT: v_cndmask_b32_e64 v10, v10, v15, s[0:1] -; GCN-NEXT: v_subrev_u32_e32 v15, vcc, v1, v5 +; GCN-NEXT: v_sub_u32_e32 v15, vcc, v5, v1 ; GCN-NEXT: v_cndmask_b32_e64 v11, v11, v17, s[2:3] -; GCN-NEXT: v_subrev_u32_e32 v16, vcc, v2, v6 +; GCN-NEXT: v_sub_u32_e32 v16, vcc, v6, v2 ; GCN-NEXT: v_cndmask_b32_e64 v12, v12, v19, s[4:5] -; GCN-NEXT: v_subrev_u32_e32 v17, vcc, v3, v7 +; GCN-NEXT: v_sub_u32_e32 v17, vcc, v7, v3 ; GCN-NEXT: v_cndmask_b32_e64 v13, v13, v21, s[6:7] ; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v14, s[0:1] ; GCN-NEXT: v_add_u32_e32 v14, vcc, 1, v10 @@ -1882,7 +1882,7 @@ define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: v_mul_lo_u32 v3, v1, v0 ; SI-NEXT: v_add_i32_e32 v4, vcc, 1, v1 ; SI-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 -; SI-NEXT: v_subrev_i32_e32 v3, vcc, v0, v2 +; SI-NEXT: v_sub_i32_e32 v3, vcc, v2, v0 ; SI-NEXT: v_cmp_ge_u32_e32 vcc, v2, v0 ; SI-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; SI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc @@ -1929,7 +1929,7 @@ define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_mul_lo_u32 v3, v1, v0 ; VI-NEXT: v_add_u32_e32 v4, vcc, 1, v1 ; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v3 -; VI-NEXT: v_subrev_u32_e32 v3, vcc, v0, v2 +; VI-NEXT: v_sub_u32_e32 v3, vcc, v2, v0 ; VI-NEXT: v_cmp_ge_u32_e32 vcc, v2, v0 ; VI-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc @@ -1984,7 +1984,7 @@ define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_mul_lo_u32 v5, v4, v3 ; GCN-NEXT: v_add_u32_e32 v6, vcc, 1, v4 ; GCN-NEXT: v_sub_u32_e32 v2, vcc, v2, v5 -; GCN-NEXT: v_subrev_u32_e32 v5, vcc, v3, v2 +; GCN-NEXT: v_sub_u32_e32 v5, vcc, v2, v3 ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v2, v3 ; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc ; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc @@ -2385,7 +2385,7 @@ define amdgpu_kernel void @fdiv_test_denormals(ptr addrspace(1) nocapture readon ; SI-NEXT: v_cvt_i32_f32_e32 v1, v1 ; SI-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| ; SI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; SI-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -2411,7 +2411,7 @@ define amdgpu_kernel void @fdiv_test_denormals(ptr addrspace(1) nocapture readon ; VI-NEXT: v_cvt_i32_f32_e32 v1, v1 ; VI-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| ; VI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; VI-NEXT: v_add_u32_e32 v0, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -2435,7 +2435,7 @@ define amdgpu_kernel void @fdiv_test_denormals(ptr addrspace(1) nocapture readon ; GCN-NEXT: v_cvt_i32_f32_e32 v3, v3 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, |v4| ; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; GCN-NEXT: v_add_u32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_add_u32_e32 v2, vcc, v2, v3 ; GCN-NEXT: flat_store_byte v[0:1], v2 ; GCN-NEXT: s_endpgm ; @@ -2556,7 +2556,7 @@ define i64 @v_test_udiv64_mulhi_fold(i64 %arg) { ; SI-NEXT: v_mul_lo_u32 v6, v2, s4 ; SI-NEXT: s_mov_b32 s4, 0x186a0 ; SI-NEXT: v_sub_i32_e32 v4, vcc, v4, v2 -; SI-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; SI-NEXT: v_mul_lo_u32 v5, v2, v4 ; SI-NEXT: v_mul_hi_u32 v7, v2, v6 ; SI-NEXT: v_mul_hi_u32 v8, v2, v4 @@ -2632,7 
+2632,7 @@ define i64 @v_test_udiv64_mulhi_fold(i64 %arg) { ; VI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, s6, 0 ; VI-NEXT: v_mul_lo_u32 v4, v7, s6 ; VI-NEXT: v_sub_u32_e32 v3, vcc, v3, v6 -; VI-NEXT: v_add_u32_e32 v8, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v8, vcc, v3, v4 ; VI-NEXT: v_mul_hi_u32 v5, v6, v2 ; VI-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v8, 0 ; VI-NEXT: v_add_u32_e32 v9, vcc, v5, v3 @@ -2649,7 +2649,7 @@ define i64 @v_test_udiv64_mulhi_fold(i64 %arg) { ; VI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, s6, 0 ; VI-NEXT: v_mul_lo_u32 v4, v7, s6 ; VI-NEXT: v_sub_u32_e32 v3, vcc, v3, v6 -; VI-NEXT: v_add_u32_e32 v5, vcc, v3, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, v4, v3 ; VI-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v5, 0 ; VI-NEXT: v_mul_hi_u32 v8, v6, v2 ; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v3 @@ -2719,7 +2719,7 @@ define i64 @v_test_udiv64_mulhi_fold(i64 %arg) { ; GCN-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, s6, 0 ; GCN-NEXT: v_mul_lo_u32 v4, v7, s6 ; GCN-NEXT: v_sub_u32_e32 v3, vcc, v3, v6 -; GCN-NEXT: v_add_u32_e32 v8, vcc, v4, v3 +; GCN-NEXT: v_add_u32_e32 v8, vcc, v3, v4 ; GCN-NEXT: v_mul_hi_u32 v5, v6, v2 ; GCN-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v8, 0 ; GCN-NEXT: v_add_u32_e32 v9, vcc, v5, v3 @@ -2736,7 +2736,7 @@ define i64 @v_test_udiv64_mulhi_fold(i64 %arg) { ; GCN-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, s6, 0 ; GCN-NEXT: v_mul_lo_u32 v4, v7, s6 ; GCN-NEXT: v_sub_u32_e32 v3, vcc, v3, v6 -; GCN-NEXT: v_add_u32_e32 v5, vcc, v3, v4 +; GCN-NEXT: v_add_u32_e32 v5, vcc, v4, v3 ; GCN-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v5, 0 ; GCN-NEXT: v_mul_hi_u32 v8, v6, v2 ; GCN-NEXT: v_add_u32_e32 v8, vcc, v8, v3 diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll index 2e3c38d..ba44b1c 100644 --- a/llvm/test/CodeGen/AMDGPU/udiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -27,7 +27,7 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-NEXT: v_mul_lo_u32 v5, s5, v0 ; GCN-NEXT: v_mul_lo_u32 v4, s4, v0 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GCN-NEXT: v_mul_hi_u32 v3, v0, v4 ; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 ; GCN-NEXT: v_mul_hi_u32 v7, v0, v2 @@ -86,9 +86,9 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-NEXT: v_mul_hi_u32 v3, s8, v0 ; GCN-NEXT: v_mul_lo_u32 v4, s9, v0 ; GCN-NEXT: v_mov_b32_e32 v5, s9 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GCN-NEXT: v_mul_lo_u32 v3, s8, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 ; GCN-NEXT: v_sub_i32_e32 v4, vcc, s3, v2 ; GCN-NEXT: v_sub_i32_e32 v3, vcc, s2, v3 ; GCN-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc @@ -699,7 +699,7 @@ define amdgpu_kernel void @s_test_udiv24_i48(ptr addrspace(1) %out, i48 %x, i48 ; GCN-NEXT: v_mul_lo_u32 v6, s8, v1 ; GCN-NEXT: s_mov_b32 s1, s5 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GCN-NEXT: v_mul_lo_u32 v4, v1, v3 ; GCN-NEXT: v_mul_hi_u32 v5, v1, v6 ; GCN-NEXT: v_mul_hi_u32 v7, v1, v3 @@ -719,9 +719,9 @@ define amdgpu_kernel void @s_test_udiv24_i48(ptr addrspace(1) %out, i48 %x, i48 ; GCN-NEXT: v_mul_lo_u32 v3, s8, v2 ; GCN-NEXT: v_mul_hi_u32 v4, s8, v1 ; GCN-NEXT: v_mul_lo_u32 v5, s9, v1 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; GCN-NEXT: v_mul_lo_u32 v4, s8, v1 -; GCN-NEXT: v_add_i32_e32 v3, 
vcc, v5, v3 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GCN-NEXT: v_mul_lo_u32 v7, v1, v3 ; GCN-NEXT: v_mul_hi_u32 v8, v1, v4 ; GCN-NEXT: v_mul_hi_u32 v9, v1, v3 @@ -899,7 +899,7 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: v_mul_lo_u32 v5, s5, v0 ; GCN-NEXT: v_mul_lo_u32 v4, s4, v0 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GCN-NEXT: v_mul_hi_u32 v3, v0, v4 ; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 ; GCN-NEXT: v_mul_hi_u32 v7, v0, v2 @@ -920,9 +920,9 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: v_mul_hi_u32 v3, s4, v0 ; GCN-NEXT: v_mul_lo_u32 v4, s5, v0 ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GCN-NEXT: v_mul_lo_u32 v3, s4, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GCN-NEXT: v_mul_lo_u32 v6, v0, v2 ; GCN-NEXT: v_mul_hi_u32 v7, v0, v3 ; GCN-NEXT: v_mul_hi_u32 v8, v0, v2 @@ -948,7 +948,7 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v1, vcc ; GCN-NEXT: v_mul_lo_u32 v1, s3, v0 ; GCN-NEXT: v_mul_hi_u32 v2, s2, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; GCN-NEXT: v_mul_lo_u32 v2, s2, v0 ; GCN-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 ; GCN-NEXT: v_sub_i32_e32 v2, vcc, 24, v2 @@ -1366,7 +1366,7 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: v_mul_hi_u32 v2, v0, s8 ; GCN-NEXT: v_mul_lo_u32 v3, v1, s8 ; GCN-NEXT: v_mul_lo_u32 v4, v0, s8 -; GCN-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 +; GCN-NEXT: v_sub_i32_e32 v2, vcc, v2, v0 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GCN-NEXT: v_mul_lo_u32 v3, v0, v2 ; GCN-NEXT: v_mul_hi_u32 v5, v0, v4 @@ -1405,7 +1405,7 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc ; GCN-NEXT: v_add_i32_e32 v6, vcc, 2, v0 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; GCN-NEXT: v_mov_b32_e32 v5, s7 ; GCN-NEXT: v_sub_i32_e32 v8, vcc, s6, v8 ; GCN-NEXT: v_subb_u32_e32 v4, vcc, v5, v4, vcc @@ -1536,8 +1536,8 @@ define i64 @v_test_udiv_k_den_i64(i64 %x) { ; GCN-NEXT: v_mul_hi_u32 v4, v2, s4 ; GCN-NEXT: v_mul_lo_u32 v5, v3, s4 ; GCN-NEXT: v_mul_lo_u32 v6, v2, s4 -; GCN-NEXT: v_subrev_i32_e32 v4, vcc, v2, v4 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; GCN-NEXT: v_sub_i32_e32 v4, vcc, v4, v2 +; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GCN-NEXT: v_mul_lo_u32 v5, v2, v4 ; GCN-NEXT: v_mul_hi_u32 v7, v2, v6 ; GCN-NEXT: v_mul_hi_u32 v8, v2, v4 diff --git a/llvm/test/CodeGen/AMDGPU/udivrem.ll b/llvm/test/CodeGen/AMDGPU/udivrem.ll index 21ae4fe..9b9a185 100644 --- a/llvm/test/CodeGen/AMDGPU/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/udivrem.ll @@ -175,7 +175,7 @@ define amdgpu_kernel void @test_udivrem_v2(ptr addrspace(1) %out, <2 x i32> %x, ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_lo_u32 v1, s2, v0 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 @@ -193,7 +193,7 @@ define amdgpu_kernel void @test_udivrem_v2(ptr addrspace(1) %out, <2 x i32> %x, ; 
GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 ; GFX6-NEXT: v_readfirstlane_b32 s6, v0 ; GFX6-NEXT: s_mul_i32 s6, s6, s7 @@ -223,7 +223,7 @@ define amdgpu_kernel void @test_udivrem_v2(ptr addrspace(1) %out, <2 x i32> %x, ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, s2, v0 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1 @@ -241,7 +241,7 @@ define amdgpu_kernel void @test_udivrem_v2(ptr addrspace(1) %out, <2 x i32> %x, ; GFX8-NEXT: v_mul_lo_u32 v0, s3, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: v_mul_hi_u32 v0, v1, v0 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0 ; GFX8-NEXT: v_mul_hi_u32 v1, s5, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_readfirstlane_b32 s2, v1 @@ -362,7 +362,7 @@ define amdgpu_kernel void @test_udivrem_v4(ptr addrspace(1) %out, <4 x i32> %x, ; GFX6-NEXT: v_mul_lo_u32 v0, s2, v1 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 @@ -380,7 +380,7 @@ define amdgpu_kernel void @test_udivrem_v4(ptr addrspace(1) %out, <4 x i32> %x, ; GFX6-NEXT: v_mul_lo_u32 v0, s2, v1 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 @@ -450,7 +450,7 @@ define amdgpu_kernel void @test_udivrem_v4(ptr addrspace(1) %out, <4 x i32> %x, ; GFX8-NEXT: v_mul_lo_u32 v0, s3, v1 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX8-NEXT: v_mul_hi_u32 v0, v1, v0 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, s5, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1 @@ -468,7 +468,7 @@ define amdgpu_kernel void @test_udivrem_v4(ptr addrspace(1) %out, <4 x i32> %x, ; GFX8-NEXT: v_mul_lo_u32 v0, s4, v1 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX8-NEXT: v_mul_hi_u32 v0, v1, v0 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, s6, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1 diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll index 7964302..d6fcda0 100644 --- a/llvm/test/CodeGen/AMDGPU/urem64.ll +++ b/llvm/test/CodeGen/AMDGPU/urem64.ll @@ -29,7 +29,7 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-NEXT: v_mul_lo_u32 v5, s1, v0 ; GCN-NEXT: v_mul_lo_u32 v4, s0, v0 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GCN-NEXT: v_mul_hi_u32 v3, v0, v4 ; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 ; GCN-NEXT: v_mul_hi_u32 v7, v0, v2 @@ -86,8 +86,8 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-NEXT: v_mul_hi_u32 v2, s12, 
v0 ; GCN-NEXT: v_mul_lo_u32 v3, s13, v0 ; GCN-NEXT: v_mul_lo_u32 v0, s12, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 -; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; GCN-NEXT: v_add_i32_e32 v1, vcc, v3, v1 ; GCN-NEXT: v_sub_i32_e32 v2, vcc, s11, v1 ; GCN-NEXT: v_mov_b32_e32 v3, s13 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, s10, v0 @@ -730,7 +730,7 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: v_mul_lo_u32 v5, s1, v0 ; GCN-NEXT: v_mul_lo_u32 v4, s0, v0 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GCN-NEXT: v_mul_hi_u32 v3, v0, v4 ; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 ; GCN-NEXT: v_mul_hi_u32 v7, v0, v2 @@ -778,7 +778,7 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: v_mul_lo_u32 v1, s7, v0 ; GCN-NEXT: v_mul_hi_u32 v2, s6, v0 ; GCN-NEXT: v_mul_lo_u32 v0, s6, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; GCN-NEXT: v_sub_i32_e32 v2, vcc, 0, v1 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, 24, v0 ; GCN-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc @@ -912,8 +912,8 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: v_mul_hi_u32 v2, v0, s2 ; GCN-NEXT: v_mul_lo_u32 v4, v1, s2 ; GCN-NEXT: v_mul_lo_u32 v3, v0, s2 -; GCN-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GCN-NEXT: v_sub_i32_e32 v2, vcc, v2, v0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GCN-NEXT: v_mul_hi_u32 v5, v0, v3 ; GCN-NEXT: v_mul_lo_u32 v4, v0, v2 ; GCN-NEXT: v_mul_hi_u32 v6, v0, v2 @@ -969,7 +969,7 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: v_mul_lo_u32 v1, v1, 24 ; GCN-NEXT: v_mul_hi_u32 v2, v0, 24 ; GCN-NEXT: v_mul_lo_u32 v0, v0, 24 -; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; GCN-NEXT: v_mov_b32_e32 v2, s7 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 ; GCN-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll index 0a21a77..5ba0c3f 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll @@ -178,8 +178,6 @@ define amdgpu_ps float @else3(i32 %z, float %v, i32 inreg %bound, i32 %x0) #0 { ; SI-NEXT: $vgpr0 = COPY killed [[V_ADD_F32_e64_]] ; SI-NEXT: SI_RETURN_TO_EPILOG killed $vgpr0 entry: -; %break = icmp sgt i32 %bound, 0 -; br i1 %break, label %for.body, label %for.end br label %for.body for.body: @@ -235,10 +233,10 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, ptr %extern_fun ; SI-NEXT: bb.1.Flow: ; SI-NEXT: successors: %bb.2(0x40000000), %bb.10(0x40000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %49:vgpr_32, %bb.0, %4, %bb.9 - ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY4]], %bb.0, undef %51:vgpr_32, %bb.9 - ; SI-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.0, undef %53:vgpr_32, %bb.9 - ; SI-NEXT: [[PHI3:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, undef %55:vgpr_32, %bb.9 + ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI undef [[COPY47:%[0-9]+]]:vgpr_32, %bb.0, %4, %bb.9 + ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY4]], %bb.0, undef [[COPY49:%[0-9]+]]:vgpr_32, %bb.9 + ; SI-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.0, undef [[COPY51:%[0-9]+]]:vgpr_32, %bb.9 + ; SI-NEXT: [[PHI3:%[0-9]+]]:vgpr_32 = 
PHI [[COPY2]], %bb.0, undef [[COPY53:%[0-9]+]]:vgpr_32, %bb.9 ; SI-NEXT: [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE killed [[SI_IF]], %bb.10, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; SI-NEXT: S_BRANCH %bb.2 ; SI-NEXT: {{ $}} @@ -251,8 +249,8 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, ptr %extern_fun ; SI-NEXT: bb.3: ; SI-NEXT: successors: %bb.4(0x80000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI4:%[0-9]+]]:vreg_64 = PHI undef %57:vreg_64, %bb.4, [[REG_SEQUENCE]], %bb.2 - ; SI-NEXT: [[PHI5:%[0-9]+]]:vgpr_32 = PHI undef %59:vgpr_32, %bb.4, [[PHI1]], %bb.2 + ; SI-NEXT: [[PHI4:%[0-9]+]]:vreg_64 = PHI undef [[COPY57:%[0-9]+]]:vreg_64, %bb.4, [[REG_SEQUENCE]], %bb.2 + ; SI-NEXT: [[PHI5:%[0-9]+]]:vgpr_32 = PHI undef [[COPY59:%[0-9]+]]:vgpr_32, %bb.4, [[PHI1]], %bb.2 ; SI-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI4]].sub0, implicit $exec ; SI-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI4]].sub1, implicit $exec ; SI-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_1]], %subreg.sub1 @@ -288,8 +286,8 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, ptr %extern_fun ; SI-NEXT: bb.7: ; SI-NEXT: successors: %bb.8(0x80000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI6:%[0-9]+]]:vreg_64 = PHI undef %61:vreg_64, %bb.8, [[REG_SEQUENCE2]], %bb.6 - ; SI-NEXT: [[PHI7:%[0-9]+]]:vgpr_32 = PHI undef %63:vgpr_32, %bb.8, [[COPY4]], %bb.6 + ; SI-NEXT: [[PHI6:%[0-9]+]]:vreg_64 = PHI undef [[COPY59:%[0-9]+]]:vreg_64, %bb.8, [[REG_SEQUENCE2]], %bb.6 + ; SI-NEXT: [[PHI7:%[0-9]+]]:vgpr_32 = PHI undef [[COPY61:%[0-9]+]]:vgpr_32, %bb.8, [[COPY4]], %bb.6 ; SI-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI6]].sub0, implicit $exec ; SI-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI6]].sub1, implicit $exec ; SI-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_2]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_3]], %subreg.sub1 @@ -358,9 +356,9 @@ define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, ptr %e ; SI-NEXT: bb.1.Flow: ; SI-NEXT: successors: %bb.2(0x40000000), %bb.10(0x40000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %50:vgpr_32, %bb.0, %4, %bb.9 - ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.0, undef %52:vgpr_32, %bb.9 - ; SI-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, undef %54:vgpr_32, %bb.9 + ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI undef [[COPY50:%[0-9]+]]:vgpr_32, %bb.0, %4, %bb.9 + ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.0, undef [[COPY52:%[0-9]+]]:vgpr_32, %bb.9 + ; SI-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, undef [[COPY54:%[0-9]+]]:vgpr_32, %bb.9 ; SI-NEXT: [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE killed [[SI_IF]], %bb.10, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; SI-NEXT: S_BRANCH %bb.2 ; SI-NEXT: {{ $}} @@ -373,7 +371,7 @@ define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, ptr %e ; SI-NEXT: bb.3: ; SI-NEXT: successors: %bb.4(0x80000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI3:%[0-9]+]]:vreg_64 = PHI undef %56:vreg_64, %bb.4, [[REG_SEQUENCE]], %bb.2 + ; SI-NEXT: [[PHI3:%[0-9]+]]:vreg_64 = PHI undef [[COPY56:%[0-9]+]]:vreg_64, %bb.4, [[REG_SEQUENCE]], %bb.2 ; SI-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI3]].sub0, implicit $exec ; SI-NEXT: 
[[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI3]].sub1, implicit $exec ; SI-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_1]], %subreg.sub1 @@ -409,7 +407,7 @@ define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, ptr %e ; SI-NEXT: bb.7: ; SI-NEXT: successors: %bb.8(0x80000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI4:%[0-9]+]]:vreg_64 = PHI undef %58:vreg_64, %bb.8, [[REG_SEQUENCE2]], %bb.6 + ; SI-NEXT: [[PHI4:%[0-9]+]]:vreg_64 = PHI undef [[COPY58:%[0-9]+]]:vreg_64, %bb.8, [[REG_SEQUENCE2]], %bb.6 ; SI-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI4]].sub0, implicit $exec ; SI-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI4]].sub1, implicit $exec ; SI-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_2]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_3]], %subreg.sub1 @@ -477,8 +475,8 @@ define amdgpu_kernel void @livevariables_update_missed_block(ptr addrspace(1) %s ; SI-NEXT: {{ $}} ; SI-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.src1.kernarg.offset, align 4, addrspace 4) ; SI-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, killed %51, 0, implicit $exec - ; SI-NEXT: %44:vgpr_32, dead %46:sreg_32_xm0_xexec = V_ADDC_U32_e64 0, killed [[S_LOAD_DWORDX2_IMM]].sub1, killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; SI-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[V_ADD_CO_U32_e64_]], %subreg.sub0, killed %44, %subreg.sub1 + ; SI-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 0, killed [[S_LOAD_DWORDX2_IMM]].sub1, killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; SI-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[V_ADD_CO_U32_e64_]], %subreg.sub0, killed [[V_ADDC_U32_e64_]], %subreg.sub1 ; SI-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8) from %ir.i10, addrspace 1) ; SI-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec ; SI-NEXT: GLOBAL_STORE_BYTE killed [[V_MOV_B]], killed [[GLOBAL_LOAD_UBYTE]], 0, 0, implicit $exec :: (store (s8) into `ptr addrspace(1) null`, addrspace 1) @@ -511,7 +509,7 @@ define amdgpu_kernel void @livevariables_update_missed_block(ptr addrspace(1) %s ; SI-NEXT: bb.6.sw.bb18: ; SI-NEXT: successors: %bb.5(0x80000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI undef %36:vgpr_32, %bb.3, [[GLOBAL_LOAD_UBYTE1]], %bb.4 + ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI undef [[COPY38:%[0-9]+]]:vgpr_32, %bb.3, [[GLOBAL_LOAD_UBYTE1]], %bb.4 ; SI-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec ; SI-NEXT: GLOBAL_STORE_BYTE killed [[V_MOV_B2]], killed [[PHI1]], 0, 0, implicit $exec :: (store (s8) into `ptr addrspace(1) null`, addrspace 1) ; SI-NEXT: S_BRANCH %bb.5 diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll index f4765a3..8a951ca 100644 --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll @@ -434,7 +434,6 @@ define amdgpu_gfx i64 @strict_wwm_called_i64(i64 %a) noinline { ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-O0-NEXT: ; 
implicit-def: $sgpr34 ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 -; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $exec ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-O0-NEXT: ; implicit-def: $sgpr34_sgpr35 @@ -586,7 +585,6 @@ define amdgpu_gfx void @strict_wwm_call_i64(<4 x i32> inreg %tmp14, i64 inreg %a ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-O0-NEXT: ; implicit-def: $sgpr40 ; GFX9-O0-NEXT: ; implicit-def: $sgpr40 -; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v8 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 ; GFX9-O0-NEXT: v_add_co_u32_e64 v2, s[40:41], v2, v4 @@ -723,7 +721,6 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10 ; GFX9-O0-NEXT: ; implicit-def: $sgpr35 ; GFX9-O0-NEXT: ; implicit-def: $sgpr35 -; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 killed $exec ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 ; GFX9-O0-NEXT: s_mov_b32 s35, 0x7fffffff @@ -742,7 +739,6 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12 ; GFX9-O0-NEXT: ; implicit-def: $sgpr35 ; GFX9-O0-NEXT: ; implicit-def: $sgpr35 -; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 killed $exec ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 @@ -770,9 +766,6 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in ; GFX9-O0-NEXT: ; implicit-def: $sgpr35 ; GFX9-O0-NEXT: ; implicit-def: $sgpr35 ; GFX9-O0-NEXT: ; implicit-def: $sgpr35 -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $exec -; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 killed $exec -; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 killed $exec ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6_vgpr7_vgpr8 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v10 @@ -878,23 +871,23 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt ; GFX9-O0: ; %bb.0: ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:248 ; 
4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, -1 -; GFX9-O0-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill @@ -918,24 +911,24 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_load_dword v45, off, s[0:3], s32 +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; GFX9-O0-NEXT: ; 
implicit-def: $sgpr34 ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 @@ -944,140 +937,126 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-O0-NEXT: v_mov_b32_e32 v43, s5 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s8 -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9 -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s10 -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s11 -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s12 -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s13 -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s14 -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s15 -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s17 -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s18 -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s19 -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s20 -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s21 -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s22 -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s23 -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s24 -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s25 -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s26 -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s27 -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s28 -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s29 -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-O0-NEXT: ; kill: def $vgpr45 killed $vgpr45 killed $exec -; GFX9-O0-NEXT: ; kill: def $vgpr47 killed $vgpr47 killed $exec -; 
GFX9-O0-NEXT: ; kill: def $vgpr46 killed $vgpr46 killed $exec -; GFX9-O0-NEXT: ; kill: def $vgpr44 killed $vgpr44 killed $exec -; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10 +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s11 +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s12 +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s14 +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s15 +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s16 +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s18 +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s19 +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s20 +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s21 +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s22 +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s23 +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s24 +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v47, s25 +; GFX9-O0-NEXT: v_mov_b32_e32 v46, s26 +; GFX9-O0-NEXT: v_mov_b32_e32 v45, s27 +; GFX9-O0-NEXT: v_mov_b32_e32 v44, s28 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s29 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: 
v_mov_b32_e32 v3, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v43 ; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v43 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v43 ; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v43 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v43 ; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v43 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v43 ; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v43 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v43 ; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v43 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v43 ; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v43 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v43 ; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v43 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v43 ; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v43 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v43 ; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v43 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v43 ; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v43 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v43 ; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v43 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v43 ; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v43 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v43 ; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v19, v43 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v43 ; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v20, v43 +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v43 ; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v21, v43 +; 
GFX9-O0-NEXT: v_mov_b32_e32 v16, v43 ; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v22, v43 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v43 ; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v23, v43 +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v43 ; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v24, v43 +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v43 ; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v25, v43 +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v43 ; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-O0-NEXT: v_mov_b32_e32 v26, v45 +; GFX9-O0-NEXT: v_mov_b32_e32 v21, v47 +; GFX9-O0-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v22, v46 +; GFX9-O0-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v23, v45 ; GFX9-O0-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v24, v44 +; GFX9-O0-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(4) +; GFX9-O0-NEXT: v_mov_b32_e32 v25, v43 +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_mov_b32_e32 v26, v43 +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; GFX9-O0-NEXT: v_mov_b32_e32 v27, v47 ; GFX9-O0-NEXT: v_mov_b32_e32 v28, v46 -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v29, v45 ; GFX9-O0-NEXT: v_mov_b32_e32 v30, v44 ; GFX9-O0-NEXT: ; kill: def $vgpr31 killed $vgpr43 killed $exec @@ -1100,28 +1079,23 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $exec ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX9-O0-NEXT: s_waitcnt vmcnt(4) ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 -; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 killed $exec ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7 ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 -; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 killed $exec ; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v9 ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 -; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 killed $exec ; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v11 ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 -; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 killed $exec ; GFX9-O0-NEXT: ; kill: def $vgpr10 killed 
$vgpr10 def $vgpr10_vgpr11 killed $exec ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v11, v0 @@ -1220,27 +1194,27 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt ; GFX9-O0-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, -1 -; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] -- 2.7.4