From 57899934eab18bbcab3482cc3ef862b0a1617ad0 Mon Sep 17 00:00:00 2001 From: Carl Ritson Date: Tue, 4 Aug 2020 12:20:12 +0900 Subject: [PATCH] [AMDGPU] Make GCNRegBankReassign assign based on subreg banks When scavenging consider the sub-register of the source operand to determine the bank of a candidate register (not just sub0). Without this it is possible to introduce an infinite loop, e.g. $sgpr15_sgpr16_sgpr17 can be assigned for a conflict between $sgpr0 and SGPR_96:sub1. Reviewed By: rampitec Differential Revision: https://reviews.llvm.org/D84910 --- llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp | 97 ++++++++++++++-------- .../CodeGen/AMDGPU/GlobalISel/insertelement.ll | 74 ++++++++--------- .../CodeGen/AMDGPU/regbank-reassign-wave64.mir | 69 +++++++++++++++ llvm/test/CodeGen/AMDGPU/regbank-reassign.mir | 78 +++++++++++++++++ 4 files changed, 245 insertions(+), 73 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/regbank-reassign-wave64.mir diff --git a/llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp b/llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp index 98d9716..79b33e2 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp +++ b/llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp @@ -83,9 +83,10 @@ class GCNRegBankReassign : public MachineFunctionPass { class Candidate { public: - Candidate(MachineInstr *mi, unsigned reg, unsigned freebanks, - unsigned weight) - : MI(mi), Reg(reg), FreeBanks(freebanks), Weight(weight) {} + Candidate(MachineInstr *mi, unsigned reg, unsigned subreg, + unsigned freebanks, unsigned weight) + : MI(mi), Reg(reg), SubReg(subreg), FreeBanks(freebanks), + Weight(weight) {} bool operator< (const Candidate& RHS) const { return Weight < RHS.Weight; } @@ -100,6 +101,7 @@ class GCNRegBankReassign : public MachineFunctionPass { MachineInstr *MI; unsigned Reg; + unsigned SubReg; unsigned FreeBanks; unsigned Weight; }; @@ -162,7 +164,7 @@ private: const MCPhysReg *CSRegs; // Returns bank for a phys reg. 
- unsigned getPhysRegBank(unsigned Reg) const; + unsigned getPhysRegBank(unsigned Reg, unsigned SubReg) const; // Return a bit set for each register bank used. 4 banks for VGPRs and // 8 banks for SGPRs. @@ -176,7 +178,7 @@ private: // a register chosen from Bank. std::pair<unsigned, unsigned> analyzeInst(const MachineInstr &MI, unsigned Reg = AMDGPU::NoRegister, - int Bank = -1); + unsigned SubReg = 0, int Bank = -1); // Return true if register is regular VGPR or SGPR or their tuples. // Returns false for special registers like m0, vcc etc. @@ -216,11 +218,12 @@ private: // candidates are collected and added to work list. unsigned computeStallCycles(unsigned SrcReg, unsigned Reg = AMDGPU::NoRegister, - int Bank = -1, bool Collect = false); + unsigned SubReg = 0, int Bank = -1, + bool Collect = false); // Search for a register in Bank unused within LI. // Returns phys reg or NoRegister. - unsigned scavengeReg(LiveInterval& LI, unsigned Bank) const; + unsigned scavengeReg(LiveInterval &LI, unsigned Bank, unsigned SubReg) const; // Try to reassign candidate. Returns number or stall cycles saved.
unsigned tryReassign(Candidate &C); @@ -277,15 +280,24 @@ char GCNRegBankReassign::ID = 0; char &llvm::GCNRegBankReassignID = GCNRegBankReassign::ID; -unsigned GCNRegBankReassign::getPhysRegBank(unsigned Reg) const { +unsigned GCNRegBankReassign::getPhysRegBank(unsigned Reg, + unsigned SubReg) const { assert(Register::isPhysicalRegister(Reg)); const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); unsigned Size = TRI->getRegSizeInBits(*RC); if (Size == 16) Reg = TRI->get32BitRegister(Reg); - else if (Size > 32) - Reg = TRI->getSubReg(Reg, AMDGPU::sub0); + else if (Size > 32) { + if (SubReg) { + const TargetRegisterClass *SubRC = TRI->getSubRegClass(RC, SubReg); + Reg = TRI->getSubReg(Reg, SubReg); + if (TRI->getRegSizeInBits(*SubRC) > 32) + Reg = TRI->getSubReg(Reg, AMDGPU::sub0); + } else { + Reg = TRI->getSubReg(Reg, AMDGPU::sub0); + } + } if (TRI->hasVGPRs(RC)) { Reg -= AMDGPU::VGPR0; @@ -360,7 +372,7 @@ uint32_t GCNRegBankReassign::getRegBankMask(unsigned Reg, unsigned SubReg, std::pair<unsigned, unsigned> GCNRegBankReassign::analyzeInst(const MachineInstr &MI, unsigned Reg, - int Bank) { + unsigned SubReg, int Bank) { unsigned StallCycles = 0; unsigned UsedBanks = 0; @@ -375,26 +387,39 @@ GCNRegBankReassign::analyzeInst(const MachineInstr &MI, unsigned Reg, if (!Op.isReg() || Op.isUndef()) continue; - Register R = Op.getReg(); - if (TRI->hasAGPRs(TRI->getRegClassForReg(*MRI, R))) - continue; + const Register R = Op.getReg(); + const TargetRegisterClass *RC = TRI->getRegClassForReg(*MRI, R); - unsigned ShiftedBank = Bank; + // Do not compute stalls for AGPRs + if (TRI->hasAGPRs(RC)) + continue; - if (Bank != -1 && R == Reg && Op.getSubReg()) { - unsigned Offset = TRI->getChannelFromSubReg(Op.getSubReg()); + // Do not compute stalls if sub-register covers all banks + if (Op.getSubReg()) { LaneBitmask LM = TRI->getSubRegIndexLaneMask(Op.getSubReg()); - if (Offset && Bank < NUM_VGPR_BANKS) { - // If a register spans all banks we cannot shift it to avoid conflict.
+ if (TRI->hasVGPRs(RC)) { if (TRI->getNumCoveredRegs(LM) >= NUM_VGPR_BANKS) continue; - ShiftedBank = (Bank + Offset) % NUM_VGPR_BANKS; - } else if (Offset > 1 && Bank >= SGPR_BANK_OFFSET) { - // If a register spans all banks we cannot shift it to avoid conflict. + } else { if (TRI->getNumCoveredRegs(LM) / 2 >= NUM_SGPR_BANKS) continue; + } + } + + unsigned ShiftedBank = Bank; + + if (Bank != -1 && R == Reg && (Op.getSubReg() || SubReg)) { + unsigned RegOffset = + TRI->getChannelFromSubReg(SubReg ? SubReg : (unsigned)AMDGPU::sub0); + unsigned Offset = TRI->getChannelFromSubReg( + Op.getSubReg() ? Op.getSubReg() : (unsigned)AMDGPU::sub0); + if (Bank < NUM_VGPR_BANKS) { + unsigned Shift = ((NUM_VGPR_BANKS + Offset) - RegOffset); + ShiftedBank = (Bank + Shift) % NUM_VGPR_BANKS; + } else if (Bank >= SGPR_BANK_OFFSET) { + unsigned Shift = (NUM_SGPR_BANKS + (Offset >> 1)) - (RegOffset >> 1); ShiftedBank = SGPR_BANK_OFFSET + - (Bank - SGPR_BANK_OFFSET + (Offset >> 1)) % NUM_SGPR_BANKS; + (Bank - SGPR_BANK_OFFSET + Shift) % NUM_SGPR_BANKS; } } @@ -576,17 +601,17 @@ void GCNRegBankReassign::collectCandidates(MachineInstr& MI, unsigned FreeBanks1 = getFreeBanks(Reg1, SubReg1, Mask1, UsedBanks); unsigned FreeBanks2 = getFreeBanks(Reg2, SubReg2, Mask2, UsedBanks); if (FreeBanks1) - Candidates.push(Candidate(&MI, Reg1, FreeBanks1, Weight - + ((Size2 > Size1) ? 1 : 0))); + Candidates.push(Candidate(&MI, Reg1, SubReg1, FreeBanks1, + Weight + ((Size2 > Size1) ? 1 : 0))); if (FreeBanks2) - Candidates.push(Candidate(&MI, Reg2, FreeBanks2, Weight - + ((Size1 > Size2) ? 1 : 0))); + Candidates.push(Candidate(&MI, Reg2, SubReg2, FreeBanks2, + Weight + ((Size1 > Size2) ? 
1 : 0))); } } } -unsigned GCNRegBankReassign::computeStallCycles(unsigned SrcReg, - unsigned Reg, int Bank, +unsigned GCNRegBankReassign::computeStallCycles(unsigned SrcReg, unsigned Reg, + unsigned SubReg, int Bank, bool Collect) { unsigned TotalStallCycles = 0; SmallSet Visited; @@ -598,7 +623,7 @@ unsigned GCNRegBankReassign::computeStallCycles(unsigned SrcReg, continue; unsigned StallCycles; unsigned UsedBanks; - std::tie(StallCycles, UsedBanks) = analyzeInst(MI, Reg, Bank); + std::tie(StallCycles, UsedBanks) = analyzeInst(MI, Reg, SubReg, Bank); TotalStallCycles += StallCycles; if (Collect) collectCandidates(MI, UsedBanks, StallCycles); @@ -607,8 +632,8 @@ unsigned GCNRegBankReassign::computeStallCycles(unsigned SrcReg, return TotalStallCycles; } -unsigned GCNRegBankReassign::scavengeReg(LiveInterval& LI, - unsigned Bank) const { +unsigned GCNRegBankReassign::scavengeReg(LiveInterval &LI, unsigned Bank, + unsigned SubReg) const { const TargetRegisterClass *RC = MRI->getRegClass(LI.reg); unsigned MaxNumRegs = (Bank < NUM_VGPR_BANKS) ? 
MaxNumVGPRs : MaxNumSGPRs; @@ -620,7 +645,7 @@ unsigned GCNRegBankReassign::scavengeReg(LiveInterval& LI, if (TRI->isSubRegisterEq(Reg, MaxReg)) break; - if (!MRI->isAllocatable(Reg) || getPhysRegBank(Reg) != Bank) + if (!MRI->isAllocatable(Reg) || getPhysRegBank(Reg, SubReg) != Bank) continue; for (unsigned I = 0; CSRegs[I]; ++I) @@ -669,7 +694,7 @@ unsigned GCNRegBankReassign::tryReassign(Candidate &C) { for (int Bank = 0; Bank < NUM_BANKS; ++Bank) { if (C.FreeBanks & (1 << Bank)) { LLVM_DEBUG(dbgs() << "Trying bank " << printBank(Bank) << '\n'); - unsigned Stalls = computeStallCycles(C.Reg, C.Reg, Bank); + unsigned Stalls = computeStallCycles(C.Reg, C.Reg, C.SubReg, Bank); if (Stalls < OrigStalls) { LLVM_DEBUG(dbgs() << "With bank " << printBank(Bank) << " -> " << Stalls << '\n'); @@ -683,7 +708,7 @@ unsigned GCNRegBankReassign::tryReassign(Candidate &C) { LRM->unassign(LI); while (!BankStalls.empty()) { BankStall BS = BankStalls.pop_back_val(); - unsigned Reg = scavengeReg(LI, BS.Bank); + unsigned Reg = scavengeReg(LI, BS.Bank, C.SubReg); if (Reg == AMDGPU::NoRegister) { LLVM_DEBUG(dbgs() << "No free registers in bank " << printBank(BS.Bank) << '\n'); @@ -801,7 +826,7 @@ bool GCNRegBankReassign::runOnMachineFunction(MachineFunction &MF) { Candidates.pop_back(); if (LocalCyclesSaved) { removeCandidates(C.Reg); - computeStallCycles(C.Reg, AMDGPU::NoRegister, -1, true); + computeStallCycles(C.Reg, AMDGPU::NoRegister, 0, -1, true); Candidates.sort(); LLVM_DEBUG(dbgs() << "\nCandidates:\n\n"; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll index 8e4a071..b4afc48 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll @@ -1492,7 +1492,7 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_v_v(<8 x double> %vec, double % ; MOVREL-NEXT: v_cmp_eq_u32_e64 s4, 5, v18 ; MOVREL-NEXT: v_mov_b32_e32 v19, v0 ; MOVREL-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, 0, v18 -; MOVREL-NEXT: v_mov_b32_e32 v20, v1 +; MOVREL-NEXT: v_mov_b32_e32 v23, v1 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s1, 2, v18 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s2, 3, v18 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s5, 7, v18 @@ -1501,7 +1501,7 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_v_v(<8 x double> %vec, double % ; MOVREL-NEXT: v_cndmask_b32_e64 v2, v2, v16, s0 ; MOVREL-NEXT: v_cndmask_b32_e64 v8, v8, v16, s3 ; MOVREL-NEXT: v_cndmask_b32_e64 v10, v10, v16, s4 -; MOVREL-NEXT: v_cndmask_b32_e32 v1, v20, v17, vcc_lo +; MOVREL-NEXT: v_cndmask_b32_e32 v1, v23, v17, vcc_lo ; MOVREL-NEXT: v_cndmask_b32_e64 v3, v3, v17, s0 ; MOVREL-NEXT: v_cndmask_b32_e64 v9, v9, v17, s3 ; MOVREL-NEXT: v_cndmask_b32_e64 v11, v11, v17, s4 @@ -2123,7 +2123,7 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_v_v_add_1(<8 x double> %vec, do ; MOVREL: ; %bb.0: ; %entry ; MOVREL-NEXT: v_add_nc_u32_e32 v18, 1, v18 ; MOVREL-NEXT: v_mov_b32_e32 v19, v0 -; MOVREL-NEXT: v_mov_b32_e32 v20, v1 +; MOVREL-NEXT: v_mov_b32_e32 v23, v1 ; MOVREL-NEXT: ; implicit-def: $vcc_hi ; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v18 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, 1, v18 @@ -2137,7 +2137,7 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_v_v_add_1(<8 x double> %vec, do ; MOVREL-NEXT: v_cndmask_b32_e64 v2, v2, v16, s0 ; MOVREL-NEXT: v_cndmask_b32_e64 v8, v8, v16, s3 ; MOVREL-NEXT: v_cndmask_b32_e64 v10, v10, v16, s4 -; MOVREL-NEXT: v_cndmask_b32_e32 v1, v20, v17, vcc_lo +; MOVREL-NEXT: v_cndmask_b32_e32 v1, v23, v17, vcc_lo ; MOVREL-NEXT: v_cndmask_b32_e64 v3, v3, v17, s0 ; MOVREL-NEXT: v_cndmask_b32_e64 v9, v9, v17, s3 ; MOVREL-NEXT: v_cndmask_b32_e64 v11, v11, v17, s4 @@ -4111,7 +4111,7 @@ define amdgpu_ps <7 x double> @dyn_insertelement_v7f64_v_v_v(<7 x double> %vec, ; MOVREL-NEXT: v_cmp_eq_u32_e64 s3, 4, v16 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s4, 5, v16 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s5, 6, v16 -; MOVREL-NEXT: v_mov_b32_e32 v17, v2 +; MOVREL-NEXT: v_mov_b32_e32 v19, v2 ; MOVREL-NEXT: 
v_cmp_eq_u32_e64 s0, 1, v16 ; MOVREL-NEXT: v_mov_b32_e32 v18, v3 ; MOVREL-NEXT: v_cndmask_b32_e64 v6, v6, v14, s2 @@ -4119,7 +4119,7 @@ define amdgpu_ps <7 x double> @dyn_insertelement_v7f64_v_v_v(<7 x double> %vec, ; MOVREL-NEXT: v_cndmask_b32_e64 v10, v10, v14, s4 ; MOVREL-NEXT: v_cndmask_b32_e64 v12, v12, v14, s5 ; MOVREL-NEXT: v_cndmask_b32_e64 v7, v7, v15, s2 -; MOVREL-NEXT: v_cndmask_b32_e64 v2, v17, v14, s0 +; MOVREL-NEXT: v_cndmask_b32_e64 v2, v19, v14, s0 ; MOVREL-NEXT: v_cndmask_b32_e64 v9, v9, v15, s3 ; MOVREL-NEXT: v_cndmask_b32_e64 v3, v18, v15, s0 ; MOVREL-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc_lo @@ -4251,42 +4251,42 @@ define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_s_v_s(<5 x double> inreg ; MOVREL-NEXT: s_mov_b32 s7, s9 ; MOVREL-NEXT: s_mov_b32 s8, s10 ; MOVREL-NEXT: s_mov_b32 s9, s11 -; MOVREL-NEXT: v_mov_b32_e32 v18, s15 -; MOVREL-NEXT: v_mov_b32_e32 v17, s14 -; MOVREL-NEXT: v_mov_b32_e32 v16, s13 -; MOVREL-NEXT: v_mov_b32_e32 v15, s12 -; MOVREL-NEXT: v_mov_b32_e32 v14, s11 -; MOVREL-NEXT: v_mov_b32_e32 v13, s10 -; MOVREL-NEXT: v_mov_b32_e32 v12, s9 -; MOVREL-NEXT: v_mov_b32_e32 v11, s8 -; MOVREL-NEXT: v_mov_b32_e32 v10, s7 -; MOVREL-NEXT: v_mov_b32_e32 v9, s6 -; MOVREL-NEXT: v_mov_b32_e32 v8, s5 -; MOVREL-NEXT: v_mov_b32_e32 v7, s4 -; MOVREL-NEXT: v_mov_b32_e32 v6, s3 -; MOVREL-NEXT: v_mov_b32_e32 v5, s2 -; MOVREL-NEXT: v_mov_b32_e32 v4, s1 -; MOVREL-NEXT: v_mov_b32_e32 v3, s0 +; MOVREL-NEXT: v_mov_b32_e32 v20, s15 +; MOVREL-NEXT: v_mov_b32_e32 v19, s14 +; MOVREL-NEXT: v_mov_b32_e32 v18, s13 +; MOVREL-NEXT: v_mov_b32_e32 v17, s12 +; MOVREL-NEXT: v_mov_b32_e32 v16, s11 +; MOVREL-NEXT: v_mov_b32_e32 v15, s10 +; MOVREL-NEXT: v_mov_b32_e32 v14, s9 +; MOVREL-NEXT: v_mov_b32_e32 v13, s8 +; MOVREL-NEXT: v_mov_b32_e32 v12, s7 +; MOVREL-NEXT: v_mov_b32_e32 v11, s6 +; MOVREL-NEXT: v_mov_b32_e32 v10, s5 +; MOVREL-NEXT: v_mov_b32_e32 v9, s4 +; MOVREL-NEXT: v_mov_b32_e32 v8, s3 +; MOVREL-NEXT: v_mov_b32_e32 v7, s2 +; MOVREL-NEXT: v_mov_b32_e32 v6, 
s1 +; MOVREL-NEXT: v_mov_b32_e32 v5, s0 ; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s12, 0 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, s12, 1 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s1, s12, 4 ; MOVREL-NEXT: ; implicit-def: $vcc_hi -; MOVREL-NEXT: v_cndmask_b32_e32 v2, v3, v0, vcc_lo -; MOVREL-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc_lo -; MOVREL-NEXT: v_cndmask_b32_e64 v4, v5, v0, s0 -; MOVREL-NEXT: v_cndmask_b32_e64 v5, v6, v1, s0 +; MOVREL-NEXT: v_cndmask_b32_e32 v2, v5, v0, vcc_lo +; MOVREL-NEXT: v_cndmask_b32_e32 v3, v6, v1, vcc_lo +; MOVREL-NEXT: v_cndmask_b32_e64 v4, v7, v0, s0 ; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s12, 2 +; MOVREL-NEXT: v_cndmask_b32_e64 v5, v8, v1, s0 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, s12, 3 ; MOVREL-NEXT: v_readfirstlane_b32 s2, v4 -; MOVREL-NEXT: v_readfirstlane_b32 s3, v5 -; MOVREL-NEXT: v_cndmask_b32_e32 v6, v7, v0, vcc_lo -; MOVREL-NEXT: v_cndmask_b32_e32 v7, v8, v1, vcc_lo -; MOVREL-NEXT: v_cndmask_b32_e64 v8, v9, v0, s0 -; MOVREL-NEXT: v_cndmask_b32_e64 v9, v10, v1, s0 -; MOVREL-NEXT: v_cndmask_b32_e64 v0, v11, v0, s1 -; MOVREL-NEXT: v_cndmask_b32_e64 v1, v12, v1, s1 +; MOVREL-NEXT: v_cndmask_b32_e32 v6, v9, v0, vcc_lo +; MOVREL-NEXT: v_cndmask_b32_e32 v7, v10, v1, vcc_lo +; MOVREL-NEXT: v_cndmask_b32_e64 v8, v11, v0, s0 +; MOVREL-NEXT: v_cndmask_b32_e64 v9, v12, v1, s0 +; MOVREL-NEXT: v_cndmask_b32_e64 v0, v13, v0, s1 +; MOVREL-NEXT: v_cndmask_b32_e64 v1, v14, v1, s1 ; MOVREL-NEXT: v_readfirstlane_b32 s0, v2 ; MOVREL-NEXT: v_readfirstlane_b32 s1, v3 +; MOVREL-NEXT: v_readfirstlane_b32 s3, v5 ; MOVREL-NEXT: v_readfirstlane_b32 s4, v6 ; MOVREL-NEXT: v_readfirstlane_b32 s5, v7 ; MOVREL-NEXT: v_readfirstlane_b32 s6, v8 @@ -4448,7 +4448,7 @@ define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_v_v_s(<5 x double> %vec, ; MOVREL-LABEL: dyn_insertelement_v5f64_v_v_s: ; MOVREL: ; %bb.0: ; %entry ; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 0 -; MOVREL-NEXT: v_mov_b32_e32 v13, v2 +; MOVREL-NEXT: v_mov_b32_e32 v15, v2 ; MOVREL-NEXT: v_mov_b32_e32 v14, v3 ; 
MOVREL-NEXT: ; implicit-def: $vcc_hi ; MOVREL-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo @@ -4457,7 +4457,7 @@ define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_v_v_s(<5 x double> %vec, ; MOVREL-NEXT: v_readfirstlane_b32 s0, v0 ; MOVREL-NEXT: v_readfirstlane_b32 s1, v1 ; MOVREL-NEXT: v_cndmask_b32_e32 v3, v14, v11, vcc_lo -; MOVREL-NEXT: v_cndmask_b32_e32 v2, v13, v10, vcc_lo +; MOVREL-NEXT: v_cndmask_b32_e32 v2, v15, v10, vcc_lo ; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 2 ; MOVREL-NEXT: v_readfirstlane_b32 s3, v3 ; MOVREL-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc_lo @@ -4514,7 +4514,7 @@ define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_v_v_v(<5 x double> %vec, ; MOVREL-LABEL: dyn_insertelement_v5f64_v_v_v: ; MOVREL: ; %bb.0: ; %entry ; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v12 -; MOVREL-NEXT: v_mov_b32_e32 v13, v2 +; MOVREL-NEXT: v_mov_b32_e32 v15, v2 ; MOVREL-NEXT: v_mov_b32_e32 v14, v3 ; MOVREL-NEXT: ; implicit-def: $vcc_hi ; MOVREL-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo @@ -4522,7 +4522,7 @@ define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_v_v_v(<5 x double> %vec, ; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12 ; MOVREL-NEXT: v_readfirstlane_b32 s0, v0 ; MOVREL-NEXT: v_readfirstlane_b32 s1, v1 -; MOVREL-NEXT: v_cndmask_b32_e32 v2, v13, v10, vcc_lo +; MOVREL-NEXT: v_cndmask_b32_e32 v2, v15, v10, vcc_lo ; MOVREL-NEXT: v_cndmask_b32_e32 v3, v14, v11, vcc_lo ; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v12 ; MOVREL-NEXT: v_readfirstlane_b32 s2, v2 diff --git a/llvm/test/CodeGen/AMDGPU/regbank-reassign-wave64.mir b/llvm/test/CodeGen/AMDGPU/regbank-reassign-wave64.mir new file mode 100644 index 0000000..49d6a9a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/regbank-reassign-wave64.mir @@ -0,0 +1,69 @@ +# RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-WavefrontSize32,+WavefrontSize64 -verify-machineinstrs -run-pass greedy,amdgpu-regbanks-reassign,virtregrewriter -o - %s | FileCheck -check-prefix=GCN %s + + +# Test that subreg reassignments are 
correctly handled when whole register also +# conflicts. If this is mishandled stall counts will be incorrect and cause an +# infinite loop. +# GCN-LABEL: vgpr64_mixed_use{{$}} +# GCN: $vgpr0_vgpr1 = IMPLICIT_DEF +# GCN: $vgpr4_vgpr5 = IMPLICIT_DEF +# GCN: $vcc = IMPLICIT_DEF +# GCN: $vgpr2_vgpr3 = IMPLICIT_DEF +# GCN: $vgpr6_vgpr7 = IMPLICIT_DEF +# GCN: $vgpr8_vgpr9_vgpr10_vgpr11 = IMPLICIT_DEF +# GCN: $vgpr12_vgpr13_vgpr14_vgpr15 = IMPLICIT_DEF +# GCN: $vgpr16_vgpr17_vgpr18_vgpr19 = IMPLICIT_DEF +# GCN: $vgpr20_vgpr21_vgpr22_vgpr23 = IMPLICIT_DEF +# GCN: $vgpr24_vgpr25_vgpr26_vgpr27 = IMPLICIT_DEF +# GCN: $vgpr28_vgpr29_vgpr30_vgpr31 = IMPLICIT_DEF +# GCN: $vgpr32_vgpr33_vgpr34_vgpr35 = IMPLICIT_DEF +# GCN: $vgpr36_vgpr37_vgpr38_vgpr39 = IMPLICIT_DEF +# GCN: $vgpr40_vgpr41_vgpr42_vgpr43 = IMPLICIT_DEF +# GCN: $vgpr44_vgpr45_vgpr46_vgpr47 = IMPLICIT_DEF +# GCN: $vgpr2 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr5, $vcc, implicit $exec +# GCN: $vgpr2 = V_CNDMASK_B32_e64 0, $vgpr0, 0, $vgpr4, killed $vcc, implicit $exec +# GCN: $sgpr0_sgpr1 = V_CMP_LT_U64_e64 $vgpr4_vgpr5, $vgpr0_vgpr1, implicit $exec +--- +name: vgpr64_mixed_use +tracksRegLiveness: true +registers: + - { id: 0, class: vreg_64, preferred-register: '$vgpr0_vgpr1' } + - { id: 1, class: vreg_64, preferred-register: '$vgpr4_vgpr5' } + - { id: 2, class: sreg_64_xexec, preferred-register: '$vcc' } + - { id: 3, class: vgpr_32 } + - { id: 4, class: vgpr_32 } + - { id: 5, class: sreg_64_xexec } + - { id: 6, class: vreg_64, preferred-register: '$vgpr2_vgpr3' } + - { id: 7, class: vreg_64, preferred-register: '$vgpr6_vgpr7' } + - { id: 8, class: vreg_128, preferred-register: '$vgpr8_vgpr9_vgpr10_vgpr11' } + - { id: 9, class: vreg_128, preferred-register: '$vgpr12_vgpr13_vgpr14_vgpr15' } + - { id: 10, class: vreg_128, preferred-register: '$vgpr16_vgpr17_vgpr18_vgpr19' } + - { id: 11, class: vreg_128, preferred-register: '$vgpr20_vgpr21_vgpr22_vgpr23' } + - { id: 12, class: vreg_128, preferred-register: 
'$vgpr24_vgpr25_vgpr26_vgpr27' } + - { id: 13, class: vreg_128, preferred-register: '$vgpr28_vgpr29_vgpr30_vgpr31' } + - { id: 14, class: vreg_128, preferred-register: '$vgpr32_vgpr33_vgpr34_vgpr35' } + - { id: 15, class: vreg_128, preferred-register: '$vgpr36_vgpr37_vgpr38_vgpr39' } + - { id: 16, class: vreg_128, preferred-register: '$vgpr40_vgpr41_vgpr42_vgpr43' } + - { id: 17, class: vreg_128, preferred-register: '$vgpr44_vgpr45_vgpr46_vgpr47' } +body: | + bb.0: + %0 = IMPLICIT_DEF + %1 = IMPLICIT_DEF + %2 = IMPLICIT_DEF + %6 = IMPLICIT_DEF + %7 = IMPLICIT_DEF + %8 = IMPLICIT_DEF + %9 = IMPLICIT_DEF + %10 = IMPLICIT_DEF + %11 = IMPLICIT_DEF + %12 = IMPLICIT_DEF + %13 = IMPLICIT_DEF + %14 = IMPLICIT_DEF + %15 = IMPLICIT_DEF + %16 = IMPLICIT_DEF + %17 = IMPLICIT_DEF + %3 = V_CNDMASK_B32_e64 0, %0.sub1, 0, %1.sub1, %2, implicit $exec + %4 = V_CNDMASK_B32_e64 0, %0.sub0, 0, %1.sub0, %2, implicit $exec + %5 = V_CMP_LT_U64_e64 %1, %0, implicit $exec + S_ENDPGM 0 +... diff --git a/llvm/test/CodeGen/AMDGPU/regbank-reassign.mir b/llvm/test/CodeGen/AMDGPU/regbank-reassign.mir index 0020e17..2078d8c 100644 --- a/llvm/test/CodeGen/AMDGPU/regbank-reassign.mir +++ b/llvm/test/CodeGen/AMDGPU/regbank-reassign.mir @@ -494,3 +494,81 @@ body: | %2 = V_AND_B32_e32 %1, %0, implicit $exec S_ENDPGM 0 ... + +# Test that bank of subreg is considered during scavenging. +# If handled incorrectly an infinite loop occurs. +# GCN-LABEL: s0_vs_s15_16_17_sub1{{$}} +# GCN: S_AND_B32 renamable $sgpr13, $sgpr0, +--- +name: s0_vs_s15_16_17_sub1 +tracksRegLiveness: true +registers: + - { id: 0, class: sgpr_96, preferred-register: '$sgpr15_sgpr16_sgpr17' } + - { id: 1, class: sgpr_32 } +body: | + bb.0: + %0 = IMPLICIT_DEF + $sgpr0 = IMPLICIT_DEF + %1 = S_AND_B32 %0.sub1, $sgpr0, implicit-def $scc + S_ENDPGM 0 +... + +# Test that the size of subreg is correctly handled in bank calculation. +# If handled incorrectly an infinite loop occurs. 
+# GCN-LABEL: vgpr_sub_dependence{{$}} +# GCN: $vgpr9_vgpr10_vgpr11_vgpr12 = IMPLICIT_DEF +# GCN: $vgpr16_vgpr17 = IMPLICIT_DEF +# GCN: $vgpr14_vgpr15 = IMPLICIT_DEF +# GCN: $vgpr0_vgpr1 = IMPLICIT_DEF +# GCN: $vgpr7_vgpr8 = IMPLICIT_DEF +# GCN: $vgpr3_vgpr4_vgpr5_vgpr6 = IMPLICIT_DEF +# GCN: $vgpr18_vgpr19 = IMPLICIT_DEF +# GCN: $vgpr20_vgpr21_vgpr22_vgpr23 = IMPLICIT_DEF +# GCN: $vgpr24_vgpr25_vgpr26_vgpr27 = IMPLICIT_DEF +# GCN: $vgpr28_vgpr29_vgpr30_vgpr31 = IMPLICIT_DEF +# GCN: $vgpr32_vgpr33_vgpr34_vgpr35 = IMPLICIT_DEF +# GCN: $vgpr36_vgpr37_vgpr38_vgpr39 = IMPLICIT_DEF +# GCN: $vgpr40_vgpr41_vgpr42_vgpr43 = IMPLICIT_DEF +# GCN: $vgpr44_vgpr45_vgpr46_vgpr47 = IMPLICIT_DEF +# GCN: $vgpr0_vgpr1 = V_ADD_F64 0, $vgpr11_vgpr12, 0, killed $vgpr16_vgpr17, 0, 0, implicit $mode, implicit $exec +# GCN: $vgpr0_vgpr1 = V_ADD_F64 0, $vgpr9_vgpr10, 0, killed $vgpr14_vgpr15, 0, 0, implicit $mode, implicit $exec +--- +name: vgpr_sub_dependence +tracksRegLiveness: true +registers: + - { id: 0, class: vreg_128, preferred-register: '$vgpr10_vgpr11_vgpr12_vgpr13' } + - { id: 1, class: vreg_64, preferred-register: '$vgpr16_vgpr17' } + - { id: 2, class: vreg_64, preferred-register: '$vgpr14_vgpr15' } + - { id: 3, class: vreg_64 } + - { id: 4, class: vreg_64 } + - { id: 5, class: vreg_64, preferred-register: '$vgpr0_vgpr1' } + - { id: 6, class: vreg_64, preferred-register: '$vgpr7_vgpr8' } + - { id: 7, class: vreg_128, preferred-register: '$vgpr3_vgpr4_vgpr5_vgpr6' } + - { id: 8, class: vreg_64, preferred-register: '$vgpr18_vgpr19' } + - { id: 9, class: vreg_128, preferred-register: '$vgpr20_vgpr21_vgpr22_vgpr23' } + - { id: 10, class: vreg_128, preferred-register: '$vgpr24_vgpr25_vgpr26_vgpr27' } + - { id: 11, class: vreg_128, preferred-register: '$vgpr28_vgpr29_vgpr30_vgpr31' } + - { id: 12, class: vreg_128, preferred-register: '$vgpr32_vgpr33_vgpr34_vgpr35' } + - { id: 13, class: vreg_128, preferred-register: '$vgpr36_vgpr37_vgpr38_vgpr39' } + - { id: 14, class: vreg_128, 
preferred-register: '$vgpr40_vgpr41_vgpr42_vgpr43' } + - { id: 15, class: vreg_128, preferred-register: '$vgpr44_vgpr45_vgpr46_vgpr47' } +body: | + bb.0: + %0 = IMPLICIT_DEF + %1 = IMPLICIT_DEF + %2 = IMPLICIT_DEF + %5 = IMPLICIT_DEF + %6 = IMPLICIT_DEF + %7 = IMPLICIT_DEF + %8 = IMPLICIT_DEF + %9 = IMPLICIT_DEF + %10 = IMPLICIT_DEF + %11 = IMPLICIT_DEF + %12 = IMPLICIT_DEF + %13 = IMPLICIT_DEF + %14 = IMPLICIT_DEF + %15 = IMPLICIT_DEF + %3 = V_ADD_F64 0, %0.sub2_sub3:vreg_128, 0, %1:vreg_64, 0, 0, implicit $mode, implicit $exec + %4 = V_ADD_F64 0, %0.sub0_sub1:vreg_128, 0, %2:vreg_64, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0 +... -- 2.7.4