.addReg(UnusedCarry, RegState::Define | RegState::Dead);
}
+MachineInstrBuilder SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ const DebugLoc &DL,
+ Register DestReg,
+ RegScavenger &RS) const {
+ if (ST.hasAddNoCarry())
+ return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg);
+
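+ // Targets without a no-carry add always define a carry-out, so scavenge a
+ // register to receive the unused carry bits (without allowing a spill).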
+ Register UnusedCarry = RS.scavengeRegister(RI.getBoolRC(), I, 0, false);
+ // TODO: Callers need to handle a failed scavenge themselves; for now,
+ // fail hard when no register is free for the unused carry-out.
+ if (!UnusedCarry.isValid())
+ report_fatal_error("failed to scavenge unused carry-out SGPR");
+
+ return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_I32_e64), DestReg)
+ .addReg(UnusedCarry, RegState::Define | RegState::Dead);
+}
+
bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
switch (Opcode) {
case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
const DebugLoc &DL,
unsigned DestReg) const;
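+ /// Build a partially-formed add instruction: uses a no-carry V_ADD on
+ /// subtargets that have one, and otherwise scavenges a register (via
+ /// \p RS) for the unused carry-out.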
+ MachineInstrBuilder getAddNoCarry(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ const DebugLoc &DL,
+ Register DestReg,
+ RegScavenger &RS) const;
+
static bool isKillTerminator(unsigned Opcode);
const MCInstrDesc &getKillTerminatorFromPseudo(unsigned Opcode) const;
bool SIRegisterInfo::requiresFrameIndexScavenging(
const MachineFunction &MF) const {
- const MachineFrameInfo &MFI = MF.getFrameInfo();
- if (MFI.hasStackObjects())
- return true;
-
- // May need to deal with callee saved registers.
- const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
- return !Info->isEntryFunction();
+ // Do not use frame virtual registers. They used to be used for SGPRs, but
+ // once we reach PrologEpilogInserter, we can no longer spill SGPRs. If the
+ // scavenger fails, we can increment/decrement the necessary SGPRs to avoid a
+ // spill.
+ return false;
}
bool SIRegisterInfo::requiresFrameIndexReplacementScavenging(
const MachineFunction &MF) const {
const MachineFrameInfo &MFI = MF.getFrameInfo();
- if (!MFI.hasStackObjects())
- return false;
-
- // The scavenger is used for large frames which may require finding a free
- // register for large offsets.
- if (!isUInt<12>(MFI.getStackSize()))
- return true;
-
- // If using scalar stores, for spills, m0 is needed for the scalar store
- // offset (pre-GFX9). m0 is unallocatable, so we can't create a virtual
- // register for it during frame index elimination, so the scavenger is
- // directly needed.
- return MF.getSubtarget<GCNSubtarget>().hasScalarStores() &&
- MF.getInfo<SIMachineFunctionInfo>()->hasSpilledSGPRs();
+ return MFI.hasStackObjects();
}
bool SIRegisterInfo::requiresVirtualBaseRegisters(
if (OnlyToVGPR && !SpillToVGPR)
return false;
- MachineRegisterInfo &MRI = MF->getRegInfo();
const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
if (SpillToSMEM) {
if (RS->isRegUsed(AMDGPU::M0)) {
- M0CopyReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ M0CopyReg = RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false);
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), M0CopyReg)
.addReg(AMDGPU::M0);
}
ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
+ // Scavenged temporary VGPR to use. It must be scavenged only once, and is
+ // then reused for any number of spilled subregs.
+ Register TmpVGPR;
+
// SubReg carries the "Kill" flag when SubReg == SuperReg.
unsigned SubKillState = getKillRegState((NumSubRegs == 1) && IsKill);
for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
// Spill SGPR to a frame index.
// TODO: Should VI try to spill to VGPR and then spill to SMEM?
- Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ if (!TmpVGPR.isValid())
+ TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
MachineInstrBuilder Mov
- = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
+ = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
.addReg(SubReg, SubKillState);
-
// There could be undef components of a spilled super register.
// TODO: Can we detect this and skip the spill?
if (NumSubRegs > 1) {
= MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
EltSize, MinAlign(Align, EltSize * i));
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_SAVE))
- .addReg(TmpReg, RegState::Kill) // src
+ .addReg(TmpVGPR, RegState::Kill) // src
.addFrameIndex(Index) // vaddr
.addReg(MFI->getScratchRSrcReg()) // srsrc
.addReg(MFI->getStackPtrOffsetReg()) // soffset
RegScavenger *RS,
bool OnlyToVGPR) const {
MachineFunction *MF = MI->getParent()->getParent();
- MachineRegisterInfo &MRI = MF->getRegInfo();
MachineBasicBlock *MBB = MI->getParent();
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
if (SpillToSMEM) {
if (RS->isRegUsed(AMDGPU::M0)) {
- M0CopyReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ M0CopyReg = RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false);
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), M0CopyReg)
.addReg(AMDGPU::M0);
}
// SubReg carries the "Kill" flag when SubReg == SuperReg.
int64_t FrOffset = FrameInfo.getObjectOffset(Index);
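+ // Scavenged temporary VGPR for the restore, likewise scavenged at most
+ // once and reused for all restored subregs.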
+ Register TmpVGPR;
+
for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
Register SubReg =
NumSubRegs == 1 ? SuperReg : getSubReg(SuperReg, SplitParts[i]);
// Restore SGPR from a stack slot.
// FIXME: We should use S_LOAD_DWORD here for VI.
- Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ if (!TmpVGPR.isValid())
+ TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
unsigned Align = FrameInfo.getObjectAlignment(Index);
MachinePointerInfo PtrInfo
MachineMemOperand::MOLoad, EltSize,
MinAlign(Align, EltSize * i));
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_RESTORE), TmpReg)
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_RESTORE), TmpVGPR)
.addFrameIndex(Index) // vaddr
.addReg(MFI->getScratchRSrcReg()) // srsrc
.addReg(MFI->getStackPtrOffsetReg()) // soffset
auto MIB =
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg)
- .addReg(TmpReg, RegState::Kill);
+ .addReg(TmpVGPR, RegState::Kill);
if (NumSubRegs > 1)
MIB.addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
int SPAdj, unsigned FIOperandNum,
RegScavenger *RS) const {
MachineFunction *MF = MI->getParent()->getParent();
- MachineRegisterInfo &MRI = MF->getRegInfo();
MachineBasicBlock *MBB = MI->getParent();
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
MachineFrameInfo &FrameInfo = MF->getFrameInfo();
// In an entry function/kernel the offset is already the absolute
// address relative to the frame register.
- Register DiffReg =
- MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ Register TmpDiffReg =
+ RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false);
+
+ // If there's no free SGPR, modify the FP in place and restore it below.
+ Register DiffReg = TmpDiffReg.isValid() ? TmpDiffReg : FrameReg;
bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32;
Register ResultReg = IsCopy ?
MI->getOperand(0).getReg() :
- MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
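+ // Compute the FP's offset from the wave scratch base. If no SGPR could
+ // be scavenged, this clobbers the FP, which is restored afterwards.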
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), DiffReg)
.addReg(FrameReg)
.addReg(MFI->getScratchWaveOffsetReg());
} else {
Register ScaledReg =
- MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
+ // FIXME: Assumed VGPR use.
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ScaledReg)
.addImm(Log2_32(ST.getWavefrontSize()))
.addReg(DiffReg, RegState::Kill);
// TODO: Fold if use instruction is another add of a constant.
if (AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) {
- TII->getAddNoCarry(*MBB, MI, DL, ResultReg)
+ // FIXME: This can fail when no carry-out register can be scavenged.
+ TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)
.addImm(Offset)
.addReg(ScaledReg, RegState::Kill)
.addImm(0); // clamp bit
} else {
Register ConstOffsetReg =
- MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false);
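+ // FIXME: Like the add above, this scavenge can fail when no SGPR is free.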
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), ConstOffsetReg)
.addImm(Offset);
- TII->getAddNoCarry(*MBB, MI, DL, ResultReg)
+ TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)
.addReg(ConstOffsetReg, RegState::Kill)
.addReg(ScaledReg, RegState::Kill)
.addImm(0); // clamp bit
}
}
+ if (!TmpDiffReg.isValid()) {
+ // Restore the FP.
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), FrameReg)
+ .addReg(FrameReg)
+ .addReg(MFI->getScratchWaveOffsetReg());
+ }
+
// Don't introduce an extra copy if we're just materializing in a mov.
if (IsCopy)
MI->eraseFromParent();
int64_t Offset = FrameInfo.getObjectOffset(Index);
FIOp.ChangeToImmediate(Offset);
if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) {
- Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
.addImm(Offset);
FIOp.ChangeToRegister(TmpReg, false, false, true);
; Materialize into a mov. Make sure there isn't an unnecessary copy.
; GCN-LABEL: {{^}}func_mov_fi_i32:
; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN: s_sub_u32 [[SUB:s[0-9]+]], s32, s33
+; GCN: s_sub_u32 [[SUB:s[0-9]+|vcc_lo|vcc_hi]], s32, s33
; CI-NEXT: v_lshr_b32_e64 v0, [[SUB]], 6
; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, [[SUB]]
; GCN-LABEL: {{^}}func_mov_fi_i32_offset:
; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI: s_sub_u32 [[SUB:s[0-9]+]], s32, s33
-; CI-NEXT: v_lshr_b32_e64 v0, [[SUB]], 6
-
-; CI: s_sub_u32 [[SUB:s[0-9]+]], s32, s33
-; CI-NEXT: v_lshr_b32_e64 [[SCALED:v[0-9]+]], [[SUB]], 6
+; CI: s_sub_u32 [[SUB0:s[0-9]+|vcc_lo|vcc_hi]], s32, s33
+; CI-NEXT: s_sub_u32 [[SUB1:s[0-9]+|vcc_lo|vcc_hi]], s32, s33
+; CI-NEXT: v_lshr_b32_e64 [[SCALED:v[0-9]+]], [[SUB1]], 6
+; CI-NEXT: v_lshr_b32_e64 v0, [[SUB0]], 6
; CI-NEXT: v_add_i32_e64 v1, s{{\[[0-9]+:[0-9]+\]}}, 4, [[SCALED]]
; CI-NOT: v_mov
; CI: ds_write_b32 v0, v0
; CI-NEXT: ds_write_b32 v0, v1
-; GFX9: s_sub_u32 [[SUB:s[0-9]+]], s32, s33
-; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, [[SUB]]
+; GFX9: s_sub_u32 [[SUB0:s[0-9]+|vcc_lo|vcc_hi]], s32, s33
+; GFX9-NEXT: s_sub_u32 [[SUB1:s[0-9]+|vcc_lo|vcc_hi]], s32, s33
+; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, [[SUB0]]
+; GFX9-NEXT: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, [[SUB1]]
; GFX9-DAG: ds_write_b32 v0, v0
-
-; GFX9-DAG: s_sub_u32 [[SUB:s[0-9]+]], s32, s33
-; GFX9-NEXT: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, [[SUB]]
; GFX9-NEXT: v_add_u32_e32 v0, 4, [[SCALED]]
; GFX9-NEXT: ds_write_b32 v0, v0
define void @func_mov_fi_i32_offset() #0 {
; GCN-LABEL: {{^}}func_add_constant_to_fi_i32:
; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN: s_sub_u32 [[SUB:s[0-9]+]], s32, s33
+; GCN: s_sub_u32 [[SUB:s[0-9]+|vcc_lo|vcc_hi]], s32, s33
; CI-NEXT: v_lshr_b32_e64 [[SCALED:v[0-9]+]], [[SUB]], 6
; CI-NEXT: v_add_i32_e32 v0, vcc, 4, [[SCALED]]
; into.
; GCN-LABEL: {{^}}func_other_fi_user_i32:
-; GCN: s_sub_u32 [[SUB:s[0-9]+]], s32, s33
+; GCN: s_sub_u32 [[SUB:s[0-9]+|vcc_lo|vcc_hi]], s32, s33
; CI-NEXT: v_lshr_b32_e64 v0, [[SUB]], 6
; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_ptr:
; GCN: s_waitcnt
-; GCN-NEXT: s_sub_u32 [[SUB_OFFSET:s[0-9]+]], s32, s33
+; GCN-NEXT: s_sub_u32 [[SUB_OFFSET:s[0-9]+|vcc_lo|vcc_hi]], s32, s33
; CI-NEXT: v_lshr_b32_e64 [[SHIFT:v[0-9]+]], [[SUB_OFFSET]], 6
; CI-NEXT: v_or_b32_e32 v0, 4, [[SHIFT]]
; Added offset can't be used with VOP3 add
; GCN-LABEL: {{^}}func_other_fi_user_non_inline_imm_offset_i32:
-; GCN: s_sub_u32 [[SUB:s[0-9]+]], s32, s33
-; GCN-DAG: s_movk_i32 [[K:s[0-9]+]], 0x200
+; GCN: s_sub_u32 [[SUB:s[0-9]+|vcc_lo|vcc_hi]], s32, s33
+; GCN-DAG: s_movk_i32 [[K:s[0-9]+|vcc_lo|vcc_hi]], 0x200
; CI-DAG: v_lshr_b32_e64 [[SCALED:v[0-9]+]], [[SUB]], 6
-; CI: v_add_i32_e64 [[VZ:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, [[K]], [[SCALED]]
+; CI: v_add_i32_e32 [[VZ:v[0-9]+]], vcc, [[K]], [[SCALED]]
; GFX9-DAG: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, [[SUB]]
; GFX9: v_add_u32_e32 [[VZ:v[0-9]+]], [[K]], [[SCALED]]
; GCN-LABEL: {{^}}alloca_ptr_nonentry_block:
; GCN: s_and_saveexec_b64
; GCN: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4
-; GCN: s_sub_u32 [[SUB_OFFSET:s[0-9]+]], s32, s33
+; GCN: s_sub_u32 [[SUB_OFFSET:s[0-9]+|vcc_lo|vcc_hi]], s32, s33
; CI: v_lshr_b32_e64 [[SHIFT:v[0-9]+]], [[SUB_OFFSET]], 6
; CI-NEXT: v_or_b32_e32 [[PTR:v[0-9]+]], 4, [[SHIFT]]
--- /dev/null
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck %s
+
+# Frame virtual SGPRs should not be used: by the time PrologEpilogInserter
+# runs, the register scavenger can no longer usefully spill them. Spilling
+# is also worse than incrementing and restoring a frame register in place.
+# There should be no spills remaining.
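+# No SGPR is free at the frame index below (sgpr27 holds the saved FP and
+# the rest are clobbered), so the FP is modified in place and restored.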
+
+---
+name: scavenge_sgpr_pei
+tracksRegLiveness: true
+
+stack:
+ - { id: 0, type: default, size: 4, alignment: 4096 }
+
+machineFunctionInfo:
+ isEntryFunction: false
+ scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
+ scratchWaveOffsetReg: $sgpr34
+ frameOffsetReg: $sgpr33
+ stackPtrOffsetReg: $sgpr32
+
+body: |
+ bb.0:
+ liveins: $vgpr1
+
+ ; CHECK-LABEL: name: scavenge_sgpr_pei
+ ; CHECK: liveins: $vgpr1
+ ; CHECK: $sgpr27 = frame-setup COPY $sgpr33
+ ; CHECK: $sgpr4 = frame-setup S_ADD_U32 $sgpr32, 262080, implicit-def $scc
+ ; CHECK: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294705152, implicit-def $scc
+ ; CHECK: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 524288, implicit-def $scc
+ ; CHECK: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc
+ ; CHECK: $sgpr33 = S_SUB_U32 $sgpr33, $sgpr34, implicit-def $scc
+ ; CHECK: $vgpr2 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec
+ ; CHECK: $sgpr33 = S_ADD_U32 $sgpr33, $sgpr34, implicit-def $scc
+ ; CHECK: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31
+ ; CHECK: $sgpr32 = frame-destroy S_SUB_U32 $sgpr32, 524288, implicit-def $scc
+ ; CHECK: $sgpr33 = frame-setup COPY $sgpr27
+ ; CHECK: S_ENDPGM 0, implicit $vcc
+ S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc
+ $vgpr0 = V_OR_B32_e32 %stack.0, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31
+ S_ENDPGM 0, implicit $vcc
+...
; GCN: ; clobber m0
-; TOSMEM: s_mov_b32 s2, m0
+; TOSMEM: s_mov_b32 vcc_hi, m0
; TOSMEM: s_add_u32 m0, s3, 0x100
; TOSMEM-NEXT: s_buffer_store_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Spill
-; TOSMEM: s_mov_b32 m0, s2
+; TOSMEM: s_mov_b32 m0, vcc_hi
; TOSMEM: s_mov_b64 exec,
; TOSMEM: s_cbranch_execz
; TOSMEM: s_mov_b32 m0, -1
-; TOSMEM: s_mov_b32 s0, m0
+; TOSMEM: s_mov_b32 vcc_hi, m0
; TOSMEM: s_add_u32 m0, s3, 0x200
; TOSMEM: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[88:91], m0 ; 8-byte Folded Reload
-; TOSMEM: s_mov_b32 m0, s0
+; TOSMEM: s_mov_b32 m0, vcc_hi
; TOSMEM: s_waitcnt lgkmcnt(0)
; TOSMEM: ds_write_b64