return MCRegister();
}
-static MCPhysReg findUnusedSGPRNonCalleeSaved(MachineRegisterInfo &MRI) {
- LivePhysRegs LiveRegs;
- LiveRegs.init(*MRI.getTargetRegisterInfo());
- return findScratchNonCalleeSaveRegister(
- MRI, LiveRegs, AMDGPU::SReg_32_XM0_XEXECRegClass, true);
+static void getVGPRSpillLaneOrTempRegister(MachineFunction &MF,
+ LivePhysRegs &LiveRegs,
+ Register &TempSGPR,
+ Optional<int> &FrameIndex,
+ bool IsFP) {
+ SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+
+#ifndef NDEBUG
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+#endif
+
+ // We need to save and restore the current FP/BP.
+
+ // 1: If there is already a VGPR with free lanes, use it. We
+ // may already have to pay the penalty for spilling a CSR VGPR.
+ if (MFI->haveFreeLanesForSGPRSpill(MF, 1)) {
+ int NewFI = FrameInfo.CreateStackObject(4, 4, true, nullptr,
+ TargetStackID::SGPRSpill);
+
+ if (!MFI->allocateSGPRSpillToVGPR(MF, NewFI))
+ llvm_unreachable("allocate SGPR spill should have worked");
+
+ FrameIndex = NewFI;
+
+ LLVM_DEBUG(auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
+ dbgs() << "Spilling " << (IsFP ? "FP" : "BP") << " to "
+ << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane
+ << '\n');
+ return;
+ }
+
+ // 2: Next, try to save the FP/BP in an unused SGPR.
+ TempSGPR = findScratchNonCalleeSaveRegister(
+ MF.getRegInfo(), LiveRegs, AMDGPU::SReg_32_XM0_XEXECRegClass, true);
+
+ if (!TempSGPR) {
+ int NewFI = FrameInfo.CreateStackObject(4, 4, true, nullptr,
+ TargetStackID::SGPRSpill);
+
+ if (MFI->allocateSGPRSpillToVGPR(MF, NewFI)) {
+ // 3: There's no free lane to spill, and no free register to save FP/BP,
+ // so we're forced to spill another VGPR to use for the spill.
+ FrameIndex = NewFI;
+ } else {
+ // 4: If all else fails, spill the FP/BP to memory.
+ FrameIndex = FrameInfo.CreateSpillStackObject(4, Align(4));
+ }
+
+ LLVM_DEBUG(auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
+ dbgs() << (IsFP ? "FP" : "BP") << " requires fallback spill to "
+ << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane
+ << '\n';);
+ } else {
+ LLVM_DEBUG(dbgs() << "Saving " << (IsFP ? "FP" : "BP") << " with copy to "
+ << printReg(TempSGPR, TRI) << '\n');
+ }
}
// We need to specially emit stack operations here because a different frame
LiveRegs.addLiveIns(MBB);
if (FuncInfo->SGPRForFPSaveRestoreCopy)
LiveRegs.removeReg(FuncInfo->SGPRForFPSaveRestoreCopy);
+
+ if (FuncInfo->SGPRForBPSaveRestoreCopy)
+ LiveRegs.removeReg(FuncInfo->SGPRForBPSaveRestoreCopy);
} else {
// In epilog.
LiveRegs.init(*ST.getRegisterInfo());
Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
Register FramePtrReg = FuncInfo->getFrameOffsetReg();
+ Register BasePtrReg =
+ TRI.hasBasePointer(MF) ? TRI.getBaseRegister() : Register();
LivePhysRegs LiveRegs;
MachineBasicBlock::iterator MBBI = MBB.begin();
DebugLoc DL;
bool HasFP = false;
+ bool HasBP = false;
uint32_t NumBytes = MFI.getStackSize();
uint32_t RoundedSize = NumBytes;
// To avoid clobbering VGPRs in lanes that weren't active on function entry,
TargetStackID::SGPRSpill;
}
+ bool HasBPSaveIndex = FuncInfo->BasePointerSaveIndex.hasValue();
+ bool SpillBPToMemory = false;
+ // A StackID of SGPRSpill implies that this is a spill from SGPR to VGPR.
+ // Otherwise we are spilling the BP to memory.
+ if (HasBPSaveIndex) {
+ SpillBPToMemory = MFI.getStackID(*FuncInfo->BasePointerSaveIndex) !=
+ TargetStackID::SGPRSpill;
+ }
+
// Emit the copy if we need an FP, and are using a free SGPR to save it.
if (FuncInfo->SGPRForFPSaveRestoreCopy) {
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FuncInfo->SGPRForFPSaveRestoreCopy)
.addReg(FramePtrReg)
.setMIFlag(MachineInstr::FrameSetup);
- // Make the register live throughout the function.
- for (MachineBasicBlock &MBB : MF)
- MBB.addLiveIn(FuncInfo->SGPRForFPSaveRestoreCopy);
+ }
+
+ // Emit the copy if we need a BP, and are using a free SGPR to save it.
+ if (FuncInfo->SGPRForBPSaveRestoreCopy) {
+ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY),
+ FuncInfo->SGPRForBPSaveRestoreCopy)
+ .addReg(BasePtrReg)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+
+  // If a copy has been emitted for FP and/or BP, make the SGPRs
+ // used in the copy instructions live throughout the function.
+ SmallVector<MCPhysReg, 2> TempSGPRs;
+ if (FuncInfo->SGPRForFPSaveRestoreCopy)
+ TempSGPRs.push_back(FuncInfo->SGPRForFPSaveRestoreCopy);
+
+ if (FuncInfo->SGPRForBPSaveRestoreCopy)
+ TempSGPRs.push_back(FuncInfo->SGPRForBPSaveRestoreCopy);
+
+ if (!TempSGPRs.empty()) {
+ for (MachineBasicBlock &MBB : MF) {
+ for (MCPhysReg Reg : TempSGPRs)
+ MBB.addLiveIn(Reg);
+
+ MBB.sortUniqueLiveIns();
+ }
}
for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
FuncInfo->FramePointerSaveIndex.getValue());
}
+ if (HasBPSaveIndex && SpillBPToMemory) {
+ assert(!MFI.isDeadObjectIndex(*FuncInfo->BasePointerSaveIndex));
+
+ if (!ScratchExecCopy)
+ ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true);
+
+ MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
+ MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
+
+ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
+ .addReg(BasePtrReg);
+
+ buildPrologSpill(LiveRegs, MBB, MBBI, TII, TmpVGPR,
+ FuncInfo->getScratchRSrcReg(), StackPtrReg,
+ *FuncInfo->BasePointerSaveIndex);
+ }
+
if (ScratchExecCopy) {
// FIXME: Split block and make terminator.
unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
.addReg(Spill[0].VGPR, RegState::Undef);
}
+ // In this case, spill the BP to a reserved VGPR.
+ if (HasBPSaveIndex && !SpillBPToMemory) {
+ const int BasePtrFI = *FuncInfo->BasePointerSaveIndex;
+ assert(!MFI.isDeadObjectIndex(BasePtrFI));
+
+ assert(MFI.getStackID(BasePtrFI) == TargetStackID::SGPRSpill);
+ ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill =
+ FuncInfo->getSGPRToVGPRSpills(BasePtrFI);
+ assert(Spill.size() == 1);
+
+ // Save BP before setting it up.
+ // FIXME: This should respect spillSGPRToVGPR;
+ BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
+ Spill[0].VGPR)
+ .addReg(BasePtrReg)
+ .addImm(Spill[0].Lane)
+ .addReg(Spill[0].VGPR, RegState::Undef);
+ }
+
if (TRI.needsStackRealignment(MF)) {
HasFP = true;
const unsigned Alignment = MFI.getMaxAlign().value();
LiveRegs.init(TRI);
LiveRegs.addLiveIns(MBB);
LiveRegs.addReg(FuncInfo->SGPRForFPSaveRestoreCopy);
+ LiveRegs.addReg(FuncInfo->SGPRForBPSaveRestoreCopy);
}
Register ScratchSPReg = findScratchNonCalleeSaveRegister(
MRI, LiveRegs, AMDGPU::SReg_32_XM0RegClass);
- assert(ScratchSPReg && ScratchSPReg != FuncInfo->SGPRForFPSaveRestoreCopy);
+ assert(ScratchSPReg && ScratchSPReg != FuncInfo->SGPRForFPSaveRestoreCopy &&
+ ScratchSPReg != FuncInfo->SGPRForBPSaveRestoreCopy);
// s_add_u32 tmp_reg, s32, NumBytes
// s_and_b32 s32, tmp_reg, 0b111...0000
.setMIFlag(MachineInstr::FrameSetup);
FuncInfo->setIsStackRealigned(true);
} else if ((HasFP = hasFP(MF))) {
- // If we need a base pointer, set it up here. It's whatever the value of
- // the stack pointer is at this point. Any variable size objects will be
- // allocated after this, so we can still use the base pointer to reference
- // locals.
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
.addReg(StackPtrReg)
.setMIFlag(MachineInstr::FrameSetup);
}
+ // If we need a base pointer, set it up here. It's whatever the value of
+ // the stack pointer is at this point. Any variable size objects will be
+ // allocated after this, so we can still use the base pointer to reference
+ // the incoming arguments.
+ if ((HasBP = TRI.hasBasePointer(MF))) {
+ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), BasePtrReg)
+ .addReg(StackPtrReg)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+
if (HasFP && RoundedSize != 0) {
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), StackPtrReg)
.addReg(StackPtrReg)
assert((HasFP || (!FuncInfo->SGPRForFPSaveRestoreCopy &&
!FuncInfo->FramePointerSaveIndex)) &&
"Saved FP but didn't need it");
+
+ assert((!HasBP || (FuncInfo->SGPRForBPSaveRestoreCopy ||
+ FuncInfo->BasePointerSaveIndex)) &&
+ "Needed to save BP but didn't save it anywhere");
+
+ assert((HasBP || (!FuncInfo->SGPRForBPSaveRestoreCopy &&
+ !FuncInfo->BasePointerSaveIndex)) &&
+ "Saved BP but didn't need it");
}
void SIFrameLowering::emitEpilogue(MachineFunction &MF,
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
MachineRegisterInfo &MRI = MF.getRegInfo();
+ const SIRegisterInfo &TRI = TII->getRegisterInfo();
MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
LivePhysRegs LiveRegs;
DebugLoc DL;
: NumBytes;
const Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
const Register FramePtrReg = FuncInfo->getFrameOffsetReg();
+ const Register BasePtrReg =
+ TRI.hasBasePointer(MF) ? TRI.getBaseRegister() : Register();
bool HasFPSaveIndex = FuncInfo->FramePointerSaveIndex.hasValue();
bool SpillFPToMemory = false;
TargetStackID::SGPRSpill;
}
+ bool HasBPSaveIndex = FuncInfo->BasePointerSaveIndex.hasValue();
+ bool SpillBPToMemory = false;
+ if (HasBPSaveIndex) {
+ SpillBPToMemory = MFI.getStackID(*FuncInfo->BasePointerSaveIndex) !=
+ TargetStackID::SGPRSpill;
+ }
+
if (RoundedSize != 0 && hasFP(MF)) {
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_SUB_U32), StackPtrReg)
.addReg(StackPtrReg)
.setMIFlag(MachineInstr::FrameSetup);
}
+ if (FuncInfo->SGPRForBPSaveRestoreCopy) {
+ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), BasePtrReg)
+ .addReg(FuncInfo->SGPRForBPSaveRestoreCopy)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+
Register ScratchExecCopy;
if (HasFPSaveIndex) {
const int FI = FuncInfo->FramePointerSaveIndex.getValue();
}
}
+ if (HasBPSaveIndex) {
+ const int BasePtrFI = *FuncInfo->BasePointerSaveIndex;
+ assert(!MFI.isDeadObjectIndex(BasePtrFI));
+ if (SpillBPToMemory) {
+ if (!ScratchExecCopy)
+ ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, false);
+
+ MCPhysReg TempVGPR = findScratchNonCalleeSaveRegister(
+ MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
+ buildEpilogReload(LiveRegs, MBB, MBBI, TII, TempVGPR,
+ FuncInfo->getScratchRSrcReg(), StackPtrReg, BasePtrFI);
+ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), BasePtrReg)
+ .addReg(TempVGPR, RegState::Kill);
+ } else {
+ // Reload from VGPR spill.
+ assert(MFI.getStackID(BasePtrFI) == TargetStackID::SGPRSpill);
+ ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill =
+ FuncInfo->getSGPRToVGPRSpills(BasePtrFI);
+ assert(Spill.size() == 1);
+ BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
+ BasePtrReg)
+ .addReg(Spill[0].VGPR)
+ .addImm(Spill[0].Lane);
+ }
+ }
+
for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg :
FuncInfo->getSGPRSpillVGPRs()) {
if (!Reg.FI.hasValue())
#ifndef NDEBUG
static bool allSGPRSpillsAreDead(const MachineFrameInfo &MFI,
- Optional<int> FramePointerSaveIndex) {
+ Optional<int> FramePointerSaveIndex,
+ Optional<int> BasePointerSaveIndex) {
for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
I != E; ++I) {
if (!MFI.isDeadObjectIndex(I) &&
MFI.getStackID(I) == TargetStackID::SGPRSpill &&
- FramePointerSaveIndex && I != FramePointerSaveIndex) {
+ ((FramePointerSaveIndex && I != FramePointerSaveIndex) ||
+ (BasePointerSaveIndex && I != BasePointerSaveIndex))) {
return false;
}
}
SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
FuncInfo->removeDeadFrameIndices(MFI);
- assert(allSGPRSpillsAreDead(MFI, None) &&
+ assert(allSGPRSpillsAreDead(MFI, None, None) &&
"SGPR spill should have been removed in SILowerSGPRSpills");
// FIXME: The other checks should be redundant with allStackObjectsAreDead,
for (auto SSpill : MFI->getSGPRSpillVGPRs())
SavedVGPRs.reset(SSpill.VGPR);
- const bool HasFP = WillHaveFP || hasFP(MF);
- if (!HasFP)
- return;
-
- // We need to save and restore the current FP.
-
- // 1: If there is already a VGPR with free lanes, use it. We
- // may already have to pay the penalty for spilling a CSR VGPR.
- if (MFI->haveFreeLanesForSGPRSpill(MF, 1)) {
- int NewFI = MF.getFrameInfo().CreateStackObject(4, 4, true, nullptr,
- TargetStackID::SGPRSpill);
-
- if (!MFI->allocateSGPRSpillToVGPR(MF, NewFI))
- llvm_unreachable("allocate SGPR spill should have worked");
-
- MFI->FramePointerSaveIndex = NewFI;
+ LivePhysRegs LiveRegs;
+ LiveRegs.init(*TRI);
- LLVM_DEBUG(
- auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
- dbgs() << "Spilling FP to " << printReg(Spill.VGPR, TRI)
- << ':' << Spill.Lane << '\n');
- return;
+ if (WillHaveFP || hasFP(MF)) {
+ getVGPRSpillLaneOrTempRegister(MF, LiveRegs, MFI->SGPRForFPSaveRestoreCopy,
+ MFI->FramePointerSaveIndex, true);
}
- // 2: Next, try to save the FP in an unused SGPR.
- MFI->SGPRForFPSaveRestoreCopy = findUnusedSGPRNonCalleeSaved(MF.getRegInfo());
-
- if (!MFI->SGPRForFPSaveRestoreCopy) {
- int NewFI = MF.getFrameInfo().CreateStackObject(4, 4, true, nullptr,
- TargetStackID::SGPRSpill);
-
- if (MFI->allocateSGPRSpillToVGPR(MF, NewFI)) {
- // 3: There's no free lane to spill, and no free register to save FP, so
- // we're forced to spill another VGPR to use for the spill.
- MFI->FramePointerSaveIndex = NewFI;
- } else {
- // 4: If all else fails, spill the FP to memory.
- MFI->FramePointerSaveIndex =
- FrameInfo.CreateSpillStackObject(4, Align(4));
- }
-
- LLVM_DEBUG(
- auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
- dbgs() << "FP requires fallback spill to " << printReg(Spill.VGPR, TRI)
- << ':' << Spill.Lane << '\n';);
- } else {
- LLVM_DEBUG(dbgs() << "Saving FP with copy to " <<
- printReg(MFI->SGPRForFPSaveRestoreCopy, TRI) << '\n');
+ if (TRI->hasBasePointer(MF)) {
+ if (MFI->SGPRForFPSaveRestoreCopy)
+ LiveRegs.addReg(MFI->SGPRForFPSaveRestoreCopy);
+ getVGPRSpillLaneOrTempRegister(MF, LiveRegs, MFI->SGPRForBPSaveRestoreCopy,
+ MFI->BasePointerSaveIndex, false);
}
}
return true; // Early exit if no callee saved registers are modified!
const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
- if (!FuncInfo->SGPRForFPSaveRestoreCopy)
+ if (!FuncInfo->SGPRForFPSaveRestoreCopy &&
+ !FuncInfo->SGPRForBPSaveRestoreCopy)
return false;
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ const SIRegisterInfo *RI = ST.getRegisterInfo();
+ Register FramePtrReg = FuncInfo->getFrameOffsetReg();
+ Register BasePtrReg = RI->getBaseRegister();
+ unsigned NumModifiedRegs = 0;
+
+ if (FuncInfo->SGPRForFPSaveRestoreCopy)
+ NumModifiedRegs++;
+ if (FuncInfo->SGPRForBPSaveRestoreCopy)
+ NumModifiedRegs++;
+
for (auto &CS : CSI) {
- if (CS.getReg() == FuncInfo->getFrameOffsetReg()) {
- if (FuncInfo->SGPRForFPSaveRestoreCopy)
- CS.setDstReg(FuncInfo->SGPRForFPSaveRestoreCopy);
- break;
+ if (CS.getReg() == FramePtrReg && FuncInfo->SGPRForFPSaveRestoreCopy) {
+ CS.setDstReg(FuncInfo->SGPRForFPSaveRestoreCopy);
+ if (--NumModifiedRegs)
+ break;
+ } else if (CS.getReg() == BasePtrReg &&
+ FuncInfo->SGPRForBPSaveRestoreCopy) {
+ CS.setDstReg(FuncInfo->SGPRForBPSaveRestoreCopy);
+ if (--NumModifiedRegs)
+ break;
}
}
ret void
}
+declare void @extern_func(<32 x i32>, i32) #0
+define void @func_call_align1024_bp_gets_vgpr_spill(<32 x i32> %a, i32 %b) #0 {
+; The test forces the stack to be realigned to a new boundary
+; since there is a local object with an alignment of 1024.
+; Should use BP to access the incoming stack arguments.
+; The BP value is saved/restored with a VGPR spill.
+
+; GCN-LABEL: func_call_align1024_bp_gets_vgpr_spill:
+; GCN: buffer_store_dword [[VGPR_REG:v[0-9]+]], off, s[0:3], s32 offset:1028 ; 4-byte Folded Spill
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: v_writelane_b32 [[VGPR_REG]], s33, 2
+; GCN-NEXT: v_writelane_b32 [[VGPR_REG]], s34, 3
+; GCN: s_add_u32 [[SCRATCH_REG:s[0-9]+]], s32, 0xffc0
+; GCN: s_and_b32 s33, [[SCRATCH_REG]], 0xffff0000
+
+; GCN: s_mov_b32 s34, s32
+; GCN-NEXT: v_mov_b32_e32 v32, 0
+
+; GCN: buffer_store_dword v32, off, s[0:3], s33 offset:1024
+; GCN-NEXT: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s34
+; GCN-NEXT: s_add_u32 s32, s32, 0x30000
+
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32
+; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
+
+; GCN: v_readlane_b32 s33, [[VGPR_REG]], 2
+; GCN-NEXT: s_sub_u32 s32, s32, 0x30000
+; GCN-NEXT: v_readlane_b32 s34, [[VGPR_REG]], 3
+; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
+; GCN-NEXT: buffer_load_dword [[VGPR_REG]], off, s[0:3], s32 offset:1028 ; 4-byte Folded Reload
+; GCN-NEXT: s_mov_b64 exec, s[6:7]
+ %temp = alloca i32, align 1024, addrspace(5)
+ store volatile i32 0, i32 addrspace(5)* %temp, align 1024
+ call void @extern_func(<32 x i32> %a, i32 %b)
+ ret void
+}
+
+%struct.Data = type { [9 x i32] }
+define i32 @needs_align1024_stack_args_used_inside_loop(%struct.Data addrspace(5)* nocapture readonly byval(%struct.Data) align 8 %arg) local_unnamed_addr #4 {
+; The local object allocation needed an alignment of 1024.
+; Since the function argument is accessed in a loop with an
+; index variable, the base pointer first gets loaded into a VGPR
+; and that value should be further referenced to load the incoming values.
+; The BP value will get saved/restored in an SGPR at the prologue/epilogue.
+
+; GCN-LABEL: needs_align1024_stack_args_used_inside_loop:
+; GCN: s_mov_b32 [[BP_COPY:s[0-9]+]], s34
+; GCN-NEXT: s_mov_b32 s34, s32
+; GCN-NEXT: s_add_u32 [[SCRATCH_REG:s[0-9]+]], s32, 0xffc0
+; GCN-NEXT: s_mov_b32 [[FP_COPY:s[0-9]+]], s33
+; GCN-NEXT: s_and_b32 s33, [[SCRATCH_REG]], 0xffff0000
+; GCN-NEXT: v_mov_b32_e32 v{{[0-9]+}}, 0
+; GCN-NEXT: v_lshrrev_b32_e64 [[VGPR_REG:v[0-9]+]], 6, s34
+; GCN: s_add_u32 s32, s32, 0x30000
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset:1024
+; GCN: buffer_load_dword v{{[0-9]+}}, [[VGPR_REG]], s[0:3], 0 offen
+; GCN: v_add_u32_e32 [[VGPR_REG]], vcc, 4, [[VGPR_REG]]
+; GCN: s_sub_u32 s32, s32, 0x30000
+; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]]
+; GCN-NEXT: s_mov_b32 s34, [[BP_COPY]]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+begin:
+ %local_var = alloca i32, align 1024, addrspace(5)
+ store volatile i32 0, i32 addrspace(5)* %local_var, align 1024
+ br label %loop_body
+
+loop_end: ; preds = %loop_body
+ %idx_next = add nuw nsw i32 %lp_idx, 1
+ %lp_exit_cond = icmp eq i32 %idx_next, 9
+ br i1 %lp_exit_cond, label %exit, label %loop_body
+
+loop_body: ; preds = %loop_end, %begin
+ %lp_idx = phi i32 [ 0, %begin ], [ %idx_next, %loop_end ]
+ %ptr = getelementptr inbounds %struct.Data, %struct.Data addrspace(5)* %arg, i32 0, i32 0, i32 %lp_idx
+ %val = load i32, i32 addrspace(5)* %ptr, align 8
+ %lp_cond = icmp eq i32 %val, %lp_idx
+ br i1 %lp_cond, label %loop_end, label %exit
+
+exit: ; preds = %loop_end, %loop_body
+ %out = phi i32 [ 0, %loop_body ], [ 1, %loop_end ]
+ ret i32 %out
+}
+
+define void @no_free_scratch_sgpr_for_bp_copy(<32 x i32> %a, i32 %b) #0 {
+; GCN-LABEL: no_free_scratch_sgpr_for_bp_copy:
+; GCN: ; %bb.0:
+; GCN: v_writelane_b32 [[VGPR_REG:v[0-9]+]], s34, 0
+; GCN-NEXT: s_mov_b32 s34, s32
+; GCN-NEXT: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s34
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset:128
+; GCN-NEXT: ;;#ASMSTART
+; GCN-NEXT: ;;#ASMEND
+; GCN: v_readlane_b32 s34, [[VGPR_REG:v[0-9]+]], 0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %local_val = alloca i32, align 128, addrspace(5)
+ store volatile i32 %b, i32 addrspace(5)* %local_val, align 128
+ ; Use all clobberable registers, so BP has to spill to a VGPR.
+ call void asm sideeffect "",
+ "~{s0},~{s1},~{s2},~{s3},~{s4},~{s5},~{s6},~{s7},~{s8},~{s9}
+ ,~{s10},~{s11},~{s12},~{s13},~{s14},~{s15},~{s16},~{s17},~{s18},~{s19}
+ ,~{s20},~{s21},~{s22},~{s23},~{s24},~{s25},~{s26},~{s27},~{s28},~{s29}
+ ,~{vcc_hi}"() #0
+ ret void
+}
+
attributes #0 = { noinline nounwind }
attributes #1 = { noinline nounwind "stackrealign" }
attributes #2 = { noinline nounwind alignstack=4 }
attributes #3 = { noinline nounwind "no-realign-stack" }
+attributes #4 = { noinline nounwind "frame-pointer"="all"}