using namespace llvm;
-static bool hasOnlySGPRSpills(const SIMachineFunctionInfo *FuncInfo,
- const MachineFrameInfo &MFI) {
- return FuncInfo->hasSpilledSGPRs() &&
- (!FuncInfo->hasSpilledVGPRs() && !FuncInfo->hasNonSpillStackObjects());
-}
-
static ArrayRef<MCPhysReg> getAllSGPR128(const MachineFunction &MF,
const SIRegisterInfo *TRI) {
return makeArrayRef(AMDGPU::SGPR_128RegClass.begin(),
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
-
// Add wave offset in bytes to private base offset.
// See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init.
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
// We need to insert initialization of the scratch resource descriptor.
unsigned ScratchRsrcReg = MFI->getScratchRSrcReg();
- assert(ScratchRsrcReg != AMDGPU::NoRegister);
+ if (ScratchRsrcReg == AMDGPU::NoRegister)
+ return AMDGPU::NoRegister;
if (ST.hasSGPRInitBug() ||
ScratchRsrcReg != TRI->reservedPrivateSegmentBufferReg(MF))
MachineRegisterInfo &MRI = MF.getRegInfo();
- unsigned NumPreloaded = MFI->getNumPreloadedSGPRs() / 4;
+ unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4;
+ ArrayRef<MCPhysReg> AllSGPR128s = getAllSGPR128(MF, TRI);
+ AllSGPR128s = AllSGPR128s.slice(
+     std::min(static_cast<unsigned>(AllSGPR128s.size()), NumPreloaded));
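+ // For example (illustrative values): 14 preloaded SGPRs gives
+ // NumPreloaded = (14 + 3) / 4 = 4, so the first four tuples
+ // (s[0:3] .. s[12:15]) overlapping the preloaded registers are sliced off.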
+
- // Skip the last 2 elements because the last one is reserved for VCC, and
- // this is the 2nd to last element already.
- for (MCPhysReg Reg : getAllSGPR128(MF, TRI).drop_back(2).slice(NumPreloaded)) {
+ // Registers reserved at the end of the list (e.g. for VCC) are rejected by
+ // the isAllocatable check below.
+ for (MCPhysReg Reg : AllSGPR128s) {
// Pick the first unallocated one. Make sure we don't clobber the other
// reserved input we needed.
- if (!MRI.isPhysRegUsed(Reg)) {
- assert(MRI.isAllocatable(Reg));
+ if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg)) {
MRI.replaceRegWith(ScratchRsrcReg, Reg);
MFI->setScratchRSrcReg(Reg);
return Reg;
unsigned ScratchRsrcReg = MFI->getScratchRSrcReg();
MachineRegisterInfo &MRI = MF.getRegInfo();
+
unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
+ ArrayRef<MCPhysReg> AllSGPRs = getAllSGPRs(MF, TRI);
+ if (NumPreloaded > AllSGPRs.size())
+ return ScratchWaveOffsetReg;
+
+ AllSGPRs = AllSGPRs.slice(NumPreloaded);
+
// We need to drop registers from the end of the list that we cannot use
// for the scratch wave offset.
// + 2 because s102 and s103 do not exist on VI
// + 2 for vcc
// + 2 for xnack_mask
// + 2 for flat_scratch
// + 4 for the registers reserved for the scratch resource descriptor
// + 1 for the register reserved for the scratch wave offset. If there
//     are no other free SGPRs, then the value will stay in this register.
// ----
// 13
- for (MCPhysReg Reg : getAllSGPRs(MF, TRI).drop_back(13).slice(NumPreloaded)) {
+ if (AllSGPRs.size() < 13)
+ return ScratchWaveOffsetReg;
+
+ for (MCPhysReg Reg : AllSGPRs.drop_back(13)) {
// Pick the first unallocated SGPR. Be careful not to pick an alias of the
// scratch descriptor, since we haven't added its uses yet.
if (!MRI.isPhysRegUsed(Reg)) {
if (ST.debuggerEmitPrologue())
emitDebuggerPrologue(MF, MBB);
- if (!MF.getFrameInfo().hasStackObjects())
- return;
-
assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
//
// FIXME: We should be cleaning up these unused SGPR spill frame indices
// somewhere.
- if (hasOnlySGPRSpills(MFI, MF.getFrameInfo()))
- return;
const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo *TRI = &TII->getRegisterInfo();
unsigned ScratchRsrcReg
  = getReservedPrivateSegmentBufferReg(ST, TII, TRI, MFI, MF);
unsigned ScratchWaveOffsetReg
= getReservedPrivateSegmentWaveByteOffsetReg(ST, TII, TRI, MFI, MF);
- assert(ScratchRsrcReg != AMDGPU::NoRegister);
- assert(ScratchWaveOffsetReg != AMDGPU::NoRegister);
+
+ if (ScratchRsrcReg == AMDGPU::NoRegister) {
+ assert(ScratchWaveOffsetReg == AMDGPU::NoRegister);
+ return;
+ }
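+ // Everything below here sets up scratch access; if no scratch resource
+ // register was reserved, the function does not access scratch memory and
+ // there is nothing to initialize.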
+
assert(!TRI->isSubRegister(ScratchRsrcReg, ScratchWaveOffsetReg));
- if (MFI->hasFlatScratchInit())
+ // We need to do the replacement of the private segment buffer and wave offset
+ // register even if there are no stack objects. There could be stores to undef
+ // or a constant without an associated object.
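+ // For example, a function whose only private access is
+ //   store volatile i32 0, i32* undef
+ // has no frame objects, but still emits a scratch buffer store that needs
+ // the resource descriptor and wave offset, as the tests added with this
+ // patch exercise.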
+
+ // FIXME: We still have implicit uses on SGPR spill instructions in case they
+ // need to spill to vector memory. It's likely that will not happen, but at
+ // this point it appears we need the setup. This part of the prolog should be
+ // emitted after frame indices are eliminated.
+
+ if (MF.getFrameInfo().hasStackObjects() && MFI->hasFlatScratchInit())
emitFlatScratchInit(TII, TRI, MF, MBB);
// We need to insert initialization of the scratch resource descriptor.
unsigned PreloadedScratchWaveOffsetReg = TRI->getPreloadedValue(
MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
+
unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister;
if (ST.isAmdCodeObjectV2()) {
PreloadedPrivateBufferReg = TRI->getPreloadedValue(
MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
}
- // If we reserved the original input registers, we don't need to copy to the
- // reserved registers.
- if (ScratchRsrcReg == PreloadedPrivateBufferReg) {
- // We should always reserve these 5 registers at the same time.
- assert(ScratchWaveOffsetReg == PreloadedScratchWaveOffsetReg &&
- "scratch wave offset and private segment buffer inconsistent");
- return;
- }
+ bool OffsetRegUsed = !MRI.use_empty(ScratchWaveOffsetReg);
+ bool ResourceRegUsed = !MRI.use_empty(ScratchRsrcReg);
// We added live-ins during argument lowering, but since they were not used
// they were deleted. We're adding the uses now, so add them back.
- MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
- MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
+ if (OffsetRegUsed) {
+ assert(PreloadedScratchWaveOffsetReg != AMDGPU::NoRegister &&
+ "scratch wave offset input is required");
+ MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
+ MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
+ }
- if (ST.isAmdCodeObjectV2()) {
+ if (ResourceRegUsed && PreloadedPrivateBufferReg != AMDGPU::NoRegister) {
+ assert(ST.isAmdCodeObjectV2());
MRI.addLiveIn(PreloadedPrivateBufferReg);
MBB.addLiveIn(PreloadedPrivateBufferReg);
}
if (&OtherBB == &MBB)
continue;
- OtherBB.addLiveIn(ScratchRsrcReg);
- OtherBB.addLiveIn(ScratchWaveOffsetReg);
+ if (OffsetRegUsed)
+ OtherBB.addLiveIn(ScratchWaveOffsetReg);
+
+ if (ResourceRegUsed)
+ OtherBB.addLiveIn(ScratchRsrcReg);
}
DebugLoc DL;
MachineBasicBlock::iterator I = MBB.begin();
- if (PreloadedScratchWaveOffsetReg != ScratchWaveOffsetReg) {
- // Make sure we emit the copy for the offset first. We may have chosen to
- // copy the buffer resource into a register that aliases the input offset
- // register.
+ // If we reserved the original input registers, we don't need to copy to the
+ // reserved registers.
+
+ bool CopyBuffer = ResourceRegUsed &&
+ PreloadedPrivateBufferReg != AMDGPU::NoRegister &&
+ ScratchRsrcReg != PreloadedPrivateBufferReg;
+
+ // This needs to be careful about the copy order to avoid overwriting one of
+ // the input registers before it has been copied to its final
+ // destination. Usually the offset should be copied first.
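+ // For example (hypothetical registers): if the input resource arrives in
+ // s[0:3] and s1 was chosen as the scratch wave offset, copying the offset
+ // into s1 first would clobber the not-yet-copied input resource, so the
+ // buffer copy must be emitted first.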
+ bool CopyBufferFirst = TRI->isSubRegisterEq(PreloadedPrivateBufferReg,
+ ScratchWaveOffsetReg);
+ if (CopyBuffer && CopyBufferFirst) {
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
+ .addReg(PreloadedPrivateBufferReg, RegState::Kill);
+ }
+
+ if (OffsetRegUsed &&
+ PreloadedScratchWaveOffsetReg != ScratchWaveOffsetReg) {
BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg)
.addReg(PreloadedScratchWaveOffsetReg, RegState::Kill);
}
- if (ST.isAmdCodeObjectV2()) {
- // Insert copies from argument register.
- assert(
- !TRI->isSubRegisterEq(PreloadedPrivateBufferReg, ScratchRsrcReg) &&
- !TRI->isSubRegisterEq(PreloadedPrivateBufferReg, ScratchWaveOffsetReg));
-
+ if (CopyBuffer && !CopyBufferFirst) {
BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
.addReg(PreloadedPrivateBufferReg, RegState::Kill);
- } else {
+ }
+
+ if (ResourceRegUsed && PreloadedPrivateBufferReg == AMDGPU::NoRegister) {
+ assert(!ST.isAmdCodeObjectV2());
const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
+ // Add the scratch resource registers as implicit uses because we may end up
+ // needing them, and need to ensure that the reserved registers are
+ // correctly handled.
BuildMI(MBB, MI, DL, OpDesc)
  .addReg(SrcReg, getKillRegState(isKill)) // data
  .addFrameIndex(FrameIndex) // addr
-  .addMemOperand(MMO);
+  .addMemOperand(MMO)
+  .addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
+  .addReg(MFI->getScratchWaveOffsetReg(), RegState::Implicit);
return;
}
BuildMI(MBB, MI, DL, OpDesc, DestReg)
.addFrameIndex(FrameIndex) // addr
- .addMemOperand(MMO);
+ .addMemOperand(MMO)
+ .addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
+ .addReg(MFI->getScratchWaveOffsetReg(), RegState::Implicit);
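+ // As in the store case above, add the scratch resource registers as
+ // implicit uses in case this restore is later lowered to a scratch buffer
+ // access.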
return;
}
F, "amdgpu-num-sgpr", MaxNumSGPRs);
// Make sure requested value does not violate subtarget's specifications.
- if (Requested && Requested <= getNumReservedSGPRs(ST))
+ if (Requested && (Requested <= getNumReservedSGPRs(ST)))
Requested = 0;
+ // If more SGPRs are required to support the input user/system SGPRs,
+ // increase to accommodate them.
+ //
+ // FIXME: This really ends up using the requested number of SGPRs + number
+ // of reserved special registers in total. Theoretically you could re-use
+ // the last input registers for these special registers, but this would
+ // require a lot of complexity to deal with the weird aliasing.
+ unsigned NumInputSGPRs = MFI.getNumPreloadedSGPRs();
+ if (Requested && Requested < NumInputSGPRs)
+ Requested = NumInputSGPRs;
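+ // For example, a function marked "amdgpu-num-sgpr"="12" that uses 14 input
+ // user/system SGPRs has its effective request raised to 14.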
+
// Make sure requested value is compatible with values implied by
// default/requested minimum/maximum number of waves per execution unit.
if (Requested && Requested > getMaxNumSGPRs(ST, WavesPerEU.first))
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=TOSGPR %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -amdgpu-spill-sgpr-to-smem=1 -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=TOSMEM %s
-; CHECK-LABEL: {{^}}max_18_sgprs:
-; CHECK: SGPRBlocks: 1
-; CHECK: NumSGPRsForWavesPerEU: 13
-define void @max_18_sgprs(i32 addrspace(1)* %out1,
+; ALL-LABEL: {{^}}max_14_sgprs:
+
+; FIXME: Should be able to skip this copying of the private segment
+; buffer because all the SGPR spills are to VGPRs.
+
+; ALL: s_mov_b64 s[6:7], s[2:3]
+; ALL: s_mov_b64 s[4:5], s[0:1]
+
+; ALL: SGPRBlocks: 1
+; ALL: NumSGPRsForWavesPerEU: 14
+define void @max_14_sgprs(i32 addrspace(1)* %out1,
i32 addrspace(1)* %out2,
i32 addrspace(1)* %out3,
i32 addrspace(1)* %out4,
store i32 %four, i32 addrspace(1)* %out4
ret void
}
-attributes #0 = {"amdgpu-num-sgpr"="18"}
+
+; private resource: 4
+; scratch wave offset: 1
+; workgroup ids: 3
+; dispatch id: 2
+; queue ptr: 2
+; flat scratch init: 2
+; ---------------------
+; total: 14
+
+; + reserved vcc, flat_scratch = 18
+
+; Because we can't handle re-using the last few input registers for the
+; special vcc etc. registers (and can't drop the unused input features
+; once the number of registers is frozen), this ends up using more
+; registers than expected.
+
+; ALL-LABEL: {{^}}max_12_sgprs_14_input_sgprs:
+; TOSGPR: SGPRBlocks: 2
+; TOSGPR: NumSGPRsForWavesPerEU: 18
+
+; TOSMEM: s_mov_b64 s[6:7], s[2:3]
+; TOSMEM: s_mov_b32 s9, s13
+; TOSMEM: s_mov_b64 s[4:5], s[0:1]
+
+; TOSMEM: SGPRBlocks: 2
+; TOSMEM: NumSGPRsForWavesPerEU: 18
+define void @max_12_sgprs_14_input_sgprs(i32 addrspace(1)* %out1,
+ i32 addrspace(1)* %out2,
+ i32 addrspace(1)* %out3,
+ i32 addrspace(1)* %out4,
+ i32 %one, i32 %two, i32 %three, i32 %four) #2 {
+ store volatile i32 0, i32* undef
+ %x.0 = call i32 @llvm.amdgcn.workgroup.id.x()
+ store volatile i32 %x.0, i32 addrspace(1)* undef
+ %x.1 = call i32 @llvm.amdgcn.workgroup.id.y()
+ store volatile i32 %x.1, i32 addrspace(1)* undef
+ %x.2 = call i32 @llvm.amdgcn.workgroup.id.z()
+ store volatile i32 %x.2, i32 addrspace(1)* undef
+ %x.3 = call i64 @llvm.amdgcn.dispatch.id()
+ store volatile i64 %x.3, i64 addrspace(1)* undef
+ %x.4 = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
+ store volatile i8 addrspace(2)* %x.4, i8 addrspace(2)* addrspace(1)* undef
+ %x.5 = call i8 addrspace(2)* @llvm.amdgcn.queue.ptr()
+ store volatile i8 addrspace(2)* %x.5, i8 addrspace(2)* addrspace(1)* undef
+
+ store i32 %one, i32 addrspace(1)* %out1
+ store i32 %two, i32 addrspace(1)* %out2
+ store i32 %three, i32 addrspace(1)* %out3
+ store i32 %four, i32 addrspace(1)* %out4
+ ret void
+}
+
+; ALL-LABEL: max_12_sgprs_12_input_sgprs{{$}}
+; Make sure copies for the input buffer are not clobbered. This requires
+; swapping the order in which the registers are copied from what normally
+; happens.
+
+; TOSMEM: s_mov_b64 s[6:7], s[2:3]
+; TOSMEM: s_mov_b64 s[4:5], s[0:1]
+; TOSMEM: s_mov_b32 s3, s11
+
+; ALL: SGPRBlocks: 1
+; ALL: NumSGPRsForWavesPerEU: 16
+define void @max_12_sgprs_12_input_sgprs(i32 addrspace(1)* %out1,
+ i32 addrspace(1)* %out2,
+ i32 addrspace(1)* %out3,
+ i32 addrspace(1)* %out4,
+ i32 %one, i32 %two, i32 %three, i32 %four) #2 {
+ store volatile i32 0, i32* undef
+ %x.0 = call i32 @llvm.amdgcn.workgroup.id.x()
+ store volatile i32 %x.0, i32 addrspace(1)* undef
+ %x.1 = call i32 @llvm.amdgcn.workgroup.id.y()
+ store volatile i32 %x.1, i32 addrspace(1)* undef
+ %x.2 = call i32 @llvm.amdgcn.workgroup.id.z()
+ store volatile i32 %x.2, i32 addrspace(1)* undef
+ %x.3 = call i64 @llvm.amdgcn.dispatch.id()
+ store volatile i64 %x.3, i64 addrspace(1)* undef
+ %x.4 = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
+ store volatile i8 addrspace(2)* %x.4, i8 addrspace(2)* addrspace(1)* undef
+
+ store i32 %one, i32 addrspace(1)* %out1
+ store i32 %two, i32 addrspace(1)* %out2
+ store i32 %three, i32 addrspace(1)* %out3
+ store i32 %four, i32 addrspace(1)* %out4
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workgroup.id.x() #1
+declare i32 @llvm.amdgcn.workgroup.id.y() #1
+declare i32 @llvm.amdgcn.workgroup.id.z() #1
+declare i64 @llvm.amdgcn.dispatch.id() #1
+declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #1
+declare i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #1
+
+attributes #0 = { nounwind "amdgpu-num-sgpr"="14" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind "amdgpu-num-sgpr"="12" }
+attributes #3 = { nounwind "amdgpu-num-sgpr"="11" }
-; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=OPTNONE %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=OPT %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=OPT %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=iceland -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=OPT %s
+; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=OPTNONE %s
+
+; There are no stack objects, but there is still a private memory access.
+; The private access registers need to be correctly initialized anyway,
+; and shifted down to the end of the used registers.
; GCN-LABEL: {{^}}store_to_undef:
+; OPT-DAG: s_mov_b64 s{{\[}}[[RSRC_LO:[0-9]+]]:{{[0-9]+\]}}, s[0:1]
+; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3]
+; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s7{{$}}
+; OPT: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offen{{$}}
; -O0 should assume spilling, so the input scratch resource descriptor
; should be used directly without any copies.
}
; GCN-LABEL: {{^}}store_to_inttoptr:
+; OPT-DAG: s_mov_b64 s{{\[}}[[RSRC_LO:[0-9]+]]:{{[0-9]+\]}}, s[0:1]
+; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3]
+; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s7{{$}}
+; OPT: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offen{{$}}
define void @store_to_inttoptr() #0 {
store volatile i32 0, i32* inttoptr (i32 123 to i32*)
ret void
}
; GCN-LABEL: {{^}}load_from_undef:
+; OPT-DAG: s_mov_b64 s{{\[}}[[RSRC_LO:[0-9]+]]:{{[0-9]+\]}}, s[0:1]
+; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3]
+; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s7{{$}}
+; OPT: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offen{{$}}
define void @load_from_undef() #0 {
%ld = load volatile i32, i32* undef
ret void
}
; GCN-LABEL: {{^}}load_from_inttoptr:
+; OPT-DAG: s_mov_b64 s{{\[}}[[RSRC_LO:[0-9]+]]:{{[0-9]+\]}}, s[0:1]
+; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3]
+; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s7{{$}}
+; OPT: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offen{{$}}
define void @load_from_inttoptr() #0 {
%ld = load volatile i32, i32* inttoptr (i32 123 to i32*)
ret void
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck %s
; These tests check that the compiler won't crash when it needs to spill
; SGPRs.
-; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck --check-prefix=TONGA %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=TONGA %s
; On Tonga and Iceland, limited SGPR availability means care must be taken to
; allocate scratch registers correctly. Check that this test compiles without
; XXX - Why does it like to use vcc?
; GCN-LABEL: {{^}}spill_m0:
+; TOSMEM: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
+
; GCN: s_cmp_lg_u32
; TOVGPR: s_mov_b32 vcc_hi, m0
br i1 %cc, label %if, label %else
if:
- store volatile <4 x float> %dtex, <4 x float>* undef
+ store volatile <4 x float> %dtex, <4 x float> addrspace(1)* undef
unreachable
else: