if a higher numbered
register is used
explicitly.
+ ".agpr_count" integer Required Number of accumulator
+ registers required by
+ each work-item for
+ GFX90A, GFX908.
".max_flat_workgroup_size" integer Required Maximum flat
work-group size
supported by the
".lds_size" integer Local Data Share size in bytes.
".perf_data_buffer_size" integer Performance data buffer size in bytes.
".vgpr_count" integer Number of VGPRs used.
+ ".agpr_count" integer Number of AGPRs used.
".sgpr_count" integer Number of SGPRs used.
".vgpr_limit" integer If non-zero, indicates the shader was compiled with a
directive to instruct the compiler to limit the VGPR usage to
MD->setEntryPoint(CC, MF.getFunction().getName());
MD->setNumUsedVgprs(CC, CurrentProgramInfo.NumVGPRsForWavesPerEU);
+
+ // Only set AGPRs for supported devices
+ const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
+ if (STM.hasMAIInsts()) {
+ MD->setNumUsedAgprs(CC, CurrentProgramInfo.NumAccVGPR);
+ }
+
MD->setNumUsedSgprs(CC, CurrentProgramInfo.NumSGPRsForWavesPerEU);
MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC));
if (AMDGPU::isCompute(CC)) {
MD->setSpiPsInputAddr(MFI->getPSInputAddr());
}
- const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
if (STM.isWave32())
MD->setWave32(MF.getFunction().getCallingConv());
}
Kern.getDocument()->getNode(STM.getWavefrontSize());
Kern[".sgpr_count"] = Kern.getDocument()->getNode(ProgramInfo.NumSGPR);
Kern[".vgpr_count"] = Kern.getDocument()->getNode(ProgramInfo.NumVGPR);
+
+ // Only add AGPR count to metadata for supported devices
+ if (STM.hasMAIInsts()) {
+ Kern[".agpr_count"] = Kern.getDocument()->getNode(ProgramInfo.NumAccVGPR);
+ }
+
Kern[".max_flat_workgroup_size"] =
Kern.getDocument()->getNode(MFI.getMaxFlatWorkGroupSize());
Kern[".sgpr_spill_count"] =
class KernelScopeInfo {
int SgprIndexUnusedMin = -1;
int VgprIndexUnusedMin = -1;
+ int AgprIndexUnusedMin = -1;
MCContext *Ctx = nullptr;
+ MCSubtargetInfo const *MSTI = nullptr;
void usesSgprAt(int i) {
if (i >= SgprIndexUnusedMin) {
if (Ctx) {
MCSymbol* const Sym =
Ctx->getOrCreateSymbol(Twine(".kernel.vgpr_count"));
- Sym->setVariableValue(MCConstantExpr::create(VgprIndexUnusedMin, *Ctx));
+ int totalVGPR = getTotalNumVGPRs(isGFX90A(*MSTI), AgprIndexUnusedMin,
+ VgprIndexUnusedMin);
+ Sym->setVariableValue(MCConstantExpr::create(totalVGPR, *Ctx));
+ }
+ }
+ }
+
+ void usesAgprAt(int i) {
+ // Instruction will error in AMDGPUAsmParser::MatchAndEmitInstruction
+ if (!hasMAIInsts(*MSTI))
+ return;
+
+ if (i >= AgprIndexUnusedMin) {
+ AgprIndexUnusedMin = ++i;
+ if (Ctx) {
+ MCSymbol* const Sym =
+ Ctx->getOrCreateSymbol(Twine(".kernel.agpr_count"));
+ Sym->setVariableValue(MCConstantExpr::create(AgprIndexUnusedMin, *Ctx));
+
+ // Also update vgpr_count (dependent on agpr_count for gfx908/gfx90a)
+ MCSymbol* const vSym =
+ Ctx->getOrCreateSymbol(Twine(".kernel.vgpr_count"));
+ int totalVGPR = getTotalNumVGPRs(isGFX90A(*MSTI), AgprIndexUnusedMin,
+ VgprIndexUnusedMin);
+ vSym->setVariableValue(MCConstantExpr::create(totalVGPR, *Ctx));
}
}
}
void initialize(MCContext &Context) {
Ctx = &Context;
+ MSTI = Ctx->getSubtargetInfo();
+
usesSgprAt(SgprIndexUnusedMin = -1);
usesVgprAt(VgprIndexUnusedMin = -1);
+ if (hasMAIInsts(*MSTI)) {
+ usesAgprAt(AgprIndexUnusedMin = -1);
+ }
}
void usesRegister(RegisterKind RegKind, unsigned DwordRegIndex, unsigned RegWidth) {
switch (RegKind) {
case IS_SGPR: usesSgprAt(DwordRegIndex + RegWidth - 1); break;
- case IS_AGPR: // fall through
+ case IS_AGPR: usesAgprAt(DwordRegIndex + RegWidth - 1); break;
case IS_VGPR: usesVgprAt(DwordRegIndex + RegWidth - 1); break;
default: break;
}
return STI.getFeatureBits()[AMDGPU::FeatureArchitectedFlatScratch];
}
+bool hasMAIInsts(const MCSubtargetInfo &STI) {
+ return STI.getFeatureBits()[AMDGPU::FeatureMAIInsts];
+}
+
int32_t getTotalNumVGPRs(bool has90AInsts, int32_t ArgNumAGPR,
int32_t ArgNumVGPR) {
if (has90AInsts && ArgNumAGPR)
bool hasGFX10_3Insts(const MCSubtargetInfo &STI);
bool isGFX90A(const MCSubtargetInfo &STI);
bool hasArchitectedFlatScratch(const MCSubtargetInfo &STI);
+bool hasMAIInsts(const MCSubtargetInfo &STI);
int getTotalNumVGPRs(bool has90AInsts, int32_t ArgNumAGPR, int32_t ArgNumVGPR);
/// Is Reg - scalar register
getHwStage(CC)[".vgpr_count"] = MsgPackDoc.getNode(Val);
}
+// Set the number of used agprs in the metadata.
+void AMDGPUPALMetadata::setNumUsedAgprs(CallingConv::ID CC, unsigned Val) {
+ getHwStage(CC)[".agpr_count"] = Val;
+}
+
// Set the number of used sgprs in the metadata. This is an optional advisory
// record for logging etc; wave dispatch actually uses the rsrc1 register for
// the shader stage to determine the number of sgprs to allocate.
// the shader stage to determine the number of vgprs to allocate.
void setNumUsedVgprs(unsigned CC, unsigned Val);
+ // Set the number of used agprs in the metadata. This is an optional advisory
+ // record for logging etc;
+ void setNumUsedAgprs(unsigned CC, unsigned Val);
+
// Set the number of used sgprs in the metadata. This is an optional advisory
// record for logging etc; wave dispatch actually uses the rsrc1 register for
// the shader stage to determine the number of sgprs to allocate.
--- /dev/null
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx90a < %s | FileCheck -check-prefixes=CHECK,GFX90A %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx908 < %s | FileCheck -check-prefixes=CHECK,GFX908 %s
+
+; COM: Adapted from agpr-register-count.ll
+; COM: GFX900 and below should not have .agpr_count present in the metadata
+
+
+; CHECK: .type kernel_32_agprs
+; CHECK: NumAgprs: 32
+define amdgpu_kernel void @kernel_32_agprs() #0 {
+bb:
+ call void asm sideeffect "", "~{v8}" ()
+ call void asm sideeffect "", "~{a31}" ()
+ ret void
+}
+
+; CHECK: .type kernel_0_agprs
+; CHECK: NumAgprs: 0
+define amdgpu_kernel void @kernel_0_agprs() #0 {
+bb:
+ call void asm sideeffect "", "~{v0}" ()
+ ret void
+}
+
+; CHECK: .type kernel_40_vgprs
+; CHECK: NumAgprs: 16
+define amdgpu_kernel void @kernel_40_vgprs() #0 {
+bb:
+ call void asm sideeffect "", "~{v39}" ()
+ call void asm sideeffect "", "~{a15}" ()
+ ret void
+}
+
+; CHECK: .type kernel_max_gprs
+; CHECK: NumAgprs: 256
+define amdgpu_kernel void @kernel_max_gprs() #0 {
+bb:
+ call void asm sideeffect "", "~{v255}" ()
+ call void asm sideeffect "", "~{a255}" ()
+ ret void
+}
+
+; CHECK: .type func_32_agprs
+; CHECK: NumAgprs: 32
+define void @func_32_agprs() #0 {
+bb:
+ call void asm sideeffect "", "~{v8}" ()
+ call void asm sideeffect "", "~{a31}" ()
+ ret void
+}
+
+; CHECK: .type kernel_call_func_32_agprs
+; CHECK: NumAgprs: 32
+define amdgpu_kernel void @kernel_call_func_32_agprs() #0 {
+bb:
+ call void @func_32_agprs() #0
+ ret void
+}
+
+declare void @undef_func()
+
+; CHECK: .type kernel_call_undef_func
+; CHECK: NumAgprs: 32
+define amdgpu_kernel void @kernel_call_undef_func() #0 {
+bb:
+ call void @undef_func()
+ ret void
+}
+
+; CHECK: ---
+; CHECK: amdpal.pipelines:
+; GFX90A: agpr_count: 0x20
+; GFX90A: vgpr_count: 0x40
+
+; GFX908: agpr_count: 0x20
+; GFX908: vgpr_count: 0x20
+
+attributes #0 = { nounwind noinline "amdgpu-flat-work-group-size"="1,512" }
--- /dev/null
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX90A %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX908 %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx801 -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX801 %s
+
+; COM: Adapted from agpr-register-count.ll
+; COM: GFX900 and below should not have .agpr_count present in the metadata
+
+; CHECK: ---
+; CHECK: amdhsa.kernels:
+
+; GFX90A: - .agpr_count: 32
+; GFX908: - .agpr_count: 32
+; GFX801-NOT: - .agpr_count:
+; CHECK: .name: kernel_32_agprs
+; GFX90A: .vgpr_count: 44
+; GFX908: .vgpr_count: 32
+; GFX801: .vgpr_count: 9
+define amdgpu_kernel void @kernel_32_agprs() #0 {
+bb:
+ call void asm sideeffect "", "~{v8}" ()
+ call void asm sideeffect "", "~{a31}" ()
+ ret void
+}
+
+; GFX90A: - .agpr_count: 0
+; GFX908: - .agpr_count: 0
+; GFX801-NOT: - .agpr_count:
+; CHECK: .name: kernel_0_agprs
+; GFX90A: .vgpr_count: 1
+; GFX908: .vgpr_count: 1
+; GFX801: .vgpr_count: 1
+define amdgpu_kernel void @kernel_0_agprs() #0 {
+bb:
+ call void asm sideeffect "", "~{v0}" ()
+ ret void
+}
+
+; GFX90A: - .agpr_count: 16
+; GFX908: - .agpr_count: 16
+; GFX801-NOT: - .agpr_count:
+; CHECK: .name: kernel_40_vgprs
+; GFX90A: .vgpr_count: 56
+; GFX908: .vgpr_count: 40
+; GFX801: .vgpr_count: 40
+define amdgpu_kernel void @kernel_40_vgprs() #0 {
+bb:
+ call void asm sideeffect "", "~{v39}" ()
+ call void asm sideeffect "", "~{a15}" ()
+ ret void
+}
+
+; GFX90A: - .agpr_count: 256
+; GFX908: - .agpr_count: 256
+; GFX801-NOT: - .agpr_count:
+; CHECK: .name: kernel_max_gprs
+; GFX90A: .vgpr_count: 512
+; GFX908: .vgpr_count: 256
+; GFX801: .vgpr_count: 256
+define amdgpu_kernel void @kernel_max_gprs() #0 {
+bb:
+ call void asm sideeffect "", "~{v255}" ()
+ call void asm sideeffect "", "~{a255}" ()
+ ret void
+}
+
+define void @func_32_agprs() #0 {
+bb:
+ call void asm sideeffect "", "~{v8}" ()
+ call void asm sideeffect "", "~{a31}" ()
+ ret void
+}
+
+; GFX90A: - .agpr_count: 32
+; GFX908: - .agpr_count: 32
+; GFX801-NOT: - .agpr_count:
+; CHECK: .name: kernel_call_func_32_agprs
+; GFX90A: .vgpr_count: 44
+; GFX908: .vgpr_count: 32
+; GFX801: .vgpr_count: 9
+define amdgpu_kernel void @kernel_call_func_32_agprs() #0 {
+bb:
+ call void @func_32_agprs() #0
+ ret void
+}
+
+declare void @undef_func()
+
+; GFX90A: - .agpr_count: 32
+; GFX908: - .agpr_count: 32
+; GFX801-NOT: - .agpr_count:
+; CHECK: .name: kernel_call_undef_func
+; GFX90A: .vgpr_count: 64
+; GFX908: .vgpr_count: 32
+; GFX801: .vgpr_count: 32
+define amdgpu_kernel void @kernel_call_undef_func() #0 {
+bb:
+ call void @undef_func()
+ ret void
+}
+
+attributes #0 = { nounwind noinline "amdgpu-flat-work-group-size"="1,512" }
--- /dev/null
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefixes=CHECK,GFX908 %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=CHECK,GFX90A %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx801 < %s | FileCheck -check-prefixes=CHECK,GFX801 %s
+
+; COM: Comments for each kernel
+; CHECK: kernel_32_agprs
+; GFX908: ; NumVgprs: 9
+; GFX908 ; NumAgprs: 32
+; GFX908 ; TotalNumVgprs: 32
+
+; GFX90A: ; NumVgprs: 9
+; GFX90A ; NumAgprs: 32
+; GFX90A ; TotalNumVgprs: 44
+
+; GFX801: ; NumVgprs: 9
+
+; CHECK: kernel_40_vgprs
+; GFX908: ; NumVgprs: 40
+; GFX908 ; NumAgprs: 16
+; GFX908 ; TotalNumVgprs: 40
+
+; GFX90A: ; NumVgprs: 40
+; GFX90A ; NumAgprs: 16
+; GFX90A ; TotalNumVgprs: 56
+
+; GFX801: ; NumVgprs: 40
+
+; COM: Metadata
+; GFX908: - .agpr_count: 32
+; GFX908: .vgpr_count: 32
+
+; GFX90A: - .agpr_count: 32
+; GFX90A: .vgpr_count: 44
+
+; GFX801: .vgpr_count: 9
+define amdgpu_kernel void @kernel_32_agprs() #0 {
+bb:
+ call void asm sideeffect "", "~{v8}" ()
+ call void asm sideeffect "", "~{a31}" ()
+ ret void
+}
+
+; GFX908: - .agpr_count: 16
+; GFX908: .vgpr_count: 40
+
+; GFX90A: - .agpr_count: 16
+; GFX90A: .vgpr_count: 56
+
+; GFX801: .vgpr_count: 40
+define amdgpu_kernel void @kernel_40_vgprs() #0 {
+bb:
+ call void asm sideeffect "", "~{v39}" ()
+ call void asm sideeffect "", "~{a15}" ()
+ ret void
+}
+
+attributes #0 = { nounwind noinline "amdgpu-flat-work-group-size"="1,512" }
--- /dev/null
+// RUN: llvm-mc -arch=amdgcn -mcpu=gfx90a %s 2>&1 | FileCheck -check-prefixes=GFX90A %s
+// RUN: llvm-mc -arch=amdgcn -mcpu=gfx908 %s 2>&1 | FileCheck -check-prefixes=GFX908 %s
+// Based on sym_kernel_scope.s
+
+.byte .kernel.agpr_count
+// CHECK: .byte 0
+.byte .kernel.vgpr_count
+// CHECK: .byte 0
+
+ v_accvgpr_write_b32 a0, v6
+ v_accvgpr_read_b32 v3, a3
+ s_endpgm
+.byte .kernel.agpr_count
+// GFX90A: .byte 4
+// GFX908: .byte 4
+.byte .kernel.vgpr_count
+// GFX90A: .byte 12
+// GFX908: .byte 7
+
+.amdgpu_hsa_kernel K1
+K1:
+.byte .kernel.agpr_count
+// CHECK: .byte 0
+.byte .kernel.vgpr_count
+// CHECK: .byte 0
+ v_accvgpr_write_b32 a44, v6
+ s_endpgm
+.byte .kernel.agpr_count
+// GFX90A: .byte 45
+// GFX908: .byte 45
+.byte .kernel.vgpr_count
+// GFX90A: .byte 53
+// GFX908: .byte 45
+
+.amdgpu_hsa_kernel K2
+.byte .kernel.agpr_count
+// CHECK: .byte 0
+.byte .kernel.vgpr_count
+// CHECK: .byte 0
+K2:
+ v_mfma_f32_4x4x1f32 a[0:3], v1, v0, a[0:3] cbsz:1 abid:2 blgp:3
+ s_endpgm
+.byte .kernel.agpr_count
+// GFX90A: .byte 4
+// GFX908: .byte 4
+.byte .kernel.vgpr_count
+// GFX90A: .byte 8
+// GFX908: .byte 4
+
+.text
+.amdgpu_hsa_kernel K3
+K3:
+ v_accvgpr_read_b32 v[0], a0
+ v_mfma_f32_16x16x1f32 a[0:15], v1, v0, a[0:15] cbsz:1 abid:2 blgp:3
+ s_endpgm
+
+.byte .kernel.agpr_count
+// GFX90A: .byte 16
+// GFX908: .byte 16
+.byte .kernel.vgpr_count
+// GFX90A: .byte 20
+// GFX908: .byte 16