From 4df465bd5e44329d8a0e98af9f5e2738a6a34563 Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Tue, 2 Dec 2014 21:28:53 +0000 Subject: [PATCH] R600/SI: Move more information into SIProgramInfo struct llvm-svn: 223154 --- llvm/lib/Target/R600/AMDGPUAsmPrinter.cpp | 96 ++++++++++++---------- llvm/lib/Target/R600/AMDGPUAsmPrinter.h | 25 +++++- llvm/lib/Target/R600/SIDefines.h | 9 +- llvm/test/CodeGen/R600/local-memory-two-objects.ll | 2 +- llvm/test/CodeGen/R600/local-memory.ll | 4 +- 5 files changed, 83 insertions(+), 53 deletions(-) diff --git a/llvm/lib/Target/R600/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/R600/AMDGPUAsmPrinter.cpp index b2ee3c8..b797179 100644 --- a/llvm/lib/Target/R600/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/R600/AMDGPUAsmPrinter.cpp @@ -240,6 +240,8 @@ void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) { void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, const MachineFunction &MF) const { + const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>(); + const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); uint64_t CodeSize = 0; unsigned MaxSGPR = 0; unsigned MaxVGPR = 0; @@ -340,6 +342,8 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, ProgInfo.NumVGPR = MaxVGPR + 1; ProgInfo.NumSGPR = MaxSGPR + 1; + ProgInfo.VGPRBlocks = (ProgInfo.NumVGPR - 1) / 4; + ProgInfo.SGPRBlocks = (ProgInfo.NumSGPR - 1) / 8; // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode // register. 
ProgInfo.FloatMode = getFPMode(MF); @@ -356,23 +360,6 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, ProgInfo.FlatUsed = FlatUsed; ProgInfo.VCCUsed = VCCUsed; ProgInfo.CodeLen = CodeSize; -} - -static unsigned getRsrcReg(unsigned ShaderType) { - switch (ShaderType) { - default: // Fall through - case ShaderType::COMPUTE: return R_00B848_COMPUTE_PGM_RSRC1; - case ShaderType::GEOMETRY: return R_00B228_SPI_SHADER_PGM_RSRC1_GS; - case ShaderType::PIXEL: return R_00B028_SPI_SHADER_PGM_RSRC1_PS; - case ShaderType::VERTEX: return R_00B128_SPI_SHADER_PGM_RSRC1_VS; - } -} - -void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF, - const SIProgramInfo &KernelInfo) { - const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>(); - const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); - unsigned RsrcReg = getRsrcReg(MFI->getShaderType()); unsigned LDSAlignShift; if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) { @@ -386,58 +373,77 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF, unsigned LDSSpillSize = MFI->LDSWaveSpillSize * MFI->getMaximumWorkGroupSize(MF); - unsigned LDSBlocks = - RoundUpToAlignment(MFI->LDSSize + LDSSpillSize, - 1 << LDSAlignShift) >> LDSAlignShift; + ProgInfo.LDSSize = MFI->LDSSize + LDSSpillSize; + ProgInfo.LDSBlocks = + RoundUpToAlignment(ProgInfo.LDSSize, 1 << LDSAlignShift) >> LDSAlignShift; // Scratch is allocated in 256 dword blocks. unsigned ScratchAlignShift = 10; // We need to program the hardware with the amount of scratch memory that - // is used by the entire wave. KernelInfo.ScratchSize is the amount of + // is used by the entire wave. ProgInfo.ScratchSize is the amount of // scratch memory used per thread. 
- unsigned ScratchBlocks = - RoundUpToAlignment(KernelInfo.ScratchSize * STM.getWavefrontSize(), + ProgInfo.ScratchBlocks = + RoundUpToAlignment(ProgInfo.ScratchSize * STM.getWavefrontSize(), 1 << ScratchAlignShift) >> ScratchAlignShift; - unsigned VGPRBlocks = (KernelInfo.NumVGPR - 1) / 4; - unsigned SGPRBlocks = (KernelInfo.NumSGPR - 1) / 8; + ProgInfo.ComputePGMRSrc1 = + S_00B848_VGPRS(ProgInfo.VGPRBlocks) | + S_00B848_SGPRS(ProgInfo.SGPRBlocks) | + S_00B848_PRIORITY(ProgInfo.Priority) | + S_00B848_FLOAT_MODE(ProgInfo.FloatMode) | + S_00B848_PRIV(ProgInfo.Priv) | + S_00B848_DX10_CLAMP(ProgInfo.DX10Clamp) | + S_00B848_IEEE_MODE(ProgInfo.DebugMode) | + S_00B848_IEEE_MODE(ProgInfo.IEEEMode); + + ProgInfo.ComputePGMRSrc2 = + S_00B84C_SCRATCH_EN(ProgInfo.ScratchBlocks > 0) | + S_00B84C_USER_SGPR(MFI->NumUserSGPRs) | + S_00B84C_TGID_X_EN(1) | + S_00B84C_TGID_Y_EN(1) | + S_00B84C_TGID_Z_EN(1) | + S_00B84C_TG_SIZE_EN(1) | + S_00B84C_TIDIG_COMP_CNT(2) | + S_00B84C_LDS_SIZE(ProgInfo.LDSBlocks); +} + +static unsigned getRsrcReg(unsigned ShaderType) { + switch (ShaderType) { + default: // Fall through + case ShaderType::COMPUTE: return R_00B848_COMPUTE_PGM_RSRC1; + case ShaderType::GEOMETRY: return R_00B228_SPI_SHADER_PGM_RSRC1_GS; + case ShaderType::PIXEL: return R_00B028_SPI_SHADER_PGM_RSRC1_PS; + case ShaderType::VERTEX: return R_00B128_SPI_SHADER_PGM_RSRC1_VS; + } +} + +void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF, + const SIProgramInfo &KernelInfo) { + const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + unsigned RsrcReg = getRsrcReg(MFI->getShaderType()); if (MFI->getShaderType() == ShaderType::COMPUTE) { OutStreamer.EmitIntValue(R_00B848_COMPUTE_PGM_RSRC1, 4); - const uint32_t ComputePGMRSrc1 = - S_00B848_VGPRS(VGPRBlocks) | - S_00B848_SGPRS(SGPRBlocks) | - S_00B848_PRIORITY(KernelInfo.Priority) | - S_00B848_FLOAT_MODE(KernelInfo.FloatMode) | - S_00B848_PRIV(KernelInfo.Priv) | - S_00B848_DX10_CLAMP(KernelInfo.DX10Clamp) | - 
S_00B848_IEEE_MODE(KernelInfo.DebugMode) | - S_00B848_IEEE_MODE(KernelInfo.IEEEMode); - - OutStreamer.EmitIntValue(ComputePGMRSrc1, 4); + OutStreamer.EmitIntValue(KernelInfo.ComputePGMRSrc1, 4); OutStreamer.EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4); - const uint32_t ComputePGMRSrc2 = - S_00B84C_LDS_SIZE(LDSBlocks) | - S_00B02C_SCRATCH_EN(ScratchBlocks > 0); - - OutStreamer.EmitIntValue(ComputePGMRSrc2, 4); + OutStreamer.EmitIntValue(KernelInfo.ComputePGMRSrc2, 4); OutStreamer.EmitIntValue(R_00B860_COMPUTE_TMPRING_SIZE, 4); - OutStreamer.EmitIntValue(S_00B860_WAVESIZE(ScratchBlocks), 4); + OutStreamer.EmitIntValue(S_00B860_WAVESIZE(KernelInfo.ScratchBlocks), 4); // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 = // 0" comment but I don't see a corresponding field in the register spec. } else { OutStreamer.EmitIntValue(RsrcReg, 4); - OutStreamer.EmitIntValue(S_00B028_VGPRS(VGPRBlocks) | - S_00B028_SGPRS(SGPRBlocks), 4); + OutStreamer.EmitIntValue(S_00B028_VGPRS(KernelInfo.VGPRBlocks) | + S_00B028_SGPRS(KernelInfo.SGPRBlocks), 4); } if (MFI->getShaderType() == ShaderType::PIXEL) { OutStreamer.EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4); - OutStreamer.EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(LDSBlocks), 4); + OutStreamer.EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(KernelInfo.LDSBlocks), 4); OutStreamer.EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4); OutStreamer.EmitIntValue(MFI->PSInputAddr, 4); } diff --git a/llvm/lib/Target/R600/AMDGPUAsmPrinter.h b/llvm/lib/Target/R600/AMDGPUAsmPrinter.h index b9a0767..6fe33c5 100644 --- a/llvm/lib/Target/R600/AMDGPUAsmPrinter.h +++ b/llvm/lib/Target/R600/AMDGPUAsmPrinter.h @@ -24,8 +24,8 @@ class AMDGPUAsmPrinter : public AsmPrinter { private: struct SIProgramInfo { SIProgramInfo() : - NumVGPR(0), - NumSGPR(0), + VGPRBlocks(0), + SGPRBlocks(0), Priority(0), FloatMode(0), Priv(0), @@ -33,13 +33,19 @@ private: DebugMode(0), IEEEMode(0), ScratchSize(0), + ComputePGMRSrc1(0), + LDSBlocks(0), + ScratchBlocks(0), + 
ComputePGMRSrc2(0), + NumVGPR(0), + NumSGPR(0), FlatUsed(false), VCCUsed(false), CodeLen(0) {} // Fields set in PGM_RSRC1 pm4 packet. - uint32_t NumVGPR; - uint32_t NumSGPR; + uint32_t VGPRBlocks; + uint32_t SGPRBlocks; uint32_t Priority; uint32_t FloatMode; uint32_t Priv; @@ -48,6 +54,17 @@ private: uint32_t IEEEMode; uint32_t ScratchSize; + uint64_t ComputePGMRSrc1; + + // Fields set in PGM_RSRC2 pm4 packet. + uint32_t LDSBlocks; + uint32_t ScratchBlocks; + + uint64_t ComputePGMRSrc2; + + uint32_t NumVGPR; + uint32_t NumSGPR; + uint32_t LDSSize; bool FlatUsed; // Bonus information for debugging. diff --git a/llvm/lib/Target/R600/SIDefines.h b/llvm/lib/Target/R600/SIDefines.h index bc44e40..759ed1b 100644 --- a/llvm/lib/Target/R600/SIDefines.h +++ b/llvm/lib/Target/R600/SIDefines.h @@ -71,7 +71,14 @@ namespace SIOutMods { #define S_00B028_VGPRS(x) (((x) & 0x3F) << 0) #define S_00B028_SGPRS(x) (((x) & 0x0F) << 6) #define R_00B84C_COMPUTE_PGM_RSRC2 0x00B84C -#define S_00B02C_SCRATCH_EN(x) (((x) & 0x1) << 0) +#define S_00B84C_SCRATCH_EN(x) (((x) & 0x1) << 0) +#define S_00B84C_USER_SGPR(x) (((x) & 0x1F) << 1) +#define S_00B84C_TGID_X_EN(x) (((x) & 0x1) << 7) +#define S_00B84C_TGID_Y_EN(x) (((x) & 0x1) << 8) +#define S_00B84C_TGID_Z_EN(x) (((x) & 0x1) << 9) +#define S_00B84C_TG_SIZE_EN(x) (((x) & 0x1) << 10) +#define S_00B84C_TIDIG_COMP_CNT(x) (((x) & 0x03) << 11) + #define S_00B84C_LDS_SIZE(x) (((x) & 0x1FF) << 15) #define R_0286CC_SPI_PS_INPUT_ENA 0x0286CC diff --git a/llvm/test/CodeGen/R600/local-memory-two-objects.ll b/llvm/test/CodeGen/R600/local-memory-two-objects.ll index 88ef05d..bd00d50 100644 --- a/llvm/test/CodeGen/R600/local-memory-two-objects.ll +++ b/llvm/test/CodeGen/R600/local-memory-two-objects.ll @@ -11,7 +11,7 @@ ; EG-CHECK: .long 166120 ; EG-CHECK-NEXT: .long 8 ; SI-CHECK: .long 47180 -; SI-CHECK-NEXT: .long 32768 +; SI-CHECK-NEXT: .long 38792 ; We would like to check the the lds writes are using different ; addresses, but due to variations in the 
scheduler, we can't do diff --git a/llvm/test/CodeGen/R600/local-memory.ll b/llvm/test/CodeGen/R600/local-memory.ll index 9b13cb2..b73c23d 100644 --- a/llvm/test/CodeGen/R600/local-memory.ll +++ b/llvm/test/CodeGen/R600/local-memory.ll @@ -10,9 +10,9 @@ ; EG: .long 166120 ; EG-NEXT: .long 128 ; SI: .long 47180 -; SI-NEXT: .long 65536 +; SI-NEXT: .long 71560 ; CI: .long 47180 -; CI-NEXT: .long 32768 +; CI-NEXT: .long 38792 ; EG: LDS_WRITE ; SI-NOT: s_wqm_b64 -- 2.7.4