From 220147d536f35d06c7037185d976ecae9df8f32c Mon Sep 17 00:00:00 2001 From: Pierre van Houtryve Date: Tue, 22 Nov 2022 09:13:57 +0000 Subject: [PATCH] [AMDGPU] Make aperture registers 64 bit Makes the SRC_(SHARED|PRIVATE)_(BASE|LIMIT) registers 64 bit instead of 32. They're still usable as 32 bit operands by using the _LO suffix. Preparation for D137542 Reviewed By: arsenm Differential Revision: https://reviews.llvm.org/D137767 --- .../Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp | 4 ++ .../Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 8 ++++ .../AMDGPU/Disassembler/AMDGPUDisassembler.cpp | 10 +++-- llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 2 +- llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 52 +++++++++++++++++----- 5 files changed, 59 insertions(+), 17 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp index ede2b2b..547ca65 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp @@ -273,9 +273,13 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage( case AMDGPU::M0: case AMDGPU::M0_LO16: case AMDGPU::M0_HI16: + case AMDGPU::SRC_SHARED_BASE_LO: case AMDGPU::SRC_SHARED_BASE: + case AMDGPU::SRC_SHARED_LIMIT_LO: case AMDGPU::SRC_SHARED_LIMIT: + case AMDGPU::SRC_PRIVATE_BASE_LO: case AMDGPU::SRC_PRIVATE_BASE: + case AMDGPU::SRC_PRIVATE_LIMIT_LO: case AMDGPU::SRC_PRIVATE_LIMIT: case AMDGPU::SGPR_NULL: case AMDGPU::SGPR_NULL64: diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 1f7bb1b..882cdf2 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -2338,9 +2338,13 @@ void AMDGPUOperand::addRegOperands(MCInst &Inst, unsigned N) const { static bool isInlineValue(unsigned Reg) { switch (Reg) { + case AMDGPU::SRC_SHARED_BASE_LO: case AMDGPU::SRC_SHARED_BASE: + case 
AMDGPU::SRC_SHARED_LIMIT_LO: case AMDGPU::SRC_SHARED_LIMIT: + case AMDGPU::SRC_PRIVATE_BASE_LO: case AMDGPU::SRC_PRIVATE_BASE: + case AMDGPU::SRC_PRIVATE_LIMIT_LO: case AMDGPU::SRC_PRIVATE_LIMIT: case AMDGPU::SRC_POPS_EXITING_WAVE_ID: return true; @@ -5737,9 +5741,13 @@ bool AMDGPUAsmParser::subtargetHasRegister(const MCRegisterInfo &MRI, return hasSGPR104_SGPR105(); switch (RegNo) { + case AMDGPU::SRC_SHARED_BASE_LO: case AMDGPU::SRC_SHARED_BASE: + case AMDGPU::SRC_SHARED_LIMIT_LO: case AMDGPU::SRC_SHARED_LIMIT: + case AMDGPU::SRC_PRIVATE_BASE_LO: case AMDGPU::SRC_PRIVATE_BASE: + case AMDGPU::SRC_PRIVATE_LIMIT_LO: case AMDGPU::SRC_PRIVATE_LIMIT: return isGFX9Plus(); case AMDGPU::SRC_POPS_EXITING_WAVE_ID: diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index 9b9d405..3e189e2 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -1636,6 +1636,7 @@ MCOperand AMDGPUDisassembler::decodeSpecialReg32(unsigned Val) const { using namespace AMDGPU; switch (Val) { + // clang-format off case 102: return createRegOperand(FLAT_SCR_LO); case 103: return createRegOperand(FLAT_SCR_HI); case 104: return createRegOperand(XNACK_MASK_LO); @@ -1652,16 +1653,17 @@ MCOperand AMDGPUDisassembler::decodeSpecialReg32(unsigned Val) const { return isGFX11Plus() ? 
createRegOperand(M0) : createRegOperand(SGPR_NULL); case 126: return createRegOperand(EXEC_LO); case 127: return createRegOperand(EXEC_HI); - case 235: return createRegOperand(SRC_SHARED_BASE); - case 236: return createRegOperand(SRC_SHARED_LIMIT); - case 237: return createRegOperand(SRC_PRIVATE_BASE); - case 238: return createRegOperand(SRC_PRIVATE_LIMIT); + case 235: return createRegOperand(SRC_SHARED_BASE_LO); + case 236: return createRegOperand(SRC_SHARED_LIMIT_LO); + case 237: return createRegOperand(SRC_PRIVATE_BASE_LO); + case 238: return createRegOperand(SRC_PRIVATE_LIMIT_LO); case 239: return createRegOperand(SRC_POPS_EXITING_WAVE_ID); case 251: return createRegOperand(SRC_VCCZ); case 252: return createRegOperand(SRC_EXECZ); case 253: return createRegOperand(SRC_SCC); case 254: return createRegOperand(LDS_DIRECT); default: break; + // clang-format on } return errOperand(Val, "unknown operand encoding " + Twine(Val)); } diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 354e40d..0f10ace 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -563,7 +563,7 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { reserveRegisterTuples(Reserved, AMDGPU::SRC_EXECZ); reserveRegisterTuples(Reserved, AMDGPU::SRC_SCC); - // Reserve the memory aperture registers. 
+ // Reserve the memory aperture registers reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_BASE); reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT); reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_BASE); diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index fae76be..b1e8761 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -232,12 +232,36 @@ def SGPR_NULL64 : let isConstant = true; } -let isConstant = true in { -defm SRC_SHARED_BASE : SIRegLoHi16<"src_shared_base", 235>; -defm SRC_SHARED_LIMIT : SIRegLoHi16<"src_shared_limit", 236>; -defm SRC_PRIVATE_BASE : SIRegLoHi16<"src_private_base", 237>; -defm SRC_PRIVATE_LIMIT : SIRegLoHi16<"src_private_limit", 238>; -} // isConstant = true +// Aperture registers are 64 bit registers with a LO/HI 32 bit. +// HI 32 bit cannot be used, and LO 32 is used by instructions +// with 32 bit sources. +// +// Note that the low 32 bits are essentially useless as they +// don't contain the lower 32 bits of the address - they are in +// the high 32 bits. The lower 32 bits are always zero (for base) or +// -1 (for limit). Since we cannot access the high 32 bits, when we +// need them, we need to do a 64 bit load and extract the bits manually. +multiclass ApertureRegister<string name, bits<16> regIdx> { + let isConstant = true in { + // FIXME: We shouldn't need to define subregisters for these (nor add them to any 16 bit + // register classes), but if we don't it seems to confuse the TableGen + // backend and we end up with a lot of weird register pressure sets and classes.
+ defm _LO : SIRegLoHi16 <name, regIdx>; + defm _HI : SIRegLoHi16 <"", regIdx>; + + def "" : RegisterWithSubRegs<name, [!cast<Register>(NAME#_LO), !cast<Register>(NAME#_HI)]> { + let Namespace = "AMDGPU"; + let SubRegIndices = [sub0, sub1]; + let HWEncoding = !cast<Register>(NAME#_LO).HWEncoding; + } + } // isConstant = true +} + +defm SRC_SHARED_BASE : ApertureRegister<"src_shared_base", 235>; +defm SRC_SHARED_LIMIT : ApertureRegister<"src_shared_limit", 236>; +defm SRC_PRIVATE_BASE : ApertureRegister<"src_private_base", 237>; +defm SRC_PRIVATE_LIMIT : ApertureRegister<"src_private_limit", 238>; + defm SRC_POPS_EXITING_WAVE_ID : SIRegLoHi16<"src_pops_exiting_wave_id", 239>; // Not addressable @@ -664,8 +688,9 @@ let GeneratePressureSet = 0, HasSGPR = 1 in { // See comments in SIInstructions.td for more info. def SReg_32_XM0_XEXEC : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32, (add SGPR_32, VCC_LO, VCC_HI, FLAT_SCR_LO, FLAT_SCR_HI, XNACK_MASK_LO, XNACK_MASK_HI, - SGPR_NULL, SGPR_NULL_HI, TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI, SRC_SHARED_BASE, - SRC_SHARED_LIMIT, SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT, SRC_POPS_EXITING_WAVE_ID, + SGPR_NULL, SGPR_NULL_HI, TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI, SRC_SHARED_BASE_LO, + SRC_SHARED_LIMIT_LO, SRC_PRIVATE_BASE_LO, SRC_PRIVATE_LIMIT_LO, SRC_SHARED_BASE_HI, + SRC_SHARED_LIMIT_HI, SRC_PRIVATE_BASE_HI, SRC_PRIVATE_LIMIT_HI, SRC_POPS_EXITING_WAVE_ID, SRC_VCCZ, SRC_EXECZ, SRC_SCC)> { let AllocationPriority = 0; } @@ -673,9 +698,11 @@ def SReg_32_XM0_XEXEC : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2 def SReg_LO16_XM0_XEXEC : SIRegisterClass<"AMDGPU", [i16, f16], 16, (add SGPR_LO16, VCC_LO_LO16, VCC_HI_LO16, FLAT_SCR_LO_LO16, FLAT_SCR_HI_LO16, XNACK_MASK_LO_LO16, XNACK_MASK_HI_LO16, SGPR_NULL_LO16, SGPR_NULL_HI_LO16, TTMP_LO16, - TMA_LO_LO16, TMA_HI_LO16, TBA_LO_LO16, TBA_HI_LO16, SRC_SHARED_BASE_LO16, - SRC_SHARED_LIMIT_LO16, SRC_PRIVATE_BASE_LO16, SRC_PRIVATE_LIMIT_LO16, - SRC_POPS_EXITING_WAVE_ID_LO16, SRC_VCCZ_LO16, SRC_EXECZ_LO16, 
SRC_SCC_LO16)> { + TMA_LO_LO16, TMA_HI_LO16, TBA_LO_LO16, TBA_HI_LO16, SRC_SHARED_BASE_LO_LO16, + SRC_SHARED_LIMIT_LO_LO16, SRC_PRIVATE_BASE_LO_LO16, SRC_PRIVATE_LIMIT_LO_LO16, + SRC_SHARED_BASE_HI_LO16, SRC_SHARED_LIMIT_HI_LO16, SRC_PRIVATE_BASE_HI_LO16, + SRC_PRIVATE_LIMIT_HI_LO16, SRC_POPS_EXITING_WAVE_ID_LO16, SRC_VCCZ_LO16, + SRC_EXECZ_LO16, SRC_SCC_LO16)> { let Size = 16; let AllocationPriority = 0; } @@ -737,7 +764,8 @@ def TTMP_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16], 32, } def SReg_64_XEXEC : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16], 32, - (add SGPR_64, VCC, FLAT_SCR, XNACK_MASK, SGPR_NULL64, TTMP_64, TBA, TMA)> { + (add SGPR_64, VCC, FLAT_SCR, XNACK_MASK, SGPR_NULL64, SRC_SHARED_BASE, + SRC_SHARED_LIMIT, SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT, TTMP_64, TBA, TMA)> { let CopyCost = 1; let AllocationPriority = 1; let HasSGPR = 1; -- 2.7.4