From 77480556c41fbca36b918323c69cb77f8e02b56c Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Wed, 4 May 2022 16:33:32 +0100 Subject: [PATCH] [RegAllocGreedy] New hook regClassPriorityTrumpsGlobalness Add a new TargetRegisterInfo hook to allow targets to tweak the priority of live ranges, so that AllocationPriority of the register class will be treated as more important than whether the range is local to a basic block or global. This is determined per-MachineFunction. Differential Revision: https://reviews.llvm.org/D125102 --- llvm/include/llvm/CodeGen/TargetRegisterInfo.h | 10 +++++ llvm/include/llvm/Target/Target.td | 2 + llvm/lib/CodeGen/RegAllocGreedy.cpp | 20 ++++++++- llvm/lib/CodeGen/RegAllocGreedy.h | 4 ++ .../CodeGen/AMDGPU/greedy-liverange-priority.mir | 48 ++++++++++++++++++++++ 5 files changed, 82 insertions(+), 2 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/greedy-liverange-priority.mir diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h index 1e316f0..04369a5 100644 --- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h +++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h @@ -55,6 +55,8 @@ public: const LaneBitmask LaneMask; /// Classes with a higher priority value are assigned first by register /// allocators using a greedy heuristic. The value is in the range [0,63]. + /// Values >= 32 should be used with care since they may overlap with other + /// fields in the allocator's priority heuristics. const uint8_t AllocationPriority; /// Configurable target specific flags. const uint8_t TSFlags; @@ -1076,6 +1078,14 @@ public: return false; } + /// When prioritizing live ranges in register allocation, if this hook returns + /// true then the AllocationPriority of the register class will be treated as + /// more important than whether the range is local to a basic block or global. 
+ virtual bool + regClassPriorityTrumpsGlobalness(const MachineFunction &MF) const { + return false; + } + //===--------------------------------------------------------------------===// /// Debug information queries. diff --git a/llvm/include/llvm/Target/Target.td b/llvm/include/llvm/Target/Target.td index 7868106..c5b2462 100644 --- a/llvm/include/llvm/Target/Target.td +++ b/llvm/include/llvm/Target/Target.td @@ -279,6 +279,8 @@ class RegisterClass regTypes, int alignment, // heuristic. Classes with higher priority values are assigned first. This is // useful as it is sometimes beneficial to assign registers to highly // constrained classes first. The value has to be in the range [0,63]. + // Values >= 32 should be used with care since they may overlap with other + // fields in the allocator's priority heuristics. int AllocationPriority = 0; // Generate register pressure set for this register class and any class diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp index 31721da..0bcd2c5 100644 --- a/llvm/lib/CodeGen/RegAllocGreedy.cpp +++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp @@ -128,6 +128,13 @@ static cl::opt GrowRegionComplexityBudget( "limit its budget and bail out once we reach the limit."), cl::init(10000), cl::Hidden); +static cl::opt GreedyRegClassPriorityTrumpsGlobalness( + "greedy-regclass-priority-trumps-globalness", + cl::desc("Change the greedy register allocator's live range priority " + "calculation to make the AllocationPriority of the register class " + "more important than whether the range is global"), + cl::Hidden); + static RegisterRegAlloc greedyRegAlloc("greedy", "greedy register allocator", createGreedyRegisterAllocator); @@ -305,6 +312,7 @@ void RAGreedy::enqueue(PQueue &CurQueue, const LiveInterval *LI) { const TargetRegisterClass &RC = *MRI->getRegClass(Reg); bool ForceGlobal = !ReverseLocal && (Size / SlotIndex::InstrDist) > (2 * RCI.getNumAllocatableRegs(&RC)); + unsigned GlobalBit = 0; if (Stage ==
RS_Assign && !ForceGlobal && !LI->empty() && LIS->intervalIsInOneMBB(*LI)) { @@ -323,9 +331,13 @@ void RAGreedy::enqueue(PQueue &CurQueue, const LiveInterval *LI) { // Allocate global and split ranges in long->short order. Long ranges that // don't fit should be spilled (or split) ASAP so they don't create // interference. Mark a bit to prioritize global above local ranges. - Prio = (1u << 29) + Size; + Prio = Size; + GlobalBit = 1; } - Prio |= RC.AllocationPriority << 24; + if (RegClassPriorityTrumpsGlobalness) + Prio |= RC.AllocationPriority << 25 | GlobalBit << 24; + else + Prio |= GlobalBit << 29 | RC.AllocationPriority << 24; // Mark a higher bit to prioritize global and local above RS_Split. Prio |= (1u << 31); @@ -2692,6 +2704,10 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) { initializeCSRCost(); RegCosts = TRI->getRegisterCosts(*MF); + RegClassPriorityTrumpsGlobalness = + GreedyRegClassPriorityTrumpsGlobalness.getNumOccurrences() + ? GreedyRegClassPriorityTrumpsGlobalness + : TRI->regClassPriorityTrumpsGlobalness(*MF); ExtraInfo.emplace(); EvictAdvisor = diff --git a/llvm/lib/CodeGen/RegAllocGreedy.h b/llvm/lib/CodeGen/RegAllocGreedy.h index ef4bb90..ad810c2 100644 --- a/llvm/lib/CodeGen/RegAllocGreedy.h +++ b/llvm/lib/CodeGen/RegAllocGreedy.h @@ -322,6 +322,10 @@ private: /// Function ArrayRef RegCosts; + /// Flags for the live range priority calculation, determined once per + /// machine function. 
+ bool RegClassPriorityTrumpsGlobalness; + public: RAGreedy(const RegClassFilterFunc F = allocateAllRegClasses); diff --git a/llvm/test/CodeGen/AMDGPU/greedy-liverange-priority.mir b/llvm/test/CodeGen/AMDGPU/greedy-liverange-priority.mir new file mode 100644 index 0000000..01cda71 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/greedy-liverange-priority.mir @@ -0,0 +1,48 @@ +# RUN: llc -march=amdgcn -mcpu=gfx1030 -greedy-regclass-priority-trumps-globalness=0 -start-before greedy -o - %s | FileCheck %s -check-prefix=OLD +# RUN: llc -march=amdgcn -mcpu=gfx1030 -greedy-regclass-priority-trumps-globalness=1 -start-before greedy -o - %s | FileCheck %s -check-prefix=NEW + +# At the time of writing -greedy-regclass-priority-trumps-globalness makes a +# significant improvement in the total number of vgprs needed to compile this +# test, from 11 down to 7. + +# OLD: NumVgprs: 11{{$}} +# NEW: NumVgprs: 7{{$}} + +--- +name: _amdgpu_cs_main +tracksRegLiveness: true +body: | + bb.0: + successors: %bb.1, %bb.2 + liveins: $vgpr0, $vgpr6 + + %6:vgpr_32 = COPY $vgpr6 + undef %30.sub0:vreg_128 = COPY $vgpr0 + undef %27.sub0:vreg_128 = V_MED3_F32_e64 0, 0, 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + undef %16.sub0:sgpr_256 = S_MOV_B32 0 + undef %26.sub1:vreg_64 = V_LSHRREV_B32_e32 1, %6, implicit $exec + %27.sub1:vreg_128 = COPY %27.sub0 + %27.sub2:vreg_128 = COPY %27.sub0 + %27.sub3:vreg_128 = COPY %27.sub0 + %26.sub0:vreg_64 = V_MOV_B32_e32 1, implicit $exec + %16.sub1:sgpr_256 = COPY %16.sub0 + %16.sub2:sgpr_256 = COPY %16.sub0 + %16.sub3:sgpr_256 = COPY %16.sub0 + %16.sub4:sgpr_256 = COPY %16.sub0 + %16.sub5:sgpr_256 = COPY %16.sub0 + %16.sub6:sgpr_256 = COPY %16.sub0 + %16.sub7:sgpr_256 = COPY %16.sub0 + IMAGE_STORE_V4_V2_gfx10 %27, %26, %16, 0, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s32) into custom "ImageResource") + S_CBRANCH_SCC1 %bb.2, implicit undef $scc + S_BRANCH %bb.1 + + bb.1: + %30.sub1:vreg_128 = V_MOV_B32_e32 0, implicit $exec + 
%30.sub2:vreg_128 = COPY %30.sub1 + %30.sub3:vreg_128 = COPY %30.sub1 + %26.sub1:vreg_64 = COPY %30.sub1 + IMAGE_STORE_V4_V2_gfx10 %30, %26, %16, 0, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s32) into custom "ImageResource") + + bb.2: + S_ENDPGM 0 +... -- 2.7.4