From 738c73a454881ca78214816754c1b82941d0cd26 Mon Sep 17 00:00:00 2001
From: Matt Arsenault
Date: Mon, 31 Aug 2020 15:09:50 -0400
Subject: [PATCH] RegAllocFast: Make self loop live-out heuristic more
 aggressive

This currently has no impact on generated code, but prevents sizeable code
size regressions after D52010. It avoids spilling and reloading every value
inside a block that loops back to itself. Add a baseline test which would
regress without this patch.
---
 llvm/lib/CodeGen/RegAllocFast.cpp                  |  37 ++++-
 .../AMDGPU/fastregalloc-self-loop-heuristic.mir    | 185 +++++++++++++++++++++
 2 files changed, 218 insertions(+), 4 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/fastregalloc-self-loop-heuristic.mir

diff --git a/llvm/lib/CodeGen/RegAllocFast.cpp b/llvm/lib/CodeGen/RegAllocFast.cpp
index d93fd8f..db1b904 100644
--- a/llvm/lib/CodeGen/RegAllocFast.cpp
+++ b/llvm/lib/CodeGen/RegAllocFast.cpp
@@ -263,6 +263,20 @@ int RegAllocFast::getStackSpaceFor(Register VirtReg) {
   return FrameIdx;
 }
 
+static bool dominates(MachineBasicBlock &MBB,
+                      MachineBasicBlock::const_iterator A,
+                      MachineBasicBlock::const_iterator B) {
+  auto MBBEnd = MBB.end();
+  if (B == MBBEnd)
+    return true;
+
+  MachineBasicBlock::const_iterator I = MBB.begin();
+  for (; &*I != A && &*I != B; ++I)
+    ;
+
+  return &*I == A;
+}
+
 /// Returns false if \p VirtReg is known to not live out of the current block.
 bool RegAllocFast::mayLiveOut(Register VirtReg) {
   if (MayLiveAcrossBlocks.test(Register::virtReg2Index(VirtReg))) {
@@ -270,11 +284,16 @@ bool RegAllocFast::mayLiveOut(Register VirtReg) {
     return !MBB->succ_empty();
   }
 
-  // If this block loops back to itself, it would be necessary to check whether
-  // the use comes after the def.
+  const MachineInstr *SelfLoopDef = nullptr;
+
+  // If this block loops back to itself, it is necessary to check whether the
+  // use comes after the def.
   if (MBB->isSuccessor(MBB)) {
-    MayLiveAcrossBlocks.set(Register::virtReg2Index(VirtReg));
-    return true;
+    SelfLoopDef = MRI->getUniqueVRegDef(VirtReg);
+    if (!SelfLoopDef) {
+      MayLiveAcrossBlocks.set(Register::virtReg2Index(VirtReg));
+      return true;
+    }
   }
 
   // See if the first \p Limit uses of the register are all in the current
@@ -287,6 +306,16 @@ bool RegAllocFast::mayLiveOut(Register VirtReg) {
       // Cannot be live-out if there are no successors.
       return !MBB->succ_empty();
     }
+
+    if (SelfLoopDef) {
+      // Try to handle some simple cases to avoid spilling and reloading every
+      // value inside a self looping block.
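+      // A use on the defining instruction itself, or a use that the unique
+      // def does not come before within this block, reads the value from the
+      // previous iteration, so the register is treated as live-out.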
+      if (SelfLoopDef == &UseInst ||
+          !dominates(*MBB, SelfLoopDef->getIterator(), UseInst.getIterator())) {
+        MayLiveAcrossBlocks.set(Register::virtReg2Index(VirtReg));
+        return true;
+      }
+    }
   }
 
   return false;
diff --git a/llvm/test/CodeGen/AMDGPU/fastregalloc-self-loop-heuristic.mir b/llvm/test/CodeGen/AMDGPU/fastregalloc-self-loop-heuristic.mir
new file mode 100644
index 0000000..32de262
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fastregalloc-self-loop-heuristic.mir
@@ -0,0 +1,185 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=regallocfast -o - %s | FileCheck -check-prefix=GCN %s
+
+---
+name: self_loop_single_def_use
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
+  stackPtrOffsetReg: '$sgpr32'
+body: |
+  ; GCN-LABEL: name: self_loop_single_def_use
+  ; GCN: bb.0:
+  ; GCN: successors: %bb.1(0x80000000)
+  ; GCN: liveins: $vgpr0_vgpr1
+  ; GCN: SI_SPILL_V64_SAVE killed $vgpr0_vgpr1, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 8 into %stack.0, align 4, addrspace 5)
+  ; GCN: bb.1:
+  ; GCN: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GCN: $vgpr0_vgpr1 = SI_SPILL_V64_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (load 8 from %stack.0, align 4, addrspace 5)
+  ; GCN: renamable $vgpr2 = GLOBAL_LOAD_DWORD renamable $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec
+  ; GCN: GLOBAL_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec
+  ; GCN: S_CBRANCH_EXECZ %bb.1, implicit $exec
+  ; GCN: bb.2:
+  ; GCN: S_ENDPGM 0
+  bb.0:
+    liveins: $vgpr0_vgpr1
+    %0:vreg_64 = COPY $vgpr0_vgpr1
+
+  bb.1:
+    %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD %0, %1, 0, 0, 0, 0, implicit $exec
+    S_CBRANCH_EXECZ %bb.1, implicit $exec
+
+  bb.2:
+    S_ENDPGM 0
+
+...
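+
+# %1 is defined twice inside the loop, so MRI->getUniqueVRegDef returns null
+# and the self-loop heuristic conservatively treats the value as live across
+# the backedge.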
+
+---
+name: self_loop_multi_def
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
+  stackPtrOffsetReg: '$sgpr32'
+body: |
+  ; GCN-LABEL: name: self_loop_multi_def
+  ; GCN: bb.0:
+  ; GCN: successors: %bb.1(0x80000000)
+  ; GCN: liveins: $vgpr0_vgpr1
+  ; GCN: SI_SPILL_V64_SAVE killed $vgpr0_vgpr1, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 8 into %stack.0, align 4, addrspace 5)
+  ; GCN: bb.1:
+  ; GCN: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GCN: $vgpr0_vgpr1 = SI_SPILL_V64_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (load 8 from %stack.0, align 4, addrspace 5)
+  ; GCN: renamable $vgpr2 = GLOBAL_LOAD_DWORD renamable $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec
+  ; GCN: GLOBAL_STORE_DWORD renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec
+  ; GCN: renamable $vgpr2 = GLOBAL_LOAD_DWORD renamable $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec
+  ; GCN: GLOBAL_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec
+  ; GCN: S_CBRANCH_EXECZ %bb.1, implicit $exec
+  ; GCN: bb.2:
+  ; GCN: S_ENDPGM 0
+  bb.0:
+    liveins: $vgpr0_vgpr1
+    %0:vreg_64 = COPY $vgpr0_vgpr1
+
+  bb.1:
+    %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD %0, %1, 0, 0, 0, 0, implicit $exec
+    %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD %0, %1, 0, 0, 0, 0, implicit $exec
+    S_CBRANCH_EXECZ %bb.1, implicit $exec
+
+  bb.2:
+    S_ENDPGM 0
+
+...
+
+# There's a single def inside the self loop, but it's also a use.
+
+---
+name: self_loop_def_use_same_inst
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
+  stackPtrOffsetReg: '$sgpr32'
+body: |
+  ; GCN-LABEL: name: self_loop_def_use_same_inst
+  ; GCN: bb.0:
+  ; GCN: successors: %bb.1(0x80000000)
+  ; GCN: liveins: $vgpr0_vgpr1
+  ; GCN: SI_SPILL_V64_SAVE killed $vgpr0_vgpr1, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 8 into %stack.0, align 4, addrspace 5)
+  ; GCN: bb.1:
+  ; GCN: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GCN: renamable $vgpr0 = V_ADD_U32_e32 1, undef $vgpr0, implicit $exec
+  ; GCN: $vgpr1_vgpr2 = SI_SPILL_V64_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (load 8 from %stack.0, align 4, addrspace 5)
+  ; GCN: GLOBAL_STORE_DWORD killed renamable $vgpr1_vgpr2, killed renamable $vgpr0, 0, 0, 0, 0, implicit $exec
+  ; GCN: S_CBRANCH_EXECZ %bb.1, implicit $exec
+  ; GCN: bb.2:
+  ; GCN: S_ENDPGM 0
+  bb.0:
+    liveins: $vgpr0_vgpr1
+    %0:vreg_64 = COPY $vgpr0_vgpr1
+
+  bb.1:
+    %1:vgpr_32 = V_ADD_U32_e32 1, undef %1, implicit $exec
+    GLOBAL_STORE_DWORD %0, %1, 0, 0, 0, 0, implicit $exec
+    S_CBRANCH_EXECZ %bb.1, implicit $exec
+
+  bb.2:
+    S_ENDPGM 0
+
+...
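+
+# The use of %1 comes before its def inside the loop, so it reads the value
+# from the previous iteration; the heuristic must treat it as live across the
+# backedge and the spill is kept.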
+
+---
+name: self_loop_def_after_use
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
+  stackPtrOffsetReg: '$sgpr32'
+body: |
+  ; GCN-LABEL: name: self_loop_def_after_use
+  ; GCN: bb.0:
+  ; GCN: successors: %bb.1(0x80000000)
+  ; GCN: liveins: $vgpr0_vgpr1
+  ; GCN: SI_SPILL_V64_SAVE killed $vgpr0_vgpr1, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 8 into %stack.0, align 4, addrspace 5)
+  ; GCN: bb.1:
+  ; GCN: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GCN: $vgpr0_vgpr1 = SI_SPILL_V64_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (load 8 from %stack.0, align 4, addrspace 5)
+  ; GCN: GLOBAL_STORE_DWORD killed renamable $vgpr0_vgpr1, undef renamable $vgpr0, 0, 0, 0, 0, implicit $exec
+  ; GCN: renamable $vgpr2 = V_ADD_U32_e64 1, 1, 0, implicit $exec
+  ; GCN: SI_SPILL_V32_SAVE killed $vgpr2, %stack.1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 4 into %stack.1, addrspace 5)
+  ; GCN: S_CBRANCH_EXECZ %bb.1, implicit $exec
+  ; GCN: bb.2:
+  ; GCN: S_ENDPGM 0
+  bb.0:
+    liveins: $vgpr0_vgpr1
+    %0:vreg_64 = COPY $vgpr0_vgpr1
+
+  bb.1:
+    GLOBAL_STORE_DWORD %0, undef %1, 0, 0, 0, 0, implicit $exec
+    %1:vgpr_32 = V_ADD_U32_e64 1, 1, 0, implicit $exec
+    S_CBRANCH_EXECZ %bb.1, implicit $exec
+
+  bb.2:
+    S_ENDPGM 0
+
+...
+
+---
+name: self_loop_single_subreg_def_use
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
+  stackPtrOffsetReg: '$sgpr32'
+body: |
+  ; GCN-LABEL: name: self_loop_single_subreg_def_use
+  ; GCN: bb.0:
+  ; GCN: successors: %bb.1(0x80000000)
+  ; GCN: liveins: $vgpr0_vgpr1
+  ; GCN: SI_SPILL_V64_SAVE killed $vgpr0_vgpr1, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 8 into %stack.0, align 4, addrspace 5)
+  ; GCN: bb.1:
+  ; GCN: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GCN: $vgpr0_vgpr1 = SI_SPILL_V64_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (load 8 from %stack.0, align 4, addrspace 5)
+  ; GCN: undef renamable $vgpr3 = GLOBAL_LOAD_DWORD renamable $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr2_vgpr3
+  ; GCN: GLOBAL_STORE_DWORD killed renamable $vgpr0_vgpr1, undef renamable $vgpr3, 0, 0, 0, 0, implicit $exec
+  ; GCN: SI_SPILL_V64_SAVE killed $vgpr2_vgpr3, %stack.1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 8 into %stack.1, align 4, addrspace 5)
+  ; GCN: S_CBRANCH_EXECZ %bb.1, implicit $exec
+  ; GCN: bb.2:
+  ; GCN: S_ENDPGM 0
+  bb.0:
+    liveins: $vgpr0_vgpr1
+    %0:vreg_64 = COPY $vgpr0_vgpr1
+
+  bb.1:
+    undef %1.sub1:vreg_64 = GLOBAL_LOAD_DWORD %0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD %0, undef %1.sub1, 0, 0, 0, 0, implicit $exec
+    S_CBRANCH_EXECZ %bb.1, implicit $exec
+
+  bb.2:
+    S_ENDPGM 0
+
+...
--
2.7.4
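
Note: below is a minimal standalone sketch of the ordering test the new
dominates() helper performs, assuming plain STL iterators instead of
MachineBasicBlock's instruction list. comesBefore, Block, Def and Use are
hypothetical names for illustration only, not part of the patch.

#include <cassert>
#include <iterator>
#include <list>

// Scan forward from the start of the "block"; whichever of A or B is reached
// first comes earlier. A "use" at end() is trivially preceded by any def.
template <typename It>
static bool comesBefore(It Begin, It End, It A, It B) {
  if (B == End)
    return true;
  for (It I = Begin; I != End; ++I) {
    if (I == A)
      return true;  // the def was reached first: it precedes the use
    if (I == B)
      return false; // the use was reached first: the def does not precede it
  }
  return false;
}

int main() {
  std::list<int> Block{0, 1, 2, 3};       // stand-ins for instructions
  auto Def = std::next(Block.begin());    // "def" at position 1
  auto Use = std::next(Block.begin(), 3); // "use" at position 3
  assert(comesBefore(Block.begin(), Block.end(), Def, Use));
  assert(!comesBefore(Block.begin(), Block.end(), Use, Def));
  return 0;
}

Like the patch's helper, this walks forward from the block's start and stops
at whichever iterator it meets first, so the check is linear in the block
length and needs no precomputed instruction numbering.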