From d9e0a2942ac71327166a3a597e8383192fd19b17 Mon Sep 17 00:00:00 2001
From: Matt Arsenault
Date: Wed, 30 Oct 2019 12:56:24 -0700
Subject: [PATCH] AMDGPU: Disallow spill folding with m0 copies

readlane and writelane instructions are not allowed to use m0 as the data
operand, so spilling m0 is tricky and would require copying through an
intermediate SGPR. Constrain the virtual register class in this case to
prevent the inline spiller from folding the m0 operand directly into the
spill instruction.

I copied this hack from AArch64, which has the same problem for $sp.
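
For illustration (a sketch based on the new fold-reload-into-m0.mir test;
exact opcodes depend on the subtarget), without the constraint the spiller
could try to fold the copy away and spill m0 directly, producing a writelane
with m0 as its data operand, which is not allowed:

  $vgpr0 = V_WRITELANE_B32 $m0, 0, undef $vgpr0

With the virtual register constrained to SReg_32_XM0, the copy through a
numbered SGPR survives and that SGPR is what gets spilled:

  $sgpr0 = S_MOV_B32 $m0
  $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, undef $vgpr0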
---
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp           | 35 ++++++++++++++
 llvm/lib/Target/AMDGPU/SIInstrInfo.h             |  7 +++
 llvm/test/CodeGen/AMDGPU/fold-reload-into-m0.mir | 58 ++++++++++++++++++++++++
 3 files changed, 100 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/fold-reload-into-m0.mir

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 272a7fc..fee2d72 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1062,6 +1062,7 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
 
   if (RI.isSGPRClass(RC)) {
     MFI->setHasSpilledSGPRs();
+    assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled");
 
     // We are only allowed to create one new instruction when spilling
     // registers, so we need to use pseudo instruction for spilling SGPRs.
@@ -1190,6 +1191,7 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
 
   if (RI.isSGPRClass(RC)) {
     MFI->setHasSpilledSGPRs();
+    assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into");
 
     // FIXME: Maybe this should not include a memoperand because it will be
     // lowered to non-memory instructions.
@@ -6558,3 +6560,36 @@ MachineInstr *SIInstrInfo::createPHISourceCopy(
 }
 
 bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); }
+
+MachineInstr *SIInstrInfo::foldMemoryOperandImpl(
+    MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
+    MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS,
+    VirtRegMap *VRM) const {
+  // This is a bit of a hack (copied from AArch64). Consider this instruction:
+  //
+  //   %0:sreg_32 = COPY $m0
+  //
+  // We explicitly chose SReg_32 for the virtual register so such a copy might
+  // be eliminated by RegisterCoalescer. However, that may not be possible, and
+  // %0 may even spill. We can't spill $m0 normally (it would require copying to
+  // a numbered SGPR anyway), and since it is in the SReg_32 register class,
+  // TargetInstrInfo::foldMemoryOperand() is going to try.
+  //
+  // To prevent that, constrain the %0 register class here.
+  if (MI.isFullCopy()) {
+    Register DstReg = MI.getOperand(0).getReg();
+    Register SrcReg = MI.getOperand(1).getReg();
+
+    if (DstReg == AMDGPU::M0 && SrcReg.isVirtual()) {
+      MF.getRegInfo().constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass);
+      return nullptr;
+    }
+
+    if (SrcReg == AMDGPU::M0 && DstReg.isVirtual()) {
+      MF.getRegInfo().constrainRegClass(DstReg, &AMDGPU::SReg_32_XM0RegClass);
+      return nullptr;
+    }
+  }
+
+  return nullptr;
+}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index be46344..30c2f74 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1027,6 +1027,13 @@ public:
   }
 
   void fixImplicitOperands(MachineInstr &MI) const;
+
+  MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
+                                      ArrayRef<unsigned> Ops,
+                                      MachineBasicBlock::iterator InsertPt,
+                                      int FrameIndex,
+                                      LiveIntervals *LIS = nullptr,
+                                      VirtRegMap *VRM = nullptr) const override;
 };
 
 /// \brief Returns true if a reg:subreg pair P has a TRC class
diff --git a/llvm/test/CodeGen/AMDGPU/fold-reload-into-m0.mir b/llvm/test/CodeGen/AMDGPU/fold-reload-into-m0.mir
new file mode 100644
index 0000000..a4dab19
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fold-reload-into-m0.mir
@@ -0,0 +1,58 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -stress-regalloc=2 -start-before=greedy -stop-after=virtregmap -o - %s | FileCheck %s
+
+# Test that a spill of a copy of m0 is not folded to be a spill of m0 directly.
+
+---
+
+name: merge_sgpr_spill_into_copy_from_m0
+tracksRegLiveness: true
+body: |
+  bb.0:
+
+    ; CHECK-LABEL: name: merge_sgpr_spill_into_copy_from_m0
+    ; CHECK: liveins: $vgpr0
+    ; CHECK: S_WAITCNT 0
+    ; CHECK: S_NOP 0, implicit-def $m0
+    ; CHECK: $sgpr0 = S_MOV_B32 $m0
+    ; CHECK: $vgpr0 = V_WRITELANE_B32_vi killed $sgpr0, 0, undef $vgpr0
+    ; CHECK: $sgpr0 = V_READLANE_B32_vi $vgpr0, 0
+    ; CHECK: S_NOP 0, implicit-def dead renamable $sgpr1, implicit-def dead renamable $sgpr0, implicit killed renamable $sgpr0
+    ; CHECK: $sgpr0 = V_READLANE_B32_vi killed $vgpr0, 0
+    ; CHECK: $m0 = S_MOV_B32 killed $sgpr0
+    ; CHECK: S_NOP 0
+    ; CHECK: S_SENDMSG 0, implicit $m0, implicit $exec
+    S_NOP 0, implicit-def $m0
+    %0:sreg_32 = COPY $m0
+    S_NOP 0, implicit-def %1:sreg_32, implicit-def %2:sreg_32, implicit %0
+    $m0 = COPY %0
+    S_SENDMSG 0, implicit $m0, implicit $exec
+
+...
+
+# Test that a reload into a copy of m0 is not folded to be a reload of m0 directly.
+
+---
+
+name: reload_sgpr_spill_into_copy_to_m0
+tracksRegLiveness: true
+body: |
+  bb.0:
+
+    ; CHECK-LABEL: name: reload_sgpr_spill_into_copy_to_m0
+    ; CHECK: liveins: $vgpr0
+    ; CHECK: S_WAITCNT 0
+    ; CHECK: S_NOP 0, implicit-def renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def $m0
+    ; CHECK: $vgpr0 = V_WRITELANE_B32_vi killed $sgpr0, 0, undef $vgpr0
+    ; CHECK: $sgpr0 = V_READLANE_B32_vi $vgpr0, 0
+    ; CHECK: S_NOP 0, implicit killed renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def dead renamable $sgpr0
+    ; CHECK: $sgpr0 = V_READLANE_B32_vi killed $vgpr0, 0
+    ; CHECK: $m0 = S_MOV_B32 killed $sgpr0
+    ; CHECK: S_NOP 0
+    ; CHECK: S_SENDMSG 0, implicit $m0, implicit $exec
+    S_NOP 0, implicit-def %0:sreg_32, implicit-def %1:sreg_32, implicit-def $m0
+    S_NOP 0, implicit %0, implicit-def %3:sreg_32, implicit-def %4:sreg_32
+    $m0 = COPY %0
+    S_SENDMSG 0, implicit $m0, implicit $exec
+
+...
-- 
2.7.4