AMDGPU: Fix visiting physreg dest users when folding immediate copies
author Matt Arsenault <Matthew.Arsenault@amd.com>
Sun, 9 Aug 2020 00:28:48 +0000 (20:28 -0400)
committer Matt Arsenault <Matthew.Arsenault@amd.com>
Mon, 10 Aug 2020 17:46:51 +0000 (13:46 -0400)
foldOperand can fold the immediate into a copy's physical destination
register, but it should not then look for further users of that register:
the use list of a physical register covers every use in the function,
including uses reached by other defs. Fixes a regression introduced by
766cb615a3b96025192707f4670cdf171da84034.
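
For context, a minimal sketch of the shape of the fix, simplified from
SIFoldOperands::foldOperand in the hunk below (not a compilable excerpt;
collectCopyUsesAndFold is a hypothetical stand-in for the use-walking
loop). Both the walk over the copy's users and the AGPR inline-constant
rewrite are now gated on the destination being a virtual register:

    const TargetRegisterClass *DestRC = TRI->getRegClassForReg(*MRI, DestReg);
    if (!DestReg.isPhysical()) {
      // Virtual registers are in SSA form at this point, so their use
      // list is exactly the set of reads of this one def and is safe to
      // walk. For a physical register, MRI->use_begin(DestReg) would
      // enumerate every use of that physreg in the function, including
      // uses reached by a later redefinition.
      if (TRI->isSGPRClass(SrcRC) && TRI->hasVectorRegisters(DestRC))
        collectCopyUsesAndFold(DestReg); // hypothetical stand-in
      // The AGPR_32 inline-constant rewrite below is likewise skipped
      // for physical destinations.
    }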

llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
llvm/test/CodeGen/AMDGPU/fold-imm-copy.mir
llvm/test/CodeGen/AMDGPU/visit-physreg-vgpr-imm-folding-bug.ll [new file with mode: 0644]

diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 67b91e1..3f1e980 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -666,32 +666,34 @@ void SIFoldOperands::foldOperand(
       return;
 
     const TargetRegisterClass *DestRC = TRI->getRegClassForReg(*MRI, DestReg);
-    if (TRI->isSGPRClass(SrcRC) && TRI->hasVectorRegisters(DestRC)) {
-      MachineRegisterInfo::use_iterator NextUse;
-      SmallVector<FoldCandidate, 4> CopyUses;
-      for (MachineRegisterInfo::use_iterator Use = MRI->use_begin(DestReg),
-                                             E = MRI->use_end();
-           Use != E; Use = NextUse) {
-        NextUse = std::next(Use);
-        // There's no point trying to fold into an implicit operand.
-        if (Use->isImplicit())
-          continue;
-
-        FoldCandidate FC = FoldCandidate(Use->getParent(), Use.getOperandNo(),
-                                         &UseMI->getOperand(1));
-        CopyUses.push_back(FC);
-      }
-      for (auto &F : CopyUses) {
-        foldOperand(*F.OpToFold, F.UseMI, F.UseOpNo, FoldList, CopiesToReplace);
+    if (!DestReg.isPhysical()) {
+      if (TRI->isSGPRClass(SrcRC) && TRI->hasVectorRegisters(DestRC)) {
+        MachineRegisterInfo::use_iterator NextUse;
+        SmallVector<FoldCandidate, 4> CopyUses;
+        for (MachineRegisterInfo::use_iterator Use = MRI->use_begin(DestReg),
+               E = MRI->use_end();
+             Use != E; Use = NextUse) {
+          NextUse = std::next(Use);
+          // There's no point trying to fold into an implicit operand.
+          if (Use->isImplicit())
+            continue;
+
+          FoldCandidate FC = FoldCandidate(Use->getParent(), Use.getOperandNo(),
+                                           &UseMI->getOperand(1));
+          CopyUses.push_back(FC);
+        }
+        for (auto &F : CopyUses) {
+          foldOperand(*F.OpToFold, F.UseMI, F.UseOpNo, FoldList, CopiesToReplace);
+        }
       }
-    }
 
-    if (DestRC == &AMDGPU::AGPR_32RegClass &&
-        TII->isInlineConstant(OpToFold, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
-      UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32));
-      UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
-      CopiesToReplace.push_back(UseMI);
-      return;
+      if (DestRC == &AMDGPU::AGPR_32RegClass &&
+          TII->isInlineConstant(OpToFold, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
+        UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32));
+        UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
+        CopiesToReplace.push_back(UseMI);
+        return;
+      }
     }
 
     // In order to fold immediates into copies, we need to change the
diff --git a/llvm/test/CodeGen/AMDGPU/fold-imm-copy.mir b/llvm/test/CodeGen/AMDGPU/fold-imm-copy.mir
index 9164e5e..e921248 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-imm-copy.mir
+++ b/llvm/test/CodeGen/AMDGPU/fold-imm-copy.mir
@@ -87,3 +87,26 @@ body:             |
     S_ENDPGM 0, implicit $vgpr0
 
 ...
+
+# The users of $vgpr1 should not be visited for further immediate
+# folding.
+
+# GCN-LABEL: name: no_fold_physreg_users_vgpr{{$}}
+# GCN: $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+# GCN-NEXT: S_NOP 0, implicit-def $vgpr1
+# GCN-NEXT: %2:vgpr_32 = COPY $vgpr1
+# GCN-NEXT: $vgpr2 = COPY %2
+---
+name: no_fold_physreg_users_vgpr
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    %0:sreg_32 = S_MOV_B32 0
+    %1:vgpr_32 = COPY %0
+    $vgpr1 = COPY %0
+    S_NOP 0, implicit-def $vgpr1
+    %2:vgpr_32 = COPY $vgpr1
+    $vgpr2 = COPY %2
+    S_ENDPGM 0
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/visit-physreg-vgpr-imm-folding-bug.ll b/llvm/test/CodeGen/AMDGPU/visit-physreg-vgpr-imm-folding-bug.ll
new file mode 100644
index 0000000..6995cf6
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/visit-physreg-vgpr-imm-folding-bug.ll
@@ -0,0 +1,22 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s
+
+; Make sure the return value of the first call is not overwritten with
+; a constant before the fadd use.
+
+; CHECK-LABEL: vgpr_multi_use_imm_fold:
+; CHECK: v_mov_b32_e32 v0, 0{{$}}
+; CHECK: v_mov_b32_e32 v1, 2.0{{$}}
+; CHECK:    s_swappc_b64
+; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], 0
+; CHECK:    s_swappc_b64
+define amdgpu_kernel void @vgpr_multi_use_imm_fold() {
+entry:
+  store double 0.0, double addrspace(1)* undef, align 8
+  %call0 = tail call fastcc double @__ocml_log_f64(double 2.0)
+  %op = fadd double %call0, 0.0
+  %call1 = tail call fastcc double @__ocml_sqrt_f64(double %op)
+  ret void
+}
+
+declare hidden fastcc double @__ocml_log_f64(double)
+declare hidden fastcc double @__ocml_sqrt_f64(double)
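
Note on the tests: the new MIR case in fold-imm-copy.mir is the distilled
failure. Before this change, after folding 0 into '$vgpr1 = COPY %0', the
pass would walk MRI's use list for $vgpr1 and also rewrite
'%2:vgpr_32 = COPY $vgpr1', even though the S_NOP redefines $vgpr1 in
between; the GCN CHECK lines require that post-clobber COPY to survive.
The file's RUN line is outside the hunk above, but MIR tests for this
pass are typically driven with something like

    llc -march=amdgcn -run-pass=si-fold-operands -verify-machineinstrs -o - llvm/test/CodeGen/AMDGPU/fold-imm-copy.mir

(the exact flags are an assumption; consult the RUN line in the file).
The .ll test covers the same bug end-to-end: without the guard, v[0:1]
holding the return value of __ocml_log_f64 could be overwritten with a
constant before the fadd use.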