[AMDGPU] Fix DPP combiner check for exec modification

author Jay Foad <jay.foad@gmail.com>

Fri, 12 Jul 2019 15:59:40 +0000 (15:59 +0000)

committer Jay Foad <jay.foad@gmail.com>

Fri, 12 Jul 2019 15:59:40 +0000 (15:59 +0000)
author Jay Foad <jay.foad@gmail.com>
Fri, 12 Jul 2019 15:59:40 +0000 (15:59 +0000)
committer Jay Foad <jay.foad@gmail.com>
Fri, 12 Jul 2019 15:59:40 +0000 (15:59 +0000)
diff --git a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp

index 7348b5b..e1845e2 100644 (file)
--- a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
@@ -344,7 +344,7 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
    auto *DstOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst);
    assert(DstOpnd && DstOpnd->isReg());
    auto DPPMovReg = DstOpnd->getReg();
-  if (execMayBeModifiedBeforeUse(*MRI, DPPMovReg, MovMI)) {
+  if (execMayBeModifiedBeforeAnyUse(*MRI, DPPMovReg, MovMI)) {
      LLVM_DEBUG(dbgs() << "  failed: EXEC mask should remain the same"
                           " for all uses\n");
      return false;
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp

index bcc3478..74d77d3 100644 (file)
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -650,7 +650,7 @@ void SIFoldOperands::foldOperand(
          if (execMayBeModifiedBeforeUse(*MRI,
                                         UseMI->getOperand(UseOpIdx).getReg(),
                                         *OpToFold.getParent(),
-                                       UseMI))
+                                       *UseMI))
            return;
  
          UseMI->setDesc(TII->get(AMDGPU::S_MOV_B32));
@@ -669,7 +669,7 @@ void SIFoldOperands::foldOperand(
          if (execMayBeModifiedBeforeUse(*MRI,
                                         UseMI->getOperand(UseOpIdx).getReg(),
                                         *OpToFold.getParent(),
-                                       UseMI))
+                                       *UseMI))
            return;
  
          // %vgpr = COPY %sgpr0
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

index d855f3f..3474185 100644 (file)
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -6269,47 +6269,75 @@ MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
  }
  
  bool llvm::execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI,
-                                      unsigned VReg,
+                                      Register VReg,
                                        const MachineInstr &DefMI,
-                                      const MachineInstr *UseMI) {
+                                      const MachineInstr &UseMI) {
    assert(MRI.isSSA() && "Must be run on SSA");
  
    auto *TRI = MRI.getTargetRegisterInfo();
    auto *DefBB = DefMI.getParent();
  
-  if (UseMI) {
+  // Don't bother searching between blocks, although it is possible this block
+  // doesn't modify exec.
+  if (UseMI.getParent() != DefBB)
+    return true;
+
+  const int MaxInstScan = 20;
+  int NumInst = 0;
+
+  // Stop scan at the use.
+  auto E = UseMI.getIterator();
+  for (auto I = std::next(DefMI.getIterator()); I != E; ++I) {
+    if (I->isDebugInstr())
+      continue;
+
+    if (++NumInst > MaxInstScan)
+      return true;
+
+    if (I->modifiesRegister(AMDGPU::EXEC, TRI))
+      return true;
+  }
+
+  return false;
+}
+
+bool llvm::execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI,
+                                         Register VReg,
+                                         const MachineInstr &DefMI) {
+  assert(MRI.isSSA() && "Must be run on SSA");
+
+  auto *TRI = MRI.getTargetRegisterInfo();
+  auto *DefBB = DefMI.getParent();
+
+  const int MaxUseInstScan = 10;
+  int NumUseInst = 0;
+
+  for (auto &UseInst : MRI.use_nodbg_instructions(VReg)) {
      // Don't bother searching between blocks, although it is possible this block
      // doesn't modify exec.
-    if (UseMI->getParent() != DefBB)
+    if (UseInst.getParent() != DefBB)
        return true;
-  } else {
-    int NumUse = 0;
-    const int MaxUseScan = 10;
-
-    for (auto &UseInst : MRI.use_nodbg_instructions(VReg)) {
-      if (UseInst.getParent() != DefBB)
-        return true;
  
-      if (NumUse++ > MaxUseScan)
-        return true;
-    }
+    if (++NumUseInst > MaxUseInstScan)
+      return true;
    }
  
    const int MaxInstScan = 20;
-  int NumScan = 0;
+  int NumInst = 0;
  
-  // Stop scan at the use if known.
-  auto E = UseMI ? UseMI->getIterator() : DefBB->end();
-  for (auto I = std::next(DefMI.getIterator()); I != E; ++I) {
+  // Stop scan when we have seen all the uses.
+  for (auto I = std::next(DefMI.getIterator()); ; ++I) {
      if (I->isDebugInstr())
        continue;
  
-    if (NumScan++ > MaxInstScan)
+    if (++NumInst > MaxInstScan)
        return true;
  
+    if (I->readsRegister(VReg))
+      if (--NumUseInst == 0)
+        return false;
+
      if (I->modifiesRegister(AMDGPU::EXEC, TRI))
        return true;
    }
-
-  return false;
  }
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h

index b8275b6..1f3c659 100644 (file)
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1016,13 +1016,19 @@ MachineInstr *getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
                                 MachineRegisterInfo &MRI);
  
  /// \brief Return false if EXEC is not changed between the def of \p VReg at \p
-/// DefMI and uses. If \p UseMI is not specified, this checks all uses of \p
-/// VReg. Should be run on SSA. Currently does not attempt to track between
-/// blocks.
+/// DefMI and the use at \p UseMI. Should be run on SSA. Currently does not
+/// attempt to track between blocks.
  bool execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI,
-                                unsigned VReg,
+                                Register VReg,
                                  const MachineInstr &DefMI,
-                                const MachineInstr *UseMI = nullptr);
+                                const MachineInstr &UseMI);
+
+/// \brief Return false if EXEC is not changed between the def of \p VReg at \p
+/// DefMI and all its uses. Should be run on SSA. Currently does not attempt to
+/// track between blocks.
+bool execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI,
+                                   Register VReg,
+                                   const MachineInstr &DefMI);
  
  namespace AMDGPU {
  
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll

index 969eb3c..6f52a5f 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -1,6 +1,6 @@
  ; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX7LESS %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s
  
  declare i32 @llvm.amdgcn.workitem.id.x()
  
@@ -42,6 +42,8 @@ entry:
  ; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
  ; GFX7LESS-NOT: s_bcnt1_i32_b64
  ; GFX7LESS: buffer_atomic_add v{{[0-9]+}}
+; GFX8MORE: v_add_u32_dpp
+; GFX8MORE: v_add_u32_dpp
  ; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
  ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
  ; GFX8MORE: buffer_atomic_add v[[value]]
@@ -133,6 +135,8 @@ entry:
  ; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
  ; GFX7LESS-NOT: s_bcnt1_i32_b64
  ; GFX7LESS: buffer_atomic_sub v{{[0-9]+}}
+; GFX8MORE: v_sub{{(rev)?}}_u32_dpp
+; GFX8MORE: v_sub{{(rev)?}}_u32_dpp
  ; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
  ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
  ; GFX8MORE: buffer_atomic_sub v[[value]]
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll

index 3ce91e8..f3d50c9 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -45,6 +45,8 @@ entry:
  ; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
  ; GFX7LESS-NOT: s_bcnt1_i32_b64
  ; GFX7LESS: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GFX8MORE: v_add_u32_dpp
+; GFX8MORE: v_add_u32_dpp
  ; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
  ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
  ; GFX8MORE: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
@@ -136,6 +138,8 @@ entry:
  ; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
  ; GFX7LESS-NOT: s_bcnt1_i32_b64
  ; GFX7LESS: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GFX8MORE: v_sub{{(rev)?}}_u32_dpp
+; GFX8MORE: v_sub{{(rev)?}}_u32_dpp
  ; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
  ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
  ; GFX8MORE: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll

index c1ce932..a170f96 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
@@ -39,6 +39,8 @@ else:
  ; GFX8MORE: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
  ; GFX8MORE: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
  ; GFX8MORE: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
+; GFX8MORE: v_add_u32_dpp
+; GFX8MORE: v_add_u32_dpp
  ; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
  ; GFX8MORE: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
  ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll

index c2db554..dd813df 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
@@ -44,6 +44,8 @@ entry:
  ; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
  ; GFX7LESS-NOT: s_bcnt1_i32_b64
  ; GFX7LESS: buffer_atomic_add v{{[0-9]+}}
+; GFX8MORE: v_add_u32_dpp
+; GFX8MORE: v_add_u32_dpp
  ; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
  ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
  ; GFX8MORE: buffer_atomic_add v[[value]]
@@ -104,6 +106,8 @@ entry:
  ; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
  ; GFX7LESS-NOT: s_bcnt1_i32_b64
  ; GFX7LESS: buffer_atomic_sub v{{[0-9]+}}
+; GFX8MORE: v_sub{{(rev)?}}_u32_dpp
+; GFX8MORE: v_sub{{(rev)?}}_u32_dpp
  ; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
  ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
  ; GFX8MORE: buffer_atomic_sub v[[value]]
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll

index eb3f0ab..c32a73e 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
@@ -44,6 +44,8 @@ entry:
  ; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
  ; GFX7LESS-NOT: s_bcnt1_i32_b64
  ; GFX7LESS: buffer_atomic_add v{{[0-9]+}}
+; GFX8MORE: v_add_u32_dpp
+; GFX8MORE: v_add_u32_dpp
  ; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
  ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
  ; GFX8MORE: buffer_atomic_add v[[value]]
@@ -117,6 +119,8 @@ entry:
  ; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
  ; GFX7LESS-NOT: s_bcnt1_i32_b64
  ; GFX7LESS: buffer_atomic_sub v{{[0-9]+}}
+; GFX8MORE: v_sub{{(rev)?}}_u32_dpp
+; GFX8MORE: v_sub{{(rev)?}}_u32_dpp
  ; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
  ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
  ; GFX8MORE: buffer_atomic_sub v[[value]]
diff --git a/llvm/test/CodeGen/AMDGPU/dpp_combine.mir b/llvm/test/CodeGen/AMDGPU/dpp_combine.mir

index d98cde5..c43fb03 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/dpp_combine.mir
+++ b/llvm/test/CodeGen/AMDGPU/dpp_combine.mir
@@ -380,6 +380,38 @@ body: |
      %9:vgpr_32 = V_SUB_I32_e32 5, %7, implicit-def $vcc, implicit $exec
  ...
  
+# tests on sequences of dpp consumers followed by control flow
+# CHECK-LABEL: name: dpp_seq_cf
+# CHECK: %4:vgpr_32 = V_ADD_I32_dpp %1, %0, %1, 1, 14, 15, 0, implicit-def $vcc, implicit $exec
+# CHECK: %5:vgpr_32 = V_SUBREV_I32_dpp %1, %0, %1, 1, 14, 15, 0, implicit-def $vcc, implicit $exec
+# CHECK: %6:vgpr_32 = V_OR_B32_dpp %1, %0, %1, 1, 14, 15, 0, implicit $exec
+
+name: dpp_seq_cf
+tracksRegLiveness: true
+body: |
+  bb.0:
+    successors: %bb.1, %bb.2
+    liveins: $vgpr0, $vgpr1
+    %0:vgpr_32 = COPY $vgpr0
+    %1:vgpr_32 = COPY $vgpr1
+    %2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+
+    %3:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 14, 15, 0, implicit $exec
+    %4:vgpr_32 = V_ADD_I32_e32 %3, %1, implicit-def $vcc, implicit $exec
+    %5:vgpr_32 = V_SUB_I32_e32 %1, %3, implicit-def $vcc, implicit $exec
+    %6:vgpr_32 = V_OR_B32_e32 %3, %1, implicit $exec
+
+    %7:sreg_64 = V_CMP_EQ_U32_e64 %5, %6, implicit $exec
+    %8:sreg_64 = SI_IF %7, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.2
+
+  bb.2:
+    SI_END_CF %8, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+...
+
  # old reg def is in diff BB - cannot combine
  # CHECK-LABEL: name: old_in_diff_bb
  # CHECK: %3:vgpr_32 = V_MOV_B32_dpp %2, %1, 1, 1, 1, 0, implicit $exec
author	Jay Foad <jay.foad@gmail.com>
	Fri, 12 Jul 2019 15:59:40 +0000 (15:59 +0000)
committer	Jay Foad <jay.foad@gmail.com>
	Fri, 12 Jul 2019 15:59:40 +0000 (15:59 +0000)
llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp		patch | blob | history
llvm/lib/Target/AMDGPU/SIFoldOperands.cpp		patch | blob | history
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp		patch | blob | history
llvm/lib/Target/AMDGPU/SIInstrInfo.h		patch | blob | history
llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll		patch | blob | history
llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll		patch | blob | history
llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll		patch | blob | history
llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll		patch | blob | history
llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll		patch | blob | history
llvm/test/CodeGen/AMDGPU/dpp_combine.mir		patch | blob | history