MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
unsigned Op32) const {
- MachineBasicBlock *MBB = MI.getParent();;
+ MachineBasicBlock *MBB = MI.getParent();
MachineInstrBuilder Inst32 =
BuildMI(*MBB, MI, MI.getDebugLoc(), get(Op32))
.setMIFlags(MI.getFlags());
// Add the dst operand if the 32-bit encoding also has an explicit $vdst.
// For VOPC instructions, this is replaced by an implicit def of vcc.
- int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::vdst);
- if (Op32DstIdx != -1) {
+ if (AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::vdst) != -1) {
// dst
Inst32.add(MI.getOperand(0));
- } else {
+ } else if (AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::sdst) != -1) {
+ // VOPCX instructions won't be writing to an explicit dst, so this should
+ // not fail for these instructions.
assert(((MI.getOperand(0).getReg() == AMDGPU::VCC) ||
(MI.getOperand(0).getReg() == AMDGPU::VCC_LO)) &&
"Unexpected case");
LLVM_READONLY
int getMFMAEarlyClobberOp(uint16_t Opcode);
+ /// \returns v_cmpx version of a v_cmp instruction, or -1 if there is none.
+ LLVM_READONLY
+ int getVCMPXOpFromVCMP(uint16_t Opcode);
+
const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL;
const uint64_t RSRC_ELEMENT_SIZE_SHIFT = (32 + 19);
const uint64_t RSRC_INDEX_STRIDE_SHIFT = (32 + 21);
let ValueCols = [["0"]];
}
+// Maps a v_cmp instruction to its v_cmpx equivalent.
+def getVCMPXOpFromVCMP : InstrMapping {
+ let FilterClass = "VCMPVCMPXTable";
+ let RowFields = ["VCMPOp"];
+ let ColFields = ["IsVCMPX"];
+ let KeyCol = ["0"];
+ let ValueCols = [["1"]];
+}
+
include "SIInstructions.td"
include "DSInstructions.td"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/InitializePasses.h"
return false;
}
+// Backwards-iterate from Origin (for at most MaxInstructions instructions, or
+// until the beginning of the BB is reached) and return the first instruction
+// for which Pred evaluates to true. Pred can be an arbitrary condition on the
+// current MachineInstr, for instance a check for a specific target
+// instruction. Returns nullptr prematurely if any of the registers given in
+// NonModifiableRegs is modified by an instruction encountered on the way.
+static MachineInstr *
+findInstrBackwards(MachineInstr &Origin,
+ std::function<bool(MachineInstr *)> Pred,
+ ArrayRef<MCRegister> NonModifiableRegs,
+ const SIRegisterInfo *TRI, unsigned MaxInstructions = 5) {
+ MachineBasicBlock::reverse_iterator A = Origin.getReverseIterator(),
+ E = Origin.getParent()->rend();
+ unsigned CurrentIteration = 0;
+
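+  // Start the search at the instruction preceding Origin; Origin itself is
+  // never examined.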
+ for (++A; CurrentIteration < MaxInstructions && A != E; ++A) {
+ if (Pred(&*A))
+ return &*A;
+
+ for (MCRegister Reg : NonModifiableRegs) {
+ if (A->modifiesRegister(Reg, TRI))
+ return nullptr;
+ }
+
+ ++CurrentIteration;
+ }
+
+ return nullptr;
+}
+
+// Determine if a register Reg is not re-defined and still in use
+// in the range (Stop..BB.end].
+// It does so by calculating liveness backwards from the end of the BB until
+// either Stop or the beginning of the BB is reached.
+// Once liveness is calculated, we can determine if Reg is still in use and not
+// defined in between the instructions.
+static bool isRegisterInUseAfter(MachineInstr &Stop, MCRegister Reg,
+ const SIRegisterInfo *TRI,
+ MachineRegisterInfo &MRI) {
+ LivePhysRegs LR(*TRI);
+ LR.addLiveOuts(*Stop.getParent());
+
+ for (auto A = Stop.getParent()->rbegin();
+ A != Stop.getParent()->rend() && A != Stop; ++A) {
+ LR.stepBackward(*A);
+ }
+
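+  // available() reports false if Reg is reserved or if Reg or any aliasing
+  // register is still live, so partially overlapping uses are caught as well.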
+ return !LR.available(MRI, Reg);
+}
+
+// Tries to find an opportunity to optimize a v_cmp ..., s_and_saveexec
+// sequence by looking at an instance of an s_and_saveexec instruction. Returns
+// a pointer to the matching v_cmp instruction if it is safe to replace the
+// sequence (see the conditions in the function body), or nullptr otherwise.
+// This runs after register allocation, so a number of operand dependencies
+// need to be checked explicitly.
+static MachineInstr *findPossibleVCMPVCMPXOptimization(
+ MachineInstr &SaveExec, MCRegister Exec, const SIRegisterInfo *TRI,
+ const SIInstrInfo *TII, MachineRegisterInfo &MRI) {
+ Register SaveExecDest = SaveExec.getOperand(0).getReg();
+ if (!TRI->isSGPRReg(MRI, SaveExecDest))
+ return nullptr;
+
+ MachineOperand *SaveExecSrc0 =
+ TII->getNamedOperand(SaveExec, AMDGPU::OpName::src0);
+ if (!SaveExecSrc0->isReg())
+ return nullptr;
+
+  // Try to find the last v_cmp instruction that defs the saveexec input
+  // operand, with no write to Exec or to the saveexec input operand in
+  // between.
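+  // A write to Exec in between would change the set of active lanes the
+  // compare executes on once it is moved down to the saveexec; a write to the
+  // saveexec input would mean the v_cmp found is not its actual definition.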
+  MachineInstr *VCmp = findInstrBackwards(
+ SaveExec,
+ [&](MachineInstr *Check) {
+ return AMDGPU::getVCMPXOpFromVCMP(Check->getOpcode()) != -1 &&
+ Check->modifiesRegister(SaveExecSrc0->getReg(), TRI);
+ },
+ {Exec, SaveExecSrc0->getReg()}, TRI);
+
+ if (!VCmp)
+ return nullptr;
+
+ MachineOperand *VCmpDest = TII->getNamedOperand(*VCmp, AMDGPU::OpName::sdst);
+ assert(VCmpDest && "Should have an sdst operand!");
+
+ // Check if any of the v_cmp source operands is written by the saveexec.
+ MachineOperand *Src0 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src0);
+ if (Src0->isReg() && TRI->isSGPRReg(MRI, Src0->getReg()) &&
+ SaveExec.modifiesRegister(Src0->getReg(), TRI))
+ return nullptr;
+
+ MachineOperand *Src1 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src1);
+ if (Src1->isReg() && TRI->isSGPRReg(MRI, Src1->getReg()) &&
+ SaveExec.modifiesRegister(Src1->getReg(), TRI))
+ return nullptr;
+
+  // Don't do the transformation if the destination operand is included in
+  // its MBB's live-outs, meaning it's used in any of its successors; that
+  // would lead to incorrect code if the v_cmp, and therefore the def of
+  // the dest operand, were removed.
+ if (isLiveOut(*VCmp->getParent(), VCmpDest->getReg()))
+ return nullptr;
+
+ // If the v_cmp target is in use after the s_and_saveexec, skip the
+ // optimization.
+ if (isRegisterInUseAfter(SaveExec, VCmpDest->getReg(), TRI, MRI))
+ return nullptr;
+
+  // Try to determine if there is a write to any of the VCmp
+  // source operands between the v_cmp and the saveexec.
+  // If so, additional VGPR spilling might have to be inserted, in which case
+  // it's not worth replacing the instruction sequence.
+ SmallVector<MCRegister, 2> NonDefRegs;
+ if (Src0->isReg())
+ NonDefRegs.push_back(Src0->getReg());
+
+ if (Src1->isReg())
+ NonDefRegs.push_back(Src1->getReg());
+
+ if (!findInstrBackwards(
+ SaveExec, [&](MachineInstr *Check) { return Check == VCmp; },
+ NonDefRegs, TRI))
+ return nullptr;
+
+ return VCmp;
+}
+
+// Inserts the optimized s_mov_b32 / v_cmpx sequence based on the
+// operands extracted from a v_cmp ..., s_and_saveexec pattern.
+static bool optimizeVCMPSaveExecSequence(MachineInstr &SaveExecInstr,
+ MachineInstr &VCmp, MCRegister Exec,
+ const SIInstrInfo *TII,
+ const SIRegisterInfo *TRI,
+ MachineRegisterInfo &MRI) {
+ const int NewOpcode = AMDGPU::getVCMPXOpFromVCMP(VCmp.getOpcode());
+
+ if (NewOpcode == -1)
+ return false;
+
+ MachineOperand *Src0 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src0);
+ MachineOperand *Src1 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src1);
+
+ Register MoveDest = SaveExecInstr.getOperand(0).getReg();
+
+ MachineBasicBlock::instr_iterator InsertPosIt = SaveExecInstr.getIterator();
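+  // Keep a copy of the original exec mask in the saveexec destination via
+  // s_mov; the v_cmpx emitted below then updates Exec directly.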
+ if (!SaveExecInstr.uses().empty()) {
+    bool IsSGPR32 = TRI->getRegSizeInBits(MoveDest, MRI) == 32;
+    unsigned MovOpcode = IsSGPR32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+ BuildMI(*SaveExecInstr.getParent(), InsertPosIt,
+ SaveExecInstr.getDebugLoc(), TII->get(MovOpcode), MoveDest)
+ .addReg(Exec);
+ }
+
+ // Omit dst as V_CMPX is implicitly writing to EXEC.
+ // Add dummy src and clamp modifiers, if needed.
+ auto Builder = BuildMI(*VCmp.getParent(), std::next(InsertPosIt),
+ VCmp.getDebugLoc(), TII->get(NewOpcode));
+
+ if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src0_modifiers) !=
+ -1)
+ Builder.addImm(0);
+
+ Builder.add(*Src0);
+
+ if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src1_modifiers) !=
+ -1)
+ Builder.addImm(0);
+
+ Builder.add(*Src1);
+
+ if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::clamp) != -1)
+ Builder.addImm(0);
+
+ return true;
+}
+
bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
const SIInstrInfo *TII = ST.getInstrInfo();
+ MachineRegisterInfo *MRI = &MF.getRegInfo();
MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
// Optimize sequences emitted for control flow lowering. They are originally
Changed = true;
}
+ // After all s_op_saveexec instructions are inserted,
+ // replace (on GFX10.3 and later)
+ // v_cmp_* SGPR, IMM, VGPR
+ // s_and_saveexec_b32 EXEC_SGPR_DEST, SGPR
+ // with
+ // s_mov_b32 EXEC_SGPR_DEST, exec_lo
+ // v_cmpx_* IMM, VGPR
+ // to reduce pipeline stalls.
+ if (ST.hasGFX10_3Insts()) {
+ DenseMap<MachineInstr *, MachineInstr *> SaveExecVCmpMapping;
+ const unsigned AndSaveExecOpcode =
+ ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
+
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+ // Record relevant v_cmp / s_and_saveexec instruction pairs for
+ // replacement.
+ if (MI.getOpcode() != AndSaveExecOpcode)
+ continue;
+
+ if (MachineInstr *VCmp =
+ findPossibleVCMPVCMPXOptimization(MI, Exec, TRI, TII, *MRI))
+ SaveExecVCmpMapping[&MI] = VCmp;
+ }
+ }
+
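+    // Rewrite the recorded pairs in a second sweep so that erasing
+    // instructions does not invalidate the iteration above.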
+ for (const auto &Entry : SaveExecVCmpMapping) {
+ MachineInstr *SaveExecInstr = Entry.getFirst();
+ MachineInstr *VCmpInstr = Entry.getSecond();
+
+ if (optimizeVCMPSaveExecSequence(*SaveExecInstr, *VCmpInstr, Exec, TII,
+ TRI, *MRI)) {
+ SaveExecInstr->eraseFromParent();
+ VCmpInstr->eraseFromParent();
+
+ Changed = true;
+ }
+ }
+ }
+
return Changed;
}
int Op32 = AMDGPU::getVOPe32(MI.getOpcode());
if (TII->isVOPC(Op32)) {
- Register DstReg = MI.getOperand(0).getReg();
- if (DstReg.isVirtual()) {
- // VOPC instructions can only write to the VCC register. We can't
- // force them to use VCC here, because this is only one register and
- // cannot deal with sequences which would require multiple copies of
- // VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...)
- //
- // So, instead of forcing the instruction to write to VCC, we provide
- // a hint to the register allocator to use VCC and then we will run
- // this pass again after RA and shrink it if it outputs to VCC.
- MRI.setRegAllocationHint(MI.getOperand(0).getReg(), 0, VCCReg);
- continue;
+ MachineOperand &Op0 = MI.getOperand(0);
+ if (Op0.isReg()) {
+ // Exclude VOPCX instructions as these don't explicitly write a
+ // dst.
+ Register DstReg = Op0.getReg();
+ if (DstReg.isVirtual()) {
+ // VOPC instructions can only write to the VCC register. We can't
+ // force them to use VCC here, because this is only one register and
+ // cannot deal with sequences which would require multiple copies of
+ // VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...)
+ //
+ // So, instead of forcing the instruction to write to VCC, we
+ // provide a hint to the register allocator to use VCC and then we
+ // will run this pass again after RA and shrink it if it outputs to
+ // VCC.
+ MRI.setRegAllocationHint(DstReg, 0, VCCReg);
+ continue;
+ }
+ if (DstReg != VCCReg)
+ continue;
}
- if (DstReg != VCCReg)
- continue;
}
if (Op32 == AMDGPU::V_CNDMASK_B32_e32) {
string NoSDstOp = Name;
}
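+// Relation table entry used by the getVCMPXOpFromVCMP instruction mapping:
+// VCMPOp holds the name of the corresponding v_cmp opcode, while IsVCMPX is
+// set on the v_cmpx (nosdst) variants.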
+class VCMPVCMPXTable <string Name> {
+ bit IsVCMPX = 0;
+ string VCMPOp = Name;
+}
+
multiclass VOPC_Pseudos <string opName,
VOPC_Profile P,
SDPatternOperator cond = COND_NULL,
def _e32 : VOPC_Pseudo <opName, P>,
Commutable_REV<revOp#"_e32", !eq(revOp, opName)>,
- VCMPXNoSDstTable<1, opName#"_e32"> {
+ VCMPXNoSDstTable<1, opName#"_e32">,
+ VCMPVCMPXTable<opName#"_e32"> {
let Defs = !if(DefExec, [VCC, EXEC], [VCC]);
let SchedRW = P.Schedule;
let isConvergent = DefExec;
def _e64 : VOP3_Pseudo<opName, P, getVOPCPat64<cond, P>.ret>,
Commutable_REV<revOp#"_e64", !eq(revOp, opName)>,
- VCMPXNoSDstTable<1, opName#"_e64"> {
+ VCMPXNoSDstTable<1, opName#"_e64">,
+ VCMPVCMPXTable<opName#"_e64"> {
let Defs = !if(DefExec, [EXEC], []);
let SchedRW = P.Schedule;
let isCompare = 1;
def _nosdst_e32 : VOPC_Pseudo <opName#"_nosdst", P_NoSDst, [], 0>,
Commutable_REV<revOp#"_nosdst_e32", !eq(revOp, opName)>,
- VCMPXNoSDstTable<0, opName#"_e32"> {
+ VCMPXNoSDstTable<0, opName#"_e32">,
+ VCMPVCMPXTable<!subst("v_cmpx", "v_cmp", opName#"_e32")> {
let Defs = [EXEC];
let SchedRW = P_NoSDst.Schedule;
let isConvergent = 1;
let isCompare = 1;
let isCommutable = 1;
let SubtargetPredicate = HasNoSdstCMPX;
+ let IsVCMPX = 1;
}
def _nosdst_e64 : VOP3_Pseudo<opName#"_nosdst", P_NoSDst>,
Commutable_REV<revOp#"_nosdst_e64", !eq(revOp, opName)>,
- VCMPXNoSDstTable<0, opName#"_e64"> {
+ VCMPXNoSDstTable<0, opName#"_e64">,
+ VCMPVCMPXTable<!subst("v_cmpx", "v_cmp", opName#"_e64")> {
let Defs = [EXEC];
let SchedRW = P_NoSDst.Schedule;
let isCompare = 1;
let isCommutable = 1;
let SubtargetPredicate = HasNoSdstCMPX;
+ let IsVCMPX = 1;
}
foreach _ = BoolToList<P_NoSDst.HasExtSDWA>.ret in
; GCN-LABEL: long_forward_scc_branch_3f_offset_bug:
; GFX1030: s_cmp_lg_u32
-; GFX1030-NEXT: s_cbranch_scc1 [[ENDBB:.LBB[0-9]+_[0-9]+]]
+; GFX1030: s_cbranch_scc1 [[ENDBB:.LBB[0-9]+_[0-9]+]]
; GFX1010: s_cmp_lg_u32
; GFX1010-NEXT: s_cbranch_scc0 [[RELAX_BB:.LBB[0-9]+_[0-9]+]]
}
; GCN-LABEL: {{^}}long_forward_exec_branch_3f_offset_bug:
-; GFX1030: v_cmp_eq_u32
-; GFX1030: s_and_saveexec_b32
-; GFX1030-NEXT: s_cbranch_execnz [[RELAX_BB:.LBB[0-9]+_[0-9]+]]
+; GFX1030: s_mov_b32
+; GFX1030: v_cmpx_eq_u32
+; GFX1030: s_cbranch_execnz [[RELAX_BB:.LBB[0-9]+_[0-9]+]]
; GFX1010: v_cmp_eq_u32
; GFX1010: s_and_saveexec_b32
--- /dev/null
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1010 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1030 %s
+
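+; Check that v_cmp ..., s_and_saveexec sequences are replaced by
+; s_mov + v_cmpx on GFX10.3, while GFX10.1 keeps the original sequence.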
+; GCN-LABEL: {{^}}test_insert_vcmpx_pattern_lt:
+; GFX1010: v_cmp_lt_i32_e32 vcc_lo, 15, v{{.*}}
+; GFX1010-NEXT: s_and_saveexec_b32 s{{.*}}, vcc_lo
+; GFX1030: s_mov_b32 s{{.*}}, exec_lo
+; GFX1030-NEXT: v_cmpx_lt_i32_e32 15, v{{.*}}
+define i32 @test_insert_vcmpx_pattern_lt(i32 %x) {
+entry:
+ %bc = icmp slt i32 %x, 16
+ br i1 %bc, label %endif, label %if
+
+if:
+ %ret = shl i32 %x, 2
+ ret i32 %ret
+
+endif:
+ ret i32 %x
+}
+
+; GCN-LABEL: {{^}}test_insert_vcmpx_pattern_gt:
+; GFX1010: v_cmp_gt_i32_e32 vcc_lo, 17, v{{.*}}
+; GFX1010-NEXT: s_and_saveexec_b32 s{{.*}}, vcc_lo
+; GFX1030: s_mov_b32 s{{.*}}, exec_lo
+; GFX1030-NEXT: v_cmpx_gt_i32_e32 17, v{{.*}}
+define i32 @test_insert_vcmpx_pattern_gt(i32 %x) {
+entry:
+ %bc = icmp sgt i32 %x, 16
+ br i1 %bc, label %endif, label %if
+
+if:
+ %ret = shl i32 %x, 2
+ ret i32 %ret
+
+endif:
+ ret i32 %x
+}
+
+; GCN-LABEL: {{^}}test_insert_vcmpx_pattern_eq:
+; GFX1010: v_cmp_ne_u32_e32 vcc_lo, 16, v{{.*}}
+; GFX1010-NEXT: s_and_saveexec_b32 s{{.*}}, vcc_lo
+; GFX1030: s_mov_b32 s{{.*}}, exec_lo
+; GFX1030-NEXT: v_cmpx_ne_u32_e32 16, v{{.*}}
+define i32 @test_insert_vcmpx_pattern_eq(i32 %x) {
+entry:
+ %bc = icmp eq i32 %x, 16
+ br i1 %bc, label %endif, label %if
+
+if:
+ %ret = shl i32 %x, 2
+ ret i32 %ret
+
+endif:
+ ret i32 %x
+}
+
+; GCN-LABEL: {{^}}test_insert_vcmpx_pattern_ne:
+; GFX1010: v_cmp_eq_u32_e32 vcc_lo, 16, v{{.*}}
+; GFX1010-NEXT: s_and_saveexec_b32 s{{.*}}, vcc_lo
+; GFX1030: s_mov_b32 s{{.*}}, exec_lo
+; GFX1030-NEXT: v_cmpx_eq_u32_e32 16, v{{.*}}
+define i32 @test_insert_vcmpx_pattern_ne(i32 %x) {
+entry:
+ %bc = icmp ne i32 %x, 16
+ br i1 %bc, label %endif, label %if
+
+if:
+ %ret = shl i32 %x, 2
+ ret i32 %ret
+
+endif:
+ ret i32 %x
+}
+
+; GCN-LABEL: {{^}}test_insert_vcmpx_pattern_le:
+; GFX1010: v_cmp_lt_i32_e32 vcc_lo, 16, v{{.*}}
+; GFX1010-NEXT: s_and_saveexec_b32 s{{.*}}, vcc_lo
+; GFX1030: s_mov_b32 s{{.*}}, exec_lo
+; GFX1030-NEXT: v_cmpx_lt_i32_e32 16, v{{.*}}
+define i32 @test_insert_vcmpx_pattern_le(i32 %x) {
+entry:
+ %bc = icmp sle i32 %x, 16
+ br i1 %bc, label %endif, label %if
+
+if:
+ %ret = shl i32 %x, 2
+ ret i32 %ret
+
+endif:
+ ret i32 %x
+}
+
+; GCN-LABEL: {{^}}test_insert_vcmpx_pattern_ge:
+; GFX1010: v_cmp_gt_i32_e32 vcc_lo, 16, v{{.*}}
+; GFX1010-NEXT: s_and_saveexec_b32 s{{.*}}, vcc_lo
+; GFX1030: s_mov_b32 s{{.*}}, exec_lo
+; GFX1030-NEXT: v_cmpx_gt_i32_e32 16, v{{.*}}
+define i32 @test_insert_vcmpx_pattern_ge(i32 %x) {
+entry:
+ %bc = icmp sge i32 %x, 16
+ br i1 %bc, label %endif, label %if
+
+if:
+ %ret = shl i32 %x, 2
+ ret i32 %ret
+
+endif:
+ ret i32 %x
+}
+
+declare amdgpu_gfx void @check_live_outs_helper(i64) #0
+
+; In cases where the output operand cannot be safely removed,
+; don't apply the v_cmpx transformation.
+
+; GCN-LABEL: {{^}}check_live_outs:
+; GFX1010: v_cmp_eq_u32_e64 s{{.*}}, v{{.*}}, v{{.*}}
+; GFX1010: s_and_saveexec_b32 s{{.*}}, s{{.*}}
+; GFX1030: v_cmp_eq_u32_e64 s{{.*}}, v{{.*}}, v{{.*}}
+; GFX1030: s_and_saveexec_b32 s{{.*}}, s{{.*}}
+define amdgpu_cs void @check_live_outs(i32 %a, i32 %b) {
+ %cond = icmp eq i32 %a, %b
+ %result = call i64 @llvm.amdgcn.icmp.i32(i32 %a, i32 %b, i32 32)
+ br i1 %cond, label %l1, label %l2
+l1:
+ call amdgpu_gfx void @check_live_outs_helper(i64 %result)
+ br label %l2
+l2:
+ ret void
+}
+
+; Omit the transformation if the s_and_saveexec instruction overwrites
+; any of the v_cmp source operands.
+
+; GCN-LABEL: check_saveexec_overwrites_vcmp_source:
+; GCN: ; %bb.1: ; %then
+; GFX1010: v_cmp_ge_i32_e32 vcc_lo, s[[A:[0-9]+]], v{{.*}}
+; GFX1010-NEXT: v_mov_b32_e32 {{.*}}, s[[A]]
+; GFX1010-NEXT: s_and_saveexec_b32 s[[A]], vcc_lo
+; GFX1030: v_cmp_ge_i32_e32 vcc_lo, s[[A:[0-9]+]], v{{.*}}
+; GFX1030-NEXT: v_mov_b32_e32 {{.*}}, s[[A]]
+; GFX1030-NEXT: s_and_saveexec_b32 s[[A]], vcc_lo
+define i32 @check_saveexec_overwrites_vcmp_source(i32 inreg %a, i32 inreg %b) {
+entry:
+ %0 = icmp sge i32 %a, 0
+ br i1 %0, label %if, label %then
+
+if:
+ %1 = shl i32 %a, 2
+ %2 = or i32 %1, %b
+ ret i32 %2
+
+then:
+ %3 = call i64 @llvm.amdgcn.icmp.i32(i32 %a, i32 %b, i32 32)
+ %4 = trunc i64 %3 to i32
+ %5 = icmp slt i32 %4, %b
+ br i1 %5, label %after, label %end
+
+after:
+ ret i32 %4
+
+end:
+ ret i32 %a
+}
+
+declare i64 @llvm.amdgcn.icmp.i32(i32, i32, i32) #0
--- /dev/null
+# RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -run-pass=si-optimize-exec-masking -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
+# RUN: llc -march=amdgcn -mcpu=gfx1030 -mattr=-wavefrontsize32,+wavefrontsize64 -run-pass=si-optimize-exec-masking -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
+
+---
+
+# After the optimize exec masking (post-RA) pass, there's a chance of v_cmpx instructions
+# being introduced whenever there's a sequence of v_cmp and s_and_saveexec instructions
+# which can be safely replaced.
+# However, it is not safe to do so when the generated code sequence would drop part of the EXEC mask,
+# which can occur when a subregister of the register holding the saved EXEC mask is used as an
+# input operand of the v_cmp instruction.
+# The idea behind this test is to check that such subregisters are handled correctly.
+
+# GCN-LABEL: name: vcmp_saveexec_to_mov_vcmpx_exec_subreg
+# GCN: V_CMP_GT_U32_e64
+# GCN: S_AND_SAVEEXEC_B64
+name: vcmp_saveexec_to_mov_vcmpx_exec_subreg
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $sgpr2
+ renamable $sgpr0_sgpr1 = V_CMP_GT_U32_e64 $sgpr2, killed $vgpr0, implicit $exec
+ $sgpr2_sgpr3 = COPY $exec, implicit-def $exec
+ $sgpr2_sgpr3 = S_AND_B64 killed renamable $sgpr2_sgpr3, killed renamable $sgpr0_sgpr1, implicit-def dead $scc
+ $exec = S_MOV_B64_term killed renamable $sgpr2_sgpr3
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
-; GFX10-W32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
-; GFX10-W32-NEXT: s_and_saveexec_b32 s13, vcc_lo
+; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo
+; GFX10-W32-NEXT: v_cmpx_ne_u32_e32 0, v1
; GFX10-W32-NEXT: s_xor_b32 s13, exec_lo, s13
; GFX10-W32-NEXT: s_cbranch_execz .LBB23_2
; GFX10-W32-NEXT: ; %bb.1: ; %ELSE
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
-; GFX10-W32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
-; GFX10-W32-NEXT: s_and_saveexec_b32 s13, vcc_lo
+; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo
+; GFX10-W32-NEXT: v_cmpx_ne_u32_e32 0, v1
; GFX10-W32-NEXT: s_xor_b32 s13, exec_lo, s13
; GFX10-W32-NEXT: s_cbranch_execz .LBB24_2
; GFX10-W32-NEXT: ; %bb.1: ; %IF
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: image_sample v1, v1, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10-W32-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 0, v1
; GFX10-W32-NEXT: buffer_store_dword v1, v0, s[0:3], 0 idxen
+; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
; GFX10-W32-NEXT: ; implicit-def: $vgpr0
-; GFX10-W32-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX10-W32-NEXT: v_cmpx_nlt_f32_e32 0, v1
; GFX10-W32-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX10-W32-NEXT: ; %bb.1: ; %ELSE
; GFX10-W32-NEXT: v_mul_f32_e32 v0, 4.0, v1
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
-; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX10-W32-NEXT: s_and_saveexec_b32 s13, vcc_lo
+; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo
+; GFX10-W32-NEXT: v_cmpx_eq_u32_e32 0, v1
; GFX10-W32-NEXT: s_cbranch_execz .LBB27_2
; GFX10-W32-NEXT: ; %bb.1: ; %IF
; GFX10-W32-NEXT: s_and_saveexec_b32 s14, s12
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: v_mov_b32_e32 v2, v0
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s13
-; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-W32-NEXT: s_and_saveexec_b32 s13, vcc_lo
+; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo
+; GFX10-W32-NEXT: v_cmpx_eq_u32_e32 0, v1
; GFX10-W32-NEXT: s_cbranch_execz .LBB46_2
; GFX10-W32-NEXT: ; %bb.1: ; %IF
; GFX10-W32-NEXT: s_mov_b32 s14, exec_lo