[AMDGPU] Introduce Strict WQM mode

author Piotr Sobczak <Piotr.Sobczak@amd.com>

Wed, 3 Mar 2021 09:18:28 +0000 (10:18 +0100)

committer Piotr Sobczak <Piotr.Sobczak@amd.com>

Wed, 3 Mar 2021 13:19:16 +0000 (14:19 +0100)
author Piotr Sobczak <Piotr.Sobczak@amd.com>
Wed, 3 Mar 2021 09:18:28 +0000 (10:18 +0100)
committer Piotr Sobczak <Piotr.Sobczak@amd.com>
Wed, 3 Mar 2021 13:19:16 +0000 (14:19 +0100)
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td

index 82bb012..bba6972 100644 (file)
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1621,6 +1621,10 @@ def int_amdgcn_wwm : Intrinsic<[llvm_any_ty],
    [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable,
                         IntrConvergent, IntrWillReturn]
  >;
+def int_amdgcn_strict_wqm : Intrinsic<[llvm_any_ty],
+  [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable,
+                       IntrConvergent, IntrWillReturn]
+>;
  
  // Given a value, copies it while setting all the inactive lanes to a given
  // value. Note that OpenGL helper lanes are considered active, so if the
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

index 415c92a..619a891 100644 (file)
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -2645,6 +2645,9 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
    case Intrinsic::amdgcn_strict_wwm:
      Opcode = AMDGPU::STRICT_WWM;
      break;
+  case Intrinsic::amdgcn_strict_wqm:
+    Opcode = AMDGPU::STRICT_WQM;
+    break;
    case Intrinsic::amdgcn_interp_p1_f16:
      SelectInterpP1F16(N);
      return;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

index ffd801d..8e83100 100644 (file)
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -930,6 +930,8 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
    case Intrinsic::amdgcn_strict_wwm:
    case Intrinsic::amdgcn_wwm:
      return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WWM);
+  case Intrinsic::amdgcn_strict_wqm:
+    return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WQM);
    case Intrinsic::amdgcn_writelane:
      return selectWritelane(I);
    case Intrinsic::amdgcn_div_scale:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

index 244d4b3..477f04e 100644 (file)
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -3958,6 +3958,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
      case Intrinsic::amdgcn_mov_dpp:
      case Intrinsic::amdgcn_strict_wwm:
      case Intrinsic::amdgcn_wwm:
+    case Intrinsic::amdgcn_strict_wqm:
      case Intrinsic::amdgcn_wqm:
      case Intrinsic::amdgcn_softwqm:
      case Intrinsic::amdgcn_set_inactive:
diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp

index 93a8935..d5c56bf 100644 (file)
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -581,6 +581,7 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
          continue;
        case AMDGPU::COPY:
        case AMDGPU::WQM:
+      case AMDGPU::STRICT_WQM:
        case AMDGPU::SOFT_WQM:
        case AMDGPU::STRICT_WWM: {
          Register DstReg = MI.getOperand(0).getReg();
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

index f195208..5a7ef04 100644 (file)
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1949,9 +1949,22 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
                                   : AMDGPU::S_OR_SAVEEXEC_B64));
      break;
    }
-  case AMDGPU::EXIT_STRICT_WWM: {
+  case AMDGPU::ENTER_STRICT_WQM: {
      // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
-    // Whole Wave Mode is exited.
+    // STRICT_WQM is entered.
+    const unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+    const unsigned WQMOp = ST.isWave32() ? AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64;
+    const unsigned MovOp = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+    BuildMI(MBB, MI, DL, get(MovOp), MI.getOperand(0).getReg()).addReg(Exec);
+    BuildMI(MBB, MI, DL, get(WQMOp), Exec).addReg(Exec);
+
+    MI.eraseFromParent();
+    break;
+  }
+  case AMDGPU::EXIT_STRICT_WWM:
+  case AMDGPU::EXIT_STRICT_WQM: {
+    // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
+    // WWM/STRICT_WQM is exited.
      MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64));
      break;
    }
@@ -4407,6 +4420,7 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
    case AMDGPU::WQM: return AMDGPU::WQM;
    case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM;
    case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM;
+  case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM;
    case AMDGPU::S_MOV_B32: {
      const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
      return MI.getOperand(1).isReg() ||
@@ -6643,6 +6657,7 @@ void SIInstrInfo::addUsersToMoveToVALUWorklist(
      case AMDGPU::WQM:
      case AMDGPU::SOFT_WQM:
      case AMDGPU::STRICT_WWM:
+    case AMDGPU::STRICT_WQM:
      case AMDGPU::REG_SEQUENCE:
      case AMDGPU::PHI:
      case AMDGPU::INSERT_SUBREG:
@@ -6800,7 +6815,8 @@ const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
    case AMDGPU::INSERT_SUBREG:
    case AMDGPU::WQM:
    case AMDGPU::SOFT_WQM:
-  case AMDGPU::STRICT_WWM: {
+  case AMDGPU::STRICT_WWM:
+  case AMDGPU::STRICT_WQM: {
      const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1);
      if (RI.hasAGPRs(SrcRC)) {
        if (RI.hasAGPRs(NewDstRC))
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td

index 9263a14..20d591c 100644 (file)
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -125,6 +125,7 @@ def SOFT_WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;
  // accidentally clobber inactive channels of $vdst.
  let Constraints = "@earlyclobber $vdst" in {
  def STRICT_WWM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;
+def STRICT_WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;
  }
  
  } // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC]
@@ -143,6 +144,20 @@ def EXIT_STRICT_WWM : SPseudoInstSI <(outs SReg_1:$sdst), (ins SReg_1:$src0)> {
    let mayStore = 0;
  }
  
+def ENTER_STRICT_WQM : SPseudoInstSI <(outs SReg_1:$sdst), (ins i64imm:$src0)> {
+  let Uses = [EXEC];
+  let Defs = [EXEC, SCC];
+  let hasSideEffects = 0;
+  let mayLoad = 0;
+  let mayStore = 0;
+}
+
+def EXIT_STRICT_WQM : SPseudoInstSI <(outs SReg_1:$sdst), (ins SReg_1:$src0)> {
+  let hasSideEffects = 0;
+  let mayLoad = 0;
+  let mayStore = 0;
+}
+
  // Invert the exec mask and overwrite the inactive lanes of dst with inactive,
  // restoring it after we're done.
  let Defs = [SCC] in {
diff --git a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp

index 51f4ea7..07ff8ef 100644 (file)
--- a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
@@ -38,6 +38,9 @@ private:
    RegisterClassInfo RegClassInfo;
  
    std::vector<unsigned> RegsToRewrite;
+#ifndef NDEBUG
+  void printWWMInfo(const MachineInstr &MI);
+#endif
  
  public:
    static char ID;
@@ -154,6 +157,31 @@ void SIPreAllocateWWMRegs::rewriteRegs(MachineFunction &MF) {
    MRI->freezeReservedRegs(MF);
  }
  
+#ifndef NDEBUG
+/// Debug-only helper: prints whether \p MI enters or exits a strict WWM or
+/// strict WQM region, followed by the instruction itself.
+LLVM_DUMP_METHOD void
+SIPreAllocateWWMRegs::printWWMInfo(const MachineInstr &MI) {
+  const unsigned Opcode = MI.getOpcode();
+  const bool IsEnter = Opcode == AMDGPU::ENTER_STRICT_WWM ||
+                       Opcode == AMDGPU::ENTER_STRICT_WQM;
+  const bool IsWWM = Opcode == AMDGPU::ENTER_STRICT_WWM ||
+                     Opcode == AMDGPU::EXIT_STRICT_WWM;
+
+  // A marker that does not enter a region must be one of the exit pseudos.
+  assert(IsEnter || Opcode == AMDGPU::EXIT_STRICT_WWM ||
+         Opcode == AMDGPU::EXIT_STRICT_WQM);
+  // A marker that is not a WWM pseudo must be one of the WQM pseudos.
+  assert(IsWWM || Opcode == AMDGPU::ENTER_STRICT_WQM ||
+         Opcode == AMDGPU::EXIT_STRICT_WQM);
+
+  dbgs() << (IsEnter ? "Entering " : "Exiting ");
+  dbgs() << (IsWWM ? "Strict WWM " : "Strict WQM ");
+  dbgs() << "region: " << MI;
+}
+
+#endif
+
  bool SIPreAllocateWWMRegs::runOnMachineFunction(MachineFunction &MF) {
    LLVM_DEBUG(dbgs() << "SIPreAllocateWWMRegs: function " << MF.getName() << "\n");
  
@@ -185,21 +213,23 @@ bool SIPreAllocateWWMRegs::runOnMachineFunction(MachineFunction &MF) {
            MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B64)
          RegsAssigned |= processDef(MI.getOperand(0));
  
-      if (MI.getOpcode() == AMDGPU::ENTER_STRICT_WWM) {
-        LLVM_DEBUG(dbgs() << "entering WWM region: " << MI << "\n");
+      if (MI.getOpcode() == AMDGPU::ENTER_STRICT_WWM ||
+          MI.getOpcode() == AMDGPU::ENTER_STRICT_WQM) {
+        LLVM_DEBUG(printWWMInfo(MI));
          InWWM = true;
          continue;
        }
  
-      if (MI.getOpcode() == AMDGPU::EXIT_STRICT_WWM) {
-        LLVM_DEBUG(dbgs() << "exiting WWM region: " << MI << "\n");
+      if (MI.getOpcode() == AMDGPU::EXIT_STRICT_WWM ||
+          MI.getOpcode() == AMDGPU::EXIT_STRICT_WQM) {
+        LLVM_DEBUG(printWWMInfo(MI));
          InWWM = false;
        }
  
        if (!InWWM)
          continue;
  
-      LLVM_DEBUG(dbgs() << "processing " << MI << "\n");
+      LLVM_DEBUG(dbgs() << "Processing " << MI);
  
        for (MachineOperand &DefOpnd : MI.defs()) {
          RegsAssigned |= processDef(DefOpnd);
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp

index 2fe7a73..f2047f2 100644 (file)
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -7,8 +7,13 @@
  //===----------------------------------------------------------------------===//
  //
  /// \file
-/// This pass adds instructions to enable whole quad mode for pixel
-/// shaders, and whole wavefront mode for all programs.
+/// This pass adds instructions to enable whole quad mode (strict or non-strict)
+/// for pixel shaders, and strict whole wavefront mode for all programs.
+///
+/// The "strict" prefix indicates that inactive lanes do not take part in
+/// control flow, specifically an inactive lane enabled by a strict WQM/WWM will
+/// always be enabled irrespective of control flow decisions. Conversely in
+/// non-strict WQM inactive lanes may affect control flow decisions.
  ///
  /// Whole quad mode is required for derivative computations, but it interferes
  /// with shader side effects (stores and atomics). It ensures that WQM is
@@ -26,12 +31,21 @@
  ///   ...
  ///   S_MOV_B64 EXEC, Tmp
  ///
-/// We also compute when a sequence of instructions requires Whole Wavefront
-/// Mode (StrictWWM) and insert instructions to save and restore it:
+/// We also compute when a sequence of instructions requires strict whole
+/// wavefront mode (StrictWWM) and insert instructions to save and restore it:
+///
+///   S_OR_SAVEEXEC_B64 Tmp, -1
+///   ...
+///   S_MOV_B64 EXEC, Tmp
+///
+/// When a sequence of instructions requires strict whole quad mode (StrictWQM)
+/// we use a similar save and restore mechanism and force whole quad mode for
+/// those instructions:
  ///
-/// S_OR_SAVEEXEC_B64 Tmp, -1
-/// ...
-/// S_MOV_B64 EXEC, Tmp
+///  S_MOV_B64 Tmp, EXEC
+///  S_WQM_B64 EXEC, EXEC
+///  ...
+///  S_MOV_B64 EXEC, Tmp
  ///
  /// In order to avoid excessive switching during sequences of Exact
  /// instructions, the pass first analyzes which instructions must be run in WQM
@@ -77,7 +91,9 @@ namespace {
  enum {
    StateWQM = 0x1,
    StateStrictWWM = 0x2,
-  StateExact = 0x4,
+  StateStrictWQM = 0x4,
+  StateExact = 0x8,
+  StateStrict = StateStrictWWM | StateStrictWQM,
  };
  
  struct PrintState {
@@ -89,19 +105,23 @@ public:
  
  #ifndef NDEBUG
  static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {
-  if (PS.State & StateWQM)
-    OS << "WQM";
-  if (PS.State & StateStrictWWM) {
-    if (PS.State & StateWQM)
-      OS << '|';
-    OS << "StrictWWM";
-  }
-  if (PS.State & StateExact) {
-    if (PS.State & (StateWQM | StateStrictWWM))
-      OS << '|';
-    OS << "Exact";
-  }
  
+  static const std::pair<char, const char *> Mapping[] = {
+      std::make_pair(StateWQM, "WQM"),
+      std::make_pair(StateStrictWWM, "StrictWWM"),
+      std::make_pair(StateStrictWQM, "StrictWQM"),
+      std::make_pair(StateExact, "Exact")};
+  char State = PS.State;
+  for (auto M : Mapping) {
+    if (State & M.first) {
+      OS << M.second;
+      State &= ~M.first;
+
+      if (State)
+        OS << '|';
+    }
+  }
+  assert(State == 0);
    return OS;
  }
  #endif
@@ -151,7 +171,7 @@ private:
    DenseMap<const MachineInstr *, InstrInfo> Instructions;
    MapVector<MachineBasicBlock *, BlockInfo> Blocks;
  
-  // Tracks state (WQM/StrictWWM/Exact) after a given instruction
+  // Tracks state (WQM/StrictWWM/StrictWQM/Exact) after a given instruction
    DenseMap<const MachineInstr *, char> StateTransition;
  
    SmallVector<MachineInstr *, 2> LiveMaskQueries;
@@ -184,10 +204,11 @@ private:
                 Register SaveWQM);
    void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
               Register SavedWQM);
-  void toStrictWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
-                   Register SaveOrig);
-  void fromStrictWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
-                     Register SavedOrig, char NonStrictWWMState);
+  void toStrictMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
+                    Register SaveOrig, char StrictStateNeeded);
+  void fromStrictMode(MachineBasicBlock &MBB,
+                      MachineBasicBlock::iterator Before, Register SavedOrig,
+                      char NonStrictState, char CurrentStrictState);
  
    MachineBasicBlock *splitBlock(MachineBasicBlock *BB, MachineInstr *TermMI);
  
@@ -473,9 +494,17 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
          GlobalFlags |= StateStrictWWM;
          LowerToMovInstrs.push_back(&MI);
          continue;
+      } else if (Opcode == AMDGPU::STRICT_WQM) {
+        // STRICT_WQM is similar to STRICT_WWM, but instead of enabling all
+        // threads of the wave like STRICT_WWM, STRICT_WQM enables all threads
+        // in quads that have at least one active thread.
+        markInstructionUses(MI, StateStrictWQM, Worklist);
+        GlobalFlags |= StateStrictWQM;
+        LowerToMovInstrs.push_back(&MI);
+        continue;
        } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
                   Opcode == AMDGPU::V_SET_INACTIVE_B64) {
-        III.Disabled = StateStrictWWM;
+        III.Disabled = StateStrict;
          MachineOperand &Inactive = MI.getOperand(2);
          if (Inactive.isReg()) {
            if (Inactive.isUndef()) {
@@ -493,7 +522,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
            Worklist.push_back(&MBB);
          }
          GlobalFlags |= StateExact;
-        III.Disabled = StateWQM | StateStrictWWM;
+        III.Disabled = StateWQM | StateStrict;
          continue;
        } else {
          if (Opcode == AMDGPU::SI_PS_LIVE || Opcode == AMDGPU::SI_LIVE_MASK) {
@@ -570,7 +599,7 @@ void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
  
    // Propagate backwards within block
    if (MachineInstr *PrevMI = MI.getPrevNode()) {
-    char InNeeds = (II.Needs & ~StateStrictWWM) | II.OutNeeds;
+    char InNeeds = (II.Needs & ~StateStrict) | II.OutNeeds;
      if (!PrevMI->isPHI()) {
        InstrInfo &PrevII = Instructions[PrevMI];
        if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
@@ -586,10 +615,12 @@ void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
    if (II.Needs != 0)
      markInstructionUses(MI, II.Needs, Worklist);
  
-  // Ensure we process a block containing StrictWWM, even if it does not require
-  // any WQM transitions.
+  // Ensure we process a block containing StrictWWM/StrictWQM, even if it does
+  // not require any WQM transitions.
    if (II.Needs & StateStrictWWM)
      BI.Needs |= StateStrictWWM;
+  if (II.Needs & StateStrictWQM)
+    BI.Needs |= StateStrictWQM;
  }
  
  void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
@@ -1105,30 +1136,48 @@ void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
    StateTransition[MI] = StateWQM;
  }
  
-void SIWholeQuadMode::toStrictWWM(MachineBasicBlock &MBB,
-                                  MachineBasicBlock::iterator Before,
-                                  Register SaveOrig) {
+void SIWholeQuadMode::toStrictMode(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator Before,
+                                   Register SaveOrig, char StrictStateNeeded) {
   MachineInstr *MI;
-
   assert(SaveOrig);
-  MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WWM),
-               SaveOrig)
-           .addImm(-1);
+  assert(StrictStateNeeded == StateStrictWWM ||
+         StrictStateNeeded == StateStrictWQM);
+
+  if (StrictStateNeeded == StateStrictWWM) {
+    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WWM),
+                 SaveOrig)
+             .addImm(-1);
+  } else {
+    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WQM),
+                 SaveOrig)
+             .addImm(-1);
+  }
   LIS->InsertMachineInstrInMaps(*MI);
-  StateTransition[MI] = StateStrictWWM;
+  StateTransition[MI] = StrictStateNeeded; // Mode actually entered.
  }
  
-void SIWholeQuadMode::fromStrictWWM(MachineBasicBlock &MBB,
-                                    MachineBasicBlock::iterator Before,
-                                    Register SavedOrig,
-                                    char NonStrictWWMState) {
+void SIWholeQuadMode::fromStrictMode(MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator Before,
+                                     Register SavedOrig, char NonStrictState,
+                                     char CurrentStrictState) {
    MachineInstr *MI;
  
    assert(SavedOrig);
-  MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WWM), Exec)
-           .addReg(SavedOrig);
+  assert(CurrentStrictState == StateStrictWWM ||
+         CurrentStrictState == StateStrictWQM);
+
+  if (CurrentStrictState == StateStrictWWM) {
+    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WWM),
+                 Exec)
+             .addReg(SavedOrig);
+  } else {
+    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WQM),
+                 Exec)
+             .addReg(SavedOrig);
+  }
    LIS->InsertMachineInstrInMaps(*MI);
-  StateTransition[MI] = NonStrictWWMState;
+  StateTransition[MI] = NonStrictState;
  }
  
  void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) {
@@ -1149,10 +1198,10 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) {
                      << ":\n");
  
    Register SavedWQMReg;
-  Register SavedNonStrictWWMReg;
+  Register SavedNonStrictReg;
    bool WQMFromExec = IsEntry;
    char State = (IsEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
-  char NonStrictWWMState = 0;
+  char NonStrictState = 0;
    const TargetRegisterClass *BoolRC = TRI->getBoolRC();
  
    auto II = MBB.getFirstNonPHI(), IE = MBB.end();
@@ -1166,25 +1215,25 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) {
    // Exact or vice versa.
    MachineBasicBlock::iterator FirstWQM = IE;
  
-  // This stores the first instruction where it's safe to switch from StrictWWM
-  // to Exact/WQM or to switch to StrictWWM. It must always be the same as, or
-  // after, FirstWQM since if it's safe to switch to/from StrictWWM, it must be
-  // safe to switch to/from WQM as well.
-  MachineBasicBlock::iterator FirstStrictWWM = IE;
+  // This stores the first instruction where it's safe to switch from Strict
+  // mode to Exact/WQM or to switch to Strict mode. It must always be the same
+  // as, or after, FirstWQM since if it's safe to switch to/from Strict, it must
+  // be safe to switch to/from WQM as well.
+  MachineBasicBlock::iterator FirstStrict = IE;
  
    // Record initial state is block information.
    BI.InitialState = State;
  
    for (;;) {
      MachineBasicBlock::iterator Next = II;
-    char Needs = StateExact | StateWQM; // StrictWWM is disabled by default
+    char Needs = StateExact | StateWQM; // Strict mode is disabled by default.
      char OutNeeds = 0;
  
      if (FirstWQM == IE)
        FirstWQM = II;
  
-    if (FirstStrictWWM == IE)
-      FirstStrictWWM = II;
+    if (FirstStrict == IE)
+      FirstStrict = II;
  
      // First, figure out the allowed states (Needs) based on the propagated
      // flags.
@@ -1196,6 +1245,8 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) {
          if (III != Instructions.end()) {
            if (III->second.Needs & StateStrictWWM)
              Needs = StateStrictWWM;
+          else if (III->second.Needs & StateStrictWQM)
+            Needs = StateStrictWQM;
            else if (III->second.Needs & StateWQM)
              Needs = StateWQM;
            else
@@ -1204,8 +1255,8 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) {
          }
        } else {
          // If the instruction doesn't actually need a correct EXEC, then we can
-        // safely leave StrictWWM enabled.
-        Needs = StateExact | StateWQM | StateStrictWWM;
+        // safely leave Strict mode enabled.
+        Needs = StateExact | StateWQM | StateStrict;
        }
  
        if (MI.isTerminator() && OutNeeds == StateExact)
@@ -1225,27 +1276,28 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) {
      // Now, transition if necessary.
      if (!(Needs & State)) {
        MachineBasicBlock::iterator First;
-      if (State == StateStrictWWM || Needs == StateStrictWWM) {
-        // We must switch to or from StrictWWM
-        First = FirstStrictWWM;
+      if (State == StateStrictWWM || Needs == StateStrictWWM ||
+          State == StateStrictWQM || Needs == StateStrictWQM) {
+        // We must switch to or from Strict mode.
+        First = FirstStrict;
        } else {
-        // We only need to switch to/from WQM, so we can use FirstWQM
+        // We only need to switch to/from WQM, so we can use FirstWQM.
          First = FirstWQM;
        }
  
-      // Whether we need to save SCC depends on start and end states
+      // Whether we need to save SCC depends on start and end states.
        bool SaveSCC = false;
        switch (State) {
        case StateExact:
        case StateStrictWWM:
-        // Exact/WWM -> WWM: save SCC
-        // Exact/WWM -> WQM: save SCC if WQM mask is generated from exec
-        // Exact/WWM -> Exact: no save
-        SaveSCC =
-            (Needs & StateStrictWWM) || ((Needs & StateWQM) && WQMFromExec);
+      case StateStrictWQM:
+        // Exact/Strict -> Strict: save SCC
+        // Exact/Strict -> WQM: save SCC if WQM mask is generated from exec
+        // Exact/Strict -> Exact: no save
+        SaveSCC = (Needs & StateStrict) || ((Needs & StateWQM) && WQMFromExec);
          break;
        case StateWQM:
-        // WQM -> Exact/WMM: save SCC
+        // WQM -> Exact/Strict: save SCC
          SaveSCC = !(Needs & StateWQM);
          break;
        default:
@@ -1255,20 +1307,25 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) {
        MachineBasicBlock::iterator Before =
            prepareInsertion(MBB, First, II, Needs == StateWQM, SaveSCC);
  
-      if (State == StateStrictWWM) {
-        assert(SavedNonStrictWWMReg);
-        fromStrictWWM(MBB, Before, SavedNonStrictWWMReg, NonStrictWWMState);
-        LIS->createAndComputeVirtRegInterval(SavedNonStrictWWMReg);
-        SavedNonStrictWWMReg = 0;
-        State = NonStrictWWMState;
+      if (State & StateStrict) {
+        assert(State == StateStrictWWM || State == StateStrictWQM);
+        assert(SavedNonStrictReg);
+        fromStrictMode(MBB, Before, SavedNonStrictReg, NonStrictState, State);
+
+        LIS->createAndComputeVirtRegInterval(SavedNonStrictReg);
+        SavedNonStrictReg = 0;
+        State = NonStrictState;
        }
  
-      if (Needs == StateStrictWWM) {
-        NonStrictWWMState = State;
-        assert(!SavedNonStrictWWMReg);
-        SavedNonStrictWWMReg = MRI->createVirtualRegister(BoolRC);
-        toStrictWWM(MBB, Before, SavedNonStrictWWMReg);
-        State = StateStrictWWM;
+      if (Needs & StateStrict) {
+        NonStrictState = State;
+        assert(Needs == StateStrictWWM || Needs == StateStrictWQM);
+        assert(!SavedNonStrictReg);
+        SavedNonStrictReg = MRI->createVirtualRegister(BoolRC);
+
+        toStrictMode(MBB, Before, SavedNonStrictReg, Needs);
+        State = Needs;
+
        } else {
          if (State == StateWQM && (Needs & StateExact) && !(Needs & StateWQM)) {
            if (!WQMFromExec && (OutNeeds & StateWQM)) {
@@ -1298,10 +1355,10 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) {
        }
      }
  
-    if (Needs != (StateExact | StateWQM | StateStrictWWM)) {
+    if (Needs != (StateExact | StateWQM | StateStrict)) {
        if (Needs != (StateExact | StateWQM))
          FirstWQM = IE;
-      FirstStrictWWM = IE;
+      FirstStrict = IE;
      }
  
      if (II == IE)
@@ -1310,7 +1367,7 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) {
      II = Next;
    }
    assert(!SavedWQMReg);
-  assert(!SavedNonStrictWWMReg);
+  assert(!SavedNonStrictReg);
  }
  
  void SIWholeQuadMode::lowerLiveMaskQueries() {
@@ -1402,6 +1459,10 @@ void SIWholeQuadMode::lowerKillInstrs(bool IsWQM) {
  }
  
  bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
+  LLVM_DEBUG(dbgs() << "SI Whole Quad Mode on " << MF.getName()
+                    << " ------------- \n");
+  LLVM_DEBUG(MF.dump(););
+
    Instructions.clear();
    Blocks.clear();
    LiveMaskQueries.clear();
@@ -1442,10 +1503,9 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
  
    LiveMaskReg = Exec;
  
-  // Shader is simple does not need WQM/StrictWWM or any complex lowering
-  if (!(GlobalFlags & (StateWQM | StateStrictWWM)) &&
-      LowerToCopyInstrs.empty() && LowerToMovInstrs.empty() &&
-      KillInstrs.empty()) {
+  // Shader is simple: it does not need any state changes or complex lowering.
+  if (!(GlobalFlags & (StateWQM | StateStrict)) && LowerToCopyInstrs.empty() &&
+      LowerToMovInstrs.empty() && KillInstrs.empty()) {
      lowerLiveMaskQueries();
      return !LiveMaskQueries.empty();
    }
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll

index 23f568f..20c44ab 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -186,13 +186,17 @@ main_body:
  ; Check that we don't leave WWM on for computations that don't require WWM,
  ; since that will lead clobbering things that aren't supposed to be clobbered
  ; in cases like this.
+; We enforce this by checking that v_add gets emitted in the same block as
+; WWM computations.
  ;
  ;CHECK-LABEL: {{^}}test_wwm3:
+;CHECK: %if
  ;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
  ;CHECK: buffer_load_dword
  ;CHECK: v_add_f32_e32
  ;CHECK: s_mov_b64 exec, [[ORIG]]
  ;CHECK: v_add_f32_e32
+;CHECK: %endif
  define amdgpu_ps float @test_wwm3(i32 inreg %idx) {
  main_body:
    ; use mbcnt to make sure the branch is divergent
@@ -215,13 +219,17 @@ endif:
  
  ; Check that WWM writes aren't coalesced with non-WWM writes, since the WWM
  ; write could clobber disabled channels in the non-WWM one.
+; We enforce this by checking that v_mov gets emitted in the same block as
+; WWM computations.
  ;
  ;CHECK-LABEL: {{^}}test_wwm4:
+;CHECK: %if
  ;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
  ;CHECK: buffer_load_dword
  ;CHECK: v_add_f32_e32
  ;CHECK: s_mov_b64 exec, [[ORIG]]
  ;CHECK-NEXT: v_mov_b32_e32
+;CHECK: %endif
  define amdgpu_ps float @test_wwm4(i32 inreg %idx) {
  main_body:
    ; use mbcnt to make sure the branch is divergent
@@ -277,6 +285,7 @@ main_body:
  ;VI-CHECK: flat_load_dword
  ;CHECK: v_add_f32_e32
  ;CHECK: s_mov_b64 exec, [[ORIG2]]
+;CHECK: %endif
  define amdgpu_ps float @test_wwm6_then() {
  main_body:
    %src0 = load volatile float, float addrspace(1)* undef
@@ -310,6 +319,7 @@ endif:
  ;SI-CHECK: buffer_load_dword
  ;VI-CHECK: flat_load_dword
  ;CHECK: s_mov_b64 exec, [[ORIG2]]
+;CHECK: %endloop
  define amdgpu_ps float @test_wwm6_loop() {
  main_body:
    %src0 = load volatile float, float addrspace(1)* undef
@@ -352,6 +362,208 @@ main_body:
    ret void
  }
  
+; Check that Strict WQM is triggered by the strict_wqm intrinsic.
+;
+;CHECK-LABEL: {{^}}test_strict_wqm1:
+;CHECK:        s_mov_b64 s{{\[[0-9]+:[0-9]+\]}}, exec
+;CHECK:        s_wqm_b64 exec, exec
+;CHECK: buffer_load_dword
+;CHECK: buffer_load_dword
+;CHECK: v_add_f32_e32
+define amdgpu_ps float @test_strict_wqm1(i32 inreg %idx0, i32 inreg %idx1) {
+main_body:
+  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
+  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
+  %out = fadd float %src0, %src1
+  %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out)
+  ret float %out.0
+}
+
+; Same as above, but with an integer type.
+;
+;CHECK-LABEL: {{^}}test_strict_wqm2:
+;CHECK:        s_mov_b64 s{{\[[0-9]+:[0-9]+\]}}, exec
+;CHECK:        s_wqm_b64 exec, exec
+;CHECK: buffer_load_dword
+;CHECK: buffer_load_dword
+;CHECK: v_add_{{[iu]}}32_e32
+define amdgpu_ps float @test_strict_wqm2(i32 inreg %idx0, i32 inreg %idx1) {
+main_body:
+  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
+  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
+  %src0.0 = bitcast float %src0 to i32
+  %src1.0 = bitcast float %src1 to i32
+  %out = add i32 %src0.0, %src1.0
+  %out.0 = call i32 @llvm.amdgcn.strict.wqm.i32(i32 %out)
+  %out.1 = bitcast i32 %out.0 to float
+  ret float %out.1
+}
+
+; Check that we don't leave Strict WQM on for computations that don't require it,
+; since that will lead to clobbering things that aren't supposed to be clobbered
+; in cases like this.
+; We enforce this by checking that v_add gets emitted in the same block as
+; Strict WQM computations.
+;
+;CHECK-LABEL: {{^}}test_strict_wqm3:
+;CHECK: %if
+;CHECK:        s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
+;CHECK:        s_wqm_b64 exec, exec
+;CHECK: buffer_load_dword
+;CHECK: v_add_f32_e32
+;CHECK: s_mov_b64 exec, [[ORIG]]
+;CHECK: v_add_f32_e32
+;CHECK: %endif
+define amdgpu_ps float @test_strict_wqm3(i32 inreg %idx) {
+main_body:
+  ; use mbcnt to make sure the branch is divergent
+  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
+  %cc = icmp uge i32 %hi, 32
+  br i1 %cc, label %endif, label %if
+
+if:
+  %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)
+  %out = fadd float %src, %src
+  %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out)
+  %out.1 = fadd float %src, %out.0
+  br label %endif
+
+endif:
+  %out.2 = phi float [ %out.1, %if ], [ 0.0, %main_body ]
+  ret float %out.2
+}
+
+; Check that Strict WQM writes aren't coalesced with non-strict writes, since
+; the Strict WQM write could clobber disabled channels in the non-strict one.
+; We enforce this by checking that v_mov gets emitted in the same block as
+; Strict WQM computations.
+;
+;CHECK-LABEL: {{^}}test_strict_wqm4:
+;CHECK: %if
+;CHECK:        s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
+;CHECK:        s_wqm_b64 exec, exec
+;CHECK: buffer_load_dword
+;CHECK: v_add_f32_e32
+;CHECK: s_mov_b64 exec, [[ORIG]]
+;CHECK-NEXT: v_mov_b32_e32
+;CHECK: %endif
+define amdgpu_ps float @test_strict_wqm4(i32 inreg %idx) {
+main_body:
+  ; use mbcnt to make sure the branch is divergent
+  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
+  %cc = icmp uge i32 %hi, 32
+  br i1 %cc, label %endif, label %if
+
+if:
+  %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)
+  %out = fadd float %src, %src
+  %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out)
+  br label %endif
+
+endif:
+  %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
+  ret float %out.1
+}
+
+; Make sure the transition from Exact to Strict WQM then WQM works properly.
+;
+;CHECK-LABEL: {{^}}test_strict_wqm5:
+;CHECK: buffer_load_dword
+;CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
+;CHECK: buffer_store_dword
+;CHECK:        s_wqm_b64 exec, exec
+;CHECK: buffer_load_dword
+;CHECK: v_add_f32_e32
+;CHECK: s_mov_b64 exec, [[ORIG]]
+;CHECK: s_wqm_b64 exec, exec
+define amdgpu_ps float @test_strict_wqm5(i32 inreg %idx0, i32 inreg %idx1) {
+main_body:
+  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
+  call void @llvm.amdgcn.struct.buffer.store.f32(float %src0, <4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
+  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
+  %temp = fadd float %src1, %src1
+  %temp.0 = call float @llvm.amdgcn.strict.wqm.f32(float %temp)
+  %out = fadd float %temp.0, %temp.0
+  %out.0 = call float @llvm.amdgcn.wqm.f32(float %out)
+  ret float %out.0
+}
+
+; Check that Strict WQM is turned on correctly across basic block boundaries.
+; if..then..endif version
+;
+;CHECK-LABEL: {{^}}test_strict_wqm6_then:
+;CHECK:        s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
+;CHECK:        s_wqm_b64 exec, exec
+;SI-CHECK: buffer_load_dword
+;VI-CHECK: flat_load_dword
+;CHECK: s_mov_b64 exec, [[ORIG]]
+;CHECK: %if
+;CHECK:        s_mov_b64 [[ORIG2:s\[[0-9]+:[0-9]+\]]], exec
+;CHECK:        s_wqm_b64 exec, exec
+;SI-CHECK: buffer_load_dword
+;VI-CHECK: flat_load_dword
+;CHECK: v_add_f32_e32
+;CHECK: s_mov_b64 exec, [[ORIG2]]
+;CHECK: %endif
+define amdgpu_ps float @test_strict_wqm6_then() {
+main_body:
+  %src0 = load volatile float, float addrspace(1)* undef
+  ; use mbcnt to make sure the branch is divergent
+  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
+  %cc = icmp uge i32 %hi, 32
+  br i1 %cc, label %endif, label %if
+
+if:
+  %src1 = load volatile float, float addrspace(1)* undef
+  %out = fadd float %src0, %src1
+  %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out)
+  br label %endif
+
+endif:
+  %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
+  ret float %out.1
+}
+
+; Check that Strict WQM is turned on correctly across basic block boundaries.
+; loop version
+;
+;CHECK-LABEL: {{^}}test_strict_wqm6_loop:
+;CHECK:        s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
+;CHECK:        s_wqm_b64 exec, exec
+;SI-CHECK: buffer_load_dword
+;VI-CHECK: flat_load_dword
+;CHECK: s_mov_b64 exec, [[ORIG]]
+;CHECK: %loop
+;CHECK:        s_mov_b64 [[ORIG2:s\[[0-9]+:[0-9]+\]]], exec
+;CHECK:        s_wqm_b64 exec, exec
+;SI-CHECK: buffer_load_dword
+;VI-CHECK: flat_load_dword
+;CHECK: s_mov_b64 exec, [[ORIG2]]
+;CHECK: %endloop
+define amdgpu_ps float @test_strict_wqm6_loop() {
+main_body:
+  %src0 = load volatile float, float addrspace(1)* undef
+  ; use mbcnt to make sure the branch is divergent
+  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
+  br label %loop
+
+loop:
+  %counter = phi i32 [ %lo, %main_body ], [ %counter.1, %loop ]
+  %src1 = load volatile float, float addrspace(1)* undef
+  %out = fadd float %src0, %src1
+  %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out)
+  %counter.1 = sub i32 %counter, 1
+  %cc = icmp ne i32 %counter.1, 0
+  br i1 %cc, label %loop, label %endloop
+
+endloop:
+  ret float %out.0
+}
+
  ; Check that enabling WQM anywhere enables WQM for the set.inactive source.
  ;
  ;CHECK-LABEL: {{^}}test_set_inactive2:
@@ -862,13 +1074,17 @@ main_body:
  ; Check that we don't leave WWM on for computations that don't require WWM,
  ; since that will lead clobbering things that aren't supposed to be clobbered
  ; in cases like this.
+; We enforce this by checking that v_add gets emitted in the same block as
+; WWM computations.
  ;
  ;CHECK-LABEL: {{^}}test_strict_wwm3:
+;CHECK: %if
  ;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
  ;CHECK: buffer_load_dword
  ;CHECK: v_add_f32_e32
  ;CHECK: s_mov_b64 exec, [[ORIG]]
  ;CHECK: v_add_f32_e32
+;CHECK: %endif
  define amdgpu_ps float @test_strict_wwm3(i32 inreg %idx) {
  main_body:
    ; use mbcnt to make sure the branch is divergent
@@ -891,13 +1107,17 @@ endif:
  
  ; Check that WWM writes aren't coalesced with non-WWM writes, since the WWM
  ; write could clobber disabled channels in the non-WWM one.
+; We enforce this by checking that v_mov gets emitted in the same block as
+; WWM computations.
  ;
  ;CHECK-LABEL: {{^}}test_strict_wwm4:
+;CHECK: %if
  ;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
  ;CHECK: buffer_load_dword
  ;CHECK: v_add_f32_e32
  ;CHECK: s_mov_b64 exec, [[ORIG]]
  ;CHECK-NEXT: v_mov_b32_e32
+;CHECK: %endif
  define amdgpu_ps float @test_strict_wwm4(i32 inreg %idx) {
  main_body:
    ; use mbcnt to make sure the branch is divergent
@@ -953,6 +1173,7 @@ main_body:
  ;VI-CHECK: flat_load_dword
  ;CHECK: v_add_f32_e32
  ;CHECK: s_mov_b64 exec, [[ORIG2]]
+;CHECK: %endif
  define amdgpu_ps float @test_strict_wwm6_then() {
  main_body:
    %src0 = load volatile float, float addrspace(1)* undef
@@ -986,6 +1207,7 @@ endif:
  ;SI-CHECK: buffer_load_dword
  ;VI-CHECK: flat_load_dword
  ;CHECK: s_mov_b64 exec, [[ORIG2]]
+;CHECK: %endloop
  define amdgpu_ps float @test_strict_wwm6_loop() {
  main_body:
    %src0 = load volatile float, float addrspace(1)* undef
@@ -1059,7 +1281,135 @@ ENDIF:
    ret float %r
  }
  
+; Check a case of a block being entirely WQM except for a bit of STRICT WQM.
+;
+;CHECK-LABEL: {{^}}test_strict_wqm_within_wqm:
+;CHECK: %IF
+;CHECK:        s_mov_b64 s{{\[[0-9]+:[0-9]+\]}}, exec
+;CHECK:        s_wqm_b64 exec, exec
+;CHECK: ds_swizzle
+;
+define amdgpu_ps float @test_strict_wqm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
+main_body:
+  %c.bc = bitcast i32 %c to float
+  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
+  %tex0 = extractelement <4 x float> %tex, i32 0
+  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
+  %cmp = icmp eq i32 %z, 0
+  br i1 %cmp, label %IF, label %ENDIF
  
+IF:
+  %dataf = extractelement <4 x float> %dtex, i32 0
+  %data1 = fptosi float %dataf to i32
+  %data2 = call i32 @llvm.amdgcn.ds.swizzle(i32 %data1, i32 2079)
+  %data3 = call i32 @llvm.amdgcn.strict.wqm.i32(i32 %data2)
+  %data3f = sitofp i32 %data3 to float
+  br label %ENDIF
+
+ENDIF:
+  %r = phi float [ %data3f, %IF ], [ 0.0, %main_body ]
+  ret float %r
+}
+
+;CHECK-LABEL: {{^}}test_strict_wqm_strict_wwm_wqm:
+;CHECK: buffer_store_dword
+
+;CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
+;CHECK: s_wqm_b64 exec, exec
+;CHECK: buffer_load_dword
+;CHECK: s_mov_b64 exec, [[ORIG]]
+
+;CHECK: s_or_saveexec_b64 [[ORIG2:s\[[0-9]+:[0-9]+\]]], -1
+;CHECK: buffer_load_dword
+;CHECK: s_mov_b64 exec, [[ORIG2]]
+
+;CHECK: s_mov_b64 [[ORIG3:s\[[0-9]+:[0-9]+\]]], exec
+;CHECK: s_wqm_b64 exec, exec
+;CHECK: v_add
+;CHECK: s_mov_b64 exec, [[ORIG3]]
+
+;TODO: StrictWQM -> WQM transition could be improved. WQM could use the exec from the previous state instead of calling s_wqm again.
+;CHECK: s_wqm_b64 exec, exec
+;CHECK: image_sample
+
+define amdgpu_ps float @test_strict_wqm_strict_wwm_wqm(i32 inreg %idx0, i32 inreg %idx1, <4 x i32> inreg %res, <4 x i32> inreg %res2, float %inp, <8 x i32> inreg %res3) {
+main_body:
+  call void @llvm.amdgcn.struct.buffer.store.f32(float %inp, <4 x i32> %res, i32 %idx1, i32 0, i32 0, i32 0)
+  %reload = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx1, i32 0, i32 0, i32 0)
+  %temp = fadd float %reload, %reload
+  %temp2 = call float @llvm.amdgcn.strict.wqm.f32(float %temp)
+  %temp3 = fadd float %temp2, %temp2
+  %reload_wwm = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res2, i32 %idx0, i32 0, i32 0, i32 0)
+  %temp4 = call float @llvm.amdgcn.strict.wwm.f32(float %reload_wwm)
+  %temp5 = fadd float %temp3, %temp4
+  %tex = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %temp5, <8 x i32> %res3, <4 x i32> %res, i1 false, i32 0, i32 0)
+  call void @llvm.amdgcn.struct.buffer.store.f32(float %tex, <4 x i32> %res, i32 %idx1, i32 0, i32 0, i32 0)
+  %out = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx1, i32 0, i32 0, i32 0)
+  ret float %out
+}
+
+;CHECK-LABEL: {{^}}test_strict_wwm_strict_wqm_wqm:
+;CHECK: buffer_store_dword
+
+;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
+;CHECK: buffer_load_dword
+;CHECK: s_mov_b64 exec, [[ORIG]]
+
+;CHECK: s_mov_b64 [[ORIG2:s\[[0-9]+:[0-9]+\]]], exec
+;CHECK: s_wqm_b64 exec, exec
+;CHECK: buffer_load_dword
+;CHECK: s_mov_b64 exec, [[ORIG2]]
+
+;CHECK: s_or_saveexec_b64 [[ORIG3:s\[[0-9]+:[0-9]+\]]], -1
+;CHECK: v_add
+;CHECK: s_mov_b64 exec, [[ORIG3]]
+
+;CHECK: s_wqm_b64 exec, exec
+;CHECK: image_sample
+define amdgpu_ps float @test_strict_wwm_strict_wqm_wqm(i32 inreg %idx0, i32 inreg %idx1, <4 x i32> inreg %res, float %inp, <8 x i32> inreg %res2) {
+main_body:
+  call void @llvm.amdgcn.struct.buffer.store.f32(float %inp, <4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0)
+  %reload = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx1, i32 0, i32 0, i32 0)
+  %temp = fadd float %reload, %reload
+  %temp2 = call float @llvm.amdgcn.strict.wwm.f32(float %temp)
+  %temp3 = fadd float %temp2, %temp2
+  %reload_wwm = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0)
+  %temp4 = call float @llvm.amdgcn.strict.wqm.f32(float %reload_wwm)
+  %temp5 = fadd float %temp3, %temp4
+  %tex = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %temp5, <8 x i32> %res2, <4 x i32> %res, i1 false, i32 0, i32 0)
+  call void @llvm.amdgcn.struct.buffer.store.f32(float %tex, <4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0)
+  %out = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0)
+  ret float %out
+}
+
+;CHECK-LABEL: {{^}}test_wqm_strict_wqm_wqm:
+;CHECK: buffer_store_dword
+
+;CHECK: s_wqm_b64 exec, exec
+
+;TODO: WQM -> StrictWQM transition could be improved. StrictWQM could use the exec from the previous state instead of calling s_wqm again.
+;CHECK: s_mov_b64 [[ORIG2:s\[[0-9]+:[0-9]+\]]], exec
+;CHECK: s_wqm_b64 exec, exec
+;CHECK: buffer_load_dword
+;CHECK: s_mov_b64 exec, [[ORIG2]]
+
+;CHECK: image_sample
+
+define amdgpu_ps float @test_wqm_strict_wqm_wqm(i32 inreg %idx0, i32 inreg %idx1, <4 x i32> inreg %res, float %inp, <8 x i32> inreg %res2) {
+main_body:
+  call void @llvm.amdgcn.struct.buffer.store.f32(float %inp, <4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0)
+  %reload = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx1, i32 0, i32 0, i32 0)
+  %temp = fadd float %reload, %reload
+  %tex = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %temp, <8 x i32> %res2, <4 x i32> %res, i1 false, i32 0, i32 0)
+  %temp2 = fadd float %tex, %tex
+  %reload_wwm = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0)
+  %temp3 = call float @llvm.amdgcn.strict.wqm.f32(float %reload_wwm)
+  %temp4 = fadd float %temp2, %temp3
+  %tex2 = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %temp4, <8 x i32> %res2, <4 x i32> %res, i1 false, i32 0, i32 0)
+  call void @llvm.amdgcn.struct.buffer.store.f32(float %tex2, <4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0)
+  %out = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0)
+  ret float %out
+}
  
  declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1
  declare void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float>, i32, i32, <8 x i32>, i32, i32) #1
@@ -1074,6 +1424,7 @@ declare float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32>, i32, i32, i32, i32)
  declare <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32, i32, <8 x i32>, i32, i32) #3
  declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
  declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
+declare float @llvm.amdgcn.image.sample.1d.f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
  declare void @llvm.amdgcn.kill(i1) #1
  declare float @llvm.amdgcn.wqm.f32(float) #3
  declare i32 @llvm.amdgcn.wqm.i32(i32) #3
@@ -1081,6 +1432,8 @@ declare float @llvm.amdgcn.strict.wwm.f32(float) #3
  declare i32 @llvm.amdgcn.strict.wwm.i32(i32) #3
  declare float @llvm.amdgcn.wwm.f32(float) #3
  declare i32 @llvm.amdgcn.wwm.i32(i32) #3
+declare float @llvm.amdgcn.strict.wqm.f32(float) #3
+declare i32 @llvm.amdgcn.strict.wqm.i32(i32) #3
  declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #4
  declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #3
  declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #3
author	Piotr Sobczak <Piotr.Sobczak@amd.com>
	Wed, 3 Mar 2021 09:18:28 +0000 (10:18 +0100)
committer	Piotr Sobczak <Piotr.Sobczak@amd.com>
	Wed, 3 Mar 2021 13:19:16 +0000 (14:19 +0100)
llvm/include/llvm/IR/IntrinsicsAMDGPU.td		patch \| blob \| history
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp		patch \| blob \| history
llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp		patch \| blob \| history
llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp		patch \| blob \| history
llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp		patch \| blob \| history
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp		patch \| blob \| history
llvm/lib/Target/AMDGPU/SIInstructions.td		patch \| blob \| history
llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp		patch \| blob \| history
llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp		patch \| blob \| history
llvm/test/CodeGen/AMDGPU/wqm.ll		patch \| blob \| history