[AMDGPU] Create new instructions in SIInstrInfo::moveToVALU

author Jay Foad <jay.foad@amd.com>
Mon, 31 Oct 2022 15:10:14 +0000 (15:10 +0000)

committer Jay Foad <jay.foad@amd.com>
Fri, 4 Nov 2022 07:21:11 +0000 (07:21 +0000)

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

index 27e0d56..775da47 100644 (file)
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -6362,7 +6362,6 @@ MachineBasicBlock *SIInstrInfo::moveToVALU(MachineInstr &TopInst,
        continue;
      }
  
-
      if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
        // We cannot move this instruction to the VALU, so we should try to
        // legalize its operands instead.
@@ -6372,43 +6371,90 @@ MachineBasicBlock *SIInstrInfo::moveToVALU(MachineInstr &TopInst,
        continue;
      }
  
+    // Handle converting generic instructions like COPY-to-SGPR into
+    // COPY-to-VGPR.
+    if (NewOpcode == Opcode) {
+      Register DstReg = Inst.getOperand(0).getReg();
+      const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
+
+      if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() &&
+          NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
+        // Instead of creating a copy where src and dst are the same register
+        // class, we just replace all uses of dst with src.  These kinds of
+        // copies interfere with the heuristics MachineSink uses to decide
+        // whether or not to split a critical edge, since the pass assumes
+        // that copies will end up as machine instructions and not be
+        // eliminated.
+        addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
+        MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg());
+        MRI.clearKillFlags(Inst.getOperand(1).getReg());
+        Inst.getOperand(0).setReg(DstReg);
+
+        // Make sure we don't leave around a dead VGPR->SGPR copy. Normally
+        // these are deleted later, but at -O0 it would leave a suspicious
+        // looking illegal copy of an undef register.
+        for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I)
+          Inst.removeOperand(I);
+        Inst.setDesc(get(AMDGPU::IMPLICIT_DEF));
+        continue;
+      }
+
+      Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
+      MRI.replaceRegWith(DstReg, NewDstReg);
+      legalizeOperands(Inst, MDT);
+      addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
+      continue;
+    }
+
      // Use the new VALU Opcode.
-    const MCInstrDesc &NewDesc = get(NewOpcode);
-    Inst.setDesc(NewDesc);
+    auto NewInstr = BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode))
+                        .setMIFlags(Inst.getFlags());
+    for (const MachineOperand &Op : Inst.explicit_operands())
+      NewInstr->addOperand(Op);
  
      // Remove any references to SCC. Vector instructions can't read from it, and
      // We're just about to add the implicit use / defs of VCC, and we don't want
      // both.
-    for (unsigned i = Inst.getNumOperands() - 1; i > 0; --i) {
-      MachineOperand &Op = Inst.getOperand(i);
-      if (Op.isReg() && Op.getReg() == AMDGPU::SCC) {
+    for (MachineOperand &Op : Inst.implicit_operands()) {
+      if (Op.getReg() == AMDGPU::SCC) {
          // Only propagate through live-def of SCC.
          if (Op.isDef() && !Op.isDead())
            addSCCDefUsersToVALUWorklist(Op, Inst, Worklist);
          if (Op.isUse())
-          addSCCDefsToVALUWorklist(Op, Worklist);
-        Inst.removeOperand(i);
+          addSCCDefsToVALUWorklist(NewInstr, Worklist);
        }
      }
  
+    Inst.eraseFromParent();
+
+    Register NewDstReg;
+    if (NewInstr->getOperand(0).isReg() && NewInstr->getOperand(0).isDef()) {
+      Register DstReg = NewInstr->getOperand(0).getReg();
+      assert(DstReg.isVirtual());
+
+      // Update the destination register class.
+      const TargetRegisterClass *NewDstRC =
+          getDestEquivalentVGPRClass(*NewInstr);
+      assert(NewDstRC);
+
+      NewDstReg = MRI.createVirtualRegister(NewDstRC);
+      MRI.replaceRegWith(DstReg, NewDstReg);
+    }
+
      if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
        // We are converting these to a BFE, so we need to add the missing
        // operands for the size and offset.
        unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
-      Inst.addOperand(MachineOperand::CreateImm(0));
-      Inst.addOperand(MachineOperand::CreateImm(Size));
-
+      NewInstr.addImm(0);
+      NewInstr.addImm(Size);
      } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
        // The VALU version adds the second operand to the result, so insert an
        // extra 0 operand.
-      Inst.addOperand(MachineOperand::CreateImm(0));
+      NewInstr.addImm(0);
      }
  
-    Inst.addImplicitDefUseOperands(*Inst.getParent()->getParent());
-    fixImplicitOperands(Inst);
-
      if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
-      const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
+      const MachineOperand &OffsetWidthOp = NewInstr->getOperand(2);
        // If we need to move this to VGPRs, we need to unpack the second operand
        // back into the 2 separate ones for bit offset and width.
        assert(OffsetWidthOp.isImm() &&
@@ -6417,56 +6463,20 @@ MachineBasicBlock *SIInstrInfo::moveToVALU(MachineInstr &TopInst,
  
        uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
        uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
-      Inst.removeOperand(2);                     // Remove old immediate.
-      Inst.addOperand(MachineOperand::CreateImm(Offset));
-      Inst.addOperand(MachineOperand::CreateImm(BitWidth));
+      NewInstr->removeOperand(2);
+      NewInstr.addImm(Offset);
+      NewInstr.addImm(BitWidth);
      }
  
-    bool HasDst = Inst.getOperand(0).isReg() && Inst.getOperand(0).isDef();
-    Register NewDstReg;
-    if (HasDst) {
-      Register DstReg = Inst.getOperand(0).getReg();
-      if (DstReg.isPhysical())
-        continue;
-
-      // Update the destination register class.
-      const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
-      if (!NewDstRC)
-        continue;
-
-      if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() &&
-          NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
-        // Instead of creating a copy where src and dst are the same register
-        // class, we just replace all uses of dst with src.  These kinds of
-        // copies interfere with the heuristics MachineSink uses to decide
-        // whether or not to split a critical edge.  Since the pass assumes
-        // that copies will end up as machine instructions and not be
-        // eliminated.
-        addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
-        MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg());
-        MRI.clearKillFlags(Inst.getOperand(1).getReg());
-        Inst.getOperand(0).setReg(DstReg);
-
-        // Make sure we don't leave around a dead VGPR->SGPR copy. Normally
-        // these are deleted later, but at -O0 it would leave a suspicious
-        // looking illegal copy of an undef register.
-        for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I)
-          Inst.removeOperand(I);
-        Inst.setDesc(get(AMDGPU::IMPLICIT_DEF));
-        continue;
-      }
-
-      NewDstReg = MRI.createVirtualRegister(NewDstRC);
-      MRI.replaceRegWith(DstReg, NewDstReg);
-    }
+    fixImplicitOperands(*NewInstr);
  
      // Legalize the operands
-    CreatedBBTmp = legalizeOperands(Inst, MDT);
+    CreatedBBTmp = legalizeOperands(*NewInstr, MDT);
      if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp)
        CreatedBB = CreatedBBTmp;
  
-    if (HasDst)
-     addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
+    if (NewDstReg)
+      addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
    }
    return CreatedBB;
  }
@@ -7229,11 +7239,8 @@ void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op,
  // SCC must be changed to an instruction that defines VCC. This function makes
  // sure that the instruction that defines SCC is added to the moveToVALU
  // worklist.
-void SIInstrInfo::addSCCDefsToVALUWorklist(MachineOperand &Op,
+void SIInstrInfo::addSCCDefsToVALUWorklist(MachineInstr *SCCUseInst,
                                             SetVectorType &Worklist) const {
-  assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isUse());
-
-  MachineInstr *SCCUseInst = Op.getParent();
    // Look for a preceding instruction that either defines VCC or SCC. If VCC
    // then there is nothing to do because the defining instruction has been
    // converted to a VALU already. If SCC then that instruction needs to be
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h

index 0558901..bf4330e 100644 (file)
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -130,7 +130,7 @@ private:
                                      MachineInstr &SCCDefInst,
                                      SetVectorType &Worklist,
                                      Register NewCond = Register()) const;
-  void addSCCDefsToVALUWorklist(MachineOperand &Op,
+  void addSCCDefsToVALUWorklist(MachineInstr *SCCUseInst,
                                  SetVectorType &Worklist) const;
  
    const TargetRegisterClass *
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll

index c04ab31..e6b6bd6 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll
@@ -12,7 +12,7 @@ define amdgpu_ps float @test1(i32 inreg %idx0, i32 inreg %idx1) {
  ; CHECK-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
  ; CHECK-NEXT:    s_waitcnt vmcnt(0)
  ; CHECK-NEXT:    v_add_f32_e32 v0, v0, v1
-; CHECK-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
+; CHECK-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
  ; CHECK-NEXT:    ; return to shader part epilog
  main_body:
    %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
@@ -33,7 +33,7 @@ define amdgpu_ps float @test2(i32 inreg %idx0, i32 inreg %idx1) {
  ; CHECK-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
  ; CHECK-NEXT:    s_waitcnt vmcnt(0)
  ; CHECK-NEXT:    v_add_f32_e32 v0, v0, v1
-; CHECK-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
+; CHECK-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
  ; CHECK-NEXT:    ; return to shader part epilog
  main_body:
    %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
@@ -58,7 +58,7 @@ define amdgpu_ps float @test_softwqm1(i32 inreg %idx0, i32 inreg %idx1) {
  ; CHECK-NEXT:    v_add_f32_e32 v1, v1, v2
  ; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 idxen
  ; CHECK-NEXT:    v_add_f32_e32 v0, v1, v1
-; CHECK-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
+; CHECK-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
  ; CHECK-NEXT:    s_waitcnt vmcnt(0)
  ; CHECK-NEXT:    ; return to shader part epilog
  main_body:
@@ -124,7 +124,7 @@ define amdgpu_ps float @test_wwm1(i32 inreg %idx0, i32 inreg %idx1) {
  ; CHECK-NEXT:    s_mov_b64 exec, s[2:3]
  ; CHECK-NEXT:    v_mov_b32_e32 v0, v1
  ; CHECK-NEXT:    v_add_f32_e32 v0, v0, v0
-; CHECK-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
+; CHECK-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
  ; CHECK-NEXT:    ; return to shader part epilog
  main_body:
    %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
@@ -156,7 +156,7 @@ define amdgpu_ps float @test_strict_wwm1(i32 inreg %idx0, i32 inreg %idx1) {
  ; CHECK-NEXT:    s_mov_b64 exec, s[2:3]
  ; CHECK-NEXT:    v_mov_b32_e32 v0, v1
  ; CHECK-NEXT:    v_add_f32_e32 v0, v0, v0
-; CHECK-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
+; CHECK-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
  ; CHECK-NEXT:    ; return to shader part epilog
  main_body:
    %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
@@ -191,7 +191,7 @@ define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inr
  ; CHECK-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
  ; CHECK-NEXT:    s_waitcnt vmcnt(0)
  ; CHECK-NEXT:    v_add_f32_e32 v2, v0, v1
-; CHECK-NEXT:    ; kill: def $vgpr2 killed $vgpr2 killed $exec killed $exec
+; CHECK-NEXT:    ; kill: def $vgpr2 killed $vgpr2 killed $exec
  ; CHECK-NEXT:  .LBB6_4: ; %END
  ; CHECK-NEXT:    s_or_b64 exec, exec, s[0:1]
  ; CHECK-NEXT:    v_mov_b32_e32 v0, v2
@@ -246,7 +246,7 @@ define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inr
  ; CHECK-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
  ; CHECK-NEXT:    s_waitcnt vmcnt(0)
  ; CHECK-NEXT:    v_add_f32_e32 v2, v0, v1
-; CHECK-NEXT:    ; kill: def $vgpr2 killed $vgpr2 killed $exec killed $exec
+; CHECK-NEXT:    ; kill: def $vgpr2 killed $vgpr2 killed $exec
  ; CHECK-NEXT:  .LBB7_4: ; %END
  ; CHECK-NEXT:    s_or_b64 exec, exec, s[0:1]
  ; CHECK-NEXT:    s_and_b64 exec, exec, s[14:15]
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll

index 2167a5a..7e612f5 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -204,7 +204,7 @@ define amdgpu_ps float @test5(i32 inreg %idx0, i32 inreg %idx1) {
  ; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
  ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
  ; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX9-W64-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
+; GFX9-W64-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
  ; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[2:3]
  ; GFX9-W64-NEXT:    ; return to shader part epilog
  ;
@@ -219,7 +219,7 @@ define amdgpu_ps float @test5(i32 inreg %idx0, i32 inreg %idx1) {
  ; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
  ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
  ; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX10-W32-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
+; GFX10-W32-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
  ; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s2
  ; GFX10-W32-NEXT:    ; return to shader part epilog
  main_body:
@@ -243,7 +243,7 @@ define amdgpu_ps float @test6(i32 inreg %idx0, i32 inreg %idx1) {
  ; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
  ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
  ; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX9-W64-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
+; GFX9-W64-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
  ; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[2:3]
  ; GFX9-W64-NEXT:    ; return to shader part epilog
  ;
@@ -258,7 +258,7 @@ define amdgpu_ps float @test6(i32 inreg %idx0, i32 inreg %idx1) {
  ; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
  ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
  ; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX10-W32-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
+; GFX10-W32-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
  ; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s2
  ; GFX10-W32-NEXT:    ; return to shader part epilog
  main_body:
@@ -496,7 +496,7 @@ define amdgpu_ps float @test_wwm5(i32 inreg %idx0, i32 inreg %idx1) {
  ; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
  ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
  ; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v0
-; GFX9-W64-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
+; GFX9-W64-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
  ; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[2:3]
  ; GFX9-W64-NEXT:    ; return to shader part epilog
  ;
@@ -518,7 +518,7 @@ define amdgpu_ps float @test_wwm5(i32 inreg %idx0, i32 inreg %idx1) {
  ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
  ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
  ; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v0
-; GFX10-W32-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
+; GFX10-W32-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
  ; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s2
  ; GFX10-W32-NEXT:    s_waitcnt_vscnt null, 0x0
  ; GFX10-W32-NEXT:    ; return to shader part epilog
@@ -962,7 +962,7 @@ define amdgpu_ps float @test_strict_wqm5(i32 inreg %idx0, i32 inreg %idx1) {
  ; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
  ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
  ; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v0
-; GFX9-W64-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
+; GFX9-W64-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
  ; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[2:3]
  ; GFX9-W64-NEXT:    ; return to shader part epilog
  ;
@@ -986,7 +986,7 @@ define amdgpu_ps float @test_strict_wqm5(i32 inreg %idx0, i32 inreg %idx1) {
  ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
  ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
  ; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v0
-; GFX10-W32-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
+; GFX10-W32-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
  ; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s2
  ; GFX10-W32-NEXT:    s_waitcnt_vscnt null, 0x0
  ; GFX10-W32-NEXT:    ; return to shader part epilog
@@ -1176,7 +1176,7 @@ define amdgpu_ps void @test_set_inactive2(i32 inreg %idx0, i32 inreg %idx1) {
  ; GFX9-W64-NEXT:    s_nop 0
  ; GFX9-W64-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 idxen
  ; GFX9-W64-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $scc killed $exec
-; GFX9-W64-NEXT:    ; kill: def $vgpr2 killed $vgpr2 killed $exec killed $exec
+; GFX9-W64-NEXT:    ; kill: def $vgpr2 killed $vgpr2 killed $exec
  ; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[2:3]
  ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
  ; GFX9-W64-NEXT:    v_add_u32_e32 v1, v2, v1
@@ -1193,7 +1193,7 @@ define amdgpu_ps void @test_set_inactive2(i32 inreg %idx0, i32 inreg %idx1) {
  ; GFX10-W32-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 idxen
  ; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
  ; GFX10-W32-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $scc killed $exec
-; GFX10-W32-NEXT:    ; kill: def $vgpr1 killed $vgpr1 killed $exec killed $exec
+; GFX10-W32-NEXT:    ; kill: def $vgpr1 killed $vgpr1 killed $exec
  ; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s2
  ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
  ; GFX10-W32-NEXT:    v_add_nc_u32_e32 v1, v1, v2
@@ -2500,7 +2500,7 @@ define amdgpu_ps float @test_strict_wwm5(i32 inreg %idx0, i32 inreg %idx1) {
  ; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
  ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
  ; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v0
-; GFX9-W64-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
+; GFX9-W64-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
  ; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[2:3]
  ; GFX9-W64-NEXT:    ; return to shader part epilog
  ;
@@ -2522,7 +2522,7 @@ define amdgpu_ps float @test_strict_wwm5(i32 inreg %idx0, i32 inreg %idx1) {
  ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
  ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
  ; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v0
-; GFX10-W32-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
+; GFX10-W32-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
  ; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s2
  ; GFX10-W32-NEXT:    s_waitcnt_vscnt null, 0x0
  ; GFX10-W32-NEXT:    ; return to shader part epilog
author	Jay Foad <jay.foad@amd.com>
	Mon, 31 Oct 2022 15:10:14 +0000 (15:10 +0000)
committer	Jay Foad <jay.foad@amd.com>
	Fri, 4 Nov 2022 07:21:11 +0000 (07:21 +0000)
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp		patch | blob | history
llvm/lib/Target/AMDGPU/SIInstrInfo.h		patch | blob | history
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll		patch | blob | history
llvm/test/CodeGen/AMDGPU/wqm.ll		patch | blob | history