continue;
}
-
if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
// We cannot move this instruction to the VALU, so we should try to
// legalize its operands instead.
continue;
}
+ // Handle converting generic instructions like COPY-to-SGPR into
+ // COPY-to-VGPR.
+ if (NewOpcode == Opcode) {
+ Register DstReg = Inst.getOperand(0).getReg();
+ const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
+
+ if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() &&
+ NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
+ // Instead of creating a copy where src and dst are the same register
+ // class, we just replace all uses of dst with src. These kinds of
+ // copies interfere with the heuristics MachineSink uses to decide
+ // whether or not to split a critical edge, since the pass assumes
+ // that copies will end up as machine instructions and not be
+ // eliminated.
+ addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
+ MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg());
+ MRI.clearKillFlags(Inst.getOperand(1).getReg());
+ Inst.getOperand(0).setReg(DstReg);
+
+ // Make sure we don't leave around a dead VGPR->SGPR copy. Normally
+ // these are deleted later, but at -O0 it would leave a
+ // suspicious-looking illegal copy of an undef register.
+ for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I)
+ Inst.removeOperand(I);
+ Inst.setDesc(get(AMDGPU::IMPLICIT_DEF));
+ continue;
+ }
+
+ Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
+ MRI.replaceRegWith(DstReg, NewDstReg);
+ legalizeOperands(Inst, MDT);
+ addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
+ continue;
+ }
+
// Use the new VALU Opcode.
- const MCInstrDesc &NewDesc = get(NewOpcode);
- Inst.setDesc(NewDesc);
+ auto NewInstr = BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode))
+ .setMIFlags(Inst.getFlags());
+ for (const MachineOperand &Op : Inst.explicit_operands())
+ NewInstr->addOperand(Op);
// Remove any references to SCC. Vector instructions can't read from it, and
// We're just about to add the implicit use / defs of VCC, and we don't want
// both.
- for (unsigned i = Inst.getNumOperands() - 1; i > 0; --i) {
- MachineOperand &Op = Inst.getOperand(i);
- if (Op.isReg() && Op.getReg() == AMDGPU::SCC) {
+ for (MachineOperand &Op : Inst.implicit_operands()) {
+ if (Op.getReg() == AMDGPU::SCC) {
// Only propagate through live-def of SCC.
if (Op.isDef() && !Op.isDead())
addSCCDefUsersToVALUWorklist(Op, Inst, Worklist);
if (Op.isUse())
- addSCCDefsToVALUWorklist(Op, Worklist);
- Inst.removeOperand(i);
+ addSCCDefsToVALUWorklist(NewInstr, Worklist);
}
}
+ Inst.eraseFromParent();
+
+ Register NewDstReg;
+ if (NewInstr->getOperand(0).isReg() && NewInstr->getOperand(0).isDef()) {
+ Register DstReg = NewInstr->getOperand(0).getReg();
+ assert(DstReg.isVirtual());
+
+ // Update the destination register class.
+ const TargetRegisterClass *NewDstRC =
+ getDestEquivalentVGPRClass(*NewInstr);
+ assert(NewDstRC);
+
+ NewDstReg = MRI.createVirtualRegister(NewDstRC);
+ MRI.replaceRegWith(DstReg, NewDstReg);
+ }
+
if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
// We are converting these to a BFE, so we need to add the missing
// operands for the size and offset.
unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
- Inst.addOperand(MachineOperand::CreateImm(0));
- Inst.addOperand(MachineOperand::CreateImm(Size));
-
+ NewInstr.addImm(0);
+ NewInstr.addImm(Size);
} else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
// The VALU version adds the second operand to the result, so insert an
// extra 0 operand.
- Inst.addOperand(MachineOperand::CreateImm(0));
+ NewInstr.addImm(0);
}
- Inst.addImplicitDefUseOperands(*Inst.getParent()->getParent());
- fixImplicitOperands(Inst);
-
if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
- const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
+ const MachineOperand &OffsetWidthOp = NewInstr->getOperand(2);
// If we need to move this to VGPRs, we need to unpack the second operand
// back into the 2 separate ones for bit offset and width.
assert(OffsetWidthOp.isImm() &&
uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
- Inst.removeOperand(2); // Remove old immediate.
- Inst.addOperand(MachineOperand::CreateImm(Offset));
- Inst.addOperand(MachineOperand::CreateImm(BitWidth));
+ NewInstr->removeOperand(2);
+ NewInstr.addImm(Offset);
+ NewInstr.addImm(BitWidth);
}
- bool HasDst = Inst.getOperand(0).isReg() && Inst.getOperand(0).isDef();
- Register NewDstReg;
- if (HasDst) {
- Register DstReg = Inst.getOperand(0).getReg();
- if (DstReg.isPhysical())
- continue;
-
- // Update the destination register class.
- const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
- if (!NewDstRC)
- continue;
-
- if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() &&
- NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
- // Instead of creating a copy where src and dst are the same register
- // class, we just replace all uses of dst with src. These kinds of
- // copies interfere with the heuristics MachineSink uses to decide
- // whether or not to split a critical edge. Since the pass assumes
- // that copies will end up as machine instructions and not be
- // eliminated.
- addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
- MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg());
- MRI.clearKillFlags(Inst.getOperand(1).getReg());
- Inst.getOperand(0).setReg(DstReg);
-
- // Make sure we don't leave around a dead VGPR->SGPR copy. Normally
- // these are deleted later, but at -O0 it would leave a suspicious
- // looking illegal copy of an undef register.
- for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I)
- Inst.removeOperand(I);
- Inst.setDesc(get(AMDGPU::IMPLICIT_DEF));
- continue;
- }
-
- NewDstReg = MRI.createVirtualRegister(NewDstRC);
- MRI.replaceRegWith(DstReg, NewDstReg);
- }
+ fixImplicitOperands(*NewInstr);
// Legalize the operands
- CreatedBBTmp = legalizeOperands(Inst, MDT);
+ CreatedBBTmp = legalizeOperands(*NewInstr, MDT);
if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp)
CreatedBB = CreatedBBTmp;
- if (HasDst)
- addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
+ if (NewDstReg)
+ addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
}
return CreatedBB;
}
// SCC must be changed to an instruction that defines VCC. This function makes
// sure that the instruction that defines SCC is added to the moveToVALU
// worklist.
-void SIInstrInfo::addSCCDefsToVALUWorklist(MachineOperand &Op,
+void SIInstrInfo::addSCCDefsToVALUWorklist(MachineInstr *SCCUseInst,
SetVectorType &Worklist) const {
- assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isUse());
-
- MachineInstr *SCCUseInst = Op.getParent();
// Look for a preceding instruction that either defines VCC or SCC. If VCC
// then there is nothing to do because the defining instruction has been
// converted to a VALU already. If SCC then that instruction needs to be
MachineInstr &SCCDefInst,
SetVectorType &Worklist,
Register NewCond = Register()) const;
- void addSCCDefsToVALUWorklist(MachineOperand &Op,
+ void addSCCDefsToVALUWorklist(MachineInstr *SCCUseInst,
SetVectorType &Worklist) const;
const TargetRegisterClass *
; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_add_f32_e32 v0, v0, v1
-; CHECK-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
+; CHECK-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
; CHECK-NEXT: ; return to shader part epilog
main_body:
%src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_add_f32_e32 v0, v0, v1
-; CHECK-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
+; CHECK-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
; CHECK-NEXT: ; return to shader part epilog
main_body:
%src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
; CHECK-NEXT: v_add_f32_e32 v1, v1, v2
; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 idxen
; CHECK-NEXT: v_add_f32_e32 v0, v1, v1
-; CHECK-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
+; CHECK-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: ; return to shader part epilog
main_body:
; CHECK-NEXT: s_mov_b64 exec, s[2:3]
; CHECK-NEXT: v_mov_b32_e32 v0, v1
; CHECK-NEXT: v_add_f32_e32 v0, v0, v0
-; CHECK-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
+; CHECK-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
; CHECK-NEXT: ; return to shader part epilog
main_body:
%src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
; CHECK-NEXT: s_mov_b64 exec, s[2:3]
; CHECK-NEXT: v_mov_b32_e32 v0, v1
; CHECK-NEXT: v_add_f32_e32 v0, v0, v0
-; CHECK-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
+; CHECK-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
; CHECK-NEXT: ; return to shader part epilog
main_body:
%src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_add_f32_e32 v2, v0, v1
-; CHECK-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec killed $exec
+; CHECK-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec
; CHECK-NEXT: .LBB6_4: ; %END
; CHECK-NEXT: s_or_b64 exec, exec, s[0:1]
; CHECK-NEXT: v_mov_b32_e32 v0, v2
; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_add_f32_e32 v2, v0, v1
-; CHECK-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec killed $exec
+; CHECK-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec
; CHECK-NEXT: .LBB7_4: ; %END
; CHECK-NEXT: s_or_b64 exec, exec, s[0:1]
; CHECK-NEXT: s_and_b64 exec, exec, s[14:15]
; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
+; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3]
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX10-W32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
+; GFX10-W32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s2
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
+; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3]
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX10-W32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
+; GFX10-W32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s2
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v0
-; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
+; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3]
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v0
-; GFX10-W32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
+; GFX10-W32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s2
; GFX10-W32-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-W32-NEXT: ; return to shader part epilog
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v0
-; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
+; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3]
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v0
-; GFX10-W32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
+; GFX10-W32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s2
; GFX10-W32-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-W32-NEXT: ; return to shader part epilog
; GFX9-W64-NEXT: s_nop 0
; GFX9-W64-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen
; GFX9-W64-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $scc killed $exec
-; GFX9-W64-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec killed $exec
+; GFX9-W64-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3]
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_add_u32_e32 v1, v2, v1
; GFX10-W32-NEXT: buffer_load_dword v2, v0, s[0:3], 0 idxen
; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX10-W32-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $scc killed $exec
-; GFX10-W32-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $exec killed $exec
+; GFX10-W32-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $exec
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s2
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_add_nc_u32_e32 v1, v1, v2
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v0
-; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
+; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3]
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v0
-; GFX10-W32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
+; GFX10-W32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s2
; GFX10-W32-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-W32-NEXT: ; return to shader part epilog