return SGPRInitBug;
}
+ bool has12DWordStoreHazard() const {
+ return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS;
+ }
+
unsigned getKernArgSegmentSize(unsigned ExplictArgBytes) const;
/// Return the maximum number of waves per SIMD for kernels using \p SGPRs SGPRs
defm BUFFER_ATOMIC_XOR_X2 : MUBUF_Real_Atomic_si <0x5b>;
defm BUFFER_ATOMIC_INC_X2 : MUBUF_Real_Atomic_si <0x5c>;
defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Real_Atomic_si <0x5d>;
+// FIXME: Need to handle hazard for BUFFER_ATOMIC_FCMPSWAP_X2 on CI.
//defm BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_Real_Atomic_si <0x5e">; // isn't on VI
//defm BUFFER_ATOMIC_FMIN_X2 : MUBUF_Real_Atomic_si <0x5f>; // isn't on VI
//defm BUFFER_ATOMIC_FMAX_X2 : MUBUF_Real_Atomic_si <0x60>; // isn't on VI
if (SIInstrInfo::isVMEM(*MI) && checkVMEMHazards(MI) > 0)
return NoopHazard;
+ if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
+ return NoopHazard;
+
if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
return NoopHazard;
if (SIInstrInfo::isSMRD(*MI))
return std::max(0, checkSMRDHazards(MI));
- if (SIInstrInfo::isVMEM(*MI))
- return std::max(0, checkVMEMHazards(MI));
+ if (SIInstrInfo::isVALU(*MI)) {
+ int WaitStates = std::max(0, checkVALUHazards(MI));
- if (SIInstrInfo::isDPP(*MI))
- return std::max(0, checkDPPHazards(MI));
+ if (SIInstrInfo::isVMEM(*MI))
+ WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
- if (isDivFMas(MI->getOpcode()))
- return std::max(0, checkDivFMasHazards(MI));
+ if (SIInstrInfo::isDPP(*MI))
+ WaitStates = std::max(WaitStates, checkDPPHazards(MI));
+
+ if (isDivFMas(MI->getOpcode()))
+ WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));
+
+ return WaitStates;
+ }
if (isSGetReg(MI->getOpcode()))
return std::max(0, checkGetRegHazards(MI));
// Helper Functions
//===----------------------------------------------------------------------===//
-int GCNHazardRecognizer::getWaitStatesSinceDef(
- unsigned Reg, function_ref<bool(MachineInstr *)> IsHazardDef) {
- const SIRegisterInfo *TRI = ST.getRegisterInfo();
+int GCNHazardRecognizer::getWaitStatesSince(
+ function_ref<bool(MachineInstr *)> IsHazard) {
int WaitStates = -1;
for (MachineInstr *MI : EmittedInstrs) {
++WaitStates;
- if (!MI || !IsHazardDef(MI))
+ if (!MI || !IsHazard(MI))
continue;
- if (MI->modifiesRegister(Reg, TRI))
- return WaitStates;
+ return WaitStates;
}
return std::numeric_limits<int>::max();
}
+int GCNHazardRecognizer::getWaitStatesSinceDef(
+ unsigned Reg, function_ref<bool(MachineInstr *)> IsHazardDef) {
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+
+ auto IsHazardFn = [IsHazardDef, TRI, Reg] (MachineInstr *MI) {
+ return IsHazardDef(MI) && MI->modifiesRegister(Reg, TRI);
+ };
+
+ return getWaitStatesSince(IsHazardFn);
+}
+
int GCNHazardRecognizer::getWaitStatesSinceSetReg(
function_ref<bool(MachineInstr *)> IsHazard) {
- int WaitStates = -1;
- for (MachineInstr *MI : EmittedInstrs) {
- ++WaitStates;
- if (!MI || !isSSetReg(MI->getOpcode()) || !IsHazard(MI))
- continue;
- return WaitStates;
- }
- return std::numeric_limits<int>::max();
+ auto IsHazardFn = [IsHazard] (MachineInstr *MI) {
+ return isSSetReg(MI->getOpcode()) && IsHazard(MI);
+ };
+
+ return getWaitStatesSince(IsHazardFn);
}
//===----------------------------------------------------------------------===//
int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn);
return SetRegWaitStates - WaitStatesNeeded;
}
+
+int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
+ if (!MI.mayStore())
+ return -1;
+
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ unsigned Opcode = MI.getOpcode();
+ const MCInstrDesc &Desc = MI.getDesc();
+
+ int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
+ int VDataRCID = -1;
+ if (VDataIdx != -1)
+ VDataRCID = Desc.OpInfo[VDataIdx].RegClass;
+
+ if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
+ // For MUBUF/MTBUF instructions this hazard only exists if the
+ // instruction is not using a register in the soffset field.
+ const MachineOperand *SOffset =
+ TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
+ // If we have no soffset operand, then assume this field has been
+ // hardcoded to zero.
+ if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
+ (!SOffset || !SOffset->isReg()))
+ return VDataIdx;
+ }
+
+ // MIMG instructions create a hazard if they don't use a 256-bit T# and
+ // the store size is greater than 8 bytes and they have more than two bits
+ // of their dmask set.
+ // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
+ if (TII->isMIMG(MI)) {
+ int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
+ assert(SRsrcIdx != -1 &&
+ AMDGPU::getRegBitWidth(Desc.OpInfo[SRsrcIdx].RegClass) == 256);
+ }
+
+ if (TII->isFLAT(MI)) {
+ int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::data);
+ if (AMDGPU::getRegBitWidth(Desc.OpInfo[DataIdx].RegClass) > 64)
+ return DataIdx;
+ }
+
+ return -1;
+}
+
+int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
+ // This checks for the hazard where VMEM instructions that store more than
+ // 8 bytes can have there store data over written by the next instruction.
+ if (!ST.has12DWordStoreHazard())
+ return 0;
+
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ const MachineRegisterInfo &MRI = VALU->getParent()->getParent()->getRegInfo();
+
+ const int VALUWaitStates = 1;
+ int WaitStatesNeeded = 0;
+
+ for (const MachineOperand &Def : VALU->defs()) {
+ if (!TRI->isVGPR(MRI, Def.getReg()))
+ continue;
+ unsigned Reg = Def.getReg();
+ auto IsHazardFn = [this, Reg, TRI] (MachineInstr *MI) {
+ int DataIdx = createsVALUHazard(*MI);
+ return DataIdx >= 0 &&
+ TRI->regsOverlap(MI->getOperand(DataIdx).getReg(), Reg);
+ };
+ int WaitStatesNeededForDef =
+ VALUWaitStates - getWaitStatesSince(IsHazardFn);
+ WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
+ }
+ return WaitStatesNeeded;
+}
const MachineFunction &MF;
const SISubtarget &ST;
+ int getWaitStatesSince(function_ref<bool(MachineInstr *)> IsHazard);
int getWaitStatesSinceDef(unsigned Reg,
function_ref<bool(MachineInstr *)> IsHazardDef =
[](MachineInstr *) { return true; });
int checkDivFMasHazards(MachineInstr *DivFMas);
int checkGetRegHazards(MachineInstr *GetRegInstr);
int checkSetRegHazards(MachineInstr *SetRegInstr);
+ int createsVALUHazard(const MachineInstr &MI);
+ int checkVALUHazards(MachineInstr *VALU);
public:
GCNHazardRecognizer(const MachineFunction &MF);
// We can only issue one instruction per cycle.
// Avoid using MCRegisterClass::getSize, since that function will go away
// (move from MC* level to Target* level). Return size in bits.
-unsigned getRegBitWidth(const MCRegisterClass &RC) {
- switch (RC.getID()) {
+unsigned getRegBitWidth(unsigned RCID) {
+ switch (RCID) {
case AMDGPU::SGPR_32RegClassID:
case AMDGPU::VGPR_32RegClassID:
case AMDGPU::VS_32RegClassID:
}
}
+unsigned getRegBitWidth(const MCRegisterClass &RC) {
+ return getRegBitWidth(RC.getID());
+}
+
unsigned getRegOperandSize(const MCRegisterInfo *MRI, const MCInstrDesc &Desc,
unsigned OpNo) {
unsigned RCID = Desc.OpInfo[OpNo].RegClass;
bool isSISrcInlinableOperand(const MCInstrDesc &Desc, unsigned OpNo);
/// \brief Get the size in bits of a register from the register class \p RC.
+unsigned getRegBitWidth(unsigned RCID);
+
+/// \brief Get the size in bits of a register from the register class \p RC.
unsigned getRegBitWidth(const MCRegisterClass &RC);
/// \brief Get size of register operand
# RUN: llc -march=amdgcn -mcpu=tahiti -run-pass post-RA-hazard-rec %s -o - | FileCheck %s -check-prefixes=GCN
-# RUN: llc -march=amdgcn -mcpu=hawaii -run-pass post-RA-hazard-rec %s -o - | FileCheck %s -check-prefixes=GCN
-# RUN: llc -march=amdgcn -mcpu=fiji -run-pass post-RA-hazard-rec %s -o - | FileCheck %s -check-prefixes=GCN,VI
+# RUN: llc -march=amdgcn -mcpu=hawaii -run-pass post-RA-hazard-rec %s -o - | FileCheck %s -check-prefixes=GCN,CIVI
+# RUN: llc -march=amdgcn -mcpu=fiji -run-pass post-RA-hazard-rec %s -o - | FileCheck %s -check-prefixes=GCN,CIVI,VI
--- |
define void @div_fmas() { ret void }
define void @s_getreg() { ret void }
define void @s_setreg() { ret void }
+ define void @vmem_gt_8dw_store() { ret void }
...
---
# GCN-LABEL: name: div_fmas
S_SETREG_B32 %sgpr1, 0
S_ENDPGM
...
+
+...
+---
+# GCN-LABEL: name: vmem_gt_8dw_store
+
+# GCN-LABEL: bb.0:
+# GCN: BUFFER_STORE_DWORD_OFFSET
+# GCN-NEXT: V_MOV_B32
+# GCN: BUFFER_STORE_DWORDX3_OFFSET
+# CIVI: S_NOP
+# GCN-NEXT: V_MOV_B32
+# GCN: BUFFER_STORE_DWORDX4_OFFSET
+# GCN-NEXT: V_MOV_B32
+# GCN: BUFFER_STORE_DWORDX4_OFFSET
+# CIVI: S_NOP
+# GCN-NEXT: V_MOV_B32
+# GCN: BUFFER_STORE_FORMAT_XYZ_OFFSET
+# CIVI: S_NOP
+# GCN-NEXT: V_MOV_B32
+# GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET
+# CIVI: S_NOP
+# GCN-NEXT: V_MOV_B32
+
+# GCN-LABEL: bb.1:
+# GCN: FLAT_STORE_DWORDX2
+# GCN-NEXT: V_MOV_B32
+# GCN: FLAT_STORE_DWORDX3
+# CIVI: S_NOP
+# GCN-NEXT: V_MOV_B32
+# GCN: FLAT_STORE_DWORDX4
+# CIVI: S_NOP
+# GCN-NEXT: V_MOV_B32
+# GCN: FLAT_ATOMIC_CMPSWAP_X2
+# CIVI: S_NOP
+# GCN-NEXT: V_MOV_B32
+# GCN: FLAT_ATOMIC_FCMPSWAP_X2
+# CIVI: S_NOP
+# GCN: V_MOV_B32
+
+name: vmem_gt_8dw_store
+
+body: |
+ bb.0:
+ successors: %bb.1
+ BUFFER_STORE_DWORD_OFFSET %vgpr3, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec
+ %vgpr3 = V_MOV_B32_e32 0, implicit %exec
+ BUFFER_STORE_DWORDX3_OFFSET %vgpr2_vgpr3_vgpr4, %sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit %exec
+ %vgpr3 = V_MOV_B32_e32 0, implicit %exec
+ BUFFER_STORE_DWORDX4_OFFSET %vgpr2_vgpr3_vgpr4_vgpr5, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec
+ %vgpr3 = V_MOV_B32_e32 0, implicit %exec
+ BUFFER_STORE_DWORDX4_OFFSET %vgpr2_vgpr3_vgpr4_vgpr5, %sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit %exec
+ %vgpr3 = V_MOV_B32_e32 0, implicit %exec
+ BUFFER_STORE_FORMAT_XYZ_OFFSET %vgpr2_vgpr3_vgpr4, %sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit %exec
+ %vgpr3 = V_MOV_B32_e32 0, implicit %exec
+ BUFFER_STORE_FORMAT_XYZW_OFFSET %vgpr2_vgpr3_vgpr4_vgpr5, %sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit %exec
+ %vgpr3 = V_MOV_B32_e32 0, implicit %exec
+ BUFFER_ATOMIC_CMPSWAP_X2_OFFSET %vgpr2_vgpr3_vgpr4_vgpr5, %sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, implicit %exec
+ %vgpr3 = V_MOV_B32_e32 0, implicit %exec
+ S_BRANCH %bb.1
+
+ bb.1:
+ FLAT_STORE_DWORDX2 %vgpr0_vgpr1, %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+ %vgpr3 = V_MOV_B32_e32 0, implicit %exec
+ FLAT_STORE_DWORDX3 %vgpr0_vgpr1, %vgpr2_vgpr3_vgpr4, 0, 0, 0, implicit %exec, implicit %flat_scr
+ %vgpr3 = V_MOV_B32_e32 0, implicit %exec
+ FLAT_STORE_DWORDX4 %vgpr0_vgpr1, %vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr
+ %vgpr3 = V_MOV_B32_e32 0, implicit %exec
+ FLAT_ATOMIC_CMPSWAP_X2 %vgpr0_vgpr1, %vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, implicit %exec, implicit %flat_scr
+ %vgpr3 = V_MOV_B32_e32 0, implicit %exec
+ FLAT_ATOMIC_FCMPSWAP_X2 %vgpr0_vgpr1, %vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, implicit %exec, implicit %flat_scr
+ %vgpr3 = V_MOV_B32_e32 0, implicit %exec
+ S_ENDPGM
+
+...