if (SIInstrInfo::isMFMA(*MI))
return 0;
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+
int WaitStatesNeeded = 0;
- bool IsMemOrExport = SIInstrInfo::isVMEM(*MI) ||
- SIInstrInfo::isFLAT(*MI) ||
- SIInstrInfo::isDS(*MI) ||
- SIInstrInfo::isEXP(*MI);
+ bool IsMem = SIInstrInfo::isVMEM(*MI) ||
+ SIInstrInfo::isFLAT(*MI) ||
+ SIInstrInfo::isDS(*MI);
+ bool IsMemOrExport = IsMem || SIInstrInfo::isEXP(*MI);
bool IsVALU = SIInstrInfo::isVALU(*MI);
const MachineInstr *MFMA = nullptr;
return true;
};
+ bool DGEMMAfterVALUWrite = false;
+ auto IsDGEMMHazard = [&DGEMMAfterVALUWrite, this](const MachineInstr &MI) {
+ // Found DGEMM on reverse traversal to def.
+ if (isDGEMM(MI.getOpcode()))
+ DGEMMAfterVALUWrite = true;
+
+ // Only hazard if register is defined by a VALU and a DGEMM is found
+ // after the def.
+ if (!TII.isVALU(MI) || !DGEMMAfterVALUWrite)
+ return false;
+
+ return true;
+ };
+
int SrcCIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
AMDGPU::OpName::src2);
const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
const int DotWriteSameDotReadSrcAB = 3;
const int DotWriteDifferentVALURead = 3;
+ const int DMFMABetweenVALUWriteVMEMRead = 2;
const int MaxWaitStates = 19;
for (const MachineOperand &Use : MI->explicit_uses()) {
WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
}
+ // Workaround for HW data hazard bug observed only in GFX90A. When there
+ // is a DGEMM instruction in-between a VALU and a VMEM instruction, the SQ
+ // incorrectly fails to insert the two wait states between the two
+ // instructions that are needed to avoid a data hazard.
+ if (IsMem && ST.hasGFX90AInsts() && !ST.hasGFX940Insts()) {
+ DGEMMAfterVALUWrite = false;
+ if (TRI.isVectorRegister(MRI, Reg)) {
+ int WaitStatesNeededForUse =
+ DMFMABetweenVALUWriteVMEMRead -
+ getWaitStatesSinceDef(Reg, IsDGEMMHazard,
+ DMFMABetweenVALUWriteVMEMRead);
+
+ WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+ }
+ }
+
MFMA = nullptr;
WaitStatesSinceDef =
getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
$agpr0_agpr1 = V_MFMA_F64_4X4X4F64_e64 $agpr0_agpr1, $agpr0_agpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
$vgpr4_vgpr5 = V_FMAC_F64_e32 $vgpr4_vgpr5, $vgpr4_vgpr5, $vgpr4_vgpr5, implicit $mode, implicit $exec
...
+# GCN-LABEL: name: dgemm_between_valu_write_buffer_store
+# GCN: V_MOV_B32_e32
+# GCN-NEXT: V_MFMA
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: BUFFER_STORE_DWORD
+name: dgemm_between_valu_write_buffer_store
+body: |
+ bb.0:
+ $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ $agpr0_agpr1 = V_MFMA_F64_4X4X4F64_e64 $agpr0_agpr1, $agpr0_agpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
+ BUFFER_STORE_DWORDX2_OFFEN_exact $vgpr2_vgpr3, $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+...
+# GCN-LABEL: name: dgemm_between_valu_write_buffer_load
+# GCN: V_MOV_B32_e32
+# GCN-NEXT: V_MFMA
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: BUFFER_LOAD_DWORD
+name: dgemm_between_valu_write_buffer_load
+body: |
+ bb.0:
+ $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ $agpr0_agpr1 = V_MFMA_F64_4X4X4F64_e64 $agpr0_agpr1, $agpr0_agpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
+ $vgpr1 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, killed $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, implicit $exec
+...
+# GCN-LABEL: name: dgemm_between_valu_write_global_store
+# GCN: V_MOV_B32_e32
+# GCN-NEXT: V_MFMA
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: GLOBAL_STORE_DWORD
+
+name: dgemm_between_valu_write_global_store
+body: |
+ bb.0:
+ $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ $agpr0_agpr1 = V_MFMA_F64_4X4X4F64_e64 $agpr0_agpr1, $agpr0_agpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
+ GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec
+...
+# GCN-LABEL: name: dgemm_between_valu_write_global_load
+# GCN: V_MOV_B32_e32
+# GCN-NEXT: V_MOV_B32_e32
+# GCN-NEXT: V_MFMA
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: GLOBAL_LOAD_DWORD
+name: dgemm_between_valu_write_global_load
+body: |
+ bb.0:
+ $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+ $agpr0_agpr1 = V_MFMA_F64_4X4X4F64_e64 $agpr0_agpr1, $agpr0_agpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
+ $vgpr2 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
+...
+# GCN-LABEL: name: dgemm_between_valu_write_ds_write
+# GCN: V_MOV_B32_e32
+# GCN-NEXT: V_MFMA
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: DS_WRITE_B32
+name: dgemm_between_valu_write_ds_write
+body: |
+ bb.0:
+ $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ $agpr0_agpr1 = V_MFMA_F64_4X4X4F64_e64 $agpr0_agpr1, $agpr0_agpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
+ DS_WRITE_B32 $vgpr1, $vgpr0, 0, 0, implicit $m0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dgemm_between_valu_write_ds_read
+# GCN: V_MOV_B32_e32
+# GCN-NEXT: V_MFMA
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: DS_READ_B32_gfx9
+name: dgemm_between_valu_write_ds_read
+body: |
+ bb.0:
+ $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ $agpr0_agpr1 = V_MFMA_F64_4X4X4F64_e64 $agpr0_agpr1, $agpr0_agpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
+ $vgpr1 = DS_READ_B32_gfx9 $vgpr0, 0, 0, implicit $exec
+...
+# GCN-LABEL: name: dgemm_between_valu_write_flat_store
+# GCN: V_MOV_B32_e32
+# GCN-NEXT: V_MOV_B32_e32
+# GCN-NEXT: V_MFMA
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: FLAT_STORE_DWORD
+name: dgemm_between_valu_write_flat_store
+body: |
+ bb.0:
+ $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+ $agpr0_agpr1 = V_MFMA_F64_4X4X4F64_e64 $agpr0_agpr1, $agpr0_agpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
+ FLAT_STORE_DWORD $vgpr0_vgpr1, $agpr2, 0, 0, implicit $mode, implicit $exec, implicit $flat_scr
+...
+# GCN-LABEL: name: dgemm_between_valu_write_flat_load
+# GCN: V_MOV_B32_e32
+# GCN-NEXT: V_MOV_B32_e32
+# GCN-NEXT: V_MFMA
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: FLAT_LOAD_DWORD
+name: dgemm_between_valu_write_flat_load
+body: |
+ bb.0:
+ $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+ $agpr0_agpr1 = V_MFMA_F64_4X4X4F64_e64 $agpr0_agpr1, $agpr0_agpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
+ $vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+...
+# GCN-LABEL: name: dgemm_between_valu_write_scratch_store
+# GCN: V_MOV_B32_e32
+# GCN-NEXT: V_MFMA
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: SCRATCH_STORE_DWORD
+name: dgemm_between_valu_write_scratch_store
+body: |
+ bb.0:
+ $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ $agpr0_agpr1 = V_MFMA_F64_4X4X4F64_e64 $agpr0_agpr1, $agpr0_agpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
+ SCRATCH_STORE_DWORD $vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
+...
+# GCN-LABEL: name: dgemm_between_valu_write_scratch_load
+# GCN: V_MOV_B32_e32
+# GCN-NEXT: V_MFMA
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: SCRATCH_LOAD_DWORD
+name: dgemm_between_valu_write_scratch_load
+body: |
+ bb.0:
+ $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ $agpr0_agpr1 = V_MFMA_F64_4X4X4F64_e64 $agpr0_agpr1, $agpr0_agpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
+ $vgpr1 = SCRATCH_LOAD_DWORD undef $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
+...
+# GCN-LABEL: name: dgemm_between_valu_write_buffer_store_fallthrough1
+# GCN: V_MOV_B32_e32
+# GCN-NEXT: V_MFMA
+# GCN: bb.1:
+# GCN-NEXT: S_NOP
+# GCN-NEXT: BUFFER_STORE_DWORD
+name: dgemm_between_valu_write_buffer_store_fallthrough1
+body: |
+ bb.0:
+ $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ $agpr0_agpr1 = V_MFMA_F64_4X4X4F64_e64 $agpr0_agpr1, $agpr0_agpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
+
+ bb.1:
+ BUFFER_STORE_DWORDX2_OFFEN_exact $vgpr2_vgpr3, $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+...
+# GCN-LABEL: name: dgemm_between_valu_write_buffer_store_fallthrough2
+# GCN: V_MOV_B32_e32
+# GCN: bb.1:
+# GCN-NEXT: V_MFMA
+# GCN-NEXT: S_NOP
+# GCN-NEXT: BUFFER_STORE_DWORD
+name: dgemm_between_valu_write_buffer_store_fallthrough2
+body: |
+ bb.0:
+ $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+
+ bb.1:
+ $agpr0_agpr1 = V_MFMA_F64_4X4X4F64_e64 $agpr0_agpr1, $agpr0_agpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
+ BUFFER_STORE_DWORDX2_OFFEN_exact $vgpr2_vgpr3, $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+...
+# GCN-LABEL: name: dgemm_between_valu_write_buffer_store_fallthrough3
+# GCN: V_MOV_B32_e32
+# GCN: bb.1:
+# GCN: bb.2:
+# GCN-NEXT: V_MFMA
+# GCN-NEXT: S_NOP
+# GCN-NEXT: BUFFER_STORE_DWORD
+name: dgemm_between_valu_write_buffer_store_fallthrough3
+body: |
+ bb.0:
+ $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+
+ bb.1:
+
+ bb.2:
+ $agpr0_agpr1 = V_MFMA_F64_4X4X4F64_e64 $agpr0_agpr1, $agpr0_agpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
+ BUFFER_STORE_DWORDX2_OFFEN_exact $vgpr2_vgpr3, $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+...