VMEM_BVH
};
+static bool updateVMCntOnly(const MachineInstr &Inst) { // True iff Inst is VMEM, FLAT-global or FLAT-scratch.
+ return SIInstrInfo::isVMEM(Inst) || SIInstrInfo::isFLATGlobal(Inst) ||
+ SIInstrInfo::isFLATScratch(Inst); // NOTE(review): name suggests these touch only vmcnt, unlike generic FLAT -- confirm.
+}
+
VmemType getVmemType(const MachineInstr &Inst) {
- assert(SIInstrInfo::isVMEM(Inst));
+ assert(updateVMCntOnly(Inst));
if (!SIInstrInfo::isMIMG(Inst))
return VMEM_NOSAMPLER;
const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Inst.getOpcode());
if (T == VM_CNT) {
if (Interval.first >= NUM_ALL_VGPRS)
continue;
- if (SIInstrInfo::isVMEM(Inst)) {
+ if (updateVMCntOnly(Inst)) {
VmemType V = getVmemType(Inst);
for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo)
VgprVmemTypes[RegNo] |= 1 << V;
// previous write and this write are the same type of VMEM
// instruction, in which case they're guaranteed to write their
// results in order anyway.
- if (Op.isUse() || !SIInstrInfo::isVMEM(MI) ||
+ if (Op.isUse() || !updateVMCntOnly(MI) ||
ScoreBrackets.hasOtherPendingVmemTypes(RegNo,
getVmemType(MI))) {
ScoreBrackets.determineWait(VM_CNT, RegNo, Wait);
; GFX9V3: ; %bb.0:
; GFX9V3-NEXT: v_mov_b32_e32 v2, 0
; GFX9V3-NEXT: global_load_ubyte v0, v2, s[6:7] glc
-; GFX9V3-NEXT: s_waitcnt vmcnt(0)
; GFX9V3-NEXT: global_load_ubyte v0, v2, s[8:9] offset:8 glc
-; GFX9V3-NEXT: s_waitcnt vmcnt(0)
; GFX9V3-NEXT: global_load_ubyte v0, v2, s[4:5] glc
; GFX9V3-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9V3-NEXT: s_waitcnt vmcnt(0)
; GFX9V4: ; %bb.0:
; GFX9V4-NEXT: v_mov_b32_e32 v2, 0
; GFX9V4-NEXT: global_load_ubyte v0, v2, s[6:7] glc
-; GFX9V4-NEXT: s_waitcnt vmcnt(0)
; GFX9V4-NEXT: global_load_ubyte v0, v2, s[8:9] offset:8 glc
-; GFX9V4-NEXT: s_waitcnt vmcnt(0)
; GFX9V4-NEXT: global_load_ubyte v0, v2, s[4:5] glc
; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9V4-NEXT: s_waitcnt vmcnt(0)
; GFX9V5-NEXT: v_mov_b32_e32 v2, 0
; GFX9V5-NEXT: global_load_ubyte v0, v[0:1], off glc
; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX9V5-NEXT: s_waitcnt vmcnt(0)
; GFX9V5-NEXT: global_load_ubyte v0, v2, s[6:7] offset:8 glc
-; GFX9V5-NEXT: s_waitcnt vmcnt(0)
; GFX9V5-NEXT: global_load_ubyte v0, v2, s[4:5] glc
; GFX9V5-NEXT: s_waitcnt vmcnt(0)
; GFX9V5-NEXT: v_mov_b32_e32 v0, s8
; GFX9-NEXT: .LBB3_2:
; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11
; GFX9-NEXT: .LBB3_3: ; %T
-; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off glc
; GFX9-NEXT: .LBB4_2:
; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11
; GFX9-NEXT: .LBB4_3: ; %T
-; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off glc
; GFX9-NEXT: .LBB5_2:
; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11
; GFX9-NEXT: .LBB5_3: ; %T
-; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off glc
; GFX9V3: ; %bb.0:
; GFX9V3-NEXT: v_mov_b32_e32 v2, 0
; GFX9V3-NEXT: global_load_ubyte v0, v2, s[6:7] glc
-; GFX9V3-NEXT: s_waitcnt vmcnt(0)
; GFX9V3-NEXT: global_load_ubyte v0, v2, s[8:9] offset:8 glc
-; GFX9V3-NEXT: s_waitcnt vmcnt(0)
; GFX9V3-NEXT: global_load_ubyte v0, v2, s[4:5] glc
; GFX9V3-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9V3-NEXT: s_waitcnt vmcnt(0)
; GFX9V4: ; %bb.0:
; GFX9V4-NEXT: v_mov_b32_e32 v2, 0
; GFX9V4-NEXT: global_load_ubyte v0, v2, s[6:7] glc
-; GFX9V4-NEXT: s_waitcnt vmcnt(0)
; GFX9V4-NEXT: global_load_ubyte v0, v2, s[8:9] offset:8 glc
-; GFX9V4-NEXT: s_waitcnt vmcnt(0)
; GFX9V4-NEXT: global_load_ubyte v0, v2, s[4:5] glc
; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9V4-NEXT: s_waitcnt vmcnt(0)
; GFX9V5: ; %bb.0:
; GFX9V5-NEXT: v_mov_b32_e32 v2, 0
; GFX9V5-NEXT: global_load_ubyte v0, v2, s[0:1] glc
-; GFX9V5-NEXT: s_waitcnt vmcnt(0)
; GFX9V5-NEXT: global_load_ubyte v0, v2, s[6:7] offset:8 glc
-; GFX9V5-NEXT: s_waitcnt vmcnt(0)
; GFX9V5-NEXT: global_load_ubyte v0, v2, s[4:5] glc
; GFX9V5-NEXT: ; kill: killed $sgpr0_sgpr1
; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX11-NEXT: scratch_load_b128 v[0:3], off, s32
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: scratch_store_b96 off, v[4:6], s32 offset:16
-; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: scratch_load_b32 v3, off, s32 offset:16
+; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_mov_b32_e32 v0, v2
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v2, v3
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off
; GFX9-NEXT: s_mov_b32 s4, 0x5040100
; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
-; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off
; GFX10-NEXT: ; kill: killed $vgpr0 killed $vgpr1
; GFX10-NEXT: ; kill: killed $vgpr2 killed $vgpr3
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
-; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_load_b64 v[1:2], v[2:3], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
$vgpr4 = IMAGE_LOAD_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
$vgpr4 = IMAGE_SAMPLE_L_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 8, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (load (s128))
...
+# WAW between global, scratch and buffer loads needs no wait inserted in between.
+# (global_load + scratch_load + buffer_load)
+---
+name: global_scratch_buffer
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $vgpr0_vgpr1
+ ; GFX9-LABEL: name: global_scratch_buffer
+ ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $vgpr0_vgpr1
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: S_WAITCNT 0
+ ; GFX9-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ ; GFX9-NEXT: $vgpr2 = SCRATCH_LOAD_DWORD $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
+ ; GFX9-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec
+ $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ $vgpr2 = SCRATCH_LOAD_DWORD $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
+ $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec
+...
+# WAW between flat and buffer loads should have a wait inserted in between.
+# (flat + buffer)
+---
+name: flat_buffer
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $vgpr0_vgpr1
+ ; GFX9-LABEL: name: flat_buffer
+ ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $vgpr0_vgpr1
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: S_WAITCNT 0
+ ; GFX9-NEXT: $vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ ; GFX9-NEXT: S_WAITCNT 49279
+ ; GFX9-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec
+ $vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec
+...
+# WAW between buffer and flat loads (the reverse order) should have a wait inserted in between.
+# (buffer + flat)
+---
+name: buffer_flat
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $vgpr0_vgpr1
+ ; GFX9-LABEL: name: buffer_flat
+ ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $vgpr0_vgpr1
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: S_WAITCNT 0
+ ; GFX9-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec
+ ; GFX9-NEXT: S_WAITCNT 3952
+ ; GFX9-NEXT: $vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec
+ $vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+...