bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs");
SmallVector<MachineInstr *, 4> SetInactiveInstrs;
SmallVector<MachineInstr *, 4> SoftWQMInstrs;
+ bool HasImplicitDerivatives =
+ MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS;
// We need to visit the basic blocks in reverse post-order so that we visit
// defs before uses, in particular so that we don't accidentally mark an
// If LOD is not supported WQM is not needed.
if (!ST->hasExtendedImageInsts())
continue;
+ // Only generate implicit WQM if implicit derivatives are required
+ // (pixel shaders only). This avoids inserting unintended WQM when a
+ // shader type without them uses an image sampling instruction.
+ if (!HasImplicitDerivatives)
+ continue;
// Sampling instructions don't need to produce results for all pixels
// in a quad, they just require all inputs of a quad to have been
// computed for derivatives.
; GCN-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0
; GCN-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1
; GCN-NEXT: s_mov_b32 s18, -1
+; GCN-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44
; GCN-NEXT: s_mov_b32 s19, 0xe00000
; GCN-NEXT: s_add_u32 s16, s16, s3
; GCN-NEXT: s_addc_u32 s17, s17, 0
-; GCN-NEXT: s_mov_b64 s[12:13], exec
-; GCN-NEXT: s_wqm_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 0x40b00000
-; GCN-NEXT: s_load_dwordx2 s[14:15], s[0:1], 0x24
-; GCN-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44
; GCN-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:4
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: ;;#ASMSTART
-; GCN-NEXT: ;;#ASMEND
-; GCN-NEXT: buffer_load_dword v2, off, s[16:19], 0 offset:4
; GCN-NEXT: s_brev_b32 s0, 1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s14
+; GCN-NEXT: v_mov_b32_e32 v0, s12
; GCN-NEXT: s_mov_b32 s3, 0
; GCN-NEXT: s_mov_b32 s1, s0
; GCN-NEXT: s_mov_b32 s2, s0
-; GCN-NEXT: v_mov_b32_e32 v1, s15
-; GCN-NEXT: s_and_b64 exec, exec, s[12:13]
+; GCN-NEXT: v_mov_b32_e32 v1, s13
+; GCN-NEXT: ;;#ASMSTART
+; GCN-NEXT: ;;#ASMEND
+; GCN-NEXT: buffer_load_dword v2, off, s[16:19], 0 offset:4
+; GCN-NEXT: s_nop 0
; GCN-NEXT: image_sample v0, v[0:1], s[4:11], s[0:3] dmask:0x1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_add_f32_e32 v0, v2, v0
; GCN-SCRATCH-NEXT: s_addc_u32 s3, s3, 0
; GCN-SCRATCH-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GCN-SCRATCH-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
-; GCN-SCRATCH-NEXT: s_mov_b32 s9, exec_lo
-; GCN-SCRATCH-NEXT: s_wqm_b32 exec_lo, exec_lo
; GCN-SCRATCH-NEXT: s_clause 0x1
; GCN-SCRATCH-NEXT: s_load_dwordx2 s[10:11], s[0:1], 0x24
; GCN-SCRATCH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x44
; GCN-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x40b00000
; GCN-SCRATCH-NEXT: s_brev_b32 s8, 1
+; GCN-SCRATCH-NEXT: s_mov_b32 s9, s8
; GCN-SCRATCH-NEXT: scratch_store_dword off, v0, off offset:4
; GCN-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GCN-SCRATCH-NEXT: ;;#ASMSTART
; GCN-SCRATCH-NEXT: ;;#ASMEND
+; GCN-SCRATCH-NEXT: scratch_load_dword v2, off, off offset:4
; GCN-SCRATCH-NEXT: s_waitcnt lgkmcnt(0)
; GCN-SCRATCH-NEXT: v_mov_b32_e32 v0, s10
; GCN-SCRATCH-NEXT: v_mov_b32_e32 v1, s11
-; GCN-SCRATCH-NEXT: s_and_b32 exec_lo, exec_lo, s9
; GCN-SCRATCH-NEXT: s_mov_b32 s11, 0
-; GCN-SCRATCH-NEXT: s_mov_b32 s9, s8
; GCN-SCRATCH-NEXT: s_mov_b32 s10, s8
-; GCN-SCRATCH-NEXT: scratch_load_dword v2, off, off offset:4
; GCN-SCRATCH-NEXT: image_sample v0, v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GCN-SCRATCH-NEXT: v_add_f32_e32 v0, v2, v0
; GFX9-LABEL: non_preserved_vgpr_tuple8:
; GFX9: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX9: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9: v_mov_b32_e32 v36, v16
; GFX9-NEXT: v_mov_b32_e32 v35, v15
; GFX9-NEXT: v_mov_b32_e32 v34, v14
; GFX9-NEXT: v_mov_b32_e32 v33, v13
; GFX9-NEXT: v_mov_b32_e32 v32, v12
+
+; GFX9: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill
+
; GFX9: ;;#ASMSTART
; GFX9-NEXT: ;;#ASMEND
; GFX9: image_gather4_c_b_cl v[41:44], v[32:36], s[4:11], s[4:7] dmask:0x1
+; GFX9-NEXT: s_addk_i32 s32, 0x800
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
;
; GFX10-LABEL: non_preserved_vgpr_tuple8:
; GFX10: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX10: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10: v_mov_b32_e32 v36, v16
; GFX10-NEXT: v_mov_b32_e32 v35, v15
; GFX10-NEXT: v_mov_b32_e32 v33, v13
; GFX10-NEXT: v_mov_b32_e32 v32, v12
+; GFX10: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill
+
; GFX10: ;;#ASMSTART
; GFX10-NEXT: ;;#ASMEND
; GFX10: image_gather4_c_b_cl v[41:44], v[32:36], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
+; GFX10-NEXT: s_addk_i32 s32, 0x400
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_getpc_b64 s[4:5]
; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
; GFX9-NEXT: v_mov_b32_e32 v41, v12
; GFX9: image_gather4_c_b_cl v[0:3], v[41:45], s[36:43], s[4:7] dmask:0x1
+; GFX9-NEXT: s_addk_i32 s32, 0x800
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
; GFX10-NEXT: s_getpc_b64 s[4:5]
; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
; GFX10-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
-; GFX10-NEXT: v_mov_b32_e32 v41, v16
+; GFX10-NEXT: v_writelane_b32 v40, s30, 8
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX10-NEXT: v_mov_b32_e32 v42, v15
-; GFX10-NEXT: v_mov_b32_e32 v43, v14
-; GFX10-NEXT: v_mov_b32_e32 v44, v13
-; GFX10-NEXT: v_mov_b32_e32 v45, v12
+; GFX10-NEXT: v_writelane_b32 v40, s31, 9
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
# RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -run-pass si-wqm -o - %s | FileCheck %s
+--- |
+ define amdgpu_ps void @test_strict_wwm_scc() {
+ ret void
+ }
+ define amdgpu_ps void @test_strict_wwm_scc2() {
+ ret void
+ }
+ define amdgpu_ps void @no_cfg() {
+ ret void
+ }
+ define amdgpu_ps void @copy_exec() {
+ ret void
+ }
+ define amdgpu_ps void @scc_always_live() {
+ ret void
+ }
+ define amdgpu_ps void @test_wwm_set_inactive_propagation() {
+ ret void
+ }
+ define amdgpu_ps void @test_wqm_lr_phi() {
+ ret void
+ }
+ define amdgpu_cs void @no_wqm_in_cs() {
+ ret void
+ }
+ define amdgpu_es void @no_wqm_in_es() {
+ ret void
+ }
+ define amdgpu_gs void @no_wqm_in_gs() {
+ ret void
+ }
+ define amdgpu_hs void @no_wqm_in_hs() {
+ ret void
+ }
+ define amdgpu_ls void @no_wqm_in_ls() {
+ ret void
+ }
+ define amdgpu_vs void @no_wqm_in_vs() {
+ ret void
+ }
+...
+---
+
---
# Check for awareness that s_or_saveexec_b64 clobbers SCC
#
$vgpr1 = COPY %4.sub1:vreg_128
SI_RETURN_TO_EPILOG $vgpr0, $vgpr1
...
+
+---
+#CHECK-LABEL: name: no_wqm_in_cs
+#CHECK-NOT: S_WQM
+name: no_wqm_in_cs
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr1, $vgpr2
+
+ undef %0.sub0:vreg_64 = COPY $vgpr1
+ %0.sub1:vreg_64 = COPY $vgpr2
+ %100:sgpr_256 = IMPLICIT_DEF
+ %101:sgpr_128 = IMPLICIT_DEF
+
+ %4:vreg_128 = IMAGE_SAMPLE_V4_V2 %0:vreg_64, %100:sgpr_256, %101:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
+...
+
+---
+#CHECK-LABEL: name: no_wqm_in_es
+#CHECK-NOT: S_WQM
+name: no_wqm_in_es
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr1, $vgpr2
+
+ undef %0.sub0:vreg_64 = COPY $vgpr1
+ %0.sub1:vreg_64 = COPY $vgpr2
+ %100:sgpr_256 = IMPLICIT_DEF
+ %101:sgpr_128 = IMPLICIT_DEF
+
+ %4:vreg_128 = IMAGE_SAMPLE_V4_V2 %0:vreg_64, %100:sgpr_256, %101:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
+...
+
+---
+#CHECK-LABEL: name: no_wqm_in_gs
+#CHECK-NOT: S_WQM
+name: no_wqm_in_gs
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr1, $vgpr2
+
+ undef %0.sub0:vreg_64 = COPY $vgpr1
+ %0.sub1:vreg_64 = COPY $vgpr2
+ %100:sgpr_256 = IMPLICIT_DEF
+ %101:sgpr_128 = IMPLICIT_DEF
+
+ %4:vreg_128 = IMAGE_SAMPLE_V4_V2 %0:vreg_64, %100:sgpr_256, %101:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
+...
+
+---
+#CHECK-LABEL: name: no_wqm_in_hs
+#CHECK-NOT: S_WQM
+name: no_wqm_in_hs
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr1, $vgpr2
+
+ undef %0.sub0:vreg_64 = COPY $vgpr1
+ %0.sub1:vreg_64 = COPY $vgpr2
+ %100:sgpr_256 = IMPLICIT_DEF
+ %101:sgpr_128 = IMPLICIT_DEF
+
+ %4:vreg_128 = IMAGE_SAMPLE_V4_V2 %0:vreg_64, %100:sgpr_256, %101:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
+...
+
+---
+#CHECK-LABEL: name: no_wqm_in_ls
+#CHECK-NOT: S_WQM
+name: no_wqm_in_ls
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr1, $vgpr2
+
+ undef %0.sub0:vreg_64 = COPY $vgpr1
+ %0.sub1:vreg_64 = COPY $vgpr2
+ %100:sgpr_256 = IMPLICIT_DEF
+ %101:sgpr_128 = IMPLICIT_DEF
+
+ %4:vreg_128 = IMAGE_SAMPLE_V4_V2 %0:vreg_64, %100:sgpr_256, %101:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
+...
+
+---
+#CHECK-LABEL: name: no_wqm_in_vs
+#CHECK-NOT: S_WQM
+name: no_wqm_in_vs
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr1, $vgpr2
+
+ undef %0.sub0:vreg_64 = COPY $vgpr1
+ %0.sub1:vreg_64 = COPY $vgpr2
+ %100:sgpr_256 = IMPLICIT_DEF
+ %101:sgpr_128 = IMPLICIT_DEF
+
+ %4:vreg_128 = IMAGE_SAMPLE_V4_V2 %0:vreg_64, %100:sgpr_256, %101:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
+...