From ed4d8fdafdb5120b69fe2a8da418dc0c37f79ffc Mon Sep 17 00:00:00 2001 From: Carl Ritson Date: Tue, 18 Jan 2022 11:47:48 +0900 Subject: [PATCH] [AMDGPU] Autogenerate wqm.ll Switch wqm.ll to be autogenerated. Replace gfx6 and gfx8 targets with gfx9 (wave64) and gfx10 (wave32). Reviewed By: kmitropoulou Differential Revision: https://reviews.llvm.org/D117455 --- llvm/test/CodeGen/AMDGPU/wqm.ll | 2749 +++++++++++++++++++++++++++++++++------ 1 file changed, 2329 insertions(+), 420 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll index 9b19c33..fcab9ad 100644 --- a/llvm/test/CodeGen/AMDGPU/wqm.ll +++ b/llvm/test/CodeGen/AMDGPU/wqm.ll @@ -1,11 +1,26 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-W64 %s +; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10-W32 %s ; Check that WQM isn't triggered by image load/store intrinsics. -; -;CHECK-LABEL: {{^}}test1: -;CHECK-NOT: s_wqm define amdgpu_ps <4 x float> @test1(<8 x i32> inreg %rsrc, i32 %c) { +; GFX9-W64-LABEL: test1: +; GFX9-W64: ; %bb.0: ; %main_body +; GFX9-W64-NEXT: v_mov_b32_e32 v4, v0 +; GFX9-W64-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf unorm +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf unorm +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: ; return to shader part epilog +; +; GFX10-W32-LABEL: test1: +; GFX10-W32: ; %bb.0: ; %main_body +; GFX10-W32-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-W32-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm +; GFX10-W32-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-W32-NEXT: ; return to shader part epilog main_body: %tex = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %c, <8 x i32> %rsrc, i32 0, i32 0) call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %tex, i32 15, i32 %c, <8 x i32> %rsrc, i32 0, i32 0) @@ -13,18 +28,35 @@ main_body: } ; Check that WQM is triggered by code calculating inputs to image samples and is disabled as soon as possible -; -;CHECK-LABEL: {{^}}test2: -;CHECK-NEXT: ; %main_body -;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec -;CHECK-NEXT: s_wqm_b64 exec, exec -;CHECK: interp -;CHECK: s_and_b64 exec, exec, [[ORIG]] -;CHECK-NOT: interp -;CHECK: image_sample -;CHECK-NOT: exec -;CHECK: .size test2 define amdgpu_ps <4 x float> @test2(i32 inreg, i32 inreg, i32 inreg, i32 inreg %m0, <8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <2 x float> %pos) #6 { +; GFX9-W64-LABEL: test2: +; GFX9-W64: ; %bb.0: ; %main_body +; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec +; GFX9-W64-NEXT: s_wqm_b64 exec, exec +; GFX9-W64-NEXT: s_mov_b32 m0, s3 +; GFX9-W64-NEXT: s_nop 0 +; GFX9-W64-NEXT: v_interp_p1_f32_e32 v2, v0, attr0.x +; GFX9-W64-NEXT: v_interp_p1_f32_e32 v3, v0, attr0.y +; GFX9-W64-NEXT: v_interp_p2_f32_e32 v2, v1, attr0.x +; GFX9-W64-NEXT: v_interp_p2_f32_e32 v3, v1, attr0.y +; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1] +; GFX9-W64-NEXT: image_sample v[0:3], v[2:3], s[4:11], s[12:15] dmask:0xf +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: ; return to shader part epilog +; +; GFX10-W32-LABEL: test2: +; GFX10-W32: ; %bb.0: ; %main_body +; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo +; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-W32-NEXT: s_mov_b32 m0, s3 +; GFX10-W32-NEXT: v_interp_p1_f32_e32 v2, v0, attr0.x +; GFX10-W32-NEXT: v_interp_p1_f32_e32 v3, v0, attr0.y +; GFX10-W32-NEXT: v_interp_p2_f32_e32 v2, v1, attr0.x +; GFX10-W32-NEXT: v_interp_p2_f32_e32 v3, v1, attr0.y +; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0 +; GFX10-W32-NEXT: image_sample v[0:3], v[2:3], s[4:11], s[12:15] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: ; return to shader part epilog main_body: %inst23 = extractelement <2 x float> %pos, i32 0 %inst24 = extractelement <2 x float> %pos, i32 1 @@ -37,17 +69,28 @@ main_body: } ; ... but disabled for stores (and, in this simple case, not re-enabled) ... -; -;CHECK-LABEL: {{^}}test3: -;CHECK-NEXT: ; %main_body -;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec -;CHECK-NEXT: s_wqm_b64 exec, exec -;CHECK: s_and_b64 exec, exec, [[ORIG]] -;CHECK: image_sample -;CHECK: store -;CHECK-NOT: exec -;CHECK: .size test3 define amdgpu_ps <4 x float> @test3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %c) { +; GFX9-W64-LABEL: test3: +; GFX9-W64: ; %bb.0: ; %main_body +; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec +; GFX9-W64-NEXT: s_wqm_b64 exec, exec +; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: buffer_store_dwordx4 v[0:3], v0, s[0:3], 0 idxen +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: ; return to shader part epilog +; +; GFX10-W32-LABEL: test3: +; GFX10-W32: ; %bb.0: ; %main_body +; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo +; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: buffer_store_dwordx4 v[0:3], v0, s[0:3], 0 idxen +; GFX10-W32-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-W32-NEXT: ; return to shader part epilog main_body: %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 %tex.1 = bitcast <4 x float> %tex to <4 x i32> @@ -59,17 +102,37 @@ main_body: } ; ... and disabled for export. -; -;CHECK-LABEL: {{^}}test3x: -;CHECK-NEXT: ; %main_body -;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec -;CHECK-NEXT: s_wqm_b64 exec, exec -;CHECK: s_and_b64 exec, exec, [[ORIG]] -;CHECK: image_sample -;CHECK: exp -;CHECK-NOT: exec -;CHECK: .size test3x define amdgpu_ps void @test3x(i32 inreg, i32 inreg, i32 inreg, i32 inreg %m0, <8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <2 x float> %pos) #6 { +; GFX9-W64-LABEL: test3x: +; GFX9-W64: ; %bb.0: ; %main_body +; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec +; GFX9-W64-NEXT: s_wqm_b64 exec, exec +; GFX9-W64-NEXT: s_mov_b32 m0, s3 +; GFX9-W64-NEXT: s_nop 0 +; GFX9-W64-NEXT: v_interp_p1_f32_e32 v2, v0, attr0.x +; GFX9-W64-NEXT: v_interp_p1_f32_e32 v3, v0, attr0.y +; GFX9-W64-NEXT: v_interp_p2_f32_e32 v2, v1, attr0.x +; GFX9-W64-NEXT: v_interp_p2_f32_e32 v3, v1, attr0.y +; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1] +; GFX9-W64-NEXT: image_sample v[0:3], v[2:3], s[4:11], s[12:15] dmask:0xf +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: exp mrt0 v0, v1, v2, v3 done vm +; GFX9-W64-NEXT: s_endpgm +; +; GFX10-W32-LABEL: test3x: +; GFX10-W32: ; %bb.0: ; %main_body +; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo +; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-W32-NEXT: s_mov_b32 m0, s3 +; GFX10-W32-NEXT: v_interp_p1_f32_e32 v2, v0, attr0.x +; GFX10-W32-NEXT: v_interp_p1_f32_e32 v3, v0, attr0.y +; GFX10-W32-NEXT: v_interp_p2_f32_e32 v2, v1, attr0.x +; GFX10-W32-NEXT: v_interp_p2_f32_e32 v3, v1, attr0.y +; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0 +; GFX10-W32-NEXT: image_sample v[0:3], v[2:3], s[4:11], s[12:15] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: exp mrt0 v0, v1, v2, v3 done vm +; GFX10-W32-NEXT: s_endpgm main_body: %inst23 = extractelement <2 x float> %pos, i32 0 %inst24 = extractelement <2 x float> %pos, i32 1 @@ -87,17 +150,34 @@ main_body: } ; Check that WQM is re-enabled when required. -; -;CHECK-LABEL: {{^}}test4: -;CHECK-NEXT: ; %main_body -;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec -;CHECK-NEXT: s_wqm_b64 exec, exec -;CHECK: v_mul_lo_u32 [[MUL:v[0-9]+]], v0, v1 -;CHECK: image_sample -;CHECK: s_and_b64 exec, exec, [[ORIG]] -;CHECK: image_sample -;CHECK: store define amdgpu_ps <4 x float> @test4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %d, float %data) { +; GFX9-W64-LABEL: test4: +; GFX9-W64: ; %bb.0: ; %main_body +; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec +; GFX9-W64-NEXT: s_wqm_b64 exec, exec +; GFX9-W64-NEXT: v_mul_lo_u32 v4, v0, v1 +; GFX9-W64-NEXT: image_sample v0, v4, s[0:7], s[8:11] dmask:0x1 +; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: ; return to shader part epilog +; +; GFX10-W32-LABEL: test4: +; GFX10-W32: ; %bb.0: ; %main_body +; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo +; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-W32-NEXT: v_mul_lo_u32 v4, v0, v1 +; GFX10-W32-NEXT: image_sample v0, v4, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen +; GFX10-W32-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-W32-NEXT: ; return to shader part epilog main_body: %c.1 = mul i32 %c, %d @@ -110,16 +190,38 @@ main_body: } ; Check that WQM is triggered by the wqm intrinsic. -; -;CHECK-LABEL: {{^}}test5: -;CHECK: s_wqm_b64 exec, exec -;CHECK: buffer_load_dword -;CHECK: buffer_load_dword -;CHECK: v_add_f32_e32 ; WQM was inserting an unecessary v_mov to self after the v_add. Make sure this ; does not happen - the v_add should write the return reg directly. -;CHECK-NOT: v_mov_b32_e32 define amdgpu_ps float @test5(i32 inreg %idx0, i32 inreg %idx1) { +; GFX9-W64-LABEL: test5: +; GFX9-W64: ; %bb.0: ; %main_body +; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec +; GFX9-W64-NEXT: s_wqm_b64 exec, exec +; GFX9-W64-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-W64-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-W64-NEXT: buffer_load_dword v0, v0, s[0:3], 0 idxen +; GFX9-W64-NEXT: s_nop 0 +; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec +; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3] +; GFX9-W64-NEXT: ; return to shader part epilog +; +; GFX10-W32-LABEL: test5: +; GFX10-W32: ; %bb.0: ; %main_body +; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo +; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-W32-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-W32-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-W32-NEXT: s_clause 0x1 +; GFX10-W32-NEXT: buffer_load_dword v0, v0, s[0:3], 0 idxen +; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX10-W32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec +; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s2 +; GFX10-W32-NEXT: ; return to shader part epilog main_body: %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) @@ -129,13 +231,36 @@ main_body: } ; Check that the wqm intrinsic works correctly for integers. -; -;CHECK-LABEL: {{^}}test6: -;CHECK: s_wqm_b64 exec, exec -;CHECK: buffer_load_dword -;CHECK: buffer_load_dword -;CHECK: v_add_f32_e32 define amdgpu_ps float @test6(i32 inreg %idx0, i32 inreg %idx1) { +; GFX9-W64-LABEL: test6: +; GFX9-W64: ; %bb.0: ; %main_body +; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec +; GFX9-W64-NEXT: s_wqm_b64 exec, exec +; GFX9-W64-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-W64-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-W64-NEXT: buffer_load_dword v0, v0, s[0:3], 0 idxen +; GFX9-W64-NEXT: s_nop 0 +; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec +; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3] +; GFX9-W64-NEXT: ; return to shader part epilog +; +; GFX10-W32-LABEL: test6: +; GFX10-W32: ; %bb.0: ; %main_body +; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo +; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-W32-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-W32-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-W32-NEXT: s_clause 0x1 +; GFX10-W32-NEXT: buffer_load_dword v0, v0, s[0:3], 0 idxen +; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX10-W32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec +; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s2 +; GFX10-W32-NEXT: ; return to shader part epilog main_body: %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) @@ -149,13 +274,34 @@ main_body: ; NOTE: llvm.amdgcn.wwm is deprecated, use llvm.amdgcn.strict.wwm instead. ; Check that WWM is triggered by the wwm intrinsic. -; -;CHECK-LABEL: {{^}}test_wwm1: -;CHECK: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1 -;CHECK: buffer_load_dword -;CHECK: buffer_load_dword -;CHECK: v_add_f32_e32 define amdgpu_ps float @test_wwm1(i32 inreg %idx0, i32 inreg %idx1) { +; GFX9-W64-LABEL: test_wwm1: +; GFX9-W64: ; %bb.0: ; %main_body +; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-W64-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen +; GFX9-W64-NEXT: s_nop 0 +; GFX9-W64-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-W64-NEXT: ; return to shader part epilog +; +; GFX10-W32-LABEL: test_wwm1: +; GFX10-W32: ; %bb.0: ; %main_body +; GFX10-W32-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-W32-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-W32-NEXT: s_clause 0x1 +; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen +; GFX10-W32-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1 +; GFX10-W32-NEXT: ; return to shader part epilog main_body: %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) @@ -165,13 +311,34 @@ main_body: } ; Same as above, but with an integer type. -; -;CHECK-LABEL: {{^}}test_wwm2: -;CHECK: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1 -;CHECK: buffer_load_dword -;CHECK: buffer_load_dword -;CHECK: v_add_{{[iu]}}32_e32 define amdgpu_ps float @test_wwm2(i32 inreg %idx0, i32 inreg %idx1) { +; GFX9-W64-LABEL: test_wwm2: +; GFX9-W64: ; %bb.0: ; %main_body +; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-W64-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen +; GFX9-W64-NEXT: s_nop 0 +; GFX9-W64-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: v_add_u32_e32 v1, v1, v2 +; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-W64-NEXT: ; return to shader part epilog +; +; GFX10-W32-LABEL: test_wwm2: +; GFX10-W32: ; %bb.0: ; %main_body +; GFX10-W32-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-W32-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-W32-NEXT: s_clause 0x1 +; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen +; GFX10-W32-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1 +; GFX10-W32-NEXT: ; return to shader part epilog main_body: %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) @@ -188,16 +355,48 @@ main_body: ; in cases like this. ; We enforce this by checking that v_add gets emitted in the same block as ; WWM computations. -; -;CHECK-LABEL: {{^}}test_wwm3: -;CHECK: %if -;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1 -;CHECK: buffer_load_dword -;CHECK: v_add_f32_e32 -;CHECK: s_mov_b64 exec, [[ORIG]] -;CHECK: v_add_f32_e32 -;CHECK: %endif define amdgpu_ps float @test_wwm3(i32 inreg %idx) { +; GFX9-W64-LABEL: test_wwm3: +; GFX9-W64: ; %bb.0: ; %main_body +; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 +; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 +; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 32, v0 +; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-W64-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-W64-NEXT: s_cbranch_execz .LBB9_2 +; GFX9-W64-NEXT: ; %bb.1: ; %if +; GFX9-W64-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: v_add_f32_e32 v2, v1, v1 +; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-W64-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX9-W64-NEXT: .LBB9_2: ; %endif +; GFX9-W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-W64-NEXT: ; return to shader part epilog +; +; GFX10-W32-LABEL: test_wwm3: +; GFX10-W32: ; %bb.0: ; %main_body +; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 +; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 +; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 32, v0 +; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-W32-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX10-W32-NEXT: s_cbranch_execz .LBB9_2 +; GFX10-W32-NEXT: ; %bb.1: ; %if +; GFX10-W32-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: v_add_f32_e32 v2, v1, v1 +; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2 +; GFX10-W32-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX10-W32-NEXT: .LBB9_2: ; %endif +; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-W32-NEXT: ; return to shader part epilog main_body: ; use mbcnt to make sure the branch is divergent %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) @@ -221,16 +420,46 @@ endif: ; write could clobber disabled channels in the non-WWM one. ; We enforce this by checking that v_mov gets emitted in the same block as ; WWM computations. -; -;CHECK-LABEL: {{^}}test_wwm4: -;CHECK: %if -;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1 -;CHECK: buffer_load_dword -;CHECK: v_add_f32_e32 -;CHECK: s_mov_b64 exec, [[ORIG]] -;CHECK-NEXT: v_mov_b32_e32 -;CHECK: %endif define amdgpu_ps float @test_wwm4(i32 inreg %idx) { +; GFX9-W64-LABEL: test_wwm4: +; GFX9-W64: ; %bb.0: ; %main_body +; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 +; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 +; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 32, v0 +; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-W64-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-W64-NEXT: s_cbranch_execz .LBB10_2 +; GFX9-W64-NEXT: ; %bb.1: ; %if +; GFX9-W64-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v1 +; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-W64-NEXT: .LBB10_2: ; %endif +; GFX9-W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-W64-NEXT: ; return to shader part epilog +; +; GFX10-W32-LABEL: test_wwm4: +; GFX10-W32: ; %bb.0: ; %main_body +; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 +; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 +; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 32, v0 +; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-W32-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX10-W32-NEXT: s_cbranch_execz .LBB10_2 +; GFX10-W32-NEXT: ; %bb.1: ; %if +; GFX10-W32-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v1 +; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1 +; GFX10-W32-NEXT: .LBB10_2: ; %endif +; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-W32-NEXT: ; return to shader part epilog main_body: ; use mbcnt to make sure the branch is divergent %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) @@ -250,16 +479,49 @@ endif: } ; Make sure the transition from Exact to WWM then WQM works properly. -; -;CHECK-LABEL: {{^}}test_wwm5: -;CHECK: buffer_load_dword -;CHECK: buffer_store_dword -;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1 -;CHECK: buffer_load_dword -;CHECK: v_add_f32_e32 -;CHECK: s_mov_b64 exec, [[ORIG]] -;CHECK: s_wqm_b64 exec, exec define amdgpu_ps float @test_wwm5(i32 inreg %idx0, i32 inreg %idx1) { +; GFX9-W64-LABEL: test_wwm5: +; GFX9-W64: ; %bb.0: ; %main_body +; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec +; GFX9-W64-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-W64-NEXT: buffer_load_dword v2, v0, s[0:3], 0 idxen +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen +; GFX9-W64-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-W64-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v1 +; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-W64-NEXT: s_wqm_b64 exec, exec +; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec +; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3] +; GFX9-W64-NEXT: ; return to shader part epilog +; +; GFX10-W32-LABEL: test_wwm5: +; GFX10-W32: ; %bb.0: ; %main_body +; GFX10-W32-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo +; GFX10-W32-NEXT: buffer_load_dword v2, v0, s[0:3], 0 idxen +; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-W32-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen +; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v1 +; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1 +; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX10-W32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec +; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s2 +; GFX10-W32-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-W32-NEXT: ; return to shader part epilog main_body: %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) call void @llvm.amdgcn.struct.buffer.store.f32(float %src0, <4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) @@ -273,20 +535,56 @@ main_body: ; Check that WWM is turned on correctly across basic block boundaries. ; if..then..endif version -; -;CHECK-LABEL: {{^}}test_wwm6_then: -;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1 ;SI-CHECK: buffer_load_dword ;VI-CHECK: flat_load_dword -;CHECK: s_mov_b64 exec, [[ORIG]] -;CHECK: %if -;CHECK: s_or_saveexec_b64 [[ORIG2:s\[[0-9]+:[0-9]+\]]], -1 ;SI-CHECK: buffer_load_dword ;VI-CHECK: flat_load_dword -;CHECK: v_add_f32_e32 -;CHECK: s_mov_b64 exec, [[ORIG2]] -;CHECK: %endif define amdgpu_ps float @test_wwm6_then() { +; GFX9-W64-LABEL: test_wwm6_then: +; GFX9-W64: ; %bb.0: ; %main_body +; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-W64-NEXT: global_load_dword v1, v[3:4], off glc +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 +; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 +; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 32, v0 +; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-W64-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-W64-NEXT: s_cbranch_execz .LBB12_2 +; GFX9-W64-NEXT: ; %bb.1: ; %if +; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9-W64-NEXT: global_load_dword v2, v[3:4], off glc +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-W64-NEXT: .LBB12_2: ; %endif +; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-W64-NEXT: ; return to shader part epilog +; +; GFX10-W32-LABEL: test_wwm6_then: +; GFX10-W32: ; %bb.0: ; %main_body +; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-W32-NEXT: global_load_dword v1, v[3:4], off glc dlc +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 +; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 +; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 32, v0 +; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-W32-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX10-W32-NEXT: s_cbranch_execz .LBB12_2 +; GFX10-W32-NEXT: ; %bb.1: ; %if +; GFX10-W32-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-W32-NEXT: global_load_dword v2, v[3:4], off glc dlc +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1 +; GFX10-W32-NEXT: .LBB12_2: ; %endif +; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX10-W32-NEXT: ; return to shader part epilog main_body: %src0 = load volatile float, float addrspace(1)* undef ; use mbcnt to make sure the branch is divergent @@ -308,19 +606,64 @@ endif: ; Check that WWM is turned on correctly across basic block boundaries. ; loop version -; -;CHECK-LABEL: {{^}}test_wwm6_loop: -;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1 ;SI-CHECK: buffer_load_dword ;VI-CHECK: flat_load_dword -;CHECK: s_mov_b64 exec, [[ORIG]] -;CHECK: %loop -;CHECK: s_or_saveexec_b64 [[ORIG2:s\[[0-9]+:[0-9]+\]]], -1 ;SI-CHECK: buffer_load_dword ;VI-CHECK: flat_load_dword -;CHECK: s_mov_b64 exec, [[ORIG2]] -;CHECK: %endloop define amdgpu_ps float @test_wwm6_loop() { +; GFX9-W64-LABEL: test_wwm6_loop: +; GFX9-W64: ; %bb.0: ; %main_body +; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-W64-NEXT: global_load_dword v1, v[3:4], off glc +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 +; GFX9-W64-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-W64-NEXT: .LBB13_1: ; %loop +; GFX9-W64-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9-W64-NEXT: global_load_dword v2, v[3:4], off glc +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-W64-NEXT: v_add_u32_e32 v3, -1, v3 +; GFX9-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9-W64-NEXT: v_add_f32_e32 v2, v1, v2 +; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-W64-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-W64-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-W64-NEXT: s_cbranch_execnz .LBB13_1 +; GFX9-W64-NEXT: ; %bb.2: ; %endloop +; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-W64-NEXT: ; return to shader part epilog +; +; GFX10-W32-LABEL: test_wwm6_loop: +; GFX10-W32: ; %bb.0: ; %main_body +; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-W32-NEXT: global_load_dword v1, v[3:4], off glc dlc +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 +; GFX10-W32-NEXT: s_mov_b32 s0, 0 +; GFX10-W32-NEXT: .LBB13_1: ; %loop +; GFX10-W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-W32-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-W32-NEXT: global_load_dword v2, v[3:4], off glc dlc +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-W32-NEXT: v_add_nc_u32_e32 v3, -1, v3 +; GFX10-W32-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-W32-NEXT: v_add_f32_e32 v2, v1, v2 +; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 +; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2 +; GFX10-W32-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX10-W32-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX10-W32-NEXT: s_cbranch_execnz .LBB13_1 +; GFX10-W32-NEXT: ; %bb.2: ; %endloop +; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX10-W32-NEXT: ; return to shader part epilog main_body: %src0 = load volatile float, float addrspace(1)* undef ; use mbcnt to make sure the branch is divergent @@ -342,15 +685,36 @@ endloop: } ; Check that @llvm.amdgcn.set.inactive disables WWM. -; -;CHECK-LABEL: {{^}}test_wwm_set_inactive1: -;CHECK: buffer_load_dword -;CHECK: s_not_b64 exec, exec -;CHECK: v_mov_b32_e32 -;CHECK: s_not_b64 exec, exec -;CHECK: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1 -;CHECK: v_add_{{[iu]}}32_e32 define amdgpu_ps void @test_wwm_set_inactive1(i32 inreg %idx) { +; GFX9-W64-LABEL: test_wwm_set_inactive1: +; GFX9-W64: ; %bb.0: ; %main_body +; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-W64-NEXT: buffer_load_dword v0, v1, s[0:3], 0 idxen +; GFX9-W64-NEXT: s_not_b64 exec, exec +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-W64-NEXT: s_not_b64 exec, exec +; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-W64-NEXT: v_add_u32_e32 v0, v0, v0 +; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-W64-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-W64-NEXT: buffer_store_dword v2, v1, s[0:3], 0 idxen +; GFX9-W64-NEXT: s_endpgm +; +; GFX10-W32-LABEL: test_wwm_set_inactive1: +; GFX10-W32: ; %bb.0: ; %main_body +; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-W32-NEXT: buffer_load_dword v0, v1, s[0:3], 0 idxen +; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo +; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-W32-NEXT: v_add_nc_u32_e32 v0, v0, v0 +; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-W32-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-W32-NEXT: buffer_store_dword v2, v1, s[0:3], 0 idxen +; GFX10-W32-NEXT: s_endpgm main_body: %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0) %src.0 = bitcast float %src to i32 @@ -363,14 +727,36 @@ main_body: } ; Check that Strict WQM is triggered by the strict_wqm intrinsic. -; -;CHECK-LABEL: {{^}}test_strict_wqm1: -;CHECK: s_mov_b64 s{{\[[0-9]+:[0-9]+\]}}, exec -;CHECK: s_wqm_b64 exec, exec -;CHECK: buffer_load_dword -;CHECK: buffer_load_dword -;CHECK: v_add_f32_e32 define amdgpu_ps float @test_strict_wqm1(i32 inreg %idx0, i32 inreg %idx1) { +; GFX9-W64-LABEL: test_strict_wqm1: +; GFX9-W64: ; %bb.0: ; %main_body +; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec +; GFX9-W64-NEXT: s_wqm_b64 exec, exec +; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-W64-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen +; GFX9-W64-NEXT: s_nop 0 +; GFX9-W64-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-W64-NEXT: ; return to shader part epilog +; +; GFX10-W32-LABEL: test_strict_wqm1: +; GFX10-W32: ; %bb.0: ; %main_body +; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo +; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-W32-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-W32-NEXT: s_clause 0x1 +; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen +; GFX10-W32-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1 +; GFX10-W32-NEXT: ; return to shader part epilog main_body: %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) @@ -380,14 +766,36 @@ main_body: } ; Same as above, but with an integer type. -; -;CHECK-LABEL: {{^}}test_strict_wqm2: -;CHECK: s_mov_b64 s{{\[[0-9]+:[0-9]+\]}}, exec -;CHECK: s_wqm_b64 exec, exec -;CHECK: buffer_load_dword -;CHECK: buffer_load_dword -;CHECK: v_add_{{[iu]}}32_e32 define amdgpu_ps float @test_strict_wqm2(i32 inreg %idx0, i32 inreg %idx1) { +; GFX9-W64-LABEL: test_strict_wqm2: +; GFX9-W64: ; %bb.0: ; %main_body +; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec +; GFX9-W64-NEXT: s_wqm_b64 exec, exec +; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-W64-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen +; GFX9-W64-NEXT: s_nop 0 +; GFX9-W64-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: v_add_u32_e32 v1, v1, v2 +; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-W64-NEXT: ; return to shader part epilog +; +; GFX10-W32-LABEL: test_strict_wqm2: +; GFX10-W32: ; %bb.0: ; %main_body +; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo +; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-W32-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-W32-NEXT: s_clause 0x1 +; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen +; GFX10-W32-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1 +; GFX10-W32-NEXT: ; return to shader part epilog main_body: %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) @@ -404,17 +812,50 @@ main_body: ; in cases like this. ; We enforce this by checking that v_add gets emitted in the same block as ; WWM computations. -; -;CHECK-LABEL: {{^}}test_strict_wqm3: -;CHECK: %if -;CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec -;CHECK: s_wqm_b64 exec, exec -;CHECK: buffer_load_dword -;CHECK: v_add_f32_e32 -;CHECK: s_mov_b64 exec, [[ORIG]] -;CHECK: v_add_f32_e32 -;CHECK: %endif define amdgpu_ps float @test_strict_wqm3(i32 inreg %idx) { +; GFX9-W64-LABEL: test_strict_wqm3: +; GFX9-W64: ; %bb.0: ; %main_body +; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 +; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 +; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 32, v0 +; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-W64-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-W64-NEXT: s_cbranch_execz .LBB17_2 +; GFX9-W64-NEXT: ; %bb.1: ; %if +; GFX9-W64-NEXT: s_mov_b64 s[4:5], exec +; GFX9-W64-NEXT: s_wqm_b64 exec, exec +; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: v_add_f32_e32 v2, v1, v1 +; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-W64-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX9-W64-NEXT: .LBB17_2: ; %endif +; GFX9-W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-W64-NEXT: ; return to shader part epilog +; +; GFX10-W32-LABEL: test_strict_wqm3: +; GFX10-W32: ; %bb.0: ; %main_body +; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 +; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 +; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 32, v0 +; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-W32-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX10-W32-NEXT: s_cbranch_execz .LBB17_2 +; GFX10-W32-NEXT: ; %bb.1: ; %if +; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo +; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: v_add_f32_e32 v2, v1, v1 +; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2 +; GFX10-W32-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX10-W32-NEXT: .LBB17_2: ; %endif +; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-W32-NEXT: ; return to shader part epilog main_body: ; use mbcnt to make sure the branch is divergent %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) @@ -438,17 +879,48 @@ endif: ; the Strict WQM write could clobber disabled channels in the non-strict one. ; We enforce this by checking that v_mov gets emitted in the same block as ; WWM computations. -; -;CHECK-LABEL: {{^}}test_strict_wqm4: -;CHECK: %if -;CHECK: s_mov_b64 s{{\[[0-9]+:[0-9]+\]}}, exec -;CHECK: s_wqm_b64 exec, exec -;CHECK: buffer_load_dword -;CHECK: v_add_f32_e32 -;CHECK: s_mov_b64 exec, [[ORIG]] -;CHECK-NEXT: v_mov_b32_e32 -;CHECK: %endif define amdgpu_ps float @test_strict_wqm4(i32 inreg %idx) { +; GFX9-W64-LABEL: test_strict_wqm4: +; GFX9-W64: ; %bb.0: ; %main_body +; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 +; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 +; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 32, v0 +; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-W64-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-W64-NEXT: s_cbranch_execz .LBB18_2 +; GFX9-W64-NEXT: ; %bb.1: ; %if +; GFX9-W64-NEXT: s_mov_b64 s[4:5], exec +; GFX9-W64-NEXT: s_wqm_b64 exec, exec +; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v1 +; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-W64-NEXT: .LBB18_2: ; %endif +; GFX9-W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-W64-NEXT: ; return to shader part epilog +; +; GFX10-W32-LABEL: test_strict_wqm4: +; GFX10-W32: ; %bb.0: ; %main_body +; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 +; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 +; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 32, v0 +; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-W32-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX10-W32-NEXT: s_cbranch_execz .LBB18_2 +; GFX10-W32-NEXT: ; %bb.1: ; %if +; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo +; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v1 +; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1 +; GFX10-W32-NEXT: .LBB18_2: ; %endif +; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-W32-NEXT: ; return to shader part epilog main_body: ; use mbcnt to make sure the branch is divergent %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) @@ -468,17 +940,52 @@ endif: } ; Make sure the transition from Exact to Strict WQM then WQM works properly. -; -;CHECK-LABEL: {{^}}test_strict_wqm5: -;CHECK: buffer_load_dword -;CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec -;CHECK: buffer_store_dword -;CHECK: s_wqm_b64 exec, exec -;CHECK: buffer_load_dword -;CHECK: v_add_f32_e32 -;CHECK: s_mov_b64 exec, [[ORIG]] -;CHECK: s_wqm_b64 exec, exec define amdgpu_ps float @test_strict_wqm5(i32 inreg %idx0, i32 inreg %idx1) { +; GFX9-W64-LABEL: test_strict_wqm5: +; GFX9-W64: ; %bb.0: ; %main_body +; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec +; GFX9-W64-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-W64-NEXT: buffer_load_dword v2, v0, s[0:3], 0 idxen +; GFX9-W64-NEXT: s_mov_b64 s[4:5], exec +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen +; GFX9-W64-NEXT: s_wqm_b64 exec, exec +; GFX9-W64-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v1 +; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-W64-NEXT: s_wqm_b64 exec, exec +; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec +; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3] +; GFX9-W64-NEXT: ; return to shader part epilog +; +; GFX10-W32-LABEL: test_strict_wqm5: +; GFX10-W32: ; %bb.0: ; %main_body +; GFX10-W32-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo +; GFX10-W32-NEXT: buffer_load_dword v2, v0, s[0:3], 0 idxen +; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo +; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-W32-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen +; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo +; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v1 +; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1 +; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX10-W32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec +; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s2 +; GFX10-W32-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-W32-NEXT: ; return to shader part epilog main_body: %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) call void @llvm.amdgcn.struct.buffer.store.f32(float %src0, <4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) @@ -492,22 +999,60 @@ main_body: ; Check that Strict WQM is turned on correctly across basic block boundaries. ; if..then..endif version -; -;CHECK-LABEL: {{^}}test_strict_wqm6_then: -;CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec -;CHECK: s_wqm_b64 exec, exec ;SI-CHECK: buffer_load_dword ;VI-CHECK: flat_load_dword -;CHECK: s_mov_b64 exec, [[ORIG]] -;CHECK: %if -;CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec -;CHECK: s_wqm_b64 exec, exec ;SI-CHECK: buffer_load_dword ;VI-CHECK: flat_load_dword -;CHECK: v_add_f32_e32 -;CHECK: s_mov_b64 exec, [[ORIG2]] -;CHECK: %endif define amdgpu_ps float @test_strict_wqm6_then() { +; GFX9-W64-LABEL: test_strict_wqm6_then: +; GFX9-W64: ; %bb.0: ; %main_body +; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec +; GFX9-W64-NEXT: s_wqm_b64 exec, exec +; GFX9-W64-NEXT: global_load_dword v1, v[3:4], off glc +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 +; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 +; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 32, v0 +; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-W64-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-W64-NEXT: s_cbranch_execz .LBB20_2 +; GFX9-W64-NEXT: ; %bb.1: ; %if +; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec +; GFX9-W64-NEXT: s_wqm_b64 exec, exec +; GFX9-W64-NEXT: global_load_dword v2, v[3:4], off glc +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-W64-NEXT: .LBB20_2: ; %endif +; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-W64-NEXT: ; return to shader part epilog +; +; GFX10-W32-LABEL: test_strict_wqm6_then: +; GFX10-W32: ; %bb.0: ; %main_body +; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo +; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-W32-NEXT: global_load_dword v1, v[3:4], off glc dlc +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 +; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 +; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 32, v0 +; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-W32-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX10-W32-NEXT: s_cbranch_execz .LBB20_2 +; GFX10-W32-NEXT: ; %bb.1: ; %if +; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo +; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-W32-NEXT: global_load_dword v2, v[3:4], off glc dlc +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1 +; GFX10-W32-NEXT: .LBB20_2: ; %endif +; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX10-W32-NEXT: ; return to shader part epilog main_body: %src0 = load volatile float, float addrspace(1)* undef ; use mbcnt to make sure the branch is divergent @@ -529,21 +1074,70 @@ endif: ; Check that Strict WQM is turned on correctly across basic block boundaries. ; loop version -; -;CHECK-LABEL: {{^}}test_strict_wqm6_loop: -;CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec -;CHECK: s_wqm_b64 exec, exec ;SI-CHECK: buffer_load_dword ;VI-CHECK: flat_load_dword -;CHECK: s_mov_b64 exec, [[ORIG]] -;CHECK: %loop -;CHECK: s_mov_b64 [[ORIG2:s\[[0-9]+:[0-9]+\]]], exec -;CHECK: s_wqm_b64 exec, exec ;SI-CHECK: buffer_load_dword ;VI-CHECK: flat_load_dword -;CHECK: s_mov_b64 exec, [[ORIG2]] -;CHECK: %endloop define amdgpu_ps float @test_strict_wqm6_loop() { +; GFX9-W64-LABEL: test_strict_wqm6_loop: +; GFX9-W64: ; %bb.0: ; %main_body +; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec +; GFX9-W64-NEXT: s_wqm_b64 exec, exec +; GFX9-W64-NEXT: global_load_dword v1, v[3:4], off glc +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 +; GFX9-W64-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-W64-NEXT: .LBB21_1: ; %loop +; GFX9-W64-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec +; GFX9-W64-NEXT: s_wqm_b64 exec, exec +; GFX9-W64-NEXT: global_load_dword v2, v[3:4], off glc +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-W64-NEXT: v_add_u32_e32 v3, -1, v3 +; GFX9-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec +; GFX9-W64-NEXT: s_wqm_b64 exec, exec +; GFX9-W64-NEXT: v_add_f32_e32 v2, v1, v2 +; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-W64-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-W64-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-W64-NEXT: s_cbranch_execnz .LBB21_1 +; GFX9-W64-NEXT: ; %bb.2: ; %endloop +; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-W64-NEXT: ; return to shader part epilog +; +; GFX10-W32-LABEL: test_strict_wqm6_loop: +; GFX10-W32: ; %bb.0: ; %main_body +; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo +; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-W32-NEXT: global_load_dword v1, v[3:4], off glc dlc +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 +; GFX10-W32-NEXT: s_mov_b32 s0, 0 +; GFX10-W32-NEXT: .LBB21_1: ; %loop +; GFX10-W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo +; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-W32-NEXT: global_load_dword v2, v[3:4], off glc dlc +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-W32-NEXT: v_add_nc_u32_e32 v3, -1, v3 +; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo +; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-W32-NEXT: v_add_f32_e32 v2, v1, v2 +; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 +; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2 +; GFX10-W32-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX10-W32-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX10-W32-NEXT: s_cbranch_execnz .LBB21_1 +; GFX10-W32-NEXT: ; %bb.2: ; %endloop +; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX10-W32-NEXT: ; return to shader part epilog main_body: %src0 = load volatile float, float addrspace(1)* undef ; use mbcnt to make sure the branch is divergent @@ -565,12 +1159,40 @@ endloop: } ; Check that enabling WQM anywhere enables WQM for the set.inactive source. -; -;CHECK-LABEL: {{^}}test_set_inactive2: -;CHECK: s_wqm_b64 exec, exec -;CHECK: buffer_load_dword -;CHECK: buffer_load_dword define amdgpu_ps void @test_set_inactive2(i32 inreg %idx0, i32 inreg %idx1) { +; GFX9-W64-LABEL: test_set_inactive2: +; GFX9-W64: ; %bb.0: ; %main_body +; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec +; GFX9-W64-NEXT: s_wqm_b64 exec, exec +; GFX9-W64-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-W64-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-W64-NEXT: buffer_load_dword v1, v0, s[0:3], 0 idxen +; GFX9-W64-NEXT: s_nop 0 +; GFX9-W64-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen +; GFX9-W64-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $scc killed $exec +; GFX9-W64-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec killed $exec +; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3] +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: v_add_u32_e32 v1, v2, v1 +; GFX9-W64-NEXT: buffer_store_dword v1, v0, s[0:3], 0 idxen +; GFX9-W64-NEXT: s_endpgm +; +; GFX10-W32-LABEL: test_set_inactive2: +; GFX10-W32: ; %bb.0: ; %main_body +; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo +; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-W32-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-W32-NEXT: s_clause 0x1 +; GFX10-W32-NEXT: buffer_load_dword v2, v0, s[0:3], 0 idxen +; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen +; GFX10-W32-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $scc killed $exec +; GFX10-W32-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $exec killed $exec +; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s2 +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX10-W32-NEXT: buffer_store_dword v1, v0, s[0:3], 0 idxen +; GFX10-W32-NEXT: s_endpgm main_body: %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) %src1.0 = bitcast float %src1 to i32 @@ -586,23 +1208,66 @@ main_body: ; Check a case of one branch of an if-else requiring WQM, the other requiring ; exact. -; ; Note: In this particular case, the save-and-restore could be avoided if the ; analysis understood that the two branches of the if-else are mutually ; exclusive. -; -;CHECK-LABEL: {{^}}test_control_flow_0: -;CHECK-NEXT: ; %main_body -;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec -;CHECK-NEXT: s_wqm_b64 exec, exec -;CHECK: %ELSE -;CHECK: s_and_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], [[ORIG]] -;CHECK: store -;CHECK: s_mov_b64 exec, [[SAVED]] -;CHECK: %IF -;CHECK: image_sample -;CHECK: image_sample define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) { +; GFX9-W64-LABEL: test_control_flow_0: +; GFX9-W64: ; %bb.0: ; %main_body +; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec +; GFX9-W64-NEXT: s_wqm_b64 exec, exec +; GFX9-W64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc +; GFX9-W64-NEXT: s_xor_b64 s[14:15], exec, s[14:15] +; GFX9-W64-NEXT: s_cbranch_execz .LBB23_2 +; GFX9-W64-NEXT: ; %bb.1: ; %ELSE +; GFX9-W64-NEXT: s_and_saveexec_b64 s[16:17], s[12:13] +; GFX9-W64-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen +; GFX9-W64-NEXT: ; implicit-def: $vgpr0 +; GFX9-W64-NEXT: s_mov_b64 exec, s[16:17] +; GFX9-W64-NEXT: .LBB23_2: ; %Flow +; GFX9-W64-NEXT: s_or_saveexec_b64 s[14:15], s[14:15] +; GFX9-W64-NEXT: s_xor_b64 exec, exec, s[14:15] +; GFX9-W64-NEXT: s_cbranch_execz .LBB23_4 +; GFX9-W64-NEXT: ; %bb.3: ; %IF +; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: image_sample v2, v0, s[0:7], s[8:11] dmask:0x1 +; GFX9-W64-NEXT: .LBB23_4: ; %END +; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15] +; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-W64-NEXT: ; return to shader part epilog +; +; GFX10-W32-LABEL: test_control_flow_0: +; GFX10-W32: ; %bb.0: ; %main_body +; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo +; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-W32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 +; GFX10-W32-NEXT: s_and_saveexec_b32 s13, vcc_lo +; GFX10-W32-NEXT: s_xor_b32 s13, exec_lo, s13 +; GFX10-W32-NEXT: s_cbranch_execz .LBB23_2 +; GFX10-W32-NEXT: ; %bb.1: ; %ELSE +; GFX10-W32-NEXT: s_and_saveexec_b32 s14, s12 +; GFX10-W32-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen +; GFX10-W32-NEXT: ; implicit-def: $vgpr0 +; GFX10-W32-NEXT: s_mov_b32 exec_lo, s14 +; GFX10-W32-NEXT: .LBB23_2: ; %Flow +; GFX10-W32-NEXT: s_or_saveexec_b32 s13, s13 +; GFX10-W32-NEXT: s_xor_b32 exec_lo, exec_lo, s13 +; GFX10-W32-NEXT: s_cbranch_execz .LBB23_4 +; GFX10-W32-NEXT: ; %bb.3: ; %IF +; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: image_sample v2, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX10-W32-NEXT: .LBB23_4: ; %END +; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13 +; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2 +; GFX10-W32-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-W32-NEXT: ; return to shader part epilog main_body: %cmp = icmp eq i32 %z, 0 br i1 %cmp, label %IF, label %ELSE @@ -625,27 +1290,63 @@ END: } ; Reverse branch order compared to the previous test. -; -;CHECK-LABEL: {{^}}test_control_flow_1: -;CHECK-NEXT: ; %main_body -;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec -;CHECK-NEXT: s_wqm_b64 exec, exec -;CHECK: %IF -;CHECK: image_sample -;CHECK: image_sample -;CHECK: %Flow -;CHECK-NEXT: s_or_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], -;CHECK-NEXT: s_and_b64 exec, exec, [[ORIG]] -;CHECK-NEXT: s_and_b64 [[SAVED]], exec, [[SAVED]] -;CHECK-NEXT: s_xor_b64 exec, exec, [[SAVED]] -;CHECK-NEXT: s_cbranch_execz [[END_BB:.LBB[0-9]+_[0-9]+]] -;CHECK-NEXT: ; %bb.{{[0-9]+}}: ; %ELSE -;CHECK: store_dword -;CHECK: [[END_BB]]: ; %END -;CHECK: s_or_b64 exec, exec, -;CHECK: v_mov_b32_e32 v0 -;CHECK: ; return define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) { +; GFX9-W64-LABEL: test_control_flow_1: +; GFX9-W64: ; %bb.0: ; %main_body +; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec +; GFX9-W64-NEXT: s_wqm_b64 exec, exec +; GFX9-W64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc +; GFX9-W64-NEXT: s_xor_b64 s[14:15], exec, s[14:15] +; GFX9-W64-NEXT: s_cbranch_execz .LBB24_2 +; GFX9-W64-NEXT: ; %bb.1: ; %IF +; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: image_sample v2, v0, s[0:7], s[8:11] dmask:0x1 +; GFX9-W64-NEXT: ; implicit-def: $vgpr0 +; GFX9-W64-NEXT: .LBB24_2: ; %Flow +; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], s[14:15] +; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX9-W64-NEXT: s_and_b64 s[0:1], exec, s[0:1] +; GFX9-W64-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX9-W64-NEXT: s_cbranch_execz .LBB24_4 +; GFX9-W64-NEXT: ; %bb.3: ; %ELSE +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen +; GFX9-W64-NEXT: .LBB24_4: ; %END +; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-W64-NEXT: ; return to shader part epilog +; +; GFX10-W32-LABEL: test_control_flow_1: +; GFX10-W32: ; %bb.0: ; %main_body +; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo +; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-W32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 +; GFX10-W32-NEXT: s_and_saveexec_b32 s13, vcc_lo +; GFX10-W32-NEXT: s_xor_b32 s13, exec_lo, s13 +; GFX10-W32-NEXT: s_cbranch_execz .LBB24_2 +; GFX10-W32-NEXT: ; %bb.1: ; %IF +; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: image_sample v2, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX10-W32-NEXT: ; implicit-def: $vgpr0 +; GFX10-W32-NEXT: .LBB24_2: ; %Flow +; GFX10-W32-NEXT: s_or_saveexec_b32 s0, s13 +; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-W32-NEXT: s_and_b32 s0, exec_lo, s0 +; GFX10-W32-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX10-W32-NEXT: s_cbranch_execz .LBB24_4 +; GFX10-W32-NEXT: ; %bb.3: ; %ELSE +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen +; GFX10-W32-NEXT: .LBB24_4: ; %END +; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2 +; GFX10-W32-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-W32-NEXT: ; return to shader part epilog main_body: %cmp = icmp eq i32 %z, 0 br i1 %cmp, label %ELSE, label %IF @@ -668,20 +1369,69 @@ END: } ; Check that branch conditions are properly marked as needing WQM... -; -;CHECK-LABEL: {{^}}test_control_flow_2: -;CHECK-NEXT: ; %main_body -;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec -;CHECK-NEXT: s_wqm_b64 exec, exec -;CHECK: s_and_b64 exec, exec, [[ORIG]] -;CHECK: store -;CHECK: s_wqm_b64 exec, exec -;CHECK: load -;CHECK: s_and_b64 exec, exec, [[ORIG]] -;CHECK: store -;CHECK: s_wqm_b64 exec, exec -;CHECK: v_cmp define amdgpu_ps <4 x float> @test_control_flow_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <3 x i32> %idx, <2 x float> %data, i32 %coord) { +; GFX9-W64-LABEL: test_control_flow_2: +; GFX9-W64: ; %bb.0: ; %main_body +; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec +; GFX9-W64-NEXT: s_wqm_b64 exec, exec +; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX9-W64-NEXT: buffer_store_dword v3, v0, s[0:3], 0 idxen +; GFX9-W64-NEXT: s_wqm_b64 exec, exec +; GFX9-W64-NEXT: buffer_load_dword v0, v1, s[0:3], 0 idxen +; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX9-W64-NEXT: buffer_store_dword v4, v2, s[0:3], 0 idxen +; GFX9-W64-NEXT: s_wqm_b64 exec, exec +; GFX9-W64-NEXT: s_waitcnt vmcnt(1) +; GFX9-W64-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 +; GFX9-W64-NEXT: ; implicit-def: $vgpr0 +; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc +; GFX9-W64-NEXT: s_xor_b64 s[14:15], exec, s[14:15] +; GFX9-W64-NEXT: ; %bb.1: ; %ELSE +; GFX9-W64-NEXT: v_lshlrev_b32_e32 v0, 2, v5 +; GFX9-W64-NEXT: ; implicit-def: $vgpr5 +; GFX9-W64-NEXT: ; %bb.2: ; %Flow +; GFX9-W64-NEXT: s_or_saveexec_b64 s[14:15], s[14:15] +; GFX9-W64-NEXT: s_xor_b64 exec, exec, s[14:15] +; GFX9-W64-NEXT: ; %bb.3: ; %IF +; GFX9-W64-NEXT: v_mul_lo_u32 v0, v5, 3 +; GFX9-W64-NEXT: ; %bb.4: ; %END +; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15] +; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: ; return to shader part epilog +; +; GFX10-W32-LABEL: test_control_flow_2: +; GFX10-W32: ; %bb.0: ; %main_body +; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo +; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-W32-NEXT: buffer_store_dword v3, v0, s[0:3], 0 idxen +; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-W32-NEXT: buffer_load_dword v0, v1, s[0:3], 0 idxen +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 0, v0 +; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-W32-NEXT: buffer_store_dword v4, v2, s[0:3], 0 idxen +; GFX10-W32-NEXT: ; implicit-def: $vgpr0 +; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-W32-NEXT: s_and_saveexec_b32 s13, vcc_lo +; GFX10-W32-NEXT: s_xor_b32 s13, exec_lo, s13 +; GFX10-W32-NEXT: ; %bb.1: ; %ELSE +; GFX10-W32-NEXT: v_lshlrev_b32_e32 v0, 2, v5 +; GFX10-W32-NEXT: ; implicit-def: $vgpr5 +; GFX10-W32-NEXT: ; %bb.2: ; %Flow +; GFX10-W32-NEXT: s_or_saveexec_b32 s13, s13 +; GFX10-W32-NEXT: s_xor_b32 exec_lo, exec_lo, s13 +; GFX10-W32-NEXT: ; %bb.3: ; %IF +; GFX10-W32-NEXT: v_mul_lo_u32 v0, v5, 3 +; GFX10-W32-NEXT: ; %bb.4: ; %END +; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13 +; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-W32-NEXT: ; return to shader part epilog main_body: %idx.1 = extractelement <3 x i32> %idx, i32 0 %data.1 = extractelement <2 x float> %data, i32 0 @@ -715,17 +1465,60 @@ END: } ; ... but only if they really do need it. -; -;CHECK-LABEL: {{^}}test_control_flow_3: -;CHECK-NEXT: ; %main_body -;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec -;CHECK-NEXT: s_wqm_b64 exec, exec -;CHECK: image_sample -;CHECK: s_and_b64 exec, exec, [[ORIG]] -;CHECK: image_sample -;CHECK-DAG: v_cmp -;CHECK-DAG: store define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %coord) { +; GFX9-W64-LABEL: test_control_flow_3: +; GFX9-W64: ; %bb.0: ; %main_body +; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec +; GFX9-W64-NEXT: s_wqm_b64 exec, exec +; GFX9-W64-NEXT: image_sample v1, v1, s[0:7], s[8:11] dmask:0x1 +; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: image_sample v1, v1, s[0:7], s[8:11] dmask:0x1 +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v1 +; GFX9-W64-NEXT: buffer_store_dword v1, v0, s[0:3], 0 idxen +; GFX9-W64-NEXT: ; implicit-def: $vgpr0 +; GFX9-W64-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-W64-NEXT: ; %bb.1: ; %ELSE +; GFX9-W64-NEXT: v_mul_f32_e32 v0, 4.0, v1 +; GFX9-W64-NEXT: ; implicit-def: $vgpr1 +; GFX9-W64-NEXT: ; %bb.2: ; %Flow +; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX9-W64-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX9-W64-NEXT: ; %bb.3: ; %IF +; GFX9-W64-NEXT: v_mul_f32_e32 v0, 0x40400000, v1 +; GFX9-W64-NEXT: ; %bb.4: ; %END +; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: ; return to shader part epilog +; +; GFX10-W32-LABEL: test_control_flow_3: +; GFX10-W32: ; %bb.0: ; %main_body +; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo +; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-W32-NEXT: image_sample v1, v1, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: image_sample v1, v1, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 0, v1 +; GFX10-W32-NEXT: buffer_store_dword v1, v0, s[0:3], 0 idxen +; GFX10-W32-NEXT: ; implicit-def: $vgpr0 +; GFX10-W32-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX10-W32-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX10-W32-NEXT: ; %bb.1: ; %ELSE +; GFX10-W32-NEXT: v_mul_f32_e32 v0, 4.0, v1 +; GFX10-W32-NEXT: ; implicit-def: $vgpr1 +; GFX10-W32-NEXT: ; %bb.2: ; %Flow +; GFX10-W32-NEXT: s_or_saveexec_b32 s0, s0 +; GFX10-W32-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX10-W32-NEXT: ; %bb.3: ; %IF +; GFX10-W32-NEXT: v_mul_f32_e32 v0, 0x40400000, v1 +; GFX10-W32-NEXT: ; %bb.4: ; %END +; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX10-W32-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-W32-NEXT: ; return to shader part epilog main_body: %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 %tex0 = extractelement <4 x float> %tex, i32 0 @@ -750,20 +1543,53 @@ END: } ; Another test that failed at some point because of terminator handling. -; -;CHECK-LABEL: {{^}}test_control_flow_4: -;CHECK-NEXT: ; %main_body -;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec -;CHECK-NEXT: s_wqm_b64 exec, exec -;CHECK: %IF -;CHECK: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]] -;CHECK: load -;CHECK: store -;CHECK: s_mov_b64 exec, [[SAVE]] -;CHECK: %END -;CHECK: image_sample -;CHECK: image_sample define amdgpu_ps <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %coord, i32 %y, float %z) { +; GFX9-W64-LABEL: test_control_flow_4: +; GFX9-W64: ; %bb.0: ; %main_body +; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec +; GFX9-W64-NEXT: s_wqm_b64 exec, exec +; GFX9-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc +; GFX9-W64-NEXT: s_cbranch_execz .LBB27_2 +; GFX9-W64-NEXT: ; %bb.1: ; %IF +; GFX9-W64-NEXT: s_and_saveexec_b64 s[16:17], s[12:13] +; GFX9-W64-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; GFX9-W64-NEXT: v_mov_b32_e32 v2, 1 +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: buffer_store_dword v1, v2, s[0:3], 0 idxen +; GFX9-W64-NEXT: s_mov_b64 exec, s[16:17] +; GFX9-W64-NEXT: .LBB27_2: ; %END +; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15] +; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 +; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: ; return to shader part epilog +; +; GFX10-W32-LABEL: test_control_flow_4: +; GFX10-W32: ; %bb.0: ; %main_body +; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo +; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX10-W32-NEXT: s_and_saveexec_b32 s13, vcc_lo +; GFX10-W32-NEXT: s_cbranch_execz .LBB27_2 +; GFX10-W32-NEXT: ; %bb.1: ; %IF +; GFX10-W32-NEXT: s_and_saveexec_b32 s14, s12 +; GFX10-W32-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; GFX10-W32-NEXT: v_mov_b32_e32 v2, 1 +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: buffer_store_dword v1, v2, s[0:3], 0 idxen +; GFX10-W32-NEXT: s_mov_b32 exec_lo, s14 +; GFX10-W32-NEXT: .LBB27_2: ; %END +; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13 +; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-W32-NEXT: ; return to shader part epilog main_body: %cond = icmp eq i32 %y, 0 br i1 %cond, label %IF, label %END @@ -781,21 +1607,71 @@ END: } ; Kill is performed in WQM mode so that uniform kill behaves correctly ... -; -;CHECK-LABEL: {{^}}test_kill_0: -;CHECK-NEXT: ; %main_body -;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec -;CHECK-NEXT: s_wqm_b64 exec, exec -;CHECK: s_and_b64 exec, exec, [[ORIG]] -;CHECK: image_sample -;CHECK: buffer_store_dword -;CHECK: s_wqm_b64 exec, exec -;CHECK: v_cmp_ -;CHECK: image_sample -;CHECK: s_and_b64 exec, exec, [[ORIG]] -;CHECK: image_sample -;CHECK: buffer_store_dword define amdgpu_ps <4 x float> @test_kill_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <2 x i32> %idx, <2 x float> %data, float %coord, float %coord2, float %z) { +; GFX9-W64-LABEL: test_kill_0: +; GFX9-W64: ; %bb.0: ; %main_body +; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec +; GFX9-W64-NEXT: s_wqm_b64 exec, exec +; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX9-W64-NEXT: image_sample v[7:10], v4, s[0:7], s[8:11] dmask:0xf +; GFX9-W64-NEXT: s_nop 0 +; GFX9-W64-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen +; GFX9-W64-NEXT: s_wqm_b64 exec, exec +; GFX9-W64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v6 +; GFX9-W64-NEXT: s_andn2_b64 s[12:13], s[12:13], vcc +; GFX9-W64-NEXT: s_cbranch_scc0 .LBB28_2 +; GFX9-W64-NEXT: ; %bb.1: ; %main_body +; GFX9-W64-NEXT: s_andn2_b64 exec, exec, vcc +; GFX9-W64-NEXT: image_sample v0, v5, s[0:7], s[8:11] dmask:0x1 +; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: image_sample v[11:14], v0, s[0:7], s[8:11] dmask:0xf +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: v_add_f32_e32 v0, v7, v11 +; GFX9-W64-NEXT: buffer_store_dword v3, v1, s[0:3], 0 idxen +; GFX9-W64-NEXT: v_add_f32_e32 v1, v8, v12 +; GFX9-W64-NEXT: v_add_f32_e32 v2, v9, v13 +; GFX9-W64-NEXT: v_add_f32_e32 v3, v10, v14 +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: s_branch .LBB28_3 +; GFX9-W64-NEXT: .LBB28_2: +; GFX9-W64-NEXT: s_mov_b64 exec, 0 +; GFX9-W64-NEXT: exp null off, off, off, off done vm +; GFX9-W64-NEXT: s_endpgm +; GFX9-W64-NEXT: .LBB28_3: +; +; GFX10-W32-LABEL: test_kill_0: +; GFX10-W32: ; %bb.0: ; %main_body +; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo +; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-W32-NEXT: image_sample v[7:10], v4, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-W32-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen +; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-W32-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0, v6 +; GFX10-W32-NEXT: s_andn2_b32 s12, s12, vcc_lo +; GFX10-W32-NEXT: s_cbranch_scc0 .LBB28_2 +; GFX10-W32-NEXT: ; %bb.1: ; %main_body +; GFX10-W32-NEXT: s_andn2_b32 exec_lo, exec_lo, vcc_lo +; GFX10-W32-NEXT: image_sample v0, v5, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: image_sample v[11:14], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-W32-NEXT: buffer_store_dword v3, v1, s[0:3], 0 idxen +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: v_add_f32_e32 v4, v8, v12 +; GFX10-W32-NEXT: v_add_f32_e32 v5, v10, v14 +; GFX10-W32-NEXT: v_add_f32_e32 v0, v7, v11 +; GFX10-W32-NEXT: v_add_f32_e32 v2, v9, v13 +; GFX10-W32-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-W32-NEXT: v_mov_b32_e32 v3, v5 +; GFX10-W32-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-W32-NEXT: s_branch .LBB28_3 +; GFX10-W32-NEXT: .LBB28_2: +; GFX10-W32-NEXT: s_mov_b32 exec_lo, 0 +; GFX10-W32-NEXT: exp null off, off, off, off done vm +; GFX10-W32-NEXT: s_endpgm +; GFX10-W32-NEXT: .LBB28_3: main_body: %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 %idx.0 = extractelement <2 x i32> %idx, i32 0 @@ -817,7 +1693,6 @@ main_body: } ; ... but only if WQM is necessary. -; ; CHECK-LABEL: {{^}}test_kill_1: ; CHECK-NEXT: ; %main_body ; CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec @@ -829,6 +1704,54 @@ main_body: ; CHECK-DAG: buffer_store_dword ; CHECK-DAG: v_cmp_ define amdgpu_ps <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) { +; GFX9-W64-LABEL: test_kill_1: +; GFX9-W64: ; %bb.0: ; %main_body +; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec +; GFX9-W64-NEXT: v_mov_b32_e32 v4, v2 +; GFX9-W64-NEXT: s_wqm_b64 exec, exec +; GFX9-W64-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-W64-NEXT: image_sample v0, v1, s[0:7], s[8:11] dmask:0x1 +; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; GFX9-W64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v4 +; GFX9-W64-NEXT: s_andn2_b64 s[12:13], s[12:13], vcc +; GFX9-W64-NEXT: buffer_store_dword v5, off, s[0:3], 0 +; GFX9-W64-NEXT: s_cbranch_scc0 .LBB29_2 +; GFX9-W64-NEXT: ; %bb.1: ; %main_body +; GFX9-W64-NEXT: s_andn2_b64 exec, exec, vcc +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: s_branch .LBB29_3 +; GFX9-W64-NEXT: .LBB29_2: +; GFX9-W64-NEXT: s_mov_b64 exec, 0 +; GFX9-W64-NEXT: exp null off, off, off, off done vm +; GFX9-W64-NEXT: s_endpgm +; GFX9-W64-NEXT: .LBB29_3: +; +; GFX10-W32-LABEL: test_kill_1: +; GFX10-W32: ; %bb.0: ; %main_body +; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo +; GFX10-W32-NEXT: v_mov_b32_e32 v4, v2 +; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-W32-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-W32-NEXT: image_sample v0, v1, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-W32-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0, v4 +; GFX10-W32-NEXT: buffer_store_dword v5, off, s[0:3], 0 +; GFX10-W32-NEXT: s_andn2_b32 s12, s12, vcc_lo +; GFX10-W32-NEXT: s_cbranch_scc0 .LBB29_2 +; GFX10-W32-NEXT: ; %bb.1: ; %main_body +; GFX10-W32-NEXT: s_andn2_b32 exec_lo, exec_lo, vcc_lo +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-W32-NEXT: s_branch .LBB29_3 +; GFX10-W32-NEXT: .LBB29_2: +; GFX10-W32-NEXT: s_mov_b32 exec_lo, 0 +; GFX10-W32-NEXT: exp null off, off, off, off done vm +; GFX10-W32-NEXT: s_endpgm +; GFX10-W32-NEXT: .LBB29_3: main_body: %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 %tex0 = extractelement <4 x float> %tex, i32 0 @@ -843,13 +1766,27 @@ main_body: } ; Check prolog shaders. -; ; CHECK-LABEL: {{^}}test_prolog_1: ; CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec ; CHECK: s_wqm_b64 exec, exec ; CHECK: v_add_f32_e32 v0, ; CHECK: s_and_b64 exec, exec, [[ORIG]] define amdgpu_ps float @test_prolog_1(float %a, float %b) #5 { +; GFX9-W64-LABEL: test_prolog_1: +; GFX9-W64: ; %bb.0: ; %main_body +; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec +; GFX9-W64-NEXT: s_wqm_b64 exec, exec +; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1] +; GFX9-W64-NEXT: ; return to shader part epilog +; +; GFX10-W32-LABEL: test_prolog_1: +; GFX10-W32: ; %bb.0: ; %main_body +; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo +; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0 +; GFX10-W32-NEXT: ; return to shader part epilog main_body: %s = fadd float %a, %b ret float %s @@ -878,6 +1815,114 @@ main_body: ; CHECK: ; %break ; CHECK: ; return define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) nounwind { +; GFX9-W64-LABEL: test_loop_vcc: +; GFX9-W64: ; %bb.0: ; %entry +; GFX9-W64-NEXT: s_mov_b64 s[8:9], exec +; GFX9-W64-NEXT: s_wqm_b64 exec, exec +; GFX9-W64-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-W64-NEXT: v_mov_b32_e32 v6, v2 +; GFX9-W64-NEXT: v_mov_b32_e32 v5, v1 +; GFX9-W64-NEXT: v_mov_b32_e32 v4, v0 +; GFX9-W64-NEXT: s_and_b64 exec, exec, s[8:9] +; GFX9-W64-NEXT: s_mov_b32 s0, 0 +; GFX9-W64-NEXT: s_mov_b32 s1, s0 +; GFX9-W64-NEXT: s_mov_b32 s2, s0 +; GFX9-W64-NEXT: s_mov_b32 s3, s0 +; GFX9-W64-NEXT: s_mov_b32 s4, s0 +; GFX9-W64-NEXT: s_mov_b32 s5, s0 +; GFX9-W64-NEXT: s_mov_b32 s6, s0 +; GFX9-W64-NEXT: s_mov_b32 s7, s0 +; GFX9-W64-NEXT: image_store v[4:7], v0, s[0:7] dmask:0xf unorm +; GFX9-W64-NEXT: s_wqm_b64 exec, exec +; GFX9-W64-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-W64-NEXT: s_mov_b32 s10, 0x40e00000 +; GFX9-W64-NEXT: s_branch .LBB31_2 +; GFX9-W64-NEXT: .LBB31_1: ; %body +; GFX9-W64-NEXT: ; in Loop: Header=BB31_2 Depth=1 +; GFX9-W64-NEXT: s_mov_b32 s1, s0 +; GFX9-W64-NEXT: s_mov_b32 s2, s0 +; GFX9-W64-NEXT: s_mov_b32 s3, s0 +; GFX9-W64-NEXT: s_mov_b32 s4, s0 +; GFX9-W64-NEXT: s_mov_b32 s5, s0 +; GFX9-W64-NEXT: s_mov_b32 s6, s0 +; GFX9-W64-NEXT: s_mov_b32 s7, s0 +; GFX9-W64-NEXT: image_sample v[4:7], v0, s[0:7], s[0:3] dmask:0xf +; GFX9-W64-NEXT: v_add_f32_e32 v8, 2.0, v8 +; GFX9-W64-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-W64-NEXT: s_cbranch_execz .LBB31_4 +; GFX9-W64-NEXT: .LBB31_2: ; %loop +; GFX9-W64-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-W64-NEXT: v_cmp_lt_f32_e32 vcc, s10, v8 +; GFX9-W64-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-W64-NEXT: v_mov_b32_e32 v2, v6 +; GFX9-W64-NEXT: v_mov_b32_e32 v3, v7 +; GFX9-W64-NEXT: s_and_b64 vcc, exec, vcc +; GFX9-W64-NEXT: s_cbranch_vccz .LBB31_1 +; GFX9-W64-NEXT: ; %bb.3: +; GFX9-W64-NEXT: s_mov_b64 s[2:3], -1 +; GFX9-W64-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 +; GFX9-W64-NEXT: ; implicit-def: $vgpr8 +; GFX9-W64-NEXT: .LBB31_4: ; %break +; GFX9-W64-NEXT: s_and_b64 exec, exec, s[8:9] +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: ; return to shader part epilog +; +; GFX10-W32-LABEL: test_loop_vcc: +; GFX10-W32: ; %bb.0: ; %entry +; GFX10-W32-NEXT: s_mov_b32 s8, exec_lo +; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-W32-NEXT: v_mov_b32_e32 v8, 0 +; GFX10-W32-NEXT: s_mov_b32 s0, 0 +; GFX10-W32-NEXT: s_mov_b32 s1, s0 +; GFX10-W32-NEXT: s_mov_b32 s2, s0 +; GFX10-W32-NEXT: s_mov_b32 s3, s0 +; GFX10-W32-NEXT: s_mov_b32 s4, s0 +; GFX10-W32-NEXT: s_mov_b32 s5, s0 +; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s8 +; GFX10-W32-NEXT: s_mov_b32 s6, s0 +; GFX10-W32-NEXT: s_mov_b32 s7, s0 +; GFX10-W32-NEXT: image_store v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm +; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-W32-NEXT: s_branch .LBB31_2 +; GFX10-W32-NEXT: .p2align 6 +; GFX10-W32-NEXT: .LBB31_1: ; %body +; GFX10-W32-NEXT: ; in Loop: Header=BB31_2 Depth=1 +; GFX10-W32-NEXT: s_mov_b32 s1, s0 +; GFX10-W32-NEXT: s_mov_b32 s2, s0 +; GFX10-W32-NEXT: s_mov_b32 s3, s0 +; GFX10-W32-NEXT: s_mov_b32 s4, s0 +; GFX10-W32-NEXT: s_mov_b32 s5, s0 +; GFX10-W32-NEXT: s_mov_b32 s6, s0 +; GFX10-W32-NEXT: s_mov_b32 s7, s0 +; GFX10-W32-NEXT: v_add_f32_e32 v8, 2.0, v8 +; GFX10-W32-NEXT: image_sample v[0:3], v4, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-W32-NEXT: s_mov_b32 s1, 0 +; GFX10-W32-NEXT: s_cbranch_execz .LBB31_4 +; GFX10-W32-NEXT: .LBB31_2: ; %loop +; GFX10-W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-W32-NEXT: v_cmp_lt_f32_e32 vcc_lo, 0x40e00000, v8 +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: v_mov_b32_e32 v7, v3 +; GFX10-W32-NEXT: v_mov_b32_e32 v6, v2 +; GFX10-W32-NEXT: v_mov_b32_e32 v5, v1 +; GFX10-W32-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-W32-NEXT: s_and_b32 vcc_lo, exec_lo, vcc_lo +; GFX10-W32-NEXT: s_cbranch_vccz .LBB31_1 +; GFX10-W32-NEXT: ; %bb.3: +; GFX10-W32-NEXT: s_mov_b32 s1, -1 +; GFX10-W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX10-W32-NEXT: ; implicit-def: $vgpr8 +; GFX10-W32-NEXT: .LBB31_4: ; %break +; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s8 +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-W32-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-W32-NEXT: v_mov_b32_e32 v2, v6 +; GFX10-W32-NEXT: v_mov_b32_e32 v3, v7 +; GFX10-W32-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-W32-NEXT: ; return to shader part epilog entry: call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %in, i32 15, i32 undef, <8 x i32> undef, i32 0, i32 0) br label %loop @@ -900,7 +1945,6 @@ break: ; Only intrinsic stores need exact execution -- other stores do not have ; externally visible effects and may require WQM for correctness. -; ; CHECK-LABEL: {{^}}test_alloca: ; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec ; CHECK: s_wqm_b64 exec, exec @@ -918,6 +1962,78 @@ break: ; CHECK: image_sample ; CHECK: buffer_store_dwordx4 define amdgpu_ps void @test_alloca(float %data, i32 %a, i32 %idx) nounwind { +; GFX9-W64-LABEL: test_alloca: +; GFX9-W64: ; %bb.0: ; %entry +; GFX9-W64-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-W64-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-W64-NEXT: s_mov_b32 s10, -1 +; GFX9-W64-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-W64-NEXT: s_add_u32 s8, s8, s0 +; GFX9-W64-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec +; GFX9-W64-NEXT: s_wqm_b64 exec, exec +; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1] +; GFX9-W64-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-W64-NEXT: s_wqm_b64 exec, exec +; GFX9-W64-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:4 +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1] +; GFX9-W64-NEXT: v_mov_b32_e32 v1, 1 +; GFX9-W64-NEXT: buffer_store_dword v0, v1, s[0:3], 0 idxen +; GFX9-W64-NEXT: s_wqm_b64 exec, exec +; GFX9-W64-NEXT: v_mov_b32_e32 v0, 4 +; GFX9-W64-NEXT: v_lshl_add_u32 v0, v2, 2, v0 +; GFX9-W64-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen +; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1] +; GFX9-W64-NEXT: s_mov_b32 s0, 0 +; GFX9-W64-NEXT: s_mov_b32 s1, s0 +; GFX9-W64-NEXT: s_mov_b32 s2, s0 +; GFX9-W64-NEXT: s_mov_b32 s3, s0 +; GFX9-W64-NEXT: s_mov_b32 s4, s0 +; GFX9-W64-NEXT: s_mov_b32 s5, s0 +; GFX9-W64-NEXT: s_mov_b32 s6, s0 +; GFX9-W64-NEXT: s_mov_b32 s7, s0 +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX9-W64-NEXT: s_endpgm +; +; GFX10-W32-LABEL: test_alloca: +; GFX10-W32: ; %bb.0: ; %entry +; GFX10-W32-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX10-W32-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX10-W32-NEXT: s_mov_b32 s10, -1 +; GFX10-W32-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX10-W32-NEXT: s_add_u32 s8, s8, s0 +; GFX10-W32-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo +; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-W32-NEXT: v_mov_b32_e32 v3, 1 +; GFX10-W32-NEXT: v_lshl_add_u32 v2, v2, 2, 4 +; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0 +; GFX10-W32-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-W32-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:4 +; GFX10-W32-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0 +; GFX10-W32-NEXT: buffer_store_dword v0, v3, s[0:3], 0 idxen +; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-W32-NEXT: buffer_load_dword v0, v2, s[8:11], 0 offen +; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0 +; GFX10-W32-NEXT: s_mov_b32 s0, 0 +; GFX10-W32-NEXT: s_mov_b32 s1, s0 +; GFX10-W32-NEXT: s_mov_b32 s2, s0 +; GFX10-W32-NEXT: s_mov_b32 s3, s0 +; GFX10-W32-NEXT: s_mov_b32 s4, s0 +; GFX10-W32-NEXT: s_mov_b32 s5, s0 +; GFX10-W32-NEXT: s_mov_b32 s6, s0 +; GFX10-W32-NEXT: s_mov_b32 s7, s0 +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX10-W32-NEXT: s_endpgm entry: %array = alloca [32 x i32], align 4, addrspace(5) @@ -941,14 +2057,49 @@ entry: ; otherwise the EXEC mask exported by the epilog will be wrong. This is true ; even if the shader has no kills, because a kill could have happened in a ; previous shader fragment. -; ; CHECK-LABEL: {{^}}test_nonvoid_return: ; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec ; CHECK: s_wqm_b64 exec, exec -; ; CHECK: s_and_b64 exec, exec, [[LIVE]] ; CHECK-NOT: exec define amdgpu_ps <4 x float> @test_nonvoid_return() nounwind { +; GFX9-W64-LABEL: test_nonvoid_return: +; GFX9-W64: ; %bb.0: +; GFX9-W64-NEXT: s_mov_b32 s0, 0 +; GFX9-W64-NEXT: s_mov_b64 s[8:9], exec +; GFX9-W64-NEXT: s_mov_b32 s1, s0 +; GFX9-W64-NEXT: s_mov_b32 s2, s0 +; GFX9-W64-NEXT: s_mov_b32 s3, s0 +; GFX9-W64-NEXT: s_mov_b32 s4, s0 +; GFX9-W64-NEXT: s_mov_b32 s5, s0 +; GFX9-W64-NEXT: s_mov_b32 s6, s0 +; GFX9-W64-NEXT: s_mov_b32 s7, s0 +; GFX9-W64-NEXT: s_wqm_b64 exec, exec +; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[0:3] dmask:0x1 +; GFX9-W64-NEXT: s_and_b64 exec, exec, s[8:9] +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: ; return to shader part epilog +; +; GFX10-W32-LABEL: test_nonvoid_return: +; GFX10-W32: ; %bb.0: +; GFX10-W32-NEXT: s_mov_b32 s0, 0 +; GFX10-W32-NEXT: s_mov_b32 s8, exec_lo +; GFX10-W32-NEXT: s_mov_b32 s1, s0 +; GFX10-W32-NEXT: s_mov_b32 s2, s0 +; GFX10-W32-NEXT: s_mov_b32 s3, s0 +; GFX10-W32-NEXT: s_mov_b32 s4, s0 +; GFX10-W32-NEXT: s_mov_b32 s5, s0 +; GFX10-W32-NEXT: s_mov_b32 s6, s0 +; GFX10-W32-NEXT: s_mov_b32 s7, s0 +; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s8 +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: ; return to shader part epilog %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float undef, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0 %tex0 = extractelement <4 x float> %tex, i32 0 %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0 @@ -958,10 +2109,62 @@ define amdgpu_ps <4 x float> @test_nonvoid_return() nounwind { ; CHECK-LABEL: {{^}}test_nonvoid_return_unreachable: ; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec ; CHECK: s_wqm_b64 exec, exec -; ; CHECK: s_and_b64 exec, exec, [[LIVE]] ; CHECK-NOT: exec define amdgpu_ps <4 x float> @test_nonvoid_return_unreachable(i32 inreg %c) nounwind { +; GFX9-W64-LABEL: test_nonvoid_return_unreachable: +; GFX9-W64: ; %bb.0: ; %entry +; GFX9-W64-NEXT: s_mov_b32 s4, 0 +; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec +; GFX9-W64-NEXT: s_mov_b32 s5, s4 +; GFX9-W64-NEXT: s_mov_b32 s6, s4 +; GFX9-W64-NEXT: s_mov_b32 s7, s4 +; GFX9-W64-NEXT: s_mov_b32 s8, s4 +; GFX9-W64-NEXT: s_mov_b32 s9, s4 +; GFX9-W64-NEXT: s_mov_b32 s10, s4 +; GFX9-W64-NEXT: s_mov_b32 s11, s4 +; GFX9-W64-NEXT: s_wqm_b64 exec, exec +; GFX9-W64-NEXT: image_sample v0, v0, s[4:11], s[0:3] dmask:0x1 +; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3] +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: image_sample v[0:3], v0, s[4:11], s[0:3] dmask:0xf +; GFX9-W64-NEXT: s_cmp_lt_i32 s0, 1 +; GFX9-W64-NEXT: s_cbranch_scc0 .LBB34_2 +; GFX9-W64-NEXT: ; %bb.1: ; %else +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: s_branch .LBB34_3 +; GFX9-W64-NEXT: .LBB34_2: ; %if +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: .LBB34_3: +; +; GFX10-W32-LABEL: test_nonvoid_return_unreachable: +; GFX10-W32: ; %bb.0: ; %entry +; GFX10-W32-NEXT: s_mov_b32 s4, 0 +; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo +; GFX10-W32-NEXT: s_mov_b32 s5, s4 +; GFX10-W32-NEXT: s_mov_b32 s6, s4 +; GFX10-W32-NEXT: s_mov_b32 s7, s4 +; GFX10-W32-NEXT: s_mov_b32 s8, s4 +; GFX10-W32-NEXT: s_mov_b32 s9, s4 +; GFX10-W32-NEXT: s_mov_b32 s10, s4 +; GFX10-W32-NEXT: s_mov_b32 s11, s4 +; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-W32-NEXT: image_sample v0, v0, s[4:11], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s1 +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: image_sample v[0:3], v0, s[4:11], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-W32-NEXT: s_cmp_lt_i32 s0, 1 +; GFX10-W32-NEXT: s_cbranch_scc0 .LBB34_2 +; GFX10-W32-NEXT: ; %bb.1: ; %else +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: s_branch .LBB34_3 +; GFX10-W32-NEXT: .LBB34_2: ; %if +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GFX10-W32-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-W32-NEXT: .LBB34_3: entry: %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float undef, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0 %tex0 = extractelement <4 x float> %tex, i32 0 @@ -978,7 +2181,6 @@ else: } ; Test awareness that s_wqm_b64 clobbers SCC. -; ; CHECK-LABEL: {{^}}test_scc: ; CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec ; CHECK: s_wqm_b64 exec, exec @@ -991,6 +2193,90 @@ else: ; CHECK: ; %end ; CHECK: s_and_b64 exec, exec, [[ORIG]] define amdgpu_ps <4 x float> @test_scc(i32 inreg %sel, i32 %idx) #1 { +; GFX9-W64-LABEL: test_scc: +; GFX9-W64: ; %bb.0: ; %main_body +; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec +; GFX9-W64-NEXT: v_mov_b32_e32 v4, v0 +; GFX9-W64-NEXT: s_wqm_b64 exec, exec +; GFX9-W64-NEXT: s_cmp_lt_i32 s0, 1 +; GFX9-W64-NEXT: s_cbranch_scc0 .LBB35_2 +; GFX9-W64-NEXT: ; %bb.1: ; %else +; GFX9-W64-NEXT: s_mov_b32 s4, 0 +; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-W64-NEXT: v_mov_b32_e32 v1, 1 +; GFX9-W64-NEXT: s_mov_b32 s5, s4 +; GFX9-W64-NEXT: s_mov_b32 s6, s4 +; GFX9-W64-NEXT: s_mov_b32 s7, s4 +; GFX9-W64-NEXT: s_mov_b32 s8, s4 +; GFX9-W64-NEXT: s_mov_b32 s9, s4 +; GFX9-W64-NEXT: s_mov_b32 s10, s4 +; GFX9-W64-NEXT: s_mov_b32 s11, s4 +; GFX9-W64-NEXT: image_sample v[0:3], v[0:1], s[4:11], s[0:3] dmask:0xf +; GFX9-W64-NEXT: s_cbranch_execz .LBB35_3 +; GFX9-W64-NEXT: s_branch .LBB35_4 +; GFX9-W64-NEXT: .LBB35_2: +; GFX9-W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX9-W64-NEXT: .LBB35_3: ; %if +; GFX9-W64-NEXT: s_mov_b32 s4, 0 +; GFX9-W64-NEXT: s_mov_b32 s5, s4 +; GFX9-W64-NEXT: s_mov_b32 s6, s4 +; GFX9-W64-NEXT: s_mov_b32 s7, s4 +; GFX9-W64-NEXT: s_mov_b32 s8, s4 +; GFX9-W64-NEXT: s_mov_b32 s9, s4 +; GFX9-W64-NEXT: s_mov_b32 s10, s4 +; GFX9-W64-NEXT: s_mov_b32 s11, s4 +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-W64-NEXT: image_sample v[0:3], v0, s[4:11], s[0:3] dmask:0xf +; GFX9-W64-NEXT: .LBB35_4: ; %end +; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3] +; GFX9-W64-NEXT: v_mov_b32_e32 v5, 1.0 +; GFX9-W64-NEXT: buffer_store_dword v5, v4, s[0:3], 0 idxen +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: ; return to shader part epilog +; +; GFX10-W32-LABEL: test_scc: +; GFX10-W32: ; %bb.0: ; %main_body +; GFX10-W32-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-W32-NEXT: s_mov_b32 s8, exec_lo +; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-W32-NEXT: s_cmp_lt_i32 s0, 1 +; GFX10-W32-NEXT: s_cbranch_scc0 .LBB35_2 +; GFX10-W32-NEXT: ; %bb.1: ; %else +; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-W32-NEXT: v_mov_b32_e32 v1, 1 +; GFX10-W32-NEXT: s_mov_b32 s0, 0 +; GFX10-W32-NEXT: s_mov_b32 s1, s0 +; GFX10-W32-NEXT: s_mov_b32 s2, s0 +; GFX10-W32-NEXT: s_mov_b32 s3, s0 +; GFX10-W32-NEXT: s_mov_b32 s4, s0 +; GFX10-W32-NEXT: s_mov_b32 s5, s0 +; GFX10-W32-NEXT: s_mov_b32 s6, s0 +; GFX10-W32-NEXT: s_mov_b32 s7, s0 +; GFX10-W32-NEXT: image_sample v[0:3], v[0:1], s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10-W32-NEXT: s_cbranch_execz .LBB35_3 +; GFX10-W32-NEXT: s_branch .LBB35_4 +; GFX10-W32-NEXT: .LBB35_2: +; GFX10-W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX10-W32-NEXT: .LBB35_3: ; %if +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-W32-NEXT: s_mov_b32 s0, 0 +; GFX10-W32-NEXT: s_mov_b32 s1, s0 +; GFX10-W32-NEXT: s_mov_b32 s2, s0 +; GFX10-W32-NEXT: s_mov_b32 s3, s0 +; GFX10-W32-NEXT: s_mov_b32 s4, s0 +; GFX10-W32-NEXT: s_mov_b32 s5, s0 +; GFX10-W32-NEXT: s_mov_b32 s6, s0 +; GFX10-W32-NEXT: s_mov_b32 s7, s0 +; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-W32-NEXT: .LBB35_4: ; %end +; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s8 +; GFX10-W32-NEXT: v_mov_b32_e32 v5, 1.0 +; GFX10-W32-NEXT: buffer_store_dword v5, v4, s[0:3], 0 idxen +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-W32-NEXT: ; return to shader part epilog main_body: %cc = icmp sgt i32 %sel, 0 br i1 %cc, label %if, label %else @@ -1011,13 +2297,64 @@ end: ; Check a case of a block being entirely WQM except for a bit of WWM. ; There was a bug where it forgot to enter and leave WWM. -; -;CHECK-LABEL: {{^}}test_wwm_within_wqm: -;CHECK: %IF -;CHECK: s_or_saveexec_b64 {{.*}}, -1 -;CHECK: ds_swizzle -; define amdgpu_ps float @test_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) { +; GFX9-W64-LABEL: test_wwm_within_wqm: +; GFX9-W64: ; %bb.0: ; %main_body +; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec +; GFX9-W64-NEXT: s_wqm_b64 exec, exec +; GFX9-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc +; GFX9-W64-NEXT: s_cbranch_execz .LBB36_2 +; GFX9-W64-NEXT: ; %bb.1: ; %IF +; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: v_cvt_i32_f32_e32 v2, v0 +; GFX9-W64-NEXT: s_not_b64 exec, exec +; GFX9-W64-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-W64-NEXT: s_not_b64 exec, exec +; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-W64-NEXT: ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2) +; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-W64-NEXT: v_cvt_f32_i32_e32 v1, v0 +; GFX9-W64-NEXT: .LBB36_2: ; %ENDIF +; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15] +; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-W64-NEXT: ; return to shader part epilog +; +; GFX10-W32-LABEL: test_wwm_within_wqm: +; GFX10-W32: ; %bb.0: ; %main_body +; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo +; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX10-W32-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-W32-NEXT: s_and_saveexec_b32 s13, vcc_lo +; GFX10-W32-NEXT: s_cbranch_execz .LBB36_2 +; GFX10-W32-NEXT: ; %bb.1: ; %IF +; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: v_cvt_i32_f32_e32 v2, v0 +; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo +; GFX10-W32-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo +; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-W32-NEXT: ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2) +; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2 +; GFX10-W32-NEXT: v_cvt_f32_i32_e32 v1, v0 +; GFX10-W32-NEXT: .LBB36_2: ; %ENDIF +; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13 +; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1 +; GFX10-W32-NEXT: ; return to shader part epilog main_body: %c.bc = bitcast i32 %c to float %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 @@ -1041,13 +2378,34 @@ ENDIF: } ; Check that WWM is triggered by the strict_wwm intrinsic. -; -;CHECK-LABEL: {{^}}test_strict_wwm1: -;CHECK: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1 -;CHECK: buffer_load_dword -;CHECK: buffer_load_dword -;CHECK: v_add_f32_e32 define amdgpu_ps float @test_strict_wwm1(i32 inreg %idx0, i32 inreg %idx1) { +; GFX9-W64-LABEL: test_strict_wwm1: +; GFX9-W64: ; %bb.0: ; %main_body +; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-W64-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen +; GFX9-W64-NEXT: s_nop 0 +; GFX9-W64-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-W64-NEXT: ; return to shader part epilog +; +; GFX10-W32-LABEL: test_strict_wwm1: +; GFX10-W32: ; %bb.0: ; %main_body +; GFX10-W32-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-W32-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-W32-NEXT: s_clause 0x1 +; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen +; GFX10-W32-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1 +; GFX10-W32-NEXT: ; return to shader part epilog main_body: %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) @@ -1057,13 +2415,34 @@ main_body: } ; Same as above, but with an integer type. -; -;CHECK-LABEL: {{^}}test_strict_wwm2: -;CHECK: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1 -;CHECK: buffer_load_dword -;CHECK: buffer_load_dword -;CHECK: v_add_{{[iu]}}32_e32 define amdgpu_ps float @test_strict_wwm2(i32 inreg %idx0, i32 inreg %idx1) { +; GFX9-W64-LABEL: test_strict_wwm2: +; GFX9-W64: ; %bb.0: ; %main_body +; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-W64-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen +; GFX9-W64-NEXT: s_nop 0 +; GFX9-W64-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: v_add_u32_e32 v1, v1, v2 +; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-W64-NEXT: ; return to shader part epilog +; +; GFX10-W32-LABEL: test_strict_wwm2: +; GFX10-W32: ; %bb.0: ; %main_body +; GFX10-W32-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-W32-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-W32-NEXT: s_clause 0x1 +; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen +; GFX10-W32-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1 +; GFX10-W32-NEXT: ; return to shader part epilog main_body: %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) @@ -1080,16 +2459,48 @@ main_body: ; in cases like this. ; We enforce this by checking that v_add gets emitted in the same block as ; WWM computations. -; -;CHECK-LABEL: {{^}}test_strict_wwm3: -;CHECK: %if -;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1 -;CHECK: buffer_load_dword -;CHECK: v_add_f32_e32 -;CHECK: s_mov_b64 exec, [[ORIG]] -;CHECK: v_add_f32_e32 -;CHECK: %endif define amdgpu_ps float @test_strict_wwm3(i32 inreg %idx) { +; GFX9-W64-LABEL: test_strict_wwm3: +; GFX9-W64: ; %bb.0: ; %main_body +; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 +; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 +; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 32, v0 +; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-W64-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-W64-NEXT: s_cbranch_execz .LBB39_2 +; GFX9-W64-NEXT: ; %bb.1: ; %if +; GFX9-W64-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: v_add_f32_e32 v2, v1, v1 +; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-W64-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX9-W64-NEXT: .LBB39_2: ; %endif +; GFX9-W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-W64-NEXT: ; return to shader part epilog +; +; GFX10-W32-LABEL: test_strict_wwm3: +; GFX10-W32: ; %bb.0: ; %main_body +; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 +; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 +; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 32, v0 +; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-W32-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX10-W32-NEXT: s_cbranch_execz .LBB39_2 +; GFX10-W32-NEXT: ; %bb.1: ; %if +; GFX10-W32-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: v_add_f32_e32 v2, v1, v1 +; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2 +; GFX10-W32-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX10-W32-NEXT: .LBB39_2: ; %endif +; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-W32-NEXT: ; return to shader part epilog main_body: ; use mbcnt to make sure the branch is divergent %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) @@ -1113,16 +2524,46 @@ endif: ; write could clobber disabled channels in the non-WWM one. ; We enforce this by checking that v_mov gets emitted in the same block as ; WWM computations. -; -;CHECK-LABEL: {{^}}test_strict_wwm4: -;CHECK: %if -;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1 -;CHECK: buffer_load_dword -;CHECK: v_add_f32_e32 -;CHECK: s_mov_b64 exec, [[ORIG]] -;CHECK-NEXT: v_mov_b32_e32 -;CHECK: %endif define amdgpu_ps float @test_strict_wwm4(i32 inreg %idx) { +; GFX9-W64-LABEL: test_strict_wwm4: +; GFX9-W64: ; %bb.0: ; %main_body +; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 +; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 +; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 32, v0 +; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-W64-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-W64-NEXT: s_cbranch_execz .LBB40_2 +; GFX9-W64-NEXT: ; %bb.1: ; %if +; GFX9-W64-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v1 +; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-W64-NEXT: .LBB40_2: ; %endif +; GFX9-W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-W64-NEXT: ; return to shader part epilog +; +; GFX10-W32-LABEL: test_strict_wwm4: +; GFX10-W32: ; %bb.0: ; %main_body +; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 +; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 +; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 32, v0 +; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-W32-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX10-W32-NEXT: s_cbranch_execz .LBB40_2 +; GFX10-W32-NEXT: ; %bb.1: ; %if +; GFX10-W32-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v1 +; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1 +; GFX10-W32-NEXT: .LBB40_2: ; %endif +; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-W32-NEXT: ; return to shader part epilog main_body: ; use mbcnt to make sure the branch is divergent %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) @@ -1142,16 +2583,49 @@ endif: } ; Make sure the transition from Exact to WWM then WQM works properly. -; -;CHECK-LABEL: {{^}}test_strict_wwm5: -;CHECK: buffer_load_dword -;CHECK: buffer_store_dword -;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1 -;CHECK: buffer_load_dword -;CHECK: v_add_f32_e32 -;CHECK: s_mov_b64 exec, [[ORIG]] -;CHECK: s_wqm_b64 exec, exec define amdgpu_ps float @test_strict_wwm5(i32 inreg %idx0, i32 inreg %idx1) { +; GFX9-W64-LABEL: test_strict_wwm5: +; GFX9-W64: ; %bb.0: ; %main_body +; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec +; GFX9-W64-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-W64-NEXT: buffer_load_dword v2, v0, s[0:3], 0 idxen +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen +; GFX9-W64-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-W64-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v1 +; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-W64-NEXT: s_wqm_b64 exec, exec +; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec +; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3] +; GFX9-W64-NEXT: ; return to shader part epilog +; +; GFX10-W32-LABEL: test_strict_wwm5: +; GFX10-W32: ; %bb.0: ; %main_body +; GFX10-W32-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo +; GFX10-W32-NEXT: buffer_load_dword v2, v0, s[0:3], 0 idxen +; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-W32-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen +; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v1 +; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1 +; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX10-W32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec +; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s2 +; GFX10-W32-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-W32-NEXT: ; return to shader part epilog main_body: %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) call void @llvm.amdgcn.struct.buffer.store.f32(float %src0, <4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) @@ -1165,20 +2639,56 @@ main_body: ; Check that WWM is turned on correctly across basic block boundaries. ; if..then..endif version -; -;CHECK-LABEL: {{^}}test_strict_wwm6_then: -;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1 ;SI-CHECK: buffer_load_dword ;VI-CHECK: flat_load_dword -;CHECK: s_mov_b64 exec, [[ORIG]] -;CHECK: %if -;CHECK: s_or_saveexec_b64 [[ORIG2:s\[[0-9]+:[0-9]+\]]], -1 ;SI-CHECK: buffer_load_dword ;VI-CHECK: flat_load_dword -;CHECK: v_add_f32_e32 -;CHECK: s_mov_b64 exec, [[ORIG2]] -;CHECK: %endif define amdgpu_ps float @test_strict_wwm6_then() { +; GFX9-W64-LABEL: test_strict_wwm6_then: +; GFX9-W64: ; %bb.0: ; %main_body +; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-W64-NEXT: global_load_dword v1, v[3:4], off glc +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 +; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 +; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 32, v0 +; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-W64-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-W64-NEXT: s_cbranch_execz .LBB42_2 +; GFX9-W64-NEXT: ; %bb.1: ; %if +; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9-W64-NEXT: global_load_dword v2, v[3:4], off glc +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-W64-NEXT: .LBB42_2: ; %endif +; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-W64-NEXT: ; return to shader part epilog +; +; GFX10-W32-LABEL: test_strict_wwm6_then: +; GFX10-W32: ; %bb.0: ; %main_body +; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-W32-NEXT: global_load_dword v1, v[3:4], off glc dlc +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 +; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 +; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 32, v0 +; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-W32-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX10-W32-NEXT: s_cbranch_execz .LBB42_2 +; GFX10-W32-NEXT: ; %bb.1: ; %if +; GFX10-W32-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-W32-NEXT: global_load_dword v2, v[3:4], off glc dlc +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1 +; GFX10-W32-NEXT: .LBB42_2: ; %endif +; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX10-W32-NEXT: ; return to shader part epilog main_body: %src0 = load volatile float, float addrspace(1)* undef ; use mbcnt to make sure the branch is divergent @@ -1200,19 +2710,60 @@ endif: ; Check that WWM is turned on correctly across basic block boundaries. ; loop version -; -;CHECK-LABEL: {{^}}test_strict_wwm6_loop: -;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1 -;SI-CHECK: buffer_load_dword -;VI-CHECK: flat_load_dword -;CHECK: s_mov_b64 exec, [[ORIG]] -;CHECK: %loop -;CHECK: s_or_saveexec_b64 [[ORIG2:s\[[0-9]+:[0-9]+\]]], -1 -;SI-CHECK: buffer_load_dword -;VI-CHECK: flat_load_dword -;CHECK: s_mov_b64 exec, [[ORIG2]] -;CHECK: %endloop define amdgpu_ps float @test_strict_wwm6_loop() { +; GFX9-W64-LABEL: test_strict_wwm6_loop: +; GFX9-W64: ; %bb.0: ; %main_body +; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-W64-NEXT: global_load_dword v1, v[3:4], off glc +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 +; GFX9-W64-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-W64-NEXT: .LBB43_1: ; %loop +; GFX9-W64-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9-W64-NEXT: global_load_dword v2, v[3:4], off glc +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-W64-NEXT: v_add_u32_e32 v3, -1, v3 +; GFX9-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9-W64-NEXT: v_add_f32_e32 v2, v1, v2 +; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-W64-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-W64-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-W64-NEXT: s_cbranch_execnz .LBB43_1 +; GFX9-W64-NEXT: ; %bb.2: ; %endloop +; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-W64-NEXT: ; return to shader part epilog +; +; GFX10-W32-LABEL: test_strict_wwm6_loop: +; GFX10-W32: ; %bb.0: ; %main_body +; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-W32-NEXT: global_load_dword v1, v[3:4], off glc dlc +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 +; GFX10-W32-NEXT: s_mov_b32 s0, 0 +; GFX10-W32-NEXT: .LBB43_1: ; %loop +; GFX10-W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-W32-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-W32-NEXT: global_load_dword v2, v[3:4], off glc dlc +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-W32-NEXT: v_add_nc_u32_e32 v3, -1, v3 +; GFX10-W32-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-W32-NEXT: v_add_f32_e32 v2, v1, v2 +; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 +; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2 +; GFX10-W32-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX10-W32-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX10-W32-NEXT: s_cbranch_execnz .LBB43_1 +; GFX10-W32-NEXT: ; %bb.2: ; %endloop +; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX10-W32-NEXT: ; return to shader part epilog main_body: %src0 = load volatile float, float addrspace(1)* undef ; use mbcnt to make sure the branch is divergent @@ -1234,15 +2785,36 @@ endloop: } ; Check that @llvm.amdgcn.set.inactive disables WWM. -; -;CHECK-LABEL: {{^}}test_strict_wwm_set_inactive1: -;CHECK: buffer_load_dword -;CHECK: s_not_b64 exec, exec -;CHECK: v_mov_b32_e32 -;CHECK: s_not_b64 exec, exec -;CHECK: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1 -;CHECK: v_add_{{[iu]}}32_e32 define amdgpu_ps void @test_strict_wwm_set_inactive1(i32 inreg %idx) { +; GFX9-W64-LABEL: test_strict_wwm_set_inactive1: +; GFX9-W64: ; %bb.0: ; %main_body +; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-W64-NEXT: buffer_load_dword v0, v1, s[0:3], 0 idxen +; GFX9-W64-NEXT: s_not_b64 exec, exec +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-W64-NEXT: s_not_b64 exec, exec +; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-W64-NEXT: v_add_u32_e32 v0, v0, v0 +; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-W64-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-W64-NEXT: buffer_store_dword v2, v1, s[0:3], 0 idxen +; GFX9-W64-NEXT: s_endpgm +; +; GFX10-W32-LABEL: test_strict_wwm_set_inactive1: +; GFX10-W32: ; %bb.0: ; %main_body +; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-W32-NEXT: buffer_load_dword v0, v1, s[0:3], 0 idxen +; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo +; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-W32-NEXT: v_add_nc_u32_e32 v0, v0, v0 +; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-W32-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-W32-NEXT: buffer_store_dword v2, v1, s[0:3], 0 idxen +; GFX10-W32-NEXT: s_endpgm main_body: %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0) %src.0 = bitcast float %src to i32 @@ -1256,13 +2828,64 @@ main_body: ; Check a case of a block being entirely WQM except for a bit of WWM. ; There was a bug where it forgot to enter and leave WWM. -; -;CHECK-LABEL: {{^}}test_strict_wwm_within_wqm: -;CHECK: %IF -;CHECK: s_or_saveexec_b64 {{.*}}, -1 -;CHECK: ds_swizzle -; define amdgpu_ps float @test_strict_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) { +; GFX9-W64-LABEL: test_strict_wwm_within_wqm: +; GFX9-W64: ; %bb.0: ; %main_body +; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec +; GFX9-W64-NEXT: s_wqm_b64 exec, exec +; GFX9-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc +; GFX9-W64-NEXT: s_cbranch_execz .LBB45_2 +; GFX9-W64-NEXT: ; %bb.1: ; %IF +; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: v_cvt_i32_f32_e32 v2, v0 +; GFX9-W64-NEXT: s_not_b64 exec, exec +; GFX9-W64-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-W64-NEXT: s_not_b64 exec, exec +; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-W64-NEXT: ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2) +; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-W64-NEXT: v_cvt_f32_i32_e32 v1, v0 +; GFX9-W64-NEXT: .LBB45_2: ; %ENDIF +; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15] +; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-W64-NEXT: ; return to shader part epilog +; +; GFX10-W32-LABEL: test_strict_wwm_within_wqm: +; GFX10-W32: ; %bb.0: ; %main_body +; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo +; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX10-W32-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-W32-NEXT: s_and_saveexec_b32 s13, vcc_lo +; GFX10-W32-NEXT: s_cbranch_execz .LBB45_2 +; GFX10-W32-NEXT: ; %bb.1: ; %IF +; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: v_cvt_i32_f32_e32 v2, v0 +; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo +; GFX10-W32-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo +; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-W32-NEXT: ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2) +; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2 +; GFX10-W32-NEXT: v_cvt_f32_i32_e32 v1, v0 +; GFX10-W32-NEXT: .LBB45_2: ; %ENDIF +; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13 +; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1 +; GFX10-W32-NEXT: ; return to shader part epilog main_body: %c.bc = bitcast i32 %c to float %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 @@ -1286,14 +2909,66 @@ ENDIF: } ; Check a case of a block being entirely WQM except for a bit of STRICT WQM. -; -;CHECK-LABEL: {{^}}test_strict_wqm_within_wqm: -;CHECK: %IF -;CHECK: s_mov_b64 s{{\[[0-9]+:[0-9]+\]}}, exec -;CHECK: s_wqm_b64 exec, exec -;CHECK: ds_swizzle -; define amdgpu_ps float @test_strict_wqm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) { +; GFX9-W64-LABEL: test_strict_wqm_within_wqm: +; GFX9-W64: ; %bb.0: ; %main_body +; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec +; GFX9-W64-NEXT: s_wqm_b64 exec, exec +; GFX9-W64-NEXT: s_mov_b64 s[14:15], exec +; GFX9-W64-NEXT: s_wqm_b64 exec, exec +; GFX9-W64-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-W64-NEXT: s_mov_b64 exec, s[14:15] +; GFX9-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc +; GFX9-W64-NEXT: s_cbranch_execz .LBB46_2 +; GFX9-W64-NEXT: ; %bb.1: ; %IF +; GFX9-W64-NEXT: s_mov_b64 s[16:17], exec +; GFX9-W64-NEXT: s_wqm_b64 exec, exec +; GFX9-W64-NEXT: image_sample v2, v2, s[0:7], s[8:11] dmask:0x1 +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: image_sample v2, v2, s[0:7], s[8:11] dmask:0x1 +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GFX9-W64-NEXT: ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2) +; GFX9-W64-NEXT: s_mov_b64 exec, s[16:17] +; GFX9-W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-W64-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX9-W64-NEXT: .LBB46_2: ; %ENDIF +; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15] +; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX9-W64-NEXT: ; return to shader part epilog +; +; GFX10-W32-LABEL: test_strict_wqm_within_wqm: +; GFX10-W32: ; %bb.0: ; %main_body +; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo +; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo +; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-W32-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-W32-NEXT: s_mov_b32 exec_lo, s13 +; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-W32-NEXT: s_and_saveexec_b32 s13, vcc_lo +; GFX10-W32-NEXT: s_cbranch_execz .LBB46_2 +; GFX10-W32-NEXT: ; %bb.1: ; %IF +; GFX10-W32-NEXT: s_mov_b32 s14, exec_lo +; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-W32-NEXT: image_sample v2, v2, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: image_sample v2, v2, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GFX10-W32-NEXT: ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2) +; GFX10-W32-NEXT: s_mov_b32 exec_lo, s14 +; GFX10-W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2 +; GFX10-W32-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX10-W32-NEXT: .LBB46_2: ; %ENDIF +; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13 +; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-W32-NEXT: ; return to shader part epilog main_body: %c.bc = bitcast i32 %c to float %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 @@ -1315,28 +2990,110 @@ ENDIF: ret float %r } -;CHECK-LABEL: {{^}}test_strict_wqm_strict_wwm_wqm: -;CHECK: buffer_store_dword - -;CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec -;CHECK: s_wqm_b64 exec, exec -;CHECK: buffer_load_dword -;CHECK: s_mov_b64 exec, [[ORIG]] - -;CHECK: s_or_saveexec_b64 [[ORIG2:s\[[0-9]+:[0-9]+\]]], -1 -;CHECK: buffer_load_dword -;CHECK: s_mov_b64 exec, [[ORIG2]] - -;CHECK: s_mov_b64 [[ORIG3:s\[[0-9]+:[0-9]+\]]], exec -;CHECK: s_wqm_b64 exec, exec -;CHECK: v_add -;CHECK: s_mov_b64 exec, [[ORIG3]] - ;TODO: StrictWQM -> WQM transition could be improved. WQM could use the exec from the previous state instead of calling s_wqm again. -;CHECK: s_wqm_b64 exec, exec -;CHECK: image_sample - define amdgpu_ps float @test_strict_wqm_strict_wwm_wqm(i32 inreg %idx0, i32 inreg %idx1, <4 x i32> inreg %res, <4 x i32> inreg %res2, float %inp, <8 x i32> inreg %res3) { +; GFX9-W64-LABEL: test_strict_wqm_strict_wwm_wqm: +; GFX9-W64: ; %bb.0: ; %main_body +; GFX9-W64-NEXT: s_mov_b64 s[28:29], exec +; GFX9-W64-NEXT: s_mov_b32 s19, s17 +; GFX9-W64-NEXT: s_mov_b64 s[30:31], exec +; GFX9-W64-NEXT: s_wqm_b64 exec, exec +; GFX9-W64-NEXT: s_mov_b32 s23, s5 +; GFX9-W64-NEXT: s_mov_b32 s22, s4 +; GFX9-W64-NEXT: s_mov_b32 s21, s3 +; GFX9-W64-NEXT: s_mov_b32 s20, s2 +; GFX9-W64-NEXT: s_mov_b32 s27, s9 +; GFX9-W64-NEXT: s_mov_b32 s26, s8 +; GFX9-W64-NEXT: s_mov_b32 s25, s7 +; GFX9-W64-NEXT: s_mov_b32 s24, s6 +; GFX9-W64-NEXT: s_mov_b32 s18, s16 +; GFX9-W64-NEXT: s_mov_b32 s17, s15 +; GFX9-W64-NEXT: s_mov_b32 s16, s14 +; GFX9-W64-NEXT: s_mov_b32 s15, s13 +; GFX9-W64-NEXT: s_mov_b32 s14, s12 +; GFX9-W64-NEXT: s_mov_b32 s13, s11 +; GFX9-W64-NEXT: s_mov_b32 s12, s10 +; GFX9-W64-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-W64-NEXT: s_mov_b64 exec, s[30:31] +; GFX9-W64-NEXT: buffer_store_dword v0, v1, s[20:23], 0 idxen +; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec +; GFX9-W64-NEXT: s_wqm_b64 exec, exec +; GFX9-W64-NEXT: buffer_load_dword v2, v1, s[20:23], 0 idxen +; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9-W64-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-W64-NEXT: buffer_load_dword v3, v3, s[24:27], 0 idxen +; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec +; GFX9-W64-NEXT: s_wqm_b64 exec, exec +; GFX9-W64-NEXT: s_waitcnt vmcnt(1) +; GFX9-W64-NEXT: v_add_f32_e32 v2, v2, v2 +; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-W64-NEXT: s_wqm_b64 exec, exec +; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX9-W64-NEXT: s_and_b64 exec, exec, s[28:29] +; GFX9-W64-NEXT: image_sample v0, v0, s[12:19], s[20:23] dmask:0x1 +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: buffer_store_dword v0, v1, s[20:23], 0 idxen +; GFX9-W64-NEXT: buffer_load_dword v0, v1, s[20:23], 0 idxen +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: ; return to shader part epilog +; +; GFX10-W32-LABEL: test_strict_wqm_strict_wwm_wqm: +; GFX10-W32: ; %bb.0: ; %main_body +; GFX10-W32-NEXT: s_mov_b32 s28, exec_lo +; GFX10-W32-NEXT: s_mov_b32 s19, s17 +; GFX10-W32-NEXT: s_mov_b32 s29, exec_lo +; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-W32-NEXT: s_mov_b32 s23, s5 +; GFX10-W32-NEXT: s_mov_b32 s22, s4 +; GFX10-W32-NEXT: s_mov_b32 s21, s3 +; GFX10-W32-NEXT: s_mov_b32 s20, s2 +; GFX10-W32-NEXT: s_mov_b32 s27, s9 +; GFX10-W32-NEXT: s_mov_b32 s26, s8 +; GFX10-W32-NEXT: s_mov_b32 s25, s7 +; GFX10-W32-NEXT: s_mov_b32 s24, s6 +; GFX10-W32-NEXT: s_mov_b32 s18, s16 +; GFX10-W32-NEXT: s_mov_b32 s17, s15 +; GFX10-W32-NEXT: s_mov_b32 s16, s14 +; GFX10-W32-NEXT: s_mov_b32 s15, s13 +; GFX10-W32-NEXT: s_mov_b32 s14, s12 +; GFX10-W32-NEXT: s_mov_b32 s13, s11 +; GFX10-W32-NEXT: s_mov_b32 s12, s10 +; GFX10-W32-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-W32-NEXT: s_mov_b32 exec_lo, s29 +; GFX10-W32-NEXT: buffer_store_dword v0, v1, s[20:23], 0 idxen +; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo +; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-W32-NEXT: buffer_load_dword v2, v1, s[20:23], 0 idxen +; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-W32-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-W32-NEXT: v_mov_b32_e32 v3, s0 +; GFX10-W32-NEXT: buffer_load_dword v3, v3, s[24:27], 0 idxen +; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo +; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-W32-NEXT: s_waitcnt vmcnt(1) +; GFX10-W32-NEXT: v_add_f32_e32 v2, v2, v2 +; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2 +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s28 +; GFX10-W32-NEXT: image_sample v0, v0, s[12:19], s[20:23] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: buffer_store_dword v0, v1, s[20:23], 0 idxen +; GFX10-W32-NEXT: buffer_load_dword v0, v1, s[20:23], 0 idxen +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-W32-NEXT: ; return to shader part epilog main_body: call void @llvm.amdgcn.struct.buffer.store.f32(float %inp, <4 x i32> %res, i32 %idx1, i32 0, i32 0, i32 0) %reload = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx1, i32 0, i32 0, i32 0) @@ -1352,25 +3109,101 @@ main_body: ret float %out } -;CHECK-LABEL: {{^}}test_strict_wwm_strict_wqm_wqm: -;CHECK: buffer_store_dword - -;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1 -;CHECK: buffer_load_dword -;CHECK: s_mov_b64 exec, [[ORIG]] - -;CHECK: s_mov_b64 [[ORIG2:s\[[0-9]+:[0-9]+\]]], exec -;CHECK: s_wqm_b64 exec, exec -;CHECK: buffer_load_dword -;CHECK: s_mov_b64 exec, [[ORIG2]] - -;CHECK: s_or_saveexec_b64 [[ORIG3:s\[[0-9]+:[0-9]+\]]], -1 -;CHECK: v_add -;CHECK: s_mov_b64 exec, [[ORIG3]] - -;CHECK: s_wqm_b64 exec, exec -;CHECK: image_sample define amdgpu_ps float @test_strict_wwm_strict_wqm_wqm(i32 inreg %idx0, i32 inreg %idx1, <4 x i32> inreg %res, float %inp, <8 x i32> inreg %res2) { +; GFX9-W64-LABEL: test_strict_wwm_strict_wqm_wqm: +; GFX9-W64: ; %bb.0: ; %main_body +; GFX9-W64-NEXT: s_mov_b64 s[20:21], exec +; GFX9-W64-NEXT: s_mov_b32 s15, s13 +; GFX9-W64-NEXT: s_mov_b64 s[22:23], exec +; GFX9-W64-NEXT: s_wqm_b64 exec, exec +; GFX9-W64-NEXT: s_mov_b32 s19, s5 +; GFX9-W64-NEXT: s_mov_b32 s18, s4 +; GFX9-W64-NEXT: s_mov_b32 s17, s3 +; GFX9-W64-NEXT: s_mov_b32 s16, s2 +; GFX9-W64-NEXT: s_mov_b32 s14, s12 +; GFX9-W64-NEXT: s_mov_b32 s13, s11 +; GFX9-W64-NEXT: s_mov_b32 s12, s10 +; GFX9-W64-NEXT: s_mov_b32 s11, s9 +; GFX9-W64-NEXT: s_mov_b32 s10, s8 +; GFX9-W64-NEXT: s_mov_b32 s9, s7 +; GFX9-W64-NEXT: s_mov_b32 s8, s6 +; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-W64-NEXT: s_mov_b64 exec, s[22:23] +; GFX9-W64-NEXT: buffer_store_dword v0, v1, s[16:19], 0 idxen +; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9-W64-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-W64-NEXT: buffer_load_dword v2, v2, s[16:19], 0 idxen +; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec +; GFX9-W64-NEXT: s_wqm_b64 exec, exec +; GFX9-W64-NEXT: buffer_load_dword v3, v1, s[16:19], 0 idxen +; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-W64-NEXT: s_waitcnt vmcnt(1) +; GFX9-W64-NEXT: v_add_f32_e32 v2, v2, v2 +; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-W64-NEXT: s_wqm_b64 exec, exec +; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX9-W64-NEXT: s_and_b64 exec, exec, s[20:21] +; GFX9-W64-NEXT: image_sample v0, v0, s[8:15], s[16:19] dmask:0x1 +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: buffer_store_dword v0, v1, s[16:19], 0 idxen +; GFX9-W64-NEXT: buffer_load_dword v0, v1, s[16:19], 0 idxen +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: ; return to shader part epilog +; +; GFX10-W32-LABEL: test_strict_wwm_strict_wqm_wqm: +; GFX10-W32: ; %bb.0: ; %main_body +; GFX10-W32-NEXT: s_mov_b32 s20, exec_lo +; GFX10-W32-NEXT: s_mov_b32 s15, s13 +; GFX10-W32-NEXT: s_mov_b32 s21, exec_lo +; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-W32-NEXT: s_mov_b32 s19, s5 +; GFX10-W32-NEXT: s_mov_b32 s18, s4 +; GFX10-W32-NEXT: s_mov_b32 s17, s3 +; GFX10-W32-NEXT: s_mov_b32 s16, s2 +; GFX10-W32-NEXT: s_mov_b32 s14, s12 +; GFX10-W32-NEXT: s_mov_b32 s13, s11 +; GFX10-W32-NEXT: s_mov_b32 s12, s10 +; GFX10-W32-NEXT: s_mov_b32 s11, s9 +; GFX10-W32-NEXT: s_mov_b32 s10, s8 +; GFX10-W32-NEXT: s_mov_b32 s9, s7 +; GFX10-W32-NEXT: s_mov_b32 s8, s6 +; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-W32-NEXT: s_mov_b32 exec_lo, s21 +; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-W32-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-W32-NEXT: buffer_store_dword v0, v1, s[16:19], 0 idxen +; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-W32-NEXT: buffer_load_dword v2, v2, s[16:19], 0 idxen +; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo +; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-W32-NEXT: buffer_load_dword v3, v1, s[16:19], 0 idxen +; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-W32-NEXT: s_waitcnt vmcnt(1) +; GFX10-W32-NEXT: v_add_f32_e32 v2, v2, v2 +; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2 +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s20 +; GFX10-W32-NEXT: image_sample v0, v0, s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: buffer_store_dword v0, v1, s[16:19], 0 idxen +; GFX10-W32-NEXT: buffer_load_dword v0, v1, s[16:19], 0 idxen +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-W32-NEXT: ; return to shader part epilog main_body: call void @llvm.amdgcn.struct.buffer.store.f32(float %inp, <4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0) %reload = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx1, i32 0, i32 0, i32 0) @@ -1386,20 +3219,96 @@ main_body: ret float %out } -;CHECK-LABEL: {{^}}test_wqm_strict_wqm_wqm: -;CHECK: buffer_store_dword - -;CHECK: s_wqm_b64 exec, exec - ;TODO: WQM -> StrictWQM transition could be improved. StrictWQM could use the exec from the previous state instead of calling s_wqm again. -;CHECK: s_mov_b64 [[ORIG2:s\[[0-9]+:[0-9]+\]]], exec -;CHECK: s_wqm_b64 exec, exec -;CHECK: buffer_load_dword -;CHECK: s_mov_b64 exec, [[ORIG2]] - -;CHECK: image_sample - define amdgpu_ps float @test_wqm_strict_wqm_wqm(i32 inreg %idx0, i32 inreg %idx1, <4 x i32> inreg %res, float %inp, <8 x i32> inreg %res2) { +; GFX9-W64-LABEL: test_wqm_strict_wqm_wqm: +; GFX9-W64: ; %bb.0: ; %main_body +; GFX9-W64-NEXT: s_mov_b64 s[20:21], exec +; GFX9-W64-NEXT: s_mov_b32 s15, s13 +; GFX9-W64-NEXT: s_mov_b64 s[22:23], exec +; GFX9-W64-NEXT: s_wqm_b64 exec, exec +; GFX9-W64-NEXT: s_mov_b32 s19, s5 +; GFX9-W64-NEXT: s_mov_b32 s18, s4 +; GFX9-W64-NEXT: s_mov_b32 s17, s3 +; GFX9-W64-NEXT: s_mov_b32 s16, s2 +; GFX9-W64-NEXT: s_mov_b32 s14, s12 +; GFX9-W64-NEXT: s_mov_b32 s13, s11 +; GFX9-W64-NEXT: s_mov_b32 s12, s10 +; GFX9-W64-NEXT: s_mov_b32 s11, s9 +; GFX9-W64-NEXT: s_mov_b32 s10, s8 +; GFX9-W64-NEXT: s_mov_b32 s9, s7 +; GFX9-W64-NEXT: s_mov_b32 s8, s6 +; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-W64-NEXT: s_mov_b64 exec, s[22:23] +; GFX9-W64-NEXT: buffer_store_dword v0, v1, s[16:19], 0 idxen +; GFX9-W64-NEXT: s_wqm_b64 exec, exec +; GFX9-W64-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-W64-NEXT: buffer_load_dword v0, v0, s[16:19], 0 idxen +; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec +; GFX9-W64-NEXT: s_wqm_b64 exec, exec +; GFX9-W64-NEXT: buffer_load_dword v2, v1, s[16:19], 0 idxen +; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-W64-NEXT: s_waitcnt vmcnt(1) +; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX9-W64-NEXT: image_sample v0, v0, s[8:15], s[16:19] dmask:0x1 +; GFX9-W64-NEXT: s_waitcnt vmcnt(1) +; GFX9-W64-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX9-W64-NEXT: s_and_b64 exec, exec, s[20:21] +; GFX9-W64-NEXT: image_sample v0, v0, s[8:15], s[16:19] dmask:0x1 +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: buffer_store_dword v0, v1, s[16:19], 0 idxen +; GFX9-W64-NEXT: buffer_load_dword v0, v1, s[16:19], 0 idxen +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: ; return to shader part epilog +; +; GFX10-W32-LABEL: test_wqm_strict_wqm_wqm: +; GFX10-W32: ; %bb.0: ; %main_body +; GFX10-W32-NEXT: s_mov_b32 s20, exec_lo +; GFX10-W32-NEXT: s_mov_b32 s15, s13 +; GFX10-W32-NEXT: s_mov_b32 s21, exec_lo +; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-W32-NEXT: s_mov_b32 s19, s5 +; GFX10-W32-NEXT: s_mov_b32 s18, s4 +; GFX10-W32-NEXT: s_mov_b32 s17, s3 +; GFX10-W32-NEXT: s_mov_b32 s16, s2 +; GFX10-W32-NEXT: s_mov_b32 s14, s12 +; GFX10-W32-NEXT: s_mov_b32 s13, s11 +; GFX10-W32-NEXT: s_mov_b32 s12, s10 +; GFX10-W32-NEXT: s_mov_b32 s11, s9 +; GFX10-W32-NEXT: s_mov_b32 s10, s8 +; GFX10-W32-NEXT: s_mov_b32 s9, s7 +; GFX10-W32-NEXT: s_mov_b32 s8, s6 +; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-W32-NEXT: s_mov_b32 exec_lo, s21 +; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-W32-NEXT: v_mov_b32_e32 v3, s1 +; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s20 +; GFX10-W32-NEXT: buffer_store_dword v0, v1, s[16:19], 0 idxen +; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-W32-NEXT: buffer_load_dword v0, v3, s[16:19], 0 idxen +; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo +; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-W32-NEXT: buffer_load_dword v2, v1, s[16:19], 0 idxen +; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-W32-NEXT: s_waitcnt vmcnt(1) +; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-W32-NEXT: image_sample v0, v0, s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s20 +; GFX10-W32-NEXT: image_sample v0, v0, s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: buffer_store_dword v0, v1, s[16:19], 0 idxen +; GFX10-W32-NEXT: buffer_load_dword v0, v1, s[16:19], 0 idxen +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-W32-NEXT: ; return to shader part epilog main_body: call void @llvm.amdgcn.struct.buffer.store.f32(float %inp, <4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0) %reload = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx1, i32 0, i32 0, i32 0) -- 2.7.4