From 20ca49b646b73619b05d1a6908c5ab3416f53f97 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 16 Jan 2020 11:34:19 -0500 Subject: [PATCH] AMDGPU: Update tests to use modern buffer intrinsics --- llvm/test/CodeGen/AMDGPU/amdpal.ll | 4 +- llvm/test/CodeGen/AMDGPU/else.ll | 6 +- llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll | 4 +- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll | 44 +++---- llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll | 12 +- .../test/CodeGen/AMDGPU/mubuf-legalize-operands.ll | 14 +-- llvm/test/CodeGen/AMDGPU/mubuf-shader-vgpr.ll | 12 +- .../AMDGPU/uniform-branch-intrinsic-cond.ll | 5 +- .../vgpr-descriptor-waterfall-loop-idom-update.ll | 7 +- .../AMDGPU/vgpr-spill-emergency-stack-slot.ll | 4 +- llvm/test/CodeGen/AMDGPU/wait.ll | 10 +- llvm/test/CodeGen/AMDGPU/wave32.ll | 8 +- llvm/test/CodeGen/AMDGPU/wqm.ll | 139 +++++++++++---------- 13 files changed, 137 insertions(+), 132 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/amdpal.ll b/llvm/test/CodeGen/AMDGPU/amdpal.ll index c6d082d..7d6f010 100644 --- a/llvm/test/CodeGen/AMDGPU/amdpal.ll +++ b/llvm/test/CodeGen/AMDGPU/amdpal.ll @@ -77,13 +77,13 @@ entry: %e = getelementptr [2 x i32], [2 x i32] addrspace(5)* %v1a, i32 0, i32 %idx %x = load i32, i32 addrspace(5)* %e %xf = bitcast i32 %x to float - call void @llvm.amdgcn.buffer.store.f32(float %xf, <4 x i32> undef, i32 0, i32 0, i1 0, i1 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %xf, <4 x i32> undef, i32 0, i32 0, i32 0) ret void } attributes #0 = { nounwind "amdgpu-git-ptr-high"="0x1234" } -declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) +declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32 immarg) ; Check we have CS_NUM_USED_VGPRS in PAL metadata. diff --git a/llvm/test/CodeGen/AMDGPU/else.ll b/llvm/test/CodeGen/AMDGPU/else.ll index e641f42..479d205 100644 --- a/llvm/test/CodeGen/AMDGPU/else.ll +++ b/llvm/test/CodeGen/AMDGPU/else.ll @@ -49,12 +49,12 @@ else: end: %r = phi float [ %v.if, %if ], [ %v.else, %else ] - call void @llvm.amdgcn.buffer.store.f32(float %r, <4 x i32> undef, i32 0, i32 0, i1 0, i1 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %r, <4 x i32> undef, i32 0, i32 0, i32 0) ret void } -declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #1 -declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #2 +declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32 immarg) #1 +declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 immarg, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #2 attributes #0 = { nounwind } attributes #1 = { nounwind writeonly } diff --git a/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll b/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll index 51d1c09..f11b44d 100644 --- a/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll @@ -29,7 +29,7 @@ for.body: br i1 %cc, label %mid.loop, label %for.end mid.loop: - %v = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %tid, i32 %i, i1 false, i1 false) + %v = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %tid, i32 %i, i32 0, i32 0) %cc2 = fcmp oge float %v, 0.0 br i1 %cc2, label %end.loop, label %for.end @@ -48,7 +48,7 @@ end: ret void } -declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #0 +declare float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32>, i32, i32, i32, i32 immarg) #0 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1 attributes #0 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll index 3d35e32..6a920d8 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll @@ -9,8 +9,8 @@ ;CHECK: v_add_f32_e32 define amdgpu_ps float @test1(i32 inreg %idx0, i32 inreg %idx1) { main_body: - %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0) - %src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0) + %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) + %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) %out = fadd float %src0, %src1 %out.0 = call float @llvm.amdgcn.softwqm.f32(float %out) ret float %out.0 @@ -25,8 +25,8 @@ main_body: ;CHECK: v_add_f32_e32 define amdgpu_ps float @test2(i32 inreg %idx0, i32 inreg %idx1) { main_body: - %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0) - %src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0) + %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) + %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) %out = fadd float %src0, %src1 %out.0 = bitcast float %out to i32 %out.1 = call i32 @llvm.amdgcn.softwqm.i32(i32 %out.0) @@ -45,10 +45,10 @@ main_body: ;CHECK: v_add_f32_e32 define amdgpu_ps float @test_softwqm1(i32 inreg %idx0, i32 inreg %idx1) { main_body: - %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0) - %src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0) + %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) + %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) %temp = fadd float %src0, %src1 - call void @llvm.amdgcn.buffer.store.f32(float %temp, <4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0) + call void @llvm.amdgcn.struct.buffer.store.f32(float %temp, <4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) %out = fadd float %temp, %temp %out.0 = call float @llvm.amdgcn.softwqm.f32(float %out) ret float %out.0 @@ -67,11 +67,11 @@ main_body: ;CHECK: v_add_f32_e32 define amdgpu_ps float @test_softwqm2(i32 inreg %idx0, i32 inreg %idx1) { main_body: - %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0) - %src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0) + %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) + %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) %temp = fadd float %src0, %src1 %temp.0 = call float @llvm.amdgcn.wqm.f32(float %temp) - call void @llvm.amdgcn.buffer.store.f32(float %temp.0, <4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0) + call void @llvm.amdgcn.struct.buffer.store.f32(float %temp.0, <4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) %out = fadd float %temp, %temp %out.0 = call float @llvm.amdgcn.softwqm.f32(float %out) ret float %out.0 @@ -89,9 +89,9 @@ main_body: ;CHECK-NOT: s_wqm_b64 define amdgpu_ps float @test_wwm1(i32 inreg %idx0, i32 inreg %idx1) { main_body: - %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0) - call void @llvm.amdgcn.buffer.store.f32(float %src0, <4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0) - %src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0) + %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.f32(float %src0, <4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) + %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) %temp = fadd float %src0, %src1 %temp.0 = call float @llvm.amdgcn.wwm.f32(float %temp) %out = fadd float %temp.0, %temp.0 @@ -115,14 +115,14 @@ main_body: br i1 %cmp, label %IF, label %ELSE IF: - %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0) - %src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0) + %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) + %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) %out = fadd float %src0, %src1 %data.if = call float @llvm.amdgcn.softwqm.f32(float %out) br label %END ELSE: - call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 %c, i32 0, i1 0, i1 0) + call void @llvm.amdgcn.struct.buffer.store.f32(float %data, <4 x i32> undef, i32 %c, i32 0, i32 0, i32 0) br label %END END: @@ -157,14 +157,14 @@ main_body: br i1 %cmp, label %IF, label %ELSE IF: - %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0) - %src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0) + %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) + %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) %out = fadd float %src0, %src1 %data.if = call float @llvm.amdgcn.softwqm.f32(float %out) br label %END ELSE: - call void @llvm.amdgcn.buffer.store.f32(float %data.sample, <4 x i32> undef, i32 %c, i32 0, i1 0, i1 0) + call void @llvm.amdgcn.struct.buffer.store.f32(float %data.sample, <4 x i32> undef, i32 %c, i32 0, i32 0, i32 0) br label %END END: @@ -172,9 +172,9 @@ END: ret float %r } -declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #2 -declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #2 -declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #3 +declare void @llvm.amdgcn.struct.buffer.store.f32(float, <4 x i32>, i32, i32, i32, i32 immarg) #2 +declare void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32 immarg) #2 +declare float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32>, i32, i32, i32, i32 immarg) #3 declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3 declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3 declare void @llvm.amdgcn.kill(i1) #1 diff --git a/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll b/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll index f374276..e60f0dd 100644 --- a/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll +++ b/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll @@ -22,7 +22,7 @@ loop: br i1 %tmp27, label %then, label %endif then: ; preds = %bb - call void @llvm.amdgcn.buffer.store.f32(float undef, <4 x i32> undef, i32 0, i32 undef, i1 false, i1 false) #1 + call void @llvm.amdgcn.raw.buffer.store.f32(float undef, <4 x i32> undef, i32 0, i32 undef, i32 0) br label %endif endif: ; preds = %bb28, %bb @@ -49,7 +49,7 @@ loop: %tmp23phi = phi i32 [ %tmp23, %loop ], [ 0, %entry ] %tmp23 = add nuw i32 %tmp23phi, 1 %tmp27 = icmp ult i32 %arg, %tmp23 - call void @llvm.amdgcn.buffer.store.f32(float undef, <4 x i32> undef, i32 0, i32 undef, i1 false, i1 false) #1 + call void @llvm.amdgcn.raw.buffer.store.f32(float undef, <4 x i32> undef, i32 0, i32 undef, i32 0) br i1 %tmp27, label %loop, label %loopexit loopexit: @@ -76,7 +76,7 @@ loop: br i1 %tmp27, label %then, label %endif then: ; preds = %bb - call void @llvm.amdgcn.buffer.store.f32(float undef, <4 x i32> undef, i32 0, i32 undef, i1 false, i1 false) #1 + call void @llvm.amdgcn.raw.buffer.store.f32(float undef, <4 x i32> undef, i32 0, i32 undef, i32 0) br label %endif endif: ; preds = %bb28, %bb @@ -86,8 +86,6 @@ loopexit: ret void } +declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32 immarg) #0 -declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #3 - -attributes #3 = { nounwind writeonly } - +attributes #0 = { nounwind writeonly } diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll index 3b5d4d5..64206d4 100644 --- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll +++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll @@ -42,7 +42,7 @@ ; W32: v_mov_b32_e32 v0, [[RES]] define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 { - %call = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> %i, i32 %c, i32 0, i1 zeroext false, i1 zeroext false) #1 + %call = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %i, i32 %c, i32 0, i32 0, i32 0) #1 ret float %call } @@ -128,8 +128,8 @@ define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 { define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, float addrspace(1)* %out0, float addrspace(1)* %out1) #0 { entry: - %val0 = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> %i, i32 %c, i32 0, i1 zeroext false, i1 zeroext false) #1 - %val1 = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> %j, i32 %c, i32 0, i1 zeroext false, i1 zeroext false) #1 + %val0 = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %i, i32 %c, i32 0, i32 0, i32 0) #1 + %val1 = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %j, i32 %c, i32 0, i32 0, i32 0) #1 store volatile float %val0, float addrspace(1)* %out0 store volatile float %val1, float addrspace(1)* %out1 ret void @@ -317,13 +317,13 @@ entry: define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, float addrspace(1)* %in, float addrspace(1)* %out) #0 { entry: %live.out.reg = call i32 asm sideeffect "s_mov_b32 $0, 17", "={s4}" () - %val0 = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> %i, i32 %live.out.reg, i32 0, i1 zeroext false, i1 zeroext false) #1 + %val0 = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %i, i32 %live.out.reg, i32 0, i32 0, i32 0) #1 %idx = call i32 @llvm.amdgcn.workitem.id.x() #1 %cmp = icmp eq i32 %idx, 0 br i1 %cmp, label %bb1, label %bb2 bb1: - %val1 = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> %j, i32 %live.out.reg, i32 0, i1 zeroext false, i1 zeroext false) #1 + %val1 = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %j, i32 %live.out.reg, i32 0, i32 0, i32 0) #1 br label %bb2 bb2: @@ -333,7 +333,7 @@ bb2: } declare i32 @llvm.amdgcn.workitem.id.x() #1 -declare float @llvm.amdgcn.buffer.load.format.f32(<4 x i32>, i32, i32, i1, i1) #1 +declare float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32>, i32, i32, i32, i32 immarg) #1 attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } +attributes #1 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-shader-vgpr.ll b/llvm/test/CodeGen/AMDGPU/mubuf-shader-vgpr.ll index d6fad36..9057bfb 100644 --- a/llvm/test/CodeGen/AMDGPU/mubuf-shader-vgpr.ll +++ b/llvm/test/CodeGen/AMDGPU/mubuf-shader-vgpr.ll @@ -9,7 +9,7 @@ define amdgpu_vs float @test_none(<4 x i32> addrspace(4)* inreg %base, i32 %i) { main_body: %ptr = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %base, i32 %i %tmp2 = load <4 x i32>, <4 x i32> addrspace(4)* %ptr, align 32 - %tmp7 = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> %tmp2, i32 0, i32 0, i1 0, i1 0) + %tmp7 = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> %tmp2, i32 0, i32 0, i32 0) ret float %tmp7 } @@ -19,7 +19,7 @@ define amdgpu_vs float @test_idxen(<4 x i32> addrspace(4)* inreg %base, i32 %i) main_body: %ptr = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %base, i32 %i %tmp2 = load <4 x i32>, <4 x i32> addrspace(4)* %ptr, align 32 - %tmp7 = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> %tmp2, i32 undef, i32 0, i1 0, i1 0) + %tmp7 = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %tmp2, i32 undef, i32 0, i32 0, i32 0) ret float %tmp7 } @@ -29,7 +29,7 @@ define amdgpu_vs float @test_offen(<4 x i32> addrspace(4)* inreg %base, i32 %i) main_body: %ptr = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %base, i32 %i %tmp2 = load <4 x i32>, <4 x i32> addrspace(4)* %ptr, align 32 - %tmp7 = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> %tmp2, i32 0, i32 undef, i1 0, i1 0) + %tmp7 = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> %tmp2, i32 undef, i32 0, i32 0) ret float %tmp7 } @@ -39,10 +39,12 @@ define amdgpu_vs float @test_both(<4 x i32> addrspace(4)* inreg %base, i32 %i) { main_body: %ptr = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %base, i32 %i %tmp2 = load <4 x i32>, <4 x i32> addrspace(4)* %ptr, align 32 - %tmp7 = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> %tmp2, i32 undef, i32 undef, i1 0, i1 0) + %tmp7 = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %tmp2, i32 undef, i32 undef, i32 0, i32 0) ret float %tmp7 } -declare float @llvm.amdgcn.buffer.load.format.f32(<4 x i32>, i32, i32, i1, i1) nounwind readonly +declare float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32>, i32, i32, i32, i32 immarg) #1 +declare float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32>, i32, i32, i32 immarg) #1 attributes #0 = { nounwind readnone } +attributes #1 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/uniform-branch-intrinsic-cond.ll b/llvm/test/CodeGen/AMDGPU/uniform-branch-intrinsic-cond.ll index eb6007f..e1fd603 100644 --- a/llvm/test/CodeGen/AMDGPU/uniform-branch-intrinsic-cond.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-branch-intrinsic-cond.ll @@ -8,7 +8,7 @@ ; CHECK: s_cbranch_vccnz define amdgpu_ps float @main(<4 x i32> inreg %rsrc) { main_body: - %v = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 0, i1 true, i1 false) + %v = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 0, i32 1) %cc = fcmp une float %v, 1.000000e+00 br i1 %cc, label %if, label %else @@ -22,7 +22,6 @@ else: ret float %r } -; Function Attrs: nounwind readonly -declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #0 +declare float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32>, i32, i32, i32 immarg) #0 attributes #0 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll b/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll index d3cb4ec..c3f41a4 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll @@ -39,9 +39,10 @@ entry: bb0: %desc = load <4 x i32>, <4 x i32>* %arg, align 8 - tail call void @llvm.amdgcn.buffer.store.f32(float undef, <4 x i32> %desc, i32 0, i32 undef, i1 zeroext false, i1 zeroext false) + tail call void @llvm.amdgcn.raw.buffer.store.f32(float undef, <4 x i32> %desc, i32 undef, i32 0, i32 0) br label %bb0 } -; Function Attrs: nounwind writeonly -declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1 immarg, i1 immarg) +declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32 immarg) #0 + +attributes #0 = { nounwind writeonly } diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll index 4a94ce1..b1ad28a 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll @@ -38,7 +38,7 @@ bb: %tmp16 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp15, align 16, !tbaa !0 %tmp17 = add i32 %arg5, %arg7 %tmp16.cast = bitcast <4 x i32> %tmp16 to <4 x i32> - %tmp18 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %tmp16.cast, i32 %tmp17, i32 0, i1 false, i1 false) + %tmp18 = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %tmp16.cast, i32 %tmp17, i32 0, i32 0, i32 0) %tmp19 = extractelement <4 x float> %tmp18, i32 0 %tmp20 = extractelement <4 x float> %tmp18, i32 1 %tmp21 = extractelement <4 x float> %tmp18, i32 2 @@ -489,7 +489,7 @@ declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32) #1 -declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i1, i1) #2 +declare <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32>, i32, i32, i32, i32 immarg) #2 attributes #0 = { nounwind } attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/wait.ll b/llvm/test/CodeGen/AMDGPU/wait.ll index c06c16e..dcc7bfa 100644 --- a/llvm/test/CodeGen/AMDGPU/wait.ll +++ b/llvm/test/CodeGen/AMDGPU/wait.ll @@ -20,7 +20,7 @@ main_body: %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(4)* %arg3, i32 0 %tmp10 = load <16 x i8>, <16 x i8> addrspace(4)* %tmp, !tbaa !0 %tmp10.cast = bitcast <16 x i8> %tmp10 to <4 x i32> - %tmp11 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %tmp10.cast, i32 %arg6, i32 0, i1 false, i1 false) + %tmp11 = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %tmp10.cast, i32 %arg6, i32 0, i32 0, i32 0) %tmp12 = extractelement <4 x float> %tmp11, i32 0 %tmp13 = extractelement <4 x float> %tmp11, i32 1 call void @llvm.amdgcn.s.barrier() #1 @@ -29,7 +29,7 @@ main_body: %tmp16 = getelementptr <16 x i8>, <16 x i8> addrspace(4)* %arg3, i32 1 %tmp17 = load <16 x i8>, <16 x i8> addrspace(4)* %tmp16, !tbaa !0 %tmp17.cast = bitcast <16 x i8> %tmp17 to <4 x i32> - %tmp18 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %tmp17.cast, i32 %arg6, i32 0, i1 false, i1 false) + %tmp18 = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %tmp17.cast, i32 %arg6, i32 0, i32 0, i32 0) %tmp19 = extractelement <4 x float> %tmp18, i32 0 %tmp20 = extractelement <4 x float> %tmp18, i32 1 %tmp21 = extractelement <4 x float> %tmp18, i32 2 @@ -56,7 +56,7 @@ main_body: %tmp11 = load <16 x i8>, <16 x i8> addrspace(4)* %tmp, align 16, !tbaa !0 %tmp12 = add i32 %arg5, %arg7 %tmp11.cast = bitcast <16 x i8> %tmp11 to <4 x i32> - %tmp13 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %tmp11.cast, i32 %tmp12, i32 0, i1 false, i1 false) + %tmp13 = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %tmp11.cast, i32 %tmp12, i32 0, i32 0, i32 0) %tmp14 = extractelement <4 x float> %tmp13, i32 0 %tmp15 = extractelement <4 x float> %tmp13, i32 1 %tmp16 = extractelement <4 x float> %tmp13, i32 2 @@ -65,7 +65,7 @@ main_body: %tmp19 = load <16 x i8>, <16 x i8> addrspace(4)* %tmp18, align 16, !tbaa !0 %tmp20 = add i32 %arg5, %arg7 %tmp19.cast = bitcast <16 x i8> %tmp19 to <4 x i32> - %tmp21 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %tmp19.cast, i32 %tmp20, i32 0, i1 false, i1 false) + %tmp21 = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %tmp19.cast, i32 %tmp20, i32 0, i32 0, i32 0) %tmp22 = extractelement <4 x float> %tmp21, i32 0 %tmp23 = extractelement <4 x float> %tmp21, i32 1 %tmp24 = extractelement <4 x float> %tmp21, i32 2 @@ -76,7 +76,7 @@ main_body: } declare void @llvm.amdgcn.s.barrier() #1 -declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i1, i1) #2 +declare <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32>, i32, i32, i32, i32 immarg) #2 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll index b05a8ac..9031428 100644 --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -721,7 +721,7 @@ main_body: br i1 %cc, label %endif, label %if if: - %src = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i1 0, i1 0) + %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0) %out = fadd float %src, %src %out.0 = call float @llvm.amdgcn.wwm.f32(float %out) %out.1 = fadd float %src, %out.0 @@ -758,8 +758,8 @@ main_body: ; GFX1064: s_and_b64 exec, exec, s[{{[0-9:]+}}] define amdgpu_ps float @test_wqm2(i32 inreg %idx0, i32 inreg %idx1) #0 { main_body: - %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0) - %src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0) + %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) + %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) %out = fadd float %src0, %src1 %out.0 = bitcast float %out to i32 %out.1 = call i32 @llvm.amdgcn.wqm.i32(i32 %out.0) @@ -1108,7 +1108,7 @@ declare float @llvm.amdgcn.wwm.f32(float) declare i32 @llvm.amdgcn.wqm.i32(i32) declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) -declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) +declare float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32>, i32, i32, i32, i32 immarg) declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) declare i64 @llvm.amdgcn.fcmp.i64.f32(float, float, i32) diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll index f3a44a7..1026d63 100644 --- a/llvm/test/CodeGen/AMDGPU/wqm.ll +++ b/llvm/test/CodeGen/AMDGPU/wqm.ll @@ -32,7 +32,7 @@ main_body: %inst26 = tail call float @llvm.amdgcn.interp.p2(float %inst25, float %inst24, i32 0, i32 0, i32 %m0) %inst28 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 1, i32 0, i32 %m0) %inst29 = tail call float @llvm.amdgcn.interp.p2(float %inst28, float %inst24, i32 1, i32 0, i32 %m0) - %tex = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %inst26, float %inst29, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + %tex = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %inst26, float %inst29, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 ret <4 x float> %tex } @@ -49,11 +49,11 @@ main_body: ;CHECK: .size test3 define amdgpu_ps <4 x float> @test3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %c) { main_body: - %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 %tex.1 = bitcast <4 x float> %tex to <4 x i32> %tex.2 = extractelement <4 x i32> %tex.1, i32 0 - call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %tex, <4 x i32> undef, i32 %tex.2, i32 0, i1 0, i1 0) + call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %tex, <4 x i32> undef, i32 %tex.2, i32 0, i32 0, i32 0) ret <4 x float> %tex } @@ -77,7 +77,7 @@ main_body: %inst26 = tail call float @llvm.amdgcn.interp.p2(float %inst25, float %inst24, i32 0, i32 0, i32 %m0) %inst28 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 1, i32 0, i32 %m0) %inst29 = tail call float @llvm.amdgcn.interp.p2(float %inst28, float %inst24, i32 1, i32 0, i32 %m0) - %tex = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %inst26, float %inst29, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + %tex = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %inst26, float %inst29, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 %tex.0 = extractelement <4 x float> %tex, i32 0 %tex.1 = extractelement <4 x float> %tex, i32 1 %tex.2 = extractelement <4 x float> %tex, i32 2 @@ -102,11 +102,11 @@ define amdgpu_ps <4 x float> @test4(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp main_body: %c.1 = mul i32 %c, %d - call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> undef, <4 x i32> undef, i32 %c.1, i32 0, i1 0, i1 0) + call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> undef, <4 x i32> undef, i32 %c.1, i32 0, i32 0, i32 0) %c.1.bc = bitcast i32 %c.1 to float - %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.1.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.1.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 %tex0 = extractelement <4 x float> %tex, i32 0 - %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 ret <4 x float> %dtex } @@ -122,8 +122,8 @@ main_body: ;CHECK-NOT: v_mov_b32_e32 define amdgpu_ps float @test5(i32 inreg %idx0, i32 inreg %idx1) { main_body: - %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0) - %src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0) + %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) + %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) %out = fadd float %src0, %src1 %out.0 = call float @llvm.amdgcn.wqm.f32(float %out) ret float %out.0 @@ -138,8 +138,8 @@ main_body: ;CHECK: v_add_f32_e32 define amdgpu_ps float @test6(i32 inreg %idx0, i32 inreg %idx1) { main_body: - %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0) - %src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0) + %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) + %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) %out = fadd float %src0, %src1 %out.0 = bitcast float %out to i32 %out.1 = call i32 @llvm.amdgcn.wqm.i32(i32 %out.0) @@ -156,8 +156,8 @@ main_body: ;CHECK: v_add_f32_e32 define amdgpu_ps float @test_wwm1(i32 inreg %idx0, i32 inreg %idx1) { main_body: - %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0) - %src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0) + %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) + %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) %out = fadd float %src0, %src1 %out.0 = call float @llvm.amdgcn.wwm.f32(float %out) ret float %out.0 @@ -172,8 +172,8 @@ main_body: ;CHECK: v_add_{{[iu]}}32_e32 define amdgpu_ps float @test_wwm2(i32 inreg %idx0, i32 inreg %idx1) { main_body: - %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0) - %src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0) + %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) + %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) %src0.0 = bitcast float %src0 to i32 %src1.0 = bitcast float %src1 to i32 %out = add i32 %src0.0, %src1.0 @@ -201,7 +201,7 @@ main_body: br i1 %cc, label %endif, label %if if: - %src = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i1 0, i1 0) + %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0) %out = fadd float %src, %src %out.0 = call float @llvm.amdgcn.wwm.f32(float %out) %out.1 = fadd float %src, %out.0 @@ -230,7 +230,7 @@ main_body: br i1 %cc, label %endif, label %if if: - %src = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i1 0, i1 0) + %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0) %out = fadd float %src, %src %out.0 = call float @llvm.amdgcn.wwm.f32(float %out) br label %endif @@ -252,9 +252,9 @@ endif: ;CHECK: s_wqm_b64 exec, exec define amdgpu_ps float @test_wwm5(i32 inreg %idx0, i32 inreg %idx1) { main_body: - %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0) - call void @llvm.amdgcn.buffer.store.f32(float %src0, <4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0) - %src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0) + %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.f32(float %src0, <4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) + %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) %temp = fadd float %src1, %src1 %temp.0 = call float @llvm.amdgcn.wwm.f32(float %temp) %out = fadd float %temp.0, %temp.0 @@ -341,13 +341,13 @@ endloop: ;CHECK: v_add_{{[iu]}}32_e32 define amdgpu_ps void @test_set_inactive1(i32 inreg %idx) { main_body: - %src = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i1 0, i1 0) + %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0) %src.0 = bitcast float %src to i32 %src.1 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %src.0, i32 0) %out = add i32 %src.1, %src.1 %out.0 = call i32 @llvm.amdgcn.wwm.i32(i32 %out) %out.1 = bitcast i32 %out.0 to float - call void @llvm.amdgcn.buffer.store.f32(float %out.1, <4 x i32> undef, i32 %idx, i32 0, i1 0, i1 0) + call void @llvm.amdgcn.struct.buffer.store.f32(float %out.1, <4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0) ret void } @@ -359,15 +359,15 @@ main_body: ;CHECK: buffer_load_dword define amdgpu_ps void @test_set_inactive2(i32 inreg %idx0, i32 inreg %idx1) { main_body: - %src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0) + %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) %src1.0 = bitcast float %src1 to i32 %src1.1 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %src1.0, i32 undef) - %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0) + %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) %src0.0 = bitcast float %src0 to i32 %src0.1 = call i32 @llvm.amdgcn.wqm.i32(i32 %src0.0) %out = add i32 %src0.1, %src1.1 %out.0 = bitcast i32 %out to float - call void @llvm.amdgcn.buffer.store.f32(float %out.0, <4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0) + call void @llvm.amdgcn.struct.buffer.store.f32(float %out.0, <4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) ret void } @@ -396,14 +396,14 @@ main_body: IF: %c.bc = bitcast i32 %c to float - %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 %tex0 = extractelement <4 x float> %tex, i32 0 - %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 %data.if = extractelement <4 x float> %dtex, i32 0 br label %END ELSE: - call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 %c, i32 0, i1 0, i1 0) + call void @llvm.amdgcn.struct.buffer.store.f32(float %data, <4 x i32> undef, i32 %c, i32 0, i32 0, i32 0) br label %END END: @@ -439,14 +439,14 @@ main_body: IF: %c.bc = bitcast i32 %c to float - %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 %tex0 = extractelement <4 x float> %tex, i32 0 - %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 %data.if = extractelement <4 x float> %dtex, i32 0 br label %END ELSE: - call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 %c, i32 0, i1 0, i1 0) + call void @llvm.amdgcn.struct.buffer.store.f32(float %data, <4 x i32> undef, i32 %c, i32 0, i32 0, i32 0) br label %END END: @@ -472,16 +472,16 @@ define amdgpu_ps <4 x float> @test_control_flow_2(<8 x i32> inreg %rsrc, <4 x i3 main_body: %idx.1 = extractelement <3 x i32> %idx, i32 0 %data.1 = extractelement <2 x float> %data, i32 0 - call void @llvm.amdgcn.buffer.store.f32(float %data.1, <4 x i32> undef, i32 %idx.1, i32 0, i1 0, i1 0) + call void @llvm.amdgcn.struct.buffer.store.f32(float %data.1, <4 x i32> undef, i32 %idx.1, i32 0, i32 0, i32 0) ; The load that determines the branch (and should therefore be WQM) is ; surrounded by stores that require disabled WQM. %idx.2 = extractelement <3 x i32> %idx, i32 1 - %z = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx.2, i32 0, i1 0, i1 0) + %z = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx.2, i32 0, i32 0, i32 0) %idx.3 = extractelement <3 x i32> %idx, i32 2 %data.3 = extractelement <2 x float> %data, i32 1 - call void @llvm.amdgcn.buffer.store.f32(float %data.3, <4 x i32> undef, i32 %idx.3, i32 0, i1 0, i1 0) + call void @llvm.amdgcn.struct.buffer.store.f32(float %data.3, <4 x i32> undef, i32 %idx.3, i32 0, i32 0, i32 0) %cc = fcmp ogt float %z, 0.0 br i1 %cc, label %IF, label %ELSE @@ -497,7 +497,7 @@ ELSE: END: %coord.END = phi i32 [ %coord.IF, %IF ], [ %coord.ELSE, %ELSE ] %coord.END.bc = bitcast i32 %coord.END to float - %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord.END.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord.END.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 ret <4 x float> %tex } @@ -514,11 +514,11 @@ END: ;CHECK-DAG: store define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %coord) { main_body: - %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 %tex0 = extractelement <4 x float> %tex, i32 0 - %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 %dtex.1 = extractelement <4 x float> %dtex, i32 0 - call void @llvm.amdgcn.buffer.store.f32(float %dtex.1, <4 x i32> undef, i32 %idx, i32 0, i1 0, i1 0) + call void @llvm.amdgcn.struct.buffer.store.f32(float %dtex.1, <4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0) %cc = fcmp ogt float %dtex.1, 0.0 br i1 %cc, label %IF, label %ELSE @@ -556,14 +556,14 @@ main_body: br i1 %cond, label %IF, label %END IF: - %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 0, i32 0, i1 0, i1 0) - call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 1, i32 0, i1 0, i1 0) + %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> undef, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.f32(float %data, <4 x i32> undef, i32 1, i32 0, i32 0, i32 0) br label %END END: - %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 %tex0 = extractelement <4 x float> %tex, i32 0 - %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 ret <4 x float> %dtex } @@ -584,20 +584,20 @@ END: ;CHECK: image_sample define amdgpu_ps <4 x float> @test_kill_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <2 x i32> %idx, <2 x float> %data, float %coord, float %coord2, float %z) { main_body: - %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 %idx.0 = extractelement <2 x i32> %idx, i32 0 %data.0 = extractelement <2 x float> %data, i32 0 - call void @llvm.amdgcn.buffer.store.f32(float %data.0, <4 x i32> undef, i32 %idx.0, i32 0, i1 0, i1 0) + call void @llvm.amdgcn.struct.buffer.store.f32(float %data.0, <4 x i32> undef, i32 %idx.0, i32 0, i32 0, i32 0) %z.cmp = fcmp olt float %z, 0.0 call void @llvm.amdgcn.kill(i1 %z.cmp) %idx.1 = extractelement <2 x i32> %idx, i32 1 %data.1 = extractelement <2 x float> %data, i32 1 - call void @llvm.amdgcn.buffer.store.f32(float %data.1, <4 x i32> undef, i32 %idx.1, i32 0, i1 0, i1 0) - %tex2 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord2, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + call void @llvm.amdgcn.struct.buffer.store.f32(float %data.1, <4 x i32> undef, i32 %idx.1, i32 0, i32 0, i32 0) + %tex2 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord2, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 %tex2.0 = extractelement <4 x float> %tex2, i32 0 - %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex2.0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex2.0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 %out = fadd <4 x float> %tex, %dtex ret <4 x float> %out @@ -617,11 +617,11 @@ main_body: ; CHECK: v_cmpx_ define amdgpu_ps <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) { main_body: - %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 %tex0 = extractelement <4 x float> %tex, i32 0 - %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 - call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 0, i32 0, i1 0, i1 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %data, <4 x i32> undef, i32 0, i32 0, i32 0) %z.cmp = fcmp olt float %z, 0.0 call void @llvm.amdgcn.kill(i1 %z.cmp) @@ -675,7 +675,7 @@ loop: body: %c.iv0 = extractelement <4 x float> %c.iv, i32 0 - %c.next = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.iv0, <8 x i32> undef, <4 x i32> undef, i1 0, i32 0, i32 0) #0 + %c.next = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.iv0, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0 %ctr.next = fadd float %ctr.iv, 2.0 br label %loop @@ -706,18 +706,18 @@ define amdgpu_ps void @test_alloca(float %data, i32 %a, i32 %idx) nounwind { entry: %array = alloca [32 x i32], align 4, addrspace(5) - call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 0, i32 0, i1 0, i1 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %data, <4 x i32> undef, i32 0, i32 0, i32 0) %s.gep = getelementptr [32 x i32], [32 x i32] addrspace(5)* %array, i32 0, i32 0 store volatile i32 %a, i32 addrspace(5)* %s.gep, align 4 - call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 1, i32 0, i1 0, i1 0) + call void @llvm.amdgcn.struct.buffer.store.f32(float %data, <4 x i32> undef, i32 1, i32 0, i32 0, i32 0) %c.gep = getelementptr [32 x i32], [32 x i32] addrspace(5)* %array, i32 0, i32 %idx %c = load i32, i32 addrspace(5)* %c.gep, align 4 %c.bc = bitcast i32 %c to float - %t = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> undef, <4 x i32> undef, i1 0, i32 0, i32 0) #0 - call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %t, <4 x i32> undef, i32 0, i32 0, i1 0, i1 0) + %t = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0 + call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %t, <4 x i32> undef, i32 0, i32 0, i32 0) ret void } @@ -734,9 +734,9 @@ entry: ; CHECK: s_and_b64 exec, exec, [[LIVE]] ; CHECK-NOT: exec define amdgpu_ps <4 x float> @test_nonvoid_return() nounwind { - %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float undef, <8 x i32> undef, <4 x i32> undef, i1 0, i32 0, i32 0) #0 + %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float undef, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0 %tex0 = extractelement <4 x float> %tex, i32 0 - %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> undef, <4 x i32> undef, i1 0, i32 0, i32 0) #0 + %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0 ret <4 x float> %dtex } @@ -748,9 +748,9 @@ define amdgpu_ps <4 x float> @test_nonvoid_return() nounwind { ; CHECK-NOT: exec define amdgpu_ps <4 x float> @test_nonvoid_return_unreachable(i32 inreg %c) nounwind { entry: - %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float undef, <8 x i32> undef, <4 x i32> undef, i1 0, i32 0, i32 0) #0 + %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float undef, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0 %tex0 = extractelement <4 x float> %tex, i32 0 - %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> undef, <4 x i32> undef, i1 0, i32 0, i32 0) #0 + %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0 %cc = icmp sgt i32 %c, 0 br i1 %cc, label %if, label %else @@ -782,16 +782,16 @@ main_body: br i1 %cc, label %if, label %else if: - %r.if = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float 0.0, <8 x i32> undef, <4 x i32> undef, i1 0, i32 0, i32 0) #0 + %r.if = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float 0.0, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0 br label %end else: - %r.else = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 0.0, float bitcast (i32 1 to float), <8 x i32> undef, <4 x i32> undef, i1 0, i32 0, i32 0) #0 + %r.else = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 0.0, float bitcast (i32 1 to float), <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0 br label %end end: %r = phi <4 x float> [ %r.if, %if ], [ %r.else, %else ] - call void @llvm.amdgcn.buffer.store.f32(float 1.0, <4 x i32> undef, i32 %idx, i32 0, i1 0, i1 0) + call void @llvm.amdgcn.struct.buffer.store.f32(float 1.0, <4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0) ret <4 x float> %r } @@ -806,9 +806,9 @@ end: define amdgpu_ps float @test_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) { main_body: %c.bc = bitcast i32 %c to float - %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 %tex0 = extractelement <4 x float> %tex, i32 0 - %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 %cmp = icmp eq i32 %z, 0 br i1 %cmp, label %IF, label %ENDIF @@ -828,10 +828,15 @@ ENDIF: declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1 declare void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float>, i32, i32, <8 x i32>, i32, i32) #1 -declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #2 -declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #2 + +declare void @llvm.amdgcn.struct.buffer.store.f32(float, <4 x i32>, i32, i32, i32, i32 immarg) #2 +declare void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32 immarg) #2 +declare void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32 immarg) #2 +declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32 immarg) #2 +declare float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32>, i32, i32, i32) #3 +declare float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32>, i32, i32, i32, i32) #3 + declare <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32, i32, <8 x i32>, i32, i32) #3 -declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #3 declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3 declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3 declare void @llvm.amdgcn.kill(i1) #1 -- 2.7.4