From defce20cbb774ebc818a15445bc21a38739afad6 Mon Sep 17 00:00:00 2001
From: Jay Foad
Date: Mon, 11 Apr 2022 16:11:43 +0100
Subject: [PATCH] [AMDGPU] Add a test for flat scratch SVS addressing

---
 llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll | 307 +++++++++++++++++++++++++++
 1 file changed, 307 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll

diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
new file mode 100644
index 0000000..22bfd0b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
@@ -0,0 +1,307 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx940 < %s | FileCheck %s -check-prefix=GFX940
+
+; Test flat scratch SVS addressing mode with various combinations of alignment
+; of soffset, voffset and inst_offset. Each @soffN_voffM kernel multiplies its %soff argument by N and the workitem id by M, adds both to a 64-byte addrspace(5) alloca, and does volatile i8 stores at constant offsets 1, 2 and 4; the kernels differ only in N and M.
+
+declare i32 @llvm.amdgcn.workitem.id.x()
+
+define amdgpu_kernel void @soff1_voff1(i32 %soff) {
+; GFX940-LABEL: soff1_voff1:
+; GFX940:       ; %bb.0: ; %bb
+; GFX940-NEXT:    s_load_dword s0, s[0:1], 0x24
+; GFX940-NEXT:    v_mov_b32_e32 v1, 1
+; GFX940-NEXT:    v_mov_b32_e32 v2, 2
+; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NEXT:    s_add_i32 s0, s0, 4
+; GFX940-NEXT:    scratch_store_byte v0, v1, s0 offset:1 sc0 sc1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    scratch_store_byte v0, v2, s0 offset:2 sc0 sc1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v1, 4
+; GFX940-NEXT:    scratch_store_byte v0, v1, s0 offset:4 sc0 sc1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    s_endpgm
+bb:
+  %soff1 = mul i32 %soff, 1 ; x1 here; the soff2_*/soff4_* kernels multiply by 2 and 4
+  %a = alloca i8, i32 64, align 4, addrspace(5)
+  %as = getelementptr i8, i8 addrspace(5)* %a, i32 %soff1
+  %voff = call i32 @llvm.amdgcn.workitem.id.x()
+  %voff1 = mul i32 %voff, 1 ; x1 here; the *_voff2/*_voff4 kernels multiply by 2 and 4
+  %asv = getelementptr i8, i8 addrspace(5)* %as, i32 %voff1 ; base pointer shared by the three stores below
+  %p1 = getelementptr i8, i8 addrspace(5)* %asv, i32 1
+  store volatile i8 1, i8 addrspace(5)* %p1
+  %p2 = getelementptr i8, i8 addrspace(5)* %asv, i32 2
+  store volatile i8 2, i8 addrspace(5)* %p2
+  %p4 = getelementptr i8, i8 addrspace(5)* %asv, i32 4
+  store volatile i8 4, i8 addrspace(5)* %p4
+  ret void
+}
+
+define amdgpu_kernel void @soff1_voff2(i32 %soff) {
+; GFX940-LABEL: soff1_voff2:
+; GFX940:       ; %bb.0: ; %bb
+; GFX940-NEXT:    s_load_dword s0, s[0:1], 0x24
+; GFX940-NEXT:    v_mov_b32_e32 v1, 1
+; GFX940-NEXT:    v_mul_u32_u24_e32 v0, 2, v0
+; GFX940-NEXT:    v_mov_b32_e32 v2, 2
+; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NEXT:    s_add_i32 s0, s0, 4
+; GFX940-NEXT:    scratch_store_byte v0, v1, s0 offset:1 sc0 sc1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    scratch_store_byte v0, v2, s0 offset:2 sc0 sc1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v1, 4
+; GFX940-NEXT:    scratch_store_byte v0, v1, s0 offset:4 sc0 sc1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    s_endpgm
+bb:
+  %soff1 = mul i32 %soff, 1
+  %a = alloca i8, i32 64, align 4, addrspace(5)
+  %as = getelementptr i8, i8 addrspace(5)* %a, i32 %soff1
+  %voff = call i32 @llvm.amdgcn.workitem.id.x()
+  %voff2 = mul i32 %voff, 2
+  %asv = getelementptr i8, i8 addrspace(5)* %as, i32 %voff2
+  %p1 = getelementptr i8, i8 addrspace(5)* %asv, i32 1
+  store volatile i8 1, i8 addrspace(5)* %p1
+  %p2 = getelementptr i8, i8 addrspace(5)* %asv, i32 2
+  store volatile i8 2, i8 addrspace(5)* %p2
+  %p4 = getelementptr i8, i8 addrspace(5)* %asv, i32 4
+  store volatile i8 4, i8 addrspace(5)* %p4
+  ret void
+}
+
+define amdgpu_kernel void @soff1_voff4(i32 %soff) {
+; GFX940-LABEL: soff1_voff4:
+; GFX940:       ; %bb.0: ; %bb
+; GFX940-NEXT:    s_load_dword s0, s[0:1], 0x24
+; GFX940-NEXT:    v_mov_b32_e32 v1, 1
+; GFX940-NEXT:    v_mul_u32_u24_e32 v0, 4, v0
+; GFX940-NEXT:    v_mov_b32_e32 v2, 2
+; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NEXT:    s_add_i32 s0, s0, 4
+; GFX940-NEXT:    scratch_store_byte v0, v1, s0 offset:1 sc0 sc1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    scratch_store_byte v0, v2, s0 offset:2 sc0 sc1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v1, 4
+; GFX940-NEXT:    scratch_store_byte v0, v1, s0 offset:4 sc0 sc1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    s_endpgm
+bb:
+  %soff1 = mul i32 %soff, 1
+  %a = alloca i8, i32 64, align 4, addrspace(5)
+  %as = getelementptr i8, i8 addrspace(5)* %a, i32 %soff1
+  %voff = call i32 @llvm.amdgcn.workitem.id.x()
+  %voff4 = mul i32 %voff, 4
+  %asv = getelementptr i8, i8 addrspace(5)* %as, i32 %voff4
+  %p1 = getelementptr i8, i8 addrspace(5)* %asv, i32 1
+  store volatile i8 1, i8 addrspace(5)* %p1
+  %p2 = getelementptr i8, i8 addrspace(5)* %asv, i32 2
+  store volatile i8 2, i8 addrspace(5)* %p2
+  %p4 = getelementptr i8, i8 addrspace(5)* %asv, i32 4
+  store volatile i8 4, i8 addrspace(5)* %p4
+  ret void
+}
+
+define amdgpu_kernel void @soff2_voff1(i32 %soff) {
+; GFX940-LABEL: soff2_voff1:
+; GFX940:       ; %bb.0: ; %bb
+; GFX940-NEXT:    s_load_dword s0, s[0:1], 0x24
+; GFX940-NEXT:    v_mov_b32_e32 v1, 1
+; GFX940-NEXT:    v_mov_b32_e32 v2, 2
+; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX940-NEXT:    s_add_i32 s0, s0, 4
+; GFX940-NEXT:    scratch_store_byte v0, v1, s0 offset:1 sc0 sc1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    scratch_store_byte v0, v2, s0 offset:2 sc0 sc1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v1, 4
+; GFX940-NEXT:    scratch_store_byte v0, v1, s0 offset:4 sc0 sc1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    s_endpgm
+bb:
+  %soff2 = mul i32 %soff, 2
+  %a = alloca i8, i32 64, align 4, addrspace(5)
+  %as = getelementptr i8, i8 addrspace(5)* %a, i32 %soff2
+  %voff = call i32 @llvm.amdgcn.workitem.id.x()
+  %voff1 = mul i32 %voff, 1
+  %asv = getelementptr i8, i8 addrspace(5)* %as, i32 %voff1
+  %p1 = getelementptr i8, i8 addrspace(5)* %asv, i32 1
+  store volatile i8 1, i8 addrspace(5)* %p1
+  %p2 = getelementptr i8, i8 addrspace(5)* %asv, i32 2
+  store volatile i8 2, i8 addrspace(5)* %p2
+  %p4 = getelementptr i8, i8 addrspace(5)* %asv, i32 4
+  store volatile i8 4, i8 addrspace(5)* %p4
+  ret void
+}
+
+define amdgpu_kernel void @soff2_voff2(i32 %soff) {
+; GFX940-LABEL: soff2_voff2:
+; GFX940:       ; %bb.0: ; %bb
+; GFX940-NEXT:    s_load_dword s0, s[0:1], 0x24
+; GFX940-NEXT:    v_mov_b32_e32 v1, 1
+; GFX940-NEXT:    v_mul_u32_u24_e32 v0, 2, v0
+; GFX940-NEXT:    v_mov_b32_e32 v2, 2
+; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX940-NEXT:    s_add_i32 s0, s0, 4
+; GFX940-NEXT:    scratch_store_byte v0, v1, s0 offset:1 sc0 sc1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    scratch_store_byte v0, v2, s0 offset:2 sc0 sc1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v1, 4
+; GFX940-NEXT:    scratch_store_byte v0, v1, s0 offset:4 sc0 sc1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    s_endpgm
+bb:
+  %soff2 = mul i32 %soff, 2
+  %a = alloca i8, i32 64, align 4, addrspace(5)
+  %as = getelementptr i8, i8 addrspace(5)* %a, i32 %soff2
+  %voff = call i32 @llvm.amdgcn.workitem.id.x()
+  %voff2 = mul i32 %voff, 2
+  %asv = getelementptr i8, i8 addrspace(5)* %as, i32 %voff2
+  %p1 = getelementptr i8, i8 addrspace(5)* %asv, i32 1
+  store volatile i8 1, i8 addrspace(5)* %p1
+  %p2 = getelementptr i8, i8 addrspace(5)* %asv, i32 2
+  store volatile i8 2, i8 addrspace(5)* %p2
+  %p4 = getelementptr i8, i8 addrspace(5)* %asv, i32 4
+  store volatile i8 4, i8 addrspace(5)* %p4
+  ret void
+}
+
+define amdgpu_kernel void @soff2_voff4(i32 %soff) {
+; GFX940-LABEL: soff2_voff4:
+; GFX940:       ; %bb.0: ; %bb
+; GFX940-NEXT:    s_load_dword s0, s[0:1], 0x24
+; GFX940-NEXT:    v_mov_b32_e32 v1, 1
+; GFX940-NEXT:    v_mul_u32_u24_e32 v0, 4, v0
+; GFX940-NEXT:    v_mov_b32_e32 v2, 2
+; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX940-NEXT:    s_add_i32 s0, s0, 4
+; GFX940-NEXT:    scratch_store_byte v0, v1, s0 offset:1 sc0 sc1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    scratch_store_byte v0, v2, s0 offset:2 sc0 sc1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v1, 4
+; GFX940-NEXT:    scratch_store_byte v0, v1, s0 offset:4 sc0 sc1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    s_endpgm
+bb:
+  %soff2 = mul i32 %soff, 2
+  %a = alloca i8, i32 64, align 4, addrspace(5)
+  %as = getelementptr i8, i8 addrspace(5)* %a, i32 %soff2
+  %voff = call i32 @llvm.amdgcn.workitem.id.x()
+  %voff4 = mul i32 %voff, 4
+  %asv = getelementptr i8, i8 addrspace(5)* %as, i32 %voff4
+  %p1 = getelementptr i8, i8 addrspace(5)* %asv, i32 1
+  store volatile i8 1, i8 addrspace(5)* %p1
+  %p2 = getelementptr i8, i8 addrspace(5)* %asv, i32 2
+  store volatile i8 2, i8 addrspace(5)* %p2
+  %p4 = getelementptr i8, i8 addrspace(5)* %asv, i32 4
+  store volatile i8 4, i8 addrspace(5)* %p4
+  ret void
+}
+
+define amdgpu_kernel void @soff4_voff1(i32 %soff) {
+; GFX940-LABEL: soff4_voff1:
+; GFX940:       ; %bb.0: ; %bb
+; GFX940-NEXT:    s_load_dword s0, s[0:1], 0x24
+; GFX940-NEXT:    v_mov_b32_e32 v1, 1
+; GFX940-NEXT:    v_mov_b32_e32 v2, 2
+; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX940-NEXT:    s_add_i32 s0, s0, 4
+; GFX940-NEXT:    scratch_store_byte v0, v1, s0 offset:1 sc0 sc1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    scratch_store_byte v0, v2, s0 offset:2 sc0 sc1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v1, 4
+; GFX940-NEXT:    scratch_store_byte v0, v1, s0 offset:4 sc0 sc1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    s_endpgm
+bb:
+  %soff4 = mul i32 %soff, 4
+  %a = alloca i8, i32 64, align 4, addrspace(5)
+  %as = getelementptr i8, i8 addrspace(5)* %a, i32 %soff4
+  %voff = call i32 @llvm.amdgcn.workitem.id.x()
+  %voff1 = mul i32 %voff, 1
+  %asv = getelementptr i8, i8 addrspace(5)* %as, i32 %voff1
+  %p1 = getelementptr i8, i8 addrspace(5)* %asv, i32 1
+  store volatile i8 1, i8 addrspace(5)* %p1
+  %p2 = getelementptr i8, i8 addrspace(5)* %asv, i32 2
+  store volatile i8 2, i8 addrspace(5)* %p2
+  %p4 = getelementptr i8, i8 addrspace(5)* %asv, i32 4
+  store volatile i8 4, i8 addrspace(5)* %p4
+  ret void
+}
+
+define amdgpu_kernel void @soff4_voff2(i32 %soff) {
+; GFX940-LABEL: soff4_voff2:
+; GFX940:       ; %bb.0: ; %bb
+; GFX940-NEXT:    s_load_dword s0, s[0:1], 0x24
+; GFX940-NEXT:    v_mov_b32_e32 v1, 1
+; GFX940-NEXT:    v_mul_u32_u24_e32 v0, 2, v0
+; GFX940-NEXT:    v_mov_b32_e32 v2, 2
+; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX940-NEXT:    s_add_i32 s0, s0, 4
+; GFX940-NEXT:    scratch_store_byte v0, v1, s0 offset:1 sc0 sc1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    scratch_store_byte v0, v2, s0 offset:2 sc0 sc1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v1, 4
+; GFX940-NEXT:    scratch_store_byte v0, v1, s0 offset:4 sc0 sc1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    s_endpgm
+bb:
+  %soff4 = mul i32 %soff, 4
+  %a = alloca i8, i32 64, align 4, addrspace(5)
+  %as = getelementptr i8, i8 addrspace(5)* %a, i32 %soff4
+  %voff = call i32 @llvm.amdgcn.workitem.id.x()
+  %voff2 = mul i32 %voff, 2
+  %asv = getelementptr i8, i8 addrspace(5)* %as, i32 %voff2
+  %p1 = getelementptr i8, i8 addrspace(5)* %asv, i32 1
+  store volatile i8 1, i8 addrspace(5)* %p1
+  %p2 = getelementptr i8, i8 addrspace(5)* %asv, i32 2
+  store volatile i8 2, i8 addrspace(5)* %p2
+  %p4 = getelementptr i8, i8 addrspace(5)* %asv, i32 4
+  store volatile i8 4, i8 addrspace(5)* %p4
+  ret void
+}
+
+define amdgpu_kernel void @soff4_voff4(i32 %soff) {
+; GFX940-LABEL: soff4_voff4:
+; GFX940:       ; %bb.0: ; %bb
+; GFX940-NEXT:    s_load_dword s0, s[0:1], 0x24
+; GFX940-NEXT:    v_mov_b32_e32 v1, 1
+; GFX940-NEXT:    v_mul_u32_u24_e32 v0, 4, v0
+; GFX940-NEXT:    v_mov_b32_e32 v2, 2
+; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX940-NEXT:    s_add_i32 s0, s0, 4
+; GFX940-NEXT:    scratch_store_byte v0, v1, s0 offset:1 sc0 sc1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    scratch_store_byte v0, v2, s0 offset:2 sc0 sc1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v1, 4
+; GFX940-NEXT:    scratch_store_byte v0, v1, s0 offset:4 sc0 sc1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    s_endpgm
+bb:
+  %soff4 = mul i32 %soff, 4
+  %a = alloca i8, i32 64, align 4, addrspace(5)
+  %as = getelementptr i8, i8 addrspace(5)* %a, i32 %soff4
+  %voff = call i32 @llvm.amdgcn.workitem.id.x()
+  %voff4 = mul i32 %voff, 4
+  %asv = getelementptr i8, i8 addrspace(5)* %as, i32 %voff4
+  %p1 = getelementptr i8, i8 addrspace(5)* %asv, i32 1
+  store volatile i8 1, i8 addrspace(5)* %p1
+  %p2 = getelementptr i8, i8 addrspace(5)* %asv, i32 2
+  store volatile i8 2, i8 addrspace(5)* %p2
+  %p4 = getelementptr i8, i8 addrspace(5)* %asv, i32 4
+  store volatile i8 4, i8 addrspace(5)* %p4
+  ret void
+}
-- 
2.7.4