From efd0d6626943d0a9eef638961915ab37bee9ef87 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Mon, 17 Oct 2022 14:52:33 +0100
Subject: [PATCH] [AMDGPU] Add regression test cases reported on D136042

---
Review note (not part of the commit message): the three added kernels all
compute ashr(or(shl(a, 17), shl(b, C)), 17), with C = 17 (uniform),
C = 19 (nonuniform) and C = 16 (toosmall). Only the uniform case is checked
as s_or_b32 + s_bfe_i32; the other two are checked as two s_lshl_b32,
s_or_b32 and s_ashr_i32, i.e. the shifts are not folded.

 llvm/test/CodeGen/AMDGPU/bfe-patterns.ll | 136 +++++++++++++++++++++++++++++++
 1 file changed, 136 insertions(+)

diff --git a/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll b/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll
index f7c2d69..1c72c08 100644
--- a/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll
+++ b/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll
@@ -398,6 +398,142 @@ define amdgpu_kernel void @s_sbfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out,
   ret void
 }
 
+define amdgpu_kernel void @s_sbfe_or_shl_shl_uniform_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) {
+; SI-LABEL: s_sbfe_or_shl_shl_uniform_i32:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_load_dword s2, s[6:7], 0x0
+; SI-NEXT:    s_load_dword s0, s[0:1], 0x0
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_or_b32 s0, s2, s0
+; SI-NEXT:    s_bfe_i32 s0, s0, 0xf0000
+; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: s_sbfe_or_shl_shl_uniform_i32:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_load_dword s2, s[6:7], 0x0
+; VI-NEXT:    s_load_dword s0, s[0:1], 0x0
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_or_b32 s0, s2, s0
+; VI-NEXT:    s_bfe_i32 s0, s0, 0xf0000
+; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    s_endpgm
+  %a0 = load i32, i32 addrspace(1) * %in0
+  %b0 = load i32, i32 addrspace(1) * %in1
+  %a1 = shl i32 %a0, 17
+  %b1 = shl i32 %b0, 17
+  %or = or i32 %a1, %b1
+  %result = ashr i32 %or, 17
+  store i32 %result, i32 addrspace(1)* %out
+  ret void
+}
+
+; TODO ashr(or(shl(x,c1),shl(y,c2)),c1) -> sign_extend_inreg(or(x,shl(y,c2-c1))) iff c2 >= c1
+define amdgpu_kernel void @s_sbfe_or_shl_shl_nonuniform_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %x, i32 addrspace(1)* %y) {
+; SI-LABEL: s_sbfe_or_shl_shl_nonuniform_i32:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_load_dword s2, s[6:7], 0x0
+; SI-NEXT:    s_load_dword s0, s[0:1], 0x0
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_lshl_b32 s1, s2, 17
+; SI-NEXT:    s_lshl_b32 s0, s0, 19
+; SI-NEXT:    s_or_b32 s0, s1, s0
+; SI-NEXT:    s_ashr_i32 s0, s0, 17
+; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: s_sbfe_or_shl_shl_nonuniform_i32:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_load_dword s2, s[6:7], 0x0
+; VI-NEXT:    s_load_dword s0, s[0:1], 0x0
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_lshl_b32 s1, s2, 17
+; VI-NEXT:    s_lshl_b32 s0, s0, 19
+; VI-NEXT:    s_or_b32 s0, s1, s0
+; VI-NEXT:    s_ashr_i32 s0, s0, 17
+; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    s_endpgm
+  %a0 = load i32, i32 addrspace(1) * %x
+  %b0 = load i32, i32 addrspace(1) * %y
+  %a1 = shl i32 %a0, 17
+  %b1 = shl i32 %b0, 19
+  %or = or i32 %a1, %b1
+  %result = ashr i32 %or, 17
+  store i32 %result, i32 addrspace(1)* %out
+  ret void
+}
+
+; Don't fold: the 'other shl' amount (16) is less than the ashr amount (17), so the c2 >= c1 condition of the TODO fold above does not hold.
+define amdgpu_kernel void @s_sbfe_or_shl_shl_toosmall_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %x, i32 addrspace(1)* %y) {
+; SI-LABEL: s_sbfe_or_shl_shl_toosmall_i32:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_load_dword s2, s[6:7], 0x0
+; SI-NEXT:    s_load_dword s0, s[0:1], 0x0
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_lshl_b32 s1, s2, 17
+; SI-NEXT:    s_lshl_b32 s0, s0, 16
+; SI-NEXT:    s_or_b32 s0, s1, s0
+; SI-NEXT:    s_ashr_i32 s0, s0, 17
+; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: s_sbfe_or_shl_shl_toosmall_i32:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_load_dword s2, s[6:7], 0x0
+; VI-NEXT:    s_load_dword s0, s[0:1], 0x0
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_lshl_b32 s1, s2, 17
+; VI-NEXT:    s_lshl_b32 s0, s0, 16
+; VI-NEXT:    s_or_b32 s0, s1, s0
+; VI-NEXT:    s_ashr_i32 s0, s0, 17
+; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    s_endpgm
+  %a0 = load i32, i32 addrspace(1) * %x
+  %b0 = load i32, i32 addrspace(1) * %y
+  %a1 = shl i32 %a0, 17
+  %b1 = shl i32 %b0, 16
+  %or = or i32 %a1, %b1
+  %result = ashr i32 %or, 17
+  store i32 %result, i32 addrspace(1)* %out
+  ret void
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x() #0
 
 attributes #0 = { nounwind readnone }
-- 
2.7.4