From: Matt Arsenault Date: Fri, 8 Mar 2019 20:30:51 +0000 (+0000) Subject: AMDGPU: Add more tests for d16 loads X-Git-Tag: llvmorg-10-init~10398 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=74c9c305e095e44c311ff15428ccbb1fc6949302;p=platform%2Fupstream%2Fllvm.git AMDGPU: Add more tests for d16 loads Also fix a few cases that weren't testing what they were supposed to. llvm-svn: 355724 --- diff --git a/llvm/test/CodeGen/AMDGPU/load-hi16.ll b/llvm/test/CodeGen/AMDGPU/load-hi16.ll index f5a1940..ee5737c 100644 --- a/llvm/test/CodeGen/AMDGPU/load-hi16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-hi16.ll @@ -2,6 +2,75 @@ ; RUN: llc -march=amdgcn -mcpu=gfx906 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX906,NO-D16-HI %s ; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX803,NO-D16-HI %s +; GCN-LABEL: {{^}}load_local_lo_hi_v2i16_multi_use_lo: +; GFX900: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ds_read_u16 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: ds_read_u16_d16_hi v1, v0 offset:16 +; GFX900-NEXT: ds_write_b16 v3, v2 +; GFX900-NEXT: s_waitcnt lgkmcnt(1) +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +define <2 x i16> @load_local_lo_hi_v2i16_multi_use_lo(i16 addrspace(3)* noalias %in) #0 { +entry: + %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 8 + %load.lo = load i16, i16 addrspace(3)* %in + %load.hi = load i16, i16 addrspace(3)* %gep + store i16 %load.lo, i16 addrspace(3)* null + %build0 = insertelement <2 x i16> undef, i16 %load.lo, i32 0 + %build1 = insertelement <2 x i16> %build0, i16 %load.hi, i32 1 + ret <2 x i16> %build1 +} + +; GCN-LABEL: {{^}}load_local_lo_hi_v2i16_multi_use_hi: +; GFX900: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ds_read_u16 v1, v0 +; GFX900-NEXT: ds_read_u16 v0, v0 offset:16 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: s_waitcnt lgkmcnt(1) +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: ds_write_b16 v2, v0 +; GFX900-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +define <2 x i16> @load_local_lo_hi_v2i16_multi_use_hi(i16 addrspace(3)* noalias %in) #0 { +entry: + %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 8 + %load.lo = load i16, i16 addrspace(3)* %in + %load.hi = load i16, i16 addrspace(3)* %gep + store i16 %load.hi, i16 addrspace(3)* null + %build0 = insertelement <2 x i16> undef, i16 %load.lo, i32 0 + %build1 = insertelement <2 x i16> %build0, i16 %load.hi, i32 1 + ret <2 x i16> %build1 +} + +; GCN-LABEL: {{^}}load_local_lo_hi_v2i16_multi_use_lohi: +; GFX900: ds_read_u16 v3, v0 +; GFX900-NEXT: ds_read_u16 v0, v0 offset:16 +; GFX900-NEXT: s_waitcnt lgkmcnt(1) +; GFX900-NEXT: ds_write_b16 v1, v3 +; GFX900-NEXT: s_waitcnt lgkmcnt(1) +; GFX900-NEXT: ds_write_b16 v2, v0 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX900-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +define <2 x i16> @load_local_lo_hi_v2i16_multi_use_lohi(i16 addrspace(3)* noalias %in, i16 addrspace(3)* noalias %out0, i16 addrspace(3)* noalias %out1) #0 { +entry: + %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 8 + %load.lo = load i16, i16 addrspace(3)* %in + %load.hi = load i16, i16 addrspace(3)* %gep + store i16 %load.lo, i16 addrspace(3)* %out0 + store i16 %load.hi, i16 addrspace(3)* %out1 + %build0 = insertelement <2 x i16> undef, i16 %load.lo, i32 0 + %build1 = insertelement <2 x i16> %build0, i16 %load.hi, i32 1 + ret <2 x i16> %build1 +} + ; GCN-LABEL: {{^}}load_local_hi_v2i16_undeflo: ; GCN: s_waitcnt ; GFX900-NEXT: ds_read_u16_d16_hi v0, v0 @@ -141,6 +210,48 @@ entry: ret 
void } +; GCN-LABEL: {{^}}load_local_hi_v2f16_reglo_vreg_zexti8: +; GCN: s_waitcnt +; GFX900-NEXT: ds_read_u8_d16_hi v1, v0 +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}} +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: s_setpc_b64 + +; NO-D16-HI: ds_read_u8 v +define void @load_local_hi_v2f16_reglo_vreg_zexti8(i8 addrspace(3)* %in, half %reg) #0 { +entry: + %load = load i8, i8 addrspace(3)* %in + %ext = zext i8 %load to i16 + %bitcast = bitcast i16 %ext to half + + %build0 = insertelement <2 x half> undef, half %reg, i32 0 + %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1 + store <2 x half> %build1, <2 x half> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}load_local_hi_v2f16_reglo_vreg_sexti8: +; GCN: s_waitcnt +; GFX900-NEXT: ds_read_i8_d16_hi v1, v0 +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}} +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: s_setpc_b64 + +; NO-D16-HI: ds_read_i8 v +define void @load_local_hi_v2f16_reglo_vreg_sexti8(i8 addrspace(3)* %in, half %reg) #0 { +entry: + %load = load i8, i8 addrspace(3)* %in + %ext = sext i8 %load to i16 + %bitcast = bitcast i16 %ext to half + + %build0 = insertelement <2 x half> undef, half %reg, i32 0 + %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1 + store <2 x half> %build1, <2 x half> addrspace(1)* undef + ret void +} + ; GCN-LABEL: {{^}}load_global_hi_v2i16_reglo_vreg: ; GCN: s_waitcnt ; GFX900-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094 @@ -211,6 +322,44 @@ entry: ret void } +; GCN-LABEL: {{^}}load_global_hi_v2f16_reglo_vreg_sexti8: +; GCN: s_waitcnt +; GFX900-NEXT: global_load_sbyte_d16_hi v2, v[0:1], off offset:-4095 +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: global_store_dword +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: s_setpc_b64 +define void @load_global_hi_v2f16_reglo_vreg_sexti8(i8 addrspace(1)* %in, half %reg) #0 { +entry: + %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095 + 
%load = load i8, i8 addrspace(1)* %gep + %ext = sext i8 %load to i16 + %bitcast = bitcast i16 %ext to half + %build0 = insertelement <2 x half> undef, half %reg, i32 0 + %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1 + store <2 x half> %build1, <2 x half> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}load_global_hi_v2f16_reglo_vreg_zexti8: +; GCN: s_waitcnt +; GFX900-NEXT: global_load_ubyte_d16_hi v2, v[0:1], off offset:-4095 +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: global_store_dword +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: s_setpc_b64 +define void @load_global_hi_v2f16_reglo_vreg_zexti8(i8 addrspace(1)* %in, half %reg) #0 { +entry: + %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095 + %load = load i8, i8 addrspace(1)* %gep + %ext = zext i8 %load to i16 + %bitcast = bitcast i16 %ext to half + %build0 = insertelement <2 x half> undef, half %reg, i32 0 + %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1 + store <2 x half> %build1, <2 x half> addrspace(1)* undef + ret void +} + ; GCN-LABEL: load_flat_hi_v2i16_reglo_vreg: ; GCN: s_waitcnt ; GFX900-NEXT: flat_load_short_d16_hi v2, v[0:1] @@ -297,6 +446,52 @@ entry: ret void } +; GCN-LABEL: {{^}}load_flat_hi_v2f16_reglo_vreg_zexti8: +; GCN: s_waitcnt +; GFX900-NEXT: flat_load_ubyte_d16_hi v2, v[0:1] +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: global_store_dword v[0:1], v2 +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: s_setpc_b64 + +; NO-D16-HI: flat_load_ubyte v{{[0-9]+}} +; GFX803: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, +; GFX803: v_or_b32_sdwa +; GFX906: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16, +define void @load_flat_hi_v2f16_reglo_vreg_zexti8(i8* %in, half %reg) #0 { +entry: + %load = load i8, i8* %in + %ext = zext i8 %load to i16 + %bitcast = bitcast i16 %ext to half + %build0 = insertelement <2 x half> undef, half %reg, i32 0 + %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1 + store <2 x half> %build1, <2 x half> addrspace(1)* undef + ret void 
+} + +; GCN-LABEL: {{^}}load_flat_hi_v2f16_reglo_vreg_sexti8: +; GCN: s_waitcnt +; GFX900-NEXT: flat_load_sbyte_d16_hi v2, v[0:1] +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: global_store_dword v[0:1], v2 +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: s_setpc_b64 + +; NO-D16-HI: flat_load_sbyte v{{[0-9]+}} +; GFX803: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, +; GFX803: v_or_b32_sdwa +; GFX906: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16, +define void @load_flat_hi_v2f16_reglo_vreg_sexti8(i8* %in, half %reg) #0 { +entry: + %load = load i8, i8* %in + %ext = sext i8 %load to i16 + %bitcast = bitcast i16 %ext to half + %build0 = insertelement <2 x half> undef, half %reg, i32 0 + %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1 + store <2 x half> %build1, <2 x half> addrspace(1)* undef + ret void +} + ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg: ; GCN: s_waitcnt ; GFX900: buffer_load_short_d16_hi v0, off, s[0:3], s5 offset:4094{{$}} @@ -391,6 +586,48 @@ entry: ret void } +; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_zexti8: +; GCN: s_waitcnt +; GFX900: buffer_load_ubyte_d16_hi v0, off, s[0:3], s5 offset:4095{{$}} +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: s_setpc_b64 + +; NO-D16-HI: buffer_load_ubyte v{{[0-9]+}}, off, s[0:3], s5 offset:4095{{$}} +define void @load_private_hi_v2f16_reglo_vreg_zexti8(i8 addrspace(5)* byval %in, half %reg) #0 { +entry: + %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4091 + %load = load i8, i8 addrspace(5)* %gep + %ext = zext i8 %load to i16 + %bitcast = bitcast i16 %ext to half + %build0 = insertelement <2 x half> undef, half %reg, i32 0 + %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1 + store <2 x half> %build1, <2 x half> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_sexti8: +; GCN: s_waitcnt +; GFX900: buffer_load_sbyte_d16_hi v0, off, s[0:3], s5 offset:4095{{$}} +; 
GFX900-NEXT: s_waitcnt +; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: s_setpc_b64 + +; NO-D16-HI: buffer_load_sbyte v{{[0-9]+}}, off, s[0:3], s5 offset:4095{{$}} +define void @load_private_hi_v2f16_reglo_vreg_sexti8(i8 addrspace(5)* byval %in, half %reg) #0 { +entry: + %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4091 + %load = load i8, i8 addrspace(5)* %gep + %ext = sext i8 %load to i16 + %bitcast = bitcast i16 %ext to half + %build0 = insertelement <2 x half> undef, half %reg, i32 0 + %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1 + store <2 x half> %build1, <2 x half> addrspace(1)* undef + ret void +} + ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_sexti8: ; GCN: s_waitcnt ; GFX900: buffer_load_sbyte_d16_hi v0, off, s[0:3], s5 offset:4095{{$}} @@ -509,6 +746,44 @@ entry: ret void } +; GCN-LABEL: {{^}}load_constant_hi_v2f16_reglo_vreg_sexti8: +; GCN: s_waitcnt +; GFX900-NEXT: global_load_sbyte_d16_hi v2, v[0:1], off offset:-4095 +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: global_store_dword +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: s_setpc_b64 +define void @load_constant_hi_v2f16_reglo_vreg_sexti8(i8 addrspace(4)* %in, half %reg) #0 { +entry: + %gep = getelementptr inbounds i8, i8 addrspace(4)* %in, i64 -4095 + %load = load i8, i8 addrspace(4)* %gep + %ext = sext i8 %load to i16 + %bitcast = bitcast i16 %ext to half + %build0 = insertelement <2 x half> undef, half %reg, i32 0 + %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1 + store <2 x half> %build1, <2 x half> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}load_constant_hi_v2f16_reglo_vreg_zexti8: +; GCN: s_waitcnt +; GFX900-NEXT: global_load_ubyte_d16_hi v2, v[0:1], off offset:-4095 +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: global_store_dword +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: s_setpc_b64 +define void @load_constant_hi_v2f16_reglo_vreg_zexti8(i8 addrspace(4)* %in, half %reg) #0 { 
+entry: + %gep = getelementptr inbounds i8, i8 addrspace(4)* %in, i64 -4095 + %load = load i8, i8 addrspace(4)* %gep + %ext = zext i8 %load to i16 + %bitcast = bitcast i16 %ext to half + %build0 = insertelement <2 x half> undef, half %reg, i32 0 + %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1 + store <2 x half> %build1, <2 x half> addrspace(1)* undef + ret void +} + ; Local object gives known offset, so requires converting from offen ; to offset variant. @@ -567,7 +842,7 @@ entry: ; FIXME: Remove m0 init and waitcnt between reads ; FIXME: Is there a cost to using the extload over not? -; GCN-LABEL: {{^}}load_local_v2i16_split: +; GCN-LABEL: {{^}}load_local_v2i16_split_multi_chain: ; GCN: s_waitcnt ; GFX900-NEXT: ds_read_u16 v1, v0 ; GFX900-NEXT: s_waitcnt @@ -575,7 +850,7 @@ entry: ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: v_mov_b32_e32 v0, v1 ; GFX900-NEXT: s_setpc_b64 -define <2 x i16> @load_local_v2i16_split(i16 addrspace(3)* %in) #0 { +define <2 x i16> @load_local_v2i16_split_multi_chain(i16 addrspace(3)* %in) #0 { entry: %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 1 %load0 = load volatile i16, i16 addrspace(3)* %in @@ -585,6 +860,45 @@ entry: ret <2 x i16> %build1 } +; GCN-LABEL: {{^}}load_local_lo_hi_v2i16_samechain: +; GFX900: ds_read_u16 v1, v0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: ds_read_u16_d16_hi v1, v0 offset:16 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: s_setpc_b64 + +; NO-D16-HI: ds_read_u16 +; NO-D16-HI: ds_read_u16 +define <2 x i16> @load_local_lo_hi_v2i16_samechain(i16 addrspace(3)* %in) #0 { +entry: + %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 8 + %load.lo = load i16, i16 addrspace(3)* %in + %load.hi = load i16, i16 addrspace(3)* %gep + %build0 = insertelement <2 x i16> undef, i16 %load.lo, i32 0 + %build1 = insertelement <2 x i16> %build0, i16 %load.hi, i32 1 + ret <2 x i16> %build1 +} + +; GCN-LABEL: 
{{^}}load_local_lo_hi_v2i16_side_effect: +; GFX900: ds_read_u16 [[LOAD0:v[0-9]+]], v0 +; GFX900: ds_write_b16 +; GFX900: ds_read_u16_d16_hi [[LOAD0]], v0 offset:16 + +; NO-D16-HI: ds_read_u16 +; NO-D16-HI: ds_write_b16 +; NO-D16-HI: ds_read_u16 +define <2 x i16> @load_local_lo_hi_v2i16_side_effect(i16 addrspace(3)* %in, i16 addrspace(3)* %may.alias) #0 { +entry: + %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 8 + %load.lo = load i16, i16 addrspace(3)* %in + store i16 123, i16 addrspace(3)* %may.alias + %load.hi = load i16, i16 addrspace(3)* %gep + %build0 = insertelement <2 x i16> undef, i16 %load.lo, i32 0 + %build1 = insertelement <2 x i16> %build0, i16 %load.hi, i32 1 + ret <2 x i16> %build1 +} + ; FIXME: Remove waitcnt between reads ; GCN-LABEL: {{^}}load_global_v2i16_split: ; GCN: s_waitcnt diff --git a/llvm/test/CodeGen/AMDGPU/load-lo16.ll b/llvm/test/CodeGen/AMDGPU/load-lo16.ll index b7512f2..5d626ea 100644 --- a/llvm/test/CodeGen/AMDGPU/load-lo16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-lo16.ll @@ -1,13 +1,14 @@ -; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s -; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX900 %s +; RUN: llc -march=amdgcn -mcpu=gfx906 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX906,NO-D16-HI %s +; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX803,NO-D16-HI %s ; GCN-LABEL: {{^}}load_local_lo_v2i16_undeflo: ; GCN: s_waitcnt -; GFX9-NEXT: ds_read_u16_d16 v0, v0 -; GFX9-NEXT: s_waitcnt -; GFX9-NEXT: s_setpc_b64 +; GFX900-NEXT: ds_read_u16_d16 v0, v0 +; GFX900-NEXT: 
s_waitcnt +; GFX900-NEXT: s_setpc_b64 -; VI: ds_read_u16 +; NO-D16-HI: ds_read_u16 define <2 x i16> @load_local_lo_v2i16_undeflo(i16 addrspace(3)* %in) #0 { entry: %load = load i16, i16 addrspace(3)* %in @@ -17,15 +18,14 @@ entry: ; GCN-LABEL: {{^}}load_local_lo_v2i16_reglo: ; GCN: s_waitcnt -; GFX9-NEXT: ds_read_u16_d16 v0, v0 -; GFX9-NEXT: s_waitcnt -; GFX9-NEXT: s_setpc_b64 - -; VI: ds_read_u16 +; GCN: ds_read_u16 v0, v0 +; GFX900: v_and_b32_e32 v0, 0xffff, v0 +; GFX900: v_lshl_or_b32 v0, v1, 16, v0 +; GFX900: s_setpc_b64 define <2 x i16> @load_local_lo_v2i16_reglo(i16 addrspace(3)* %in, i16 %reg) #0 { entry: %load = load i16, i16 addrspace(3)* %in - %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0 + %build0 = insertelement <2 x i16> undef, i16 %reg, i32 1 %build1 = insertelement <2 x i16> %build0, i16 %load, i32 0 ret <2 x i16> %build1 } @@ -33,17 +33,15 @@ entry: ; Show that we get reasonable regalloc without physreg constraints. ; GCN-LABEL: {{^}}load_local_lo_v2i16_reglo_vreg: ; GCN: s_waitcnt -; GFX9-NEXT: ds_read_u16_d16 v0, v0 -; GFX9-NEXT: s_waitcnt -; GFX9-NEXT: global_store_dword v[0:1], v0, off{{$}} -; GFX9-NEXT: s_waitcnt -; GFX9-NEXT: s_setpc_b64 - -; VI: ds_read_u16 +; GCN: ds_read_u16 v0, v0 +; GCN: s_waitcnt +; GFX900: v_and_b32_e32 v0, 0xffff, v0 +; GFX900: v_lshl_or_b32 v0, v1, 16, v0 +; GFX900: global_store_dword v define void @load_local_lo_v2i16_reglo_vreg(i16 addrspace(3)* %in, i16 %reg) #0 { entry: %load = load i16, i16 addrspace(3)* %in - %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0 + %build0 = insertelement <2 x i16> undef, i16 %reg, i32 1 %build1 = insertelement <2 x i16> %build0, i16 %load, i32 0 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef ret void @@ -51,13 +49,13 @@ entry: ; GCN-LABEL: {{^}}load_local_lo_v2i16_zerolo: ; GCN: s_waitcnt -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: ds_read_u16_d16 v1, v0 -; GFX9-NEXT: s_waitcnt -; GFX9-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-NEXT: s_setpc_b64 +; GFX900-NEXT: 
v_mov_b32_e32 v1, 0 +; GFX900-NEXT: ds_read_u16_d16 v1, v0 +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: s_setpc_b64 -; VI: ds_read_u16 v +; NO-D16-HI: ds_read_u16 v define <2 x i16> @load_local_lo_v2i16_zerolo(i16 addrspace(3)* %in) #0 { entry: %load = load i16, i16 addrspace(3)* %in @@ -67,13 +65,13 @@ entry: ; GCN-LABEL: {{^}}load_local_lo_v2f16_fpimm: ; GCN: s_waitcnt -; GFX9-NEXT: v_mov_b32_e32 v1, 2.0 -; GFX9-NEXT: ds_read_u16_d16 v1, v0 -; GFX9-NEXT: s_waitcnt -; GFX9-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-NEXT: s_setpc_b64 +; GFX900-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX900-NEXT: ds_read_u16_d16 v1, v0 +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: s_setpc_b64 -; VI: ds_read_u16 v +; NO-D16-HI: ds_read_u16 v define <2 x half> @load_local_lo_v2f16_fpimm(half addrspace(3)* %in) #0 { entry: %load = load half, half addrspace(3)* %in @@ -83,13 +81,13 @@ entry: ; GCN-LABEL: {{^}}load_local_lo_v2f16_reghi_vreg: ; GCN: s_waitcnt -; GFX9-NEXT: ds_read_u16_d16 v1, v0 -; GFX9-NEXT: s_waitcnt -; GFX9-NEXT: global_store_dword v[0:1], v1, off{{$}} -; GFX9-NEXT: s_waitcnt -; GFX9-NEXT: s_setpc_b64 +; GFX900-NEXT: ds_read_u16_d16 v1, v0 +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}} +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: s_setpc_b64 -; VI: ds_read_u16 v +; NO-D16-HI: ds_read_u16 v define void @load_local_lo_v2f16_reghi_vreg(half addrspace(3)* %in, i32 %reg) #0 { entry: %reg.bc = bitcast i32 %reg to <2 x half> @@ -98,15 +96,14 @@ entry: store <2 x half> %build1, <2 x half> addrspace(1)* undef ret void } - ; GCN-LABEL: {{^}}load_local_lo_v2f16_reglo_vreg: -; GFX9: ds_read_u16 v -; GFX9: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}} -; GFX9: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16, v{{[0-9]+}} -; GFX9: global_store_dword +; GFX900: ds_read_u16 v +; GFX900: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}} +; GFX900: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16, v{{[0-9]+}} +; GFX900: 
global_store_dword -; VI: ds_read_u16 v +; NO-D16-HI: ds_read_u16 v define void @load_local_lo_v2f16_reglo_vreg(half addrspace(3)* %in, half %reg) #0 { entry: %load = load half, half addrspace(3)* %in @@ -118,13 +115,13 @@ entry: ; GCN-LABEL: {{^}}load_local_lo_v2i16_reghi_vreg_zexti8: ; GCN: s_waitcnt -; GFX9-NEXT: ds_read_u8_d16 v1, v0 -; GFX9-NEXT: s_waitcnt -; GFX9-NEXT: global_store_dword v[0:1], v1, off{{$}} -; GFX9-NEXT: s_waitcnt -; GFX9-NEXT: s_setpc_b64 +; GFX900-NEXT: ds_read_u8_d16 v1, v0 +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}} +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: s_setpc_b64 -; VI: ds_read_u8 v +; NO-D16-HI: ds_read_u8 v define void @load_local_lo_v2i16_reghi_vreg_zexti8(i8 addrspace(3)* %in, i32 %reg) #0 { entry: %reg.bc = bitcast i32 %reg to <2 x i16> @@ -137,12 +134,12 @@ entry: ; GCN-LABEL: {{^}}load_local_lo_v2i16_reglo_vreg_zexti8: ; GCN: s_waitcnt -; GFX9: ds_read_u8 v -; GFX9: global_store_dword -; GFX9-NEXT: s_waitcnt -; GFX9-NEXT: s_setpc_b64 +; GFX900: ds_read_u8 v +; GFX900: global_store_dword +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: s_setpc_b64 -; VI: ds_read_u8 v +; NO-D16-HI: ds_read_u8 v define void @load_local_lo_v2i16_reglo_vreg_zexti8(i8 addrspace(3)* %in, i16 %reg) #0 { entry: %load = load i8, i8 addrspace(3)* %in @@ -155,13 +152,13 @@ entry: ; GCN-LABEL: {{^}}load_local_lo_v2i16_reghi_vreg_sexti8: ; GCN: s_waitcnt -; GFX9-NEXT: ds_read_i8_d16 v1, v0 -; GFX9-NEXT: s_waitcnt -; GFX9-NEXT: global_store_dword v[0:1], v1, off{{$}} -; GFX9-NEXT: s_waitcnt -; GFX9-NEXT: s_setpc_b64 +; GFX900-NEXT: ds_read_i8_d16 v1, v0 +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}} +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: s_setpc_b64 -; VI: ds_read_i8 v +; NO-D16-HI: ds_read_i8 v define void @load_local_lo_v2i16_reghi_vreg_sexti8(i8 addrspace(3)* %in, i32 %reg) #0 { entry: %reg.bc = bitcast i32 %reg to <2 x i16> @@ -174,11 +171,11 @@ entry: ; GCN-LABEL: 
{{^}}load_local_lo_v2i16_reglo_vreg_sexti8: ; GCN: s_waitcnt -; GFX9: ds_read_i8 v -; GFX9: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}} -; GFX9: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16, v{{[0-9]+}} +; GFX900: ds_read_i8 v +; GFX900: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}} +; GFX900: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16, v{{[0-9]+}} -; VI: ds_read_i8 v +; NO-D16-HI: ds_read_i8 v define void @load_local_lo_v2i16_reglo_vreg_sexti8(i8 addrspace(3)* %in, i16 %reg) #0 { entry: %load = load i8, i8 addrspace(3)* %in @@ -189,13 +186,123 @@ entry: ret void } +; GCN-LABEL: {{^}}load_local_lo_v2f16_reglo_vreg_zexti8: +; GCN: s_waitcnt +; GFX900: ds_read_u8 v +; GFX900: global_store_dword +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: s_setpc_b64 + +; NO-D16-HI: ds_read_u8 v +define void @load_local_lo_v2f16_reglo_vreg_zexti8(i8 addrspace(3)* %in, half %reg) #0 { +entry: + %load = load i8, i8 addrspace(3)* %in + %ext = zext i8 %load to i16 + %bitcast = bitcast i16 %ext to half + %build0 = insertelement <2 x half> undef, half %reg, i32 1 + %build1 = insertelement <2 x half> %build0, half %bitcast, i32 0 + store <2 x half> %build1, <2 x half> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}load_local_lo_v2f16_reglo_vreg_sexti8: +; GCN: s_waitcnt +; GFX900: ds_read_i8 v +; GFX900: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}} +; GFX900: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16, v{{[0-9]+}} + +; NO-D16-HI: ds_read_i8 v +define void @load_local_lo_v2f16_reglo_vreg_sexti8(i8 addrspace(3)* %in, half %reg) #0 { +entry: + %load = load i8, i8 addrspace(3)* %in + %ext = sext i8 %load to i16 + %bitcast = bitcast i16 %ext to half + %build0 = insertelement <2 x half> undef, half %reg, i32 1 + %build1 = insertelement <2 x half> %build0, half %bitcast, i32 0 + store <2 x half> %build1, <2 x half> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}load_local_lo_v2i16_reghi_vreg_multi_use_lo: +; GFX900: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900: ds_read_u16 
v0, v0 +; GFX900: v_mov_b32_e32 v3, 0 +; GFX900: v_mov_b32_e32 v2, 0xffff +; GFX900: s_waitcnt lgkmcnt(0) +; GFX900: ds_write_b16 v3, v0 +; GFX900: v_bfi_b32 v0, v2, v0, v1 +; GFX900: global_store_dword v[0:1], v0, off +; GFX900: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX900: s_setpc_b64 s[30:31] + +; NO-D16-HI: ds_read_u16 v +define void @load_local_lo_v2i16_reghi_vreg_multi_use_lo(i16 addrspace(3)* %in, <2 x i16> %reg) #0 { +entry: + %load = load i16, i16 addrspace(3)* %in + %elt1 = extractelement <2 x i16> %reg, i32 1 + store i16 %load, i16 addrspace(3)* null + %build1 = insertelement <2 x i16> %reg, i16 %load, i32 0 + store <2 x i16> %build1, <2 x i16> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}load_local_lo_v2i16_reghi_vreg_multi_use_hi: +; GFX900: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900: v_lshrrev_b32_e32 v2, 16, v1 +; GFX900: ds_read_u16_d16 v1, v0 +; GFX900: v_mov_b32_e32 v0, 0 +; GFX900: ds_write_b16 v0, v2 +; GFX900: s_waitcnt lgkmcnt(1) +; GFX900: global_store_dword v[0:1], v1, off +; GFX900: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX900: s_setpc_b64 s[30:31] + +; NO-D16-HI: ds_read_u16 v +define void @load_local_lo_v2i16_reghi_vreg_multi_use_hi(i16 addrspace(3)* %in, <2 x i16> %reg) #0 { +entry: + %load = load i16, i16 addrspace(3)* %in + %elt1 = extractelement <2 x i16> %reg, i32 1 + store i16 %elt1, i16 addrspace(3)* null + %build1 = insertelement <2 x i16> %reg, i16 %load, i32 0 + store <2 x i16> %build1, <2 x i16> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}load_local_lo_v2i16_reghi_vreg_multi_use_lohi: +; GFX900: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900: ds_read_u16 v0, v0 +; GFX900: v_lshrrev_b32_e32 v4, 16, v1 +; GFX900: s_waitcnt lgkmcnt(0) +; GFX900: ds_write_b16 v2, v0 +; GFX900: ds_write_b16 v3, v4 +; GFX900: v_mov_b32_e32 v2, 0xffff +; GFX900: v_bfi_b32 v0, v2, v0, v1 +; GFX900: global_store_dword v[0:1], v0, off +; GFX900: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX900: s_setpc_b64 s[30:31] + +; NO-D16-HI: ds_read_u16 
v +define void @load_local_lo_v2i16_reghi_vreg_multi_use_lohi(i16 addrspace(3)* noalias %in, <2 x i16> %reg, i16 addrspace(3)* noalias %out0, i16 addrspace(3)* noalias %out1) #0 { +entry: + %load = load i16, i16 addrspace(3)* %in + %elt1 = extractelement <2 x i16> %reg, i32 1 + store i16 %load, i16 addrspace(3)* %out0 + store i16 %elt1, i16 addrspace(3)* %out1 + %build1 = insertelement <2 x i16> %reg, i16 %load, i32 0 + store <2 x i16> %build1, <2 x i16> addrspace(1)* undef + ret void +} + ; GCN-LABEL: {{^}}load_global_lo_v2i16_reglo_vreg: ; GCN: s_waitcnt -; GFX9-NEXT: global_load_short_d16 v2, v[0:1], off offset:-4094 -; GFX9-NEXT: s_waitcnt -; GFX9-NEXT: global_store_dword -; GFX9-NEXT: s_waitcnt -; GFX9-NEXT: s_setpc_b64 +; GFX900-NEXT: global_load_short_d16 v2, v[0:1], off offset:-4094 +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: global_store_dword +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: s_setpc_b64 + +; GFX906: global_load_ushort v0, v[0:1], off offset:-4094 +; GFX906: v_bfi_b32 define void @load_global_lo_v2i16_reglo_vreg(i16 addrspace(1)* %in, i32 %reg) #0 { entry: %reg.bc = bitcast i32 %reg to <2 x i16> @@ -208,11 +315,18 @@ entry: ; GCN-LABEL: {{^}}load_global_lo_v2f16_reglo_vreg: ; GCN: s_waitcnt -; GFX9-NEXT: global_load_short_d16 v2, v[0:1], off offset:-4094 -; GFX9-NEXT: s_waitcnt -; GFX9-NEXT: global_store_dword -; GFX9-NEXT: s_waitcnt -; GFX9-NEXT: s_setpc_b64 +; GFX900-NEXT: global_load_short_d16 v2, v[0:1], off offset:-4094 +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: global_store_dword +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: s_setpc_b64 + +; GFX906: global_load_ushort v0, v[0:1], off offset:-4094 +; GFX906: v_lshrrev_b32 +; GFX906: v_and_b32_e32 +; GFX906: v_lshl_or_b32 + +; GFX803: flat_load_ushort define void @load_global_lo_v2f16_reglo_vreg(half addrspace(1)* %in, i32 %reg) #0 { entry: %reg.bc = bitcast i32 %reg to <2 x half> @@ -225,11 +339,16 @@ entry: ; GCN-LABEL: {{^}}load_global_lo_v2i16_reglo_vreg_zexti8: ; GCN: s_waitcnt -; GFX9-NEXT: 
global_load_ubyte_d16 v2, v[0:1], off offset:-4095 -; GFX9-NEXT: s_waitcnt -; GFX9-NEXT: global_store_dword -; GFX9-NEXT: s_waitcnt -; GFX9-NEXT: s_setpc_b64 +; GFX900-NEXT: global_load_ubyte_d16 v2, v[0:1], off offset:-4095 +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: global_store_dword +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: s_setpc_b64 + +; GFX906: global_load_ubyte v0, v[0:1], off offset:-4095 +; GFX906: v_bfi_b32 + +; GFX803: flat_load_ubyte define void @load_global_lo_v2i16_reglo_vreg_zexti8(i8 addrspace(1)* %in, i32 %reg) #0 { entry: %reg.bc = bitcast i32 %reg to <2 x i16> @@ -243,11 +362,16 @@ entry: ; GCN-LABEL: {{^}}load_global_lo_v2i16_reglo_vreg_sexti8: ; GCN: s_waitcnt -; GFX9-NEXT: global_load_sbyte_d16 v2, v[0:1], off offset:-4095 -; GFX9-NEXT: s_waitcnt -; GFX9-NEXT: global_store_dword -; GFX9-NEXT: s_waitcnt -; GFX9-NEXT: s_setpc_b64 +; GFX900-NEXT: global_load_sbyte_d16 v2, v[0:1], off offset:-4095 +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: global_store_dword +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: s_setpc_b64 + +; GFX906: global_load_sbyte v0, v[0:1], off offset:-4095 +; GFX906: v_bfi_b32 + +; GFX803: flat_load_sbyte define void @load_global_lo_v2i16_reglo_vreg_sexti8(i8 addrspace(1)* %in, i32 %reg) #0 { entry: %reg.bc = bitcast i32 %reg to <2 x i16> @@ -259,16 +383,72 @@ entry: ret void } -; GCN-LABEL: {{^}}load_flat_lo_v2i16_reghi_vreg: +; GCN-LABEL: {{^}}load_global_lo_v2f16_reglo_vreg_zexti8: ; GCN: s_waitcnt -; GFX9-NEXT: flat_load_short_d16 v2, v[0:1] -; GFX9-NEXT: s_waitcnt -; GFX9-NEXT: global_store_dword v[0:1], v2 -; GFX9-NEXT: s_waitcnt -; GFX9-NEXT: s_setpc_b64 +; GFX900-NEXT: global_load_ubyte_d16 v2, v[0:1], off offset:-4095 +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: global_store_dword +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: s_setpc_b64 + +; GFX906: global_load_ubyte v0, v[0:1], off offset:-4095 +; GFX906: v_and_b32_e32 +; GFX906: v_lshl_or_b32 + +; GFX803: flat_load_ubyte +define void @load_global_lo_v2f16_reglo_vreg_zexti8(i8 
addrspace(1)* %in, i32 %reg) #0 { +entry: + %reg.bc = bitcast i32 %reg to <2 x half> + %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095 + %load = load i8, i8 addrspace(1)* %gep + %ext = zext i8 %load to i16 + %bitcast = bitcast i16 %ext to half + %build1 = insertelement <2 x half> %reg.bc, half %bitcast, i32 0 + store <2 x half> %build1, <2 x half> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}load_global_lo_v2f16_reglo_vreg_sexti8: +; GCN: s_waitcnt +; GFX900-NEXT: global_load_sbyte_d16 v2, v[0:1], off offset:-4095 +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: global_store_dword +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: s_setpc_b64 + +; GFX906: global_load_sbyte v0, v[0:1], off offset:-4095 +; GFX906: v_lshrrev_b32 +; GFX906: v_and_b32 +; GFX906: v_lshl_or_b32 + +; GFX803: flat_load_sbyte +define void @load_global_lo_v2f16_reglo_vreg_sexti8(i8 addrspace(1)* %in, i32 %reg) #0 { +entry: + %reg.bc = bitcast i32 %reg to <2 x half> + %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095 + %load = load i8, i8 addrspace(1)* %gep + %ext = sext i8 %load to i16 + %bitcast = bitcast i16 %ext to half + %build1 = insertelement <2 x half> %reg.bc, half %bitcast, i32 0 + store <2 x half> %build1, <2 x half> addrspace(1)* undef + ret void +} -; VI: flat_load_ushort v{{[0-9]+}} -; VI: v_or_b32_e32 +; GCN-LABEL: {{^}}load_flat_lo_v2i16_reghi_vreg: +; GCN: s_waitcnt +; GFX900-NEXT: flat_load_short_d16 v2, v[0:1] +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: global_store_dword v[0:1], v2 +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: s_setpc_b64 + +; GFX803: flat_load_ushort v{{[0-9]+}} +; GFX803: v_or_b32_e32 + +; GFX906: flat_load_ushort [[LOAD:v[0-9]+]] +; GFX906: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff{{$}} +; GFX906: v_bfi_b32 v{{[0-9]+}}, [[MASK]], [[LOAD]], v2 +; GFX906: global_store_dword define void @load_flat_lo_v2i16_reghi_vreg(i16* %in, i32 %reg) #0 { entry: %reg.bc = bitcast i32 %reg to <2 x i16> @@ -280,14 +460,21 @@ entry: ; GCN-LABEL: 
{{^}}load_flat_lo_v2f16_reghi_vreg: ; GCN: s_waitcnt -; GFX9-NEXT: flat_load_short_d16 v2, v[0:1] -; GFX9-NEXT: s_waitcnt -; GFX9-NEXT: global_store_dword v[0:1], v2 -; GFX9-NEXT: s_waitcnt -; GFX9-NEXT: s_setpc_b64 - -; VI: flat_load_ushort v{{[0-9]+}} -; VI: v_or_b32_e32 +; GFX900-NEXT: flat_load_short_d16 v2, v[0:1] +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: global_store_dword v[0:1], v2 +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: s_setpc_b64 + +; GFX803: flat_load_ushort v{{[0-9]+}} +; GFX803: v_or_b32_e32 + +; FIXME: and should be removable +; GFX906: flat_load_ushort [[LOAD:v[0-9]+]] +; GFX906: v_lshrrev_b32_e32 [[SHR:v[0-9]+]], 16, v2 +; GFX906: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, [[LOAD]] +; GFX906: v_lshl_or_b32 [[LSHL_OR:v[0-9]+]], [[SHR]], 16, [[AND]] +; GFX906: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[LSHL_OR]] define void @load_flat_lo_v2f16_reghi_vreg(half* %in, i32 %reg) #0 { entry: %reg.bc = bitcast i32 %reg to <2 x half> @@ -299,17 +486,20 @@ entry: ; GCN-LABEL: {{^}}load_flat_lo_v2i16_reglo_vreg_zexti8: ; GCN: s_waitcnt -; GFX9-NEXT: flat_load_ubyte_d16 v2, v[0:1] -; GFX9-NEXT: s_waitcnt -; GFX9-NEXT: global_store_dword v[0:1], v2 -; GFX9-NEXT: s_waitcnt -; GFX9-NEXT: s_setpc_b64 - -; VI: flat_load_ubyte [[LO:v[0-9]+]] -; VI: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, v2 -; VI: s_mov_b32 [[MASK:s[0-9]+]], 0x5040c00 -; VI: v_perm_b32 [[RES:v[0-9]+]], [[HI]], [[LO]], [[MASK]] -; VI: flat_store_dword v[0:1], [[RES]] +; GFX900-NEXT: flat_load_ubyte_d16 v2, v[0:1] +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: global_store_dword v[0:1], v2 +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: s_setpc_b64 + +; GFX803: flat_load_ubyte [[LO:v[0-9]+]] +; GFX803: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, v2 +; GFX803: s_mov_b32 [[MASK:s[0-9]+]], 0x5040c00 +; GFX803: v_perm_b32 [[RES:v[0-9]+]], [[HI]], [[LO]], [[MASK]] +; GFX803: flat_store_dword v[0:1], [[RES]] + +; GFX906: flat_load_ubyte +; GFX906: v_bfi_b32 define void @load_flat_lo_v2i16_reglo_vreg_zexti8(i8* %in, i32 %reg) 
#0 { entry: %reg.bc = bitcast i32 %reg to <2 x i16> @@ -322,15 +512,17 @@ entry: ; GCN-LABEL: {{^}}load_flat_lo_v2i16_reglo_vreg_sexti8: ; GCN: s_waitcnt -; GFX9-NEXT: flat_load_sbyte_d16 v2, v[0:1] -; GFX9-NEXT: s_waitcnt -; GFX9-NEXT: global_store_dword v[0:1], v2 -; GFX9-NEXT: s_waitcnt -; GFX9-NEXT: s_setpc_b64 +; GFX900-NEXT: flat_load_sbyte_d16 v2, v[0:1] +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: global_store_dword v[0:1], v2 +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: s_setpc_b64 -; VI: flat_load_sbyte v{{[0-9]+}} -; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX803: flat_load_sbyte v{{[0-9]+}} +; GFX803: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906: flat_load_sbyte +; GFX906: v_bfi_b32 define void @load_flat_lo_v2i16_reglo_vreg_sexti8(i8* %in, i32 %reg) #0 { entry: %reg.bc = bitcast i32 %reg to <2 x i16> @@ -341,15 +533,70 @@ entry: ret void } +; GCN-LABEL: {{^}}load_flat_lo_v2f16_reglo_vreg_zexti8: +; GCN: s_waitcnt +; GFX900-NEXT: flat_load_ubyte_d16 v2, v[0:1] +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: global_store_dword v[0:1], v2 +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: s_setpc_b64 + +; GFX803: flat_load_ubyte [[LO:v[0-9]+]] +; GFX803: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, v2 +; GFX803: s_mov_b32 [[MASK:s[0-9]+]], 0x5040c00 +; GFX803: v_perm_b32 [[RES:v[0-9]+]], [[HI]], [[LO]], [[MASK]] +; GFX803: flat_store_dword v[0:1], [[RES]] + +; GFX906: flat_load_ubyte +; GFX906: v_lshrrev_b32 +; GFX906: v_and_b32 +; GFX906: v_lshl_or_b32 +define void @load_flat_lo_v2f16_reglo_vreg_zexti8(i8* %in, i32 %reg) #0 { +entry: + %reg.bc = bitcast i32 %reg to <2 x half> + %load = load i8, i8* %in + %ext = zext i8 %load to i16 + %bitcast = bitcast i16 %ext to half + %build1 = insertelement <2 x half> %reg.bc, half %bitcast, i32 0 + store <2 x half> %build1, <2 x half> addrspace(1)* undef + ret void +} + +; 
GCN-LABEL: {{^}}load_flat_lo_v2f16_reglo_vreg_sexti8: +; GCN: s_waitcnt +; GFX900-NEXT: flat_load_sbyte_d16 v2, v[0:1] +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: global_store_dword v[0:1], v2 +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: s_setpc_b64 + +; GFX803: flat_load_sbyte v{{[0-9]+}} +; GFX803: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD + +; GFX906: flat_load_sbyte +; GFX906: v_lshrrev_b32 +; GFX906: v_and_b32 +; GFX906: v_lshl_or_b32 +define void @load_flat_lo_v2f16_reglo_vreg_sexti8(i8* %in, i32 %reg) #0 { +entry: + %reg.bc = bitcast i32 %reg to <2 x half> + %load = load i8, i8* %in + %ext = sext i8 %load to i16 + %bitcast = bitcast i16 %ext to half + %build1 = insertelement <2 x half> %reg.bc, half %bitcast, i32 0 + store <2 x half> %build1, <2 x half> addrspace(1)* undef + ret void +} + ; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg: ; GCN: s_waitcnt -; GFX9: buffer_load_short_d16 v0, off, s[0:3], s5 offset:4094{{$}} -; GFX9-NEXT: s_waitcnt -; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 -; GFX9-NEXT: s_waitcnt -; GFX9-NEXT: s_setpc_b64 +; GFX900: buffer_load_short_d16 v0, off, s[0:3], s5 offset:4094{{$}} +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: s_setpc_b64 -; VI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s5 offset:4094{{$}} +; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s5 offset:4094{{$}} define void @load_private_lo_v2i16_reglo_vreg(i16 addrspace(5)* byval %in, i32 %reg) #0 { entry: %reg.bc = bitcast i32 %reg to <2 x i16> @@ -362,16 +609,16 @@ entry: ; GCN-LABEL: {{^}}load_private_lo_v2i16_reghi_vreg: ; GCN: s_waitcnt -; GFX9: buffer_load_ushort v1, off, s[0:3], s5 offset:4094{{$}} -; GFX9-NEXT: s_waitcnt -; GFX9: v_and_b32 -; GFX9: v_lshl_or_b32 +; GFX900: buffer_load_ushort v1, off, s[0:3], s5 offset:4094{{$}} +; GFX900-NEXT: s_waitcnt +; GFX900: v_and_b32 +; 
GFX900: v_lshl_or_b32 -; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} -; GFX9-NEXT: s_waitcnt -; GFX9-NEXT: s_setpc_b64 +; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: s_setpc_b64 -; VI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s5 offset:4094{{$}} +; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s5 offset:4094{{$}} define void @load_private_lo_v2i16_reghi_vreg(i16 addrspace(5)* byval %in, i16 %reg) #0 { entry: %gep = getelementptr inbounds i16, i16 addrspace(5)* %in, i64 2045 @@ -384,13 +631,13 @@ entry: ; GCN-LABEL: {{^}}load_private_lo_v2f16_reglo_vreg: ; GCN: s_waitcnt -; GFX9: buffer_load_short_d16 v0, off, s[0:3], s5 offset:4094{{$}} -; GFX9-NEXT: s_waitcnt -; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 -; GFX9-NEXT: s_waitcnt -; GFX9-NEXT: s_setpc_b64 +; GFX900: buffer_load_short_d16 v0, off, s[0:3], s5 offset:4094{{$}} +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: s_setpc_b64 -; VI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s5 offset:4094{{$}} +; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s5 offset:4094{{$}} define void @load_private_lo_v2f16_reglo_vreg(half addrspace(5)* byval %in, i32 %reg) #0 { entry: %reg.bc = bitcast i32 %reg to <2 x half> @@ -403,13 +650,13 @@ entry: ; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_nooff: ; GCN: s_waitcnt -; GFX9-NEXT: buffer_load_short_d16 v1, off, s[0:3], s4 offset:4094{{$}} -; GFX9-NEXT: s_waitcnt -; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 -; GFX9-NEXT: s_waitcnt -; GFX9-NEXT: s_setpc_b64 +; GFX900-NEXT: buffer_load_short_d16 v1, off, s[0:3], s4 offset:4094{{$}} +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: s_setpc_b64 -; VI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s4 offset:4094{{$}} +; NO-D16-HI: 
buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s4 offset:4094{{$}} define void @load_private_lo_v2i16_reglo_vreg_nooff(i16 addrspace(5)* %in, i32 %reg) #0 { entry: %reg.bc = bitcast i32 %reg to <2 x i16> @@ -421,13 +668,13 @@ entry: ; GCN-LABEL: {{^}}load_private_lo_v2i16_reghi_vreg_nooff: ; GCN: s_waitcnt -; GFX9-NEXT: buffer_load_short_d16 v1, off, s[0:3], s4 offset:4094{{$}} -; GFX9-NEXT: s_waitcnt -; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 -; GFX9-NEXT: s_waitcnt -; GFX9-NEXT: s_setpc_b64 +; GFX900-NEXT: buffer_load_short_d16 v1, off, s[0:3], s4 offset:4094{{$}} +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: s_setpc_b64 -; VI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s4 offset:4094{{$}} +; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s4 offset:4094{{$}} define void @load_private_lo_v2i16_reghi_vreg_nooff(i16 addrspace(5)* %in, i32 %reg) #0 { entry: %reg.bc = bitcast i32 %reg to <2 x i16> @@ -439,13 +686,13 @@ entry: ; GCN-LABEL: {{^}}load_private_lo_v2f16_reglo_vreg_nooff: ; GCN: s_waitcnt -; GFX9-NEXT: buffer_load_short_d16 v1, off, s[0:3], s4 offset:4094{{$}} -; GFX9-NEXT: s_waitcnt -; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 -; GFX9-NEXT: s_waitcnt -; GFX9-NEXT: s_setpc_b64 +; GFX900-NEXT: buffer_load_short_d16 v1, off, s[0:3], s4 offset:4094{{$}} +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: s_setpc_b64 -; VI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s4 offset:4094{{$}} +; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s4 offset:4094{{$}} define void @load_private_lo_v2f16_reglo_vreg_nooff(half addrspace(5)* %in, i32 %reg) #0 { entry: %reg.bc = bitcast i32 %reg to <2 x half> @@ -457,13 +704,13 @@ entry: ; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_zexti8: ; GCN: s_waitcnt -; GFX9: buffer_load_ubyte_d16 v0, off, s[0:3], s5 
offset:4095{{$}} -; GFX9-NEXT: s_waitcnt -; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 -; GFX9-NEXT: s_waitcnt -; GFX9-NEXT: s_setpc_b64 +; GFX900: buffer_load_ubyte_d16 v0, off, s[0:3], s5 offset:4095{{$}} +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: s_setpc_b64 -; VI: buffer_load_ubyte v{{[0-9]+}}, off, s[0:3], s5 offset:4095{{$}} +; NO-D16-HI: buffer_load_ubyte v{{[0-9]+}}, off, s[0:3], s5 offset:4095{{$}} define void @load_private_lo_v2i16_reglo_vreg_zexti8(i8 addrspace(5)* byval %in, i32 %reg) #0 { entry: %reg.bc = bitcast i32 %reg to <2 x i16> @@ -477,13 +724,13 @@ entry: ; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_sexti8: ; GCN: s_waitcnt -; GFX9: buffer_load_sbyte_d16 v0, off, s[0:3], s5 offset:4095{{$}} -; GFX9-NEXT: s_waitcnt -; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 -; GFX9-NEXT: s_waitcnt -; GFX9-NEXT: s_setpc_b64 +; GFX900: buffer_load_sbyte_d16 v0, off, s[0:3], s5 offset:4095{{$}} +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: s_setpc_b64 -; VI: buffer_load_sbyte v{{[0-9]+}}, off, s[0:3], s5 offset:4095{{$}} +; NO-D16-HI: buffer_load_sbyte v{{[0-9]+}}, off, s[0:3], s5 offset:4095{{$}} define void @load_private_lo_v2i16_reglo_vreg_sexti8(i8 addrspace(5)* byval %in, i32 %reg) #0 { entry: %reg.bc = bitcast i32 %reg to <2 x i16> @@ -497,13 +744,13 @@ entry: ; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_nooff_zexti8: ; GCN: s_waitcnt -; GFX9-NEXT: buffer_load_ubyte_d16 v1, off, s[0:3], s4 offset:4094{{$}} -; GFX9-NEXT: s_waitcnt -; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 -; GFX9-NEXT: s_waitcnt -; GFX9-NEXT: s_setpc_b64 +; GFX900-NEXT: buffer_load_ubyte_d16 v1, off, s[0:3], s4 offset:4094{{$}} +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: 
s_setpc_b64 -; VI: buffer_load_ubyte v0, off, s[0:3], s4 offset:4094{{$}} +; NO-D16-HI: buffer_load_ubyte v0, off, s[0:3], s4 offset:4094{{$}} define void @load_private_lo_v2i16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, i32 %reg) #0 { entry: %reg.bc = bitcast i32 %reg to <2 x i16> @@ -516,13 +763,13 @@ entry: ; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_nooff_sexti8: ; GCN: s_waitcnt -; GFX9-NEXT: buffer_load_sbyte_d16 v1, off, s[0:3], s4 offset:4094{{$}} -; GFX9-NEXT: s_waitcnt -; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 -; GFX9-NEXT: s_waitcnt -; GFX9-NEXT: s_setpc_b64 +; GFX900-NEXT: buffer_load_sbyte_d16 v1, off, s[0:3], s4 offset:4094{{$}} +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: s_setpc_b64 -; VI: buffer_load_sbyte v0, off, s[0:3], s4 offset:4094{{$}} +; NO-D16-HI: buffer_load_sbyte v0, off, s[0:3], s4 offset:4094{{$}} define void @load_private_lo_v2i16_reglo_vreg_nooff_sexti8(i8 addrspace(5)* %in, i32 %reg) #0 { entry: %reg.bc = bitcast i32 %reg to <2 x i16> @@ -535,13 +782,13 @@ entry: ; GCN-LABEL: {{^}}load_private_lo_v2f16_reglo_vreg_nooff_zexti8: ; GCN: s_waitcnt -; GFX9-NEXT: buffer_load_ubyte_d16 v1, off, s[0:3], s4 offset:4094{{$}} -; GFX9-NEXT: s_waitcnt -; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 -; GFX9-NEXT: s_waitcnt -; GFX9-NEXT: s_setpc_b64 +; GFX900-NEXT: buffer_load_ubyte_d16 v1, off, s[0:3], s4 offset:4094{{$}} +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: s_setpc_b64 -; VI: buffer_load_ubyte v0, off, s[0:3], s4 offset:4094{{$}} +; NO-D16-HI: buffer_load_ubyte v0, off, s[0:3], s4 offset:4094{{$}} define void @load_private_lo_v2f16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, i32 %reg) #0 { entry: %reg.bc = bitcast i32 %reg to <2 x half> @@ -555,13 +802,15 @@ entry: ; GCN-LABEL: {{^}}load_constant_lo_v2i16_reglo_vreg: ; GCN: 
s_waitcnt -; GFX9-NEXT: global_load_short_d16 v2, v[0:1], off offset:-4094 -; GFX9-NEXT: s_waitcnt -; GFX9-NEXT: global_store_dword -; GFX9-NEXT: s_waitcnt -; GFX9-NEXT: s_setpc_b64 +; GFX900-NEXT: global_load_short_d16 v2, v[0:1], off offset:-4094 +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: global_store_dword +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: s_setpc_b64 + +; GFX803: flat_load_ushort -; VI: flat_load_ushort +; GFX906: global_load_ushort define void @load_constant_lo_v2i16_reglo_vreg(i16 addrspace(4)* %in, i32 %reg) #0 { entry: %reg.bc = bitcast i32 %reg to <2 x i16> @@ -574,13 +823,15 @@ entry: ; GCN-LABEL: load_constant_lo_v2f16_reglo_vreg ; GCN: s_waitcnt -; GFX9-NEXT: global_load_short_d16 v2, v[0:1], off offset:-4094 -; GFX9-NEXT: s_waitcnt -; GFX9-NEXT: global_store_dword -; GFX9-NEXT: s_waitcnt -; GFX9-NEXT: s_setpc_b64 +; GFX900-NEXT: global_load_short_d16 v2, v[0:1], off offset:-4094 +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: global_store_dword +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: s_setpc_b64 -; VI: flat_load_ushort +; GFX803: flat_load_ushort + +; GFX906: global_load_ushort define void @load_constant_lo_v2f16_reglo_vreg(half addrspace(4)* %in, i32 %reg) #0 { entry: %reg.bc = bitcast i32 %reg to <2 x half> @@ -591,11 +842,62 @@ entry: ret void } +; GCN-LABEL: {{^}}load_constant_lo_v2f16_reglo_vreg_zexti8: +; GCN: s_waitcnt +; GFX900-NEXT: global_load_ubyte_d16 v2, v[0:1], off offset:-4095 +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: global_store_dword +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: s_setpc_b64 + +; GFX906: global_load_ubyte v0, v[0:1], off offset:-4095 +; GFX906: v_and_b32_e32 +; GFX906: v_lshl_or_b32 + +; GFX803: flat_load_ubyte +define void @load_constant_lo_v2f16_reglo_vreg_zexti8(i8 addrspace(4)* %in, i32 %reg) #0 { +entry: + %reg.bc = bitcast i32 %reg to <2 x half> + %gep = getelementptr inbounds i8, i8 addrspace(4)* %in, i64 -4095 + %load = load i8, i8 addrspace(4)* %gep + %ext = zext i8 %load to i16 + %bitcast = bitcast i16 %ext to 
half + %build1 = insertelement <2 x half> %reg.bc, half %bitcast, i32 0 + store <2 x half> %build1, <2 x half> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}load_constant_lo_v2f16_reglo_vreg_sexti8: +; GCN: s_waitcnt +; GFX900-NEXT: global_load_sbyte_d16 v2, v[0:1], off offset:-4095 +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: global_store_dword +; GFX900-NEXT: s_waitcnt +; GFX900-NEXT: s_setpc_b64 + +; GFX906: global_load_sbyte v0, v[0:1], off offset:-4095 +; GFX906: v_lshrrev_b32 +; GFX906: v_and_b32 +; GFX906: v_lshl_or_b32 + +; GFX803: flat_load_sbyte +define void @load_constant_lo_v2f16_reglo_vreg_sexti8(i8 addrspace(4)* %in, i32 %reg) #0 { +entry: + %reg.bc = bitcast i32 %reg to <2 x half> + %gep = getelementptr inbounds i8, i8 addrspace(4)* %in, i64 -4095 + %load = load i8, i8 addrspace(4)* %gep + %ext = sext i8 %load to i16 + %bitcast = bitcast i16 %ext to half + %build1 = insertelement <2 x half> %reg.bc, half %bitcast, i32 0 + store <2 x half> %build1, <2 x half> addrspace(1)* undef + ret void +} + ; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_to_offset: -; GFX9: buffer_store_dword -; GFX9-NEXT: buffer_load_short_d16 v0, off, s[0:3], s5 offset:4094 +; GFX900: buffer_store_dword +; GFX900-NEXT: buffer_load_short_d16 v0, off, s[0:3], s5 offset:4094 -; VI: buffer_load_ushort v +; NO-D16-HI: buffer_load_ushort v define void @load_private_lo_v2i16_reglo_vreg_to_offset(i32 %reg) #0 { entry: %obj0 = alloca [10 x i32], align 4, addrspace(5) @@ -611,10 +913,10 @@ entry: } ; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_sexti8_to_offset: -; GFX9: buffer_store_dword -; GFX9-NEXT: buffer_load_sbyte_d16 v0, off, s[0:3], s5 offset:4095 +; GFX900: buffer_store_dword +; GFX900-NEXT: buffer_load_sbyte_d16 v0, off, s[0:3], s5 offset:4095 -; VI: buffer_load_sbyte v +; NO-D16-HI: buffer_load_sbyte v define void @load_private_lo_v2i16_reglo_vreg_sexti8_to_offset(i32 %reg) #0 { entry: %obj0 = alloca [10 x i32], align 4, addrspace(5) @@ -631,10 +933,10 @@ entry: } 
; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_zexti8_to_offset: -; GFX9: buffer_store_dword -; GFX9-NEXT: buffer_load_ubyte_d16 v0, off, s[0:3], s5 offset:4095 +; GFX900: buffer_store_dword +; GFX900-NEXT: buffer_load_ubyte_d16 v0, off, s[0:3], s5 offset:4095 -; VI: buffer_load_ubyte v +; NO-D16-HI: buffer_load_ubyte v define void @load_private_lo_v2i16_reglo_vreg_zexti8_to_offset(i32 %reg) #0 { entry: %obj0 = alloca [10 x i32], align 4, addrspace(5) @@ -650,4 +952,46 @@ entry: ret void } +; GCN-LABEL: {{^}}load_private_lo_v2f16_reglo_vreg_sexti8_to_offset: +; GFX900: buffer_store_dword +; GFX900-NEXT: buffer_load_sbyte_d16 v0, off, s[0:3], s5 offset:4095 + +; NO-D16-HI: buffer_load_sbyte v +define void @load_private_lo_v2f16_reglo_vreg_sexti8_to_offset(i32 %reg) #0 { +entry: + %obj0 = alloca [10 x i32], align 4, addrspace(5) + %obj1 = alloca [4096 x i8], align 2, addrspace(5) + %reg.bc = bitcast i32 %reg to <2 x half> + %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)* + store volatile i32 123, i32 addrspace(5)* %bc + %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4051 + %load = load volatile i8, i8 addrspace(5)* %gep + %load.ext = sext i8 %load to i16 + %bitcast = bitcast i16 %load.ext to half + %build1 = insertelement <2 x half> %reg.bc, half %bitcast, i32 0 + store <2 x half> %build1, <2 x half> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}load_private_lo_v2f16_reglo_vreg_zexti8_to_offset: +; GFX900: buffer_store_dword +; GFX900-NEXT: buffer_load_ubyte_d16 v0, off, s[0:3], s5 offset:4095 + +; NO-D16-HI: buffer_load_ubyte v +define void @load_private_lo_v2f16_reglo_vreg_zexti8_to_offset(i32 %reg) #0 { +entry: + %obj0 = alloca [10 x i32], align 4, addrspace(5) + %obj1 = alloca [4096 x i8], align 2, addrspace(5) + %reg.bc = bitcast i32 %reg to <2 x half> + %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)* + store volatile i32 123, i32 addrspace(5)* %bc + %gep = getelementptr 
inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4051 + %load = load volatile i8, i8 addrspace(5)* %gep + %load.ext = zext i8 %load to i16 + %bitcast = bitcast i16 %load.ext to half + %build1 = insertelement <2 x half> %reg.bc, half %bitcast, i32 0 + store <2 x half> %build1, <2 x half> addrspace(1)* undef + ret void +} + attributes #0 = { nounwind }