From: Jeffrey Byrnes
Date: Thu, 2 Mar 2023 00:29:03 +0000 (-0800)
Subject: [AMDGPU] Vectorize misaligned global loads & stores
X-Git-Tag: upstream/17.0.6~15883
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=b89236a96f2f2f3e9b88d198585a8eda7fb2c443;p=platform%2Fupstream%2Fllvm.git

[AMDGPU] Vectorize misaligned global loads & stores

Based on experimentation on gfx906, gfx908, gfx90a, and gfx1030, wider global
loads / stores are more performant than multiple narrower ones, independent of
alignment -- this is especially true when combining 8-bit loads / stores, where
the speedup was usually 2x across all alignments.

Differential Revision: https://reviews.llvm.org/D145170

Change-Id: I6ee6c76e6ace7fc373cc1b2aac3818fc1425a0c1
---

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index a345b9d..50c7acd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -427,6 +427,12 @@ inline bool isFlatGlobalAddrSpace(unsigned AS) {
          AS == AMDGPUAS::CONSTANT_ADDRESS || AS > AMDGPUAS::MAX_AMDGPU_ADDRESS;
 }
+
+inline bool isExtendedGlobalAddrSpace(unsigned AS) {
+  return AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::CONSTANT_ADDRESS ||
+         AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
+         AS > AMDGPUAS::MAX_AMDGPU_ADDRESS;
+}
 }
 } // End namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 73d1c2e..e8a79ac 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1546,18 +1546,14 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
     return AlignedBy4;
   }
 
-  if (Subtarget->hasUnalignedBufferAccessEnabled()) {
-    // If we have a uniform constant load, it still requires using a slow
-    // buffer instruction if unaligned.
-    if (IsFast) {
-      // Accesses can really be issued as 1-byte aligned or 4-byte aligned, so
-      // 2-byte alignment is worse than 1 unless doing a 2-byte access.
-      *IsFast = (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
-                 AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ?
-        Alignment >= Align(4) : Alignment != Align(2);
-    }
+  // So long as they are correct, wide global memory operations perform better
+  // than multiple smaller memory ops -- even when misaligned
+  if (AMDGPU::isExtendedGlobalAddrSpace(AddrSpace)) {
+    if (IsFast)
+      *IsFast = Size;
 
-    return true;
+    return Alignment >= Align(4) ||
+           Subtarget->hasUnalignedBufferAccessEnabled();
   }
 
   // Smaller than dword value must be aligned.
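To illustrate the effect of the new predicate, here is a minimal IR sketch in the style of the tests below; the function name and RUN invocation are illustrative, not part of the patch. Before this change, the two align-2 loads were selected as two ushort loads plus a shift and an or; with isExtendedGlobalAddrSpace accepting the global address space, they can be merged and selected as a single dword load on subtargets where unaligned access is enabled:

; A minimal sketch, assuming an invocation like the RUN lines in the tests,
; e.g. llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s
define i32 @load_2xi16_align2_sketch(ptr addrspace(1) %p) {
  %gep = getelementptr i16, ptr addrspace(1) %p, i64 1
  %lo = load i16, ptr addrspace(1) %p, align 2     ; i16 at offset 0
  %hi = load i16, ptr addrspace(1) %gep, align 2   ; adjacent i16 at offset 2
  %lo.ext = zext i16 %lo to i32
  %hi.ext = zext i16 %hi to i32
  %hi.shl = shl i32 %hi.ext, 16
  %val = or i32 %lo.ext, %hi.shl                   ; recombine into one i32
  ret i32 %val
}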
diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
index f1542f5..8a38e08 100644
--- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
+++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
@@ -23,45 +23,31 @@ define i32 @global_load_2xi16_align2(ptr addrspace(1) %p) #0 {
 ; GFX7-UNALIGNED-LABEL: global_load_2xi16_align2:
 ; GFX7-UNALIGNED:       ; %bb.0:
 ; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-UNALIGNED-NEXT:    v_add_i32_e32 v2, vcc, 2, v0
-; GFX7-UNALIGNED-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; GFX7-UNALIGNED-NEXT:    flat_load_ushort v2, v[2:3]
-; GFX7-UNALIGNED-NEXT:    flat_load_ushort v0, v[0:1]
-; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(1)
-; GFX7-UNALIGNED-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; GFX7-UNALIGNED-NEXT:    flat_load_dword v0, v[0:1]
 ; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-UNALIGNED-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7-UNALIGNED-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_load_2xi16_align2:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v2, v[0:1], off
-; GFX9-NEXT:    global_load_ushort v3, v[0:1], off offset:2
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshl_or_b32 v0, v3, 16, v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: global_load_2xi16_align2:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_ushort v2, v[0:1], off
-; GFX10-NEXT:    global_load_ushort v3, v[0:1], off offset:2
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshl_or_b32 v0, v3, 16, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: global_load_2xi16_align2:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_load_u16 v2, v[0:1], off
-; GFX11-NEXT:    global_load_u16 v0, v[0:1], off offset:2
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshl_or_b32 v0, v0, 16, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %gep.p = getelementptr i16, ptr addrspace(1) %p, i64 1
   %p.0 = load i16, ptr addrspace(1) %p, align 2
@@ -94,50 +80,37 @@ define amdgpu_kernel void @global_store_2xi16_align2(ptr addrspace(1) %p, ptr ad
 ; GFX7-UNALIGNED-LABEL: global_store_2xi16_align2:
 ; GFX7-UNALIGNED:       ; %bb.0:
 ; GFX7-UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-UNALIGNED-NEXT:    v_mov_b32_e32 v2, 1
+; GFX7-UNALIGNED-NEXT:    v_mov_b32_e32 v2, 0x20001
 ; GFX7-UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-UNALIGNED-NEXT:    s_add_u32 s2, s0, 2
 ; GFX7-UNALIGNED-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-UNALIGNED-NEXT:    s_addc_u32 s3, s1, 0
-; GFX7-UNALIGNED-NEXT:    flat_store_short v[0:1], v2
-; GFX7-UNALIGNED-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-UNALIGNED-NEXT:    v_mov_b32_e32 v2, 2
-; GFX7-UNALIGNED-NEXT:    v_mov_b32_e32 v1, s3
-; GFX7-UNALIGNED-NEXT:    flat_store_short v[0:1], v2
+; GFX7-UNALIGNED-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-UNALIGNED-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: global_store_2xi16_align2:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-NEXT:    v_mov_b32_e32 v1, 1
-; GFX9-NEXT:    v_mov_b32_e32 v2, 2
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x20001
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
-; GFX9-NEXT:    global_store_short v0, v2, s[0:1] offset:2
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: global_store_2xi16_align2:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10-NEXT:    v_mov_b32_e32 v1, 1
-; GFX10-NEXT:    v_mov_b32_e32 v2, 2
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0x20001
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
-; GFX10-NEXT:    global_store_short v0, v2, s[0:1] offset:2
+; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: global_store_2xi16_align2:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
-; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 1
-; GFX11-NEXT:    v_mov_b32_e32 v2, 2
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x20001
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    global_store_b16 v0, v2, s[0:1] offset:2
+; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %gep.r = getelementptr i16, ptr addrspace(1) %r, i64 1
diff --git a/llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll b/llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll
new file mode 100644
index 0000000..b8ecbae
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll
@@ -0,0 +1,229 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX908 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1030 %s
+
+
+; Function Attrs: mustprogress nounwind willreturn
+define amdgpu_kernel void @half8(ptr addrspace(1) nocapture readonly %0, ptr addrspace(1) nocapture writeonly %1) local_unnamed_addr #0 {
+; GFX908-LABEL: half8:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX908-NEXT:    v_mov_b32_e32 v4, 0
+; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX908-NEXT:    global_load_dwordx4 v[0:3], v4, s[0:1]
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
+; GFX908-NEXT:    s_endpgm
+;
+; GFX90A-LABEL: half8:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    global_load_dwordx4 v[0:3], v4, s[0:1]
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
+; GFX90A-NEXT:    s_endpgm
+;
+; GFX1030-LABEL: half8:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1030-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1030-NEXT:    global_load_dwordx4 v[0:3], v4, s[0:1]
+; GFX1030-NEXT:    s_waitcnt vmcnt(0)
+; GFX1030-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
+; GFX1030-NEXT:    s_endpgm
+  %gep0 = getelementptr half, ptr addrspace(1) %0, i64 0
+  %gep1 = getelementptr half, ptr addrspace(1) %0, i64 1
+  %gep2 = getelementptr half, ptr addrspace(1) %0, i64 2
+  %gep3 = getelementptr half, ptr addrspace(1) %0, i64 3
+  %gep4 = getelementptr half, ptr addrspace(1) %0, i64 4
+  %gep5 = getelementptr half, ptr addrspace(1) %0, i64 5
+  %gep6 = getelementptr half, ptr addrspace(1) %0, i64 6
+  %gep7 = getelementptr half, ptr addrspace(1) %0, i64 7
+  %l0 = load half, ptr addrspace(1) %gep0, align 2
+  %l1 = load half, ptr addrspace(1) %gep1, align 2
+  %l2 = load half, ptr addrspace(1) %gep2, align 2
+  %l3 = load half, ptr addrspace(1) %gep3, align 2
+  %l4 = load half, ptr addrspace(1) %gep4, align 2
+  %l5 = load half, ptr addrspace(1) %gep5, align 2
+  %l6 = load half, ptr addrspace(1) %gep6, align 2
+  %l7 = load half, ptr addrspace(1) %gep7, align 2
+  %sgep0 = getelementptr half, ptr addrspace(1) %1, i64 0
+  %sgep1 = getelementptr half, ptr addrspace(1) %1, i64 1
+  %sgep2 = getelementptr half, ptr addrspace(1) %1, i64 2
+  %sgep3 = getelementptr half, ptr addrspace(1) %1, i64 3
+  %sgep4 = getelementptr half, ptr addrspace(1) %1, i64 4
+  %sgep5 = getelementptr half, ptr addrspace(1) %1, i64 5
+  %sgep6 = getelementptr half, ptr addrspace(1) %1, i64 6
+  %sgep7 = getelementptr half, ptr addrspace(1) %1, i64 7
+  store half %l0, ptr addrspace(1) %sgep0, align 2
+  store half %l1, ptr addrspace(1) %sgep1, align 2
+  store half %l2, ptr addrspace(1) %sgep2, align 2
+  store half %l3, ptr addrspace(1) %sgep3, align 2
+  store half %l4, ptr addrspace(1) %sgep4, align 2
+  store half %l5, ptr addrspace(1) %sgep5, align 2
+  store half %l6, ptr addrspace(1) %sgep6, align 2
+  store half %l7, ptr addrspace(1) %sgep7, align 2
+  ret void
+}
+
+; Function Attrs: mustprogress nounwind willreturn
+define amdgpu_kernel void @half6(ptr addrspace(1) nocapture readonly %0, ptr addrspace(1) nocapture writeonly %1) local_unnamed_addr #0 {
+; GFX908-LABEL: half6:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX908-NEXT:    v_mov_b32_e32 v3, 0
+; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX908-NEXT:    global_load_dwordx3 v[0:2], v3, s[0:1]
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3]
+; GFX908-NEXT:    s_endpgm
+;
+; GFX90A-LABEL: half6:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    global_load_dwordx3 v[0:2], v3, s[0:1]
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3]
+; GFX90A-NEXT:    s_endpgm
+;
+; GFX1030-LABEL: half6:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1030-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1030-NEXT:    global_load_dwordx3 v[0:2], v3, s[0:1]
+; GFX1030-NEXT:    s_waitcnt vmcnt(0)
+; GFX1030-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3]
+; GFX1030-NEXT:    s_endpgm
+  %gep0 = getelementptr half, ptr addrspace(1) %0, i64 0
+  %gep1 = getelementptr half, ptr addrspace(1) %0, i64 1
+  %gep2 = getelementptr half, ptr addrspace(1) %0, i64 2
+  %gep3 = getelementptr half, ptr addrspace(1) %0, i64 3
+  %gep4 = getelementptr half, ptr addrspace(1) %0, i64 4
+  %gep5 = getelementptr half, ptr addrspace(1) %0, i64 5
+  %l0 = load half, ptr addrspace(1) %gep0, align 1
+  %l1 = load half, ptr addrspace(1) %gep1, align 1
+  %l2 = load half, ptr addrspace(1) %gep2, align 1
+  %l3 = load half, ptr addrspace(1) %gep3, align 1
+  %l4 = load half, ptr addrspace(1) %gep4, align 1
+  %l5 = load half, ptr addrspace(1) %gep5, align 1
+  %sgep0 = getelementptr half, ptr addrspace(1) %1, i64 0
+  %sgep1 = getelementptr half, ptr addrspace(1) %1, i64 1
+  %sgep2 = getelementptr half, ptr addrspace(1) %1, i64 2
+  %sgep3 = getelementptr half, ptr addrspace(1) %1, i64 3
+  %sgep4 = getelementptr half, ptr addrspace(1) %1, i64 4
+  %sgep5 = getelementptr half, ptr addrspace(1) %1, i64 5
+  store half %l0, ptr addrspace(1) %sgep0, align 1
+  store half %l1, ptr addrspace(1) %sgep1, align 1
+  store half %l2, ptr addrspace(1) %sgep2, align 1
+  store half %l3, ptr addrspace(1) %sgep3, align 1
+  store half %l4, ptr addrspace(1) %sgep4, align 1
+  store half %l5, ptr addrspace(1) %sgep5, align 1
+  ret void
+}
+
+; Function Attrs: mustprogress nounwind willreturn
+define amdgpu_kernel void @half4(ptr addrspace(1) nocapture readonly %0, ptr addrspace(1) nocapture writeonly %1) local_unnamed_addr #0 {
+; GFX908-LABEL: half4:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX908-NEXT:    v_mov_b32_e32 v2, 0
+; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX908-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX908-NEXT:    v_mov_b32_e32 v0, s0
+; GFX908-NEXT:    v_mov_b32_e32 v1, s1
+; GFX908-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX908-NEXT:    s_endpgm
+;
+; GFX90A-LABEL: half4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX90A-NEXT:    s_endpgm
+;
+; GFX1030-LABEL: half4:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1030-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1030-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1030-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1030-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1030-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX1030-NEXT:    s_endpgm
+  %gep0 = getelementptr half, ptr addrspace(1) %0, i64 0
+  %gep1 = getelementptr half, ptr addrspace(1) %0, i64 1
+  %gep2 = getelementptr half, ptr addrspace(1) %0, i64 2
+  %gep3 = getelementptr half, ptr addrspace(1) %0, i64 3
+  %l0 = load half, ptr addrspace(1) %gep0, align 4
+  %l1 = load half, ptr addrspace(1) %gep1, align 4
+  %l2 = load half, ptr addrspace(1) %gep2, align 4
+  %l3 = load half, ptr addrspace(1) %gep3, align 4
+  %sgep0 = getelementptr half, ptr addrspace(1) %1, i64 0
+  %sgep1 = getelementptr half, ptr addrspace(1) %1, i64 1
+  %sgep2 = getelementptr half, ptr addrspace(1) %1, i64 2
+  %sgep3 = getelementptr half, ptr addrspace(1) %1, i64 3
+  store half %l0, ptr addrspace(1) %sgep0, align 4
+  store half %l1, ptr addrspace(1) %sgep1, align 4
+  store half %l2, ptr addrspace(1) %sgep2, align 4
+  store half %l3, ptr addrspace(1) %sgep3, align 4
+  ret void
+}
+
+
+; Function Attrs: mustprogress nounwind willreturn
+define amdgpu_kernel void @half2(ptr addrspace(1) nocapture readonly %0, ptr addrspace(1) nocapture writeonly %1) local_unnamed_addr #0 {
+; GFX908-LABEL: half2:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX908-NEXT:    v_mov_b32_e32 v0, 0
+; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX908-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX908-NEXT:    s_endpgm
+;
+; GFX90A-LABEL: half2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NEXT:    s_endpgm
+;
+; GFX1030-LABEL: half2:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1030-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1030-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX1030-NEXT:    s_waitcnt vmcnt(0)
+; GFX1030-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX1030-NEXT:    s_endpgm
+  %gep0 = getelementptr half, ptr addrspace(1) %0, i64 0
+  %gep1 = getelementptr half, ptr addrspace(1) %0, i64 1
+  %l0 = load half, ptr addrspace(1) %gep0
+  %l1 = load half, ptr addrspace(1) %gep1
+  %sgep0 = getelementptr half, ptr addrspace(1) %1, i64 0
+  %sgep1 = getelementptr half, ptr addrspace(1) %1, i64 1
+  store half %l0, ptr addrspace(1) %sgep0
+  store half %l1, ptr addrspace(1) %sgep1
+  ret void
+}
+
+
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
index 971ae8e..adfe7c4 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
@@ -503,12 +503,12 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) #
 ; GCN-HSA:       ; %bb.0: ; %entry
 ; GCN-HSA-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 16
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
+; GCN-HSA-NEXT:    s_add_u32 s0, s0, 16
+; GCN-HSA-NEXT:    s_addc_u32 s1, s1, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
 ; GCN-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; GCN-HSA-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(1)
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
index 0c43324..227c2f5 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
@@ -674,7 +674,9 @@ define amdgpu_kernel void @global_load_v16i16_align2(ptr addrspace(1) %in, ptr a
 ; GCN-HSA:       ; %bb.0: ; %entry
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-HSA-NEXT:    s_add_u32 s4, s2, 16
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
+; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
 ; GCN-HSA-NEXT:    s_add_u32 s0, s0, 16
 ; GCN-HSA-NEXT:    s_addc_u32 s1, s1, 0
@@ -682,11 +684,9 @@ define amdgpu_kernel void @global_load_v16i16_align2(ptr addrspace(1) %in, ptr a
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; GCN-HSA-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
-; GCN-HSA-NEXT:    s_add_u32 s0, s2, 16
-; GCN-HSA-NEXT:    s_addc_u32 s1, s3, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s5
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s4
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s2
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[10:11], v[0:3]
diff --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll
index 92fc6ef..a5b1fa8 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv.ll
@@ -1571,21 +1571,15 @@ define amdgpu_kernel void @v_udiv_i16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_add_u32 s4, s2, 2
-; GCN-NEXT:    s_addc_u32 s5, s3, 0
-; GCN-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NEXT:    v_mov_b32_e32 v1, s5
-; GCN-NEXT:    flat_load_ushort v2, v[0:1]
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    v_mov_b32_e32 v1, s3
-; GCN-NEXT:    flat_load_ushort v0, v[0:1]
+; GCN-NEXT:    flat_load_dword v0, v[0:1]
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_cvt_f32_u32_e32 v2, v2
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cvt_f32_u32_e32 v3, v0
-; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v2
+; GCN-NEXT:    v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GCN-NEXT:    v_cvt_f32_u32_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v2
 ; GCN-NEXT:    v_mul_f32_e32 v4, v3, v4
 ; GCN-NEXT:    v_trunc_f32_e32 v4, v4
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v5, v4
@@ -1601,19 +1595,16 @@ define amdgpu_kernel void @v_udiv_i16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX1030-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1030-NEXT:    s_clause 0x1
-; GFX1030-NEXT:    global_load_ushort v1, v0, s[2:3] offset:2
-; GFX1030-NEXT:    global_load_ushort v2, v0, s[2:3]
-; GFX1030-NEXT:    s_waitcnt vmcnt(1)
-; GFX1030-NEXT:    v_cvt_f32_u32_e32 v1, v1
+; GFX1030-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX1030-NEXT:    s_waitcnt vmcnt(0)
-; GFX1030-NEXT:    v_cvt_f32_u32_e32 v2, v2
-; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v3, v1
-; GFX1030-NEXT:    v_mul_f32_e32 v3, v2, v3
+; GFX1030-NEXT:    v_cvt_f32_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX1030-NEXT:    v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v3, v2
+; GFX1030-NEXT:    v_mul_f32_e32 v3, v1, v3
 ; GFX1030-NEXT:    v_trunc_f32_e32 v3, v3
-; GFX1030-NEXT:    v_fma_f32 v2, -v3, v1, v2
+; GFX1030-NEXT:    v_fma_f32 v1, -v3, v2, v1
 ; GFX1030-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GFX1030-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v2|, v1
+; GFX1030-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v1|, v2
 ; GFX1030-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo
 ; GFX1030-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX1030-NEXT:    global_store_dword v0, v1, s[0:1]
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll
index 38cb6c9..3e93555 100644
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -mtriple=amdgcn-amd-amdhsa --mcpu=hawaii -passes=load-store-vectorizer -S -o - %s | FileCheck %s
 
 ; Copy of test/CodeGen/AMDGPU/merge-stores.ll with some additions
@@ -59,9 +58,7 @@ define amdgpu_kernel void @merge_global_store_2_constants_0_i16(ptr addrspace(1)
 define amdgpu_kernel void @merge_global_store_2_constants_i16_natural_align(ptr addrspace(1) %out) #0 {
 ; CHECK-LABEL: @merge_global_store_2_constants_i16_natural_align(
-; CHECK-NEXT:    [[OUT_GEP_1:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT:%.*]], i32 1
-; CHECK-NEXT:    store i16 123, ptr addrspace(1) [[OUT_GEP_1]], align 2
-; CHECK-NEXT:    store i16 456, ptr addrspace(1) [[OUT]], align 2
+; CHECK-NEXT:    store <2 x i16> <i16 456, i16 123>, ptr addrspace(1) [[OUT:%.*]], align 2
 ; CHECK-NEXT:    ret void
 ;
   %out.gep.1 = getelementptr i16, ptr addrspace(1) %out, i32 1
@@ -85,9 +82,7 @@ define amdgpu_kernel void @merge_global_store_2_constants_i16_align_1(ptr addrsp
 define amdgpu_kernel void @merge_global_store_2_constants_half_natural_align(ptr addrspace(1) %out) #0 {
 ; CHECK-LABEL: @merge_global_store_2_constants_half_natural_align(
-; CHECK-NEXT:    [[OUT_GEP_1:%.*]] = getelementptr half, ptr addrspace(1) [[OUT:%.*]], i32 1
-; CHECK-NEXT:    store half 0xH4000, ptr addrspace(1) [[OUT_GEP_1]], align 2
-; CHECK-NEXT:    store half 0xH3C00, ptr addrspace(1) [[OUT]], align 2
+; CHECK-NEXT:    store <2 x half> <half 0xH3C00, half 0xH4000>, ptr addrspace(1) [[OUT:%.*]], align 2
 ; CHECK-NEXT:    ret void
 ;
   %out.gep.1 = getelementptr half, ptr addrspace(1) %out, i32 1