From df77c9ada4c172def64f0a3514df6a34a223c409 Mon Sep 17 00:00:00 2001
From: Nicolai Haehnle
Date: Tue, 12 Apr 2016 21:18:10 +0000
Subject: [PATCH] AMDGPU: add llvm.amdgcn.buffer.load/store intrinsics

Summary:
They correspond to BUFFER_LOAD/STORE_DWORD[_X2,X3,X4] and mostly behave like
llvm.amdgcn.buffer.load/store.format. They will be used by Mesa for SSBO and
atomic counters at least when robust buffer access behavior is desired.
(These instructions perform no format conversion and do buffer range checking
per component.)

As a side effect of sharing patterns with llvm.amdgcn.buffer.store.format,
it has become trivial to add support for the f32 and v2f32 variants of that
intrinsic, so the patch does so.

Also DAG-ify (and fix) some tests that I noticed intermittent failures in
while developing this patch.

Some tests were (temporarily) adjusted for the required mayLoad/hasSideEffects
changes to the BUFFER_STORE_DWORD* instructions. See also
http://reviews.llvm.org/D18291.

Reviewers: arsenm, tstellarAMD, mareko

Subscribers: arsenm, llvm-commits

Differential Revision: http://reviews.llvm.org/D18292

llvm-svn: 266126
---
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td           |  10 +-
 llvm/lib/Target/AMDGPU/SIInstructions.td           | 128 +++++++++++----------
 llvm/test/CodeGen/AMDGPU/captured-frame-index.ll   |   6 +-
 llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll          |   4 +-
 llvm/test/CodeGen/AMDGPU/extractelt-to-trunc.ll    |   1 -
 .../test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll | 108 +++++++++++++++++
 .../AMDGPU/llvm.amdgcn.buffer.store.format.ll      |  18 +++
 .../CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll     |  95 +++++++++++++++
 .../CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll  |   8 +-
 llvm/test/CodeGen/AMDGPU/sminmax.ll                |  24 ++--
 10 files changed, 318 insertions(+), 84 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll

diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 47b62cc..24e8c04 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -224,7 +224,7 @@ def int_amdgcn_image_atomic_cmpswap : Intrinsic <
    llvm_i1_ty],       // slc(imm)
   []>;
 
-def int_amdgcn_buffer_load_format : Intrinsic <
+class AMDGPUBufferLoad : Intrinsic <
   [llvm_anyfloat_ty],
   [llvm_v4i32_ty,     // rsrc(SGPR)
    llvm_i32_ty,       // vindex(VGPR)
@@ -232,16 +232,20 @@ def int_amdgcn_buffer_load_format : Intrinsic <
    llvm_i1_ty,        // glc(imm)
    llvm_i1_ty],       // slc(imm)
   [IntrReadMem]>;
+def int_amdgcn_buffer_load_format : AMDGPUBufferLoad;
+def int_amdgcn_buffer_load : AMDGPUBufferLoad;
 
-def int_amdgcn_buffer_store_format : Intrinsic <
+class AMDGPUBufferStore : Intrinsic <
   [],
-  [llvm_anyfloat_ty,  // vdata(VGPR) -- can currently only select v4f32
+  [llvm_anyfloat_ty,  // vdata(VGPR) -- can currently only select f32, v2f32, v4f32
    llvm_v4i32_ty,     // rsrc(SGPR)
    llvm_i32_ty,       // vindex(VGPR)
    llvm_i32_ty,       // offset(SGPR/VGPR/imm)
    llvm_i1_ty,        // glc(imm)
    llvm_i1_ty],       // slc(imm)
   []>;
+def int_amdgcn_buffer_store_format : AMDGPUBufferStore;
+def int_amdgcn_buffer_store : AMDGPUBufferStore;
 
 class AMDGPUBufferAtomic : Intrinsic <
   [llvm_i32_ty],
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index f6244d4..3363fcc 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -996,6 +996,11 @@ defm BUFFER_STORE_SHORT : MUBUF_Store_Helper <
   mubuf<0x1a>, "buffer_store_short", VGPR_32, i32, truncstorei16_global
 >;
 
+// Without mayLoad and hasSideEffects, TableGen complains about the pattern
+// matching llvm.amdgcn.buffer.store. Eventually, we'll want a WriteOnly
+// property to express the effects of this intrinsic more precisely, see
+// http://reviews.llvm.org/D18291
+let mayLoad = 1, hasSideEffects = 1 in {
 defm BUFFER_STORE_DWORD : MUBUF_Store_Helper <
   mubuf<0x1c>, "buffer_store_dword", VGPR_32, i32, global_store
 >;
@@ -1007,6 +1012,7 @@ defm BUFFER_STORE_DWORDX2 : MUBUF_Store_Helper <
 defm BUFFER_STORE_DWORDX4 : MUBUF_Store_Helper <
   mubuf<0x1e, 0x1f>, "buffer_store_dwordx4", VReg_128, v4i32, global_store
 >;
+}
 
 defm BUFFER_ATOMIC_SWAP : MUBUF_Atomic <
   mubuf<0x30, 0x40>, "buffer_atomic_swap", VGPR_32, i32, atomic_swap_global
@@ -2140,41 +2146,36 @@ def : Pat <
 // buffer_load/store_format patterns
 //===----------------------------------------------------------------------===//
 
-multiclass MUBUF_LoadIntrinsicPat<ValueType vt, string opcode> {
+multiclass MUBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
+                                  string opcode> {
   def : Pat<
-    (vt (int_amdgcn_buffer_load_format v4i32:$rsrc, 0,
-                                       (MUBUFIntrinsicOffset i32:$soffset,
-                                                             i16:$offset),
-                                       imm:$glc, imm:$slc)),
+    (vt (name v4i32:$rsrc, 0,
+              (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
+              imm:$glc, imm:$slc)),
     (!cast<MUBUF>(opcode # "_OFFSET") $rsrc, $soffset, (as_i16imm $offset),
                                       (as_i1imm $glc), (as_i1imm $slc), 0)
   >;
 
   def : Pat<
-    (vt (int_amdgcn_buffer_load_format v4i32:$rsrc, i32:$vindex,
-                                       (MUBUFIntrinsicOffset i32:$soffset,
-                                                             i16:$offset),
-                                       imm:$glc, imm:$slc)),
+    (vt (name v4i32:$rsrc, i32:$vindex,
+              (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
+              imm:$glc, imm:$slc)),
     (!cast<MUBUF>(opcode # "_IDXEN") $vindex, $rsrc, $soffset, (as_i16imm $offset),
                                      (as_i1imm $glc), (as_i1imm $slc), 0)
   >;
 
   def : Pat<
-    (vt (int_amdgcn_buffer_load_format v4i32:$rsrc, 0,
-                                       (MUBUFIntrinsicVOffset i32:$soffset,
-                                                              i16:$offset,
-                                                              i32:$voffset),
-                                       imm:$glc, imm:$slc)),
+    (vt (name v4i32:$rsrc, 0,
+              (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
+              imm:$glc, imm:$slc)),
    (!cast<MUBUF>(opcode # "_OFFEN") $voffset, $rsrc, $soffset, (as_i16imm $offset),
                                     (as_i1imm $glc), (as_i1imm $slc), 0)
   >;
 
   def : Pat<
-    (vt (int_amdgcn_buffer_load_format v4i32:$rsrc, i32:$vindex,
-                                       (MUBUFIntrinsicVOffset i32:$soffset,
-                                                              i16:$offset,
-                                                              i32:$voffset),
-                                       imm:$glc, imm:$slc)),
+    (vt (name v4i32:$rsrc, i32:$vindex,
+              (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
+              imm:$glc, imm:$slc)),
     (!cast<MUBUF>(opcode # "_BOTHEN")
       (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
       $rsrc, $soffset, (as_i16imm $offset),
@@ -2182,50 +2183,59 @@ multiclass MUBUF_LoadIntrinsicPat<ValueType vt, string opcode> {
 >;
 }
 
-defm : MUBUF_LoadIntrinsicPat<f32, "BUFFER_LOAD_FORMAT_X">;
-defm : MUBUF_LoadIntrinsicPat<v2f32, "BUFFER_LOAD_FORMAT_XY">;
-defm : MUBUF_LoadIntrinsicPat<v4f32, "BUFFER_LOAD_FORMAT_XYZW">;
+defm : MUBUF_LoadIntrinsicPat<int_amdgcn_buffer_load_format, f32, "BUFFER_LOAD_FORMAT_X">;
+defm : MUBUF_LoadIntrinsicPat<int_amdgcn_buffer_load_format, v2f32, "BUFFER_LOAD_FORMAT_XY">;
+defm : MUBUF_LoadIntrinsicPat<int_amdgcn_buffer_load_format, v4f32, "BUFFER_LOAD_FORMAT_XYZW">;
+defm : MUBUF_LoadIntrinsicPat<int_amdgcn_buffer_load, f32, "BUFFER_LOAD_DWORD">;
+defm : MUBUF_LoadIntrinsicPat<int_amdgcn_buffer_load, v2f32, "BUFFER_LOAD_DWORDX2">;
+defm : MUBUF_LoadIntrinsicPat<int_amdgcn_buffer_load, v4f32, "BUFFER_LOAD_DWORDX4">;
 
-def : Pat<
-  (int_amdgcn_buffer_store_format v4f32:$vdata, v4i32:$rsrc, 0,
-                                  (MUBUFIntrinsicOffset i32:$soffset,
-                                                        i16:$offset),
-                                  imm:$glc, imm:$slc),
-  (BUFFER_STORE_FORMAT_XYZW_OFFSET $vdata, $rsrc, $soffset, (as_i16imm $offset),
-                                   (as_i1imm $glc), (as_i1imm $slc), 0)
->;
+multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
+                                   string opcode> {
+  def : Pat<
+    (name vt:$vdata, v4i32:$rsrc, 0,
+          (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
+          imm:$glc, imm:$slc),
+    (!cast<MUBUF>(opcode # "_OFFSET") $vdata, $rsrc, $soffset, (as_i16imm $offset),
+                                      (as_i1imm $glc), (as_i1imm $slc), 0)
+  >;
 
-def : Pat<
-  (int_amdgcn_buffer_store_format v4f32:$vdata, v4i32:$rsrc, i32:$vindex,
-                                  (MUBUFIntrinsicOffset i32:$soffset,
-                                                        i16:$offset),
-                                  imm:$glc, imm:$slc),
-  (BUFFER_STORE_FORMAT_XYZW_IDXEN $vdata, $vindex, $rsrc, $soffset,
-                                  (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc), 0)
->;
+  def : Pat<
+    (name vt:$vdata, v4i32:$rsrc, i32:$vindex,
+          (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
+          imm:$glc, imm:$slc),
+    (!cast<MUBUF>(opcode # "_IDXEN") $vdata, $vindex, $rsrc, $soffset,
+                                     (as_i16imm $offset), (as_i1imm $glc),
+                                     (as_i1imm $slc), 0)
+  >;
 
-def : Pat<
-  (int_amdgcn_buffer_store_format v4f32:$vdata, v4i32:$rsrc, 0,
-                                  (MUBUFIntrinsicVOffset i32:$soffset,
-                                                         i16:$offset,
-                                                         i32:$voffset),
-                                  imm:$glc, imm:$slc),
-  (BUFFER_STORE_FORMAT_XYZW_OFFEN $vdata, $voffset, $rsrc, $soffset,
-                                  (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc), 0)
->;
+  def : Pat<
+    (name vt:$vdata, v4i32:$rsrc, 0,
+          (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
+          imm:$glc, imm:$slc),
+    (!cast<MUBUF>(opcode # "_OFFEN") $vdata, $voffset, $rsrc, $soffset,
+                                     (as_i16imm $offset), (as_i1imm $glc),
+                                     (as_i1imm $slc), 0)
+  >;
 
-def : Pat<
-  (int_amdgcn_buffer_store_format v4f32:$vdata, v4i32:$rsrc, i32:$vindex,
-                                  (MUBUFIntrinsicVOffset i32:$soffset,
-                                                         i16:$offset,
-                                                         i32:$voffset),
-                                  imm:$glc, imm:$slc),
-  (BUFFER_STORE_FORMAT_XYZW_BOTHEN
-    $vdata,
-    (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
-    $rsrc, $soffset, (as_i16imm $offset),
-    (as_i1imm $glc), (as_i1imm $slc), 0)
->;
+  def : Pat<
+    (name vt:$vdata, v4i32:$rsrc, i32:$vindex,
+          (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
+          imm:$glc, imm:$slc),
+    (!cast<MUBUF>(opcode # "_BOTHEN")
+      $vdata,
+      (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
+      $rsrc, $soffset, (as_i16imm $offset),
+      (as_i1imm $glc), (as_i1imm $slc), 0)
+  >;
+}
+
+defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store_format, f32, "BUFFER_STORE_FORMAT_X">;
+defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store_format, v2f32, "BUFFER_STORE_FORMAT_XY">;
+defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store_format, v4f32, "BUFFER_STORE_FORMAT_XYZW">;
+defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store, f32, "BUFFER_STORE_DWORD">;
+defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store, v2f32, "BUFFER_STORE_DWORDX2">;
+defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store, v4f32, "BUFFER_STORE_DWORDX4">;
 //===----------------------------------------------------------------------===//
 // buffer_atomic patterns
 //===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll b/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll
index 7e2b98e..21c8af4 100644
--- a/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll
+++ b/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll
@@ -1,9 +1,9 @@
 ; RUN: llc -march=amdgcn -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
 ; GCN-LABEL: {{^}}stored_fi_to_lds:
-; GCN: s_load_dword [[LDSPTR:s[0-9]+]]
-; GCN: v_mov_b32_e32 [[ZERO1:v[0-9]+]], 0{{$}}
-; GCN: buffer_store_dword v{{[0-9]+}}, [[ZERO1]]
+; GCN-DAG: s_load_dword [[LDSPTR:s[0-9]+]]
+; GCN-DAG: v_mov_b32_e32 [[ZERO1:v[0-9]+]], 0{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ZERO1]]
 ; GCN: v_mov_b32_e32 [[ZERO0:v[0-9]+]], 0{{$}}
 ; GCN: v_mov_b32_e32 [[VLDSPTR:v[0-9]+]], [[LDSPTR]]
 ; GCN: ds_write_b32 [[VLDSPTR]], [[ZERO0]]
diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
index 85dae75..6779886 100644
--- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -33,8 +33,8 @@ define void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8>
 ; SI-NOT: bfe
 ; SI-NOT: v_cvt_f32_ubyte3_e32
 ; SI-DAG: v_cvt_f32_ubyte2_e32
-; SI-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[LOADREG]]
-; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG]]
+; SI-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]],
+; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]],
 ; SI: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
 define void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind {
   %load = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/extractelt-to-trunc.ll b/llvm/test/CodeGen/AMDGPU/extractelt-to-trunc.ll
index 992ab5b..3c5b59c 100644
--- a/llvm/test/CodeGen/AMDGPU/extractelt-to-trunc.ll
+++ b/llvm/test/CodeGen/AMDGPU/extractelt-to-trunc.ll
@@ -32,7 +32,6 @@ define void @bitcast_fp_to_vector_extract_0(i32 addrspace(1)* %out, double addrs
 ; GCN-LABEL: {{^}}bitcast_int_to_fpvector_extract_0:
 ; GCN: buffer_load_dwordx2
 ; GCN: v_add_i32
-; GCN: v_addc_u32
 ; GCN: buffer_store_dword
 define void @bitcast_int_to_fpvector_extract_0(float addrspace(1)* %out, i64 addrspace(1)* %in, i64 %b) {
   %a = load i64, i64 addrspace(1)* %in
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll
new file mode 100644
index 0000000..3d5f69f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll
@@ -0,0 +1,108 @@
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+
+;CHECK-LABEL: {{^}}buffer_load:
+;CHECK: buffer_load_dwordx4 v[0:3], s[0:3], 0
+;CHECK: buffer_load_dwordx4 v[4:7], s[0:3], 0 glc
+;CHECK: buffer_load_dwordx4 v[8:11], s[0:3], 0 slc
+;CHECK: s_waitcnt
+define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32> inreg) {
+main_body:
+  %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i1 0, i1 0)
+  %data_glc = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i1 1, i1 0)
+  %data_slc = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i1 0, i1 1)
+  %r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %data, 0
+  %r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %data_glc, 1
+  %r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %data_slc, 2
+  ret {<4 x float>, <4 x float>, <4 x float>} %r2
+}
+
+;CHECK-LABEL: {{^}}buffer_load_immoffs:
+;CHECK: buffer_load_dwordx4 v[0:3], s[0:3], 0 offset:42
+;CHECK: s_waitcnt
+define amdgpu_ps <4 x float> @buffer_load_immoffs(<4 x i32> inreg) {
+main_body:
+  %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 42, i1 0, i1 0)
+  ret <4 x float> %data
+}
+
+;CHECK-LABEL: {{^}}buffer_load_immoffs_large:
+;CHECK: s_movk_i32 [[OFFSET:s[0-9]+]], 0x1fff
+;CHECK: buffer_load_dwordx4 v[0:3], s[0:3], [[OFFSET]] offset:1
+;CHECK: s_waitcnt
+define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) {
+main_body:
+  %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 8192, i1 0, i1 0)
+  ret <4 x float> %data
+}
+
+;CHECK-LABEL: {{^}}buffer_load_idx:
+;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 idxen
+;CHECK: s_waitcnt
+define amdgpu_ps <4 x float> @buffer_load_idx(<4 x i32> inreg, i32) {
+main_body:
+  %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 %1, i32 0, i1 0, i1 0)
+  ret <4 x float> %data
+}
+
+;CHECK-LABEL: {{^}}buffer_load_ofs:
+;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen
+;CHECK: s_waitcnt
+define amdgpu_ps <4 x float> @buffer_load_ofs(<4 x i32> inreg, i32) {
+main_body:
+  %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 %1, i1 0, i1 0)
+  ret <4 x float> %data
+}
+
+;CHECK-LABEL: {{^}}buffer_load_ofs_imm:
+;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen offset:58
+;CHECK: s_waitcnt
+define amdgpu_ps <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) {
+main_body:
+  %ofs = add i32 %1, 58
+  %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 %ofs, i1 0, i1 0)
+  ret <4 x float> %data
+}
+
+;CHECK-LABEL: {{^}}buffer_load_both:
+;CHECK: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen
+;CHECK: s_waitcnt
+define amdgpu_ps <4 x float> @buffer_load_both(<4 x i32> inreg, i32, i32) {
+main_body:
+  %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 %1, i32 %2, i1 0, i1 0)
+  ret <4 x float> %data
+}
+
+;CHECK-LABEL: {{^}}buffer_load_both_reversed:
+;CHECK: v_mov_b32_e32 v2, v0
+;CHECK: buffer_load_dwordx4 v[0:3], v[1:2], s[0:3], 0 idxen offen
+;CHECK: s_waitcnt
+define amdgpu_ps <4 x float> @buffer_load_both_reversed(<4 x i32> inreg, i32, i32) {
+main_body:
+  %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 %2, i32 %1, i1 0, i1 0)
+  ret <4 x float> %data
+}
+
+;CHECK-LABEL: {{^}}buffer_load_x1:
+;CHECK: buffer_load_dword v0, v[0:1], s[0:3], 0 idxen offen
+;CHECK: s_waitcnt
+define amdgpu_ps float @buffer_load_x1(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) {
+main_body:
+  %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 0, i1 0)
+  ret float %data
+}
+
+;CHECK-LABEL: {{^}}buffer_load_x2:
+;CHECK: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 idxen offen
+;CHECK: s_waitcnt
+define amdgpu_ps <2 x float> @buffer_load_x2(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) {
+main_body:
+  %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 0, i1 0)
+  ret <2 x float> %data
+}
+
+declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #0
+declare <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32>, i32, i32, i1, i1) #0
+declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #0
+
+attributes #0 = { nounwind readonly }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll
index 0af9a3e..63dcd84 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll
@@ -70,6 +70,24 @@ main_body:
   ret void
 }
 
+;CHECK-LABEL: {{^}}buffer_store_x1:
+;CHECK: buffer_store_format_x v0, v1, s[0:3], 0 idxen
+define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %index) {
+main_body:
+  call void @llvm.amdgcn.buffer.store.format.f32(float %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_store_x2:
+;CHECK: buffer_store_format_xy v[0:1], v2, s[0:3], 0 idxen
+define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %index) {
+main_body:
+  call void @llvm.amdgcn.buffer.store.format.v2f32(<2 x float> %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0)
+  ret void
+}
+
+declare void @llvm.amdgcn.buffer.store.format.f32(float, <4 x i32>, i32, i32, i1, i1) #0
+declare void @llvm.amdgcn.buffer.store.format.v2f32(<2 x float>, <4 x i32>, i32, i32, i1, i1) #0
 declare void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #0
 declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i1, i1) #1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll
new file mode 100644
index 0000000..c935f9b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll
@@ -0,0 +1,95 @@
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+
+;CHECK-LABEL: {{^}}buffer_store:
+;CHECK: buffer_store_dwordx4 v[0:3], s[0:3], 0
+;CHECK: buffer_store_dwordx4 v[4:7], s[0:3], 0 glc
+;CHECK: buffer_store_dwordx4 v[8:11], s[0:3], 0 slc
+define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) {
+main_body:
+  call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 0, i1 0, i1 0)
+  call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %2, <4 x i32> %0, i32 0, i32 0, i1 1, i1 0)
+  call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %3, <4 x i32> %0, i32 0, i32 0, i1 0, i1 1)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_store_immoffs:
+;CHECK: buffer_store_dwordx4 v[0:3], s[0:3], 0 offset:42
+define amdgpu_ps void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) {
+main_body:
+  call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 42, i1 0, i1 0)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_store_idx:
+;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen
+define amdgpu_ps void @buffer_store_idx(<4 x i32> inreg, <4 x float>, i32) {
+main_body:
+  call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i1 0, i1 0)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_store_ofs:
+;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen
+define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) {
+main_body:
+  call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 %2, i1 0, i1 0)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_store_both:
+;CHECK: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 idxen offen
+define amdgpu_ps void @buffer_store_both(<4 x i32> inreg, <4 x float>, i32, i32) {
+main_body:
+  call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 %3, i1 0, i1 0)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_store_both_reversed:
+;CHECK: v_mov_b32_e32 v6, v4
+;CHECK: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 idxen offen
+define amdgpu_ps void @buffer_store_both_reversed(<4 x i32> inreg, <4 x float>, i32, i32) {
+main_body:
+  call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %3, i32 %2, i1 0, i1 0)
+  ret void
+}
+
+; Ideally, the register allocator would avoid the wait here
+;
+;CHECK-LABEL: {{^}}buffer_store_wait:
+;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen
+;CHECK: s_waitcnt vmcnt(0) expcnt(0)
+;CHECK: buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 idxen
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: buffer_store_dwordx4 v[0:3], v6, s[0:3], 0 idxen
+define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float>, i32, i32, i32) {
+main_body:
+  call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i1 0, i1 0)
+  %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 %3, i32 0, i1 0, i1 0)
+  call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %data, <4 x i32> %0, i32 %4, i32 0, i1 0, i1 0)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_store_x1:
+;CHECK: buffer_store_dword v0, v1, s[0:3], 0 idxen
+define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %index) {
+main_body:
+  call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_store_x2:
+;CHECK: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 idxen
+define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %index) #0 {
+main_body:
+  call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0)
+  ret void
+}
+
+declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #0
+declare void @llvm.amdgcn.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i1, i1) #0
+declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #0
+declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readonly }
diff --git a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
index 34b85cd..6e2bd96 100644
--- a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
@@ -11,8 +11,8 @@ declare void @llvm.amdgcn.s.barrier() #1
 
 ; FUNC-LABEL: @reorder_local_load_global_store_local_load
 ; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4
+; CI-NEXT: buffer_store_dword
 ; CI-NEXT: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:8
-; CI: buffer_store_dword
 
 define void @reorder_local_load_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
   %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4
@@ -71,9 +71,9 @@ define void @no_reorder_barrier_local_load_global_store_local_load(i32 addrspace
 }
 
 ; FUNC-LABEL: @reorder_constant_load_global_store_constant_load
-; CI: buffer_store_dword
 ; CI: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}}
 ; CI: v_readfirstlane_b32 s[[PTR_HI:[0-9]+]], v{{[0-9]+}}
+; CI-DAG: buffer_store_dword
 ; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x1
 ; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x2
 ; CI: buffer_store_dword
@@ -184,11 +184,11 @@ define void @reorder_local_offsets(i32 addrspace(1)* nocapture %out, i32 addrspa
 }
 
 ; FUNC-LABEL: @reorder_global_offsets
+; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12
 ; CI: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400
 ; CI: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:404
-; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12
-; CI: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12
 ; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400
+; CI: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12
 ; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:404
 ; CI: buffer_store_dword
 ; CI: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/sminmax.ll b/llvm/test/CodeGen/AMDGPU/sminmax.ll
index e646605..bea3ac1 100644
--- a/llvm/test/CodeGen/AMDGPU/sminmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/sminmax.ll
@@ -46,11 +46,11 @@ define void @s_abs_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %val) nounwind
 }
 
 ; FUNC-LABEL: {{^}}v_abs_v2i32:
-; GCN: v_sub_i32_e32 [[NEG0:v[0-9]+]], vcc, 0, [[SRC0:v[0-9]+]]
-; GCN: v_sub_i32_e32 [[NEG1:v[0-9]+]], vcc, 0, [[SRC1:v[0-9]+]]
+; GCN-DAG: v_sub_i32_e32 [[NEG0:v[0-9]+]], vcc, 0, [[SRC0:v[0-9]+]]
+; GCN-DAG: v_sub_i32_e32 [[NEG1:v[0-9]+]], vcc, 0, [[SRC1:v[0-9]+]]
 
-; GCN: v_max_i32_e32 {{v[0-9]+}}, [[NEG0]], [[SRC0]]
-; GCN: v_max_i32_e32 {{v[0-9]+}}, [[NEG1]], [[SRC1]]
+; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG0]], [[SRC0]]
+; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG1]], [[SRC1]]
 
 ; GCN: v_add_i32
 ; GCN: v_add_i32
@@ -97,15 +97,15 @@ define void @s_abs_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %val) nounwind
 }
 
 ; FUNC-LABEL: {{^}}v_abs_v4i32:
-; GCN: v_sub_i32_e32 [[NEG0:v[0-9]+]], vcc, 0, [[SRC0:v[0-9]+]]
-; GCN: v_sub_i32_e32 [[NEG1:v[0-9]+]], vcc, 0, [[SRC1:v[0-9]+]]
-; GCN: v_sub_i32_e32 [[NEG2:v[0-9]+]], vcc, 0, [[SRC2:v[0-9]+]]
-; GCN: v_sub_i32_e32 [[NEG3:v[0-9]+]], vcc, 0, [[SRC3:v[0-9]+]]
+; GCN-DAG: v_sub_i32_e32 [[NEG0:v[0-9]+]], vcc, 0, [[SRC0:v[0-9]+]]
+; GCN-DAG: v_sub_i32_e32 [[NEG1:v[0-9]+]], vcc, 0, [[SRC1:v[0-9]+]]
+; GCN-DAG: v_sub_i32_e32 [[NEG2:v[0-9]+]], vcc, 0, [[SRC2:v[0-9]+]]
+; GCN-DAG: v_sub_i32_e32 [[NEG3:v[0-9]+]], vcc, 0, [[SRC3:v[0-9]+]]
 
-; GCN: v_max_i32_e32 {{v[0-9]+}}, [[NEG0]], [[SRC0]]
-; GCN: v_max_i32_e32 {{v[0-9]+}}, [[NEG1]], [[SRC1]]
-; GCN: v_max_i32_e32 {{v[0-9]+}}, [[NEG2]], [[SRC2]]
-; GCN: v_max_i32_e32 {{v[0-9]+}}, [[NEG3]], [[SRC3]]
+; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG0]], [[SRC0]]
+; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG1]], [[SRC1]]
+; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG2]], [[SRC2]]
+; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG3]], [[SRC3]]
 
 ; GCN: v_add_i32
 ; GCN: v_add_i32
-- 
2.7.4
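
For reference, a minimal standalone IR sketch of how a frontend such as Mesa
might call the new intrinsics. This is not part of the patch: the function
@copy_dword and its offset arguments are hypothetical, but the intrinsic
declarations are exactly the ones added by the tests above. With vindex = 0
and a VGPR offset, these calls select the "offen" addressing forms exercised
in buffer_load_ofs/buffer_store_ofs.

; Hypothetical example: load one dword from a raw buffer and store it back
; at a different offset. glc/slc are both i1 0 for plain cached access.
define amdgpu_ps void @copy_dword(<4 x i32> inreg %rsrc, i32 %src_ofs, i32 %dst_ofs) {
main_body:
  ; operand order: rsrc, vindex, offset, glc, slc
  %v = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %src_ofs, i1 0, i1 0)
  call void @llvm.amdgcn.buffer.store.f32(float %v, <4 x i32> %rsrc, i32 0, i32 %dst_ofs, i1 0, i1 0)
  ret void
}

declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #0
declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #1

attributes #0 = { nounwind readonly }
attributes #1 = { nounwind }

Unlike the .format variants, these instructions perform no format conversion,
and per the summary they range-check each component against the resource
descriptor, which is what makes them usable for robust buffer access.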