From 2f5a73820c1ef6fbc687608deee56fa9c8c711b7 Mon Sep 17 00:00:00 2001 From: Nicolai Haehnle Date: Wed, 4 Apr 2018 10:58:54 +0000 Subject: [PATCH] AMDGPU: Dimension-aware image intrinsics Summary: These new image intrinsics contain the texture type as part of their name and have each component of the address/coordinate as individual parameters. This is a preparatory step for implementing the A16 feature, where coordinates are passed as half-floats or -ints, but the Z compare value and texel offsets are still full dwords, making it difficult or impossible to distinguish between A16 on or off in the old-style intrinsics. Additionally, these intrinsics pass the 'texfailpolicy' and 'cachectrl' as i32 bit fields to reduce operand clutter and allow for future extensibility. v2: - gather4 supports 2darray images - fix a bug with 1D images on SI Change-Id: I099f309e0a394082a5901ea196c3967afb867f04 Reviewers: arsenm, rampitec, b-sumner Subscribers: kzhuravl, wdng, yaxunl, dstuttard, tpr, llvm-commits, t-tye Differential Revision: https://reviews.llvm.org/D44939 llvm-svn: 329166 --- llvm/include/llvm/IR/Intrinsics.td | 14 +- llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 378 +++++++++++++++++ llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp | 3 + llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h | 6 + llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td | 23 +- llvm/lib/Target/AMDGPU/MIMGInstructions.td | 195 +++++++++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 41 +- llvm/lib/Target/AMDGPU/SIInstrInfo.td | 6 + .../AMDGPU/llvm.amdgcn.image.atomic.ll | 10 + .../CodeGen/AMDGPU/llvm.amdgcn.image.atomic.dim.ll | 216 ++++++++++ .../CodeGen/AMDGPU/llvm.amdgcn.image.d16.dim.ll | 115 ++++++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll | 420 +++++++++++++++++++ .../AMDGPU/llvm.amdgcn.image.gather4.d16.dim.ll | 19 + .../AMDGPU/llvm.amdgcn.image.gather4.dim.ll | 160 +++++++ .../AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll | 39 ++ .../CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll | 459 
+++++++++++++++++++++ 16 files changed, 2096 insertions(+), 8 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.dim.ll create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.d16.dim.ll create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.d16.dim.ll create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.dim.ll create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index 4361c25..2d6dd2e 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -117,6 +117,7 @@ def IntrHasSideEffects : IntrinsicProperty; class LLVMType { ValueType VT = vt; + int isAny = 0; } class LLVMQualPointerType @@ -131,6 +132,8 @@ class LLVMPointerType class LLVMAnyPointerType : LLVMType{ LLVMType ElTy = elty; + + let isAny = 1; } // Match the type of another intrinsic parameter. Number is an index into the @@ -163,10 +166,12 @@ class LLVMVectorOfAnyPointersToElt : LLVMMatchType; class LLVMHalfElementsVectorType : LLVMMatchType; def llvm_void_ty : LLVMType; -def llvm_any_ty : LLVMType; -def llvm_anyint_ty : LLVMType; -def llvm_anyfloat_ty : LLVMType; -def llvm_anyvector_ty : LLVMType; +let isAny = 1 in { + def llvm_any_ty : LLVMType; + def llvm_anyint_ty : LLVMType; + def llvm_anyfloat_ty : LLVMType; + def llvm_anyvector_ty : LLVMType; +} def llvm_i1_ty : LLVMType; def llvm_i8_ty : LLVMType; def llvm_i16_ty : LLVMType; @@ -249,7 +254,6 @@ def llvm_v8f64_ty : LLVMType; // 8 x double def llvm_vararg_ty : LLVMType; // this means vararg here - //===----------------------------------------------------------------------===// // Intrinsic Definitions. 
//===----------------------------------------------------------------------===// diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 6ce3c86..6cf5c7f6 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -506,6 +506,384 @@ def int_amdgcn_image_atomic_cmpswap : Intrinsic < } // defset AMDGPUImageIntrinsics +} // TargetPrefix = "amdgcn" + +// New-style image intrinsics + +////////////////////////////////////////////////////////////////////////// +// Dimension-aware image intrinsics framework +////////////////////////////////////////////////////////////////////////// + +// Helper class to represent (type, name) combinations of arguments. The +// argument names are explanatory and used as DAG operand names for codegen +// pattern matching. +class AMDGPUArg { + LLVMType Type = ty; + string Name = name; +} + +// Return [AMDGPUArg, AMDGPUArg, names[1]>, ...] +class makeArgList names, LLVMType basety> { + list ret = + !listconcat([AMDGPUArg], + !foreach(name, !tail(names), AMDGPUArg, name>)); +} + +// Return arglist, with LLVMMatchType's references shifted by 'shift'. +class arglistmatchshift arglist, int shift> { + list ret = + !foreach(arg, arglist, + !if(!isa(arg.Type), + AMDGPUArg(arg.Type).Number, shift)>, + arg.Name>, + arg)); +} + +// Return the concatenation of the given arglists. LLVMMatchType's are adjusted +// accordingly, and shifted by an additional 'shift'. +class arglistconcat> arglists, int shift = 0> { + list ret = + !foldl([], arglists, lhs, rhs, + !listconcat( + lhs, + arglistmatchshift.ret)); +} + +// Represent texture/image types / dimensionality. +class AMDGPUDimProps coord_names, list slice_names> { + string Name = name; // e.g. 
"2darraymsaa" + bit DA = 0; // DA bit in MIMG encoding + + list CoordSliceArgs = + makeArgList.ret; + list CoordSliceIntArgs = + makeArgList.ret; + list GradientArgs = + makeArgList.ret; +} + +def AMDGPUDim1D : AMDGPUDimProps<"1d", ["s"], []>; +def AMDGPUDim2D : AMDGPUDimProps<"2d", ["s", "t"], []>; +def AMDGPUDim3D : AMDGPUDimProps<"3d", ["s", "t", "r"], []>; +let DA = 1 in { + def AMDGPUDimCube : AMDGPUDimProps<"cube", ["s", "t"], ["face"]>; + def AMDGPUDim1DArray : AMDGPUDimProps<"1darray", ["s"], ["slice"]>; + def AMDGPUDim2DArray : AMDGPUDimProps<"2darray", ["s", "t"], ["slice"]>; +} +def AMDGPUDim2DMsaa : AMDGPUDimProps<"2dmsaa", ["s", "t"], ["fragid"]>; +let DA = 1 in { + def AMDGPUDim2DArrayMsaa : AMDGPUDimProps<"2darraymsaa", ["s", "t"], ["slice", "fragid"]>; +} + +def AMDGPUDims { + list NoMsaa = [AMDGPUDim1D, AMDGPUDim2D, AMDGPUDim3D, + AMDGPUDimCube, AMDGPUDim1DArray, + AMDGPUDim2DArray]; + list Msaa = [AMDGPUDim2DMsaa, AMDGPUDim2DArrayMsaa]; + list All = !listconcat(NoMsaa, Msaa); +} + +// Represent sample variants, i.e. _C, _O, _B, ... and combinations thereof. +class AMDGPUSampleVariant extra_addr> { + string UpperCaseMod = ucmod; + string LowerCaseMod = lcmod; + + // {offset} {bias} {z-compare} + list ExtraAddrArgs = extra_addr; + bit Gradients = 0; + + // Name of the {lod} or {clamp} argument that is appended to the coordinates, + // if any. 
+ string LodOrClamp = ""; +} + +// AMDGPUSampleVariants: all variants supported by IMAGE_SAMPLE +// AMDGPUSampleVariantsNoGradients: variants supported by IMAGE_GATHER4 +defset list AMDGPUSampleVariants = { + multiclass AMDGPUSampleHelper_Offset extra_addr> { + def NAME#lcmod : AMDGPUSampleVariant; + def NAME#lcmod#_o : AMDGPUSampleVariant< + ucmod#"_O", lcmod#"_o", !listconcat([AMDGPUArg], extra_addr)>; + } + + multiclass AMDGPUSampleHelper_Compare extra_addr> { + defm NAME : AMDGPUSampleHelper_Offset; + defm NAME : AMDGPUSampleHelper_Offset< + "_C"#ucmod, "_c"#lcmod, !listconcat(extra_addr, [AMDGPUArg])>; + } + + multiclass AMDGPUSampleHelper_Clamp extra_addr> { + defm NAME : AMDGPUSampleHelper_Compare; + let LodOrClamp = "clamp" in + defm NAME : AMDGPUSampleHelper_Compare; + } + + defset list AMDGPUSampleVariantsNoGradients = { + defm AMDGPUSample : AMDGPUSampleHelper_Clamp<"", "", []>; + defm AMDGPUSample : AMDGPUSampleHelper_Clamp< + "_B", "_b", [AMDGPUArg]>; + let LodOrClamp = "lod" in + defm AMDGPUSample : AMDGPUSampleHelper_Compare<"_L", "_l", []>; + defm AMDGPUSample : AMDGPUSampleHelper_Compare<"_LZ", "_lz", []>; + } + + let Gradients = 1 in { + defm AMDGPUSample : AMDGPUSampleHelper_Clamp<"_D", "_d", []>; + defm AMDGPUSample : AMDGPUSampleHelper_Clamp<"_CD", "_cd", []>; + } +} + +// Helper class to capture the profile of a dimension-aware image intrinsic. +// This information is used to generate the intrinsic's type and to inform +// codegen pattern matching. 
+class AMDGPUDimProfile { + AMDGPUDimProps Dim = dim; + string OpMod = opmod; // the corresponding instruction is named IMAGE_OpMod + + // These are intended to be overwritten by subclasses + bit IsSample = 0; + bit IsAtomic = 0; + list RetTypes = []; + list DataArgs = []; + list ExtraAddrArgs = []; + bit Gradients = 0; + string LodClampMip = ""; + + int NumRetAndDataAnyTypes = + !foldl(0, !listconcat(RetTypes, !foreach(arg, DataArgs, arg.Type)), a, b, + !add(a, b.isAny)); + + list AddrArgs = + arglistconcat<[ExtraAddrArgs, + !if(Gradients, dim.GradientArgs, []), + !listconcat(!if(IsSample, dim.CoordSliceArgs, dim.CoordSliceIntArgs), + !if(!eq(LodClampMip, ""), + [], + [AMDGPUArg, LodClampMip>]))], + NumRetAndDataAnyTypes>.ret; + list AddrTypes = !foreach(arg, AddrArgs, arg.Type); + list AddrDefaultArgs = + !foreach(arg, AddrArgs, + AMDGPUArg(arg.Type)), + !if(IsSample, llvm_float_ty, llvm_i32_ty), arg.Type), + arg.Name>); + list AddrA16Args = + !foreach(arg, AddrArgs, + AMDGPUArg(arg.Type)), + !if(IsSample, llvm_half_ty, llvm_i16_ty), arg.Type), + arg.Name>); +} + +class AMDGPUDimProfileCopy : AMDGPUDimProfile { + let IsSample = base.IsSample; + let IsAtomic = base.IsAtomic; + let RetTypes = base.RetTypes; + let DataArgs = base.DataArgs; + let ExtraAddrArgs = base.ExtraAddrArgs; + let Gradients = base.Gradients; + let LodClampMip = base.LodClampMip; +} + +class AMDGPUDimSampleProfile : AMDGPUDimProfile { + let IsSample = 1; + let RetTypes = [llvm_anyfloat_ty]; + let ExtraAddrArgs = sample.ExtraAddrArgs; + let Gradients = sample.Gradients; + let LodClampMip = sample.LodOrClamp; +} + +class AMDGPUDimNoSampleProfile retty, + list dataargs, + bit Mip = 0> : AMDGPUDimProfile { + let RetTypes = retty; + let DataArgs = dataargs; + let LodClampMip = !if(Mip, "mip", ""); +} + +class AMDGPUDimAtomicProfile dataargs> : AMDGPUDimProfile { + let RetTypes = [llvm_anyint_ty]; + let DataArgs = dataargs; + let IsAtomic = 1; +} + +class AMDGPUDimGetResInfoProfile : 
AMDGPUDimProfile<"GET_RESINFO", dim> { + let RetTypes = [llvm_anyfloat_ty]; + let DataArgs = []; + let AddrArgs = [AMDGPUArg]; + let LodClampMip = "mip"; +} + +// All dimension-aware intrinsics are derived from this class. +class AMDGPUImageDimIntrinsic props, + list sdnodeprops> : Intrinsic< + P_.RetTypes, // vdata(VGPR) -- for load/atomic-with-return + !listconcat( + !foreach(arg, P_.DataArgs, arg.Type), // vdata(VGPR) -- for store/atomic + !if(P_.IsAtomic, [], [llvm_i32_ty]), // dmask(imm) + P_.AddrTypes, // vaddr(VGPR) + [llvm_v8i32_ty], // rsrc(SGPR) + !if(P_.IsSample, [llvm_v4i32_ty, // samp(SGPR) + llvm_i1_ty], []), // unorm(imm) + [llvm_i32_ty, // texfailctrl(imm; bit 0 = tfe, bit 1 = lwe) + llvm_i32_ty]), // cachepolicy(imm; bit 0 = glc, bit 1 = slc) + props, "", sdnodeprops>, + AMDGPURsrcIntrinsic { + AMDGPUDimProfile P = P_; + + let TargetPrefix = "amdgcn"; +} + +defset list AMDGPUImageDimIntrinsics = { + + ////////////////////////////////////////////////////////////////////////// + // Load and store intrinsics + ////////////////////////////////////////////////////////////////////////// + multiclass AMDGPUImageDimIntrinsicsNoMsaa retty, + list dataargs, + list props, + list sdnodeprops, + bit Mip = 0> { + foreach dim = AMDGPUDims.NoMsaa in { + def !strconcat(NAME, "_", dim.Name) + : AMDGPUImageDimIntrinsic< + AMDGPUDimNoSampleProfile, + props, sdnodeprops>; + } + } + + multiclass AMDGPUImageDimIntrinsicsAll retty, + list dataargs, + list props, + list sdnodeprops, + bit Mip = 0> { + foreach dim = AMDGPUDims.All in { + def !strconcat(NAME, "_", dim.Name) + : AMDGPUImageDimIntrinsic< + AMDGPUDimNoSampleProfile, + props, sdnodeprops>; + } + } + + defm int_amdgcn_image_load : AMDGPUImageDimIntrinsicsAll< + "LOAD", [llvm_anyfloat_ty], [], [IntrReadMem], [SDNPMemOperand]>; + defm int_amdgcn_image_load_mip : AMDGPUImageDimIntrinsicsNoMsaa< + "LOAD_MIP", [llvm_anyfloat_ty], [], [IntrReadMem], [SDNPMemOperand], 1>; + + defm int_amdgcn_image_store : 
AMDGPUImageDimIntrinsicsAll< + "STORE", [], [AMDGPUArg], + [IntrWriteMem], [SDNPMemOperand]>; + defm int_amdgcn_image_store_mip : AMDGPUImageDimIntrinsicsNoMsaa< + "STORE_MIP", [], [AMDGPUArg], + [IntrWriteMem], [SDNPMemOperand], 1>; + + ////////////////////////////////////////////////////////////////////////// + // sample and getlod intrinsics + ////////////////////////////////////////////////////////////////////////// + multiclass AMDGPUImageDimSampleDims { + foreach dim = AMDGPUDims.NoMsaa in { + def !strconcat(NAME, "_", dim.Name) : AMDGPUImageDimIntrinsic< + AMDGPUDimSampleProfile, + !if(NoMem, [IntrNoMem], [IntrReadMem]), + !if(NoMem, [], [SDNPMemOperand])>; + } + } + + foreach sample = AMDGPUSampleVariants in { + defm int_amdgcn_image_sample # sample.LowerCaseMod : + AMDGPUImageDimSampleDims<"SAMPLE" # sample.UpperCaseMod, sample>; + } + + defm int_amdgcn_image_getlod : AMDGPUImageDimSampleDims<"GET_LOD", AMDGPUSample, 1>; +} + +////////////////////////////////////////////////////////////////////////// +// getresinfo intrinsics (separate due to D16) +////////////////////////////////////////////////////////////////////////// +defset list AMDGPUImageDimGetResInfoIntrinsics = { + foreach dim = AMDGPUDims.All in { + def !strconcat("int_amdgcn_image_getresinfo_", dim.Name) + : AMDGPUImageDimIntrinsic, [IntrNoMem], []>; + } +} + +////////////////////////////////////////////////////////////////////////// +// gather4 intrinsics +////////////////////////////////////////////////////////////////////////// +defset list AMDGPUImageDimGatherIntrinsics = { + foreach sample = AMDGPUSampleVariantsNoGradients in { + foreach dim = [AMDGPUDim2D, AMDGPUDimCube, AMDGPUDim2DArray] in { + def int_amdgcn_image_gather4 # sample.LowerCaseMod # _ # dim.Name: + AMDGPUImageDimIntrinsic< + AMDGPUDimSampleProfile<"GATHER4" # sample.UpperCaseMod, dim, sample>, + [IntrReadMem], [SDNPMemOperand]>; + } + } +} + +////////////////////////////////////////////////////////////////////////// +// 
atomic intrinsics +////////////////////////////////////////////////////////////////////////// +defset list AMDGPUImageDimAtomicIntrinsics = { + multiclass AMDGPUImageDimAtomicX dataargs> { + foreach dim = AMDGPUDims.All in { + def !strconcat(NAME, "_", dim.Name) + : AMDGPUImageDimIntrinsic< + AMDGPUDimAtomicProfile, + [], [SDNPMemOperand]>; + } + } + + multiclass AMDGPUImageDimAtomic { + defm "" : AMDGPUImageDimAtomicX, "vdata">]>; + } + + defm int_amdgcn_image_atomic_swap : AMDGPUImageDimAtomic<"ATOMIC_SWAP">; + defm int_amdgcn_image_atomic_add : AMDGPUImageDimAtomic<"ATOMIC_ADD">; + defm int_amdgcn_image_atomic_sub : AMDGPUImageDimAtomic<"ATOMIC_SUB">; + defm int_amdgcn_image_atomic_smin : AMDGPUImageDimAtomic<"ATOMIC_SMIN">; + defm int_amdgcn_image_atomic_umin : AMDGPUImageDimAtomic<"ATOMIC_UMIN">; + defm int_amdgcn_image_atomic_smax : AMDGPUImageDimAtomic<"ATOMIC_SMAX">; + defm int_amdgcn_image_atomic_umax : AMDGPUImageDimAtomic<"ATOMIC_UMAX">; + defm int_amdgcn_image_atomic_and : AMDGPUImageDimAtomic<"ATOMIC_AND">; + defm int_amdgcn_image_atomic_or : AMDGPUImageDimAtomic<"ATOMIC_OR">; + defm int_amdgcn_image_atomic_xor : AMDGPUImageDimAtomic<"ATOMIC_XOR">; + + // TODO: INC/DEC are weird: they seem to have a vdata argument in hardware, + // even though it clearly shouldn't be needed + defm int_amdgcn_image_atomic_inc : AMDGPUImageDimAtomic<"ATOMIC_INC">; + defm int_amdgcn_image_atomic_dec : AMDGPUImageDimAtomic<"ATOMIC_DEC">; + + defm int_amdgcn_image_atomic_cmpswap : + AMDGPUImageDimAtomicX<"ATOMIC_CMPSWAP", [AMDGPUArg, "src">, + AMDGPUArg, "cmp">]>; +} + +////////////////////////////////////////////////////////////////////////// +// Buffer intrinsics +////////////////////////////////////////////////////////////////////////// + +let TargetPrefix = "amdgcn" in { + defset list AMDGPUBufferIntrinsics = { class AMDGPUBufferLoad : Intrinsic < diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp index cb2064c..32118df 
100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp @@ -29,6 +29,9 @@ namespace llvm { namespace AMDGPU { #define GET_RSRCINTRINSIC_IMPL #include "AMDGPUGenSearchableTables.inc" + +#define GET_D16IMAGEDIMINTRINSIC_IMPL +#include "AMDGPUGenSearchableTables.inc" } } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h index 8f75b42..766ee3d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h @@ -63,6 +63,12 @@ struct RsrcIntrinsic { }; const RsrcIntrinsic *lookupRsrcIntrinsicByIntr(unsigned Intr); +struct D16ImageDimIntrinsic { + unsigned Intr; + unsigned D16HelperIntr; +}; +const D16ImageDimIntrinsic *lookupD16ImageDimIntrinsicByIntr(unsigned Intr); + } // end AMDGPU namespace } // End llvm namespace diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td index 0b9fe078..fce7499 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td @@ -23,7 +23,11 @@ class RsrcIntrinsic : SearchableTable { } foreach intr = !listconcat(AMDGPUBufferIntrinsics, - AMDGPUImageIntrinsics) in { + AMDGPUImageIntrinsics, + AMDGPUImageDimIntrinsics, + AMDGPUImageDimGatherIntrinsics, + AMDGPUImageDimGetResInfoIntrinsics, + AMDGPUImageDimAtomicIntrinsics) in { def : RsrcIntrinsic(intr)>; } @@ -76,3 +80,20 @@ def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; + +foreach intr = AMDGPUImageDimAtomicIntrinsics in +def : SourceOfDivergence; + +class D16ImageDimIntrinsic : SearchableTable { + let SearchableFields = ["Intr"]; + let EnumNameField = ?; + + Intrinsic Intr = intr; + code D16HelperIntr = + !cast("AMDGPUIntrinsic::SI_image_d16helper_" # intr.P.OpMod # intr.P.Dim.Name); +} + +foreach intr = !listconcat(AMDGPUImageDimIntrinsics, + AMDGPUImageDimGatherIntrinsics) in { + def : 
D16ImageDimIntrinsic; +} diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td index 7a7b54e..2d2aaf7 100644 --- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td +++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td @@ -464,6 +464,201 @@ defm IMAGE_SAMPLE_C_CD_CL_O : MIMG_Sampler <0x0000006f, "image_sample_c_cd_cl_o" //def IMAGE_SAMPLER : MIMG_NoPattern_ <"image_sampler", 0x0000007f>; } +/********** ============================== **********/ +/********** Dimension-aware image patterns **********/ +/********** ============================== **********/ + +class getDwordsType { + int NumDwords = dwords; + string suffix = !if(!lt(dwords, 1), ?, + !if(!eq(dwords, 1), "_V1", + !if(!eq(dwords, 2), "_V2", + !if(!le(dwords, 4), "_V4", + !if(!le(dwords, 8), "_V8", + !if(!le(dwords, 16), "_V16", ?)))))); + ValueType VT = !if(!lt(dwords, 1), ?, + !if(!eq(dwords, 1), f32, + !if(!eq(dwords, 2), v2f32, + !if(!le(dwords, 4), v4f32, + !if(!le(dwords, 8), v8f32, + !if(!le(dwords, 16), v16f32, ?)))))); + RegisterClass VReg = !if(!lt(dwords, 1), ?, + !if(!eq(dwords, 1), VGPR_32, + !if(!eq(dwords, 2), VReg_64, + !if(!le(dwords, 4), VReg_128, + !if(!le(dwords, 8), VReg_256, + !if(!le(dwords, 16), VReg_512, ?)))))); +} + +class makeRegSequence_Fold { + int idx = i; + dag lhs = d; +} + +// Generate a dag node which returns a vector register of class RC into which +// the source operands given by names have been inserted (assuming that each +// name corresponds to an operand whose size is equal to a subregister). 
+class makeRegSequence names> { + dag ret = + !if(!eq(!size(names), 1), + !dag(COPY_TO_REGCLASS, [?, RC], [names[0], ?]), + !foldl(makeRegSequence_Fold<0, (vt (IMPLICIT_DEF))>, names, f, name, + makeRegSequence_Fold< + !add(f.idx, 1), + !con((INSERT_SUBREG f.lhs), + !dag(INSERT_SUBREG, [?, !cast("sub"#f.idx)], + [name, ?]))>).lhs); +} + +class ImageDimPattern : GCNPat<(undef), (undef)> { + list AddrArgs = I.P.AddrDefaultArgs; + getDwordsType AddrDwords = getDwordsType; + + Instruction MI = + !cast(!strconcat("IMAGE_", I.P.OpMod, dop, AddrDwords.suffix, suffix)); + + // DAG fragment to match data arguments (vdata for store/atomic, dmask + // for non-atomic). + dag MatchDataDag = + !con(!dag(I, !foreach(arg, I.P.DataArgs, dty), + !foreach(arg, I.P.DataArgs, arg.Name)), + !if(I.P.IsAtomic, (I), (I i32:$dmask))); + + // DAG fragment to match vaddr arguments. + dag MatchAddrDag = !dag(I, !foreach(arg, AddrArgs, arg.Type.VT), + !foreach(arg, AddrArgs, arg.Name)); + + // DAG fragment to match sampler resource and unorm arguments. 
+ dag MatchSamplerDag = !if(I.P.IsSample, (I v4i32:$sampler, i1:$unorm), (I)); + + // DAG node that generates the MI vdata for store/atomic + getDwordsType DataDwords = getDwordsType; + dag GenDataDag = + !if(I.P.IsAtomic, (MI makeRegSequence.ret), + !if(!size(I.P.DataArgs), (MI $vdata), (MI))); + + // DAG node that generates the MI vaddr + dag GenAddrDag = makeRegSequence.ret; + // DAG fragments that generate various inline flags + dag GenDmask = + !if(I.P.IsAtomic, (MI !add(!shl(1, DataDwords.NumDwords), -1)), + (MI (as_i32imm $dmask))); + dag GenGLC = + !if(I.P.IsAtomic, (MI 1), + (MI (bitextract_imm<0> $cachepolicy))); + + dag MatchIntrinsic = !con(MatchDataDag, + MatchAddrDag, + (I v8i32:$rsrc), + MatchSamplerDag, + (I 0/*texfailctrl*/, + i32:$cachepolicy)); + let PatternToMatch = + !if(!size(I.RetTypes), (dty MatchIntrinsic), MatchIntrinsic); + + bit IsCmpSwap = !and(I.P.IsAtomic, !eq(!size(I.P.DataArgs), 2)); + dag ImageInstruction = + !con(GenDataDag, + (MI GenAddrDag), + (MI $rsrc), + !if(I.P.IsSample, (MI $sampler), (MI)), + GenDmask, + !if(I.P.IsSample, (MI (as_i1imm $unorm)), (MI 1)), + GenGLC, + (MI (bitextract_imm<1> $cachepolicy), + 0, /* r128 */ + 0, /* tfe */ + 0 /*(as_i1imm $lwe)*/, + { I.P.Dim.DA })); + let ResultInstrs = [ + !if(IsCmpSwap, (EXTRACT_SUBREG ImageInstruction, sub0), ImageInstruction) + ]; +} + +foreach intr = !listconcat(AMDGPUImageDimIntrinsics, + AMDGPUImageDimGetResInfoIntrinsics) in { + def intr#_pat_v1 : ImageDimPattern; + def intr#_pat_v2 : ImageDimPattern; + def intr#_pat_v4 : ImageDimPattern; +} + +// v2f16 and v4f16 are used as data types to signal that D16 should be used. +// However, they are not (always) legal types, and the SelectionDAG requires us +// to legalize them before running any patterns. So we legalize them by +// converting to an int type of equal size and using an internal 'd16helper' +// intrinsic instead which signifies both the use of D16 and actually allows +// this integer-based return type. 
+multiclass ImageDimD16Helper { + let SubtargetPredicate = HasUnpackedD16VMem in { + def _unpacked_v1 : ImageDimPattern; + def _unpacked_v2 : ImageDimPattern; + def _unpacked_v4 : ImageDimPattern; + } // End HasUnpackedD16VMem. + + let SubtargetPredicate = HasPackedD16VMem in { + def _packed_v1 : ImageDimPattern; + // used on gfx810 + def _packed_v2 : ImageDimPattern; + // used on gfx900 + def _packed_v2_gfx9 : ImageDimPattern; + def _packed_v4 : ImageDimPattern; + } // End HasPackedD16VMem. +} + +foreach intr = AMDGPUImageDimIntrinsics in { + def intr#_d16helper_profile : AMDGPUDimProfileCopy { + let RetTypes = !foreach(ty, intr.P.RetTypes, llvm_any_ty); + let DataArgs = !foreach(arg, intr.P.DataArgs, AMDGPUArg); + } + + let TargetPrefix = "SI", isTarget = 1 in + def int_SI_image_d16helper_ # intr.P.OpMod # intr.P.Dim.Name : + AMDGPUImageDimIntrinsic(intr#"_d16helper_profile"), + intr.IntrProperties, intr.Properties>; + + defm intr#_d16 : + ImageDimD16Helper< + intr, !cast( + "int_SI_image_d16helper_" # intr.P.OpMod # intr.P.Dim.Name)>; +} + +foreach intr = AMDGPUImageDimGatherIntrinsics in { + def intr#_pat3 : ImageDimPattern; + + def intr#_d16helper_profile : AMDGPUDimProfileCopy { + let RetTypes = !foreach(ty, intr.P.RetTypes, llvm_any_ty); + let DataArgs = !foreach(arg, intr.P.DataArgs, AMDGPUArg); + } + + let TargetPrefix = "SI", isTarget = 1 in + def int_SI_image_d16helper_ # intr.P.OpMod # intr.P.Dim.Name : + AMDGPUImageDimIntrinsic(intr#"_d16helper_profile"), + intr.IntrProperties, intr.Properties>; + + let SubtargetPredicate = HasUnpackedD16VMem in { + def intr#_unpacked_v4 : + ImageDimPattern( + "int_SI_image_d16helper_" # intr.P.OpMod # intr.P.Dim.Name), + "_V4", v4i32, "_D16_gfx80">; + } // End HasUnpackedD16VMem. + + let SubtargetPredicate = HasPackedD16VMem in { + def intr#_packed_v4 : + ImageDimPattern( + "int_SI_image_d16helper_" # intr.P.OpMod # intr.P.Dim.Name), + "_V2", v2i32, "_D16">; + } // End HasPackedD16VMem. 
+} + +foreach intr = AMDGPUImageDimAtomicIntrinsics in { + def intr#_pat1 : ImageDimPattern; +} + /********** ======================= **********/ /********** Image sampling patterns **********/ /********** ======================= **********/ diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 2a7549e..6f68f63 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -3677,9 +3677,23 @@ SDValue SITargetLowering::lowerIntrinsicWChain_IllegalReturnType(SDValue Op, Chain = Res.getValue(1); return adjustLoadValueType(Res, LoadVT, DL, DAG, Unpacked); } - default: + default: { + const AMDGPU::D16ImageDimIntrinsic *D16ImageDimIntr = + AMDGPU::lookupD16ImageDimIntrinsicByIntr(IID); + if (D16ImageDimIntr) { + SmallVector Ops; + for (auto Value : Op.getNode()->op_values()) + Ops.push_back(Value); + Ops[1] = DAG.getConstant(D16ImageDimIntr->D16HelperIntr, DL, MVT::i32); + Res = DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, VTList, Ops, + M->getMemoryVT(), M->getMemOperand()); + Chain = Res.getValue(1); + return adjustLoadValueType(Res, LoadVT, DL, DAG, Unpacked); + } + return SDValue(); } + } } void SITargetLowering::ReplaceNodeResults(SDNode *N, @@ -5151,9 +5165,32 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, M->getMemoryVT(), M->getMemOperand()); } - default: + default: { + const AMDGPU::D16ImageDimIntrinsic *D16ImageDimIntr = + AMDGPU::lookupD16ImageDimIntrinsicByIntr(IntrinsicID); + if (D16ImageDimIntr) { + SDValue VData = Op.getOperand(2); + EVT StoreVT = VData.getValueType(); + if ((StoreVT == MVT::v2f16 && !isTypeLegal(StoreVT)) || + StoreVT == MVT::v4f16) { + VData = handleD16VData(VData, DAG); + + SmallVector Ops; + for (auto Value : Op.getNode()->op_values()) + Ops.push_back(Value); + Ops[1] = DAG.getConstant(D16ImageDimIntr->D16HelperIntr, DL, MVT::i32); + Ops[2] = VData; + + MemSDNode *M = cast(Op); + return 
DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, DL, Op->getVTList(), + Ops, M->getMemoryVT(), + M->getMemOperand()); + } + } + return Op; } + } } SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 777bb0a..8797253 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -572,6 +572,12 @@ return CurDAG->getTargetConstant( N->getValueAPF().bitcastToAPInt().getZExtValue(), SDLoc(N), MVT::i64); }]>; +class bitextract_imm : SDNodeXFormgetZExtValue(); + unsigned Bit = (Imm >> }] # bitnum # [{ ) & 1; + return CurDAG->getTargetConstant(Bit, SDLoc(N), MVT::i1); +}]>; + def SIMM16bit : PatLeaf <(imm), [{return isInt<16>(N->getSExtValue());}] >; diff --git a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/llvm.amdgcn.image.atomic.ll b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/llvm.amdgcn.image.atomic.ll index 4a3b60f..76d501b 100644 --- a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/llvm.amdgcn.image.atomic.ll +++ b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/llvm.amdgcn.image.atomic.ll @@ -104,6 +104,14 @@ main_body: ret float %r } +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.add.2d.i32.i32( +define float @image_atomic_add_2d(<8 x i32> inreg %rsrc, i32 inreg %s, i32 inreg %t, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.image.atomic.add.2d.i32.i32(i32 %data, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + declare i32 @llvm.amdgcn.image.atomic.swap.i32(i32, i32, <8 x i32>, i1, i1, i1) #0 declare i32 @llvm.amdgcn.image.atomic.add.i32(i32, i32, <8 x i32>, i1, i1, i1) #0 declare i32 @llvm.amdgcn.image.atomic.sub.i32(i32, i32, <8 x i32>, i1, i1, i1) #0 @@ -118,4 +126,6 @@ declare i32 @llvm.amdgcn.image.atomic.inc.i32(i32, i32, <8 x i32>, i1, i1, i1) # declare i32 @llvm.amdgcn.image.atomic.dec.i32(i32, i32, <8 x i32>, i1, i1, i1) #0 
declare i32 @llvm.amdgcn.image.atomic.cmpswap.i32(i32, i32, i32, <8 x i32>,i1, i1, i1) #0 +declare i32 @llvm.amdgcn.image.atomic.add.2d.i32.i32(i32, i32, i32, <8 x i32>, i32, i32) #0 + attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.dim.ll new file mode 100644 index 0000000..bc0356a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.dim.ll @@ -0,0 +1,216 @@ +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +; GCN-LABEL: {{^}}atomic_swap_1d: +; GCN: image_atomic_swap v0, v1, s[0:7] dmask:0x1 unorm glc{{$}} +define amdgpu_ps float @atomic_swap_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +main_body: + %v = call i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + %out = bitcast i32 %v to float + ret float %out +} + +; GCN-LABEL: {{^}}atomic_add_1d: +; GCN: image_atomic_add v0, v1, s[0:7] dmask:0x1 unorm glc{{$}} +define amdgpu_ps float @atomic_add_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +main_body: + %v = call i32 @llvm.amdgcn.image.atomic.add.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + %out = bitcast i32 %v to float + ret float %out +} + +; GCN-LABEL: {{^}}atomic_sub_1d: +; GCN: image_atomic_sub v0, v1, s[0:7] dmask:0x1 unorm glc{{$}} +define amdgpu_ps float @atomic_sub_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +main_body: + %v = call i32 @llvm.amdgcn.image.atomic.sub.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + %out = bitcast i32 %v to float + ret float %out +} + +; GCN-LABEL: {{^}}atomic_smin_1d: +; GCN: image_atomic_smin v0, v1, s[0:7] dmask:0x1 unorm glc{{$}} +define amdgpu_ps float @atomic_smin_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +main_body: + %v = call i32 @llvm.amdgcn.image.atomic.smin.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + %out = bitcast i32 %v to float + ret 
float %out +} + +; GCN-LABEL: {{^}}atomic_umin_1d: +; GCN: image_atomic_umin v0, v1, s[0:7] dmask:0x1 unorm glc{{$}} +define amdgpu_ps float @atomic_umin_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +main_body: + %v = call i32 @llvm.amdgcn.image.atomic.umin.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + %out = bitcast i32 %v to float + ret float %out +} + +; GCN-LABEL: {{^}}atomic_smax_1d: +; GCN: image_atomic_smax v0, v1, s[0:7] dmask:0x1 unorm glc{{$}} +define amdgpu_ps float @atomic_smax_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +main_body: + %v = call i32 @llvm.amdgcn.image.atomic.smax.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + %out = bitcast i32 %v to float + ret float %out +} + +; GCN-LABEL: {{^}}atomic_umax_1d: +; GCN: image_atomic_umax v0, v1, s[0:7] dmask:0x1 unorm glc{{$}} +define amdgpu_ps float @atomic_umax_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +main_body: + %v = call i32 @llvm.amdgcn.image.atomic.umax.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + %out = bitcast i32 %v to float + ret float %out +} + +; GCN-LABEL: {{^}}atomic_and_1d: +; GCN: image_atomic_and v0, v1, s[0:7] dmask:0x1 unorm glc{{$}} +define amdgpu_ps float @atomic_and_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +main_body: + %v = call i32 @llvm.amdgcn.image.atomic.and.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + %out = bitcast i32 %v to float + ret float %out +} + +; GCN-LABEL: {{^}}atomic_or_1d: +; GCN: image_atomic_or v0, v1, s[0:7] dmask:0x1 unorm glc{{$}} +define amdgpu_ps float @atomic_or_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +main_body: + %v = call i32 @llvm.amdgcn.image.atomic.or.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + %out = bitcast i32 %v to float + ret float %out +} + +; GCN-LABEL: {{^}}atomic_xor_1d: +; GCN: image_atomic_xor v0, v1, s[0:7] dmask:0x1 unorm glc{{$}} +define amdgpu_ps float @atomic_xor_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +main_body: + 
%v = call i32 @llvm.amdgcn.image.atomic.xor.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + %out = bitcast i32 %v to float + ret float %out +} + +; GCN-LABEL: {{^}}atomic_inc_1d: +; GCN: image_atomic_inc v0, v1, s[0:7] dmask:0x1 unorm glc{{$}} +define amdgpu_ps float @atomic_inc_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +main_body: + %v = call i32 @llvm.amdgcn.image.atomic.inc.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + %out = bitcast i32 %v to float + ret float %out +} + +; GCN-LABEL: {{^}}atomic_dec_1d: +; GCN: image_atomic_dec v0, v1, s[0:7] dmask:0x1 unorm glc{{$}} +define amdgpu_ps float @atomic_dec_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +main_body: + %v = call i32 @llvm.amdgcn.image.atomic.dec.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + %out = bitcast i32 %v to float + ret float %out +} + +; GCN-LABEL: {{^}}atomic_cmpswap_1d: +; GCN: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm glc{{$}} +define amdgpu_ps float @atomic_cmpswap_1d(<8 x i32> inreg %rsrc, i32 %cmp, i32 %swap, i32 %s) { +main_body: + %v = call i32 @llvm.amdgcn.image.atomic.cmpswap.1d.i32.i32(i32 %cmp, i32 %swap, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + %out = bitcast i32 %v to float + ret float %out +} + +; GCN-LABEL: {{^}}atomic_add_2d: +; GCN: image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 unorm glc{{$}} +define amdgpu_ps float @atomic_add_2d(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %t) { +main_body: + %v = call i32 @llvm.amdgcn.image.atomic.add.2d.i32.i32(i32 %data, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) + %out = bitcast i32 %v to float + ret float %out +} + +; GCN-LABEL: {{^}}atomic_add_3d: +; GCN: image_atomic_add v0, v[1:4], s[0:7] dmask:0x1 unorm glc{{$}} +define amdgpu_ps float @atomic_add_3d(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %t, i32 %r) { +main_body: + %v = call i32 @llvm.amdgcn.image.atomic.add.3d.i32.i32(i32 %data, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 0, i32 0) + %out = bitcast 
i32 %v to float + ret float %out +} + +; GCN-LABEL: {{^}}atomic_add_cube: +; GCN: image_atomic_add v0, v[1:4], s[0:7] dmask:0x1 unorm glc da{{$}} +define amdgpu_ps float @atomic_add_cube(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %t, i32 %face) { +main_body: + %v = call i32 @llvm.amdgcn.image.atomic.add.cube.i32.i32(i32 %data, i32 %s, i32 %t, i32 %face, <8 x i32> %rsrc, i32 0, i32 0) + %out = bitcast i32 %v to float + ret float %out +} + +; GCN-LABEL: {{^}}atomic_add_1darray: +; GCN: image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 unorm glc da{{$}} +define amdgpu_ps float @atomic_add_1darray(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %slice) { +main_body: + %v = call i32 @llvm.amdgcn.image.atomic.add.1darray.i32.i32(i32 %data, i32 %s, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0) + %out = bitcast i32 %v to float + ret float %out +} + +; GCN-LABEL: {{^}}atomic_add_2darray: +; GCN: image_atomic_add v0, v[1:4], s[0:7] dmask:0x1 unorm glc da{{$}} +define amdgpu_ps float @atomic_add_2darray(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %t, i32 %slice) { +main_body: + %v = call i32 @llvm.amdgcn.image.atomic.add.2darray.i32.i32(i32 %data, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0) + %out = bitcast i32 %v to float + ret float %out +} + +; GCN-LABEL: {{^}}atomic_add_2dmsaa: +; GCN: image_atomic_add v0, v[1:4], s[0:7] dmask:0x1 unorm glc{{$}} +define amdgpu_ps float @atomic_add_2dmsaa(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %t, i32 %fragid) { +main_body: + %v = call i32 @llvm.amdgcn.image.atomic.add.2dmsaa.i32.i32(i32 %data, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0) + %out = bitcast i32 %v to float + ret float %out +} + +; GCN-LABEL: {{^}}atomic_add_2darraymsaa: +; GCN: image_atomic_add v0, v[1:4], s[0:7] dmask:0x1 unorm glc da{{$}} +define amdgpu_ps float @atomic_add_2darraymsaa(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %t, i32 %slice, i32 %fragid) { +main_body: + %v = call i32 
@llvm.amdgcn.image.atomic.add.2darraymsaa.i32.i32(i32 %data, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0) + %out = bitcast i32 %v to float + ret float %out +} + +; GCN-LABEL: {{^}}atomic_add_1d_slc: +; GCN: image_atomic_add v0, v1, s[0:7] dmask:0x1 unorm glc slc{{$}} +define amdgpu_ps float @atomic_add_1d_slc(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +main_body: + %v = call i32 @llvm.amdgcn.image.atomic.add.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 2) + %out = bitcast i32 %v to float + ret float %out +} + +declare i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.add.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.sub.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.smin.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.umin.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.smax.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.umax.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.and.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.or.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.xor.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.inc.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.dec.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.cmpswap.1d.i32.i32(i32, i32, i32, <8 x i32>, i32, i32) #0 + +declare i32 @llvm.amdgcn.image.atomic.add.2d.i32.i32(i32, i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.add.3d.i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.add.cube.i32.i32(i32, i32, i32, i32, <8 x i32>, i32, 
i32) #0 +declare i32 @llvm.amdgcn.image.atomic.add.1darray.i32.i32(i32, i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.add.2darray.i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.add.2dmsaa.i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.add.2darraymsaa.i32.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #0 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readonly } +attributes #2 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.d16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.d16.dim.ll new file mode 100644 index 0000000..f43a799 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.d16.dim.ll @@ -0,0 +1,115 @@ +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=UNPACKED %s +; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX81 %s +; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX9 %s + +; GCN-LABEL: {{^}}image_load_f16: +; GCN: image_load v0, v[0:1], s[0:7] dmask:0x1 unorm d16{{$}} +define amdgpu_ps half @image_load_f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t) { +main_body: + %tex = call half @llvm.amdgcn.image.load.2d.f16.i32(i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) + ret half %tex +} + +; GCN-LABEL: {{^}}image_load_v2f16: +; UNPACKED: image_load v[0:1], v[0:1], s[0:7] dmask:0x3 unorm d16{{$}} +; PACKED: image_load v0, v[0:1], s[0:7] dmask:0x3 unorm d16{{$}} +define amdgpu_ps float @image_load_v2f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t) { +main_body: + %tex = call <2 x half> @llvm.amdgcn.image.load.2d.v2f16.i32(i32 3, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) + %r = bitcast <2 x half> %tex to float + ret float %r +} + +; GCN-LABEL: {{^}}image_load_v4f16: +; 
UNPACKED: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm d16{{$}} +; PACKED: image_load v[0:1], v[0:1], s[0:7] dmask:0xf unorm d16{{$}} +define amdgpu_ps <2 x float> @image_load_v4f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t) { +main_body: + %tex = call <4 x half> @llvm.amdgcn.image.load.2d.v4f16.i32(i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) + %r = bitcast <4 x half> %tex to <2 x float> + ret <2 x float> %r +} + +; GCN-LABEL: {{^}}image_load_mip_v4f16: +; UNPACKED: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm d16{{$}} +; PACKED: image_load_mip v[0:1], v[0:3], s[0:7] dmask:0xf unorm d16{{$}} +define amdgpu_ps <2 x float> @image_load_mip_v4f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %mip) { +main_body: + %tex = call <4 x half> @llvm.amdgcn.image.load.mip.2d.v4f16.i32(i32 15, i32 %s, i32 %t, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) + %r = bitcast <4 x half> %tex to <2 x float> + ret <2 x float> %r +} + +; GCN-LABEL: {{^}}image_load_3d_v2f16: +; UNPACKED: image_load v[0:1], v[0:3], s[0:7] dmask:0x3 unorm d16{{$}} +; PACKED: image_load v0, v[0:3], s[0:7] dmask:0x3 unorm d16{{$}} +define amdgpu_ps float @image_load_3d_v2f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %r) { +main_body: + %tex = call <2 x half> @llvm.amdgcn.image.load.3d.v2f16.i32(i32 3, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 0, i32 0) + %x = bitcast <2 x half> %tex to float + ret float %x +} + +; GCN-LABEL: {{^}}image_store_f16 +; GCN: image_store v2, v[0:1], s[0:7] dmask:0x1 unorm d16{{$}} +define amdgpu_ps void @image_store_f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, half %data) { +main_body: + call void @llvm.amdgcn.image.store.2d.f16.i32(half %data, i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +; GCN-LABEL: {{^}}image_store_v2f16 +; UNPACKED: v_lshrrev_b32_e32 +; UNPACKED: v_and_b32_e32 +; UNPACKED: image_store v[{{[0-9:]+}}], v[0:1], s[0:7] dmask:0x3 unorm d16{{$}} +; PACKED: image_store v2, v[0:1], s[0:7] dmask:0x3 unorm d16{{$}} +define 
amdgpu_ps void @image_store_v2f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, float %in) { +main_body: + %data = bitcast float %in to <2 x half> + call void @llvm.amdgcn.image.store.2d.v2f16.i32(<2 x half> %data, i32 3, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +; GCN-LABEL: {{^}}image_store_v4f16 +; UNPACKED: v_lshrrev_b32_e32 +; UNPACKED: v_and_b32_e32 +; UNPACKED: v_lshrrev_b32_e32 +; UNPACKED: v_and_b32_e32 +; UNPACKED: image_store v[{{[0-9:]+}}], v[0:1], s[0:7] dmask:0xf unorm d16{{$}} +; PACKED: image_store v[2:3], v[0:1], s[0:7] dmask:0xf unorm d16{{$}} +define amdgpu_ps void @image_store_v4f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, <2 x float> %in) { +main_body: + %data = bitcast <2 x float> %in to <4 x half> + call void @llvm.amdgcn.image.store.2d.v4f16.i32(<4 x half> %data, i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +; GCN-LABEL: {{^}}image_store_mip_1d_v4f16 +; UNPACKED: v_lshrrev_b32_e32 +; UNPACKED: v_and_b32_e32 +; UNPACKED: v_lshrrev_b32_e32 +; UNPACKED: v_and_b32_e32 +; UNPACKED: image_store_mip v[{{[0-9:]+}}], v[0:1], s[0:7] dmask:0xf unorm d16{{$}} +; PACKED: image_store_mip v[2:3], v[0:1], s[0:7] dmask:0xf unorm d16{{$}} +define amdgpu_ps void @image_store_mip_1d_v4f16(<8 x i32> inreg %rsrc, i32 %s, i32 %mip, <2 x float> %in) { +main_body: + %data = bitcast <2 x float> %in to <4 x half> + call void @llvm.amdgcn.image.store.mip.1d.v4f16.i32(<4 x half> %data, i32 15, i32 %s, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +declare half @llvm.amdgcn.image.load.2d.f16.i32(i32, i32, i32, <8 x i32>, i32, i32) #1 +declare <2 x half> @llvm.amdgcn.image.load.2d.v2f16.i32(i32, i32, i32, <8 x i32>, i32, i32) #1 +declare <4 x half> @llvm.amdgcn.image.load.2d.v4f16.i32(i32, i32, i32, <8 x i32>, i32, i32) #1 +declare <4 x half> @llvm.amdgcn.image.load.mip.2d.v4f16.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1 +declare <2 x half> @llvm.amdgcn.image.load.3d.v2f16.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) 
#1 + +declare void @llvm.amdgcn.image.store.2d.f16.i32(half, i32, i32, i32, <8 x i32>, i32, i32) #0 +declare void @llvm.amdgcn.image.store.2d.v2f16.i32(<2 x half>, i32, i32, i32, <8 x i32>, i32, i32) #0 +declare void @llvm.amdgcn.image.store.2d.v4f16.i32(<4 x half>, i32, i32, i32, <8 x i32>, i32, i32) #0 +declare void @llvm.amdgcn.image.store.mip.1d.v4f16.i32(<4 x half>, i32, i32, i32, <8 x i32>, i32, i32) #0 +declare void @llvm.amdgcn.image.store.3d.v2f16.i32(<2 x half>, i32, i32, i32, i32, <8 x i32>, i32, i32) #0 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readonly } +attributes #2 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll new file mode 100644 index 0000000..8234e2c --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll @@ -0,0 +1,420 @@ +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +; GCN-LABEL: {{^}}load_1d: +; GCN: image_load v[0:3], v0, s[0:7] dmask:0xf unorm{{$}} +define amdgpu_ps <4 x float> @load_1d(<8 x i32> inreg %rsrc, i32 %s) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}load_2d: +; GCN: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm{{$}} +define amdgpu_ps <4 x float> @load_2d(<8 x i32> inreg %rsrc, i32 %s, i32 %t) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}load_3d: +; GCN: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm{{$}} +define amdgpu_ps <4 x float> @load_3d(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %r) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %r, <8 x 
i32> %rsrc, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}load_cube: +; GCN: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm da{{$}} +define amdgpu_ps <4 x float> @load_cube(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.load.cube.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}load_1darray: +; GCN: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm da{{$}} +define amdgpu_ps <4 x float> @load_1darray(<8 x i32> inreg %rsrc, i32 %s, i32 %slice) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.load.1darray.v4f32.i32(i32 15, i32 %s, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}load_2darray: +; GCN: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm da{{$}} +define amdgpu_ps <4 x float> @load_2darray(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.load.2darray.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}load_2dmsaa: +; GCN: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm{{$}} +define amdgpu_ps <4 x float> @load_2dmsaa(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %fragid) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.load.2dmsaa.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}load_2darraymsaa: +; GCN: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm da{{$}} +define amdgpu_ps <4 x float> @load_2darraymsaa(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice, i32 %fragid) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.load.2darraymsaa.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}load_mip_1d: +; GCN: image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf unorm{{$}} 
+define amdgpu_ps <4 x float> @load_mip_1d(<8 x i32> inreg %rsrc, i32 %s, i32 %mip) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.load.mip.1d.v4f32.i32(i32 15, i32 %s, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}load_mip_2d: +; GCN: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm{{$}} +define amdgpu_ps <4 x float> @load_mip_2d(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %mip) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.load.mip.2d.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}load_mip_3d: +; GCN: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm{{$}} +define amdgpu_ps <4 x float> @load_mip_3d(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %r, i32 %mip) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.load.mip.3d.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %r, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}load_mip_cube: +; GCN: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm da{{$}} +define amdgpu_ps <4 x float> @load_mip_cube(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice, i32 %mip) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.load.mip.cube.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}load_mip_1darray: +; GCN: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm da{{$}} +define amdgpu_ps <4 x float> @load_mip_1darray(<8 x i32> inreg %rsrc, i32 %s, i32 %slice, i32 %mip) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.load.mip.1darray.v4f32.i32(i32 15, i32 %s, i32 %slice, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}load_mip_2darray: +; GCN: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm da{{$}} +define amdgpu_ps <4 x float> @load_mip_2darray(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice, i32 %mip) { +main_body: + %v 
= call <4 x float> @llvm.amdgcn.image.load.mip.2darray.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}store_1d: +; GCN: image_store v[0:3], v4, s[0:7] dmask:0xf unorm{{$}} +define amdgpu_ps void @store_1d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s) { +main_body: + call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +; GCN-LABEL: {{^}}store_2d: +; GCN: image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm{{$}} +define amdgpu_ps void @store_2d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t) { +main_body: + call void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +; GCN-LABEL: {{^}}store_3d: +; GCN: image_store v[0:3], v[4:7], s[0:7] dmask:0xf unorm{{$}} +define amdgpu_ps void @store_3d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %r) { +main_body: + call void @llvm.amdgcn.image.store.3d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +; GCN-LABEL: {{^}}store_cube: +; GCN: image_store v[0:3], v[4:7], s[0:7] dmask:0xf unorm da{{$}} +define amdgpu_ps void @store_cube(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %slice) { +main_body: + call void @llvm.amdgcn.image.store.cube.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +; GCN-LABEL: {{^}}store_1darray: +; GCN: image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm da{{$}} +define amdgpu_ps void @store_1darray(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %slice) { +main_body: + call void @llvm.amdgcn.image.store.1darray.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +; GCN-LABEL: {{^}}store_2darray: +; GCN: image_store v[0:3], v[4:7], s[0:7] 
dmask:0xf unorm da{{$}} +define amdgpu_ps void @store_2darray(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %slice) { +main_body: + call void @llvm.amdgcn.image.store.2darray.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +; GCN-LABEL: {{^}}store_2dmsaa: +; GCN: image_store v[0:3], v[4:7], s[0:7] dmask:0xf unorm{{$}} +define amdgpu_ps void @store_2dmsaa(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %fragid) { +main_body: + call void @llvm.amdgcn.image.store.2dmsaa.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +; GCN-LABEL: {{^}}store_2darraymsaa: +; GCN: image_store v[0:3], v[4:7], s[0:7] dmask:0xf unorm da{{$}} +define amdgpu_ps void @store_2darraymsaa(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %slice, i32 %fragid) { +main_body: + call void @llvm.amdgcn.image.store.2darraymsaa.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +; GCN-LABEL: {{^}}store_mip_1d: +; GCN: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf unorm{{$}} +define amdgpu_ps void @store_mip_1d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %mip) { +main_body: + call void @llvm.amdgcn.image.store.mip.1d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +; GCN-LABEL: {{^}}store_mip_2d: +; GCN: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf unorm{{$}} +define amdgpu_ps void @store_mip_2d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %mip) { +main_body: + call void @llvm.amdgcn.image.store.mip.2d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +; GCN-LABEL: {{^}}store_mip_3d: +; GCN: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf unorm{{$}} +define amdgpu_ps void @store_mip_3d(<8 x i32> inreg %rsrc, 
<4 x float> %vdata, i32 %s, i32 %t, i32 %r, i32 %mip) { +main_body: + call void @llvm.amdgcn.image.store.mip.3d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %r, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +; GCN-LABEL: {{^}}store_mip_cube: +; GCN: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf unorm da{{$}} +define amdgpu_ps void @store_mip_cube(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %slice, i32 %mip) { +main_body: + call void @llvm.amdgcn.image.store.mip.cube.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %slice, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +; GCN-LABEL: {{^}}store_mip_1darray: +; GCN: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf unorm da{{$}} +define amdgpu_ps void @store_mip_1darray(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %slice, i32 %mip) { +main_body: + call void @llvm.amdgcn.image.store.mip.1darray.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %slice, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +; GCN-LABEL: {{^}}store_mip_2darray: +; GCN: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf unorm da{{$}} +define amdgpu_ps void @store_mip_2darray(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %slice, i32 %mip) { +main_body: + call void @llvm.amdgcn.image.store.mip.2darray.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %slice, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +; GCN-LABEL: {{^}}getresinfo_1d: +; GCN: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm{{$}} +define amdgpu_ps <4 x float> @getresinfo_1d(<8 x i32> inreg %rsrc, i32 %mip) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.getresinfo.1d.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}getresinfo_2d: +; GCN: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm{{$}} +define amdgpu_ps <4 x float> @getresinfo_2d(<8 x i32> inreg %rsrc, i32 %mip) { +main_body: + %v = call 
<4 x float> @llvm.amdgcn.image.getresinfo.2d.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}getresinfo_3d: +; GCN: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm{{$}} +define amdgpu_ps <4 x float> @getresinfo_3d(<8 x i32> inreg %rsrc, i32 %mip) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.getresinfo.3d.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}getresinfo_cube: +; GCN: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm da{{$}} +define amdgpu_ps <4 x float> @getresinfo_cube(<8 x i32> inreg %rsrc, i32 %mip) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.getresinfo.cube.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}getresinfo_1darray: +; GCN: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm da{{$}} +define amdgpu_ps <4 x float> @getresinfo_1darray(<8 x i32> inreg %rsrc, i32 %mip) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.getresinfo.1darray.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}getresinfo_2darray: +; GCN: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm da{{$}} +define amdgpu_ps <4 x float> @getresinfo_2darray(<8 x i32> inreg %rsrc, i32 %mip) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.getresinfo.2darray.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}getresinfo_2dmsaa: +; GCN: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm{{$}} +define amdgpu_ps <4 x float> @getresinfo_2dmsaa(<8 x i32> inreg %rsrc, i32 %mip) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.getresinfo.2dmsaa.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}getresinfo_2darraymsaa: +; GCN: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm da{{$}} +define amdgpu_ps <4 x float> 
@getresinfo_2darraymsaa(<8 x i32> inreg %rsrc, i32 %mip) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.getresinfo.2darraymsaa.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}load_1d_V1: +; GCN: image_load v0, v0, s[0:7] dmask:0x8 unorm{{$}} +define amdgpu_ps float @load_1d_V1(<8 x i32> inreg %rsrc, i32 %s) { +main_body: + %v = call float @llvm.amdgcn.image.load.1d.f32.i32(i32 8, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret float %v +} + +; GCN-LABEL: {{^}}load_1d_V2: +; GCN: image_load v[0:1], v0, s[0:7] dmask:0x9 unorm{{$}} +define amdgpu_ps <2 x float> @load_1d_V2(<8 x i32> inreg %rsrc, i32 %s) { +main_body: + %v = call <2 x float> @llvm.amdgcn.image.load.1d.v2f32.i32(i32 9, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret <2 x float> %v +} + +; GCN-LABEL: {{^}}store_1d_V1: +; GCN: image_store v0, v1, s[0:7] dmask:0x2 unorm{{$}} +define amdgpu_ps void @store_1d_V1(<8 x i32> inreg %rsrc, float %vdata, i32 %s) { +main_body: + call void @llvm.amdgcn.image.store.1d.f32.i32(float %vdata, i32 2, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +; GCN-LABEL: {{^}}store_1d_V2: +; GCN: image_store v[0:1], v2, s[0:7] dmask:0xc unorm{{$}} +define amdgpu_ps void @store_1d_V2(<8 x i32> inreg %rsrc, <2 x float> %vdata, i32 %s) { +main_body: + call void @llvm.amdgcn.image.store.1d.v2f32.i32(<2 x float> %vdata, i32 12, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +; GCN-LABEL: {{^}}load_1d_glc: +; GCN: image_load v[0:3], v0, s[0:7] dmask:0xf unorm glc{{$}} +define amdgpu_ps <4 x float> @load_1d_glc(<8 x i32> inreg %rsrc, i32 %s) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 1) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}load_1d_slc: +; GCN: image_load v[0:3], v0, s[0:7] dmask:0xf unorm slc{{$}} +define amdgpu_ps <4 x float> @load_1d_slc(<8 x i32> inreg %rsrc, i32 %s) { +main_body: + %v = call <4 x float> 
@llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 2) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}load_1d_glc_slc: +; GCN: image_load v[0:3], v0, s[0:7] dmask:0xf unorm glc slc{{$}} +define amdgpu_ps <4 x float> @load_1d_glc_slc(<8 x i32> inreg %rsrc, i32 %s) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 3) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}store_1d_glc: +; GCN: image_store v[0:3], v4, s[0:7] dmask:0xf unorm glc{{$}} +define amdgpu_ps void @store_1d_glc(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s) { +main_body: + call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 1) + ret void +} + +; GCN-LABEL: {{^}}store_1d_slc: +; GCN: image_store v[0:3], v4, s[0:7] dmask:0xf unorm slc{{$}} +define amdgpu_ps void @store_1d_slc(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s) { +main_body: + call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 2) + ret void +} + +; GCN-LABEL: {{^}}store_1d_glc_slc: +; GCN: image_store v[0:3], v4, s[0:7] dmask:0xf unorm glc slc{{$}} +define amdgpu_ps void @store_1d_glc_slc(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s) { +main_body: + call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 3) + ret void +} + +declare <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32, i32, <8 x i32>, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.load.cube.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.load.1darray.v4f32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1 +declare <4 x float> 
@llvm.amdgcn.image.load.2darray.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.load.2dmsaa.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.load.2darraymsaa.v4f32.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #1 + +declare <4 x float> @llvm.amdgcn.image.load.mip.1d.v4f32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.load.mip.2d.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.load.mip.3d.v4f32.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.load.mip.cube.v4f32.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.load.mip.1darray.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.load.mip.2darray.v4f32.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #1 + +declare void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float>, i32, i32, <8 x i32>, i32, i32) #0 +declare void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float>, i32, i32, i32, <8 x i32>, i32, i32) #0 +declare void @llvm.amdgcn.image.store.3d.v4f32.i32(<4 x float>, i32, i32, i32, i32, <8 x i32>, i32, i32) #0 +declare void @llvm.amdgcn.image.store.cube.v4f32.i32(<4 x float>, i32, i32, i32, i32, <8 x i32>, i32, i32) #0 +declare void @llvm.amdgcn.image.store.1darray.v4f32.i32(<4 x float>, i32, i32, i32, <8 x i32>, i32, i32) #0 +declare void @llvm.amdgcn.image.store.2darray.v4f32.i32(<4 x float>, i32, i32, i32, i32, <8 x i32>, i32, i32) #0 +declare void @llvm.amdgcn.image.store.2dmsaa.v4f32.i32(<4 x float>, i32, i32, i32, i32, <8 x i32>, i32, i32) #0 +declare void @llvm.amdgcn.image.store.2darraymsaa.v4f32.i32(<4 x float>, i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #0 + +declare void @llvm.amdgcn.image.store.mip.1d.v4f32.i32(<4 x float>, i32, i32, i32, <8 x i32>, i32, i32) #0 +declare void 
@llvm.amdgcn.image.store.mip.2d.v4f32.i32(<4 x float>, i32, i32, i32, i32, <8 x i32>, i32, i32) #0 +declare void @llvm.amdgcn.image.store.mip.3d.v4f32.i32(<4 x float>, i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #0 +declare void @llvm.amdgcn.image.store.mip.cube.v4f32.i32(<4 x float>, i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #0 +declare void @llvm.amdgcn.image.store.mip.1darray.v4f32.i32(<4 x float>, i32, i32, i32, i32, <8 x i32>, i32, i32) #0 +declare void @llvm.amdgcn.image.store.mip.2darray.v4f32.i32(<4 x float>, i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #0 + +declare <4 x float> @llvm.amdgcn.image.getresinfo.1d.v4f32.i32(i32, i32, <8 x i32>, i32, i32) #2 +declare <4 x float> @llvm.amdgcn.image.getresinfo.2d.v4f32.i32(i32, i32, <8 x i32>, i32, i32) #2 +declare <4 x float> @llvm.amdgcn.image.getresinfo.3d.v4f32.i32(i32, i32, <8 x i32>, i32, i32) #2 +declare <4 x float> @llvm.amdgcn.image.getresinfo.cube.v4f32.i32(i32, i32, <8 x i32>, i32, i32) #2 +declare <4 x float> @llvm.amdgcn.image.getresinfo.1darray.v4f32.i32(i32, i32, <8 x i32>, i32, i32) #2 +declare <4 x float> @llvm.amdgcn.image.getresinfo.2darray.v4f32.i32(i32, i32, <8 x i32>, i32, i32) #2 +declare <4 x float> @llvm.amdgcn.image.getresinfo.2dmsaa.v4f32.i32(i32, i32, <8 x i32>, i32, i32) #2 +declare <4 x float> @llvm.amdgcn.image.getresinfo.2darraymsaa.v4f32.i32(i32, i32, <8 x i32>, i32, i32) #2 + +declare float @llvm.amdgcn.image.load.1d.f32.i32(i32, i32, <8 x i32>, i32, i32) #1 +declare <2 x float> @llvm.amdgcn.image.load.1d.v2f32.i32(i32, i32, <8 x i32>, i32, i32) #1 +declare void @llvm.amdgcn.image.store.1d.f32.i32(float, i32, i32, <8 x i32>, i32, i32) #0 +declare void @llvm.amdgcn.image.store.1d.v2f32.i32(<2 x float>, i32, i32, <8 x i32>, i32, i32) #0 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readonly } +attributes #2 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.d16.dim.ll 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.d16.dim.ll new file mode 100644 index 0000000..eb633e7 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.d16.dim.ll @@ -0,0 +1,19 @@ +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=UNPACKED %s +; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX81 %s +; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX9 %s + +; GCN-LABEL: {{^}}image_gather4_b_2d_v4f16: +; UNPACKED: image_gather4_b v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x4 d16{{$}} +; PACKED: image_gather4_b v[0:1], v[0:3], s[0:7], s[8:11] dmask:0x4 d16{{$}} +define amdgpu_ps <2 x float> @image_gather4_b_2d_v4f16(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %t) { +main_body: + %tex = call <4 x half> @llvm.amdgcn.image.gather4.b.2d.v4f16.f32.f32(i32 4, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) + %r = bitcast <4 x half> %tex to <2 x float> + ret <2 x float> %r +} + +declare <4 x half> @llvm.amdgcn.image.gather4.b.2d.v4f16.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readonly } +attributes #2 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.dim.ll new file mode 100644 index 0000000..f7fe050 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.dim.ll @@ -0,0 +1,160 @@ +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s + +; GCN-LABEL: {{^}}gather4_2d: +; GCN: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1{{$}} +define amdgpu_ps <4 x float> @gather4_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg 
%samp, float %s, float %t) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32 1, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}gather4_cube: +; GCN: image_gather4 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 da{{$}} +define amdgpu_ps <4 x float> @gather4_cube(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %face) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.gather4.cube.v4f32.f32(i32 1, float %s, float %t, float %face, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}gather4_2darray: +; GCN: image_gather4 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 da{{$}} +define amdgpu_ps <4 x float> @gather4_2darray(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %slice) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.gather4.2darray.v4f32.f32(i32 1, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}gather4_c_2d: +; GCN: image_gather4_c v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1{{$}} +define amdgpu_ps <4 x float> @gather4_c_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %t) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.gather4.c.2d.v4f32.f32(i32 1, float %zcompare, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}gather4_cl_2d: +; GCN: image_gather4_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1{{$}} +define amdgpu_ps <4 x float> @gather4_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %clamp) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.gather4.cl.2d.v4f32.f32(i32 1, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}gather4_c_cl_2d: +; GCN: 
image_gather4_c_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1{{$}} +define amdgpu_ps <4 x float> @gather4_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %t, float %clamp) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.gather4.c.cl.2d.v4f32.f32(i32 1, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}gather4_b_2d: +; GCN: image_gather4_b v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1{{$}} +define amdgpu_ps <4 x float> @gather4_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %t) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.gather4.b.2d.v4f32.f32.f32(i32 1, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}gather4_c_b_2d: +; GCN: image_gather4_c_b v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1{{$}} +define amdgpu_ps <4 x float> @gather4_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, float %s, float %t) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.2d.v4f32.f32.f32(i32 1, float %bias, float %zcompare, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}gather4_b_cl_2d: +; GCN: image_gather4_b_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1{{$}} +define amdgpu_ps <4 x float> @gather4_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %t, float %clamp) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.gather4.b.cl.2d.v4f32.f32.f32(i32 1, float %bias, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}gather4_c_b_cl_2d: +; GCN: image_gather4_c_b_cl v[0:3], v[0:7], s[0:7], s[8:11] dmask:0x1{{$}} +define amdgpu_ps <4 x float> @gather4_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x 
i32> inreg %samp, float %bias, float %zcompare, float %s, float %t, float %clamp) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f32(i32 1, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}gather4_l_2d: +; GCN: image_gather4_l v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1{{$}} +define amdgpu_ps <4 x float> @gather4_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %lod) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.gather4.l.2d.v4f32.f32(i32 1, float %s, float %t, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}gather4_c_l_2d: +; GCN: image_gather4_c_l v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1{{$}} +define amdgpu_ps <4 x float> @gather4_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %t, float %lod) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.gather4.c.l.2d.v4f32.f32(i32 1, float %zcompare, float %s, float %t, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}gather4_lz_2d: +; GCN: image_gather4_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1{{$}} +define amdgpu_ps <4 x float> @gather4_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32 1, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}gather4_c_lz_2d: +; GCN: image_gather4_c_lz v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1{{$}} +define amdgpu_ps <4 x float> @gather4_c_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %t) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.gather4.c.lz.2d.v4f32.f32(i32 1, float %zcompare, float %s, float %t, <8 x i32> 
%rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}gather4_2d_dmask_2: +; GCN: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x2{{$}} +define amdgpu_ps <4 x float> @gather4_2d_dmask_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32 2, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}gather4_2d_dmask_4: +; GCN: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x4{{$}} +define amdgpu_ps <4 x float> @gather4_2d_dmask_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32 4, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}gather4_2d_dmask_8: +; GCN: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x8{{$}} +define amdgpu_ps <4 x float> @gather4_2d_dmask_8(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32 8, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +declare <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.gather4.cube.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.gather4.2darray.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 + +declare <4 x float> @llvm.amdgcn.image.gather4.c.2d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.gather4.cl.2d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> 
@llvm.amdgcn.image.gather4.c.cl.2d.v4f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 + +declare <4 x float> @llvm.amdgcn.image.gather4.b.2d.v4f32.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.gather4.c.b.2d.v4f32.f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.gather4.b.cl.2d.v4f32.f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f32(i32, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 + +declare <4 x float> @llvm.amdgcn.image.gather4.l.2d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.gather4.c.l.2d.v4f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 + +declare <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.gather4.c.lz.2d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readonly } +attributes #2 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll new file mode 100644 index 0000000..9619304 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll @@ -0,0 +1,39 @@ +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=UNPACKED %s +; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX81 %s +; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX9 %s + 
+; GCN-LABEL: {{^}}image_sample_2d_f16: +; GCN: image_sample v0, v[0:1], s[0:7], s[8:11] dmask:0x1 d16{{$}} +define amdgpu_ps half @image_sample_2d_f16(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) { +main_body: + %tex = call half @llvm.amdgcn.image.sample.2d.f16.f32(i32 1, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) + ret half %tex +} + +; GCN-LABEL: {{^}}image_sample_c_d_1d_v2f16: +; UNPACKED: image_sample_c_d v[0:1], v[0:3], s[0:7], s[8:11] dmask:0x3 d16{{$}} +; PACKED: image_sample_c_d v0, v[0:3], s[0:7], s[8:11] dmask:0x3 d16{{$}} +define amdgpu_ps float @image_sample_c_d_1d_v2f16(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dsdv, float %s) { +main_body: + %tex = call <2 x half> @llvm.amdgcn.image.sample.c.d.1d.v2f16.f32.f32(i32 3, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) + %r = bitcast <2 x half> %tex to float + ret float %r +} + +; GCN-LABEL: {{^}}image_sample_b_2d_v4f16: +; UNPACKED: image_sample_b v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf d16{{$}} +; PACKED: image_sample_b v[0:1], v[0:3], s[0:7], s[8:11] dmask:0xf d16{{$}} +define amdgpu_ps <2 x float> @image_sample_b_2d_v4f16(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %t) { +main_body: + %tex = call <4 x half> @llvm.amdgcn.image.sample.b.2d.v4f16.f32.f32(i32 15, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) + %r = bitcast <4 x half> %tex to <2 x float> + ret <2 x float> %r +} + +declare half @llvm.amdgcn.image.sample.2d.f16.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <2 x half> @llvm.amdgcn.image.sample.c.d.1d.v2f16.f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x half> @llvm.amdgcn.image.sample.b.2d.v4f16.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 + +attributes #0 = { 
nounwind } +attributes #1 = { nounwind readonly } +attributes #2 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll new file mode 100644 index 0000000..b85b343 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll @@ -0,0 +1,459 @@ +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s + +; GCN-LABEL: {{^}}sample_1d: +; GCN: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf{{$}} +define amdgpu_ps <4 x float> @sample_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_2d: +; GCN: image_sample v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf{{$}} +define amdgpu_ps <4 x float> @sample_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_3d: +; GCN: image_sample v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf{{$}} +define amdgpu_ps <4 x float> @sample_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %r) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f32(i32 15, float %s, float %t, float %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_cube: +; GCN: image_sample v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf da{{$}} +define amdgpu_ps <4 x float> @sample_cube(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %face) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.cube.v4f32.f32(i32 15, float %s, float %t, float %face, <8 x i32> %rsrc, <4 x i32> 
%samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_1darray: +; GCN: image_sample v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf da{{$}} +define amdgpu_ps <4 x float> @sample_1darray(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %slice) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.1darray.v4f32.f32(i32 15, float %s, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_2darray: +; GCN: image_sample v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf da{{$}} +define amdgpu_ps <4 x float> @sample_2darray(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %slice) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.2darray.v4f32.f32(i32 15, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_c_1d: +; GCN: image_sample_c v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf{{$}} +define amdgpu_ps <4 x float> @sample_c_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.1d.v4f32.f32(i32 15, float %zcompare, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_c_2d: +; GCN: image_sample_c v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf{{$}} +define amdgpu_ps <4 x float> @sample_c_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %t) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.2d.v4f32.f32(i32 15, float %zcompare, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_cl_1d: +; GCN: image_sample_cl v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf{{$}} +define amdgpu_ps <4 x float> @sample_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %clamp) { +main_body: + %v = call <4 x 
float> @llvm.amdgcn.image.sample.cl.1d.v4f32.f32(i32 15, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_cl_2d: +; GCN: image_sample_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf{{$}} +define amdgpu_ps <4 x float> @sample_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %clamp) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.cl.2d.v4f32.f32(i32 15, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_c_cl_1d: +; GCN: image_sample_c_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf{{$}} +define amdgpu_ps <4 x float> @sample_c_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %clamp) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.cl.1d.v4f32.f32(i32 15, float %zcompare, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_c_cl_2d: +; GCN: image_sample_c_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf{{$}} +define amdgpu_ps <4 x float> @sample_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %t, float %clamp) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.cl.2d.v4f32.f32(i32 15, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_b_1d: +; GCN: image_sample_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf{{$}} +define amdgpu_ps <4 x float> @sample_b_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.b.1d.v4f32.f32.f32(i32 15, float %bias, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_b_2d: +; GCN: image_sample_b v[0:3], v[0:3], 
s[0:7], s[8:11] dmask:0xf{{$}} +define amdgpu_ps <4 x float> @sample_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %t) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.b.2d.v4f32.f32.f32(i32 15, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_c_b_1d: +; GCN: image_sample_c_b v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf{{$}} +define amdgpu_ps <4 x float> @sample_c_b_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, float %s) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.b.1d.v4f32.f32.f32(i32 15, float %bias, float %zcompare, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_c_b_2d: +; GCN: image_sample_c_b v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf{{$}} +define amdgpu_ps <4 x float> @sample_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, float %s, float %t) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.b.2d.v4f32.f32.f32(i32 15, float %bias, float %zcompare, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_b_cl_1d: +; GCN: image_sample_b_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf{{$}} +define amdgpu_ps <4 x float> @sample_b_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %clamp) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.b.cl.1d.v4f32.f32.f32(i32 15, float %bias, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_b_cl_2d: +; GCN: image_sample_b_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf{{$}} +define amdgpu_ps <4 x float> @sample_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %t, float %clamp) { +main_body: + %v = call 
<4 x float> @llvm.amdgcn.image.sample.b.cl.2d.v4f32.f32.f32(i32 15, float %bias, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_c_b_cl_1d: +; GCN: image_sample_c_b_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf{{$}} +define amdgpu_ps <4 x float> @sample_c_b_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, float %s, float %clamp) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.1d.v4f32.f32.f32(i32 15, float %bias, float %zcompare, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_c_b_cl_2d: +; GCN: image_sample_c_b_cl v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf{{$}} +define amdgpu_ps <4 x float> @sample_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, float %s, float %t, float %clamp) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.2d.v4f32.f32.f32(i32 15, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_d_1d: +; GCN: image_sample_d v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf{{$}} +define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dsdv, float %s) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_d_2d: +; GCN: image_sample_d v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf{{$}} +define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f32.f32(i32 15, float %dsdh, 
float %dtdh, float %dsdv, float %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_c_d_1d: +; GCN: image_sample_c_d v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf{{$}} +define amdgpu_ps <4 x float> @sample_c_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dsdv, float %s) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_c_d_2d: +; GCN: image_sample_c_d v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf{{$}} +define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.2d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_d_cl_1d: +; GCN: image_sample_d_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf{{$}} +define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dsdv, float %s, float %clamp) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_d_cl_2d: +; GCN: image_sample_d_cl v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf{{$}} +define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %clamp) { +main_body: + %v = call <4 x float> 
@llvm.amdgcn.image.sample.d.cl.2d.v4f32.f32.f32(i32 15, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_c_d_cl_1d: +; GCN: image_sample_c_d_cl v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf{{$}} +define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_c_d_cl_2d: +; GCN: image_sample_c_d_cl v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf{{$}} +define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %clamp) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_cd_1d: +; GCN: image_sample_cd v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf{{$}} +define amdgpu_ps <4 x float> @sample_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dsdv, float %s) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_cd_2d: +; GCN: image_sample_cd v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf{{$}} +define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dtdh, float %dsdv, 
float %dtdv, float %s, float %t) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f32.f32(i32 15, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; sample.c.cd on a 1D image: four address operands (zcompare, dsdh, dsdv, s);
+; the expected address register range is v[0:3].
+; GCN-LABEL: {{^}}sample_c_cd_1d:
+; GCN: image_sample_c_cd v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf{{$}}
+define amdgpu_ps <4 x float> @sample_c_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dsdv, float %s) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; sample.c.cd on a 2D image: seven address operands (zcompare, four
+; derivatives, s, t); the expected address register range is v[0:7].
+; GCN-LABEL: {{^}}sample_c_cd_2d:
+; GCN: image_sample_c_cd v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf{{$}}
+define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; sample.cd.cl on a 1D image: four address operands (dsdh, dsdv, s, clamp);
+; the expected address register range is v[0:3].
+; GCN-LABEL: {{^}}sample_cd_cl_1d:
+; GCN: image_sample_cd_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf{{$}}
+define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dsdv, float %s, float %clamp) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; sample.cd.cl on a 2D image: seven address operands (four derivatives, s, t,
+; clamp); the expected address register range is v[0:7].
+; GCN-LABEL: {{^}}sample_cd_cl_2d:
+; GCN: image_sample_cd_cl v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf{{$}}
+define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %clamp) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f32.f32(i32 15, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; sample.c.cd.cl on a 1D image: five address operands (zcompare, dsdh, dsdv,
+; s, clamp); the expected address register range is v[0:7], i.e. wider than
+; the operand count (presumably rounded up to an available register-tuple
+; size -- confirm against the backend).
+; GCN-LABEL: {{^}}sample_c_cd_cl_1d:
+; GCN: image_sample_c_cd_cl v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf{{$}}
+define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; sample.c.cd.cl on a 2D image: eight address operands (zcompare, four
+; derivatives, s, t, clamp); the expected address register range is v[0:7].
+; GCN-LABEL: {{^}}sample_c_cd_cl_2d:
+; GCN: image_sample_c_cd_cl v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf{{$}}
+define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %clamp) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; sample.l on a 1D image: two address operands (s, lod); the expected address
+; register range is v[0:1].
+; GCN-LABEL: {{^}}sample_l_1d:
+; GCN: image_sample_l v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf{{$}}
+define amdgpu_ps <4 x float> @sample_l_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %lod) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.l.1d.v4f32.f32(i32 15, float %s, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; sample.l on a 2D image: three address operands (s, t, lod); the expected
+; address register range is v[0:3].
+; GCN-LABEL: {{^}}sample_l_2d:
+; GCN: image_sample_l v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf{{$}}
+define amdgpu_ps <4 x float> @sample_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %lod) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.l.2d.v4f32.f32(i32 15, float %s, float %t, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; sample.c.l on a 1D image: three address operands (zcompare, s, lod); the
+; expected address register range is v[0:3].
+; GCN-LABEL: {{^}}sample_c_l_1d:
+; GCN: image_sample_c_l v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf{{$}}
+define amdgpu_ps <4 x float> @sample_c_l_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %lod) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.c.l.1d.v4f32.f32(i32 15, float %zcompare, float %s, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; sample.c.l on a 2D image: four address operands (zcompare, s, t, lod); the
+; expected address register range is v[0:3].
+; GCN-LABEL: {{^}}sample_c_l_2d:
+; GCN: image_sample_c_l v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf{{$}}
+define amdgpu_ps <4 x float> @sample_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %t, float %lod) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.c.l.2d.v4f32.f32(i32 15, float %zcompare, float %s, float %t, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; sample.lz on a 1D image: a single address operand (s); the expected address
+; register is v0.
+; GCN-LABEL: {{^}}sample_lz_1d:
+; GCN: image_sample_lz v[0:3], v0, s[0:7], s[8:11] dmask:0xf{{$}}
+define amdgpu_ps <4 x float> @sample_lz_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.lz.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; sample.lz on a 2D image: two address operands (s, t); the expected address
+; register range is v[0:1].
+; GCN-LABEL: {{^}}sample_lz_2d:
+; GCN: image_sample_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf{{$}}
+define amdgpu_ps <4 x float> @sample_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32(i32 15, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; sample.c.lz on a 1D image: two address operands (zcompare, s); the expected
+; address register range is v[0:1].
+; GCN-LABEL: {{^}}sample_c_lz_1d:
+; GCN: image_sample_c_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf{{$}}
+define amdgpu_ps <4 x float> @sample_c_lz_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.c.lz.1d.v4f32.f32(i32 15, float %zcompare, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; sample.c.lz on a 2D image: three address operands (zcompare, s, t); the
+; expected address register range is v[0:3].
+; GCN-LABEL: {{^}}sample_c_lz_2d:
+; GCN: image_sample_c_lz v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf{{$}}
+define amdgpu_ps <4 x float> @sample_c_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %t) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.c.lz.2d.v4f32.f32(i32 15, float %zcompare, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; sample.c.d.o on a 2D array image with a scalar float result: dmask is 4, so
+; a single destination register (v0) is expected. Nine address operands
+; (offset, zcompare, four derivatives, s, t, slice) are expected in v[0:15],
+; and the instruction is expected to carry the da modifier.
+; GCN-LABEL: {{^}}sample_c_d_o_2darray_V1:
+; GCN: image_sample_c_d_o v0, v[0:15], s[0:7], s[8:11] dmask:0x4 da{{$}}
+define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice) {
+main_body:
+  %v = call float @llvm.amdgcn.image.sample.c.d.o.2darray.f32.f32.f32(i32 4, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret float %v
+}
+
+; Same operation with a <2 x float> result: dmask is 6 and the expected
+; destination register range is v[0:1].
+; GCN-LABEL: {{^}}sample_c_d_o_2darray_V2:
+; GCN: image_sample_c_d_o v[0:1], v[0:15], s[0:7], s[8:11] dmask:0x6 da{{$}}
+define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice) {
+main_body:
+  %v = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f32.f32(i32 6, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <2 x float> %v
+}
+
+; Passing i1 1 as the operand that follows the sampler descriptor is expected
+; to add the unorm modifier to the instruction.
+; GCN-LABEL: {{^}}sample_1d_unorm:
+; GCN: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf unorm{{$}}
+define amdgpu_ps <4 x float> @sample_1d_unorm(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 1, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; Final i32 operand (presumably the cachectrl bit field named in the commit
+; summary) set to 1: the glc modifier is expected.
+; GCN-LABEL: {{^}}sample_1d_glc:
+; GCN: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf glc{{$}}
+define amdgpu_ps <4 x float> @sample_1d_glc(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 1)
+  ret <4 x float> %v
+}
+
+; Final i32 operand set to 2: the slc modifier is expected.
+; GCN-LABEL: {{^}}sample_1d_slc:
+; GCN: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf slc{{$}}
+define amdgpu_ps <4 x float> @sample_1d_slc(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 2)
+  ret <4 x float> %v
+}
+
+; Final i32 operand set to 3: both the glc and slc modifiers are expected.
+; GCN-LABEL: {{^}}sample_1d_glc_slc:
+; GCN: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf glc slc{{$}}
+define amdgpu_ps <4 x float> @sample_1d_glc_slc(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 3)
+  ret <4 x float> %v
+}
+
+; Intrinsic declarations. Every declaration below carries attribute group #1
+; (nounwind readonly). Each signature is: dmask (i32), the address operands,
+; the <8 x i32> resource, the <4 x i32> sampler, an i1 and two trailing i32s.
+declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.cube.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.1darray.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.2darray.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+declare <4 x float> @llvm.amdgcn.image.sample.c.1d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.2d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.cl.1d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.cl.2d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.cl.1d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.cl.2d.v4f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+declare <4 x float> @llvm.amdgcn.image.sample.b.1d.v4f32.f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.b.2d.v4f32.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.b.1d.v4f32.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.b.2d.v4f32.f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.b.cl.1d.v4f32.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.b.cl.2d.v4f32.f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.b.cl.1d.v4f32.f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.b.cl.2d.v4f32.f32.f32(i32, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+declare <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f32.f32(i32, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.d.2d.v4f32.f32.f32(i32, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f32.f32(i32, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f32.f32(i32, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f32.f32(i32, float, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+declare <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f32.f32(i32, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f32.f32(i32, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f32.f32(i32, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f32.f32(i32, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f32.f32(i32, float, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+declare <4 x float> @llvm.amdgcn.image.sample.l.1d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.l.2d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.l.1d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.l.2d.v4f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+declare <4 x float> @llvm.amdgcn.image.sample.lz.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.lz.1d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.lz.2d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+declare float @llvm.amdgcn.image.sample.c.d.o.2darray.f32.f32.f32(i32, i32, float, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f32.f32(i32, i32, float, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readonly }
+attributes #2 = { nounwind readnone }
--
2.7.4