From 00e063ab92345a00fe89b27c857ceaa281077166 Mon Sep 17 00:00:00 2001
From: Ryan Taylor
Date: Tue, 19 Mar 2019 16:07:00 +0000
Subject: [PATCH] [AMDGPU] Add buffer/load 8/16 bit overloaded intrinsics

Summary:
Add overloaded 8-bit and 16-bit buffer store/load intrinsics for buffer,
raw_buffer and struct_buffer.
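For example (illustrative only; the byte/short form is selected by the
overloaded i8/i16 data type, exactly as in the new tests below):

  %b = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
  call void @llvm.amdgcn.raw.buffer.store.i8(i8 %b, <4 x i32> %rsrc, i32 0, i32 0, i32 0)

These select to buffer_load_ubyte/buffer_store_byte; a sign extension of
the loaded value is folded into buffer_load_sbyte by the new
SIGN_EXTEND_INREG combine.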
Change-Id: I166a29f071b2ff4e4683fb0392564b1f223ac61d

Subscribers: arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D59265

llvm-svn: 356465
---
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td           |   4 +-
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp      |  22 +++
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h        |   6 +
 llvm/lib/Target/AMDGPU/BUFInstructions.td          |   6 +
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp          | 113 ++++++++++++-
 llvm/lib/Target/AMDGPU/SIISelLowering.h            |  10 ++
 llvm/lib/Target/AMDGPU/SIInstrInfo.td              |  14 ++
 .../test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll | 185 +++++++++++++++++++++
 .../CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll     |  26 +++
 .../CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll  |  58 +++++++
 .../CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll |  28 ++++
 .../AMDGPU/llvm.amdgcn.struct.buffer.load.ll       |  58 +++++++
 .../AMDGPU/llvm.amdgcn.struct.buffer.store.ll      |  28 ++++
 13 files changed, 555 insertions(+), 3 deletions(-)

diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 46e025d..6fc1727 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -840,7 +840,7 @@ let TargetPrefix = "amdgcn" in {
 defset list<Intrinsic> AMDGPUBufferIntrinsics = {
 
 class AMDGPUBufferLoad : Intrinsic <
-  [llvm_anyfloat_ty],
+  [llvm_any_ty],
   [llvm_v4i32_ty,     // rsrc(SGPR)
    llvm_i32_ty,       // vindex(VGPR)
    llvm_i32_ty,       // offset(SGPR/VGPR/imm)
@@ -861,7 +861,7 @@ def int_amdgcn_s_buffer_load : Intrinsic <
 
 class AMDGPUBufferStore : Intrinsic <
   [],
-  [llvm_anyfloat_ty,  // vdata(VGPR) -- can currently only select f32, v2f32, v4f32
+  [llvm_any_ty,       // vdata(VGPR)
    llvm_v4i32_ty,     // rsrc(SGPR)
    llvm_i32_ty,       // vindex(VGPR)
    llvm_i32_ty,       // offset(SGPR/VGPR/imm)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 5f35030..9fcc335 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4208,10 +4208,16 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(ATOMIC_LOAD_FMIN)
   NODE_NAME_CASE(ATOMIC_LOAD_FMAX)
   NODE_NAME_CASE(BUFFER_LOAD)
+  NODE_NAME_CASE(BUFFER_LOAD_UBYTE)
+  NODE_NAME_CASE(BUFFER_LOAD_USHORT)
+  NODE_NAME_CASE(BUFFER_LOAD_BYTE)
+  NODE_NAME_CASE(BUFFER_LOAD_SHORT)
   NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
   NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16)
   NODE_NAME_CASE(SBUFFER_LOAD)
   NODE_NAME_CASE(BUFFER_STORE)
+  NODE_NAME_CASE(BUFFER_STORE_BYTE)
+  NODE_NAME_CASE(BUFFER_STORE_SHORT)
   NODE_NAME_CASE(BUFFER_STORE_FORMAT)
   NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16)
   NODE_NAME_CASE(BUFFER_ATOMIC_SWAP)
@@ -4376,6 +4382,14 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
     }
     break;
   }
+  case AMDGPUISD::BUFFER_LOAD_UBYTE:  {
+    Known.Zero.setHighBits(24);
+    break;
+  }
+  case AMDGPUISD::BUFFER_LOAD_USHORT: {
+    Known.Zero.setHighBits(16);
+    break;
+  }
   case ISD::INTRINSIC_WO_CHAIN: {
     unsigned IID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
     switch (IID) {
@@ -4421,6 +4435,14 @@ unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
   case AMDGPUISD::CARRY:
   case AMDGPUISD::BORROW:
     return 31;
+  case AMDGPUISD::BUFFER_LOAD_BYTE:
+    return 25;
+  case AMDGPUISD::BUFFER_LOAD_SHORT:
+    return 17;
+  case AMDGPUISD::BUFFER_LOAD_UBYTE:
+    return 24;
+  case AMDGPUISD::BUFFER_LOAD_USHORT:
+    return 16;
   case AMDGPUISD::FP_TO_FP16:
   case AMDGPUISD::FP16_ZEXT:
     return 16;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 97a6146..fadbcc6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -491,10 +491,16 @@ enum NodeType : unsigned {
   ATOMIC_LOAD_FMIN,
   ATOMIC_LOAD_FMAX,
   BUFFER_LOAD,
+  BUFFER_LOAD_UBYTE,
+  BUFFER_LOAD_USHORT,
+  BUFFER_LOAD_BYTE,
+  BUFFER_LOAD_SHORT,
   BUFFER_LOAD_FORMAT,
   BUFFER_LOAD_FORMAT_D16,
   SBUFFER_LOAD,
   BUFFER_STORE,
+  BUFFER_STORE_BYTE,
+  BUFFER_STORE_SHORT,
   BUFFER_STORE_FORMAT,
   BUFFER_STORE_FORMAT_D16,
   BUFFER_ATOMIC_SWAP,
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index 902cc3e..408c15a 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -1132,6 +1132,10 @@ defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, v4f32, "BUFFER_LOAD_FORMAT_XYZW">;
 defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, f32, "BUFFER_LOAD_DWORD">;
 defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2f32, "BUFFER_LOAD_DWORDX2">;
 defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4f32, "BUFFER_LOAD_DWORDX4">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_ubyte, i32, "BUFFER_LOAD_UBYTE">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_ushort, i32, "BUFFER_LOAD_USHORT">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_byte, i32, "BUFFER_LOAD_SBYTE">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_short, i32, "BUFFER_LOAD_SSHORT">;
 
 multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
                                    string opcode> {
@@ -1196,6 +1200,8 @@ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v4f32, "BUFFER_STORE_FORMAT_XYZW">;
 defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, f32, "BUFFER_STORE_DWORD">;
 defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2f32, "BUFFER_STORE_DWORDX2">;
 defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4f32, "BUFFER_STORE_DWORDX4">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_byte, i32, "BUFFER_STORE_BYTE">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_short, i32, "BUFFER_STORE_SHORT">;
 
 //===----------------------------------------------------------------------===//
 // buffer_atomic patterns
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 0aca05b..326577b 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -216,11 +216,15 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom);
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v8f16, Custom);
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
+  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i16, Custom);
+  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
 
   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
   setOperationAction(ISD::INTRINSIC_VOID, MVT::v2i16, Custom);
   setOperationAction(ISD::INTRINSIC_VOID, MVT::v2f16, Custom);
   setOperationAction(ISD::INTRINSIC_VOID, MVT::v4f16, Custom);
+  setOperationAction(ISD::INTRINSIC_VOID, MVT::i16, Custom);
+  setOperationAction(ISD::INTRINSIC_VOID, MVT::i8, Custom);
 
   setOperationAction(ISD::BRCOND, MVT::Other, Custom);
   setOperationAction(ISD::BR_CC, MVT::i1, Expand);
@@ -677,6 +681,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   setTargetDAGCombine(ISD::FCANONICALIZE);
   setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
   setTargetDAGCombine(ISD::ZERO_EXTEND);
+  setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
   setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
@@ -5581,6 +5586,12 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
     if (LoadVT.getScalarType() == MVT::f16)
       return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
                                  M, DAG, Ops);
+
+    // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
+    if (LoadVT.getScalarType() == MVT::i8 ||
+        LoadVT.getScalarType() == MVT::i16)
+      return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M);
+
     return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
                                    M->getMemOperand());
   }
@@ -5609,6 +5620,12 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
     if (LoadVT.getScalarType() == MVT::f16)
       return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
                                  M, DAG, Ops);
+
+    // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
+    if (LoadVT.getScalarType() == MVT::i8 ||
+        LoadVT.getScalarType() == MVT::i16)
+      return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M);
+
     return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
                                    M->getMemOperand());
   }
@@ -5637,6 +5654,12 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
     if (LoadVT.getScalarType() == MVT::f16)
       return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
                                  M, DAG, Ops);
+
+    // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
+    if (LoadVT.getScalarType() == MVT::i8 ||
+        LoadVT.getScalarType() == MVT::i16)
+      return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M);
+
     return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
                                    M->getMemOperand());
   }
@@ -6207,6 +6230,12 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
                    AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
     Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
     MemSDNode *M = cast<MemSDNode>(Op);
+
+    // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
+    EVT VDataType = VData.getValueType().getScalarType();
+    if (VDataType == MVT::i8 || VDataType == MVT::i16)
+      return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
+
     return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
                                    M->getMemoryVT(), M->getMemOperand());
   }
@@ -6233,6 +6262,12 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
                    AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
     Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
     MemSDNode *M = cast<MemSDNode>(Op);
+
+    // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
+    EVT VDataType = VData.getValueType().getScalarType();
+    if (VDataType == MVT::i8 || VDataType == MVT::i16)
+      return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
+
     return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
                                    M->getMemoryVT(), M->getMemOperand());
   }
@@ -6259,6 +6294,12 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
                    AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
     Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
     MemSDNode *M = cast<MemSDNode>(Op);
+
+    // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
+    EVT VDataType = VData.getValueType().getScalarType();
+    if (VDataType == MVT::i8 || VDataType == MVT::i16)
+      return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
+
     return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
                                    M->getMemoryVT(), M->getMemOperand());
   }
@@ -6361,6 +6402,38 @@ void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
   Offsets[2] = DAG.getConstant(0, DL, MVT::i32);
 }
 
+// Handle 8 bit and 16 bit buffer loads
+SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
+                                                     EVT LoadVT, SDLoc DL,
+                                                     ArrayRef<SDValue> Ops,
+                                                     MemSDNode *M) const {
+  EVT IntVT = LoadVT.changeTypeToInteger();
+  unsigned Opc = (LoadVT.getScalarType() == MVT::i8) ?
+                 AMDGPUISD::BUFFER_LOAD_UBYTE : AMDGPUISD::BUFFER_LOAD_USHORT;
+
+  SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
+  SDValue BufferLoad = DAG.getMemIntrinsicNode(Opc, DL, ResList,
+                                               Ops, IntVT,
+                                               M->getMemOperand());
+  SDValue BufferLoadTrunc = DAG.getNode(ISD::TRUNCATE, DL,
+                                        LoadVT.getScalarType(), BufferLoad);
+  return DAG.getMergeValues({BufferLoadTrunc, BufferLoad.getValue(1)}, DL);
+}
+
+// Handle 8 bit and 16 bit buffer stores
+SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
+                                                      EVT VDataType, SDLoc DL,
+                                                      SDValue Ops[],
+                                                      MemSDNode *M) const {
+  SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
+  Ops[1] = BufferStoreExt;
+  unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE :
+                 AMDGPUISD::BUFFER_STORE_SHORT;
+  ArrayRef<SDValue> OpsRef = makeArrayRef(&Ops[0], 9);
+  return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType,
+                                 M->getMemOperand());
+}
+
 static SDValue getLoadExtOrTrunc(SelectionDAG &DAG,
                                  ISD::LoadExtType ExtType, SDValue Op,
                                  const SDLoc &SL, EVT VT) {
@@ -7692,6 +7765,43 @@ SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
   return SDValue();
 }
 
+SDValue SITargetLowering::performSignExtendInRegCombine(SDNode *N,
+                                                        DAGCombinerInfo &DCI)
+                                                        const {
+  SDValue Src = N->getOperand(0);
+  auto *VTSign = cast<VTSDNode>(N->getOperand(1));
+
+  if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
+      VTSign->getVT() == MVT::i8) ||
+      (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
+      VTSign->getVT() == MVT::i16)) &&
+      Src.hasOneUse()) {
+    auto *M = cast<MemSDNode>(Src);
+    SDValue Ops[] = {
+      Src.getOperand(0), // Chain
+      Src.getOperand(1), // rsrc
+      Src.getOperand(2), // vindex
+      Src.getOperand(3), // voffset
+      Src.getOperand(4), // soffset
+      Src.getOperand(5), // offset
+      Src.getOperand(6), // cachepolicy
+      Src.getOperand(7)  // idxen
+    };
+    // replace with BUFFER_LOAD_BYTE/SHORT
+    SDVTList ResList = DCI.DAG.getVTList(MVT::i32,
+                                         Src.getOperand(0).getValueType());
+    unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE) ?
+                   AMDGPUISD::BUFFER_LOAD_BYTE : AMDGPUISD::BUFFER_LOAD_SHORT;
+    SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(Opc, SDLoc(N),
+                                                            ResList,
+                                                            Ops, M->getMemoryVT(),
+                                                            M->getMemOperand());
+    return DCI.DAG.getMergeValues({BufferLoadSignExt,
+                                  BufferLoadSignExt.getValue(1)}, SDLoc(N));
+  }
+  return SDValue();
+}
+
 SDValue SITargetLowering::performClassCombine(SDNode *N,
                                               DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -8940,7 +9050,6 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
   if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
     return SDValue();
-
   switch (N->getOpcode()) {
   default:
     return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
@@ -9007,6 +9116,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
     return performXorCombine(N, DCI);
   case ISD::ZERO_EXTEND:
     return performZeroExtendCombine(N, DCI);
+  case ISD::SIGN_EXTEND_INREG:
+    return performSignExtendInRegCombine(N, DCI);
   case AMDGPUISD::FP_CLASS:
     return performClassCombine(N, DCI);
   case ISD::FCANONICALIZE:
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index ee4c4bf..0168486 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -140,6 +140,7 @@ private:
   SDValue performOrCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performXorCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performZeroExtendCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue performSignExtendInRegCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performClassCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue getCanonicalConstantFP(SelectionDAG &DAG, const SDLoc &SL, EVT VT,
                                  const APFloat &C) const;
@@ -192,6 +193,15 @@ private:
   void setBufferOffsets(SDValue CombinedOffset, SelectionDAG &DAG,
                         SDValue *Offsets, unsigned Align = 4) const;
 
+  // Handle 8 bit and 16 bit buffer loads
+  SDValue handleByteShortBufferLoads(SelectionDAG &DAG, EVT LoadVT, SDLoc DL,
+                                     ArrayRef<SDValue> Ops, MemSDNode *M) const;
+
+  // Handle 8 bit and 16 bit buffer stores
+  SDValue handleByteShortBufferStores(SelectionDAG &DAG, EVT VDataType,
+                                      SDLoc DL, SDValue Ops[],
+                                      MemSDNode *M) const;
+
 public:
   SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI);
 
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 336404f..e10c45c 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -127,6 +127,14 @@ def SDTBufferLoad : SDTypeProfile<1, 7,
 
 def SIbuffer_load : SDNode <"AMDGPUISD::BUFFER_LOAD", SDTBufferLoad,
                             [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
+def SIbuffer_load_ubyte : SDNode <"AMDGPUISD::BUFFER_LOAD_UBYTE", SDTBufferLoad,
+                            [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
+def SIbuffer_load_ushort : SDNode <"AMDGPUISD::BUFFER_LOAD_USHORT", SDTBufferLoad,
+                            [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
+def SIbuffer_load_byte : SDNode <"AMDGPUISD::BUFFER_LOAD_BYTE", SDTBufferLoad,
+                            [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
+def SIbuffer_load_short: SDNode <"AMDGPUISD::BUFFER_LOAD_SHORT", SDTBufferLoad,
+                            [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
 def SIbuffer_load_format : SDNode <"AMDGPUISD::BUFFER_LOAD_FORMAT", SDTBufferLoad,
                             [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
 def SIbuffer_load_format_d16 : SDNode <"AMDGPUISD::BUFFER_LOAD_FORMAT_D16",
@@ -145,6 +153,12 @@ def SDTBufferStore : SDTypeProfile<0, 8,
 
 def SIbuffer_store : SDNode <"AMDGPUISD::BUFFER_STORE", SDTBufferStore,
                              [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>;
+def SIbuffer_store_byte: SDNode <"AMDGPUISD::BUFFER_STORE_BYTE",
+                             SDTBufferStore,
+                             [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>;
+def SIbuffer_store_short : SDNode <"AMDGPUISD::BUFFER_STORE_SHORT",
+                             SDTBufferStore,
+                             [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>;
 def SIbuffer_store_format : SDNode <"AMDGPUISD::BUFFER_STORE_FORMAT",
                              SDTBufferStore,
                              [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll
index bcde25a..5ccc708 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll
@@ -257,9 +257,194 @@ main_body:
   ret void
 }
 
+;CHECK-LABEL: {{^}}buffer_load_ubyte:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_ubyte v{{[0-9]}}, off, s[0:3], 0 offset:8
+;CHECK-NEXT: s_waitcnt vmcnt(0)
+;CHECK-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
+;CHECK-NEXT: ; return to shader part epilog
+define amdgpu_ps float @buffer_load_ubyte(<4 x i32> inreg %rsrc) {
+main_body:
+  %tmp = call i8 @llvm.amdgcn.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0)
+  %val = uitofp i8 %tmp to float
+  ret float %val
+}
+
+;CHECK-LABEL: {{^}}buffer_load_ushort:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_ushort v{{[0-9]}}, off, s[0:3], 0 offset:16
+;CHECK-NEXT: s_waitcnt vmcnt(0)
+;CHECK-NEXT: v_cvt_f32_u32_e32 v0, v0
+;CHECK-NEXT: ; return to shader part epilog
+define amdgpu_ps float @buffer_load_ushort(<4 x i32> inreg %rsrc) {
+main_body:
+  %tmp = call i16 @llvm.amdgcn.buffer.load.i16(<4 x i32> %rsrc, i32 0, i32 16, i1 0, i1 0)
+  %tmp2 = zext i16 %tmp to i32
+  %val = uitofp i32 %tmp2 to float
+  ret float %val
+}
+
+;CHECK-LABEL: {{^}}buffer_load_sbyte:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_sbyte v{{[0-9]}}, off, s[0:3], 0 offset:8
+;CHECK-NEXT: s_waitcnt vmcnt(0)
+;CHECK-NEXT: v_cvt_f32_i32_e32 v0, v0
+;CHECK-NEXT: ; return to shader part epilog
+define amdgpu_ps float @buffer_load_sbyte(<4 x i32> inreg %rsrc) {
+main_body:
+  %tmp = call i8 @llvm.amdgcn.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0)
+  %tmp2 = sext i8 %tmp to i32
+  %val = sitofp i32 %tmp2 to float
+  ret float %val
+}
+
+;CHECK-LABEL: {{^}}buffer_load_sshort:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_sshort v{{[0-9]}}, off, s[0:3], 0 offset:16
+;CHECK-NEXT: s_waitcnt vmcnt(0)
+;CHECK-NEXT: v_cvt_f32_i32_e32 v0, v0
+;CHECK-NEXT: ; return to shader part epilog
+define amdgpu_ps float @buffer_load_sshort(<4 x i32> inreg %rsrc) {
+main_body:
+  %tmp = call i16 @llvm.amdgcn.buffer.load.i16(<4 x i32> %rsrc, i32 0, i32 16, i1 0, i1 0)
+  %tmp2 = sext i16 %tmp to i32
+  %val = sitofp i32 %tmp2 to float
+  ret float %val
+}
+
+;CHECK-LABEL: {{^}}buffer_load_ubyte_bitcast:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_ubyte v{{[0-9]}}, off, s[0:3], 0 offset:8
+;CHECK-NEXT: s_waitcnt vmcnt(0)
+;CHECK-NEXT: ; return to shader part epilog
+define amdgpu_ps float @buffer_load_ubyte_bitcast(<4 x i32> inreg %rsrc) {
+main_body:
+  %tmp = call i8 @llvm.amdgcn.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0)
+  %tmp2 = zext i8 %tmp to i32
+  %val = bitcast i32 %tmp2 to float
+  ret float %val
+}
+
+;CHECK-LABEL: {{^}}buffer_load_ushort_bitcast:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_ushort v{{[0-9]}}, off, s[0:3], 0 offset:8
+;CHECK-NEXT: s_waitcnt vmcnt(0)
+;CHECK-NEXT: ; return to shader part epilog
+define amdgpu_ps float @buffer_load_ushort_bitcast(<4 x i32> inreg %rsrc) {
+main_body:
+  %tmp = call i16 @llvm.amdgcn.buffer.load.i16(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0)
+  %tmp2 = zext i16 %tmp to i32
+  %val = bitcast i32 %tmp2 to float
+  ret float %val
+}
+
+;CHECK-LABEL: {{^}}buffer_load_sbyte_bitcast:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_sbyte v{{[0-9]}}, off, s[0:3], 0 offset:8
+;CHECK-NEXT: s_waitcnt vmcnt(0)
+;CHECK-NEXT: ; return to shader part epilog
+define amdgpu_ps float @buffer_load_sbyte_bitcast(<4 x i32> inreg %rsrc) {
+main_body:
+  %tmp = call i8 @llvm.amdgcn.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0)
+  %tmp2 = sext i8 %tmp to i32
+  %val = bitcast i32 %tmp2 to float
+  ret float %val
+}
+
+;CHECK-LABEL: {{^}}buffer_load_sshort_bitcast:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_sshort v{{[0-9]}}, off, s[0:3], 0 offset:8
+;CHECK-NEXT: s_waitcnt vmcnt(0)
+;CHECK-NEXT: ; return to shader part epilog
+define amdgpu_ps float @buffer_load_sshort_bitcast(<4 x i32> inreg %rsrc) {
+main_body:
+  %tmp = call i16 @llvm.amdgcn.buffer.load.i16(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0)
+  %tmp2 = sext i16 %tmp to i32
+  %val = bitcast i32 %tmp2 to float
+  ret float %val
+}
+
+;CHECK-LABEL: {{^}}buffer_load_ubyte_mul_bitcast:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_ubyte v{{[0-9]}}, off, s[0:3], 0 offset:8
+;CHECK-NEXT: s_waitcnt vmcnt(0)
+;CHECK-NEXT: v_mul_u32_u24_e32 v{{[0-9]}}, 0xff, v{{[0-9]}}
+;CHECK-NEXT: ; return to shader part epilog
+define amdgpu_ps float @buffer_load_ubyte_mul_bitcast(<4 x i32> inreg %rsrc) {
+main_body:
+  %tmp = call i8 @llvm.amdgcn.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0)
+  %tmp2 = zext i8 %tmp to i32
+  %tmp3 = mul i32 %tmp2, 255
+  %val = bitcast i32 %tmp3 to float
+  ret float %val
+}
+
+;CHECK-LABEL: {{^}}buffer_load_ushort_mul_bitcast:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_ushort v{{[0-9]}}, off, s[0:3], 0 offset:8
+;CHECK-NEXT: s_waitcnt vmcnt(0)
+;CHECK-NEXT: v_mul_u32_u24_e32 v{{[0-9]}}, 0xff, v{{[0-9]}}
+;CHECK-NEXT: ; return to shader part epilog
+define amdgpu_ps float @buffer_load_ushort_mul_bitcast(<4 x i32> inreg %rsrc) {
+main_body:
+  %tmp = call i16 @llvm.amdgcn.buffer.load.i16(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0)
+  %tmp2 = zext i16 %tmp to i32
+  %tmp3 = mul i32 %tmp2, 255
+  %val = bitcast i32 %tmp3 to float
+  ret float %val
+}
+
+;CHECK-LABEL: {{^}}buffer_load_sbyte_mul_bitcast:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_sbyte v{{[0-9]}}, off, s[0:3], 0 offset:8
+;CHECK-NEXT: s_waitcnt vmcnt(0)
+;CHECK-NEXT: v_mul_i32_i24_e32 v{{[0-9]}}, 0xff, v{{[0-9]}}
+;CHECK-NEXT: ; return to shader part epilog
+define amdgpu_ps float @buffer_load_sbyte_mul_bitcast(<4 x i32> inreg %rsrc) {
+main_body:
+  %tmp = call i8 @llvm.amdgcn.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0)
+  %tmp2 = sext i8 %tmp to i32
+  %tmp3 = mul i32 %tmp2, 255
+  %val = bitcast i32 %tmp3 to float
+  ret float %val
+}
+
+;CHECK-LABEL: {{^}}buffer_load_sshort_mul_bitcast:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_sshort v{{[0-9]}}, off, s[0:3], 0 offset:8
+;CHECK-NEXT: s_waitcnt vmcnt(0)
+;CHECK-NEXT: v_mul_i32_i24_e32 v{{[0-9]}}, 0xff, v{{[0-9]}}
+;CHECK-NEXT: ; return to shader part epilog
+define amdgpu_ps float @buffer_load_sshort_mul_bitcast(<4 x i32> inreg %rsrc) {
+main_body:
+  %tmp = call i16 @llvm.amdgcn.buffer.load.i16(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0)
+  %tmp2 = sext i16 %tmp to i32
+  %tmp3 = mul i32 %tmp2, 255
+  %val = bitcast i32 %tmp3 to float
+  ret float %val
+}
+
+;CHECK-LABEL: {{^}}buffer_load_sbyte_type_check:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_ubyte v{{[0-9]}}, off, s[0:3], 0 offset:8
+;CHECK-NEXT: s_waitcnt vmcnt(0)
+;CHECK-NEXT: v_bfe_i32 v{{[0-9]}}, v{{[0-9]}}, 0, 5
+;CHECK-NEXT: ; return to shader part epilog
+define amdgpu_ps float @buffer_load_sbyte_type_check(<4 x i32> inreg %rsrc) {
+main_body:
+  %tmp = call i8 @llvm.amdgcn.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0)
+  %tmp2 = zext i8 %tmp to i32
+  %tmp3 = shl i32 %tmp2, 27
+  %tmp4 = ashr i32 %tmp3, 27
+  %val = bitcast i32 %tmp4 to float
+  ret float %val
+}
+
 declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #0
 declare <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32>, i32, i32, i1, i1) #0
 declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #0
+declare i8 @llvm.amdgcn.buffer.load.i8(<4 x i32>, i32, i32, i1, i1) #0
+declare i16 @llvm.amdgcn.buffer.load.i16(<4 x i32>, i32, i32, i1, i1) #0
 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
 
 attributes #0 = { nounwind readonly }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll
index 7e2996e..5210414 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll
@@ -233,9 +233,35 @@ define amdgpu_ps void @buffer_store_x3_offset_merged3(<4 x i32> inreg %rsrc, <2
   ret void
 }
 
+;CHECK-LABEL: {{^}}buffer_store_byte:
+;CHECK-NOT: s_waitcnt
+;CHECK-NEXT: %bb.
+;CHECK: buffer_store_byte v{{[0-9]}}, off, s[0:3], 0 offset:8
+define amdgpu_ps void @buffer_store_byte(<4 x i32> inreg %rsrc, float %v1) {
+main_body:
+  %v2 = fptoui float %v1 to i32
+  %v3 = trunc i32 %v2 to i8
+  call void @llvm.amdgcn.buffer.store.i8(i8 %v3, <4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_store_short:
+;CHECK-NOT: s_waitcnt
+;CHECK-NEXT: %bb.
+;CHECK: buffer_store_short v{{[0-9]}}, off, s[0:3], 0 offset:16
+define amdgpu_ps void @buffer_store_short(<4 x i32> inreg %rsrc, float %v1) {
+main_body:
+  %v2 = fptoui float %v1 to i32
+  %v3 = trunc i32 %v2 to i16
+  call void @llvm.amdgcn.buffer.store.i16(i16 %v3, <4 x i32> %rsrc, i32 0, i32 16, i1 0, i1 0)
+  ret void
+}
+
 declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #0
 declare void @llvm.amdgcn.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i1, i1) #0
 declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #0
+declare void @llvm.amdgcn.buffer.store.i8(i8, <4 x i32>, i32, i32, i1, i1) #0
+declare void @llvm.amdgcn.buffer.store.i16(i16, <4 x i32>, i32, i32, i1, i1) #0
 declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #1
 
 attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll
index 33b2967..cd64420 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll
@@ -263,6 +263,62 @@ main_body:
   ret {<4 x float>, <2 x float>, float} %r2
 }
 
+;CHECK-LABEL: {{^}}raw_buffer_load_ubyte:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_ubyte v{{[0-9]}}, off, s[0:3], 0
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
+;CHECK-NEXT: ; return to shader part epilog
+define amdgpu_ps float @raw_buffer_load_ubyte(<4 x i32> inreg %rsrc) {
+main_body:
+  %tmp = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
+  %tmp2 = zext i8 %tmp to i32
+  %val = uitofp i32 %tmp2 to float
+  ret float %val
+}
+
+;CHECK-LABEL: {{^}}raw_buffer_load_ushort:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_ushort v{{[0-9]}}, off, s[0:3], 0
+;CHECK-NEXT: s_waitcnt vmcnt(0)
+;CHECK-NEXT: v_cvt_f32_u32_e32 v0, v0
+;CHECK-NEXT: ; return to shader part epilog
+define amdgpu_ps float @raw_buffer_load_ushort(<4 x i32> inreg %rsrc) {
+main_body:
+  %tmp = call i16 @llvm.amdgcn.raw.buffer.load.i16(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
+  %tmp2 = zext i16 %tmp to i32
+  %val = uitofp i32 %tmp2 to float
+  ret float %val
+}
+
+;CHECK-LABEL: {{^}}raw_buffer_load_sbyte:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_sbyte v{{[0-9]}}, off, s[0:3], 0
+;CHECK-NEXT: s_waitcnt vmcnt(0)
+;CHECK-NEXT: v_cvt_f32_i32_e32 v0, v0
+;CHECK-NEXT: ; return to shader part epilog
+define amdgpu_ps float @raw_buffer_load_sbyte(<4 x i32> inreg %rsrc) {
+main_body:
+  %tmp = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
+  %tmp2 = sext i8 %tmp to i32
+  %val = sitofp i32 %tmp2 to float
+  ret float %val
+}
+
+;CHECK-LABEL: {{^}}raw_buffer_load_sshort:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_sshort v{{[0-9]}}, off, s[0:3], 0
+;CHECK-NEXT: s_waitcnt vmcnt(0)
+;CHECK-NEXT: v_cvt_f32_i32_e32 v0, v0
+;CHECK-NEXT: ; return to shader part epilog
+define amdgpu_ps float @raw_buffer_load_sshort(<4 x i32> inreg %rsrc) {
+main_body:
+  %tmp = call i16 @llvm.amdgcn.raw.buffer.load.i16(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
+  %tmp2 = sext i16 %tmp to i32
+  %val = sitofp i32 %tmp2 to float
+  ret float %val
+}
+
 declare float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32>, i32, i32, i32) #0
 declare <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32>, i32, i32, i32) #0
 declare <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32>, i32, i32, i32) #0
@@ -270,5 +326,7 @@ declare i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32>, i32, i32, i32) #0
 declare <2 x i32> @llvm.amdgcn.raw.buffer.load.v2i32(<4 x i32>, i32, i32, i32) #0
 declare <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32>, i32, i32, i32) #0
 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
+declare i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32>, i32, i32, i32) #0
+declare i16 @llvm.amdgcn.raw.buffer.load.i16(<4 x i32>, i32, i32, i32) #0
 
 attributes #0 = { nounwind readonly }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll
index 4f39867e4..89728f2 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll
@@ -189,6 +189,32 @@ main_body:
   ret void
 }
 
+;CHECK-LABEL: {{^}}raw_buffer_store_byte:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: v_cvt_u32_f32_e32 v{{[0-9]}}, v{{[0-9]}}
+;CHECK-NEXT: buffer_store_byte v{{[0-9]}}, off, s[0:3], 0
+;CHECK-NEXT: s_endpgm
+define amdgpu_ps void @raw_buffer_store_byte(<4 x i32> inreg %rsrc, float %v1) {
+main_body:
+  %v2 = fptoui float %v1 to i32
+  %v3 = trunc i32 %v2 to i8
+  call void @llvm.amdgcn.raw.buffer.store.i8(i8 %v3, <4 x i32> %rsrc, i32 0, i32 0, i32 0)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}raw_buffer_store_short:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: v_cvt_u32_f32_e32 v{{[0-9]}}, v{{[0-9]}}
+;CHECK-NEXT: buffer_store_short v{{[0-9]}}, off, s[0:3], 0
+;CHECK-NEXT: s_endpgm
+define amdgpu_ps void @raw_buffer_store_short(<4 x i32> inreg %rsrc, float %v1) {
+main_body:
+  %v2 = fptoui float %v1 to i32
+  %v3 = trunc i32 %v2 to i16
+  call void @llvm.amdgcn.raw.buffer.store.i16(i16 %v3, <4 x i32> %rsrc, i32 0, i32 0, i32 0)
+  ret void
+}
+
 declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32) #0
 declare void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i32) #0
 declare void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32) #0
@@ -196,6 +222,8 @@ declare void @llvm.amdgcn.raw.buffer.store.i32(i32, <4 x i32>, i32, i32, i32) #0
 declare void @llvm.amdgcn.raw.buffer.store.v2i32(<2 x i32>, <4 x i32>, i32, i32, i32) #0
 declare void @llvm.amdgcn.raw.buffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32) #0
 declare <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32>, i32, i32, i32) #1
+declare void @llvm.amdgcn.raw.buffer.store.i8(i8, <4 x i32>, i32, i32, i32) #0
+declare void @llvm.amdgcn.raw.buffer.store.i16(i16, <4 x i32>, i32, i32, i32) #0
 
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readonly }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.ll
index 5484c8b..4ac3428 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.ll
@@ -144,6 +144,62 @@ main_body:
   ret {<4 x float>, <2 x float>, float} %r2
 }
 
+;CHECK-LABEL: {{^}}struct_buffer_load_ubyte:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_ubyte v{{[0-9]}}, v[0:1], s[0:3], 0 idxen offen
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
+;CHECK-NEXT: ; return to shader part epilog
+define amdgpu_ps float @struct_buffer_load_ubyte(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) {
+main_body:
+  %tmp = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0)
+  %tmp2 = zext i8 %tmp to i32
+  %val = uitofp i32 %tmp2 to float
+  ret float %val
+}
+
+;CHECK-LABEL: {{^}}struct_buffer_load_ushort:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_ushort v{{[0-9]}}, v[0:1], s[0:3], 0 idxen offen
+;CHECK-NEXT: s_waitcnt vmcnt(0)
+;CHECK-NEXT: v_cvt_f32_u32_e32 v0, v0
+;CHECK-NEXT: ; return to shader part epilog
+define amdgpu_ps float @struct_buffer_load_ushort(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) {
+main_body:
+  %tmp = call i16 @llvm.amdgcn.struct.buffer.load.i16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0)
+  %tmp2 = zext i16 %tmp to i32
+  %val = uitofp i32 %tmp2 to float
+  ret float %val
+}
+
+;CHECK-LABEL: {{^}}struct_buffer_load_sbyte:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_sbyte v{{[0-9]}}, v[0:1], s[0:3], 0 idxen offen
+;CHECK-NEXT: s_waitcnt vmcnt(0)
+;CHECK-NEXT: v_cvt_f32_i32_e32 v0, v0
+;CHECK-NEXT: ; return to shader part epilog
+define amdgpu_ps float @struct_buffer_load_sbyte(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) {
+main_body:
+  %tmp = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0)
+  %tmp2 = sext i8 %tmp to i32
+  %val = sitofp i32 %tmp2 to float
+  ret float %val
+}
+
+;CHECK-LABEL: {{^}}struct_buffer_load_sshort:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_sshort v{{[0-9]}}, v[0:1], s[0:3], 0 idxen offen
+;CHECK-NEXT: s_waitcnt vmcnt(0)
+;CHECK-NEXT: v_cvt_f32_i32_e32 v0, v0
+;CHECK-NEXT: ; return to shader part epilog
+define amdgpu_ps float @struct_buffer_load_sshort(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) {
+main_body:
+  %tmp = call i16 @llvm.amdgcn.struct.buffer.load.i16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0)
+  %tmp2 = sext i16 %tmp to i32
+  %val = sitofp i32 %tmp2 to float
+  ret float %val
+}
+
 declare float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32>, i32, i32, i32, i32) #0
 declare <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32>, i32, i32, i32, i32) #0
 declare <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32>, i32, i32, i32, i32) #0
@@ -151,5 +207,7 @@ declare i32 @llvm.amdgcn.struct.buffer.load.i32(<4 x i32>, i32, i32, i32, i32) #0
 declare <2 x i32> @llvm.amdgcn.struct.buffer.load.v2i32(<4 x i32>, i32, i32, i32, i32) #0
 declare <4 x i32> @llvm.amdgcn.struct.buffer.load.v4i32(<4 x i32>, i32, i32, i32, i32) #0
 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
+declare i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32>, i32, i32, i32, i32) #0
+declare i16 @llvm.amdgcn.struct.buffer.load.i16(<4 x i32>, i32, i32, i32, i32) #0
 
 attributes #0 = { nounwind readonly }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll
index 738bb16..826030c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll
@@ -108,6 +108,32 @@ main_body:
   ret void
 }
 
+;CHECK-LABEL: {{^}}struct_buffer_store_byte:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: v_cvt_u32_f32_e32 v{{[0-9]}}, v{{[0-9]}}
+;CHECK-NEXT: buffer_store_byte v{{[0-9]}}, v{{[0-9]}}, s[0:3], 0 idxen
+;CHECK-NEXT: s_endpgm
+define amdgpu_ps void @struct_buffer_store_byte(<4 x i32> inreg %rsrc, float %v1, i32 %index) {
+main_body:
+  %v2 = fptoui float %v1 to i32
+  %v3 = trunc i32 %v2 to i8
+  call void @llvm.amdgcn.struct.buffer.store.i8(i8 %v3, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}struct_buffer_store_short:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: v_cvt_u32_f32_e32 v{{[0-9]}}, v{{[0-9]}}
+;CHECK-NEXT: buffer_store_short v{{[0-9]}}, v{{[0-9]}}, s[0:3], 0 idxen
+;CHECK-NEXT: s_endpgm
+define amdgpu_ps void @struct_buffer_store_short(<4 x i32> inreg %rsrc, float %v1, i32 %index) {
+main_body:
+  %v2 = fptoui float %v1 to i32
+  %v3 = trunc i32 %v2 to i16
+  call void @llvm.amdgcn.struct.buffer.store.i16(i16 %v3, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0)
+  ret void
+}
+
 declare void @llvm.amdgcn.struct.buffer.store.f32(float, <4 x i32>, i32, i32, i32, i32) #0
 declare void @llvm.amdgcn.struct.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i32, i32) #0
 declare void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32) #0
@@ -115,6 +141,8 @@ declare void @llvm.amdgcn.struct.buffer.store.i32(i32, <4 x i32>, i32, i32, i32,
 declare void @llvm.amdgcn.struct.buffer.store.v2i32(<2 x i32>, <4 x i32>, i32, i32, i32, i32) #0
 declare void @llvm.amdgcn.struct.buffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32) #0
 declare <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32>, i32, i32, i32, i32) #1
+declare void @llvm.amdgcn.struct.buffer.store.i8(i8, <4 x i32>, i32, i32, i32, i32) #0
+declare void @llvm.amdgcn.struct.buffer.store.i16(i16, <4 x i32>, i32, i32, i32, i32) #0
 
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readonly }
-- 
2.7.4