From 0e0a1c80fb12e93fc92107346c9b65473ac261ef Mon Sep 17 00:00:00 2001
From: Matt Arsenault
Date: Mon, 5 Aug 2019 14:57:59 +0000
Subject: [PATCH] AMDGPU: Correct behavior of f16/i16 non-format store intrinsics

The f16 non-format store intrinsics were incorrectly selected to format
stores. This also fixes i16/f16 stores on targets where f16 is not a
legal type. The corresponding loads still need the same fix.

llvm-svn: 367872
---
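(Illustrative example, not part of the patch itself; the function name is
arbitrary, and the intrinsic signature matches the declarations in the tests
below.) A minimal IR sketch of the case being fixed: before this change, the
non-format f16 store below was selected to a D16 *format* store
(BUFFER_STORE_FORMAT_D16); with the change it selects buffer_store_short, as
the new CHECK lines verify.

  declare void @llvm.amdgcn.raw.buffer.store.f16(half, <4 x i32>, i32, i32, i32)

  define amdgpu_ps void @store_h(<4 x i32> inreg %rsrc, half %v) {
    ; a non-format 16-bit store: should be buffer_store_short, not a format store
    call void @llvm.amdgcn.raw.buffer.store.f16(half %v, <4 x i32> %rsrc, i32 0, i32 0, i32 0)
    ret void
  }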
 llvm/lib/Target/AMDGPU/BUFInstructions.td          |  4 +
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp          | 89 ++++++++++++++--------
 .../CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll | 66 ++++++++++++++++
 .../AMDGPU/llvm.amdgcn.struct.buffer.store.ll      | 53 ++++++++++++-
 4 files changed, 180 insertions(+), 32 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index 38f427a..504d391 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -1256,8 +1256,12 @@ let SubtargetPredicate = HasPackedD16VMem in {
 
 defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, f32, "BUFFER_STORE_DWORD">;
 defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, i32, "BUFFER_STORE_DWORD">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2f16, "BUFFER_STORE_DWORD">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2i16, "BUFFER_STORE_DWORD">;
 defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2f32, "BUFFER_STORE_DWORDX2">;
 defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2i32, "BUFFER_STORE_DWORDX2">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4f16, "BUFFER_STORE_DWORDX2">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4i16, "BUFFER_STORE_DWORDX2">;
 defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4f32, "BUFFER_STORE_DWORDX4">;
 defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4i32, "BUFFER_STORE_DWORDX4">;
 defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_byte, i32, "BUFFER_STORE_BYTE">;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index db807b8..6556866 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -218,28 +218,6 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom);
   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom);
 
-  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
-  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
-  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
-  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom);
-  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f16, Custom);
-  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2i16, Custom);
-  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom);
-
-  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2f16, Custom);
-  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom);
-  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v8f16, Custom);
-  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
-  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i16, Custom);
-  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
-
-  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
-  setOperationAction(ISD::INTRINSIC_VOID, MVT::v2i16, Custom);
-  setOperationAction(ISD::INTRINSIC_VOID, MVT::v2f16, Custom);
-  setOperationAction(ISD::INTRINSIC_VOID, MVT::v4f16, Custom);
-  setOperationAction(ISD::INTRINSIC_VOID, MVT::i16, Custom);
-  setOperationAction(ISD::INTRINSIC_VOID, MVT::i8, Custom);
-
   setOperationAction(ISD::BRCOND, MVT::Other, Custom);
   setOperationAction(ISD::BR_CC, MVT::i1, Expand);
   setOperationAction(ISD::BR_CC, MVT::i32, Expand);
@@ -688,6 +666,30 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::SELECT, VT, Custom);
   }
 
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom);
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f16, Custom);
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2i16, Custom);
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom);
+
+  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2f16, Custom);
+  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom);
+  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v8f16, Custom);
+  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
+  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i16, Custom);
+  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
+
+  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
+  setOperationAction(ISD::INTRINSIC_VOID, MVT::v2i16, Custom);
+  setOperationAction(ISD::INTRINSIC_VOID, MVT::v2f16, Custom);
+  setOperationAction(ISD::INTRINSIC_VOID, MVT::v4f16, Custom);
+  setOperationAction(ISD::INTRINSIC_VOID, MVT::v4i16, Custom);
+  setOperationAction(ISD::INTRINSIC_VOID, MVT::f16, Custom);
+  setOperationAction(ISD::INTRINSIC_VOID, MVT::i16, Custom);
+  setOperationAction(ISD::INTRINSIC_VOID, MVT::i8, Custom);
+
   setTargetDAGCombine(ISD::ADD);
   setTargetDAGCombine(ISD::ADDCARRY);
   setTargetDAGCombine(ISD::SUB);
@@ -6877,10 +6879,22 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
 
   case Intrinsic::amdgcn_raw_buffer_store:
   case Intrinsic::amdgcn_raw_buffer_store_format: {
+    const bool IsFormat =
+        IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format;
+
     SDValue VData = Op.getOperand(2);
-    bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
+    EVT VDataVT = VData.getValueType();
+    EVT EltType = VDataVT.getScalarType();
+    bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
     if (IsD16)
       VData = handleD16VData(VData, DAG);
+
+    if (!isTypeLegal(VDataVT)) {
+      VData =
+          DAG.getNode(ISD::BITCAST, DL,
+                      getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
+    }
+
     auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
     SDValue Ops[] = {
       Chain,
@@ -6893,15 +6907,14 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
       Op.getOperand(6), // cachepolicy
       DAG.getConstant(0, DL, MVT::i1), // idxen
     };
-    unsigned Opc = IntrinsicID == Intrinsic::amdgcn_raw_buffer_store ?
-                   AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
+    unsigned Opc =
+        IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
     Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
     MemSDNode *M = cast<MemSDNode>(Op);
 
     // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
-    EVT VDataType = VData.getValueType().getScalarType();
-    if (VDataType == MVT::i8 || VDataType == MVT::i16)
-      return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
+    if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
+      return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
 
     return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
                                    M->getMemoryVT(), M->getMemOperand());
@@ -6909,10 +6922,23 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
 
   case Intrinsic::amdgcn_struct_buffer_store:
   case Intrinsic::amdgcn_struct_buffer_store_format: {
+    const bool IsFormat =
+        IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format;
+
     SDValue VData = Op.getOperand(2);
-    bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
+    EVT VDataVT = VData.getValueType();
+    EVT EltType = VDataVT.getScalarType();
+    bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
+
     if (IsD16)
       VData = handleD16VData(VData, DAG);
+
+    if (!isTypeLegal(VDataVT)) {
+      VData =
+          DAG.getNode(ISD::BITCAST, DL,
+                      getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
+    }
+
     auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
     SDValue Ops[] = {
       Chain,
@@ -6932,7 +6958,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
 
     // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
     EVT VDataType = VData.getValueType().getScalarType();
-    if (VDataType == MVT::i8 || VDataType == MVT::i16)
+    if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
       return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
 
     return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
@@ -7107,6 +7133,9 @@ SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
                                                       EVT VDataType, SDLoc DL,
                                                       SDValue Ops[],
                                                       MemSDNode *M) const {
+  if (VDataType == MVT::f16)
+    Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]);
+
   SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
   Ops[1] = BufferStoreExt;
   unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE :
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll
index 89728f2..7d1a5a3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll
@@ -215,6 +215,67 @@ main_body:
   ret void
 }
 
+;CHECK-LABEL: {{^}}raw_buffer_store_f16:
+;CHECK-NEXT: %bb.
+;CHECK-NOT: v0
+;CHECK-NEXT: buffer_store_short v0, off, s[0:3], 0
+;CHECK-NEXT: s_endpgm
+define amdgpu_ps void @raw_buffer_store_f16(<4 x i32> inreg %rsrc, i32 %v1) {
+main_body:
+  %trunc = trunc i32 %v1 to i16
+  %cast = bitcast i16 %trunc to half
+  call void @llvm.amdgcn.raw.buffer.store.f16(half %cast, <4 x i32> %rsrc, i32 0, i32 0, i32 0)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_store_v2f16:
+;CHECK-NOT: s_waitcnt
+;CHECK: buffer_store_dword v0, v1, s[0:3], 0 offen
+define amdgpu_ps void @buffer_store_v2f16(<4 x i32> inreg %rsrc, <2 x half> %data, i32 %offset) {
+main_body:
+  call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_store_v4f16:
+;CHECK-NOT: s_waitcnt
+;CHECK: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen
+define amdgpu_ps void @buffer_store_v4f16(<4 x i32> inreg %rsrc, <4 x half> %data, i32 %offset) #0 {
+main_body:
+  call void @llvm.amdgcn.raw.buffer.store.v4f16(<4 x half> %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}raw_buffer_store_i16:
+;CHECK-NEXT: %bb.
+;CHECK-NOT: v0
+;CHECK-NEXT: buffer_store_short v0, off, s[0:3], 0
+;CHECK-NEXT: s_endpgm
+define amdgpu_ps void @raw_buffer_store_i16(<4 x i32> inreg %rsrc, i32 %v1) {
+main_body:
+  %trunc = trunc i32 %v1 to i16
+  call void @llvm.amdgcn.raw.buffer.store.i16(i16 %trunc, <4 x i32> %rsrc, i32 0, i32 0, i32 0)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_store_v2i16:
+;CHECK-NOT: s_waitcnt
+;CHECK: buffer_store_dword v0, v1, s[0:3], 0 offen
+define amdgpu_ps void @buffer_store_v2i16(<4 x i32> inreg %rsrc, <2 x i16> %data, i32 %offset) {
+main_body:
+  call void @llvm.amdgcn.raw.buffer.store.v2i16(<2 x i16> %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_store_v4i16:
+;CHECK-NOT: s_waitcnt
+;CHECK: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen
+define amdgpu_ps void @buffer_store_v4i16(<4 x i32> inreg %rsrc, <4 x i16> %data, i32 %offset) #0 {
+main_body:
+  call void @llvm.amdgcn.raw.buffer.store.v4i16(<4 x i16> %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0)
+  ret void
+}
+
 declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32) #0
 declare void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i32) #0
 declare void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32) #0
@@ -223,7 +284,12 @@ declare void @llvm.amdgcn.raw.buffer.store.v2i32(<2 x i32>, <4 x i32>, i32, i32,
 declare void @llvm.amdgcn.raw.buffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32) #0
 declare <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32>, i32, i32, i32) #1
 declare void @llvm.amdgcn.raw.buffer.store.i8(i8, <4 x i32>, i32, i32, i32) #0
+declare void @llvm.amdgcn.raw.buffer.store.f16(half, <4 x i32>, i32, i32, i32) #0
+declare void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half>, <4 x i32>, i32, i32, i32) #0
+declare void @llvm.amdgcn.raw.buffer.store.v4f16(<4 x half>, <4 x i32>, i32, i32, i32) #0
 declare void @llvm.amdgcn.raw.buffer.store.i16(i16, <4 x i32>, i32, i32, i32) #0
+declare void @llvm.amdgcn.raw.buffer.store.v2i16(<2 x i16>, <4 x i32>, i32, i32, i32) #0
+declare void @llvm.amdgcn.raw.buffer.store.v4i16(<4 x i16>, <4 x i32>, i32, i32, i32) #0
 
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readonly }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll
index 826030c..4c629e2 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll
@@ -121,12 +121,39 @@ main_body:
   ret void
 }
 
-;CHECK-LABEL: {{^}}struct_buffer_store_short:
+;CHECK-LABEL: {{^}}struct_buffer_store_f16:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: v_cvt_f16_f32_e32 v{{[0-9]}}, v{{[0-9]}}
+;CHECK-NEXT: buffer_store_short v{{[0-9]}}, v{{[0-9]}}, s[0:3], 0 idxen
+;CHECK-NEXT: s_endpgm
+define amdgpu_ps void @struct_buffer_store_f16(<4 x i32> inreg %rsrc, float %v1, i32 %index) {
+  %v2 = fptrunc float %v1 to half
+  call void @llvm.amdgcn.struct.buffer.store.f16(half %v2, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}struct_buffer_store_v2f16:
+;CHECK-NEXT: %bb.
+;CHECK: buffer_store_dword v0, {{v[0-9]+}}, s[0:3], 0 idxen
+define amdgpu_ps void @struct_buffer_store_v2f16(<4 x i32> inreg %rsrc, <2 x half> %v1, i32 %index) {
+  call void @llvm.amdgcn.struct.buffer.store.v2f16(<2 x half> %v1, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}struct_buffer_store_v4f16:
+;CHECK-NEXT: %bb.
+;CHECK: buffer_store_dwordx2 v[0:1], {{v[0-9]+}}, s[0:3], 0 idxen
+define amdgpu_ps void @struct_buffer_store_v4f16(<4 x i32> inreg %rsrc, <4 x half> %v1, i32 %index) {
+  call void @llvm.amdgcn.struct.buffer.store.v4f16(<4 x half> %v1, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}struct_buffer_store_i16:
 ;CHECK-NEXT: %bb.
 ;CHECK-NEXT: v_cvt_u32_f32_e32 v{{[0-9]}}, v{{[0-9]}}
 ;CHECK-NEXT: buffer_store_short v{{[0-9]}}, v{{[0-9]}}, s[0:3], 0 idxen
 ;CHECK-NEXT: s_endpgm
-define amdgpu_ps void @struct_buffer_store_short(<4 x i32> inreg %rsrc, float %v1, i32 %index) {
+define amdgpu_ps void @struct_buffer_store_i16(<4 x i32> inreg %rsrc, float %v1, i32 %index) {
 main_body:
   %v2 = fptoui float %v1 to i32
   %v3 = trunc i32 %v2 to i16
@@ -134,6 +161,22 @@ main_body:
   ret void
 }
 
+;CHECK-LABEL: {{^}}struct_buffer_store_v2i16:
+;CHECK-NEXT: %bb.
+;CHECK: buffer_store_dword v0, {{v[0-9]+}}, s[0:3], 0 idxen
+define amdgpu_ps void @struct_buffer_store_v2i16(<4 x i32> inreg %rsrc, <2 x i16> %v1, i32 %index) {
+  call void @llvm.amdgcn.struct.buffer.store.v2i16(<2 x i16> %v1, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}struct_buffer_store_v4i16:
+;CHECK-NEXT: %bb.
+;CHECK: buffer_store_dwordx2 v[0:1], {{v[0-9]+}}, s[0:3], 0 idxen
+define amdgpu_ps void @struct_buffer_store_v4i16(<4 x i32> inreg %rsrc, <4 x i16> %v1, i32 %index) {
+  call void @llvm.amdgcn.struct.buffer.store.v4i16(<4 x i16> %v1, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0)
+  ret void
+}
+
 declare void @llvm.amdgcn.struct.buffer.store.f32(float, <4 x i32>, i32, i32, i32, i32) #0
 declare void @llvm.amdgcn.struct.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i32, i32) #0
 declare void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32) #0
@@ -143,6 +186,12 @@ declare void @llvm.amdgcn.struct.buffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i
 declare <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32>, i32, i32, i32, i32) #1
 declare void @llvm.amdgcn.struct.buffer.store.i8(i8, <4 x i32>, i32, i32, i32, i32) #0
 declare void @llvm.amdgcn.struct.buffer.store.i16(i16, <4 x i32>, i32, i32, i32, i32) #0
+declare void @llvm.amdgcn.struct.buffer.store.v2i16(<2 x i16>, <4 x i32>, i32, i32, i32, i32) #0
+declare void @llvm.amdgcn.struct.buffer.store.v4i16(<4 x i16>, <4 x i32>, i32, i32, i32, i32) #0
+declare void @llvm.amdgcn.struct.buffer.store.f16(half, <4 x i32>, i32, i32, i32, i32) #0
+declare void @llvm.amdgcn.struct.buffer.store.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32) #0
+declare void @llvm.amdgcn.struct.buffer.store.v4f16(<4 x half>, <4 x i32>, i32, i32, i32, i32) #0
+
 
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readonly }
-- 
2.7.4
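
A quick way to exercise the new selection paths by hand (a hypothetical
invocation; the authoritative RUN lines live in the test files themselves,
and the -mcpu choice here is only illustrative — an SI target such as verde
covers the case where f16 is not a legal type):

  llc -march=amdgcn -mcpu=verde -verify-machineinstrs \
      llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll -o -

The output for @raw_buffer_store_f16 should contain buffer_store_short and
no buffer_store_format_d16 instruction.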