From 9c928649a085646c4c779bac095643b50b464d83 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 17 Jan 2020 15:40:15 -0500 Subject: [PATCH] AMDGPU: Fix interaction of tfe and d16 This using the wrong result register, and dropping the result entirely for v2f16. This would fail to select on the scalar case. I believe it was also mishandling packed/unpacked subtargets. --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 120 ++++---- llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll | 410 +++++++++++++++++++++++++ 2 files changed, 469 insertions(+), 61 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 24f099d..4c68397 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -5215,6 +5215,24 @@ static bool parseCachePolicy(SDValue CachePolicy, SelectionDAG &DAG, return Value == 0; } +static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, + SDValue Src, int ExtraElts) { + EVT SrcVT = Src.getValueType(); + + SmallVector Elts; + + if (SrcVT.isVector()) + DAG.ExtractVectorElements(Src, Elts); + else + Elts.push_back(Src); + + SDValue Undef = DAG.getUNDEF(SrcVT.getScalarType()); + while (ExtraElts--) + Elts.push_back(Undef); + + return DAG.getBuildVector(CastVT, DL, Elts); +} + // Re-construct the required return value for a image load intrinsic. // This is more complicated due to the optional use TexFailCtrl which means the required // return type is an aggregate @@ -5226,76 +5244,56 @@ static SDValue constructRetValue(SelectionDAG &DAG, const SDLoc &DL, LLVMContext &Context) { // Determine the required return type. This is the same regardless of IsTexFail flag EVT ReqRetVT = ResultTypes[0]; - EVT ReqRetEltVT = ReqRetVT.isVector() ? ReqRetVT.getVectorElementType() : ReqRetVT; int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1; - EVT AdjEltVT = Unpacked && IsD16 ? MVT::i32 : ReqRetEltVT; - EVT AdjVT = Unpacked ? ReqRetNumElts > 1 ? EVT::getVectorVT(Context, AdjEltVT, ReqRetNumElts) - : AdjEltVT - : ReqRetVT; - - // Extract data part of the result - // Bitcast the result to the same type as the required return type - int NumElts; - if (IsD16 && !Unpacked) - NumElts = NumVDataDwords << 1; - else - NumElts = NumVDataDwords; + int NumDataDwords = (!IsD16 || (IsD16 && Unpacked)) ? + ReqRetNumElts : (ReqRetNumElts + 1) / 2; - EVT CastVT = NumElts > 1 ? EVT::getVectorVT(Context, AdjEltVT, NumElts) - : AdjEltVT; + int MaskPopDwords = (!IsD16 || (IsD16 && Unpacked)) ? + DMaskPop : (DMaskPop + 1) / 2; - // Special case for v6f16. Rather than add support for this, use v3i32 to - // extract the data elements - bool V6F16Special = false; - if (NumElts == 6) { - CastVT = EVT::getVectorVT(Context, MVT::i32, NumElts / 2); - DMaskPop >>= 1; - ReqRetNumElts >>= 1; - V6F16Special = true; - AdjVT = MVT::v2i32; - } + MVT DataDwordVT = NumDataDwords == 1 ? + MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords); - SDValue N = SDValue(Result, 0); - SDValue CastRes = DAG.getNode(ISD::BITCAST, DL, CastVT, N); + MVT MaskPopVT = MaskPopDwords == 1 ? 
+ MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords); - // Iterate over the result - SmallVector BVElts; + SDValue Data(Result, 0); + SDValue TexFail; - if (CastVT.isVector()) { - DAG.ExtractVectorElements(CastRes, BVElts, 0, DMaskPop); - } else { - BVElts.push_back(CastRes); - } - int ExtraElts = ReqRetNumElts - DMaskPop; - while(ExtraElts--) - BVElts.push_back(DAG.getUNDEF(AdjEltVT)); + if (IsTexFail) { + SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32); + if (MaskPopVT.isVector()) { + Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT, + SDValue(Result, 0), ZeroIdx); + } else { + Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT, + SDValue(Result, 0), ZeroIdx); + } - SDValue PreTFCRes; - if (ReqRetNumElts > 1) { - SDValue NewVec = DAG.getBuildVector(AdjVT, DL, BVElts); - if (IsD16 && Unpacked) - PreTFCRes = adjustLoadValueTypeImpl(NewVec, ReqRetVT, DL, DAG, Unpacked); - else - PreTFCRes = NewVec; - } else { - PreTFCRes = BVElts[0]; + TexFail = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, + SDValue(Result, 0), + DAG.getConstant(MaskPopDwords, DL, MVT::i32)); } - if (V6F16Special) - PreTFCRes = DAG.getNode(ISD::BITCAST, DL, MVT::v4f16, PreTFCRes); + if (DataDwordVT.isVector()) + Data = padEltsToUndef(DAG, DL, DataDwordVT, Data, + NumDataDwords - MaskPopDwords); - if (!IsTexFail) { - if (Result->getNumValues() > 1) - return DAG.getMergeValues({PreTFCRes, SDValue(Result, 1)}, DL); - else - return PreTFCRes; - } + if (IsD16) + Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked); + + if (!ReqRetVT.isVector()) + Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data); + + Data = DAG.getNode(ISD::BITCAST, DL, ReqRetVT, Data); + + if (TexFail) + return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL); + + if (Result->getNumValues() == 1) + return Data; - // Extract the TexFail result and insert into aggregate return - SmallVector TFCElt; - DAG.ExtractVectorElements(N, TFCElt, DMaskPop, 1); - SDValue TFCRes = DAG.getNode(ISD::BITCAST, DL, ResultTypes[1], TFCElt[0]); - return DAG.getMergeValues({PreTFCRes, TFCRes, SDValue(Result, 1)}, DL); + return DAG.getMergeValues({Data, SDValue(Result, 1)}, DL); } static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE, @@ -5545,8 +5543,8 @@ SDValue SITargetLowering::lowerImage(SDValue Op, } EVT NewVT = NumVDataDwords > 1 ? 
- EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumVDataDwords) - : MVT::f32; + EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumVDataDwords) + : MVT::i32; ResultTypes[0] = NewVT; if (ResultTypes.size() == 3) { diff --git a/llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll b/llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll new file mode 100644 index 0000000..30e4a1e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll @@ -0,0 +1,410 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8-UNPACKED %s + +define amdgpu_ps void @load_1d_f16_tfe_dmask0(<8 x i32> inreg %rsrc, i32 %s) { +; GFX9-LABEL: load_1d_f16_tfe_dmask0: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s7, s5 +; GFX9-NEXT: s_mov_b32 s6, s4 +; GFX9-NEXT: s_mov_b32 s5, s3 +; GFX9-NEXT: s_mov_b32 s4, s2 +; GFX9-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_short v[0:1], v1, off +; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: load_1d_f16_tfe_dmask0: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_mov_b32 s11, s9 +; GFX10-NEXT: s_mov_b32 s10, s8 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: s_mov_b32 s7, s5 +; GFX10-NEXT: s_mov_b32 s6, s4 +; GFX10-NEXT: s_mov_b32 s5, s3 +; GFX10-NEXT: s_mov_b32 s4, s2 +; GFX10-NEXT: v_mov_b32_e32 v2, v1 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm tfe d16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_short v[0:1], v1, off +; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: s_endpgm +; +; GFX8-UNPACKED-LABEL: load_1d_f16_tfe_dmask0: +; GFX8-UNPACKED: ; %bb.0: +; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-UNPACKED-NEXT: s_mov_b32 s11, s9 +; GFX8-UNPACKED-NEXT: s_mov_b32 s10, s8 +; GFX8-UNPACKED-NEXT: s_mov_b32 s9, s7 +; GFX8-UNPACKED-NEXT: s_mov_b32 s8, s6 +; GFX8-UNPACKED-NEXT: s_mov_b32 s7, s5 +; GFX8-UNPACKED-NEXT: s_mov_b32 s6, s4 +; GFX8-UNPACKED-NEXT: s_mov_b32 s5, s3 +; GFX8-UNPACKED-NEXT: s_mov_b32 s4, s2 +; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-UNPACKED-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16 +; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) +; GFX8-UNPACKED-NEXT: flat_store_short v[0:1], v1 +; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v2 +; GFX8-UNPACKED-NEXT: s_endpgm + %v = call { half, i32 } @llvm.amdgcn.image.load.1d.sl_f16i32s.i32(i32 0, i32 %s, <8 x i32> %rsrc, i32 1, i32 0) + %v.data = extractvalue { half, i32 } %v, 0 + %v.err = extractvalue { half, i32 } %v, 1 + store volatile half %v.data, half addrspace(1)* undef + store volatile i32 %v.err, i32 addrspace(1)* undef + ret void +} + +define amdgpu_ps void @load_1d_f16_tfe_dmask1(<8 x i32> inreg %rsrc, i32 %s) { +; GFX9-LABEL: load_1d_f16_tfe_dmask1: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s10, s8 +; 
GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s7, s5 +; GFX9-NEXT: s_mov_b32 s6, s4 +; GFX9-NEXT: s_mov_b32 s5, s3 +; GFX9-NEXT: s_mov_b32 s4, s2 +; GFX9-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_short v[0:1], v1, off +; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: load_1d_f16_tfe_dmask1: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_mov_b32 s11, s9 +; GFX10-NEXT: s_mov_b32 s10, s8 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: s_mov_b32 s7, s5 +; GFX10-NEXT: s_mov_b32 s6, s4 +; GFX10-NEXT: s_mov_b32 s5, s3 +; GFX10-NEXT: s_mov_b32 s4, s2 +; GFX10-NEXT: v_mov_b32_e32 v2, v1 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm tfe d16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_short v[0:1], v1, off +; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: s_endpgm +; +; GFX8-UNPACKED-LABEL: load_1d_f16_tfe_dmask1: +; GFX8-UNPACKED: ; %bb.0: +; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-UNPACKED-NEXT: s_mov_b32 s11, s9 +; GFX8-UNPACKED-NEXT: s_mov_b32 s10, s8 +; GFX8-UNPACKED-NEXT: s_mov_b32 s9, s7 +; GFX8-UNPACKED-NEXT: s_mov_b32 s8, s6 +; GFX8-UNPACKED-NEXT: s_mov_b32 s7, s5 +; GFX8-UNPACKED-NEXT: s_mov_b32 s6, s4 +; GFX8-UNPACKED-NEXT: s_mov_b32 s5, s3 +; GFX8-UNPACKED-NEXT: s_mov_b32 s4, s2 +; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-UNPACKED-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16 +; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) +; GFX8-UNPACKED-NEXT: flat_store_short v[0:1], v1 +; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v2 +; GFX8-UNPACKED-NEXT: s_endpgm + %v = call { half, i32 } @llvm.amdgcn.image.load.1d.sl_f16i32s.i32(i32 1, i32 %s, <8 x i32> %rsrc, i32 1, i32 0) + %v.data = extractvalue { half, i32 } %v, 0 + %v.err = extractvalue { half, i32 } %v, 1 + store volatile half %v.data, half addrspace(1)* undef + store volatile i32 %v.err, i32 addrspace(1)* undef + ret void +} + +define amdgpu_ps void @load_1d_v2f16_tfe_dmask0(<8 x i32> inreg %rsrc, i32 %s) { +; GFX9-LABEL: load_1d_v2f16_tfe_dmask0: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s7, s5 +; GFX9-NEXT: s_mov_b32 s6, s4 +; GFX9-NEXT: s_mov_b32 s5, s3 +; GFX9-NEXT: s_mov_b32 s4, s2 +; GFX9-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_dword v[0:1], v1, off +; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: load_1d_v2f16_tfe_dmask0: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_mov_b32 s11, s9 +; GFX10-NEXT: s_mov_b32 s10, s8 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: s_mov_b32 s7, s5 +; GFX10-NEXT: s_mov_b32 s6, s4 +; GFX10-NEXT: s_mov_b32 s5, s3 +; GFX10-NEXT: s_mov_b32 s4, s2 +; GFX10-NEXT: v_mov_b32_e32 v2, v1 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm tfe d16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dword v[0:1], v1, off +; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: s_endpgm +; +; 
GFX8-UNPACKED-LABEL: load_1d_v2f16_tfe_dmask0: +; GFX8-UNPACKED: ; %bb.0: +; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-UNPACKED-NEXT: s_mov_b32 s11, s9 +; GFX8-UNPACKED-NEXT: s_mov_b32 s10, s8 +; GFX8-UNPACKED-NEXT: s_mov_b32 s9, s7 +; GFX8-UNPACKED-NEXT: s_mov_b32 s8, s6 +; GFX8-UNPACKED-NEXT: s_mov_b32 s7, s5 +; GFX8-UNPACKED-NEXT: s_mov_b32 s6, s4 +; GFX8-UNPACKED-NEXT: s_mov_b32 s5, s3 +; GFX8-UNPACKED-NEXT: s_mov_b32 s4, s2 +; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-UNPACKED-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16 +; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) +; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v1 +; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v2 +; GFX8-UNPACKED-NEXT: s_endpgm + %v = call { <2 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v2f16i32s.i32(i32 0, i32 %s, <8 x i32> %rsrc, i32 1, i32 0) + %v.data = extractvalue { <2 x half>, i32 } %v, 0 + %v.err = extractvalue { <2 x half>, i32 } %v, 1 + store volatile <2 x half> %v.data, <2 x half> addrspace(1)* undef + store volatile i32 %v.err, i32 addrspace(1)* undef + ret void +} + +define amdgpu_ps void @load_1d_v2f16_tfe_dmask1(<8 x i32> inreg %rsrc, i32 %s) { +; GFX9-LABEL: load_1d_v2f16_tfe_dmask1: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s7, s5 +; GFX9-NEXT: s_mov_b32 s6, s4 +; GFX9-NEXT: s_mov_b32 s5, s3 +; GFX9-NEXT: s_mov_b32 s4, s2 +; GFX9-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_dword v[0:1], v1, off +; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: load_1d_v2f16_tfe_dmask1: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_mov_b32 s11, s9 +; GFX10-NEXT: s_mov_b32 s10, s8 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: s_mov_b32 s7, s5 +; GFX10-NEXT: s_mov_b32 s6, s4 +; GFX10-NEXT: s_mov_b32 s5, s3 +; GFX10-NEXT: s_mov_b32 s4, s2 +; GFX10-NEXT: v_mov_b32_e32 v2, v1 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm tfe d16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dword v[0:1], v1, off +; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: s_endpgm +; +; GFX8-UNPACKED-LABEL: load_1d_v2f16_tfe_dmask1: +; GFX8-UNPACKED: ; %bb.0: +; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-UNPACKED-NEXT: s_mov_b32 s11, s9 +; GFX8-UNPACKED-NEXT: s_mov_b32 s10, s8 +; GFX8-UNPACKED-NEXT: s_mov_b32 s9, s7 +; GFX8-UNPACKED-NEXT: s_mov_b32 s8, s6 +; GFX8-UNPACKED-NEXT: s_mov_b32 s7, s5 +; GFX8-UNPACKED-NEXT: s_mov_b32 s6, s4 +; GFX8-UNPACKED-NEXT: s_mov_b32 s5, s3 +; GFX8-UNPACKED-NEXT: s_mov_b32 s4, s2 +; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-UNPACKED-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16 +; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) +; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v1 +; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v2 +; GFX8-UNPACKED-NEXT: s_endpgm + %v = call { <2 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v2f16i32s.i32(i32 1, i32 %s, <8 x i32> %rsrc, i32 1, i32 0) + %v.data = extractvalue { <2 x half>, i32 } %v, 0 + %v.err = extractvalue { <2 x half>, i32 } %v, 1 + store volatile <2 x half> %v.data, <2 x half> addrspace(1)* undef + store volatile i32 %v.err, i32 addrspace(1)* undef + ret 
void +} + +define amdgpu_ps void @load_1d_v2f16_tfe_dmask3(<8 x i32> inreg %rsrc, i32 %s) { +; GFX9-LABEL: load_1d_v2f16_tfe_dmask3: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s7, s5 +; GFX9-NEXT: s_mov_b32 s6, s4 +; GFX9-NEXT: s_mov_b32 s5, s3 +; GFX9-NEXT: s_mov_b32 s4, s2 +; GFX9-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x3 unorm tfe d16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_dword v[0:1], v1, off +; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: load_1d_v2f16_tfe_dmask3: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_mov_b32 s11, s9 +; GFX10-NEXT: s_mov_b32 s10, s8 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: s_mov_b32 s7, s5 +; GFX10-NEXT: s_mov_b32 s6, s4 +; GFX10-NEXT: s_mov_b32 s5, s3 +; GFX10-NEXT: s_mov_b32 s4, s2 +; GFX10-NEXT: v_mov_b32_e32 v2, v1 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm tfe d16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dword v[0:1], v1, off +; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: s_endpgm +; +; GFX8-UNPACKED-LABEL: load_1d_v2f16_tfe_dmask3: +; GFX8-UNPACKED: ; %bb.0: +; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-UNPACKED-NEXT: s_mov_b32 s11, s9 +; GFX8-UNPACKED-NEXT: s_mov_b32 s10, s8 +; GFX8-UNPACKED-NEXT: s_mov_b32 s9, s7 +; GFX8-UNPACKED-NEXT: s_mov_b32 s8, s6 +; GFX8-UNPACKED-NEXT: s_mov_b32 s7, s5 +; GFX8-UNPACKED-NEXT: s_mov_b32 s6, s4 +; GFX8-UNPACKED-NEXT: s_mov_b32 s5, s3 +; GFX8-UNPACKED-NEXT: s_mov_b32 s4, s2 +; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-UNPACKED-NEXT: image_load v[1:3], v0, s[4:11] dmask:0x3 unorm tfe d16 +; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) +; GFX8-UNPACKED-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX8-UNPACKED-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v0 +; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v3 +; GFX8-UNPACKED-NEXT: s_endpgm + %v = call { <2 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v2f16i32s.i32(i32 3, i32 %s, <8 x i32> %rsrc, i32 1, i32 0) + %v.data = extractvalue { <2 x half>, i32 } %v, 0 + %v.err = extractvalue { <2 x half>, i32 } %v, 1 + store volatile <2 x half> %v.data, <2 x half> addrspace(1)* undef + store volatile i32 %v.err, i32 addrspace(1)* undef + ret void +} + +; define amdgpu_ps void @load_1d_v3f16_tfe_dmask7(<8 x i32> inreg %rsrc, i32 %s) { +; %v = call { <3 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v3f16i32s.i32(i32 7, i32 %s, <8 x i32> %rsrc, i32 1, i32 0) +; %v.data = extractvalue { <3 x half>, i32 } %v, 0 +; %v.err = extractvalue { <3 x half>, i32 } %v, 1 +; store volatile <3 x half> %v.data, <3 x half> addrspace(1)* undef +; store volatile i32 %v.err, i32 addrspace(1)* undef +; ret void +; } + +define amdgpu_ps void @load_1d_v4f16_tfe_dmask15(<8 x i32> inreg %rsrc, i32 %s) { +; GFX9-LABEL: load_1d_v4f16_tfe_dmask15: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s7, s5 +; GFX9-NEXT: s_mov_b32 s6, s4 +; GFX9-NEXT: s_mov_b32 s5, s3 +; GFX9-NEXT: 
s_mov_b32 s4, s2 +; GFX9-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: image_load v[1:3], v0, s[4:11] dmask:0xf unorm tfe d16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_dwordx2 v[0:1], v[1:2], off +; GFX9-NEXT: global_store_dword v[0:1], v3, off +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: load_1d_v4f16_tfe_dmask15: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_mov_b32 s11, s9 +; GFX10-NEXT: s_mov_b32 s10, s8 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: s_mov_b32 s7, s5 +; GFX10-NEXT: s_mov_b32 s6, s4 +; GFX10-NEXT: s_mov_b32 s5, s3 +; GFX10-NEXT: s_mov_b32 s4, s2 +; GFX10-NEXT: v_mov_b32_e32 v2, v1 +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_load v[1:3], v0, s[4:11] dmask:0xf dim:SQ_RSRC_IMG_1D unorm tfe d16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dwordx2 v[0:1], v[1:2], off +; GFX10-NEXT: global_store_dword v[0:1], v3, off +; GFX10-NEXT: s_endpgm +; +; GFX8-UNPACKED-LABEL: load_1d_v4f16_tfe_dmask15: +; GFX8-UNPACKED: ; %bb.0: +; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-UNPACKED-NEXT: s_mov_b32 s11, s9 +; GFX8-UNPACKED-NEXT: s_mov_b32 s10, s8 +; GFX8-UNPACKED-NEXT: s_mov_b32 s9, s7 +; GFX8-UNPACKED-NEXT: s_mov_b32 s8, s6 +; GFX8-UNPACKED-NEXT: s_mov_b32 s7, s5 +; GFX8-UNPACKED-NEXT: s_mov_b32 s6, s4 +; GFX8-UNPACKED-NEXT: s_mov_b32 s5, s3 +; GFX8-UNPACKED-NEXT: s_mov_b32 s4, s2 +; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v4, v1 +; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v5, v1 +; GFX8-UNPACKED-NEXT: image_load v[1:5], v0, s[4:11] dmask:0xf unorm tfe d16 +; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) +; GFX8-UNPACKED-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX8-UNPACKED-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-UNPACKED-NEXT: v_or_b32_sdwa v2, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-UNPACKED-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-UNPACKED-NEXT: flat_store_dwordx2 v[0:1], v[1:2] +; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v5 +; GFX8-UNPACKED-NEXT: s_endpgm + %v = call { <4 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v4f16i32s.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 1, i32 0) + %v.data = extractvalue { <4 x half>, i32 } %v, 0 + %v.err = extractvalue { <4 x half>, i32 } %v, 1 + store volatile <4 x half> %v.data, <4 x half> addrspace(1)* undef + store volatile i32 %v.err, i32 addrspace(1)* undef + ret void +} + +declare { half, i32 } @llvm.amdgcn.image.load.1d.sl_f16i32s.i32(i32 immarg, i32, <8 x i32>, i32 immarg, i32 immarg) #0 +declare { <2 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v2f16i32s.i32(i32 immarg, i32, <8 x i32>, i32 immarg, i32 immarg) #0 +declare { <3 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v3f16i32s.i32(i32 immarg, i32, <8 x i32>, i32 immarg, i32 immarg) #0 +declare { <4 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v4f16i32s.i32(i32 immarg, i32, <8 x i32>, i32 immarg, i32 immarg) #0 + +attributes #0 = { nounwind readonly } -- 2.7.4
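
For reference, a minimal annotated sketch of the aggregate-returning call shape the new tests exercise, mirroring load_1d_v2f16_tfe_dmask1 above (the v2f16 case whose result was previously dropped); the operand roles noted in the comments (dmask, coord, rsrc, texfailctrl, cachepolicy) are a reading of the intrinsic signature rather than anything the patch spells out:

  define amdgpu_ps void @v2f16_tfe_sketch(<8 x i32> inreg %rsrc, i32 %s) {
    ; operands: dmask = 1, coord = %s, resource = %rsrc,
    ;           texfailctrl = 1 (bit 0 enables TFE), cachepolicy = 0
    %v = call { <2 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v2f16i32s.i32(i32 1, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
    ; first member: the d16 data; second member: the TFE status dword
    %v.data = extractvalue { <2 x half>, i32 } %v, 0
    %v.err = extractvalue { <2 x half>, i32 } %v, 1
    store volatile <2 x half> %v.data, <2 x half> addrspace(1)* undef
    store volatile i32 %v.err, i32 addrspace(1)* undef
    ret void
  }

  declare { <2 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v2f16i32s.i32(i32 immarg, i32, <8 x i32>, i32 immarg, i32 immarg) nounwind readonly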