From 9c928649a085646c4c779bac095643b50b464d83 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 17 Jan 2020 15:40:15 -0500 Subject: [PATCH] AMDGPU: Fix interaction of tfe and d16 This using the wrong result register, and dropping the result entirely for v2f16. This would fail to select on the scalar case. I believe it was also mishandling packed/unpacked subtargets. --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 120 ++++---- llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll | 410 +++++++++++++++++++++++++ 2 files changed, 469 insertions(+), 61 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 24f099d..4c68397 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -5215,6 +5215,24 @@ static bool parseCachePolicy(SDValue CachePolicy, SelectionDAG &DAG, return Value == 0; } +static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, + SDValue Src, int ExtraElts) { + EVT SrcVT = Src.getValueType(); + + SmallVector Elts; + + if (SrcVT.isVector()) + DAG.ExtractVectorElements(Src, Elts); + else + Elts.push_back(Src); + + SDValue Undef = DAG.getUNDEF(SrcVT.getScalarType()); + while (ExtraElts--) + Elts.push_back(Undef); + + return DAG.getBuildVector(CastVT, DL, Elts); +} + // Re-construct the required return value for a image load intrinsic. // This is more complicated due to the optional use TexFailCtrl which means the required // return type is an aggregate @@ -5226,76 +5244,56 @@ static SDValue constructRetValue(SelectionDAG &DAG, const SDLoc &DL, LLVMContext &Context) { // Determine the required return type. This is the same regardless of IsTexFail flag EVT ReqRetVT = ResultTypes[0]; - EVT ReqRetEltVT = ReqRetVT.isVector() ? ReqRetVT.getVectorElementType() : ReqRetVT; int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1; - EVT AdjEltVT = Unpacked && IsD16 ? MVT::i32 : ReqRetEltVT; - EVT AdjVT = Unpacked ? ReqRetNumElts > 1 ? EVT::getVectorVT(Context, AdjEltVT, ReqRetNumElts) - : AdjEltVT - : ReqRetVT; - - // Extract data part of the result - // Bitcast the result to the same type as the required return type - int NumElts; - if (IsD16 && !Unpacked) - NumElts = NumVDataDwords << 1; - else - NumElts = NumVDataDwords; + int NumDataDwords = (!IsD16 || (IsD16 && Unpacked)) ? + ReqRetNumElts : (ReqRetNumElts + 1) / 2; - EVT CastVT = NumElts > 1 ? EVT::getVectorVT(Context, AdjEltVT, NumElts) - : AdjEltVT; + int MaskPopDwords = (!IsD16 || (IsD16 && Unpacked)) ? + DMaskPop : (DMaskPop + 1) / 2; - // Special case for v6f16. Rather than add support for this, use v3i32 to - // extract the data elements - bool V6F16Special = false; - if (NumElts == 6) { - CastVT = EVT::getVectorVT(Context, MVT::i32, NumElts / 2); - DMaskPop >>= 1; - ReqRetNumElts >>= 1; - V6F16Special = true; - AdjVT = MVT::v2i32; - } + MVT DataDwordVT = NumDataDwords == 1 ? + MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords); - SDValue N = SDValue(Result, 0); - SDValue CastRes = DAG.getNode(ISD::BITCAST, DL, CastVT, N); + MVT MaskPopVT = MaskPopDwords == 1 ? 
+ MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords); - // Iterate over the result - SmallVector BVElts; + SDValue Data(Result, 0); + SDValue TexFail; - if (CastVT.isVector()) { - DAG.ExtractVectorElements(CastRes, BVElts, 0, DMaskPop); - } else { - BVElts.push_back(CastRes); - } - int ExtraElts = ReqRetNumElts - DMaskPop; - while(ExtraElts--) - BVElts.push_back(DAG.getUNDEF(AdjEltVT)); + if (IsTexFail) { + SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32); + if (MaskPopVT.isVector()) { + Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT, + SDValue(Result, 0), ZeroIdx); + } else { + Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT, + SDValue(Result, 0), ZeroIdx); + } - SDValue PreTFCRes; - if (ReqRetNumElts > 1) { - SDValue NewVec = DAG.getBuildVector(AdjVT, DL, BVElts); - if (IsD16 && Unpacked) - PreTFCRes = adjustLoadValueTypeImpl(NewVec, ReqRetVT, DL, DAG, Unpacked); - else - PreTFCRes = NewVec; - } else { - PreTFCRes = BVElts[0]; + TexFail = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, + SDValue(Result, 0), + DAG.getConstant(MaskPopDwords, DL, MVT::i32)); } - if (V6F16Special) - PreTFCRes = DAG.getNode(ISD::BITCAST, DL, MVT::v4f16, PreTFCRes); + if (DataDwordVT.isVector()) + Data = padEltsToUndef(DAG, DL, DataDwordVT, Data, + NumDataDwords - MaskPopDwords); - if (!IsTexFail) { - if (Result->getNumValues() > 1) - return DAG.getMergeValues({PreTFCRes, SDValue(Result, 1)}, DL); - else - return PreTFCRes; - } + if (IsD16) + Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked); + + if (!ReqRetVT.isVector()) + Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data); + + Data = DAG.getNode(ISD::BITCAST, DL, ReqRetVT, Data); + + if (TexFail) + return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL); + + if (Result->getNumValues() == 1) + return Data; - // Extract the TexFail result and insert into aggregate return - SmallVector TFCElt; - DAG.ExtractVectorElements(N, TFCElt, DMaskPop, 1); - SDValue TFCRes = DAG.getNode(ISD::BITCAST, DL, ResultTypes[1], TFCElt[0]); - return DAG.getMergeValues({PreTFCRes, TFCRes, SDValue(Result, 1)}, DL); + return DAG.getMergeValues({Data, SDValue(Result, 1)}, DL); } static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE, @@ -5545,8 +5543,8 @@ SDValue SITargetLowering::lowerImage(SDValue Op, } EVT NewVT = NumVDataDwords > 1 ? 
- EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumVDataDwords) - : MVT::f32; + EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumVDataDwords) + : MVT::i32; ResultTypes[0] = NewVT; if (ResultTypes.size() == 3) { diff --git a/llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll b/llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll new file mode 100644 index 0000000..30e4a1e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll @@ -0,0 +1,410 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8-UNPACKED %s + +define amdgpu_ps void @load_1d_f16_tfe_dmask0(<8 x i32> inreg %rsrc, i32 %s) { +; GFX9-LABEL: load_1d_f16_tfe_dmask0: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s7, s5 +; GFX9-NEXT: s_mov_b32 s6, s4 +; GFX9-NEXT: s_mov_b32 s5, s3 +; GFX9-NEXT: s_mov_b32 s4, s2 +; GFX9-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_short v[0:1], v1, off +; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: load_1d_f16_tfe_dmask0: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_mov_b32 s11, s9 +; GFX10-NEXT: s_mov_b32 s10, s8 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: s_mov_b32 s7, s5 +; GFX10-NEXT: s_mov_b32 s6, s4 +; GFX10-NEXT: s_mov_b32 s5, s3 +; GFX10-NEXT: s_mov_b32 s4, s2 +; GFX10-NEXT: v_mov_b32_e32 v2, v1 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm tfe d16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_short v[0:1], v1, off +; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: s_endpgm +; +; GFX8-UNPACKED-LABEL: load_1d_f16_tfe_dmask0: +; GFX8-UNPACKED: ; %bb.0: +; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-UNPACKED-NEXT: s_mov_b32 s11, s9 +; GFX8-UNPACKED-NEXT: s_mov_b32 s10, s8 +; GFX8-UNPACKED-NEXT: s_mov_b32 s9, s7 +; GFX8-UNPACKED-NEXT: s_mov_b32 s8, s6 +; GFX8-UNPACKED-NEXT: s_mov_b32 s7, s5 +; GFX8-UNPACKED-NEXT: s_mov_b32 s6, s4 +; GFX8-UNPACKED-NEXT: s_mov_b32 s5, s3 +; GFX8-UNPACKED-NEXT: s_mov_b32 s4, s2 +; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-UNPACKED-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16 +; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) +; GFX8-UNPACKED-NEXT: flat_store_short v[0:1], v1 +; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v2 +; GFX8-UNPACKED-NEXT: s_endpgm + %v = call { half, i32 } @llvm.amdgcn.image.load.1d.sl_f16i32s.i32(i32 0, i32 %s, <8 x i32> %rsrc, i32 1, i32 0) + %v.data = extractvalue { half, i32 } %v, 0 + %v.err = extractvalue { half, i32 } %v, 1 + store volatile half %v.data, half addrspace(1)* undef + store volatile i32 %v.err, i32 addrspace(1)* undef + ret void +} + +define amdgpu_ps void @load_1d_f16_tfe_dmask1(<8 x i32> inreg %rsrc, i32 %s) { +; GFX9-LABEL: load_1d_f16_tfe_dmask1: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s10, s8 +; 
GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s7, s5 +; GFX9-NEXT: s_mov_b32 s6, s4 +; GFX9-NEXT: s_mov_b32 s5, s3 +; GFX9-NEXT: s_mov_b32 s4, s2 +; GFX9-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_short v[0:1], v1, off +; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: load_1d_f16_tfe_dmask1: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_mov_b32 s11, s9 +; GFX10-NEXT: s_mov_b32 s10, s8 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: s_mov_b32 s7, s5 +; GFX10-NEXT: s_mov_b32 s6, s4 +; GFX10-NEXT: s_mov_b32 s5, s3 +; GFX10-NEXT: s_mov_b32 s4, s2 +; GFX10-NEXT: v_mov_b32_e32 v2, v1 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm tfe d16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_short v[0:1], v1, off +; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: s_endpgm +; +; GFX8-UNPACKED-LABEL: load_1d_f16_tfe_dmask1: +; GFX8-UNPACKED: ; %bb.0: +; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-UNPACKED-NEXT: s_mov_b32 s11, s9 +; GFX8-UNPACKED-NEXT: s_mov_b32 s10, s8 +; GFX8-UNPACKED-NEXT: s_mov_b32 s9, s7 +; GFX8-UNPACKED-NEXT: s_mov_b32 s8, s6 +; GFX8-UNPACKED-NEXT: s_mov_b32 s7, s5 +; GFX8-UNPACKED-NEXT: s_mov_b32 s6, s4 +; GFX8-UNPACKED-NEXT: s_mov_b32 s5, s3 +; GFX8-UNPACKED-NEXT: s_mov_b32 s4, s2 +; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-UNPACKED-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16 +; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) +; GFX8-UNPACKED-NEXT: flat_store_short v[0:1], v1 +; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v2 +; GFX8-UNPACKED-NEXT: s_endpgm + %v = call { half, i32 } @llvm.amdgcn.image.load.1d.sl_f16i32s.i32(i32 1, i32 %s, <8 x i32> %rsrc, i32 1, i32 0) + %v.data = extractvalue { half, i32 } %v, 0 + %v.err = extractvalue { half, i32 } %v, 1 + store volatile half %v.data, half addrspace(1)* undef + store volatile i32 %v.err, i32 addrspace(1)* undef + ret void +} + +define amdgpu_ps void @load_1d_v2f16_tfe_dmask0(<8 x i32> inreg %rsrc, i32 %s) { +; GFX9-LABEL: load_1d_v2f16_tfe_dmask0: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s7, s5 +; GFX9-NEXT: s_mov_b32 s6, s4 +; GFX9-NEXT: s_mov_b32 s5, s3 +; GFX9-NEXT: s_mov_b32 s4, s2 +; GFX9-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_dword v[0:1], v1, off +; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: load_1d_v2f16_tfe_dmask0: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_mov_b32 s11, s9 +; GFX10-NEXT: s_mov_b32 s10, s8 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: s_mov_b32 s7, s5 +; GFX10-NEXT: s_mov_b32 s6, s4 +; GFX10-NEXT: s_mov_b32 s5, s3 +; GFX10-NEXT: s_mov_b32 s4, s2 +; GFX10-NEXT: v_mov_b32_e32 v2, v1 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm tfe d16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dword v[0:1], v1, off +; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: s_endpgm +; +; 
GFX8-UNPACKED-LABEL: load_1d_v2f16_tfe_dmask0: +; GFX8-UNPACKED: ; %bb.0: +; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-UNPACKED-NEXT: s_mov_b32 s11, s9 +; GFX8-UNPACKED-NEXT: s_mov_b32 s10, s8 +; GFX8-UNPACKED-NEXT: s_mov_b32 s9, s7 +; GFX8-UNPACKED-NEXT: s_mov_b32 s8, s6 +; GFX8-UNPACKED-NEXT: s_mov_b32 s7, s5 +; GFX8-UNPACKED-NEXT: s_mov_b32 s6, s4 +; GFX8-UNPACKED-NEXT: s_mov_b32 s5, s3 +; GFX8-UNPACKED-NEXT: s_mov_b32 s4, s2 +; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-UNPACKED-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16 +; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) +; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v1 +; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v2 +; GFX8-UNPACKED-NEXT: s_endpgm + %v = call { <2 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v2f16i32s.i32(i32 0, i32 %s, <8 x i32> %rsrc, i32 1, i32 0) + %v.data = extractvalue { <2 x half>, i32 } %v, 0 + %v.err = extractvalue { <2 x half>, i32 } %v, 1 + store volatile <2 x half> %v.data, <2 x half> addrspace(1)* undef + store volatile i32 %v.err, i32 addrspace(1)* undef + ret void +} + +define amdgpu_ps void @load_1d_v2f16_tfe_dmask1(<8 x i32> inreg %rsrc, i32 %s) { +; GFX9-LABEL: load_1d_v2f16_tfe_dmask1: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s7, s5 +; GFX9-NEXT: s_mov_b32 s6, s4 +; GFX9-NEXT: s_mov_b32 s5, s3 +; GFX9-NEXT: s_mov_b32 s4, s2 +; GFX9-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_dword v[0:1], v1, off +; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: load_1d_v2f16_tfe_dmask1: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_mov_b32 s11, s9 +; GFX10-NEXT: s_mov_b32 s10, s8 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: s_mov_b32 s7, s5 +; GFX10-NEXT: s_mov_b32 s6, s4 +; GFX10-NEXT: s_mov_b32 s5, s3 +; GFX10-NEXT: s_mov_b32 s4, s2 +; GFX10-NEXT: v_mov_b32_e32 v2, v1 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm tfe d16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dword v[0:1], v1, off +; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: s_endpgm +; +; GFX8-UNPACKED-LABEL: load_1d_v2f16_tfe_dmask1: +; GFX8-UNPACKED: ; %bb.0: +; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-UNPACKED-NEXT: s_mov_b32 s11, s9 +; GFX8-UNPACKED-NEXT: s_mov_b32 s10, s8 +; GFX8-UNPACKED-NEXT: s_mov_b32 s9, s7 +; GFX8-UNPACKED-NEXT: s_mov_b32 s8, s6 +; GFX8-UNPACKED-NEXT: s_mov_b32 s7, s5 +; GFX8-UNPACKED-NEXT: s_mov_b32 s6, s4 +; GFX8-UNPACKED-NEXT: s_mov_b32 s5, s3 +; GFX8-UNPACKED-NEXT: s_mov_b32 s4, s2 +; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-UNPACKED-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16 +; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) +; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v1 +; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v2 +; GFX8-UNPACKED-NEXT: s_endpgm + %v = call { <2 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v2f16i32s.i32(i32 1, i32 %s, <8 x i32> %rsrc, i32 1, i32 0) + %v.data = extractvalue { <2 x half>, i32 } %v, 0 + %v.err = extractvalue { <2 x half>, i32 } %v, 1 + store volatile <2 x half> %v.data, <2 x half> addrspace(1)* undef + store volatile i32 %v.err, i32 addrspace(1)* undef + ret 
void +} + +define amdgpu_ps void @load_1d_v2f16_tfe_dmask3(<8 x i32> inreg %rsrc, i32 %s) { +; GFX9-LABEL: load_1d_v2f16_tfe_dmask3: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s7, s5 +; GFX9-NEXT: s_mov_b32 s6, s4 +; GFX9-NEXT: s_mov_b32 s5, s3 +; GFX9-NEXT: s_mov_b32 s4, s2 +; GFX9-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x3 unorm tfe d16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_dword v[0:1], v1, off +; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: load_1d_v2f16_tfe_dmask3: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_mov_b32 s11, s9 +; GFX10-NEXT: s_mov_b32 s10, s8 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: s_mov_b32 s7, s5 +; GFX10-NEXT: s_mov_b32 s6, s4 +; GFX10-NEXT: s_mov_b32 s5, s3 +; GFX10-NEXT: s_mov_b32 s4, s2 +; GFX10-NEXT: v_mov_b32_e32 v2, v1 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm tfe d16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dword v[0:1], v1, off +; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: s_endpgm +; +; GFX8-UNPACKED-LABEL: load_1d_v2f16_tfe_dmask3: +; GFX8-UNPACKED: ; %bb.0: +; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-UNPACKED-NEXT: s_mov_b32 s11, s9 +; GFX8-UNPACKED-NEXT: s_mov_b32 s10, s8 +; GFX8-UNPACKED-NEXT: s_mov_b32 s9, s7 +; GFX8-UNPACKED-NEXT: s_mov_b32 s8, s6 +; GFX8-UNPACKED-NEXT: s_mov_b32 s7, s5 +; GFX8-UNPACKED-NEXT: s_mov_b32 s6, s4 +; GFX8-UNPACKED-NEXT: s_mov_b32 s5, s3 +; GFX8-UNPACKED-NEXT: s_mov_b32 s4, s2 +; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-UNPACKED-NEXT: image_load v[1:3], v0, s[4:11] dmask:0x3 unorm tfe d16 +; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) +; GFX8-UNPACKED-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX8-UNPACKED-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v0 +; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v3 +; GFX8-UNPACKED-NEXT: s_endpgm + %v = call { <2 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v2f16i32s.i32(i32 3, i32 %s, <8 x i32> %rsrc, i32 1, i32 0) + %v.data = extractvalue { <2 x half>, i32 } %v, 0 + %v.err = extractvalue { <2 x half>, i32 } %v, 1 + store volatile <2 x half> %v.data, <2 x half> addrspace(1)* undef + store volatile i32 %v.err, i32 addrspace(1)* undef + ret void +} + +; define amdgpu_ps void @load_1d_v3f16_tfe_dmask7(<8 x i32> inreg %rsrc, i32 %s) { +; %v = call { <3 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v3f16i32s.i32(i32 7, i32 %s, <8 x i32> %rsrc, i32 1, i32 0) +; %v.data = extractvalue { <3 x half>, i32 } %v, 0 +; %v.err = extractvalue { <3 x half>, i32 } %v, 1 +; store volatile <3 x half> %v.data, <3 x half> addrspace(1)* undef +; store volatile i32 %v.err, i32 addrspace(1)* undef +; ret void +; } + +define amdgpu_ps void @load_1d_v4f16_tfe_dmask15(<8 x i32> inreg %rsrc, i32 %s) { +; GFX9-LABEL: load_1d_v4f16_tfe_dmask15: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s7, s5 +; GFX9-NEXT: s_mov_b32 s6, s4 +; GFX9-NEXT: s_mov_b32 s5, s3 +; GFX9-NEXT: 
s_mov_b32 s4, s2 +; GFX9-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: image_load v[1:3], v0, s[4:11] dmask:0xf unorm tfe d16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_dwordx2 v[0:1], v[1:2], off +; GFX9-NEXT: global_store_dword v[0:1], v3, off +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: load_1d_v4f16_tfe_dmask15: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_mov_b32 s11, s9 +; GFX10-NEXT: s_mov_b32 s10, s8 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: s_mov_b32 s7, s5 +; GFX10-NEXT: s_mov_b32 s6, s4 +; GFX10-NEXT: s_mov_b32 s5, s3 +; GFX10-NEXT: s_mov_b32 s4, s2 +; GFX10-NEXT: v_mov_b32_e32 v2, v1 +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_load v[1:3], v0, s[4:11] dmask:0xf dim:SQ_RSRC_IMG_1D unorm tfe d16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dwordx2 v[0:1], v[1:2], off +; GFX10-NEXT: global_store_dword v[0:1], v3, off +; GFX10-NEXT: s_endpgm +; +; GFX8-UNPACKED-LABEL: load_1d_v4f16_tfe_dmask15: +; GFX8-UNPACKED: ; %bb.0: +; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-UNPACKED-NEXT: s_mov_b32 s11, s9 +; GFX8-UNPACKED-NEXT: s_mov_b32 s10, s8 +; GFX8-UNPACKED-NEXT: s_mov_b32 s9, s7 +; GFX8-UNPACKED-NEXT: s_mov_b32 s8, s6 +; GFX8-UNPACKED-NEXT: s_mov_b32 s7, s5 +; GFX8-UNPACKED-NEXT: s_mov_b32 s6, s4 +; GFX8-UNPACKED-NEXT: s_mov_b32 s5, s3 +; GFX8-UNPACKED-NEXT: s_mov_b32 s4, s2 +; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v4, v1 +; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v5, v1 +; GFX8-UNPACKED-NEXT: image_load v[1:5], v0, s[4:11] dmask:0xf unorm tfe d16 +; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) +; GFX8-UNPACKED-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX8-UNPACKED-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-UNPACKED-NEXT: v_or_b32_sdwa v2, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-UNPACKED-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-UNPACKED-NEXT: flat_store_dwordx2 v[0:1], v[1:2] +; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v5 +; GFX8-UNPACKED-NEXT: s_endpgm + %v = call { <4 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v4f16i32s.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 1, i32 0) + %v.data = extractvalue { <4 x half>, i32 } %v, 0 + %v.err = extractvalue { <4 x half>, i32 } %v, 1 + store volatile <4 x half> %v.data, <4 x half> addrspace(1)* undef + store volatile i32 %v.err, i32 addrspace(1)* undef + ret void +} + +declare { half, i32 } @llvm.amdgcn.image.load.1d.sl_f16i32s.i32(i32 immarg, i32, <8 x i32>, i32 immarg, i32 immarg) #0 +declare { <2 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v2f16i32s.i32(i32 immarg, i32, <8 x i32>, i32 immarg, i32 immarg) #0 +declare { <3 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v3f16i32s.i32(i32 immarg, i32, <8 x i32>, i32 immarg, i32 immarg) #0 +declare { <4 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v4f16i32s.i32(i32 immarg, i32, <8 x i32>, i32 immarg, i32 immarg) #0 + +attributes #0 = { nounwind readonly } -- 2.7.4
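
For reference, a minimal annotated sketch of the aggregate-returning call shape the new tests exercise, mirroring load_1d_v2f16_tfe_dmask1 above (the v2f16 case whose result was previously dropped); the operand roles noted in the comments (dmask, coord, rsrc, texfailctrl, cachepolicy) are a reading of the intrinsic signature rather than anything the patch spells out:

  define amdgpu_ps void @v2f16_tfe_sketch(<8 x i32> inreg %rsrc, i32 %s) {
    ; operands: dmask = 1, coord = %s, resource = %rsrc,
    ;           texfailctrl = 1 (bit 0 enables TFE), cachepolicy = 0
    %v = call { <2 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v2f16i32s.i32(i32 1, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
    ; first member: the d16 data; second member: the TFE status dword
    %v.data = extractvalue { <2 x half>, i32 } %v, 0
    %v.err = extractvalue { <2 x half>, i32 } %v, 1
    store volatile <2 x half> %v.data, <2 x half> addrspace(1)* undef
    store volatile i32 %v.err, i32 addrspace(1)* undef
    ret void
  }

  declare { <2 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v2f16i32s.i32(i32 immarg, i32, <8 x i32>, i32 immarg, i32 immarg) nounwind readonly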