From 9e1d2afc13c291fed0610bdfe89eb678890f7b88 Mon Sep 17 00:00:00 2001
From: Matt Arsenault
Date: Tue, 25 Feb 2020 14:45:07 -0500
Subject: [PATCH] AMDGPU/GlobalISel: Don't use vector G_EXTRACT in arg lowering

Create a wider source vector and unmerge with dead defs, like the
legalizer does. The legalization handling for G_EXTRACT is incomplete,
and it is preferable to keep everything in 32-bit pieces.

We should probably start moving these functions into utils, since we
have a growing number of places that do almost the same thing.
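
For example, a <3 x i16> argument arrives in two <2 x s16> registers.
The IRTranslator output changes roughly as follows (a simplified sketch
of the updated void_func_v3i16 test below; the value names are
illustrative, not the actual virtual register numbers):

  Before:
    %concat:_(<4 x s16>) = G_CONCAT_VECTORS %arg0(<2 x s16>), %arg1(<2 x s16>)
    %val:_(<3 x s16>) = G_EXTRACT %concat(<4 x s16>), 0

  After:
    %undef:_(<2 x s16>) = G_IMPLICIT_DEF
    %concat:_(<6 x s16>) = G_CONCAT_VECTORS %arg0(<2 x s16>), %arg1(<2 x s16>), %undef(<2 x s16>)
    %val:_(<3 x s16>), %dead:_(<3 x s16>) = G_UNMERGE_VALUES %concat(<6 x s16>)

The second <3 x s16> def of the unmerge is simply dead.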
---
 llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp | 54 +++++++++++----
 .../GlobalISel/irtranslator-function-args.ll | 79 ++++++++++++++++++----
 .../legalize-llvm.amdgcn.image.store.2d.d16.ll | 22 +++---
 3 files changed, 118 insertions(+), 37 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index afbd89b..ec0f38a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -480,6 +480,43 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
   return true;
 }
 
+/// Pack values \p SrcRegs to cover the vector type result \p DstRegs.
+static MachineInstrBuilder mergeVectorRegsToResultRegs(
+    MachineIRBuilder &B, ArrayRef<Register> DstRegs, ArrayRef<Register> SrcRegs) {
+  MachineRegisterInfo &MRI = *B.getMRI();
+  LLT LLTy = MRI.getType(DstRegs[0]);
+  LLT PartLLT = MRI.getType(SrcRegs[0]);
+
+  // Deal with v3s16 split into v2s16
+  LLT LCMTy = getLCMType(LLTy, PartLLT);
+  if (LCMTy == LLTy) {
+    // Common case where no padding is needed.
+    assert(DstRegs.size() == 1);
+    return B.buildConcatVectors(DstRegs[0], SrcRegs);
+  }
+
+  const int NumWide = LCMTy.getSizeInBits() / PartLLT.getSizeInBits();
+  Register Undef = B.buildUndef(PartLLT).getReg(0);
+
+  // Build vector of undefs.
+  SmallVector<Register, 8> WidenedSrcs(NumWide, Undef);
+
+  // Replace the first sources with the real registers.
+  std::copy(SrcRegs.begin(), SrcRegs.end(), WidenedSrcs.begin());
+
+  auto Widened = B.buildConcatVectors(LCMTy, WidenedSrcs);
+  int NumDst = LCMTy.getSizeInBits() / LLTy.getSizeInBits();
+
+  SmallVector<Register, 8> PadDstRegs(NumDst);
+  std::copy(DstRegs.begin(), DstRegs.end(), PadDstRegs.begin());
+
+  // Create the excess dead defs for the unmerge.
+  for (int I = DstRegs.size(); I != NumDst; ++I)
+    PadDstRegs[I] = MRI.createGenericVirtualRegister(LLTy);
+
+  return B.buildUnmerge(PadDstRegs, Widened);
+}
+
 // TODO: Move this to generic code
 static void packSplitRegsToOrigType(MachineIRBuilder &B,
                                     ArrayRef<Register> OrigRegs,
@@ -492,22 +529,9 @@ static void packSplitRegsToOrigType(MachineIRBuilder &B,
   }
 
   if (LLTy.isVector() && PartLLT.isVector()) {
+    assert(OrigRegs.size() == 1);
     assert(LLTy.getElementType() == PartLLT.getElementType());
-
-    int DstElts = LLTy.getNumElements();
-    int PartElts = PartLLT.getNumElements();
-    if (DstElts % PartElts == 0)
-      B.buildConcatVectors(OrigRegs[0], Regs);
-    else {
-      // Deal with v3s16 split into v2s16
-      assert(PartElts == 2 && DstElts % 2 != 0);
-      int RoundedElts = PartElts * ((DstElts + PartElts - 1) / PartElts);
-
-      LLT RoundedDestTy = LLT::vector(RoundedElts, PartLLT.getElementType());
-      auto RoundedConcat = B.buildConcatVectors(RoundedDestTy, Regs);
-      B.buildExtract(OrigRegs[0], RoundedConcat, 0);
-    }
-
+    mergeVectorRegsToResultRegs(B, OrigRegs, Regs);
     return;
   }
 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll
index eeb4380..b058c6f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll
@@ -814,10 +814,11 @@ define void @void_func_v3i16(<3 x i16> %arg0) #0 {
 ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
 ; CHECK: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
 ; CHECK: [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
- ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>)
- ; CHECK: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<4 x s16>), 0
- ; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
- ; CHECK: G_STORE [[EXTRACT]](<3 x s16>), [[DEF]](p1) :: (store 6 into `<3 x i16> addrspace(1)* undef`, align 8, addrspace 1)
+ ; CHECK: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
+ ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[DEF]](<2 x s16>)
+ ; CHECK: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>)
+ ; CHECK: [[DEF1:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+ ; CHECK: G_STORE [[UV]](<3 x s16>), [[DEF1]](p1) :: (store 6 into `<3 x i16> addrspace(1)* undef`, align 8, addrspace 1)
 ; CHECK: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
 ; CHECK: S_SETPC_B64_return [[COPY3]]
 store <3 x i16> %arg0, <3 x i16> addrspace(1)* undef
@@ -848,10 +849,11 @@ define void @void_func_v5i16(<5 x i16> %arg0) #0 {
 ; CHECK: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
 ; CHECK: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
 ; CHECK: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
- ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[COPY2]](<2 x s16>)
- ; CHECK: [[EXTRACT:%[0-9]+]]:_(<5 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0
- ; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
- ; CHECK: G_STORE [[EXTRACT]](<5 x s16>), [[DEF]](p1) :: (store 10 into `<5 x i16> addrspace(1)* undef`, align 16, addrspace 1)
+ ; CHECK: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
+ ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<10 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[COPY2]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>)
+ ; CHECK: [[UV:%[0-9]+]]:_(<5 x s16>), 
[[UV1:%[0-9]+]]:_(<5 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<10 x s16>) + ; CHECK: [[DEF1:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; CHECK: G_STORE [[UV]](<5 x s16>), [[DEF1]](p1) :: (store 10 into `<5 x i16> addrspace(1)* undef`, align 16, addrspace 1) ; CHECK: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]] ; CHECK: S_SETPC_B64_return [[COPY4]] store <5 x i16> %arg0, <5 x i16> addrspace(1)* undef @@ -898,6 +900,58 @@ define void @void_func_v16i16(<16 x i16> %arg0) #0 { ret void } +; <2 x i16> pieces that start spilling to the stack. +; FIXME: load of 2 would be sufficient for last piece +define void @void_func_v65i16(<65 x i16> %arg0) #0 { + ; CHECK-LABEL: name: void_func_v65i16 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 + ; CHECK: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3 + ; CHECK: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 + ; CHECK: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5 + ; CHECK: [[COPY6:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr6 + ; CHECK: [[COPY7:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr7 + ; CHECK: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr8 + ; CHECK: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr9 + ; CHECK: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr10 + ; CHECK: [[COPY11:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr11 + ; CHECK: [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr12 + ; CHECK: [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr13 + ; CHECK: [[COPY14:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr14 + ; CHECK: [[COPY15:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr15 + ; CHECK: [[COPY16:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr16 + ; CHECK: [[COPY17:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr17 + ; CHECK: [[COPY18:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr18 + ; CHECK: [[COPY19:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr19 + ; CHECK: [[COPY20:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr20 + ; CHECK: [[COPY21:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr21 + ; CHECK: [[COPY22:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr22 + ; CHECK: [[COPY23:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr23 + ; CHECK: [[COPY24:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr24 + ; CHECK: [[COPY25:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr25 + ; CHECK: [[COPY26:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr26 + ; CHECK: [[COPY27:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr27 + ; CHECK: [[COPY28:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr28 + ; CHECK: [[COPY29:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr29 + ; CHECK: [[COPY30:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr30 + ; CHECK: [[COPY31:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr31 + ; CHECK: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0 + ; CHECK: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load 4 from %fixed-stack.0, align 16, addrspace 5) + ; CHECK: [[COPY32:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF + ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<130 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[COPY6]](<2 x s16>), [[COPY7]](<2 x s16>), [[COPY8]](<2 x s16>), [[COPY9]](<2 x s16>), [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>), [[COPY12]](<2 x s16>), 
[[COPY13]](<2 x s16>), [[COPY14]](<2 x s16>), [[COPY15]](<2 x s16>), [[COPY16]](<2 x s16>), [[COPY17]](<2 x s16>), [[COPY18]](<2 x s16>), [[COPY19]](<2 x s16>), [[COPY20]](<2 x s16>), [[COPY21]](<2 x s16>), [[COPY22]](<2 x s16>), [[COPY23]](<2 x s16>), [[COPY24]](<2 x s16>), [[COPY25]](<2 x s16>), [[COPY26]](<2 x s16>), [[COPY27]](<2 x s16>), [[COPY28]](<2 x s16>), [[COPY29]](<2 x s16>), [[COPY30]](<2 x s16>), [[COPY31]](<2 x s16>), [[LOAD]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>) + ; CHECK: [[UV:%[0-9]+]]:_(<65 x s16>), [[UV1:%[0-9]+]]:_(<65 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<130 x s16>) + ; CHECK: [[DEF1:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; CHECK: G_STORE [[UV]](<65 x s16>), [[DEF1]](p1) :: (store 130 into `<65 x i16> addrspace(1)* undef`, align 256, addrspace 1) + ; CHECK: [[COPY33:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY32]] + ; CHECK: S_SETPC_B64_return [[COPY33]] + store <65 x i16> %arg0, <65 x i16> addrspace(1)* undef + ret void +} + define void @void_func_v2f32(<2 x float> %arg0) #0 { ; CHECK-LABEL: name: void_func_v2f32 ; CHECK: bb.1 (%ir-block.0): @@ -1191,10 +1245,11 @@ define void @void_func_v3f16(<3 x half> %arg0) #0 { ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; CHECK: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; CHECK: [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 - ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>) - ; CHECK: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<4 x s16>), 0 - ; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF - ; CHECK: G_STORE [[EXTRACT]](<3 x s16>), [[DEF]](p1) :: (store 6 into `<3 x half> addrspace(1)* undef`, align 8, addrspace 1) + ; CHECK: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF + ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[DEF]](<2 x s16>) + ; CHECK: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>) + ; CHECK: [[DEF1:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; CHECK: G_STORE [[UV]](<3 x s16>), [[DEF1]](p1) :: (store 6 into `<3 x half> addrspace(1)* undef`, align 8, addrspace 1) ; CHECK: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]] ; CHECK: S_SETPC_B64_return [[COPY3]] store <3 x half> %arg0, <3 x half> addrspace(1)* undef diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.store.2d.d16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.store.2d.d16.ll index a2b2817..7279a61 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.store.2d.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.store.2d.d16.ll @@ -105,15 +105,16 @@ define amdgpu_ps void @image_store_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, ; UNPACKED: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 ; UNPACKED: [[COPY11:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3 ; UNPACKED: 
[[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; UNPACKED: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>) - ; UNPACKED: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<4 x s16>), 0 - ; UNPACKED: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; UNPACKED: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 - ; UNPACKED: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) - ; UNPACKED: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; UNPACKED: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF + ; UNPACKED: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>), [[DEF]](<2 x s16>) + ; UNPACKED: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>) + ; UNPACKED: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; UNPACKED: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 + ; UNPACKED: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) + ; UNPACKED: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) ; UNPACKED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; UNPACKED: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; UNPACKED: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; UNPACKED: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) ; UNPACKED: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) ; UNPACKED: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) ; UNPACKED: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) @@ -137,9 +138,10 @@ define amdgpu_ps void @image_store_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, ; PACKED: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 ; PACKED: [[COPY11:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3 ; PACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; PACKED: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>) - ; PACKED: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<4 x s16>), 0 - ; PACKED: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.store.2d), [[EXTRACT]](<3 x s16>), 7, [[COPY8]](s32), [[COPY9]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable store 6 into custom "TargetCustom8", align 8) + ; PACKED: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF + ; PACKED: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>), [[DEF]](<2 x s16>) + ; PACKED: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>) + ; PACKED: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.store.2d), [[UV]](<3 x s16>), 7, [[COPY8]](s32), [[COPY9]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable store 6 into custom "TargetCustom8", align 8) ; PACKED: S_ENDPGM 0 call void @llvm.amdgcn.image.store.2d.v3f16.i32(<3 x half> %in, i32 7, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) ret void -- 2.7.4