From: Matt Arsenault
Date: Sat, 8 Feb 2020 04:37:54 +0000 (-0500)
Subject: AMDGPU/GlobalISel: Start selecting image intrinsics
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=48eda37282dbe81be5507cdacf7ff0ab7e059e72;p=platform%2Fupstream%2Fllvm.git

AMDGPU/GlobalISel: Start selecting image intrinsics

Does not handle atomics yet.
---

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h
index e585608..7667507 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h
@@ -9,6 +9,7 @@
 #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUGLOBALISELUTILS_H
 #define LLVM_LIB_TARGET_AMDGPU_AMDGPUGLOBALISELUTILS_H
 
+#include "AMDGPUInstrInfo.h"
 #include "llvm/CodeGen/Register.h"
 #include <tuple>
 
@@ -24,6 +25,37 @@ std::tuple
 getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg);
 
 bool isLegalVOP3PShuffleMask(ArrayRef<int> Mask);
+
+/// Return number of address arguments, and the number of gradients for an image
+/// intrinsic.
+inline std::pair<int, int>
+getImageNumVAddr(const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
+                 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode) {
+  const AMDGPU::MIMGDimInfo *DimInfo
+    = AMDGPU::getMIMGDimInfo(ImageDimIntr->Dim);
+
+  int NumGradients = BaseOpcode->Gradients ? DimInfo->NumGradients : 0;
+  int NumCoords = BaseOpcode->Coordinates ? DimInfo->NumCoords : 0;
+  int NumLCM = BaseOpcode->LodOrClampOrMip ? 1 : 0;
+  int NumVAddr = BaseOpcode->NumExtraArgs + NumGradients + NumCoords + NumLCM;
+  return {NumVAddr, NumGradients};
+}
+
+/// Return index of dmask in a gMIR image intrinsic.
+inline int getDMaskIdx(const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode,
+                       int NumDefs) {
+  assert(!BaseOpcode->Atomic);
+  return NumDefs + 1 + (BaseOpcode->Store ? 1 : 0);
+}
+
+/// Return first address operand index in a gMIR image intrinsic.
+inline int getImageVAddrIdxBegin(const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode,
+                                 int NumDefs) {
+  if (BaseOpcode->Atomic)
+    return NumDefs + 1 + (BaseOpcode->AtomicX2 ? 2 : 1);
+  return getDMaskIdx(BaseOpcode, NumDefs) + 1;
+}
+
 }
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index f3cfbc1..15d5bf2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1185,11 +1185,251 @@ bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
     .addImm(Offset)
     .addImm(IsGDS ? -1 : 0)
     .cloneMemRefs(MI);
-  MI.eraseFromParent();
   return true;
 }
 
+static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
+                         bool &IsTexFail) {
+  if (TexFailCtrl)
+    IsTexFail = true;
+
+  TFE = (TexFailCtrl & 0x1) ? 1 : 0;
+  TexFailCtrl &= ~(uint64_t)0x1;
+  LWE = (TexFailCtrl & 0x2) ? 1 : 0;
+  TexFailCtrl &= ~(uint64_t)0x2;
+
+  return TexFailCtrl == 0;
+}
+
+static bool parseCachePolicy(uint64_t Value,
+                             bool *GLC, bool *SLC, bool *DLC) {
+  if (GLC) {
+    *GLC = (Value & 0x1) ? 1 : 0;
+    Value &= ~(uint64_t)0x1;
+  }
+  if (SLC) {
+    *SLC = (Value & 0x2) ? 1 : 0;
+    Value &= ~(uint64_t)0x2;
+  }
+  if (DLC) {
+    *DLC = (Value & 0x4) ? 1 : 0;
+    Value &= ~(uint64_t)0x4;
+  }
+
+  return Value == 0;
+}
+
+bool AMDGPUInstructionSelector::selectImageIntrinsic(
+  MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
+  MachineBasicBlock *MBB = MI.getParent();
+  const DebugLoc &DL = MI.getDebugLoc();
+
+  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
+    AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
+
+  const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
+  const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
+    AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode);
+  const AMDGPU::MIMGMIPMappingInfo *MIPMappingInfo =
+    AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode);
+  unsigned IntrOpcode = Intr->BaseOpcode;
+  const bool IsGFX10 = STI.getGeneration() >= AMDGPUSubtarget::GFX10;
+
+  const LLT S16 = LLT::scalar(16);
+  const int VAddrIdx = getImageVAddrIdxBegin(BaseOpcode,
+                                             MI.getNumExplicitDefs());
+  int NumVAddr, NumGradients;
+  std::tie(NumVAddr, NumGradients) = getImageNumVAddr(Intr, BaseOpcode);
+
+  const LLT AddrTy = MRI->getType(MI.getOperand(VAddrIdx).getReg());
+  const bool IsA16 = AddrTy.getScalarType() == S16;
+
+  Register VData;
+  LLT VDataTy;
+  int NumVDataDwords = -1;
+  bool IsD16 = false;
+
+  // XXX - Can we just get the second to last argument for ctrl?
+  unsigned CtrlIdx; // Index of texfailctrl argument
+  bool Unorm;
+  if (!BaseOpcode->Sampler) {
+    Unorm = true;
+    CtrlIdx = VAddrIdx + NumVAddr + 1;
+  } else {
+    Unorm = MI.getOperand(VAddrIdx + NumVAddr + 2).getImm() != 0;
+    CtrlIdx = VAddrIdx + NumVAddr + 3;
+  }
+
+  bool TFE;
+  bool LWE;
+  bool IsTexFail = false;
+  if (!parseTexFail(MI.getOperand(CtrlIdx).getImm(), TFE, LWE, IsTexFail))
+    return false;
+
+  unsigned DMask = 0;
+  unsigned DMaskLanes = 0;
+
+  if (BaseOpcode->Atomic) {
+    return false; // TODO
+  } else {
+    const int DMaskIdx = 2; // Input/output + intrinsic ID.
+
+    DMask = MI.getOperand(DMaskIdx).getImm();
+    DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask);
+
+    if (BaseOpcode->Store) {
+      VData = MI.getOperand(1).getReg();
+      VDataTy = MRI->getType(VData);
+      NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
+    } else {
+      VData = MI.getOperand(0).getReg();
+      VDataTy = MRI->getType(VData);
+      NumVDataDwords = DMaskLanes;
+
+      // One memoperand is mandatory, except for getresinfo.
+      // FIXME: Check this in verifier.
+      if (!MI.memoperands_empty()) {
+        const MachineMemOperand *MMO = *MI.memoperands_begin();
+
+        // Infer d16 from the memory size, as the register type will be mangled by
+        // unpacked subtargets, or by TFE.
+        IsD16 = ((8 * MMO->getSize()) / DMaskLanes) < 32;
+
+        if (IsD16 && !STI.hasUnpackedD16VMem())
+          NumVDataDwords = (DMaskLanes + 1) / 2;
+      }
+    }
+  }
+
+  // Optimize _L to _LZ when _L is zero
+  if (LZMappingInfo) {
+    // The legalizer replaced the register with an immediate 0 if we need to
+    // change the opcode.
+    const MachineOperand &Lod = MI.getOperand(VAddrIdx + NumVAddr - 1);
+    if (Lod.isImm()) {
+      assert(Lod.getImm() == 0);
+      IntrOpcode = LZMappingInfo->LZ; // set new opcode to _lz variant of _l
+    }
+  }
+
+  // Optimize _mip away, when 'lod' is zero
+  if (MIPMappingInfo) {
+    const MachineOperand &Lod = MI.getOperand(VAddrIdx + NumVAddr - 1);
+    if (Lod.isImm()) {
+      assert(Lod.getImm() == 0);
+      IntrOpcode = MIPMappingInfo->NONMIP; // set new opcode to variant without _mip
+    }
+  }
+
+  // TODO: Check this in verifier.
+  assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");
+
+  bool GLC = false;
+  bool SLC = false;
+  bool DLC = false;
+  if (BaseOpcode->Atomic) {
+    GLC = true; // TODO no-return optimization
+    if (!parseCachePolicy(MI.getOperand(CtrlIdx + 1).getImm(), nullptr, &SLC,
+                          IsGFX10 ? &DLC : nullptr))
+      return false;
+  } else {
+    if (!parseCachePolicy(MI.getOperand(CtrlIdx + 1).getImm(), &GLC, &SLC,
+                          IsGFX10 ? &DLC : nullptr))
+      return false;
+  }
+
+  int NumVAddrRegs = 0;
+  int NumVAddrDwords = 0;
+  for (int I = 0; I < NumVAddr; ++I) {
+    // Skip the $noregs and 0s inserted during legalization.
+    MachineOperand &AddrOp = MI.getOperand(VAddrIdx + I);
+    if (!AddrOp.isReg())
+      continue; // XXX - Break?
+
+    Register Addr = AddrOp.getReg();
+    if (!Addr)
+      break;
+
+    ++NumVAddrRegs;
+    NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32;
+  }
+
+  // The legalizer preprocessed the intrinsic arguments. If we aren't using
+  // NSA, these should have been packed into a single value in the first
+  // address register.
+  const bool UseNSA = NumVAddrRegs != 1 && NumVAddrDwords == NumVAddrRegs;
+  if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
+    LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
+    return false;
+  }
+
+  if (IsTexFail)
+    ++NumVDataDwords;
+
+  int Opcode = -1;
+  if (IsGFX10) {
+    Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
+                                   UseNSA ? AMDGPU::MIMGEncGfx10NSA
+                                          : AMDGPU::MIMGEncGfx10Default,
+                                   NumVDataDwords, NumVAddrDwords);
+  } else {
+    if (STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
+      Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
+                                     NumVDataDwords, NumVAddrDwords);
+    if (Opcode == -1)
+      Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
+                                     NumVDataDwords, NumVAddrDwords);
+  }
+  assert(Opcode != -1);
+
+  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode))
+    .cloneMemRefs(MI);
+
+  if (!BaseOpcode->Store || BaseOpcode->Atomic)
+    MIB.addDef(VData); // vdata output
+
+  if (BaseOpcode->Store || BaseOpcode->Atomic)
+    MIB.addReg(VData); // vdata input
+
+  for (int i = 0; i != NumVAddrRegs; ++i) {
+    MachineOperand &SrcOp = MI.getOperand(VAddrIdx + i);
+    if (SrcOp.isReg()) {
+      assert(SrcOp.getReg() != 0);
+      MIB.addReg(SrcOp.getReg());
+    }
+  }
+
+  MIB.addReg(MI.getOperand(VAddrIdx + NumVAddr).getReg()); // rsrc
+  if (BaseOpcode->Sampler)
+    MIB.addReg(MI.getOperand(VAddrIdx + NumVAddr + 1).getReg()); // sampler
+
+  MIB.addImm(DMask); // dmask
+
+  if (IsGFX10)
+    MIB.addImm(DimInfo->Encoding);
+  MIB.addImm(Unorm);
+  if (IsGFX10)
+    MIB.addImm(DLC);
+
+  MIB.addImm(GLC);
+  MIB.addImm(SLC);
+  MIB.addImm(IsA16 &&  // a16 or r128
+             STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
+  if (IsGFX10)
+    MIB.addImm(IsA16 ? -1 : 0);
+
+  MIB.addImm(TFE); // tfe
+  MIB.addImm(LWE); // lwe
+  if (!IsGFX10)
+    MIB.addImm(DimInfo->DA ? -1 : 0);
+  if (BaseOpcode->HasD16)
+    MIB.addImm(IsD16 ? -1 : 0);
+
+  MI.eraseFromParent();
+  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+}
+
 bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
     MachineInstr &I) const {
   unsigned IntrinsicID = I.getIntrinsicID();
@@ -1210,9 +1450,10 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
     return selectDSAppendConsume(I, true);
   case Intrinsic::amdgcn_ds_consume:
     return selectDSAppendConsume(I, false);
-  default:
+  default: {
     return selectImpl(I, *CoverageInfo);
   }
+  }
 }
 
 bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
@@ -2371,6 +2612,13 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
   case AMDGPU::G_AMDGPU_ATOMIC_DEC:
     initM0(I);
     return selectImpl(I, *CoverageInfo);
+  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
+  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: {
+    const AMDGPU::ImageDimIntrinsicInfo *Intr
+      = AMDGPU::getImageDimIntrinsicInfo(I.getIntrinsicID());
+    assert(Intr && "not an image intrinsic with image pseudo");
+    return selectImageIntrinsic(I, Intr);
+  }
   default:
     return selectImpl(I, *CoverageInfo);
   }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index dd29fb4..8557756 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -31,6 +31,10 @@ namespace {
 
 namespace llvm {
 
+namespace AMDGPU {
+struct ImageDimIntrinsicInfo;
+}
+
 class AMDGPUInstrInfo;
 class AMDGPURegisterBankInfo;
 class GCNSubtarget;
@@ -107,6 +111,8 @@ private:
   bool selectDSGWSIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const;
   bool selectDSAppendConsume(MachineInstr &MI, bool IsAppend) const;
+  bool selectImageIntrinsic(MachineInstr &MI,
+                            const AMDGPU::ImageDimIntrinsicInfo *Intr) const;
   bool selectG_INTRINSIC_W_SIDE_EFFECTS(MachineInstr &I) const;
   int getS_CMPOpcode(CmpInst::Predicate P, unsigned Size) const;
   bool selectG_ICMP(MachineInstr &I) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 875fa57..8de8b33 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -3619,34 +3619,6 @@ static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
   }
 }
 
-/// Return number of address arguments, and the number of gradients
-static std::pair<int, int>
-getImageNumVAddr(const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
-                 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode) {
-  const AMDGPU::MIMGDimInfo *DimInfo
-    = AMDGPU::getMIMGDimInfo(ImageDimIntr->Dim);
-
-  int NumGradients = BaseOpcode->Gradients ? DimInfo->NumGradients : 0;
-  int NumCoords = BaseOpcode->Coordinates ? DimInfo->NumCoords : 0;
-  int NumLCM = BaseOpcode->LodOrClampOrMip ? 1 : 0;
-  int NumVAddr = BaseOpcode->NumExtraArgs + NumGradients + NumCoords + NumLCM;
-  return {NumVAddr, NumGradients};
-}
-
-static int getDMaskIdx(const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode,
-                       int NumDefs) {
-  assert(!BaseOpcode->Atomic);
-  return NumDefs + 1 + (BaseOpcode->Store ? 1 : 0);
-}
-
-/// Return first address operand index in an image intrinsic.
-static int getImageVAddrIdxBegin(const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode,
-                                 int NumDefs) {
-  if (BaseOpcode->Atomic)
-    return NumDefs + 1 + (BaseOpcode->AtomicX2 ? 2 : 1);
-  return getDMaskIdx(BaseOpcode, NumDefs) + 1;
-}
-
 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
/// /// Depending on the subtarget, load/store with 16-bit element data need to be @@ -3772,8 +3744,7 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic( // // SIShrinkInstructions will convert NSA encodings to non-NSA after register // allocation when possible. - const bool UseNSA = CorrectedNumVAddrs >= 3 && - ST.hasFeature(AMDGPU::FeatureNSAEncoding); + const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding(); // Rewrite the addressing register layout before doing anything else. if (IsA16) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 3f07876..7a49f76 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -2920,6 +2920,10 @@ AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI, continue; Register OpReg = MI.getOperand(I).getReg(); + // We replace some dead address operands with $noreg + if (!OpReg) + continue; + unsigned Size = getSizeInBits(OpReg, MRI, *TRI); // FIXME: Probably need a new intrinsic register bank searchable table to diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/image_ls_mipmap_zero.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/image_ls_mipmap_zero.ll index dd4b193..afcc55a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/image_ls_mipmap_zero.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/image_ls_mipmap_zero.ll @@ -1,600 +1,398 @@ -; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -stop-after=legalizer -o - %s | FileCheck -check-prefix=GFX9 %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -stop-after=legalizer -o - %s | FileCheck -check-prefix=GFX10 %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -o - %s | FileCheck -check-prefix=GFX10 %s define amdgpu_ps <4 x float> @load_mip_1d(<8 x i32> inreg %rsrc, i32 %s) { - ; GFX9-LABEL: name: load_mip_1d - ; GFX9: bb.1.main_body: - ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0 - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.1d), 15, [[COPY8]](s32), 0, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9: $vgpr0 = COPY [[UV]](s32) - ; GFX9: $vgpr1 = COPY [[UV1]](s32) - ; GFX9: $vgpr2 = COPY [[UV2]](s32) - ; GFX9: $vgpr3 = COPY [[UV3]](s32) - ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, 
implicit $vgpr2, implicit $vgpr3 - ; GFX10-LABEL: name: load_mip_1d - ; GFX10: bb.1.main_body: - ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0 - ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.1d), 15, [[COPY8]](s32), 0, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10: $vgpr0 = COPY [[UV]](s32) - ; GFX10: $vgpr1 = COPY [[UV1]](s32) - ; GFX10: $vgpr2 = COPY [[UV2]](s32) - ; GFX10: $vgpr3 = COPY [[UV3]](s32) - ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +; GFX9-LABEL: load_mip_1d: +; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf unorm +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: load_mip_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.load.mip.1d.v4f32.i32(i32 15, i32 %s, i32 0, <8 x i32> %rsrc, i32 0, i32 0) ret <4 x float> %v } define amdgpu_ps <4 x float> @load_mip_2d(<8 x i32> inreg %rsrc, i32 %s, i32 %t) { - ; GFX9-LABEL: name: load_mip_2d - ; GFX9: bb.1.main_body: - ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1 - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX9: 
[[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.2d), 15, [[BUILD_VECTOR1]](<2 x s32>), $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9: $vgpr0 = COPY [[UV]](s32) - ; GFX9: $vgpr1 = COPY [[UV1]](s32) - ; GFX9: $vgpr2 = COPY [[UV2]](s32) - ; GFX9: $vgpr3 = COPY [[UV3]](s32) - ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - ; GFX10-LABEL: name: load_mip_2d - ; GFX10: bb.1.main_body: - ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1 - ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.2d), 15, [[BUILD_VECTOR1]](<2 x s32>), $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10: $vgpr0 = COPY [[UV]](s32) - ; GFX10: $vgpr1 = COPY [[UV1]](s32) - ; GFX10: $vgpr2 = COPY [[UV2]](s32) - ; GFX10: $vgpr3 = COPY [[UV3]](s32) - ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +; GFX9-LABEL: load_mip_2d: +; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: load_mip_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.load.mip.2d.v4f32.i32(i32 15, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0) ret <4 x float> %v } define amdgpu_ps <4 x float> 
@load_mip_3d(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %u) { - ; GFX9-LABEL: name: load_mip_3d - ; GFX9: bb.1.main_body: - ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2 - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32) - ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.3d), 15, [[BUILD_VECTOR1]](<3 x s32>), $noreg, $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9: $vgpr0 = COPY [[UV]](s32) - ; GFX9: $vgpr1 = COPY [[UV1]](s32) - ; GFX9: $vgpr2 = COPY [[UV2]](s32) - ; GFX9: $vgpr3 = COPY [[UV3]](s32) - ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - ; GFX10-LABEL: name: load_mip_3d - ; GFX10: bb.1.main_body: - ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2 - ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.3d), 15, [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), 0, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10: $vgpr0 = COPY [[UV]](s32) - ; GFX10: $vgpr1 = COPY [[UV1]](s32) - ; GFX10: $vgpr2 = COPY [[UV2]](s32) - ; GFX10: $vgpr3 = COPY [[UV3]](s32) - ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +; GFX9-LABEL: load_mip_3d: +; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 +; GFX9-NEXT: s_mov_b32 s3, s5 +; 
GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: image_load v[0:3], v[0:2], s[0:7] dmask:0xf unorm +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: load_mip_3d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_load v[0:3], v[0:2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.load.mip.3d.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %u, i32 0, <8 x i32> %rsrc, i32 0, i32 0) ret <4 x float> %v } define amdgpu_ps <4 x float> @load_mip_1darray(<8 x i32> inreg %rsrc, i32 %s, i32 %t) { - ; GFX9-LABEL: name: load_mip_1darray - ; GFX9: bb.1.main_body: - ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1 - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.1darray), 15, [[BUILD_VECTOR1]](<2 x s32>), $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9: $vgpr0 = COPY [[UV]](s32) - ; GFX9: $vgpr1 = COPY [[UV1]](s32) - ; GFX9: $vgpr2 = COPY [[UV2]](s32) - ; GFX9: $vgpr3 = COPY [[UV3]](s32) - ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - ; GFX10-LABEL: name: load_mip_1darray - ; GFX10: bb.1.main_body: - ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1 - ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX10: 
[[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.1darray), 15, [[BUILD_VECTOR1]](<2 x s32>), $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10: $vgpr0 = COPY [[UV]](s32) - ; GFX10: $vgpr1 = COPY [[UV1]](s32) - ; GFX10: $vgpr2 = COPY [[UV2]](s32) - ; GFX10: $vgpr3 = COPY [[UV3]](s32) - ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +; GFX9-LABEL: load_mip_1darray: +; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm da +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: load_mip_1darray: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.load.mip.1darray.v4f32.i32(i32 15, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0) ret <4 x float> %v } define amdgpu_ps <4 x float> @load_mip_2darray(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %u) { - ; GFX9-LABEL: name: load_mip_2darray - ; GFX9: bb.1.main_body: - ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2 - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32) - ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.2darray), 15, [[BUILD_VECTOR1]](<3 x s32>), $noreg, $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9: $vgpr0 = COPY [[UV]](s32) - ; GFX9: $vgpr1 = COPY [[UV1]](s32) 
- ; GFX9: $vgpr2 = COPY [[UV2]](s32) - ; GFX9: $vgpr3 = COPY [[UV3]](s32) - ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - ; GFX10-LABEL: name: load_mip_2darray - ; GFX10: bb.1.main_body: - ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2 - ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.2darray), 15, [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), 0, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10: $vgpr0 = COPY [[UV]](s32) - ; GFX10: $vgpr1 = COPY [[UV1]](s32) - ; GFX10: $vgpr2 = COPY [[UV2]](s32) - ; GFX10: $vgpr3 = COPY [[UV3]](s32) - ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +; GFX9-LABEL: load_mip_2darray: +; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: image_load v[0:3], v[0:2], s[0:7] dmask:0xf unorm da +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: load_mip_2darray: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_load v[0:3], v[0:2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.load.mip.2darray.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %u, i32 0, <8 x i32> %rsrc, i32 0, i32 0) ret <4 x float> %v } define amdgpu_ps <4 x float> @load_mip_cube(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %u) { - ; GFX9-LABEL: name: load_mip_cube - ; GFX9: bb.1.main_body: - ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2 - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; 
GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32) - ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.cube), 15, [[BUILD_VECTOR1]](<3 x s32>), $noreg, $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9: $vgpr0 = COPY [[UV]](s32) - ; GFX9: $vgpr1 = COPY [[UV1]](s32) - ; GFX9: $vgpr2 = COPY [[UV2]](s32) - ; GFX9: $vgpr3 = COPY [[UV3]](s32) - ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - ; GFX10-LABEL: name: load_mip_cube - ; GFX10: bb.1.main_body: - ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2 - ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.cube), 15, [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), 0, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10: $vgpr0 = COPY [[UV]](s32) - ; GFX10: $vgpr1 = COPY [[UV1]](s32) - ; GFX10: $vgpr2 = COPY [[UV2]](s32) - ; GFX10: $vgpr3 = COPY [[UV3]](s32) - ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +; GFX9-LABEL: load_mip_cube: +; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: image_load v[0:3], v[0:2], s[0:7] dmask:0xf unorm da +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: load_mip_cube: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: 
s_mov_b32 s7, s9 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_load v[0:3], v[0:2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.load.mip.cube.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %u, i32 0, <8 x i32> %rsrc, i32 0, i32 0) ret <4 x float> %v } define amdgpu_ps void @store_mip_1d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s) { - ; GFX9-LABEL: name: store_mip_1d - ; GFX9: bb.1.main_body: - ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.1d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[COPY12]](s32), 0, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable store 16 into custom "TargetCustom8") - ; GFX9: S_ENDPGM 0 - ; GFX10-LABEL: name: store_mip_1d - ; GFX10: bb.1.main_body: - ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 - ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX10: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.1d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[COPY12]](s32), 0, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable store 16 into custom "TargetCustom8") - ; GFX10: S_ENDPGM 0 +; GFX9-LABEL: store_mip_1d: +; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 
s6, s8 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf unorm +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: store_mip_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm +; GFX10-NEXT: s_endpgm main_body: call void @llvm.amdgcn.image.store.mip.1d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 0, <8 x i32> %rsrc, i32 0, i32 0) ret void } define amdgpu_ps void @store_mip_2d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t) { - ; GFX9-LABEL: name: store_mip_2d - ; GFX9: bb.1.main_body: - ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX9: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32) - ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.2d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR2]](<2 x s32>), $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable store 16 into custom "TargetCustom8") - ; GFX9: S_ENDPGM 0 - ; GFX10-LABEL: name: store_mip_2d - ; GFX10: bb.1.main_body: - ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 - ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = 
G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX10: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32) - ; GFX10: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.2d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR2]](<2 x s32>), $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable store 16 into custom "TargetCustom8") - ; GFX10: S_ENDPGM 0 +; GFX9-LABEL: store_mip_2d: +; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: store_mip_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm +; GFX10-NEXT: s_endpgm main_body: call void @llvm.amdgcn.image.store.mip.2d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0) ret void } define amdgpu_ps void @store_mip_3d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %u) { - ; GFX9-LABEL: name: store_mip_3d - ; GFX9: bb.1.main_body: - ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6 - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr6 - ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX9: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32) - ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.3d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR2]](<3 x s32>), $noreg, $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable store 16 into custom "TargetCustom8") - ; GFX9: S_ENDPGM 0 - ; GFX10-LABEL: name: store_mip_3d - ; GFX10: bb.1.main_body: - ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6 - ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX10: 
[[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr6 - ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX10: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.3d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), 0, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable store 16 into custom "TargetCustom8") - ; GFX10: S_ENDPGM 0 +; GFX9-LABEL: store_mip_3d: +; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: image_store v[0:3], v[4:6], s[0:7] dmask:0xf unorm +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: store_mip_3d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_store v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm +; GFX10-NEXT: s_endpgm main_body: call void @llvm.amdgcn.image.store.mip.3d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %u, i32 0, <8 x i32> %rsrc, i32 0, i32 0) ret void } define amdgpu_ps void @store_mip_1darray(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t) { - ; GFX9-LABEL: name: store_mip_1darray - ; GFX9: bb.1.main_body: - ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), 
[[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX9: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32) - ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.1darray), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR2]](<2 x s32>), $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable store 16 into custom "TargetCustom8") - ; GFX9: S_ENDPGM 0 - ; GFX10-LABEL: name: store_mip_1darray - ; GFX10: bb.1.main_body: - ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 - ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX10: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32) - ; GFX10: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.1darray), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR2]](<2 x s32>), $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable store 16 into custom "TargetCustom8") - ; GFX10: S_ENDPGM 0 +; GFX9-LABEL: store_mip_1darray: +; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm da +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: store_mip_1darray: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm +; GFX10-NEXT: s_endpgm main_body: call void @llvm.amdgcn.image.store.mip.1darray.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0) ret void } define amdgpu_ps void @store_mip_2darray(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %u) { - ; GFX9-LABEL: name: store_mip_2darray - ; GFX9: bb.1.main_body: - ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6 - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX9: 
[[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr6 - ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX9: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32) - ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.2darray), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR2]](<3 x s32>), $noreg, $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable store 16 into custom "TargetCustom8") - ; GFX9: S_ENDPGM 0 - ; GFX10-LABEL: name: store_mip_2darray - ; GFX10: bb.1.main_body: - ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6 - ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr6 - ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX10: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.2darray), [[BUILD_VECTOR1]](<4 x s32>), 15, [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), 0, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable store 16 into custom "TargetCustom8") - ; GFX10: S_ENDPGM 0 +; GFX9-LABEL: store_mip_2darray: +; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: image_store v[0:3], v[4:6], s[0:7] dmask:0xf unorm da +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: store_mip_2darray: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; 
GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_store v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm +; GFX10-NEXT: s_endpgm main_body: call void @llvm.amdgcn.image.store.mip.2darray.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %u, i32 0, <8 x i32> %rsrc, i32 0, i32 0) ret void } define amdgpu_ps void @store_mip_cube(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %u) { - ; GFX9-LABEL: name: store_mip_cube - ; GFX9: bb.1.main_body: - ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6 - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr6 - ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX9: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32) - ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.cube), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR2]](<3 x s32>), $noreg, $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable store 16 into custom "TargetCustom8") - ; GFX9: S_ENDPGM 0 - ; GFX10-LABEL: name: store_mip_cube - ; GFX10: bb.1.main_body: - ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6 - ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr6 - ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX10: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.cube), [[BUILD_VECTOR1]](<4 x s32>), 15, 
[[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), 0, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable store 16 into custom "TargetCustom8") - ; GFX10: S_ENDPGM 0 +; GFX9-LABEL: store_mip_cube: +; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: image_store v[0:3], v[4:6], s[0:7] dmask:0xf unorm da +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: store_mip_cube: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_store v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm +; GFX10-NEXT: s_endpgm main_body: call void @llvm.amdgcn.image.store.mip.cube.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %u, i32 0, <8 x i32> %rsrc, i32 0, i32 0) ret void diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll index d798e26..dd99fbd 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll @@ -1,1014 +1,760 @@ -; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -stop-after=legalizer -o - %s | FileCheck -check-prefix=GFX9 %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -stop-after=legalizer -o - %s | FileCheck -check-prefix=GFX10NSA %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -o - %s | FileCheck -check-prefix=GFX10NSA %s define amdgpu_ps <4 x float> @gather4_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t) { - ; GFX9-LABEL: name: gather4_2d - ; GFX9: bb.1.main_body: - ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1 - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX9: 
[[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32) - ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) - ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32) - ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.2d), 1, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9: $vgpr0 = COPY [[UV]](s32) - ; GFX9: $vgpr1 = COPY [[UV1]](s32) - ; GFX9: $vgpr2 = COPY [[UV2]](s32) - ; GFX9: $vgpr3 = COPY [[UV3]](s32) - ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - ; GFX10NSA-LABEL: name: gather4_2d - ; GFX10NSA: bb.1.main_body: - ; GFX10NSA: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1 - ; GFX10NSA: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX10NSA: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX10NSA: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX10NSA: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX10NSA: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX10NSA: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX10NSA: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX10NSA: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX10NSA: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX10NSA: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10NSA: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32) - ; GFX10NSA: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) - ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32) - ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.2d), 1, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10NSA: $vgpr0 = COPY [[UV]](s32) - ; GFX10NSA: $vgpr1 = COPY [[UV1]](s32) - ; GFX10NSA: $vgpr2 = COPY [[UV2]](s32) - ; GFX10NSA: $vgpr3 = COPY [[UV3]](s32) - ; GFX10NSA: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +; GFX9-LABEL: gather4_2d: +; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, 
s11 +; GFX9-NEXT: s_mov_b64 s[14:15], exec +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: s_wqm_b64 exec, exec +; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v1 +; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] +; GFX9-NEXT: image_gather4 v[0:3], v0, s[0:7], s[8:11] dmask:0x1 a16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10NSA-LABEL: gather4_2d: +; GFX10NSA: ; %bb.0: ; %main_body +; GFX10NSA-NEXT: s_mov_b32 s0, s2 +; GFX10NSA-NEXT: s_mov_b32 s1, s3 +; GFX10NSA-NEXT: s_mov_b32 s2, s4 +; GFX10NSA-NEXT: s_mov_b32 s3, s5 +; GFX10NSA-NEXT: s_mov_b32 s4, s6 +; GFX10NSA-NEXT: s_mov_b32 s5, s7 +; GFX10NSA-NEXT: s_mov_b32 s6, s8 +; GFX10NSA-NEXT: s_mov_b32 s7, s9 +; GFX10NSA-NEXT: s_mov_b32 s8, s10 +; GFX10NSA-NEXT: s_mov_b32 s9, s11 +; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo +; GFX10NSA-NEXT: s_mov_b32 s10, s12 +; GFX10NSA-NEXT: s_mov_b32 s11, s13 +; GFX10NSA-NEXT: ; implicit-def: $vcc_hi +; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10NSA-NEXT: v_and_or_b32 v0, v0, 0xffff, v1 +; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 +; GFX10NSA-NEXT: image_gather4 v[0:3], v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX10NSA-NEXT: s_waitcnt vmcnt(0) +; GFX10NSA-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f16(i32 1, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) ret <4 x float> %v } define amdgpu_ps <4 x float> @gather4_cube(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %face) { - ; GFX9-LABEL: name: gather4_cube - ; GFX9: bb.1.main_body: - ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32) - ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) - ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32) - ; GFX9: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) - ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY17]](s32), [[DEF]](s32) - ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x 
s16>) - ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.cube), 1, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9: $vgpr0 = COPY [[UV]](s32) - ; GFX9: $vgpr1 = COPY [[UV1]](s32) - ; GFX9: $vgpr2 = COPY [[UV2]](s32) - ; GFX9: $vgpr3 = COPY [[UV3]](s32) - ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - ; GFX10NSA-LABEL: name: gather4_cube - ; GFX10NSA: bb.1.main_body: - ; GFX10NSA: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 - ; GFX10NSA: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX10NSA: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX10NSA: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX10NSA: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX10NSA: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX10NSA: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX10NSA: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX10NSA: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX10NSA: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX10NSA: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10NSA: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX10NSA: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32) - ; GFX10NSA: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) - ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32) - ; GFX10NSA: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) - ; GFX10NSA: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY17]](s32), [[DEF]](s32) - ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.cube), 1, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10NSA: $vgpr0 = COPY [[UV]](s32) - ; GFX10NSA: $vgpr1 = COPY [[UV1]](s32) - ; GFX10NSA: $vgpr2 = COPY [[UV2]](s32) - ; GFX10NSA: $vgpr3 = COPY [[UV3]](s32) - ; GFX10NSA: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +; GFX9-LABEL: gather4_cube: +; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s5, 
s7 +; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b64 s[14:15], exec +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: s_wqm_b64 exec, exec +; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: s_lshl_b32 s12, s0, 16 +; GFX9-NEXT: v_and_or_b32 v0, v0, v3, v1 +; GFX9-NEXT: v_and_or_b32 v1, v2, v3, s12 +; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] +; GFX9-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 a16 da +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10NSA-LABEL: gather4_cube: +; GFX10NSA: ; %bb.0: ; %main_body +; GFX10NSA-NEXT: s_mov_b32 s0, s2 +; GFX10NSA-NEXT: s_mov_b32 s1, s3 +; GFX10NSA-NEXT: s_mov_b32 s2, s4 +; GFX10NSA-NEXT: s_mov_b32 s3, s5 +; GFX10NSA-NEXT: s_mov_b32 s4, s6 +; GFX10NSA-NEXT: s_mov_b32 s5, s7 +; GFX10NSA-NEXT: s_mov_b32 s6, s8 +; GFX10NSA-NEXT: s_mov_b32 s7, s9 +; GFX10NSA-NEXT: s_mov_b32 s8, s10 +; GFX10NSA-NEXT: s_mov_b32 s9, s11 +; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo +; GFX10NSA-NEXT: s_mov_b32 s10, s12 +; GFX10NSA-NEXT: s_mov_b32 s11, s13 +; GFX10NSA-NEXT: ; implicit-def: $vcc_hi +; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX10NSA-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10NSA-NEXT: s_lshl_b32 s12, s0, 16 +; GFX10NSA-NEXT: v_and_or_b32 v0, v0, v3, v1 +; GFX10NSA-NEXT: v_and_or_b32 v1, v2, v3, s12 +; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 +; GFX10NSA-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_CUBE a16 +; GFX10NSA-NEXT: s_waitcnt vmcnt(0) +; GFX10NSA-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.cube.v4f32.f16(i32 1, half %s, half %t, half %face, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) ret <4 x float> %v } define amdgpu_ps <4 x float> @gather4_2darray(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %slice) { - ; GFX9-LABEL: name: gather4_2darray - ; GFX9: bb.1.main_body: - ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32) - ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) - ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC 
[[COPY15]](s32), [[COPY16]](s32) - ; GFX9: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) - ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY17]](s32), [[DEF]](s32) - ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) - ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.2darray), 1, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9: $vgpr0 = COPY [[UV]](s32) - ; GFX9: $vgpr1 = COPY [[UV1]](s32) - ; GFX9: $vgpr2 = COPY [[UV2]](s32) - ; GFX9: $vgpr3 = COPY [[UV3]](s32) - ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - ; GFX10NSA-LABEL: name: gather4_2darray - ; GFX10NSA: bb.1.main_body: - ; GFX10NSA: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 - ; GFX10NSA: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX10NSA: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX10NSA: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX10NSA: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX10NSA: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX10NSA: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX10NSA: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX10NSA: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX10NSA: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX10NSA: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10NSA: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX10NSA: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32) - ; GFX10NSA: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) - ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32) - ; GFX10NSA: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) - ; GFX10NSA: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY17]](s32), [[DEF]](s32) - ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.2darray), 1, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10NSA: $vgpr0 = COPY [[UV]](s32) - ; GFX10NSA: $vgpr1 = COPY [[UV1]](s32) - ; GFX10NSA: $vgpr2 = COPY 
[[UV2]](s32) - ; GFX10NSA: $vgpr3 = COPY [[UV3]](s32) - ; GFX10NSA: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +; GFX9-LABEL: gather4_2darray: +; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b64 s[14:15], exec +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: s_wqm_b64 exec, exec +; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: s_lshl_b32 s12, s0, 16 +; GFX9-NEXT: v_and_or_b32 v0, v0, v3, v1 +; GFX9-NEXT: v_and_or_b32 v1, v2, v3, s12 +; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] +; GFX9-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 a16 da +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10NSA-LABEL: gather4_2darray: +; GFX10NSA: ; %bb.0: ; %main_body +; GFX10NSA-NEXT: s_mov_b32 s0, s2 +; GFX10NSA-NEXT: s_mov_b32 s1, s3 +; GFX10NSA-NEXT: s_mov_b32 s2, s4 +; GFX10NSA-NEXT: s_mov_b32 s3, s5 +; GFX10NSA-NEXT: s_mov_b32 s4, s6 +; GFX10NSA-NEXT: s_mov_b32 s5, s7 +; GFX10NSA-NEXT: s_mov_b32 s6, s8 +; GFX10NSA-NEXT: s_mov_b32 s7, s9 +; GFX10NSA-NEXT: s_mov_b32 s8, s10 +; GFX10NSA-NEXT: s_mov_b32 s9, s11 +; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo +; GFX10NSA-NEXT: s_mov_b32 s10, s12 +; GFX10NSA-NEXT: s_mov_b32 s11, s13 +; GFX10NSA-NEXT: ; implicit-def: $vcc_hi +; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX10NSA-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10NSA-NEXT: s_lshl_b32 s12, s0, 16 +; GFX10NSA-NEXT: v_and_or_b32 v0, v0, v3, v1 +; GFX10NSA-NEXT: v_and_or_b32 v1, v2, v3, s12 +; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 +; GFX10NSA-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY a16 +; GFX10NSA-NEXT: s_waitcnt vmcnt(0) +; GFX10NSA-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.2darray.v4f32.f16(i32 1, half %s, half %t, half %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) ret <4 x float> %v } define amdgpu_ps <4 x float> @gather4_c_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %t) { - ; GFX9-LABEL: name: gather4_c_2d - ; GFX9: bb.1.main_body: - ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), 
[[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX9: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) - ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) - ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) - ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32) - ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>) - ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.c.2d), 1, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9: $vgpr0 = COPY [[UV]](s32) - ; GFX9: $vgpr1 = COPY [[UV1]](s32) - ; GFX9: $vgpr2 = COPY [[UV2]](s32) - ; GFX9: $vgpr3 = COPY [[UV3]](s32) - ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - ; GFX10NSA-LABEL: name: gather4_c_2d - ; GFX10NSA: bb.1.main_body: - ; GFX10NSA: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 - ; GFX10NSA: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX10NSA: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX10NSA: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX10NSA: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX10NSA: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX10NSA: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX10NSA: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX10NSA: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX10NSA: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX10NSA: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10NSA: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) - ; GFX10NSA: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) - ; GFX10NSA: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) - ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32) - ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.c.2d), 1, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10NSA: $vgpr0 = COPY [[UV]](s32) - ; 
GFX10NSA: $vgpr1 = COPY [[UV1]](s32) - ; GFX10NSA: $vgpr2 = COPY [[UV2]](s32) - ; GFX10NSA: $vgpr3 = COPY [[UV3]](s32) - ; GFX10NSA: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +; GFX9-LABEL: gather4_c_2d: +; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b64 s[14:15], exec +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: s_wqm_b64 exec, exec +; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_and_or_b32 v1, v1, v3, v2 +; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] +; GFX9-NEXT: image_gather4_c v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 a16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10NSA-LABEL: gather4_c_2d: +; GFX10NSA: ; %bb.0: ; %main_body +; GFX10NSA-NEXT: s_mov_b32 s0, s2 +; GFX10NSA-NEXT: s_mov_b32 s1, s3 +; GFX10NSA-NEXT: s_mov_b32 s2, s4 +; GFX10NSA-NEXT: s_mov_b32 s3, s5 +; GFX10NSA-NEXT: s_mov_b32 s4, s6 +; GFX10NSA-NEXT: s_mov_b32 s5, s7 +; GFX10NSA-NEXT: s_mov_b32 s6, s8 +; GFX10NSA-NEXT: s_mov_b32 s7, s9 +; GFX10NSA-NEXT: s_mov_b32 s8, s10 +; GFX10NSA-NEXT: s_mov_b32 s9, s11 +; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo +; GFX10NSA-NEXT: s_mov_b32 s10, s12 +; GFX10NSA-NEXT: s_mov_b32 s11, s13 +; GFX10NSA-NEXT: ; implicit-def: $vcc_hi +; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10NSA-NEXT: v_and_or_b32 v1, v1, 0xffff, v2 +; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 +; GFX10NSA-NEXT: image_gather4_c v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX10NSA-NEXT: s_waitcnt vmcnt(0) +; GFX10NSA-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.c.2d.v4f32.f16(i32 1, float %zcompare, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) ret <4 x float> %v } define amdgpu_ps <4 x float> @gather4_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %clamp) { - ; GFX9-LABEL: name: gather4_cl_2d - ; GFX9: bb.1.main_body: - ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), 
[[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32) - ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) - ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32) - ; GFX9: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) - ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY17]](s32), [[DEF]](s32) - ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) - ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.cl.2d), 1, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9: $vgpr0 = COPY [[UV]](s32) - ; GFX9: $vgpr1 = COPY [[UV1]](s32) - ; GFX9: $vgpr2 = COPY [[UV2]](s32) - ; GFX9: $vgpr3 = COPY [[UV3]](s32) - ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - ; GFX10NSA-LABEL: name: gather4_cl_2d - ; GFX10NSA: bb.1.main_body: - ; GFX10NSA: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 - ; GFX10NSA: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX10NSA: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX10NSA: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX10NSA: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX10NSA: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX10NSA: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX10NSA: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX10NSA: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX10NSA: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX10NSA: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10NSA: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX10NSA: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32) - ; GFX10NSA: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) - ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32) - ; GFX10NSA: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) - ; GFX10NSA: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY17]](s32), [[DEF]](s32) - ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.cl.2d), 1, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), 
[[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10NSA: $vgpr0 = COPY [[UV]](s32) - ; GFX10NSA: $vgpr1 = COPY [[UV1]](s32) - ; GFX10NSA: $vgpr2 = COPY [[UV2]](s32) - ; GFX10NSA: $vgpr3 = COPY [[UV3]](s32) - ; GFX10NSA: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +; GFX9-LABEL: gather4_cl_2d: +; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b64 s[14:15], exec +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: s_wqm_b64 exec, exec +; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: s_lshl_b32 s12, s0, 16 +; GFX9-NEXT: v_and_or_b32 v0, v0, v3, v1 +; GFX9-NEXT: v_and_or_b32 v1, v2, v3, s12 +; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] +; GFX9-NEXT: image_gather4_cl v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 a16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10NSA-LABEL: gather4_cl_2d: +; GFX10NSA: ; %bb.0: ; %main_body +; GFX10NSA-NEXT: s_mov_b32 s0, s2 +; GFX10NSA-NEXT: s_mov_b32 s1, s3 +; GFX10NSA-NEXT: s_mov_b32 s2, s4 +; GFX10NSA-NEXT: s_mov_b32 s3, s5 +; GFX10NSA-NEXT: s_mov_b32 s4, s6 +; GFX10NSA-NEXT: s_mov_b32 s5, s7 +; GFX10NSA-NEXT: s_mov_b32 s6, s8 +; GFX10NSA-NEXT: s_mov_b32 s7, s9 +; GFX10NSA-NEXT: s_mov_b32 s8, s10 +; GFX10NSA-NEXT: s_mov_b32 s9, s11 +; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo +; GFX10NSA-NEXT: s_mov_b32 s10, s12 +; GFX10NSA-NEXT: s_mov_b32 s11, s13 +; GFX10NSA-NEXT: ; implicit-def: $vcc_hi +; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX10NSA-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10NSA-NEXT: s_lshl_b32 s12, s0, 16 +; GFX10NSA-NEXT: v_and_or_b32 v0, v0, v3, v1 +; GFX10NSA-NEXT: v_and_or_b32 v1, v2, v3, s12 +; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 +; GFX10NSA-NEXT: image_gather4_cl v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX10NSA-NEXT: s_waitcnt vmcnt(0) +; GFX10NSA-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.cl.2d.v4f32.f16(i32 1, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) ret <4 x float> %v } define amdgpu_ps <4 x float> @gather4_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %t, half %clamp) { - ; GFX9-LABEL: name: gather4_c_cl_2d - ; GFX9: bb.1.main_body: - ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX9: 
[[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX9: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) - ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) - ; GFX9: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) - ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32) - ; GFX9: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32) - ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY18]](s32), [[DEF]](s32) - ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) - ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.c.cl.2d), 1, [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9: $vgpr0 = COPY [[UV]](s32) - ; GFX9: $vgpr1 = COPY [[UV1]](s32) - ; GFX9: $vgpr2 = COPY [[UV2]](s32) - ; GFX9: $vgpr3 = COPY [[UV3]](s32) - ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - ; GFX10NSA-LABEL: name: gather4_c_cl_2d - ; GFX10NSA: bb.1.main_body: - ; GFX10NSA: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX10NSA: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX10NSA: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX10NSA: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX10NSA: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX10NSA: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX10NSA: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX10NSA: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX10NSA: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX10NSA: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX10NSA: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10NSA: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10NSA: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) - ; GFX10NSA: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) - ; GFX10NSA: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) - ; GFX10NSA: 
[[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32) - ; GFX10NSA: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32) - ; GFX10NSA: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY18]](s32), [[DEF]](s32) - ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.c.cl.2d), 1, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10NSA: $vgpr0 = COPY [[UV]](s32) - ; GFX10NSA: $vgpr1 = COPY [[UV1]](s32) - ; GFX10NSA: $vgpr2 = COPY [[UV2]](s32) - ; GFX10NSA: $vgpr3 = COPY [[UV3]](s32) - ; GFX10NSA: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +; GFX9-LABEL: gather4_c_cl_2d: +; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b64 s[14:15], exec +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: s_wqm_b64 exec, exec +; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: s_lshl_b32 s12, s0, 16 +; GFX9-NEXT: v_and_or_b32 v1, v1, v4, v2 +; GFX9-NEXT: v_and_or_b32 v2, v3, v4, s12 +; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] +; GFX9-NEXT: image_gather4_c_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 a16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10NSA-LABEL: gather4_c_cl_2d: +; GFX10NSA: ; %bb.0: ; %main_body +; GFX10NSA-NEXT: s_mov_b32 s0, s2 +; GFX10NSA-NEXT: s_mov_b32 s1, s3 +; GFX10NSA-NEXT: s_mov_b32 s2, s4 +; GFX10NSA-NEXT: s_mov_b32 s3, s5 +; GFX10NSA-NEXT: s_mov_b32 s4, s6 +; GFX10NSA-NEXT: s_mov_b32 s5, s7 +; GFX10NSA-NEXT: s_mov_b32 s6, s8 +; GFX10NSA-NEXT: s_mov_b32 s7, s9 +; GFX10NSA-NEXT: s_mov_b32 s8, s10 +; GFX10NSA-NEXT: s_mov_b32 s9, s11 +; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo +; GFX10NSA-NEXT: s_mov_b32 s10, s12 +; GFX10NSA-NEXT: s_mov_b32 s11, s13 +; GFX10NSA-NEXT: ; implicit-def: $vcc_hi +; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: v_mov_b32_e32 v4, 0xffff +; GFX10NSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10NSA-NEXT: s_lshl_b32 s12, s0, 16 +; GFX10NSA-NEXT: v_and_or_b32 v1, v1, v4, v2 +; GFX10NSA-NEXT: v_and_or_b32 v2, v3, v4, s12 +; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 +; GFX10NSA-NEXT: image_gather4_c_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX10NSA-NEXT: s_waitcnt vmcnt(0) +; GFX10NSA-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.c.cl.2d.v4f32.f16(i32 1, float %zcompare, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) ret <4 x float> %v } define amdgpu_ps <4 x float> @gather4_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, half %s, half %t) { - ; GFX9-LABEL: name: gather4_b_2d - ; GFX9: bb.1.main_body: - ; GFX9: liveins: 
$sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX9: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) - ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) - ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) - ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32) - ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>) - ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.b.2d), 1, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9: $vgpr0 = COPY [[UV]](s32) - ; GFX9: $vgpr1 = COPY [[UV1]](s32) - ; GFX9: $vgpr2 = COPY [[UV2]](s32) - ; GFX9: $vgpr3 = COPY [[UV3]](s32) - ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - ; GFX10NSA-LABEL: name: gather4_b_2d - ; GFX10NSA: bb.1.main_body: - ; GFX10NSA: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 - ; GFX10NSA: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX10NSA: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX10NSA: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX10NSA: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX10NSA: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX10NSA: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX10NSA: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX10NSA: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX10NSA: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX10NSA: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10NSA: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX10NSA: 
[[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) - ; GFX10NSA: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) - ; GFX10NSA: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) - ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32) - ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.b.2d), 1, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10NSA: $vgpr0 = COPY [[UV]](s32) - ; GFX10NSA: $vgpr1 = COPY [[UV1]](s32) - ; GFX10NSA: $vgpr2 = COPY [[UV2]](s32) - ; GFX10NSA: $vgpr3 = COPY [[UV3]](s32) - ; GFX10NSA: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +; GFX9-LABEL: gather4_b_2d: +; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b64 s[14:15], exec +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: s_wqm_b64 exec, exec +; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_and_or_b32 v1, v1, v3, v2 +; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] +; GFX9-NEXT: image_gather4_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 a16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10NSA-LABEL: gather4_b_2d: +; GFX10NSA: ; %bb.0: ; %main_body +; GFX10NSA-NEXT: s_mov_b32 s0, s2 +; GFX10NSA-NEXT: s_mov_b32 s1, s3 +; GFX10NSA-NEXT: s_mov_b32 s2, s4 +; GFX10NSA-NEXT: s_mov_b32 s3, s5 +; GFX10NSA-NEXT: s_mov_b32 s4, s6 +; GFX10NSA-NEXT: s_mov_b32 s5, s7 +; GFX10NSA-NEXT: s_mov_b32 s6, s8 +; GFX10NSA-NEXT: s_mov_b32 s7, s9 +; GFX10NSA-NEXT: s_mov_b32 s8, s10 +; GFX10NSA-NEXT: s_mov_b32 s9, s11 +; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo +; GFX10NSA-NEXT: s_mov_b32 s10, s12 +; GFX10NSA-NEXT: s_mov_b32 s11, s13 +; GFX10NSA-NEXT: ; implicit-def: $vcc_hi +; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10NSA-NEXT: v_and_or_b32 v1, v1, 0xffff, v2 +; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 +; GFX10NSA-NEXT: image_gather4_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX10NSA-NEXT: s_waitcnt vmcnt(0) +; GFX10NSA-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.b.2d.v4f32.f32.f16(i32 1, float %bias, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) ret <4 x float> %v } define amdgpu_ps <4 x float> @gather4_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, half %s, half %t) { - ; GFX9-LABEL: name: gather4_c_b_2d - ; GFX9: bb.1.main_body: - ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX9: 
[[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX9: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) - ; GFX9: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY13]](s32) - ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) - ; GFX9: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32) - ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32) - ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>) - ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.c.b.2d), 1, [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9: $vgpr0 = COPY [[UV]](s32) - ; GFX9: $vgpr1 = COPY [[UV1]](s32) - ; GFX9: $vgpr2 = COPY [[UV2]](s32) - ; GFX9: $vgpr3 = COPY [[UV3]](s32) - ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - ; GFX10NSA-LABEL: name: gather4_c_b_2d - ; GFX10NSA: bb.1.main_body: - ; GFX10NSA: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX10NSA: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX10NSA: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX10NSA: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX10NSA: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX10NSA: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX10NSA: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX10NSA: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX10NSA: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX10NSA: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX10NSA: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10NSA: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10NSA: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), 
[[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) - ; GFX10NSA: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY13]](s32) - ; GFX10NSA: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) - ; GFX10NSA: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32) - ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32) - ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.c.b.2d), 1, [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10NSA: $vgpr0 = COPY [[UV]](s32) - ; GFX10NSA: $vgpr1 = COPY [[UV1]](s32) - ; GFX10NSA: $vgpr2 = COPY [[UV2]](s32) - ; GFX10NSA: $vgpr3 = COPY [[UV3]](s32) - ; GFX10NSA: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +; GFX9-LABEL: gather4_c_b_2d: +; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b64 s[14:15], exec +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: s_wqm_b64 exec, exec +; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_and_or_b32 v2, v2, v4, v3 +; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] +; GFX9-NEXT: image_gather4_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 a16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10NSA-LABEL: gather4_c_b_2d: +; GFX10NSA: ; %bb.0: ; %main_body +; GFX10NSA-NEXT: s_mov_b32 s0, s2 +; GFX10NSA-NEXT: s_mov_b32 s1, s3 +; GFX10NSA-NEXT: s_mov_b32 s2, s4 +; GFX10NSA-NEXT: s_mov_b32 s3, s5 +; GFX10NSA-NEXT: s_mov_b32 s4, s6 +; GFX10NSA-NEXT: s_mov_b32 s5, s7 +; GFX10NSA-NEXT: s_mov_b32 s6, s8 +; GFX10NSA-NEXT: s_mov_b32 s7, s9 +; GFX10NSA-NEXT: s_mov_b32 s8, s10 +; GFX10NSA-NEXT: s_mov_b32 s9, s11 +; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo +; GFX10NSA-NEXT: s_mov_b32 s10, s12 +; GFX10NSA-NEXT: s_mov_b32 s11, s13 +; GFX10NSA-NEXT: ; implicit-def: $vcc_hi +; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX10NSA-NEXT: v_and_or_b32 v2, v2, 0xffff, v3 +; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 +; GFX10NSA-NEXT: image_gather4_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX10NSA-NEXT: s_waitcnt vmcnt(0) +; GFX10NSA-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.2d.v4f32.f32.f16(i32 1, float %bias, float %zcompare, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) ret <4 x float> %v } define amdgpu_ps <4 x float> @gather4_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, half %s, half %t, half %clamp) { - ; 
GFX9-LABEL: name: gather4_b_cl_2d - ; GFX9: bb.1.main_body: - ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX9: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) - ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) - ; GFX9: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) - ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32) - ; GFX9: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32) - ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY18]](s32), [[DEF]](s32) - ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) - ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.b.cl.2d), 1, [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9: $vgpr0 = COPY [[UV]](s32) - ; GFX9: $vgpr1 = COPY [[UV1]](s32) - ; GFX9: $vgpr2 = COPY [[UV2]](s32) - ; GFX9: $vgpr3 = COPY [[UV3]](s32) - ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - ; GFX10NSA-LABEL: name: gather4_b_cl_2d - ; GFX10NSA: bb.1.main_body: - ; GFX10NSA: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX10NSA: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX10NSA: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX10NSA: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX10NSA: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX10NSA: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX10NSA: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX10NSA: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX10NSA: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX10NSA: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX10NSA: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX10NSA: 
[[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10NSA: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10NSA: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) - ; GFX10NSA: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) - ; GFX10NSA: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) - ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32) - ; GFX10NSA: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32) - ; GFX10NSA: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY18]](s32), [[DEF]](s32) - ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.b.cl.2d), 1, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10NSA: $vgpr0 = COPY [[UV]](s32) - ; GFX10NSA: $vgpr1 = COPY [[UV1]](s32) - ; GFX10NSA: $vgpr2 = COPY [[UV2]](s32) - ; GFX10NSA: $vgpr3 = COPY [[UV3]](s32) - ; GFX10NSA: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +; GFX9-LABEL: gather4_b_cl_2d: +; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b64 s[14:15], exec +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: s_wqm_b64 exec, exec +; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: s_lshl_b32 s12, s0, 16 +; GFX9-NEXT: v_and_or_b32 v1, v1, v4, v2 +; GFX9-NEXT: v_and_or_b32 v2, v3, v4, s12 +; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] +; GFX9-NEXT: image_gather4_b_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 a16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10NSA-LABEL: gather4_b_cl_2d: +; GFX10NSA: ; %bb.0: ; %main_body +; GFX10NSA-NEXT: s_mov_b32 s0, s2 +; GFX10NSA-NEXT: s_mov_b32 s1, s3 +; GFX10NSA-NEXT: s_mov_b32 s2, s4 +; GFX10NSA-NEXT: s_mov_b32 s3, s5 +; GFX10NSA-NEXT: s_mov_b32 s4, s6 +; GFX10NSA-NEXT: s_mov_b32 s5, s7 +; GFX10NSA-NEXT: s_mov_b32 s6, s8 +; GFX10NSA-NEXT: s_mov_b32 s7, s9 +; GFX10NSA-NEXT: s_mov_b32 s8, s10 +; GFX10NSA-NEXT: s_mov_b32 s9, s11 +; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo +; GFX10NSA-NEXT: s_mov_b32 s10, s12 +; GFX10NSA-NEXT: s_mov_b32 s11, s13 +; GFX10NSA-NEXT: ; implicit-def: $vcc_hi +; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: v_mov_b32_e32 v4, 0xffff +; GFX10NSA-NEXT: 
v_lshlrev_b32_e32 v2, 16, v2 +; GFX10NSA-NEXT: s_lshl_b32 s12, s0, 16 +; GFX10NSA-NEXT: v_and_or_b32 v1, v1, v4, v2 +; GFX10NSA-NEXT: v_and_or_b32 v2, v3, v4, s12 +; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 +; GFX10NSA-NEXT: image_gather4_b_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX10NSA-NEXT: s_waitcnt vmcnt(0) +; GFX10NSA-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.b.cl.2d.v4f32.f32.f16(i32 1, float %bias, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) ret <4 x float> %v } define amdgpu_ps <4 x float> @gather4_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, half %s, half %t, half %clamp) { - ; GFX9-LABEL: name: gather4_c_b_cl_2d - ; GFX9: bb.1.main_body: - ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX9: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) - ; GFX9: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY13]](s32) - ; GFX9: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) - ; GFX9: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32) - ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY17]](s32), [[COPY18]](s32) - ; GFX9: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY16]](s32) - ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY19]](s32), [[DEF]](s32) - ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) - ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.c.b.cl.2d), 1, [[CONCAT_VECTORS]](<8 x s16>), $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9: $vgpr0 = COPY [[UV]](s32) - ; GFX9: $vgpr1 = COPY [[UV1]](s32) - ; GFX9: $vgpr2 = COPY [[UV2]](s32) - ; GFX9: $vgpr3 = 
COPY [[UV3]](s32) - ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - ; GFX10NSA-LABEL: name: gather4_c_b_cl_2d - ; GFX10NSA: bb.1.main_body: - ; GFX10NSA: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 - ; GFX10NSA: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX10NSA: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX10NSA: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX10NSA: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX10NSA: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX10NSA: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX10NSA: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX10NSA: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX10NSA: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX10NSA: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10NSA: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10NSA: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX10NSA: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) - ; GFX10NSA: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY13]](s32) - ; GFX10NSA: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) - ; GFX10NSA: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32) - ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY17]](s32), [[COPY18]](s32) - ; GFX10NSA: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY16]](s32) - ; GFX10NSA: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY19]](s32), [[DEF]](s32) - ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.c.b.cl.2d), 1, [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10NSA: $vgpr0 = COPY [[UV]](s32) - ; GFX10NSA: $vgpr1 = COPY [[UV1]](s32) - ; GFX10NSA: $vgpr2 = COPY [[UV2]](s32) - ; GFX10NSA: $vgpr3 = COPY [[UV3]](s32) - ; GFX10NSA: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +; GFX9-LABEL: gather4_c_b_cl_2d: +; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b64 s[14:15], exec +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: s_wqm_b64 exec, 
exec +; GFX9-NEXT: v_mov_b32_e32 v5, 0xffff +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: s_lshl_b32 s12, s0, 16 +; GFX9-NEXT: v_and_or_b32 v2, v2, v5, v3 +; GFX9-NEXT: v_and_or_b32 v3, v4, v5, s12 +; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] +; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 a16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10NSA-LABEL: gather4_c_b_cl_2d: +; GFX10NSA: ; %bb.0: ; %main_body +; GFX10NSA-NEXT: s_mov_b32 s0, s2 +; GFX10NSA-NEXT: s_mov_b32 s1, s3 +; GFX10NSA-NEXT: s_mov_b32 s2, s4 +; GFX10NSA-NEXT: s_mov_b32 s3, s5 +; GFX10NSA-NEXT: s_mov_b32 s4, s6 +; GFX10NSA-NEXT: s_mov_b32 s5, s7 +; GFX10NSA-NEXT: s_mov_b32 s6, s8 +; GFX10NSA-NEXT: s_mov_b32 s7, s9 +; GFX10NSA-NEXT: s_mov_b32 s8, s10 +; GFX10NSA-NEXT: s_mov_b32 s9, s11 +; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo +; GFX10NSA-NEXT: s_mov_b32 s10, s12 +; GFX10NSA-NEXT: s_mov_b32 s11, s13 +; GFX10NSA-NEXT: ; implicit-def: $vcc_hi +; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: v_mov_b32_e32 v5, 0xffff +; GFX10NSA-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX10NSA-NEXT: s_lshl_b32 s12, s0, 16 +; GFX10NSA-NEXT: v_and_or_b32 v2, v2, v5, v3 +; GFX10NSA-NEXT: v_and_or_b32 v3, v4, v5, s12 +; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 +; GFX10NSA-NEXT: image_gather4_c_b_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX10NSA-NEXT: s_waitcnt vmcnt(0) +; GFX10NSA-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f16(i32 1, float %bias, float %zcompare, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) ret <4 x float> %v } define amdgpu_ps <4 x float> @gather4_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %lod) { - ; GFX9-LABEL: name: gather4_l_2d - ; GFX9: bb.1.main_body: - ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32) - ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) - ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32) - ; GFX9: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) - ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC 
[[COPY17]](s32), [[DEF]](s32) - ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) - ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.l.2d), 1, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9: $vgpr0 = COPY [[UV]](s32) - ; GFX9: $vgpr1 = COPY [[UV1]](s32) - ; GFX9: $vgpr2 = COPY [[UV2]](s32) - ; GFX9: $vgpr3 = COPY [[UV3]](s32) - ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - ; GFX10NSA-LABEL: name: gather4_l_2d - ; GFX10NSA: bb.1.main_body: - ; GFX10NSA: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 - ; GFX10NSA: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX10NSA: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX10NSA: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX10NSA: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX10NSA: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX10NSA: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX10NSA: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX10NSA: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX10NSA: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX10NSA: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10NSA: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX10NSA: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32) - ; GFX10NSA: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) - ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32) - ; GFX10NSA: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) - ; GFX10NSA: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY17]](s32), [[DEF]](s32) - ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.l.2d), 1, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10NSA: $vgpr0 = COPY [[UV]](s32) - ; GFX10NSA: $vgpr1 = COPY [[UV1]](s32) - ; GFX10NSA: $vgpr2 = COPY [[UV2]](s32) - ; GFX10NSA: $vgpr3 = COPY [[UV3]](s32) - ; GFX10NSA: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +; GFX9-LABEL: gather4_l_2d: +; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: 
s_mov_b32 s0, s2 +; GFX9-NEXT: s_mov_b32 s2, s4 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: s_lshl_b32 s12, s0, 16 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: v_and_or_b32 v0, v0, v3, v1 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: v_and_or_b32 v1, v2, v3, s12 +; GFX9-NEXT: image_gather4_l v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 a16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10NSA-LABEL: gather4_l_2d: +; GFX10NSA: ; %bb.0: ; %main_body +; GFX10NSA-NEXT: s_mov_b32 s0, s2 +; GFX10NSA-NEXT: s_mov_b32 s2, s4 +; GFX10NSA-NEXT: s_mov_b32 s4, s6 +; GFX10NSA-NEXT: s_mov_b32 s6, s8 +; GFX10NSA-NEXT: s_mov_b32 s8, s10 +; GFX10NSA-NEXT: s_mov_b32 s10, s12 +; GFX10NSA-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX10NSA-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10NSA-NEXT: s_lshl_b32 s12, s0, 16 +; GFX10NSA-NEXT: s_mov_b32 s1, s3 +; GFX10NSA-NEXT: s_mov_b32 s3, s5 +; GFX10NSA-NEXT: s_mov_b32 s5, s7 +; GFX10NSA-NEXT: s_mov_b32 s7, s9 +; GFX10NSA-NEXT: s_mov_b32 s9, s11 +; GFX10NSA-NEXT: v_and_or_b32 v0, v0, v3, v1 +; GFX10NSA-NEXT: s_mov_b32 s11, s13 +; GFX10NSA-NEXT: v_and_or_b32 v1, v2, v3, s12 +; GFX10NSA-NEXT: ; implicit-def: $vcc_hi +; GFX10NSA-NEXT: image_gather4_l v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX10NSA-NEXT: s_waitcnt vmcnt(0) +; GFX10NSA-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.l.2d.v4f32.f16(i32 1, half %s, half %t, half %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) ret <4 x float> %v } define amdgpu_ps <4 x float> @gather4_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %t, half %lod) { - ; GFX9-LABEL: name: gather4_c_l_2d - ; GFX9: bb.1.main_body: - ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX9: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) - ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) - ; GFX9: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) - ; GFX9: 
[[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32) - ; GFX9: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32) - ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY18]](s32), [[DEF]](s32) - ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) - ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.c.l.2d), 1, [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9: $vgpr0 = COPY [[UV]](s32) - ; GFX9: $vgpr1 = COPY [[UV1]](s32) - ; GFX9: $vgpr2 = COPY [[UV2]](s32) - ; GFX9: $vgpr3 = COPY [[UV3]](s32) - ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - ; GFX10NSA-LABEL: name: gather4_c_l_2d - ; GFX10NSA: bb.1.main_body: - ; GFX10NSA: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX10NSA: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX10NSA: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX10NSA: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX10NSA: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX10NSA: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX10NSA: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX10NSA: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX10NSA: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX10NSA: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX10NSA: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10NSA: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10NSA: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) - ; GFX10NSA: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) - ; GFX10NSA: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) - ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32) - ; GFX10NSA: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32) - ; GFX10NSA: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY18]](s32), [[DEF]](s32) - ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.c.l.2d), 1, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX10NSA: 
[[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10NSA: $vgpr0 = COPY [[UV]](s32) - ; GFX10NSA: $vgpr1 = COPY [[UV1]](s32) - ; GFX10NSA: $vgpr2 = COPY [[UV2]](s32) - ; GFX10NSA: $vgpr3 = COPY [[UV3]](s32) - ; GFX10NSA: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +; GFX9-LABEL: gather4_c_l_2d: +; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: s_mov_b32 s2, s4 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: s_lshl_b32 s12, s0, 16 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: v_and_or_b32 v1, v1, v4, v2 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: v_and_or_b32 v2, v3, v4, s12 +; GFX9-NEXT: image_gather4_c_l v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 a16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10NSA-LABEL: gather4_c_l_2d: +; GFX10NSA: ; %bb.0: ; %main_body +; GFX10NSA-NEXT: s_mov_b32 s0, s2 +; GFX10NSA-NEXT: s_mov_b32 s2, s4 +; GFX10NSA-NEXT: s_mov_b32 s4, s6 +; GFX10NSA-NEXT: s_mov_b32 s6, s8 +; GFX10NSA-NEXT: s_mov_b32 s8, s10 +; GFX10NSA-NEXT: s_mov_b32 s10, s12 +; GFX10NSA-NEXT: v_mov_b32_e32 v4, 0xffff +; GFX10NSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10NSA-NEXT: s_lshl_b32 s12, s0, 16 +; GFX10NSA-NEXT: s_mov_b32 s1, s3 +; GFX10NSA-NEXT: s_mov_b32 s3, s5 +; GFX10NSA-NEXT: s_mov_b32 s5, s7 +; GFX10NSA-NEXT: s_mov_b32 s7, s9 +; GFX10NSA-NEXT: s_mov_b32 s9, s11 +; GFX10NSA-NEXT: v_and_or_b32 v1, v1, v4, v2 +; GFX10NSA-NEXT: s_mov_b32 s11, s13 +; GFX10NSA-NEXT: v_and_or_b32 v2, v3, v4, s12 +; GFX10NSA-NEXT: ; implicit-def: $vcc_hi +; GFX10NSA-NEXT: image_gather4_c_l v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX10NSA-NEXT: s_waitcnt vmcnt(0) +; GFX10NSA-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.c.l.2d.v4f32.f16(i32 1, float %zcompare, half %s, half %t, half %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) ret <4 x float> %v } define amdgpu_ps <4 x float> @gather4_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t) { - ; GFX9-LABEL: name: gather4_lz_2d - ; GFX9: bb.1.main_body: - ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1 - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), 
[[COPY7]](s32) - ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32) - ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) - ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32) - ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.lz.2d), 1, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9: $vgpr0 = COPY [[UV]](s32) - ; GFX9: $vgpr1 = COPY [[UV1]](s32) - ; GFX9: $vgpr2 = COPY [[UV2]](s32) - ; GFX9: $vgpr3 = COPY [[UV3]](s32) - ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - ; GFX10NSA-LABEL: name: gather4_lz_2d - ; GFX10NSA: bb.1.main_body: - ; GFX10NSA: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1 - ; GFX10NSA: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX10NSA: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX10NSA: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX10NSA: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX10NSA: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX10NSA: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX10NSA: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX10NSA: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX10NSA: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX10NSA: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10NSA: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32) - ; GFX10NSA: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) - ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32) - ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.lz.2d), 1, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10NSA: $vgpr0 = COPY [[UV]](s32) - ; GFX10NSA: $vgpr1 = COPY [[UV1]](s32) - ; GFX10NSA: $vgpr2 = COPY [[UV2]](s32) - ; GFX10NSA: $vgpr3 = COPY [[UV3]](s32) - ; GFX10NSA: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +; GFX9-LABEL: gather4_lz_2d: +; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: s_mov_b32 s1, 
s3 +; GFX9-NEXT: s_mov_b32 s2, s4 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v1 +; GFX9-NEXT: image_gather4_lz v[0:3], v0, s[0:7], s[8:11] dmask:0x1 a16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10NSA-LABEL: gather4_lz_2d: +; GFX10NSA: ; %bb.0: ; %main_body +; GFX10NSA-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10NSA-NEXT: s_mov_b32 s0, s2 +; GFX10NSA-NEXT: s_mov_b32 s1, s3 +; GFX10NSA-NEXT: s_mov_b32 s2, s4 +; GFX10NSA-NEXT: s_mov_b32 s3, s5 +; GFX10NSA-NEXT: s_mov_b32 s4, s6 +; GFX10NSA-NEXT: s_mov_b32 s5, s7 +; GFX10NSA-NEXT: s_mov_b32 s6, s8 +; GFX10NSA-NEXT: s_mov_b32 s7, s9 +; GFX10NSA-NEXT: s_mov_b32 s8, s10 +; GFX10NSA-NEXT: s_mov_b32 s9, s11 +; GFX10NSA-NEXT: s_mov_b32 s10, s12 +; GFX10NSA-NEXT: s_mov_b32 s11, s13 +; GFX10NSA-NEXT: v_and_or_b32 v0, v0, 0xffff, v1 +; GFX10NSA-NEXT: ; implicit-def: $vcc_hi +; GFX10NSA-NEXT: image_gather4_lz v[0:3], v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX10NSA-NEXT: s_waitcnt vmcnt(0) +; GFX10NSA-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f16(i32 1, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) ret <4 x float> %v } define amdgpu_ps <4 x float> @gather4_c_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %t) { - ; GFX9-LABEL: name: gather4_c_lz_2d - ; GFX9: bb.1.main_body: - ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX9: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) - ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) - ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) - ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32) - ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>) - ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.c.lz.2d), 1, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 
[[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9: $vgpr0 = COPY [[UV]](s32) - ; GFX9: $vgpr1 = COPY [[UV1]](s32) - ; GFX9: $vgpr2 = COPY [[UV2]](s32) - ; GFX9: $vgpr3 = COPY [[UV3]](s32) - ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - ; GFX10NSA-LABEL: name: gather4_c_lz_2d - ; GFX10NSA: bb.1.main_body: - ; GFX10NSA: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 - ; GFX10NSA: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX10NSA: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX10NSA: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX10NSA: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX10NSA: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX10NSA: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX10NSA: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX10NSA: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX10NSA: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX10NSA: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10NSA: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) - ; GFX10NSA: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) - ; GFX10NSA: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) - ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32) - ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.c.lz.2d), 1, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10NSA: $vgpr0 = COPY [[UV]](s32) - ; GFX10NSA: $vgpr1 = COPY [[UV1]](s32) - ; GFX10NSA: $vgpr2 = COPY [[UV2]](s32) - ; GFX10NSA: $vgpr3 = COPY [[UV3]](s32) - ; GFX10NSA: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +; GFX9-LABEL: gather4_c_lz_2d: +; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: v_and_or_b32 v1, v1, v3, v2 +; GFX9-NEXT: image_gather4_c_lz v[0:3], v[0:1], s[0:7], 
s[8:11] dmask:0x1 a16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10NSA-LABEL: gather4_c_lz_2d: +; GFX10NSA: ; %bb.0: ; %main_body +; GFX10NSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10NSA-NEXT: s_mov_b32 s0, s2 +; GFX10NSA-NEXT: s_mov_b32 s1, s3 +; GFX10NSA-NEXT: s_mov_b32 s2, s4 +; GFX10NSA-NEXT: s_mov_b32 s3, s5 +; GFX10NSA-NEXT: s_mov_b32 s4, s6 +; GFX10NSA-NEXT: s_mov_b32 s5, s7 +; GFX10NSA-NEXT: s_mov_b32 s6, s8 +; GFX10NSA-NEXT: s_mov_b32 s7, s9 +; GFX10NSA-NEXT: s_mov_b32 s8, s10 +; GFX10NSA-NEXT: s_mov_b32 s9, s11 +; GFX10NSA-NEXT: s_mov_b32 s10, s12 +; GFX10NSA-NEXT: s_mov_b32 s11, s13 +; GFX10NSA-NEXT: v_and_or_b32 v1, v1, 0xffff, v2 +; GFX10NSA-NEXT: ; implicit-def: $vcc_hi +; GFX10NSA-NEXT: image_gather4_c_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX10NSA-NEXT: s_waitcnt vmcnt(0) +; GFX10NSA-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.c.lz.2d.v4f32.f16(i32 1, float %zcompare, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) ret <4 x float> %v diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll index d63be88..bfdbe9a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll @@ -1,1047 +1,794 @@ -; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -stop-after=legalizer -o - %s | FileCheck -check-prefix=GFX6 %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -stop-after=legalizer -o - %s | FileCheck -check-prefix=GFX10NSA %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -o - %s | FileCheck -check-prefix=GFX6 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -o - %s | FileCheck -check-prefix=GFX10NSA %s define amdgpu_ps <4 x float> @gather4_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) { - ; GFX6-LABEL: name: gather4_2d - ; GFX6: bb.1.main_body: - ; GFX6: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1 - ; GFX6: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX6: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX6: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX6: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX6: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX6: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX6: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX6: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX6: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX6: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX6: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX6: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX6: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX6: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX6: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX6: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32) - ; GFX6: 
[[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.2d), 1, [[BUILD_VECTOR2]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX6: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX6: $vgpr0 = COPY [[UV]](s32) - ; GFX6: $vgpr1 = COPY [[UV1]](s32) - ; GFX6: $vgpr2 = COPY [[UV2]](s32) - ; GFX6: $vgpr3 = COPY [[UV3]](s32) - ; GFX6: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - ; GFX10NSA-LABEL: name: gather4_2d - ; GFX10NSA: bb.1.main_body: - ; GFX10NSA: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1 - ; GFX10NSA: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX10NSA: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX10NSA: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX10NSA: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX10NSA: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX10NSA: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX10NSA: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX10NSA: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX10NSA: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX10NSA: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10NSA: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX10NSA: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32) - ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.2d), 1, [[BUILD_VECTOR2]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10NSA: $vgpr0 = COPY [[UV]](s32) - ; GFX10NSA: $vgpr1 = COPY [[UV1]](s32) - ; GFX10NSA: $vgpr2 = COPY [[UV2]](s32) - ; GFX10NSA: $vgpr3 = COPY [[UV3]](s32) - ; GFX10NSA: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +; GFX6-LABEL: gather4_2d: +; GFX6: ; %bb.0: ; %main_body +; GFX6-NEXT: s_mov_b32 s0, s2 +; GFX6-NEXT: s_mov_b32 s1, s3 +; GFX6-NEXT: s_mov_b32 s2, s4 +; GFX6-NEXT: s_mov_b32 s3, s5 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s8 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: s_mov_b32 s8, s10 +; GFX6-NEXT: s_mov_b32 s9, s11 +; GFX6-NEXT: s_mov_b64 s[14:15], exec +; GFX6-NEXT: s_mov_b32 s10, s12 +; GFX6-NEXT: s_mov_b32 s11, s13 +; GFX6-NEXT: s_wqm_b64 exec, exec +; GFX6-NEXT: s_and_b64 exec, exec, s[14:15] +; GFX6-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; return to shader part epilog +; +; GFX10NSA-LABEL: gather4_2d: +; 
GFX10NSA: ; %bb.0: ; %main_body +; GFX10NSA-NEXT: s_mov_b32 s0, s2 +; GFX10NSA-NEXT: s_mov_b32 s1, s3 +; GFX10NSA-NEXT: s_mov_b32 s2, s4 +; GFX10NSA-NEXT: s_mov_b32 s3, s5 +; GFX10NSA-NEXT: s_mov_b32 s4, s6 +; GFX10NSA-NEXT: s_mov_b32 s5, s7 +; GFX10NSA-NEXT: s_mov_b32 s6, s8 +; GFX10NSA-NEXT: s_mov_b32 s7, s9 +; GFX10NSA-NEXT: s_mov_b32 s8, s10 +; GFX10NSA-NEXT: s_mov_b32 s9, s11 +; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo +; GFX10NSA-NEXT: s_mov_b32 s10, s12 +; GFX10NSA-NEXT: s_mov_b32 s11, s13 +; GFX10NSA-NEXT: ; implicit-def: $vcc_hi +; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 +; GFX10NSA-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX10NSA-NEXT: s_waitcnt vmcnt(0) +; GFX10NSA-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32 1, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) ret <4 x float> %v } define amdgpu_ps <4 x float> @gather4_cube(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %face) { - ; GFX6-LABEL: name: gather4_cube - ; GFX6: bb.1.main_body: - ; GFX6: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 - ; GFX6: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX6: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX6: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX6: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX6: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX6: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX6: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX6: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX6: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX6: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX6: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX6: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX6: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX6: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX6: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX6: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX6: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32) - ; GFX6: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.cube), 1, [[BUILD_VECTOR2]](<3 x s32>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX6: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX6: $vgpr0 = COPY [[UV]](s32) - ; GFX6: $vgpr1 = COPY [[UV1]](s32) - ; GFX6: $vgpr2 = COPY [[UV2]](s32) - ; GFX6: $vgpr3 = COPY [[UV3]](s32) - ; GFX6: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - ; GFX10NSA-LABEL: name: gather4_cube - ; GFX10NSA: bb.1.main_body: - ; GFX10NSA: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 - ; GFX10NSA: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX10NSA: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX10NSA: 
[[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX10NSA: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX10NSA: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX10NSA: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX10NSA: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX10NSA: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX10NSA: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX10NSA: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10NSA: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.cube), 1, [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10NSA: $vgpr0 = COPY [[UV]](s32) - ; GFX10NSA: $vgpr1 = COPY [[UV1]](s32) - ; GFX10NSA: $vgpr2 = COPY [[UV2]](s32) - ; GFX10NSA: $vgpr3 = COPY [[UV3]](s32) - ; GFX10NSA: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +; GFX6-LABEL: gather4_cube: +; GFX6: ; %bb.0: ; %main_body +; GFX6-NEXT: s_mov_b32 s0, s2 +; GFX6-NEXT: s_mov_b32 s1, s3 +; GFX6-NEXT: s_mov_b32 s2, s4 +; GFX6-NEXT: s_mov_b32 s3, s5 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s8 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: s_mov_b32 s8, s10 +; GFX6-NEXT: s_mov_b32 s9, s11 +; GFX6-NEXT: s_mov_b64 s[14:15], exec +; GFX6-NEXT: s_mov_b32 s10, s12 +; GFX6-NEXT: s_mov_b32 s11, s13 +; GFX6-NEXT: s_wqm_b64 exec, exec +; GFX6-NEXT: s_and_b64 exec, exec, s[14:15] +; GFX6-NEXT: image_gather4 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 da +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; return to shader part epilog +; +; GFX10NSA-LABEL: gather4_cube: +; GFX10NSA: ; %bb.0: ; %main_body +; GFX10NSA-NEXT: s_mov_b32 s0, s2 +; GFX10NSA-NEXT: s_mov_b32 s1, s3 +; GFX10NSA-NEXT: s_mov_b32 s2, s4 +; GFX10NSA-NEXT: s_mov_b32 s3, s5 +; GFX10NSA-NEXT: s_mov_b32 s4, s6 +; GFX10NSA-NEXT: s_mov_b32 s5, s7 +; GFX10NSA-NEXT: s_mov_b32 s6, s8 +; GFX10NSA-NEXT: s_mov_b32 s7, s9 +; GFX10NSA-NEXT: s_mov_b32 s8, s10 +; GFX10NSA-NEXT: s_mov_b32 s9, s11 +; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo +; GFX10NSA-NEXT: s_mov_b32 s10, s12 +; GFX10NSA-NEXT: s_mov_b32 s11, s13 +; GFX10NSA-NEXT: ; implicit-def: $vcc_hi +; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 +; GFX10NSA-NEXT: image_gather4 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_CUBE +; GFX10NSA-NEXT: s_waitcnt vmcnt(0) +; GFX10NSA-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.cube.v4f32.f32(i32 1, float %s, float %t, float %face, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) ret <4 x float> %v } define amdgpu_ps <4 x float> 
@gather4_2darray(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %slice) { - ; GFX6-LABEL: name: gather4_2darray - ; GFX6: bb.1.main_body: - ; GFX6: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 - ; GFX6: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX6: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX6: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX6: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX6: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX6: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX6: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX6: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX6: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX6: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX6: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX6: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX6: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX6: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX6: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX6: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX6: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32) - ; GFX6: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.2darray), 1, [[BUILD_VECTOR2]](<3 x s32>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX6: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX6: $vgpr0 = COPY [[UV]](s32) - ; GFX6: $vgpr1 = COPY [[UV1]](s32) - ; GFX6: $vgpr2 = COPY [[UV2]](s32) - ; GFX6: $vgpr3 = COPY [[UV3]](s32) - ; GFX6: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - ; GFX10NSA-LABEL: name: gather4_2darray - ; GFX10NSA: bb.1.main_body: - ; GFX10NSA: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 - ; GFX10NSA: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX10NSA: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX10NSA: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX10NSA: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX10NSA: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX10NSA: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX10NSA: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX10NSA: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX10NSA: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX10NSA: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10NSA: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - 
; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.2darray), 1, [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10NSA: $vgpr0 = COPY [[UV]](s32) - ; GFX10NSA: $vgpr1 = COPY [[UV1]](s32) - ; GFX10NSA: $vgpr2 = COPY [[UV2]](s32) - ; GFX10NSA: $vgpr3 = COPY [[UV3]](s32) - ; GFX10NSA: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +; GFX6-LABEL: gather4_2darray: +; GFX6: ; %bb.0: ; %main_body +; GFX6-NEXT: s_mov_b32 s0, s2 +; GFX6-NEXT: s_mov_b32 s1, s3 +; GFX6-NEXT: s_mov_b32 s2, s4 +; GFX6-NEXT: s_mov_b32 s3, s5 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s8 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: s_mov_b32 s8, s10 +; GFX6-NEXT: s_mov_b32 s9, s11 +; GFX6-NEXT: s_mov_b64 s[14:15], exec +; GFX6-NEXT: s_mov_b32 s10, s12 +; GFX6-NEXT: s_mov_b32 s11, s13 +; GFX6-NEXT: s_wqm_b64 exec, exec +; GFX6-NEXT: s_and_b64 exec, exec, s[14:15] +; GFX6-NEXT: image_gather4 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 da +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; return to shader part epilog +; +; GFX10NSA-LABEL: gather4_2darray: +; GFX10NSA: ; %bb.0: ; %main_body +; GFX10NSA-NEXT: s_mov_b32 s0, s2 +; GFX10NSA-NEXT: s_mov_b32 s1, s3 +; GFX10NSA-NEXT: s_mov_b32 s2, s4 +; GFX10NSA-NEXT: s_mov_b32 s3, s5 +; GFX10NSA-NEXT: s_mov_b32 s4, s6 +; GFX10NSA-NEXT: s_mov_b32 s5, s7 +; GFX10NSA-NEXT: s_mov_b32 s6, s8 +; GFX10NSA-NEXT: s_mov_b32 s7, s9 +; GFX10NSA-NEXT: s_mov_b32 s8, s10 +; GFX10NSA-NEXT: s_mov_b32 s9, s11 +; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo +; GFX10NSA-NEXT: s_mov_b32 s10, s12 +; GFX10NSA-NEXT: s_mov_b32 s11, s13 +; GFX10NSA-NEXT: ; implicit-def: $vcc_hi +; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 +; GFX10NSA-NEXT: image_gather4 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY +; GFX10NSA-NEXT: s_waitcnt vmcnt(0) +; GFX10NSA-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.2darray.v4f32.f32(i32 1, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) ret <4 x float> %v } define amdgpu_ps <4 x float> @gather4_c_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %t) { - ; GFX6-LABEL: name: gather4_c_2d - ; GFX6: bb.1.main_body: - ; GFX6: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 - ; GFX6: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX6: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX6: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX6: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX6: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX6: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX6: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX6: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX6: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX6: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX6: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX6: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX6: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX6: 
[[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX6: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX6: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX6: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32) - ; GFX6: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.c.2d), 1, [[BUILD_VECTOR2]](<3 x s32>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX6: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX6: $vgpr0 = COPY [[UV]](s32) - ; GFX6: $vgpr1 = COPY [[UV1]](s32) - ; GFX6: $vgpr2 = COPY [[UV2]](s32) - ; GFX6: $vgpr3 = COPY [[UV3]](s32) - ; GFX6: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - ; GFX10NSA-LABEL: name: gather4_c_2d - ; GFX10NSA: bb.1.main_body: - ; GFX10NSA: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 - ; GFX10NSA: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX10NSA: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX10NSA: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX10NSA: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX10NSA: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX10NSA: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX10NSA: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX10NSA: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX10NSA: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX10NSA: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10NSA: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.c.2d), 1, [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10NSA: $vgpr0 = COPY [[UV]](s32) - ; GFX10NSA: $vgpr1 = COPY [[UV1]](s32) - ; GFX10NSA: $vgpr2 = COPY [[UV2]](s32) - ; GFX10NSA: $vgpr3 = COPY [[UV3]](s32) - ; GFX10NSA: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +; GFX6-LABEL: gather4_c_2d: +; GFX6: ; %bb.0: ; %main_body +; GFX6-NEXT: s_mov_b32 s0, s2 +; GFX6-NEXT: s_mov_b32 s1, s3 +; GFX6-NEXT: s_mov_b32 s2, s4 +; GFX6-NEXT: s_mov_b32 s3, s5 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 
s6, s8 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: s_mov_b32 s8, s10 +; GFX6-NEXT: s_mov_b32 s9, s11 +; GFX6-NEXT: s_mov_b64 s[14:15], exec +; GFX6-NEXT: s_mov_b32 s10, s12 +; GFX6-NEXT: s_mov_b32 s11, s13 +; GFX6-NEXT: s_wqm_b64 exec, exec +; GFX6-NEXT: s_and_b64 exec, exec, s[14:15] +; GFX6-NEXT: image_gather4_c v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; return to shader part epilog +; +; GFX10NSA-LABEL: gather4_c_2d: +; GFX10NSA: ; %bb.0: ; %main_body +; GFX10NSA-NEXT: s_mov_b32 s0, s2 +; GFX10NSA-NEXT: s_mov_b32 s1, s3 +; GFX10NSA-NEXT: s_mov_b32 s2, s4 +; GFX10NSA-NEXT: s_mov_b32 s3, s5 +; GFX10NSA-NEXT: s_mov_b32 s4, s6 +; GFX10NSA-NEXT: s_mov_b32 s5, s7 +; GFX10NSA-NEXT: s_mov_b32 s6, s8 +; GFX10NSA-NEXT: s_mov_b32 s7, s9 +; GFX10NSA-NEXT: s_mov_b32 s8, s10 +; GFX10NSA-NEXT: s_mov_b32 s9, s11 +; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo +; GFX10NSA-NEXT: s_mov_b32 s10, s12 +; GFX10NSA-NEXT: s_mov_b32 s11, s13 +; GFX10NSA-NEXT: ; implicit-def: $vcc_hi +; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 +; GFX10NSA-NEXT: image_gather4_c v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX10NSA-NEXT: s_waitcnt vmcnt(0) +; GFX10NSA-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.c.2d.v4f32.f32(i32 1, float %zcompare, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) ret <4 x float> %v } define amdgpu_ps <4 x float> @gather4_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %clamp) { - ; GFX6-LABEL: name: gather4_cl_2d - ; GFX6: bb.1.main_body: - ; GFX6: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 - ; GFX6: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX6: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX6: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX6: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX6: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX6: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX6: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX6: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX6: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX6: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX6: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX6: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX6: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX6: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX6: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX6: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX6: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32) - ; GFX6: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.cl.2d), 1, [[BUILD_VECTOR2]](<3 x s32>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX6: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX6: $vgpr0 = COPY [[UV]](s32) - ; GFX6: $vgpr1 = COPY [[UV1]](s32) - ; GFX6: 
$vgpr2 = COPY [[UV2]](s32) - ; GFX6: $vgpr3 = COPY [[UV3]](s32) - ; GFX6: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - ; GFX10NSA-LABEL: name: gather4_cl_2d - ; GFX10NSA: bb.1.main_body: - ; GFX10NSA: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 - ; GFX10NSA: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX10NSA: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX10NSA: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX10NSA: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX10NSA: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX10NSA: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX10NSA: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX10NSA: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX10NSA: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX10NSA: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10NSA: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.cl.2d), 1, [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10NSA: $vgpr0 = COPY [[UV]](s32) - ; GFX10NSA: $vgpr1 = COPY [[UV1]](s32) - ; GFX10NSA: $vgpr2 = COPY [[UV2]](s32) - ; GFX10NSA: $vgpr3 = COPY [[UV3]](s32) - ; GFX10NSA: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +; GFX6-LABEL: gather4_cl_2d: +; GFX6: ; %bb.0: ; %main_body +; GFX6-NEXT: s_mov_b32 s0, s2 +; GFX6-NEXT: s_mov_b32 s1, s3 +; GFX6-NEXT: s_mov_b32 s2, s4 +; GFX6-NEXT: s_mov_b32 s3, s5 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s8 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: s_mov_b32 s8, s10 +; GFX6-NEXT: s_mov_b32 s9, s11 +; GFX6-NEXT: s_mov_b64 s[14:15], exec +; GFX6-NEXT: s_mov_b32 s10, s12 +; GFX6-NEXT: s_mov_b32 s11, s13 +; GFX6-NEXT: s_wqm_b64 exec, exec +; GFX6-NEXT: s_and_b64 exec, exec, s[14:15] +; GFX6-NEXT: image_gather4_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; return to shader part epilog +; +; GFX10NSA-LABEL: gather4_cl_2d: +; GFX10NSA: ; %bb.0: ; %main_body +; GFX10NSA-NEXT: s_mov_b32 s0, s2 +; GFX10NSA-NEXT: s_mov_b32 s1, s3 +; GFX10NSA-NEXT: s_mov_b32 s2, s4 +; GFX10NSA-NEXT: s_mov_b32 s3, s5 +; GFX10NSA-NEXT: s_mov_b32 s4, s6 +; GFX10NSA-NEXT: s_mov_b32 s5, s7 +; GFX10NSA-NEXT: s_mov_b32 s6, s8 +; GFX10NSA-NEXT: s_mov_b32 s7, s9 +; GFX10NSA-NEXT: s_mov_b32 s8, s10 +; GFX10NSA-NEXT: s_mov_b32 s9, s11 +; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo +; GFX10NSA-NEXT: s_mov_b32 s10, s12 +; GFX10NSA-NEXT: s_mov_b32 s11, s13 +; GFX10NSA-NEXT: ; implicit-def: $vcc_hi +; 
GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 +; GFX10NSA-NEXT: image_gather4_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX10NSA-NEXT: s_waitcnt vmcnt(0) +; GFX10NSA-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.cl.2d.v4f32.f32(i32 1, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) ret <4 x float> %v } define amdgpu_ps <4 x float> @gather4_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %t, float %clamp) { - ; GFX6-LABEL: name: gather4_c_cl_2d - ; GFX6: bb.1.main_body: - ; GFX6: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX6: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX6: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX6: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX6: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX6: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX6: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX6: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX6: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX6: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX6: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX6: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX6: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX6: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX6: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX6: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX6: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX6: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX6: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32) - ; GFX6: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.c.cl.2d), 1, [[BUILD_VECTOR2]](<4 x s32>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX6: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX6: $vgpr0 = COPY [[UV]](s32) - ; GFX6: $vgpr1 = COPY [[UV1]](s32) - ; GFX6: $vgpr2 = COPY [[UV2]](s32) - ; GFX6: $vgpr3 = COPY [[UV3]](s32) - ; GFX6: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - ; GFX10NSA-LABEL: name: gather4_c_cl_2d - ; GFX10NSA: bb.1.main_body: - ; GFX10NSA: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX10NSA: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX10NSA: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX10NSA: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX10NSA: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX10NSA: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX10NSA: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX10NSA: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX10NSA: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX10NSA: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX10NSA: [[COPY9:%[0-9]+]]:_(s32) = COPY 
$sgpr11 - ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10NSA: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10NSA: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.c.cl.2d), 1, [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10NSA: $vgpr0 = COPY [[UV]](s32) - ; GFX10NSA: $vgpr1 = COPY [[UV1]](s32) - ; GFX10NSA: $vgpr2 = COPY [[UV2]](s32) - ; GFX10NSA: $vgpr3 = COPY [[UV3]](s32) - ; GFX10NSA: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +; GFX6-LABEL: gather4_c_cl_2d: +; GFX6: ; %bb.0: ; %main_body +; GFX6-NEXT: s_mov_b32 s0, s2 +; GFX6-NEXT: s_mov_b32 s1, s3 +; GFX6-NEXT: s_mov_b32 s2, s4 +; GFX6-NEXT: s_mov_b32 s3, s5 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s8 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: s_mov_b32 s8, s10 +; GFX6-NEXT: s_mov_b32 s9, s11 +; GFX6-NEXT: s_mov_b64 s[14:15], exec +; GFX6-NEXT: s_mov_b32 s10, s12 +; GFX6-NEXT: s_mov_b32 s11, s13 +; GFX6-NEXT: s_wqm_b64 exec, exec +; GFX6-NEXT: s_and_b64 exec, exec, s[14:15] +; GFX6-NEXT: image_gather4_c_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; return to shader part epilog +; +; GFX10NSA-LABEL: gather4_c_cl_2d: +; GFX10NSA: ; %bb.0: ; %main_body +; GFX10NSA-NEXT: s_mov_b32 s0, s2 +; GFX10NSA-NEXT: s_mov_b32 s1, s3 +; GFX10NSA-NEXT: s_mov_b32 s2, s4 +; GFX10NSA-NEXT: s_mov_b32 s3, s5 +; GFX10NSA-NEXT: s_mov_b32 s4, s6 +; GFX10NSA-NEXT: s_mov_b32 s5, s7 +; GFX10NSA-NEXT: s_mov_b32 s6, s8 +; GFX10NSA-NEXT: s_mov_b32 s7, s9 +; GFX10NSA-NEXT: s_mov_b32 s8, s10 +; GFX10NSA-NEXT: s_mov_b32 s9, s11 +; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo +; GFX10NSA-NEXT: s_mov_b32 s10, s12 +; GFX10NSA-NEXT: s_mov_b32 s11, s13 +; GFX10NSA-NEXT: ; implicit-def: $vcc_hi +; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 +; GFX10NSA-NEXT: image_gather4_c_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX10NSA-NEXT: s_waitcnt vmcnt(0) +; GFX10NSA-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.c.cl.2d.v4f32.f32(i32 1, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) ret <4 x float> %v } define amdgpu_ps <4 x float> @gather4_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %t) { - ; GFX6-LABEL: name: gather4_b_2d - ; GFX6: bb.1.main_body: - ; GFX6: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 - ; 
GFX6: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX6: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX6: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX6: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX6: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX6: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX6: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX6: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX6: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX6: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX6: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX6: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX6: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX6: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX6: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX6: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX6: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32) - ; GFX6: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.b.2d), 1, [[BUILD_VECTOR2]](<3 x s32>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX6: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX6: $vgpr0 = COPY [[UV]](s32) - ; GFX6: $vgpr1 = COPY [[UV1]](s32) - ; GFX6: $vgpr2 = COPY [[UV2]](s32) - ; GFX6: $vgpr3 = COPY [[UV3]](s32) - ; GFX6: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - ; GFX10NSA-LABEL: name: gather4_b_2d - ; GFX10NSA: bb.1.main_body: - ; GFX10NSA: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 - ; GFX10NSA: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX10NSA: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX10NSA: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX10NSA: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX10NSA: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX10NSA: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX10NSA: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX10NSA: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX10NSA: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX10NSA: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10NSA: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.b.2d), 1, [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - 
; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10NSA: $vgpr0 = COPY [[UV]](s32) - ; GFX10NSA: $vgpr1 = COPY [[UV1]](s32) - ; GFX10NSA: $vgpr2 = COPY [[UV2]](s32) - ; GFX10NSA: $vgpr3 = COPY [[UV3]](s32) - ; GFX10NSA: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +; GFX6-LABEL: gather4_b_2d: +; GFX6: ; %bb.0: ; %main_body +; GFX6-NEXT: s_mov_b32 s0, s2 +; GFX6-NEXT: s_mov_b32 s1, s3 +; GFX6-NEXT: s_mov_b32 s2, s4 +; GFX6-NEXT: s_mov_b32 s3, s5 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s8 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: s_mov_b32 s8, s10 +; GFX6-NEXT: s_mov_b32 s9, s11 +; GFX6-NEXT: s_mov_b64 s[14:15], exec +; GFX6-NEXT: s_mov_b32 s10, s12 +; GFX6-NEXT: s_mov_b32 s11, s13 +; GFX6-NEXT: s_wqm_b64 exec, exec +; GFX6-NEXT: s_and_b64 exec, exec, s[14:15] +; GFX6-NEXT: image_gather4_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; return to shader part epilog +; +; GFX10NSA-LABEL: gather4_b_2d: +; GFX10NSA: ; %bb.0: ; %main_body +; GFX10NSA-NEXT: s_mov_b32 s0, s2 +; GFX10NSA-NEXT: s_mov_b32 s1, s3 +; GFX10NSA-NEXT: s_mov_b32 s2, s4 +; GFX10NSA-NEXT: s_mov_b32 s3, s5 +; GFX10NSA-NEXT: s_mov_b32 s4, s6 +; GFX10NSA-NEXT: s_mov_b32 s5, s7 +; GFX10NSA-NEXT: s_mov_b32 s6, s8 +; GFX10NSA-NEXT: s_mov_b32 s7, s9 +; GFX10NSA-NEXT: s_mov_b32 s8, s10 +; GFX10NSA-NEXT: s_mov_b32 s9, s11 +; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo +; GFX10NSA-NEXT: s_mov_b32 s10, s12 +; GFX10NSA-NEXT: s_mov_b32 s11, s13 +; GFX10NSA-NEXT: ; implicit-def: $vcc_hi +; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 +; GFX10NSA-NEXT: image_gather4_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX10NSA-NEXT: s_waitcnt vmcnt(0) +; GFX10NSA-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.b.2d.v4f32.f32.f32(i32 1, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) ret <4 x float> %v } define amdgpu_ps <4 x float> @gather4_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, float %s, float %t) { - ; GFX6-LABEL: name: gather4_c_b_2d - ; GFX6: bb.1.main_body: - ; GFX6: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX6: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX6: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX6: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX6: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX6: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX6: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX6: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX6: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX6: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX6: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX6: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX6: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX6: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX6: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX6: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX6: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX6: 
[[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX6: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32) - ; GFX6: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.c.b.2d), 1, [[BUILD_VECTOR2]](<4 x s32>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX6: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX6: $vgpr0 = COPY [[UV]](s32) - ; GFX6: $vgpr1 = COPY [[UV1]](s32) - ; GFX6: $vgpr2 = COPY [[UV2]](s32) - ; GFX6: $vgpr3 = COPY [[UV3]](s32) - ; GFX6: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - ; GFX10NSA-LABEL: name: gather4_c_b_2d - ; GFX10NSA: bb.1.main_body: - ; GFX10NSA: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX10NSA: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX10NSA: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX10NSA: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX10NSA: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX10NSA: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX10NSA: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX10NSA: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX10NSA: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX10NSA: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX10NSA: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10NSA: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10NSA: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.c.b.2d), 1, [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10NSA: $vgpr0 = COPY [[UV]](s32) - ; GFX10NSA: $vgpr1 = COPY [[UV1]](s32) - ; GFX10NSA: $vgpr2 = COPY [[UV2]](s32) - ; GFX10NSA: $vgpr3 = COPY [[UV3]](s32) - ; GFX10NSA: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +; GFX6-LABEL: gather4_c_b_2d: +; GFX6: ; %bb.0: ; %main_body +; GFX6-NEXT: s_mov_b32 s0, s2 +; GFX6-NEXT: s_mov_b32 s1, s3 +; GFX6-NEXT: s_mov_b32 s2, s4 +; GFX6-NEXT: s_mov_b32 s3, s5 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s8 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: s_mov_b32 s8, s10 +; GFX6-NEXT: s_mov_b32 s9, s11 +; GFX6-NEXT: s_mov_b64 
s[14:15], exec +; GFX6-NEXT: s_mov_b32 s10, s12 +; GFX6-NEXT: s_mov_b32 s11, s13 +; GFX6-NEXT: s_wqm_b64 exec, exec +; GFX6-NEXT: s_and_b64 exec, exec, s[14:15] +; GFX6-NEXT: image_gather4_c_b v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; return to shader part epilog +; +; GFX10NSA-LABEL: gather4_c_b_2d: +; GFX10NSA: ; %bb.0: ; %main_body +; GFX10NSA-NEXT: s_mov_b32 s0, s2 +; GFX10NSA-NEXT: s_mov_b32 s1, s3 +; GFX10NSA-NEXT: s_mov_b32 s2, s4 +; GFX10NSA-NEXT: s_mov_b32 s3, s5 +; GFX10NSA-NEXT: s_mov_b32 s4, s6 +; GFX10NSA-NEXT: s_mov_b32 s5, s7 +; GFX10NSA-NEXT: s_mov_b32 s6, s8 +; GFX10NSA-NEXT: s_mov_b32 s7, s9 +; GFX10NSA-NEXT: s_mov_b32 s8, s10 +; GFX10NSA-NEXT: s_mov_b32 s9, s11 +; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo +; GFX10NSA-NEXT: s_mov_b32 s10, s12 +; GFX10NSA-NEXT: s_mov_b32 s11, s13 +; GFX10NSA-NEXT: ; implicit-def: $vcc_hi +; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 +; GFX10NSA-NEXT: image_gather4_c_b v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX10NSA-NEXT: s_waitcnt vmcnt(0) +; GFX10NSA-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.2d.v4f32.f32.f32(i32 1, float %bias, float %zcompare, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) ret <4 x float> %v } define amdgpu_ps <4 x float> @gather4_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %t, float %clamp) { - ; GFX6-LABEL: name: gather4_b_cl_2d - ; GFX6: bb.1.main_body: - ; GFX6: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX6: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX6: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX6: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX6: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX6: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX6: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX6: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX6: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX6: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX6: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX6: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX6: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX6: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX6: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX6: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX6: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX6: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX6: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32) - ; GFX6: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.b.cl.2d), 1, [[BUILD_VECTOR2]](<4 x s32>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX6: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX6: $vgpr0 = COPY [[UV]](s32) - ; GFX6: $vgpr1 = COPY [[UV1]](s32) - ; 
GFX6: $vgpr2 = COPY [[UV2]](s32) - ; GFX6: $vgpr3 = COPY [[UV3]](s32) - ; GFX6: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - ; GFX10NSA-LABEL: name: gather4_b_cl_2d - ; GFX10NSA: bb.1.main_body: - ; GFX10NSA: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX10NSA: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX10NSA: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX10NSA: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX10NSA: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX10NSA: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX10NSA: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX10NSA: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX10NSA: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX10NSA: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX10NSA: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10NSA: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10NSA: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.b.cl.2d), 1, [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10NSA: $vgpr0 = COPY [[UV]](s32) - ; GFX10NSA: $vgpr1 = COPY [[UV1]](s32) - ; GFX10NSA: $vgpr2 = COPY [[UV2]](s32) - ; GFX10NSA: $vgpr3 = COPY [[UV3]](s32) - ; GFX10NSA: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +; GFX6-LABEL: gather4_b_cl_2d: +; GFX6: ; %bb.0: ; %main_body +; GFX6-NEXT: s_mov_b32 s0, s2 +; GFX6-NEXT: s_mov_b32 s1, s3 +; GFX6-NEXT: s_mov_b32 s2, s4 +; GFX6-NEXT: s_mov_b32 s3, s5 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s8 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: s_mov_b32 s8, s10 +; GFX6-NEXT: s_mov_b32 s9, s11 +; GFX6-NEXT: s_mov_b64 s[14:15], exec +; GFX6-NEXT: s_mov_b32 s10, s12 +; GFX6-NEXT: s_mov_b32 s11, s13 +; GFX6-NEXT: s_wqm_b64 exec, exec +; GFX6-NEXT: s_and_b64 exec, exec, s[14:15] +; GFX6-NEXT: image_gather4_b_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; return to shader part epilog +; +; GFX10NSA-LABEL: gather4_b_cl_2d: +; GFX10NSA: ; %bb.0: ; %main_body +; GFX10NSA-NEXT: s_mov_b32 s0, s2 +; GFX10NSA-NEXT: s_mov_b32 s1, s3 +; GFX10NSA-NEXT: s_mov_b32 s2, s4 +; GFX10NSA-NEXT: s_mov_b32 s3, s5 +; GFX10NSA-NEXT: s_mov_b32 s4, s6 +; GFX10NSA-NEXT: s_mov_b32 s5, s7 +; GFX10NSA-NEXT: s_mov_b32 s6, s8 +; GFX10NSA-NEXT: s_mov_b32 s7, s9 +; GFX10NSA-NEXT: s_mov_b32 s8, s10 +; GFX10NSA-NEXT: s_mov_b32 s9, s11 +; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo +; GFX10NSA-NEXT: s_mov_b32 
s10, s12 +; GFX10NSA-NEXT: s_mov_b32 s11, s13 +; GFX10NSA-NEXT: ; implicit-def: $vcc_hi +; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 +; GFX10NSA-NEXT: image_gather4_b_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX10NSA-NEXT: s_waitcnt vmcnt(0) +; GFX10NSA-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.b.cl.2d.v4f32.f32.f32(i32 1, float %bias, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) ret <4 x float> %v } define amdgpu_ps <4 x float> @gather4_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, float %s, float %t, float %clamp) { - ; GFX6-LABEL: name: gather4_c_b_cl_2d - ; GFX6: bb.1.main_body: - ; GFX6: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 - ; GFX6: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX6: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX6: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX6: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX6: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX6: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX6: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX6: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX6: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX6: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX6: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX6: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX6: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX6: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX6: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX6: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX6: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX6: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX6: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX6: [[BUILD_VECTOR2:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[DEF]](s32), [[DEF]](s32), [[DEF]](s32) - ; GFX6: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.c.b.cl.2d), 1, [[BUILD_VECTOR2]](<8 x s32>), $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX6: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX6: $vgpr0 = COPY [[UV]](s32) - ; GFX6: $vgpr1 = COPY [[UV1]](s32) - ; GFX6: $vgpr2 = COPY [[UV2]](s32) - ; GFX6: $vgpr3 = COPY [[UV3]](s32) - ; GFX6: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - ; GFX10NSA-LABEL: name: gather4_c_b_cl_2d - ; GFX10NSA: bb.1.main_body: - ; GFX10NSA: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 - ; GFX10NSA: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX10NSA: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX10NSA: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX10NSA: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 
- ; GFX10NSA: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX10NSA: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX10NSA: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX10NSA: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX10NSA: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX10NSA: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10NSA: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10NSA: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX10NSA: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.c.b.cl.2d), 1, [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10NSA: $vgpr0 = COPY [[UV]](s32) - ; GFX10NSA: $vgpr1 = COPY [[UV1]](s32) - ; GFX10NSA: $vgpr2 = COPY [[UV2]](s32) - ; GFX10NSA: $vgpr3 = COPY [[UV3]](s32) - ; GFX10NSA: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +; GFX6-LABEL: gather4_c_b_cl_2d: +; GFX6: ; %bb.0: ; %main_body +; GFX6-NEXT: s_mov_b32 s0, s2 +; GFX6-NEXT: s_mov_b32 s1, s3 +; GFX6-NEXT: s_mov_b32 s2, s4 +; GFX6-NEXT: s_mov_b32 s3, s5 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s8 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: s_mov_b32 s8, s10 +; GFX6-NEXT: s_mov_b32 s9, s11 +; GFX6-NEXT: s_mov_b64 s[14:15], exec +; GFX6-NEXT: s_mov_b32 s10, s12 +; GFX6-NEXT: s_mov_b32 s11, s13 +; GFX6-NEXT: s_wqm_b64 exec, exec +; GFX6-NEXT: s_and_b64 exec, exec, s[14:15] +; GFX6-NEXT: image_gather4_c_b_cl v[0:3], v[0:7], s[0:7], s[8:11] dmask:0x1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; return to shader part epilog +; +; GFX10NSA-LABEL: gather4_c_b_cl_2d: +; GFX10NSA: ; %bb.0: ; %main_body +; GFX10NSA-NEXT: s_mov_b32 s0, s2 +; GFX10NSA-NEXT: s_mov_b32 s1, s3 +; GFX10NSA-NEXT: s_mov_b32 s2, s4 +; GFX10NSA-NEXT: s_mov_b32 s3, s5 +; GFX10NSA-NEXT: s_mov_b32 s4, s6 +; GFX10NSA-NEXT: s_mov_b32 s5, s7 +; GFX10NSA-NEXT: s_mov_b32 s6, s8 +; GFX10NSA-NEXT: s_mov_b32 s7, s9 +; GFX10NSA-NEXT: s_mov_b32 s8, s10 +; GFX10NSA-NEXT: s_mov_b32 s9, s11 +; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo +; GFX10NSA-NEXT: s_mov_b32 s10, s12 +; GFX10NSA-NEXT: s_mov_b32 s11, s13 +; GFX10NSA-NEXT: ; implicit-def: $vcc_hi +; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 +; GFX10NSA-NEXT: image_gather4_c_b_cl v[0:3], v[0:7], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX10NSA-NEXT: s_waitcnt vmcnt(0) +; GFX10NSA-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f32(i32 1, float %bias, float %zcompare, float %s, float %t, float %clamp, 
<8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) ret <4 x float> %v } define amdgpu_ps <4 x float> @gather4_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %lod) { - ; GFX6-LABEL: name: gather4_l_2d - ; GFX6: bb.1.main_body: - ; GFX6: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 - ; GFX6: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX6: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX6: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX6: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX6: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX6: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX6: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX6: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX6: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX6: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX6: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX6: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX6: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX6: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX6: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX6: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX6: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32) - ; GFX6: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.l.2d), 1, [[BUILD_VECTOR2]](<3 x s32>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX6: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX6: $vgpr0 = COPY [[UV]](s32) - ; GFX6: $vgpr1 = COPY [[UV1]](s32) - ; GFX6: $vgpr2 = COPY [[UV2]](s32) - ; GFX6: $vgpr3 = COPY [[UV3]](s32) - ; GFX6: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - ; GFX10NSA-LABEL: name: gather4_l_2d - ; GFX10NSA: bb.1.main_body: - ; GFX10NSA: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 - ; GFX10NSA: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX10NSA: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX10NSA: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX10NSA: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX10NSA: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX10NSA: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX10NSA: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX10NSA: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX10NSA: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX10NSA: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10NSA: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX10NSA: 
[[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.l.2d), 1, [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10NSA: $vgpr0 = COPY [[UV]](s32) - ; GFX10NSA: $vgpr1 = COPY [[UV1]](s32) - ; GFX10NSA: $vgpr2 = COPY [[UV2]](s32) - ; GFX10NSA: $vgpr3 = COPY [[UV3]](s32) - ; GFX10NSA: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +; GFX6-LABEL: gather4_l_2d: +; GFX6: ; %bb.0: ; %main_body +; GFX6-NEXT: s_mov_b32 s0, s2 +; GFX6-NEXT: s_mov_b32 s1, s3 +; GFX6-NEXT: s_mov_b32 s2, s4 +; GFX6-NEXT: s_mov_b32 s3, s5 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s8 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: s_mov_b32 s8, s10 +; GFX6-NEXT: s_mov_b32 s9, s11 +; GFX6-NEXT: s_mov_b32 s10, s12 +; GFX6-NEXT: s_mov_b32 s11, s13 +; GFX6-NEXT: image_gather4_l v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; return to shader part epilog +; +; GFX10NSA-LABEL: gather4_l_2d: +; GFX10NSA: ; %bb.0: ; %main_body +; GFX10NSA-NEXT: s_mov_b32 s0, s2 +; GFX10NSA-NEXT: s_mov_b32 s1, s3 +; GFX10NSA-NEXT: s_mov_b32 s2, s4 +; GFX10NSA-NEXT: s_mov_b32 s3, s5 +; GFX10NSA-NEXT: s_mov_b32 s4, s6 +; GFX10NSA-NEXT: s_mov_b32 s5, s7 +; GFX10NSA-NEXT: s_mov_b32 s6, s8 +; GFX10NSA-NEXT: s_mov_b32 s7, s9 +; GFX10NSA-NEXT: s_mov_b32 s8, s10 +; GFX10NSA-NEXT: s_mov_b32 s9, s11 +; GFX10NSA-NEXT: s_mov_b32 s10, s12 +; GFX10NSA-NEXT: s_mov_b32 s11, s13 +; GFX10NSA-NEXT: ; implicit-def: $vcc_hi +; GFX10NSA-NEXT: image_gather4_l v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX10NSA-NEXT: s_waitcnt vmcnt(0) +; GFX10NSA-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.l.2d.v4f32.f32(i32 1, float %s, float %t, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) ret <4 x float> %v } define amdgpu_ps <4 x float> @gather4_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %t, float %lod) { - ; GFX6-LABEL: name: gather4_c_l_2d - ; GFX6: bb.1.main_body: - ; GFX6: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX6: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX6: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX6: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX6: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX6: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX6: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX6: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX6: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX6: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX6: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX6: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX6: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX6: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX6: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX6: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX6: 
[[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX6: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX6: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32) - ; GFX6: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.c.l.2d), 1, [[BUILD_VECTOR2]](<4 x s32>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX6: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX6: $vgpr0 = COPY [[UV]](s32) - ; GFX6: $vgpr1 = COPY [[UV1]](s32) - ; GFX6: $vgpr2 = COPY [[UV2]](s32) - ; GFX6: $vgpr3 = COPY [[UV3]](s32) - ; GFX6: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - ; GFX10NSA-LABEL: name: gather4_c_l_2d - ; GFX10NSA: bb.1.main_body: - ; GFX10NSA: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX10NSA: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX10NSA: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX10NSA: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX10NSA: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX10NSA: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX10NSA: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX10NSA: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX10NSA: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX10NSA: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX10NSA: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10NSA: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10NSA: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.c.l.2d), 1, [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10NSA: $vgpr0 = COPY [[UV]](s32) - ; GFX10NSA: $vgpr1 = COPY [[UV1]](s32) - ; GFX10NSA: $vgpr2 = COPY [[UV2]](s32) - ; GFX10NSA: $vgpr3 = COPY [[UV3]](s32) - ; GFX10NSA: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +; GFX6-LABEL: gather4_c_l_2d: +; GFX6: ; %bb.0: ; %main_body +; GFX6-NEXT: s_mov_b32 s0, s2 +; GFX6-NEXT: s_mov_b32 s1, s3 +; GFX6-NEXT: s_mov_b32 s2, s4 +; GFX6-NEXT: s_mov_b32 s3, s5 +; GFX6-NEXT: s_mov_b32 
s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s8 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: s_mov_b32 s8, s10 +; GFX6-NEXT: s_mov_b32 s9, s11 +; GFX6-NEXT: s_mov_b32 s10, s12 +; GFX6-NEXT: s_mov_b32 s11, s13 +; GFX6-NEXT: image_gather4_c_l v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; return to shader part epilog +; +; GFX10NSA-LABEL: gather4_c_l_2d: +; GFX10NSA: ; %bb.0: ; %main_body +; GFX10NSA-NEXT: s_mov_b32 s0, s2 +; GFX10NSA-NEXT: s_mov_b32 s1, s3 +; GFX10NSA-NEXT: s_mov_b32 s2, s4 +; GFX10NSA-NEXT: s_mov_b32 s3, s5 +; GFX10NSA-NEXT: s_mov_b32 s4, s6 +; GFX10NSA-NEXT: s_mov_b32 s5, s7 +; GFX10NSA-NEXT: s_mov_b32 s6, s8 +; GFX10NSA-NEXT: s_mov_b32 s7, s9 +; GFX10NSA-NEXT: s_mov_b32 s8, s10 +; GFX10NSA-NEXT: s_mov_b32 s9, s11 +; GFX10NSA-NEXT: s_mov_b32 s10, s12 +; GFX10NSA-NEXT: s_mov_b32 s11, s13 +; GFX10NSA-NEXT: ; implicit-def: $vcc_hi +; GFX10NSA-NEXT: image_gather4_c_l v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX10NSA-NEXT: s_waitcnt vmcnt(0) +; GFX10NSA-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.c.l.2d.v4f32.f32(i32 1, float %zcompare, float %s, float %t, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) ret <4 x float> %v } define amdgpu_ps <4 x float> @gather4_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) { - ; GFX6-LABEL: name: gather4_lz_2d - ; GFX6: bb.1.main_body: - ; GFX6: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1 - ; GFX6: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX6: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX6: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX6: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX6: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX6: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX6: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX6: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX6: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX6: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX6: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX6: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX6: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX6: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX6: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX6: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32) - ; GFX6: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.lz.2d), 1, [[BUILD_VECTOR2]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX6: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX6: $vgpr0 = COPY [[UV]](s32) - ; GFX6: $vgpr1 = COPY [[UV1]](s32) - ; GFX6: $vgpr2 = COPY [[UV2]](s32) - ; GFX6: $vgpr3 = COPY [[UV3]](s32) - ; GFX6: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - ; GFX10NSA-LABEL: name: gather4_lz_2d - ; GFX10NSA: bb.1.main_body: - ; GFX10NSA: liveins: $sgpr2, $sgpr3, 
$sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1 - ; GFX10NSA: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX10NSA: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX10NSA: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX10NSA: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX10NSA: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX10NSA: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX10NSA: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX10NSA: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX10NSA: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX10NSA: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10NSA: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX10NSA: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32) - ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.lz.2d), 1, [[BUILD_VECTOR2]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10NSA: $vgpr0 = COPY [[UV]](s32) - ; GFX10NSA: $vgpr1 = COPY [[UV1]](s32) - ; GFX10NSA: $vgpr2 = COPY [[UV2]](s32) - ; GFX10NSA: $vgpr3 = COPY [[UV3]](s32) - ; GFX10NSA: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +; GFX6-LABEL: gather4_lz_2d: +; GFX6: ; %bb.0: ; %main_body +; GFX6-NEXT: s_mov_b32 s0, s2 +; GFX6-NEXT: s_mov_b32 s1, s3 +; GFX6-NEXT: s_mov_b32 s2, s4 +; GFX6-NEXT: s_mov_b32 s3, s5 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s8 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: s_mov_b32 s8, s10 +; GFX6-NEXT: s_mov_b32 s9, s11 +; GFX6-NEXT: s_mov_b32 s10, s12 +; GFX6-NEXT: s_mov_b32 s11, s13 +; GFX6-NEXT: image_gather4_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; return to shader part epilog +; +; GFX10NSA-LABEL: gather4_lz_2d: +; GFX10NSA: ; %bb.0: ; %main_body +; GFX10NSA-NEXT: s_mov_b32 s0, s2 +; GFX10NSA-NEXT: s_mov_b32 s1, s3 +; GFX10NSA-NEXT: s_mov_b32 s2, s4 +; GFX10NSA-NEXT: s_mov_b32 s3, s5 +; GFX10NSA-NEXT: s_mov_b32 s4, s6 +; GFX10NSA-NEXT: s_mov_b32 s5, s7 +; GFX10NSA-NEXT: s_mov_b32 s6, s8 +; GFX10NSA-NEXT: s_mov_b32 s7, s9 +; GFX10NSA-NEXT: s_mov_b32 s8, s10 +; GFX10NSA-NEXT: s_mov_b32 s9, s11 +; GFX10NSA-NEXT: s_mov_b32 s10, s12 +; GFX10NSA-NEXT: s_mov_b32 s11, s13 +; GFX10NSA-NEXT: ; implicit-def: $vcc_hi +; GFX10NSA-NEXT: image_gather4_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX10NSA-NEXT: s_waitcnt vmcnt(0) +; GFX10NSA-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32 1, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) ret <4 x float> %v } define amdgpu_ps <4 x float> 
@gather4_c_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %t) { - ; GFX6-LABEL: name: gather4_c_lz_2d - ; GFX6: bb.1.main_body: - ; GFX6: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 - ; GFX6: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX6: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX6: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX6: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX6: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX6: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX6: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX6: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX6: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX6: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX6: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX6: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX6: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX6: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX6: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX6: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX6: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32) - ; GFX6: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.c.lz.2d), 1, [[BUILD_VECTOR2]](<3 x s32>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX6: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX6: $vgpr0 = COPY [[UV]](s32) - ; GFX6: $vgpr1 = COPY [[UV1]](s32) - ; GFX6: $vgpr2 = COPY [[UV2]](s32) - ; GFX6: $vgpr3 = COPY [[UV3]](s32) - ; GFX6: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - ; GFX10NSA-LABEL: name: gather4_c_lz_2d - ; GFX10NSA: bb.1.main_body: - ; GFX10NSA: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 - ; GFX10NSA: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX10NSA: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX10NSA: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX10NSA: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX10NSA: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX10NSA: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX10NSA: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX10NSA: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX10NSA: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX10NSA: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10NSA: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), 
[[COPY11]](s32) - ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.c.lz.2d), 1, [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10NSA: $vgpr0 = COPY [[UV]](s32) - ; GFX10NSA: $vgpr1 = COPY [[UV1]](s32) - ; GFX10NSA: $vgpr2 = COPY [[UV2]](s32) - ; GFX10NSA: $vgpr3 = COPY [[UV3]](s32) - ; GFX10NSA: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +; GFX6-LABEL: gather4_c_lz_2d: +; GFX6: ; %bb.0: ; %main_body +; GFX6-NEXT: s_mov_b32 s0, s2 +; GFX6-NEXT: s_mov_b32 s1, s3 +; GFX6-NEXT: s_mov_b32 s2, s4 +; GFX6-NEXT: s_mov_b32 s3, s5 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s8 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: s_mov_b32 s8, s10 +; GFX6-NEXT: s_mov_b32 s9, s11 +; GFX6-NEXT: s_mov_b32 s10, s12 +; GFX6-NEXT: s_mov_b32 s11, s13 +; GFX6-NEXT: image_gather4_c_lz v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; return to shader part epilog +; +; GFX10NSA-LABEL: gather4_c_lz_2d: +; GFX10NSA: ; %bb.0: ; %main_body +; GFX10NSA-NEXT: s_mov_b32 s0, s2 +; GFX10NSA-NEXT: s_mov_b32 s1, s3 +; GFX10NSA-NEXT: s_mov_b32 s2, s4 +; GFX10NSA-NEXT: s_mov_b32 s3, s5 +; GFX10NSA-NEXT: s_mov_b32 s4, s6 +; GFX10NSA-NEXT: s_mov_b32 s5, s7 +; GFX10NSA-NEXT: s_mov_b32 s6, s8 +; GFX10NSA-NEXT: s_mov_b32 s7, s9 +; GFX10NSA-NEXT: s_mov_b32 s8, s10 +; GFX10NSA-NEXT: s_mov_b32 s9, s11 +; GFX10NSA-NEXT: s_mov_b32 s10, s12 +; GFX10NSA-NEXT: s_mov_b32 s11, s13 +; GFX10NSA-NEXT: ; implicit-def: $vcc_hi +; GFX10NSA-NEXT: image_gather4_c_lz v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX10NSA-NEXT: s_waitcnt vmcnt(0) +; GFX10NSA-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.c.lz.2d.v4f32.f32(i32 1, float %zcompare, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) ret <4 x float> %v } define amdgpu_ps <4 x float> @gather4_2d_dmask_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) { - ; GFX6-LABEL: name: gather4_2d_dmask_2 - ; GFX6: bb.1.main_body: - ; GFX6: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1 - ; GFX6: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX6: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX6: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX6: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX6: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX6: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX6: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX6: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX6: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX6: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX6: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX6: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX6: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX6: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX6: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR 
[[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX6: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32) - ; GFX6: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.2d), 2, [[BUILD_VECTOR2]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX6: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX6: $vgpr0 = COPY [[UV]](s32) - ; GFX6: $vgpr1 = COPY [[UV1]](s32) - ; GFX6: $vgpr2 = COPY [[UV2]](s32) - ; GFX6: $vgpr3 = COPY [[UV3]](s32) - ; GFX6: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - ; GFX10NSA-LABEL: name: gather4_2d_dmask_2 - ; GFX10NSA: bb.1.main_body: - ; GFX10NSA: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1 - ; GFX10NSA: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX10NSA: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX10NSA: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX10NSA: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX10NSA: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX10NSA: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX10NSA: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX10NSA: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX10NSA: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX10NSA: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10NSA: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX10NSA: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32) - ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.2d), 2, [[BUILD_VECTOR2]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10NSA: $vgpr0 = COPY [[UV]](s32) - ; GFX10NSA: $vgpr1 = COPY [[UV1]](s32) - ; GFX10NSA: $vgpr2 = COPY [[UV2]](s32) - ; GFX10NSA: $vgpr3 = COPY [[UV3]](s32) - ; GFX10NSA: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +; GFX6-LABEL: gather4_2d_dmask_2: +; GFX6: ; %bb.0: ; %main_body +; GFX6-NEXT: s_mov_b32 s0, s2 +; GFX6-NEXT: s_mov_b32 s1, s3 +; GFX6-NEXT: s_mov_b32 s2, s4 +; GFX6-NEXT: s_mov_b32 s3, s5 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s8 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: s_mov_b32 s8, s10 +; GFX6-NEXT: s_mov_b32 s9, s11 +; GFX6-NEXT: s_mov_b64 s[14:15], exec +; GFX6-NEXT: s_mov_b32 s10, s12 +; GFX6-NEXT: s_mov_b32 s11, s13 +; GFX6-NEXT: s_wqm_b64 exec, exec +; GFX6-NEXT: s_and_b64 exec, exec, 
s[14:15] +; GFX6-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x2 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; return to shader part epilog +; +; GFX10NSA-LABEL: gather4_2d_dmask_2: +; GFX10NSA: ; %bb.0: ; %main_body +; GFX10NSA-NEXT: s_mov_b32 s0, s2 +; GFX10NSA-NEXT: s_mov_b32 s1, s3 +; GFX10NSA-NEXT: s_mov_b32 s2, s4 +; GFX10NSA-NEXT: s_mov_b32 s3, s5 +; GFX10NSA-NEXT: s_mov_b32 s4, s6 +; GFX10NSA-NEXT: s_mov_b32 s5, s7 +; GFX10NSA-NEXT: s_mov_b32 s6, s8 +; GFX10NSA-NEXT: s_mov_b32 s7, s9 +; GFX10NSA-NEXT: s_mov_b32 s8, s10 +; GFX10NSA-NEXT: s_mov_b32 s9, s11 +; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo +; GFX10NSA-NEXT: s_mov_b32 s10, s12 +; GFX10NSA-NEXT: s_mov_b32 s11, s13 +; GFX10NSA-NEXT: ; implicit-def: $vcc_hi +; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 +; GFX10NSA-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x2 dim:SQ_RSRC_IMG_2D +; GFX10NSA-NEXT: s_waitcnt vmcnt(0) +; GFX10NSA-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32 2, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) ret <4 x float> %v } define amdgpu_ps <4 x float> @gather4_2d_dmask_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) { - ; GFX6-LABEL: name: gather4_2d_dmask_4 - ; GFX6: bb.1.main_body: - ; GFX6: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1 - ; GFX6: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX6: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX6: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX6: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX6: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX6: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX6: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX6: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX6: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX6: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX6: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX6: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX6: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX6: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX6: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX6: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32) - ; GFX6: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.2d), 4, [[BUILD_VECTOR2]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX6: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX6: $vgpr0 = COPY [[UV]](s32) - ; GFX6: $vgpr1 = COPY [[UV1]](s32) - ; GFX6: $vgpr2 = COPY [[UV2]](s32) - ; GFX6: $vgpr3 = COPY [[UV3]](s32) - ; GFX6: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - ; GFX10NSA-LABEL: name: gather4_2d_dmask_4 - ; GFX10NSA: bb.1.main_body: - ; GFX10NSA: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1 - ; 
GFX10NSA: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX10NSA: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX10NSA: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX10NSA: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX10NSA: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX10NSA: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX10NSA: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX10NSA: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX10NSA: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX10NSA: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10NSA: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX10NSA: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32) - ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.2d), 4, [[BUILD_VECTOR2]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10NSA: $vgpr0 = COPY [[UV]](s32) - ; GFX10NSA: $vgpr1 = COPY [[UV1]](s32) - ; GFX10NSA: $vgpr2 = COPY [[UV2]](s32) - ; GFX10NSA: $vgpr3 = COPY [[UV3]](s32) - ; GFX10NSA: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +; GFX6-LABEL: gather4_2d_dmask_4: +; GFX6: ; %bb.0: ; %main_body +; GFX6-NEXT: s_mov_b32 s0, s2 +; GFX6-NEXT: s_mov_b32 s1, s3 +; GFX6-NEXT: s_mov_b32 s2, s4 +; GFX6-NEXT: s_mov_b32 s3, s5 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s8 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: s_mov_b32 s8, s10 +; GFX6-NEXT: s_mov_b32 s9, s11 +; GFX6-NEXT: s_mov_b64 s[14:15], exec +; GFX6-NEXT: s_mov_b32 s10, s12 +; GFX6-NEXT: s_mov_b32 s11, s13 +; GFX6-NEXT: s_wqm_b64 exec, exec +; GFX6-NEXT: s_and_b64 exec, exec, s[14:15] +; GFX6-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x4 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; return to shader part epilog +; +; GFX10NSA-LABEL: gather4_2d_dmask_4: +; GFX10NSA: ; %bb.0: ; %main_body +; GFX10NSA-NEXT: s_mov_b32 s0, s2 +; GFX10NSA-NEXT: s_mov_b32 s1, s3 +; GFX10NSA-NEXT: s_mov_b32 s2, s4 +; GFX10NSA-NEXT: s_mov_b32 s3, s5 +; GFX10NSA-NEXT: s_mov_b32 s4, s6 +; GFX10NSA-NEXT: s_mov_b32 s5, s7 +; GFX10NSA-NEXT: s_mov_b32 s6, s8 +; GFX10NSA-NEXT: s_mov_b32 s7, s9 +; GFX10NSA-NEXT: s_mov_b32 s8, s10 +; GFX10NSA-NEXT: s_mov_b32 s9, s11 +; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo +; GFX10NSA-NEXT: s_mov_b32 s10, s12 +; GFX10NSA-NEXT: s_mov_b32 s11, s13 +; GFX10NSA-NEXT: ; implicit-def: $vcc_hi +; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 +; GFX10NSA-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D +; GFX10NSA-NEXT: s_waitcnt vmcnt(0) +; GFX10NSA-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> 
@llvm.amdgcn.image.gather4.2d.v4f32.f32(i32 4, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) ret <4 x float> %v } define amdgpu_ps <4 x float> @gather4_2d_dmask_8(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) { - ; GFX6-LABEL: name: gather4_2d_dmask_8 - ; GFX6: bb.1.main_body: - ; GFX6: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1 - ; GFX6: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX6: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX6: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX6: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX6: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX6: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX6: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX6: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX6: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX6: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX6: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX6: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX6: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX6: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX6: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX6: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32) - ; GFX6: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.2d), 8, [[BUILD_VECTOR2]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX6: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX6: $vgpr0 = COPY [[UV]](s32) - ; GFX6: $vgpr1 = COPY [[UV1]](s32) - ; GFX6: $vgpr2 = COPY [[UV2]](s32) - ; GFX6: $vgpr3 = COPY [[UV3]](s32) - ; GFX6: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - ; GFX10NSA-LABEL: name: gather4_2d_dmask_8 - ; GFX10NSA: bb.1.main_body: - ; GFX10NSA: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1 - ; GFX10NSA: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX10NSA: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX10NSA: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX10NSA: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX10NSA: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX10NSA: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX10NSA: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX10NSA: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX10NSA: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX10NSA: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10NSA: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), 
[[COPY10]](s32), [[COPY11]](s32) - ; GFX10NSA: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32) - ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.2d), 8, [[BUILD_VECTOR2]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10NSA: $vgpr0 = COPY [[UV]](s32) - ; GFX10NSA: $vgpr1 = COPY [[UV1]](s32) - ; GFX10NSA: $vgpr2 = COPY [[UV2]](s32) - ; GFX10NSA: $vgpr3 = COPY [[UV3]](s32) - ; GFX10NSA: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +; GFX6-LABEL: gather4_2d_dmask_8: +; GFX6: ; %bb.0: ; %main_body +; GFX6-NEXT: s_mov_b32 s0, s2 +; GFX6-NEXT: s_mov_b32 s1, s3 +; GFX6-NEXT: s_mov_b32 s2, s4 +; GFX6-NEXT: s_mov_b32 s3, s5 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s8 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: s_mov_b32 s8, s10 +; GFX6-NEXT: s_mov_b32 s9, s11 +; GFX6-NEXT: s_mov_b64 s[14:15], exec +; GFX6-NEXT: s_mov_b32 s10, s12 +; GFX6-NEXT: s_mov_b32 s11, s13 +; GFX6-NEXT: s_wqm_b64 exec, exec +; GFX6-NEXT: s_and_b64 exec, exec, s[14:15] +; GFX6-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x8 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; return to shader part epilog +; +; GFX10NSA-LABEL: gather4_2d_dmask_8: +; GFX10NSA: ; %bb.0: ; %main_body +; GFX10NSA-NEXT: s_mov_b32 s0, s2 +; GFX10NSA-NEXT: s_mov_b32 s1, s3 +; GFX10NSA-NEXT: s_mov_b32 s2, s4 +; GFX10NSA-NEXT: s_mov_b32 s3, s5 +; GFX10NSA-NEXT: s_mov_b32 s4, s6 +; GFX10NSA-NEXT: s_mov_b32 s5, s7 +; GFX10NSA-NEXT: s_mov_b32 s6, s8 +; GFX10NSA-NEXT: s_mov_b32 s7, s9 +; GFX10NSA-NEXT: s_mov_b32 s8, s10 +; GFX10NSA-NEXT: s_mov_b32 s9, s11 +; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo +; GFX10NSA-NEXT: s_mov_b32 s10, s12 +; GFX10NSA-NEXT: s_mov_b32 s11, s13 +; GFX10NSA-NEXT: ; implicit-def: $vcc_hi +; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 +; GFX10NSA-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x8 dim:SQ_RSRC_IMG_2D +; GFX10NSA-NEXT: s_waitcnt vmcnt(0) +; GFX10NSA-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32 8, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) ret <4 x float> %v diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.o.dim.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.o.dim.ll index 69a75ea..de5cd80 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.o.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.o.dim.ll @@ -1,771 +1,548 @@ -; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -stop-after=legalizer -o - %s | FileCheck -check-prefix=GFX6 %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -stop-after=legalizer -o - %s | FileCheck -check-prefix=GFX10 %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -o - %s | FileCheck -check-prefix=GFX6 %s +; RUN: llc -global-isel 
-mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -o - %s | FileCheck -check-prefix=GFX10 %s define amdgpu_ps <4 x float> @gather4_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %s, float %t) { - ; GFX6-LABEL: name: gather4_o_2d - ; GFX6: bb.1.main_body: - ; GFX6: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 - ; GFX6: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX6: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX6: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX6: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX6: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX6: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX6: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX6: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX6: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX6: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX6: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX6: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX6: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX6: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX6: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX6: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX6: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32) - ; GFX6: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.o.2d), 1, [[BUILD_VECTOR2]](<3 x s32>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX6: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX6: $vgpr0 = COPY [[UV]](s32) - ; GFX6: $vgpr1 = COPY [[UV1]](s32) - ; GFX6: $vgpr2 = COPY [[UV2]](s32) - ; GFX6: $vgpr3 = COPY [[UV3]](s32) - ; GFX6: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - ; GFX10-LABEL: name: gather4_o_2d - ; GFX10: bb.1.main_body: - ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 - ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), 
[[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.o.2d), 1, [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10: $vgpr0 = COPY [[UV]](s32) - ; GFX10: $vgpr1 = COPY [[UV1]](s32) - ; GFX10: $vgpr2 = COPY [[UV2]](s32) - ; GFX10: $vgpr3 = COPY [[UV3]](s32) - ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +; GFX6-LABEL: gather4_o_2d: +; GFX6: ; %bb.0: ; %main_body +; GFX6-NEXT: s_mov_b32 s0, s2 +; GFX6-NEXT: s_mov_b32 s1, s3 +; GFX6-NEXT: s_mov_b32 s2, s4 +; GFX6-NEXT: s_mov_b32 s3, s5 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s8 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: s_mov_b32 s8, s10 +; GFX6-NEXT: s_mov_b32 s9, s11 +; GFX6-NEXT: s_mov_b64 s[14:15], exec +; GFX6-NEXT: s_mov_b32 s10, s12 +; GFX6-NEXT: s_mov_b32 s11, s13 +; GFX6-NEXT: s_wqm_b64 exec, exec +; GFX6-NEXT: s_and_b64 exec, exec, s[14:15] +; GFX6-NEXT: image_gather4_o v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: gather4_o_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: s_mov_b32 s8, s10 +; GFX10-NEXT: s_mov_b32 s9, s11 +; GFX10-NEXT: s_mov_b32 s14, exec_lo +; GFX10-NEXT: s_mov_b32 s10, s12 +; GFX10-NEXT: s_mov_b32 s11, s13 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s14 +; GFX10-NEXT: image_gather4_o v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.o.2d.v4f32.f32(i32 1, i32 %offset, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) ret <4 x float> %v } define amdgpu_ps <4 x float> @gather4_c_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %s, float %t) { - ; GFX6-LABEL: name: gather4_c_o_2d - ; GFX6: bb.1.main_body: - ; GFX6: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX6: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX6: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX6: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX6: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX6: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX6: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX6: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX6: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX6: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX6: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX6: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX6: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX6: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX6: [[COPY14:%[0-9]+]]:_(s32) = 
COPY $vgpr2 - ; GFX6: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX6: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX6: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX6: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32) - ; GFX6: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.c.o.2d), 1, [[BUILD_VECTOR2]](<4 x s32>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX6: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX6: $vgpr0 = COPY [[UV]](s32) - ; GFX6: $vgpr1 = COPY [[UV1]](s32) - ; GFX6: $vgpr2 = COPY [[UV2]](s32) - ; GFX6: $vgpr3 = COPY [[UV3]](s32) - ; GFX6: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - ; GFX10-LABEL: name: gather4_c_o_2d - ; GFX10: bb.1.main_body: - ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.c.o.2d), 1, [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10: $vgpr0 = COPY [[UV]](s32) - ; GFX10: $vgpr1 = COPY [[UV1]](s32) - ; GFX10: $vgpr2 = COPY [[UV2]](s32) - ; GFX10: $vgpr3 = COPY [[UV3]](s32) - ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +; GFX6-LABEL: gather4_c_o_2d: +; GFX6: ; %bb.0: ; %main_body +; GFX6-NEXT: s_mov_b32 s0, s2 +; GFX6-NEXT: s_mov_b32 s1, s3 +; GFX6-NEXT: s_mov_b32 s2, s4 +; GFX6-NEXT: s_mov_b32 s3, s5 +; GFX6-NEXT: s_mov_b32 s4, s6 +; 
GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s8 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: s_mov_b32 s8, s10 +; GFX6-NEXT: s_mov_b32 s9, s11 +; GFX6-NEXT: s_mov_b64 s[14:15], exec +; GFX6-NEXT: s_mov_b32 s10, s12 +; GFX6-NEXT: s_mov_b32 s11, s13 +; GFX6-NEXT: s_wqm_b64 exec, exec +; GFX6-NEXT: s_and_b64 exec, exec, s[14:15] +; GFX6-NEXT: image_gather4_c_o v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: gather4_c_o_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: s_mov_b32 s8, s10 +; GFX10-NEXT: s_mov_b32 s9, s11 +; GFX10-NEXT: s_mov_b32 s14, exec_lo +; GFX10-NEXT: s_mov_b32 s10, s12 +; GFX10-NEXT: s_mov_b32 s11, s13 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s14 +; GFX10-NEXT: image_gather4_c_o v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.c.o.2d.v4f32.f32(i32 1, i32 %offset, float %zcompare, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) ret <4 x float> %v } define amdgpu_ps <4 x float> @gather4_cl_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %s, float %t, float %clamp) { - ; GFX6-LABEL: name: gather4_cl_o_2d - ; GFX6: bb.1.main_body: - ; GFX6: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX6: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX6: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX6: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX6: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX6: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX6: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX6: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX6: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX6: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX6: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX6: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX6: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX6: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX6: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX6: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX6: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX6: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX6: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32) - ; GFX6: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.cl.o.2d), 1, [[BUILD_VECTOR2]](<4 x s32>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX6: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES 
[[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX6: $vgpr0 = COPY [[UV]](s32) - ; GFX6: $vgpr1 = COPY [[UV1]](s32) - ; GFX6: $vgpr2 = COPY [[UV2]](s32) - ; GFX6: $vgpr3 = COPY [[UV3]](s32) - ; GFX6: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - ; GFX10-LABEL: name: gather4_cl_o_2d - ; GFX10: bb.1.main_body: - ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.cl.o.2d), 1, [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10: $vgpr0 = COPY [[UV]](s32) - ; GFX10: $vgpr1 = COPY [[UV1]](s32) - ; GFX10: $vgpr2 = COPY [[UV2]](s32) - ; GFX10: $vgpr3 = COPY [[UV3]](s32) - ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +; GFX6-LABEL: gather4_cl_o_2d: +; GFX6: ; %bb.0: ; %main_body +; GFX6-NEXT: s_mov_b32 s0, s2 +; GFX6-NEXT: s_mov_b32 s1, s3 +; GFX6-NEXT: s_mov_b32 s2, s4 +; GFX6-NEXT: s_mov_b32 s3, s5 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s8 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: s_mov_b32 s8, s10 +; GFX6-NEXT: s_mov_b32 s9, s11 +; GFX6-NEXT: s_mov_b64 s[14:15], exec +; GFX6-NEXT: s_mov_b32 s10, s12 +; GFX6-NEXT: s_mov_b32 s11, s13 +; GFX6-NEXT: s_wqm_b64 exec, exec +; GFX6-NEXT: s_and_b64 exec, exec, s[14:15] +; GFX6-NEXT: image_gather4_cl_o v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: gather4_cl_o_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: s_mov_b32 s8, s10 +; GFX10-NEXT: s_mov_b32 s9, s11 +; GFX10-NEXT: s_mov_b32 s14, exec_lo +; GFX10-NEXT: s_mov_b32 s10, s12 
+; GFX10-NEXT: s_mov_b32 s11, s13 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s14 +; GFX10-NEXT: image_gather4_cl_o v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.cl.o.2d.v4f32.f32(i32 1, i32 %offset, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) ret <4 x float> %v } define amdgpu_ps <4 x float> @gather4_c_cl_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %s, float %t, float %clamp) { - ; GFX6-LABEL: name: gather4_c_cl_o_2d - ; GFX6: bb.1.main_body: - ; GFX6: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 - ; GFX6: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX6: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX6: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX6: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX6: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX6: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX6: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX6: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX6: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX6: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX6: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX6: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX6: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX6: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX6: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX6: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX6: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX6: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX6: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX6: [[BUILD_VECTOR2:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[DEF]](s32), [[DEF]](s32), [[DEF]](s32) - ; GFX6: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.c.cl.o.2d), 1, [[BUILD_VECTOR2]](<8 x s32>), $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX6: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX6: $vgpr0 = COPY [[UV]](s32) - ; GFX6: $vgpr1 = COPY [[UV1]](s32) - ; GFX6: $vgpr2 = COPY [[UV2]](s32) - ; GFX6: $vgpr3 = COPY [[UV3]](s32) - ; GFX6: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - ; GFX10-LABEL: name: gather4_c_cl_o_2d - ; GFX10: bb.1.main_body: - ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 - ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; 
GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX10: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.c.cl.o.2d), 1, [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10: $vgpr0 = COPY [[UV]](s32) - ; GFX10: $vgpr1 = COPY [[UV1]](s32) - ; GFX10: $vgpr2 = COPY [[UV2]](s32) - ; GFX10: $vgpr3 = COPY [[UV3]](s32) - ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +; GFX6-LABEL: gather4_c_cl_o_2d: +; GFX6: ; %bb.0: ; %main_body +; GFX6-NEXT: s_mov_b32 s0, s2 +; GFX6-NEXT: s_mov_b32 s1, s3 +; GFX6-NEXT: s_mov_b32 s2, s4 +; GFX6-NEXT: s_mov_b32 s3, s5 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s8 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: s_mov_b32 s8, s10 +; GFX6-NEXT: s_mov_b32 s9, s11 +; GFX6-NEXT: s_mov_b64 s[14:15], exec +; GFX6-NEXT: s_mov_b32 s10, s12 +; GFX6-NEXT: s_mov_b32 s11, s13 +; GFX6-NEXT: s_wqm_b64 exec, exec +; GFX6-NEXT: s_and_b64 exec, exec, s[14:15] +; GFX6-NEXT: image_gather4_c_cl_o v[0:3], v[0:7], s[0:7], s[8:11] dmask:0x1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: gather4_c_cl_o_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: s_mov_b32 s8, s10 +; GFX10-NEXT: s_mov_b32 s9, s11 +; GFX10-NEXT: s_mov_b32 s14, exec_lo +; GFX10-NEXT: s_mov_b32 s10, s12 +; GFX10-NEXT: s_mov_b32 s11, s13 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s14 +; GFX10-NEXT: image_gather4_c_cl_o v[0:3], v[0:7], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.c.cl.o.2d.v4f32.f32(i32 1, i32 %offset, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) ret <4 x float> %v } define amdgpu_ps <4 x float> @gather4_b_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float 
%bias, float %s, float %t) { - ; GFX6-LABEL: name: gather4_b_o_2d - ; GFX6: bb.1.main_body: - ; GFX6: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX6: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX6: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX6: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX6: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX6: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX6: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX6: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX6: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX6: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX6: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX6: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX6: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX6: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX6: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX6: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX6: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX6: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX6: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32) - ; GFX6: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.b.o.2d), 1, [[BUILD_VECTOR2]](<4 x s32>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX6: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX6: $vgpr0 = COPY [[UV]](s32) - ; GFX6: $vgpr1 = COPY [[UV1]](s32) - ; GFX6: $vgpr2 = COPY [[UV2]](s32) - ; GFX6: $vgpr3 = COPY [[UV3]](s32) - ; GFX6: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - ; GFX10-LABEL: name: gather4_b_o_2d - ; GFX10: bb.1.main_body: - ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), 
[[COPY11]](s32) - ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.b.o.2d), 1, [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10: $vgpr0 = COPY [[UV]](s32) - ; GFX10: $vgpr1 = COPY [[UV1]](s32) - ; GFX10: $vgpr2 = COPY [[UV2]](s32) - ; GFX10: $vgpr3 = COPY [[UV3]](s32) - ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +; GFX6-LABEL: gather4_b_o_2d: +; GFX6: ; %bb.0: ; %main_body +; GFX6-NEXT: s_mov_b32 s0, s2 +; GFX6-NEXT: s_mov_b32 s1, s3 +; GFX6-NEXT: s_mov_b32 s2, s4 +; GFX6-NEXT: s_mov_b32 s3, s5 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s8 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: s_mov_b32 s8, s10 +; GFX6-NEXT: s_mov_b32 s9, s11 +; GFX6-NEXT: s_mov_b64 s[14:15], exec +; GFX6-NEXT: s_mov_b32 s10, s12 +; GFX6-NEXT: s_mov_b32 s11, s13 +; GFX6-NEXT: s_wqm_b64 exec, exec +; GFX6-NEXT: s_and_b64 exec, exec, s[14:15] +; GFX6-NEXT: image_gather4_b_o v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: gather4_b_o_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: s_mov_b32 s8, s10 +; GFX10-NEXT: s_mov_b32 s9, s11 +; GFX10-NEXT: s_mov_b32 s14, exec_lo +; GFX10-NEXT: s_mov_b32 s10, s12 +; GFX10-NEXT: s_mov_b32 s11, s13 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s14 +; GFX10-NEXT: image_gather4_b_o v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.b.o.2d.v4f32.f32.f32(i32 1, i32 %offset, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) ret <4 x float> %v } define amdgpu_ps <4 x float> @gather4_c_b_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %bias, float %zcompare, float %s, float %t) { - ; GFX6-LABEL: name: gather4_c_b_o_2d - ; GFX6: bb.1.main_body: - ; GFX6: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 - ; GFX6: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX6: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX6: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX6: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX6: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX6: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX6: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX6: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX6: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX6: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX6: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX6: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX6: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 
- ; GFX6: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX6: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX6: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX6: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX6: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX6: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX6: [[BUILD_VECTOR2:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[DEF]](s32), [[DEF]](s32), [[DEF]](s32) - ; GFX6: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.c.b.o.2d), 1, [[BUILD_VECTOR2]](<8 x s32>), $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX6: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX6: $vgpr0 = COPY [[UV]](s32) - ; GFX6: $vgpr1 = COPY [[UV1]](s32) - ; GFX6: $vgpr2 = COPY [[UV2]](s32) - ; GFX6: $vgpr3 = COPY [[UV3]](s32) - ; GFX6: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - ; GFX10-LABEL: name: gather4_c_b_o_2d - ; GFX10: bb.1.main_body: - ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 - ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX10: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.c.b.o.2d), 1, [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10: $vgpr0 = COPY [[UV]](s32) - ; GFX10: $vgpr1 = COPY [[UV1]](s32) - ; GFX10: $vgpr2 = COPY [[UV2]](s32) - ; GFX10: $vgpr3 = COPY [[UV3]](s32) - ; GFX10: SI_RETURN_TO_EPILOG 
implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +; GFX6-LABEL: gather4_c_b_o_2d: +; GFX6: ; %bb.0: ; %main_body +; GFX6-NEXT: s_mov_b32 s0, s2 +; GFX6-NEXT: s_mov_b32 s1, s3 +; GFX6-NEXT: s_mov_b32 s2, s4 +; GFX6-NEXT: s_mov_b32 s3, s5 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s8 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: s_mov_b32 s8, s10 +; GFX6-NEXT: s_mov_b32 s9, s11 +; GFX6-NEXT: s_mov_b64 s[14:15], exec +; GFX6-NEXT: s_mov_b32 s10, s12 +; GFX6-NEXT: s_mov_b32 s11, s13 +; GFX6-NEXT: s_wqm_b64 exec, exec +; GFX6-NEXT: s_and_b64 exec, exec, s[14:15] +; GFX6-NEXT: image_gather4_c_b_o v[0:3], v[0:7], s[0:7], s[8:11] dmask:0x1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: gather4_c_b_o_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: s_mov_b32 s8, s10 +; GFX10-NEXT: s_mov_b32 s9, s11 +; GFX10-NEXT: s_mov_b32 s14, exec_lo +; GFX10-NEXT: s_mov_b32 s10, s12 +; GFX10-NEXT: s_mov_b32 s11, s13 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s14 +; GFX10-NEXT: image_gather4_c_b_o v[0:3], v[0:7], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.o.2d.v4f32.f32.f32(i32 1, i32 %offset, float %bias, float %zcompare, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) ret <4 x float> %v } define amdgpu_ps <4 x float> @gather4_b_cl_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %bias, float %s, float %t, float %clamp) { - ; GFX6-LABEL: name: gather4_b_cl_o_2d - ; GFX6: bb.1.main_body: - ; GFX6: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 - ; GFX6: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX6: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX6: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX6: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX6: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX6: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX6: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX6: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX6: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX6: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX6: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX6: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX6: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX6: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX6: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX6: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX6: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX6: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX6: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX6: [[BUILD_VECTOR2:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), 
[[COPY16]](s32), [[DEF]](s32), [[DEF]](s32), [[DEF]](s32) - ; GFX6: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.b.cl.o.2d), 1, [[BUILD_VECTOR2]](<8 x s32>), $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX6: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX6: $vgpr0 = COPY [[UV]](s32) - ; GFX6: $vgpr1 = COPY [[UV1]](s32) - ; GFX6: $vgpr2 = COPY [[UV2]](s32) - ; GFX6: $vgpr3 = COPY [[UV3]](s32) - ; GFX6: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - ; GFX10-LABEL: name: gather4_b_cl_o_2d - ; GFX10: bb.1.main_body: - ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 - ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX10: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.b.cl.o.2d), 1, [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10: $vgpr0 = COPY [[UV]](s32) - ; GFX10: $vgpr1 = COPY [[UV1]](s32) - ; GFX10: $vgpr2 = COPY [[UV2]](s32) - ; GFX10: $vgpr3 = COPY [[UV3]](s32) - ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +; GFX6-LABEL: gather4_b_cl_o_2d: +; GFX6: ; %bb.0: ; %main_body +; GFX6-NEXT: s_mov_b32 s0, s2 +; GFX6-NEXT: s_mov_b32 s1, s3 +; GFX6-NEXT: s_mov_b32 s2, s4 +; GFX6-NEXT: s_mov_b32 s3, s5 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s8 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: s_mov_b32 s8, s10 +; GFX6-NEXT: s_mov_b32 s9, s11 +; GFX6-NEXT: s_mov_b32 s10, s12 +; GFX6-NEXT: s_mov_b32 s11, s13 +; GFX6-NEXT: image_gather4_b_cl_o v[0:3], v[0:7], s[0:7], s[8:11] dmask:0x1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; return to shader 
part epilog +; +; GFX10-LABEL: gather4_b_cl_o_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: s_mov_b32 s8, s10 +; GFX10-NEXT: s_mov_b32 s9, s11 +; GFX10-NEXT: s_mov_b32 s10, s12 +; GFX10-NEXT: s_mov_b32 s11, s13 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_gather4_b_cl_o v[0:3], v[0:7], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.b.cl.o.2d.v4f32.f32.f32(i32 1, i32 %offset, float %bias, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) ret <4 x float> %v } define amdgpu_ps <4 x float> @gather4_c_b_cl_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %bias, float %zcompare, float %s, float %t, float %clamp) { - ; GFX6-LABEL: name: gather4_c_b_cl_o_2d - ; GFX6: bb.1.main_body: - ; GFX6: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 - ; GFX6: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX6: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX6: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX6: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX6: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX6: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX6: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX6: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX6: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX6: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX6: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX6: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX6: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX6: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX6: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX6: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX6: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; GFX6: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX6: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX6: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX6: [[BUILD_VECTOR2:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[DEF]](s32), [[DEF]](s32) - ; GFX6: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.c.b.cl.o.2d), 1, [[BUILD_VECTOR2]](<8 x s32>), $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX6: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX6: $vgpr0 = COPY [[UV]](s32) - ; GFX6: $vgpr1 = COPY [[UV1]](s32) - ; GFX6: $vgpr2 = COPY [[UV2]](s32) - ; GFX6: $vgpr3 = COPY [[UV3]](s32) - ; GFX6: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - ; GFX10-LABEL: name: 
gather4_c_b_cl_o_2d - ; GFX10: bb.1.main_body: - ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 - ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX10: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX10: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.c.b.cl.o.2d), 1, [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10: $vgpr0 = COPY [[UV]](s32) - ; GFX10: $vgpr1 = COPY [[UV1]](s32) - ; GFX10: $vgpr2 = COPY [[UV2]](s32) - ; GFX10: $vgpr3 = COPY [[UV3]](s32) - ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +; GFX6-LABEL: gather4_c_b_cl_o_2d: +; GFX6: ; %bb.0: ; %main_body +; GFX6-NEXT: s_mov_b32 s0, s2 +; GFX6-NEXT: s_mov_b32 s1, s3 +; GFX6-NEXT: s_mov_b32 s2, s4 +; GFX6-NEXT: s_mov_b32 s3, s5 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s8 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: s_mov_b32 s8, s10 +; GFX6-NEXT: s_mov_b32 s9, s11 +; GFX6-NEXT: s_mov_b64 s[14:15], exec +; GFX6-NEXT: s_mov_b32 s10, s12 +; GFX6-NEXT: s_mov_b32 s11, s13 +; GFX6-NEXT: s_wqm_b64 exec, exec +; GFX6-NEXT: s_and_b64 exec, exec, s[14:15] +; GFX6-NEXT: image_gather4_c_b_cl_o v[0:3], v[0:7], s[0:7], s[8:11] dmask:0x1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: gather4_c_b_cl_o_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: s_mov_b32 s8, s10 +; GFX10-NEXT: s_mov_b32 s9, s11 +; GFX10-NEXT: s_mov_b32 s14, exec_lo +; GFX10-NEXT: s_mov_b32 s10, s12 +; GFX10-NEXT: s_mov_b32 s11, s13 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-NEXT: 
s_and_b32 exec_lo, exec_lo, s14 +; GFX10-NEXT: image_gather4_c_b_cl_o v[0:3], v[0:7], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.o.2d.v4f32.f32.f32(i32 1, i32 %offset, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) ret <4 x float> %v } define amdgpu_ps <4 x float> @gather4_l_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %s, float %t, float %lod) { - ; GFX6-LABEL: name: gather4_l_o_2d - ; GFX6: bb.1.main_body: - ; GFX6: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX6: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX6: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX6: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX6: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX6: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX6: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX6: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX6: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX6: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX6: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX6: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX6: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX6: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX6: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX6: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX6: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX6: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX6: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32) - ; GFX6: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.l.o.2d), 1, [[BUILD_VECTOR2]](<4 x s32>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX6: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX6: $vgpr0 = COPY [[UV]](s32) - ; GFX6: $vgpr1 = COPY [[UV1]](s32) - ; GFX6: $vgpr2 = COPY [[UV2]](s32) - ; GFX6: $vgpr3 = COPY [[UV3]](s32) - ; GFX6: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - ; GFX10-LABEL: name: gather4_l_o_2d - ; GFX10: bb.1.main_body: - ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; 
GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.l.o.2d), 1, [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10: $vgpr0 = COPY [[UV]](s32) - ; GFX10: $vgpr1 = COPY [[UV1]](s32) - ; GFX10: $vgpr2 = COPY [[UV2]](s32) - ; GFX10: $vgpr3 = COPY [[UV3]](s32) - ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +; GFX6-LABEL: gather4_l_o_2d: +; GFX6: ; %bb.0: ; %main_body +; GFX6-NEXT: s_mov_b32 s0, s2 +; GFX6-NEXT: s_mov_b32 s1, s3 +; GFX6-NEXT: s_mov_b32 s2, s4 +; GFX6-NEXT: s_mov_b32 s3, s5 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s8 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: s_mov_b32 s8, s10 +; GFX6-NEXT: s_mov_b32 s9, s11 +; GFX6-NEXT: s_mov_b32 s10, s12 +; GFX6-NEXT: s_mov_b32 s11, s13 +; GFX6-NEXT: image_gather4_l_o v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: gather4_l_o_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: s_mov_b32 s8, s10 +; GFX10-NEXT: s_mov_b32 s9, s11 +; GFX10-NEXT: s_mov_b32 s10, s12 +; GFX10-NEXT: s_mov_b32 s11, s13 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_gather4_l_o v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.l.o.2d.v4f32.f32(i32 1, i32 %offset, float %s, float %t, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) ret <4 x float> %v } define amdgpu_ps <4 x float> @gather4_c_l_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %s, float %t, float %lod) { - ; GFX6-LABEL: name: gather4_c_l_o_2d - ; GFX6: bb.1.main_body: - ; GFX6: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 - ; GFX6: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX6: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX6: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX6: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX6: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX6: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX6: [[COPY7:%[0-9]+]]:_(s32) = COPY 
$sgpr9 - ; GFX6: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX6: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX6: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX6: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX6: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX6: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX6: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX6: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX6: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX6: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX6: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX6: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX6: [[BUILD_VECTOR2:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[DEF]](s32), [[DEF]](s32), [[DEF]](s32) - ; GFX6: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.c.l.o.2d), 1, [[BUILD_VECTOR2]](<8 x s32>), $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX6: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX6: $vgpr0 = COPY [[UV]](s32) - ; GFX6: $vgpr1 = COPY [[UV1]](s32) - ; GFX6: $vgpr2 = COPY [[UV2]](s32) - ; GFX6: $vgpr3 = COPY [[UV3]](s32) - ; GFX6: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - ; GFX10-LABEL: name: gather4_c_l_o_2d - ; GFX10: bb.1.main_body: - ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 - ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX10: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.c.l.o.2d), 1, [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX10: [[UV:%[0-9]+]]:_(s32), 
[[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10: $vgpr0 = COPY [[UV]](s32) - ; GFX10: $vgpr1 = COPY [[UV1]](s32) - ; GFX10: $vgpr2 = COPY [[UV2]](s32) - ; GFX10: $vgpr3 = COPY [[UV3]](s32) - ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +; GFX6-LABEL: gather4_c_l_o_2d: +; GFX6: ; %bb.0: ; %main_body +; GFX6-NEXT: s_mov_b32 s0, s2 +; GFX6-NEXT: s_mov_b32 s1, s3 +; GFX6-NEXT: s_mov_b32 s2, s4 +; GFX6-NEXT: s_mov_b32 s3, s5 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s8 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: s_mov_b32 s8, s10 +; GFX6-NEXT: s_mov_b32 s9, s11 +; GFX6-NEXT: s_mov_b32 s10, s12 +; GFX6-NEXT: s_mov_b32 s11, s13 +; GFX6-NEXT: image_gather4_c_l_o v[0:3], v[0:7], s[0:7], s[8:11] dmask:0x1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: gather4_c_l_o_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: s_mov_b32 s8, s10 +; GFX10-NEXT: s_mov_b32 s9, s11 +; GFX10-NEXT: s_mov_b32 s10, s12 +; GFX10-NEXT: s_mov_b32 s11, s13 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_gather4_c_l_o v[0:3], v[0:7], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.c.l.o.2d.v4f32.f32(i32 1, i32 %offset, float %zcompare, float %s, float %t, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) ret <4 x float> %v } define amdgpu_ps <4 x float> @gather4_lz_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %s, float %t) { - ; GFX6-LABEL: name: gather4_lz_o_2d - ; GFX6: bb.1.main_body: - ; GFX6: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 - ; GFX6: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX6: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX6: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX6: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX6: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX6: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX6: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX6: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX6: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX6: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX6: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX6: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX6: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX6: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX6: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX6: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX6: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32) - ; GFX6: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.lz.o.2d), 1, [[BUILD_VECTOR2]](<3 x 
s32>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX6: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX6: $vgpr0 = COPY [[UV]](s32) - ; GFX6: $vgpr1 = COPY [[UV1]](s32) - ; GFX6: $vgpr2 = COPY [[UV2]](s32) - ; GFX6: $vgpr3 = COPY [[UV3]](s32) - ; GFX6: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - ; GFX10-LABEL: name: gather4_lz_o_2d - ; GFX10: bb.1.main_body: - ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 - ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.lz.o.2d), 1, [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10: $vgpr0 = COPY [[UV]](s32) - ; GFX10: $vgpr1 = COPY [[UV1]](s32) - ; GFX10: $vgpr2 = COPY [[UV2]](s32) - ; GFX10: $vgpr3 = COPY [[UV3]](s32) - ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +; GFX6-LABEL: gather4_lz_o_2d: +; GFX6: ; %bb.0: ; %main_body +; GFX6-NEXT: s_mov_b32 s0, s2 +; GFX6-NEXT: s_mov_b32 s1, s3 +; GFX6-NEXT: s_mov_b32 s2, s4 +; GFX6-NEXT: s_mov_b32 s3, s5 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s8 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: s_mov_b32 s8, s10 +; GFX6-NEXT: s_mov_b32 s9, s11 +; GFX6-NEXT: s_mov_b32 s10, s12 +; GFX6-NEXT: s_mov_b32 s11, s13 +; GFX6-NEXT: image_gather4_lz_o v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: gather4_lz_o_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: s_mov_b32 s8, s10 +; GFX10-NEXT: s_mov_b32 s9, 
s11 +; GFX10-NEXT: s_mov_b32 s10, s12 +; GFX10-NEXT: s_mov_b32 s11, s13 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_gather4_lz_o v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f32(i32 1, i32 %offset, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) ret <4 x float> %v } define amdgpu_ps <4 x float> @gather4_c_lz_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %s, float %t) { - ; GFX6-LABEL: name: gather4_c_lz_o_2d - ; GFX6: bb.1.main_body: - ; GFX6: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX6: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX6: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX6: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX6: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX6: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX6: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX6: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX6: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX6: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX6: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX6: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX6: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX6: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX6: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX6: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX6: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX6: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX6: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32) - ; GFX6: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.c.lz.o.2d), 1, [[BUILD_VECTOR2]](<4 x s32>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX6: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX6: $vgpr0 = COPY [[UV]](s32) - ; GFX6: $vgpr1 = COPY [[UV1]](s32) - ; GFX6: $vgpr2 = COPY [[UV2]](s32) - ; GFX6: $vgpr3 = COPY [[UV3]](s32) - ; GFX6: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - ; GFX10-LABEL: name: gather4_c_lz_o_2d - ; GFX10: bb.1.main_body: - ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX10: 
[[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.c.lz.o.2d), 1, [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10: $vgpr0 = COPY [[UV]](s32) - ; GFX10: $vgpr1 = COPY [[UV1]](s32) - ; GFX10: $vgpr2 = COPY [[UV2]](s32) - ; GFX10: $vgpr3 = COPY [[UV3]](s32) - ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +; GFX6-LABEL: gather4_c_lz_o_2d: +; GFX6: ; %bb.0: ; %main_body +; GFX6-NEXT: s_mov_b32 s0, s2 +; GFX6-NEXT: s_mov_b32 s1, s3 +; GFX6-NEXT: s_mov_b32 s2, s4 +; GFX6-NEXT: s_mov_b32 s3, s5 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s8 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: s_mov_b32 s8, s10 +; GFX6-NEXT: s_mov_b32 s9, s11 +; GFX6-NEXT: s_mov_b32 s10, s12 +; GFX6-NEXT: s_mov_b32 s11, s13 +; GFX6-NEXT: image_gather4_c_lz_o v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: gather4_c_lz_o_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: s_mov_b32 s8, s10 +; GFX10-NEXT: s_mov_b32 s9, s11 +; GFX10-NEXT: s_mov_b32 s10, s12 +; GFX10-NEXT: s_mov_b32 s11, s13 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_gather4_c_lz_o v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.2d.v4f32.f32(i32 1, i32 %offset, float %zcompare, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) ret <4 x float> %v diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.getresinfo.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.getresinfo.a16.ll new file mode 100644 index 0000000..d4084ba --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.getresinfo.a16.ll @@ -0,0 +1,301 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s + +define amdgpu_ps <4 x float> @getresinfo_1d(<8 
x i32> inreg %rsrc, i16 %mip) { +; GFX9-LABEL: getresinfo_1d: +; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm a16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: getresinfo_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm a16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.getresinfo.1d.v4f32.i16(i32 15, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @getresinfo_2d(<8 x i32> inreg %rsrc, i16 %mip) { +; GFX9-LABEL: getresinfo_2d: +; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm a16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: getresinfo_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm a16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.getresinfo.2d.v4f32.i16(i32 15, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @getresinfo_3d(<8 x i32> inreg %rsrc, i16 %mip) { +; GFX9-LABEL: getresinfo_3d: +; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm a16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: getresinfo_3d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.getresinfo.3d.v4f32.i16(i32 
15, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @getresinfo_cube(<8 x i32> inreg %rsrc, i16 %mip) { +; GFX9-LABEL: getresinfo_cube: +; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm a16 da +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: getresinfo_cube: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm a16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.getresinfo.cube.v4f32.i16(i32 15, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @getresinfo_1darray(<8 x i32> inreg %rsrc, i16 %mip) { +; GFX9-LABEL: getresinfo_1darray: +; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm a16 da +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: getresinfo_1darray: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm a16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.getresinfo.1darray.v4f32.i16(i32 15, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @getresinfo_2darray(<8 x i32> inreg %rsrc, i16 %mip) { +; GFX9-LABEL: getresinfo_2darray: +; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm a16 da +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: getresinfo_2darray: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf 
dim:SQ_RSRC_IMG_2D_ARRAY unorm a16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.getresinfo.2darray.v4f32.i16(i32 15, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @getresinfo_2dmsaa(<8 x i32> inreg %rsrc, i16 %mip) { +; GFX9-LABEL: getresinfo_2dmsaa: +; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm a16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: getresinfo_2dmsaa: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm a16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.getresinfo.2dmsaa.v4f32.i16(i32 15, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @getresinfo_2darraymsaa(<8 x i32> inreg %rsrc, i16 %mip) { +; GFX9-LABEL: getresinfo_2darraymsaa: +; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm a16 da +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: getresinfo_2darraymsaa: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm a16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.getresinfo.2darraymsaa.v4f32.i16(i32 15, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @getresinfo_dmask0(<8 x i32> inreg %rsrc, <4 x float> %vdata, i16 %mip) { +; GFX9-LABEL: getresinfo_dmask0: +; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: getresinfo_dmask0: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: ; return to shader part epilog +main_body: + %r = call <4 x float> @llvm.amdgcn.image.getresinfo.1d.v4f32.i16(i32 0, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0) + ret <4 x float> %r +} + +declare <4 x float> @llvm.amdgcn.image.getresinfo.1d.v4f32.i16(i32 immarg, i16, <8 x i32>, i32 immarg, i32 immarg) #1 +declare <4 x float> @llvm.amdgcn.image.getresinfo.2d.v4f32.i16(i32 immarg, i16, <8 x i32>, i32 immarg, i32 immarg) #1 
+declare <4 x float> @llvm.amdgcn.image.getresinfo.3d.v4f32.i16(i32 immarg, i16, <8 x i32>, i32 immarg, i32 immarg) #1 +declare <4 x float> @llvm.amdgcn.image.getresinfo.cube.v4f32.i16(i32 immarg, i16, <8 x i32>, i32 immarg, i32 immarg) #1 +declare <4 x float> @llvm.amdgcn.image.getresinfo.1darray.v4f32.i16(i32 immarg, i16, <8 x i32>, i32 immarg, i32 immarg) #1 +declare <4 x float> @llvm.amdgcn.image.getresinfo.2darray.v4f32.i16(i32 immarg, i16, <8 x i32>, i32 immarg, i32 immarg) #1 +declare <4 x float> @llvm.amdgcn.image.getresinfo.2dmsaa.v4f32.i16(i32 immarg, i16, <8 x i32>, i32 immarg, i32 immarg) #1 +declare <4 x float> @llvm.amdgcn.image.getresinfo.2darraymsaa.v4f32.i16(i32 immarg, i16, <8 x i32>, i32 immarg, i32 immarg) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.getresinfo.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.getresinfo.ll new file mode 100644 index 0000000..6b97530 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.getresinfo.ll @@ -0,0 +1,418 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GFX6 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s + +define amdgpu_ps <4 x float> @getresinfo_1d(<8 x i32> inreg %rsrc, i32 %mip) { +; GFX6-LABEL: getresinfo_1d: +; GFX6: ; %bb.0: ; %main_body +; GFX6-NEXT: s_mov_b32 s0, s2 +; GFX6-NEXT: s_mov_b32 s1, s3 +; GFX6-NEXT: s_mov_b32 s2, s4 +; GFX6-NEXT: s_mov_b32 s3, s5 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s8 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: getresinfo_1d: +; GFX8: ; %bb.0: ; %main_body +; GFX8-NEXT: s_mov_b32 s0, s2 +; GFX8-NEXT: s_mov_b32 s1, s3 +; GFX8-NEXT: s_mov_b32 s2, s4 +; GFX8-NEXT: s_mov_b32 s3, s5 +; GFX8-NEXT: s_mov_b32 s4, s6 +; GFX8-NEXT: s_mov_b32 s5, s7 +; GFX8-NEXT: s_mov_b32 s6, s8 +; GFX8-NEXT: s_mov_b32 s7, s9 +; GFX8-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: getresinfo_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.getresinfo.1d.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @getresinfo_2d(<8 x i32> inreg %rsrc, i32 %mip) { +; GFX6-LABEL: getresinfo_2d: +; GFX6: ; %bb.0: ; %main_body +; GFX6-NEXT: s_mov_b32 s0, s2 +; GFX6-NEXT: s_mov_b32 s1, s3 +; GFX6-NEXT: s_mov_b32 s2, s4 +; GFX6-NEXT: s_mov_b32 s3, s5 +; GFX6-NEXT: s_mov_b32 s4, s6 +; 
GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s8 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: getresinfo_2d: +; GFX8: ; %bb.0: ; %main_body +; GFX8-NEXT: s_mov_b32 s0, s2 +; GFX8-NEXT: s_mov_b32 s1, s3 +; GFX8-NEXT: s_mov_b32 s2, s4 +; GFX8-NEXT: s_mov_b32 s3, s5 +; GFX8-NEXT: s_mov_b32 s4, s6 +; GFX8-NEXT: s_mov_b32 s5, s7 +; GFX8-NEXT: s_mov_b32 s6, s8 +; GFX8-NEXT: s_mov_b32 s7, s9 +; GFX8-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: getresinfo_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.getresinfo.2d.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @getresinfo_3d(<8 x i32> inreg %rsrc, i32 %mip) { +; GFX6-LABEL: getresinfo_3d: +; GFX6: ; %bb.0: ; %main_body +; GFX6-NEXT: s_mov_b32 s0, s2 +; GFX6-NEXT: s_mov_b32 s1, s3 +; GFX6-NEXT: s_mov_b32 s2, s4 +; GFX6-NEXT: s_mov_b32 s3, s5 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s8 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: getresinfo_3d: +; GFX8: ; %bb.0: ; %main_body +; GFX8-NEXT: s_mov_b32 s0, s2 +; GFX8-NEXT: s_mov_b32 s1, s3 +; GFX8-NEXT: s_mov_b32 s2, s4 +; GFX8-NEXT: s_mov_b32 s3, s5 +; GFX8-NEXT: s_mov_b32 s4, s6 +; GFX8-NEXT: s_mov_b32 s5, s7 +; GFX8-NEXT: s_mov_b32 s6, s8 +; GFX8-NEXT: s_mov_b32 s7, s9 +; GFX8-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: getresinfo_3d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.getresinfo.3d.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @getresinfo_cube(<8 x i32> inreg %rsrc, i32 %mip) { +; GFX6-LABEL: getresinfo_cube: +; GFX6: ; %bb.0: ; %main_body +; GFX6-NEXT: s_mov_b32 s0, s2 +; GFX6-NEXT: s_mov_b32 s1, s3 +; GFX6-NEXT: s_mov_b32 s2, s4 +; GFX6-NEXT: s_mov_b32 s3, s5 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s8 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm da +; GFX6-NEXT: s_waitcnt vmcnt(0) +; 
GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: getresinfo_cube: +; GFX8: ; %bb.0: ; %main_body +; GFX8-NEXT: s_mov_b32 s0, s2 +; GFX8-NEXT: s_mov_b32 s1, s3 +; GFX8-NEXT: s_mov_b32 s2, s4 +; GFX8-NEXT: s_mov_b32 s3, s5 +; GFX8-NEXT: s_mov_b32 s4, s6 +; GFX8-NEXT: s_mov_b32 s5, s7 +; GFX8-NEXT: s_mov_b32 s6, s8 +; GFX8-NEXT: s_mov_b32 s7, s9 +; GFX8-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm da +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: getresinfo_cube: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.getresinfo.cube.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @getresinfo_1darray(<8 x i32> inreg %rsrc, i32 %mip) { +; GFX6-LABEL: getresinfo_1darray: +; GFX6: ; %bb.0: ; %main_body +; GFX6-NEXT: s_mov_b32 s0, s2 +; GFX6-NEXT: s_mov_b32 s1, s3 +; GFX6-NEXT: s_mov_b32 s2, s4 +; GFX6-NEXT: s_mov_b32 s3, s5 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s8 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm da +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: getresinfo_1darray: +; GFX8: ; %bb.0: ; %main_body +; GFX8-NEXT: s_mov_b32 s0, s2 +; GFX8-NEXT: s_mov_b32 s1, s3 +; GFX8-NEXT: s_mov_b32 s2, s4 +; GFX8-NEXT: s_mov_b32 s3, s5 +; GFX8-NEXT: s_mov_b32 s4, s6 +; GFX8-NEXT: s_mov_b32 s5, s7 +; GFX8-NEXT: s_mov_b32 s6, s8 +; GFX8-NEXT: s_mov_b32 s7, s9 +; GFX8-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm da +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: getresinfo_1darray: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.getresinfo.1darray.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @getresinfo_2darray(<8 x i32> inreg %rsrc, i32 %mip) { +; GFX6-LABEL: getresinfo_2darray: +; GFX6: ; %bb.0: ; %main_body +; GFX6-NEXT: s_mov_b32 s0, s2 +; GFX6-NEXT: s_mov_b32 s1, s3 +; GFX6-NEXT: s_mov_b32 s2, s4 +; GFX6-NEXT: s_mov_b32 s3, s5 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s8 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm da +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: getresinfo_2darray: +; GFX8: ; %bb.0: ; %main_body +; GFX8-NEXT: s_mov_b32 s0, s2 +; 
GFX8-NEXT: s_mov_b32 s1, s3 +; GFX8-NEXT: s_mov_b32 s2, s4 +; GFX8-NEXT: s_mov_b32 s3, s5 +; GFX8-NEXT: s_mov_b32 s4, s6 +; GFX8-NEXT: s_mov_b32 s5, s7 +; GFX8-NEXT: s_mov_b32 s6, s8 +; GFX8-NEXT: s_mov_b32 s7, s9 +; GFX8-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm da +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: getresinfo_2darray: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.getresinfo.2darray.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @getresinfo_2dmsaa(<8 x i32> inreg %rsrc, i32 %mip) { +; GFX6-LABEL: getresinfo_2dmsaa: +; GFX6: ; %bb.0: ; %main_body +; GFX6-NEXT: s_mov_b32 s0, s2 +; GFX6-NEXT: s_mov_b32 s1, s3 +; GFX6-NEXT: s_mov_b32 s2, s4 +; GFX6-NEXT: s_mov_b32 s3, s5 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s8 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: getresinfo_2dmsaa: +; GFX8: ; %bb.0: ; %main_body +; GFX8-NEXT: s_mov_b32 s0, s2 +; GFX8-NEXT: s_mov_b32 s1, s3 +; GFX8-NEXT: s_mov_b32 s2, s4 +; GFX8-NEXT: s_mov_b32 s3, s5 +; GFX8-NEXT: s_mov_b32 s4, s6 +; GFX8-NEXT: s_mov_b32 s5, s7 +; GFX8-NEXT: s_mov_b32 s6, s8 +; GFX8-NEXT: s_mov_b32 s7, s9 +; GFX8-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: getresinfo_2dmsaa: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.getresinfo.2dmsaa.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @getresinfo_2darraymsaa(<8 x i32> inreg %rsrc, i32 %mip) { +; GFX6-LABEL: getresinfo_2darraymsaa: +; GFX6: ; %bb.0: ; %main_body +; GFX6-NEXT: s_mov_b32 s0, s2 +; GFX6-NEXT: s_mov_b32 s1, s3 +; GFX6-NEXT: s_mov_b32 s2, s4 +; GFX6-NEXT: s_mov_b32 s3, s5 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s8 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm da +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: getresinfo_2darraymsaa: +; GFX8: ; %bb.0: ; %main_body +; GFX8-NEXT: s_mov_b32 s0, s2 +; GFX8-NEXT: s_mov_b32 s1, s3 +; GFX8-NEXT: s_mov_b32 s2, s4 +; GFX8-NEXT: s_mov_b32 s3, s5 +; GFX8-NEXT: s_mov_b32 s4, s6 +; GFX8-NEXT: 
s_mov_b32 s5, s7 +; GFX8-NEXT: s_mov_b32 s6, s8 +; GFX8-NEXT: s_mov_b32 s7, s9 +; GFX8-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm da +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: getresinfo_2darraymsaa: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.getresinfo.2darraymsaa.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @getresinfo_dmask0(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %mip) { +; GFX6-LABEL: getresinfo_dmask0: +; GFX6: ; %bb.0: ; %main_body +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: getresinfo_dmask0: +; GFX8: ; %bb.0: ; %main_body +; GFX8-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: getresinfo_dmask0: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: ; return to shader part epilog +main_body: + %r = call <4 x float> @llvm.amdgcn.image.getresinfo.1d.v4f32.i32(i32 0, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) + ret <4 x float> %r +} + +declare <4 x float> @llvm.amdgcn.image.getresinfo.1d.v4f32.i32(i32 immarg, i32, <8 x i32>, i32 immarg, i32 immarg) #1 +declare <4 x float> @llvm.amdgcn.image.getresinfo.2d.v4f32.i32(i32 immarg, i32, <8 x i32>, i32 immarg, i32 immarg) #1 +declare <4 x float> @llvm.amdgcn.image.getresinfo.3d.v4f32.i32(i32 immarg, i32, <8 x i32>, i32 immarg, i32 immarg) #1 +declare <4 x float> @llvm.amdgcn.image.getresinfo.cube.v4f32.i32(i32 immarg, i32, <8 x i32>, i32 immarg, i32 immarg) #1 +declare <4 x float> @llvm.amdgcn.image.getresinfo.1darray.v4f32.i32(i32 immarg, i32, <8 x i32>, i32 immarg, i32 immarg) #1 +declare <4 x float> @llvm.amdgcn.image.getresinfo.2darray.v4f32.i32(i32 immarg, i32, <8 x i32>, i32 immarg, i32 immarg) #1 +declare <4 x float> @llvm.amdgcn.image.getresinfo.2dmsaa.v4f32.i32(i32 immarg, i32, <8 x i32>, i32 immarg, i32 immarg) #1 +declare <4 x float> @llvm.amdgcn.image.getresinfo.2darraymsaa.v4f32.i32(i32 immarg, i32, <8 x i32>, i32 immarg, i32 immarg) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll new file mode 100644 index 0000000..bfb2c84 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll @@ -0,0 +1,824 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8-UNPACKED %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8-PACKED %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s + +define amdgpu_ps half 
@load_1d_f16_x(<8 x i32> inreg %rsrc, i32 %s) { +; GFX8-UNPACKED-LABEL: load_1d_f16_x: +; GFX8-UNPACKED: ; %bb.0: +; GFX8-UNPACKED-NEXT: s_mov_b32 s0, s2 +; GFX8-UNPACKED-NEXT: s_mov_b32 s1, s3 +; GFX8-UNPACKED-NEXT: s_mov_b32 s2, s4 +; GFX8-UNPACKED-NEXT: s_mov_b32 s3, s5 +; GFX8-UNPACKED-NEXT: s_mov_b32 s4, s6 +; GFX8-UNPACKED-NEXT: s_mov_b32 s5, s7 +; GFX8-UNPACKED-NEXT: s_mov_b32 s6, s8 +; GFX8-UNPACKED-NEXT: s_mov_b32 s7, s9 +; GFX8-UNPACKED-NEXT: image_load v0, v0, s[0:7] dmask:0x1 unorm d16 +; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) +; GFX8-UNPACKED-NEXT: ; return to shader part epilog +; +; GFX8-PACKED-LABEL: load_1d_f16_x: +; GFX8-PACKED: ; %bb.0: +; GFX8-PACKED-NEXT: s_mov_b32 s0, s2 +; GFX8-PACKED-NEXT: s_mov_b32 s1, s3 +; GFX8-PACKED-NEXT: s_mov_b32 s2, s4 +; GFX8-PACKED-NEXT: s_mov_b32 s3, s5 +; GFX8-PACKED-NEXT: s_mov_b32 s4, s6 +; GFX8-PACKED-NEXT: s_mov_b32 s5, s7 +; GFX8-PACKED-NEXT: s_mov_b32 s6, s8 +; GFX8-PACKED-NEXT: s_mov_b32 s7, s9 +; GFX8-PACKED-NEXT: s_nop 0 +; GFX8-PACKED-NEXT: s_nop 0 +; GFX8-PACKED-NEXT: image_load v0, v0, s[0:7] dmask:0x1 unorm d16 +; GFX8-PACKED-NEXT: s_waitcnt vmcnt(0) +; GFX8-PACKED-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: load_1d_f16_x: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: image_load v0, v0, s[0:7] dmask:0x1 unorm d16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: load_1d_f16_x: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_load v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm d16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %v = call half @llvm.amdgcn.image.load.1d.half.i32(i32 1, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret half %v +} + +define amdgpu_ps half @load_1d_f16_y(<8 x i32> inreg %rsrc, i32 %s) { +; GFX8-UNPACKED-LABEL: load_1d_f16_y: +; GFX8-UNPACKED: ; %bb.0: +; GFX8-UNPACKED-NEXT: s_mov_b32 s0, s2 +; GFX8-UNPACKED-NEXT: s_mov_b32 s1, s3 +; GFX8-UNPACKED-NEXT: s_mov_b32 s2, s4 +; GFX8-UNPACKED-NEXT: s_mov_b32 s3, s5 +; GFX8-UNPACKED-NEXT: s_mov_b32 s4, s6 +; GFX8-UNPACKED-NEXT: s_mov_b32 s5, s7 +; GFX8-UNPACKED-NEXT: s_mov_b32 s6, s8 +; GFX8-UNPACKED-NEXT: s_mov_b32 s7, s9 +; GFX8-UNPACKED-NEXT: image_load v0, v0, s[0:7] dmask:0x2 unorm d16 +; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) +; GFX8-UNPACKED-NEXT: ; return to shader part epilog +; +; GFX8-PACKED-LABEL: load_1d_f16_y: +; GFX8-PACKED: ; %bb.0: +; GFX8-PACKED-NEXT: s_mov_b32 s0, s2 +; GFX8-PACKED-NEXT: s_mov_b32 s1, s3 +; GFX8-PACKED-NEXT: s_mov_b32 s2, s4 +; GFX8-PACKED-NEXT: s_mov_b32 s3, s5 +; GFX8-PACKED-NEXT: s_mov_b32 s4, s6 +; GFX8-PACKED-NEXT: s_mov_b32 s5, s7 +; GFX8-PACKED-NEXT: s_mov_b32 s6, s8 +; GFX8-PACKED-NEXT: s_mov_b32 s7, s9 +; GFX8-PACKED-NEXT: s_nop 0 +; GFX8-PACKED-NEXT: s_nop 0 +; GFX8-PACKED-NEXT: image_load v0, v0, s[0:7] dmask:0x2 unorm d16 +; GFX8-PACKED-NEXT: s_waitcnt vmcnt(0) +; GFX8-PACKED-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: load_1d_f16_y: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: 
s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: image_load v0, v0, s[0:7] dmask:0x2 unorm d16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: load_1d_f16_y: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_load v0, v0, s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_1D unorm d16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %v = call half @llvm.amdgcn.image.load.1d.half.i32(i32 2, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret half %v +} + +define amdgpu_ps half @load_1d_f16_z(<8 x i32> inreg %rsrc, i32 %s) { +; GFX8-UNPACKED-LABEL: load_1d_f16_z: +; GFX8-UNPACKED: ; %bb.0: +; GFX8-UNPACKED-NEXT: s_mov_b32 s0, s2 +; GFX8-UNPACKED-NEXT: s_mov_b32 s1, s3 +; GFX8-UNPACKED-NEXT: s_mov_b32 s2, s4 +; GFX8-UNPACKED-NEXT: s_mov_b32 s3, s5 +; GFX8-UNPACKED-NEXT: s_mov_b32 s4, s6 +; GFX8-UNPACKED-NEXT: s_mov_b32 s5, s7 +; GFX8-UNPACKED-NEXT: s_mov_b32 s6, s8 +; GFX8-UNPACKED-NEXT: s_mov_b32 s7, s9 +; GFX8-UNPACKED-NEXT: image_load v0, v0, s[0:7] dmask:0x4 unorm d16 +; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) +; GFX8-UNPACKED-NEXT: ; return to shader part epilog +; +; GFX8-PACKED-LABEL: load_1d_f16_z: +; GFX8-PACKED: ; %bb.0: +; GFX8-PACKED-NEXT: s_mov_b32 s0, s2 +; GFX8-PACKED-NEXT: s_mov_b32 s1, s3 +; GFX8-PACKED-NEXT: s_mov_b32 s2, s4 +; GFX8-PACKED-NEXT: s_mov_b32 s3, s5 +; GFX8-PACKED-NEXT: s_mov_b32 s4, s6 +; GFX8-PACKED-NEXT: s_mov_b32 s5, s7 +; GFX8-PACKED-NEXT: s_mov_b32 s6, s8 +; GFX8-PACKED-NEXT: s_mov_b32 s7, s9 +; GFX8-PACKED-NEXT: s_nop 0 +; GFX8-PACKED-NEXT: s_nop 0 +; GFX8-PACKED-NEXT: image_load v0, v0, s[0:7] dmask:0x4 unorm d16 +; GFX8-PACKED-NEXT: s_waitcnt vmcnt(0) +; GFX8-PACKED-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: load_1d_f16_z: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: image_load v0, v0, s[0:7] dmask:0x4 unorm d16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: load_1d_f16_z: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_load v0, v0, s[0:7] dmask:0x4 dim:SQ_RSRC_IMG_1D unorm d16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %v = call half @llvm.amdgcn.image.load.1d.half.i32(i32 4, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret half %v +} + +define amdgpu_ps half @load_1d_f16_w(<8 x i32> inreg %rsrc, i32 %s) { +; GFX8-UNPACKED-LABEL: load_1d_f16_w: +; GFX8-UNPACKED: ; %bb.0: +; GFX8-UNPACKED-NEXT: s_mov_b32 s0, s2 +; GFX8-UNPACKED-NEXT: s_mov_b32 s1, s3 +; GFX8-UNPACKED-NEXT: s_mov_b32 s2, s4 +; GFX8-UNPACKED-NEXT: s_mov_b32 s3, s5 +; 
GFX8-UNPACKED-NEXT: s_mov_b32 s4, s6 +; GFX8-UNPACKED-NEXT: s_mov_b32 s5, s7 +; GFX8-UNPACKED-NEXT: s_mov_b32 s6, s8 +; GFX8-UNPACKED-NEXT: s_mov_b32 s7, s9 +; GFX8-UNPACKED-NEXT: image_load v0, v0, s[0:7] dmask:0x8 unorm d16 +; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) +; GFX8-UNPACKED-NEXT: ; return to shader part epilog +; +; GFX8-PACKED-LABEL: load_1d_f16_w: +; GFX8-PACKED: ; %bb.0: +; GFX8-PACKED-NEXT: s_mov_b32 s0, s2 +; GFX8-PACKED-NEXT: s_mov_b32 s1, s3 +; GFX8-PACKED-NEXT: s_mov_b32 s2, s4 +; GFX8-PACKED-NEXT: s_mov_b32 s3, s5 +; GFX8-PACKED-NEXT: s_mov_b32 s4, s6 +; GFX8-PACKED-NEXT: s_mov_b32 s5, s7 +; GFX8-PACKED-NEXT: s_mov_b32 s6, s8 +; GFX8-PACKED-NEXT: s_mov_b32 s7, s9 +; GFX8-PACKED-NEXT: s_nop 0 +; GFX8-PACKED-NEXT: s_nop 0 +; GFX8-PACKED-NEXT: image_load v0, v0, s[0:7] dmask:0x8 unorm d16 +; GFX8-PACKED-NEXT: s_waitcnt vmcnt(0) +; GFX8-PACKED-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: load_1d_f16_w: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: image_load v0, v0, s[0:7] dmask:0x8 unorm d16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: load_1d_f16_w: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_load v0, v0, s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_1D unorm d16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %v = call half @llvm.amdgcn.image.load.1d.half.i32(i32 8, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret half %v +} + +define amdgpu_ps <2 x half> @load_1d_v2f16_xy(<8 x i32> inreg %rsrc, i32 %s) { +; GFX8-UNPACKED-LABEL: load_1d_v2f16_xy: +; GFX8-UNPACKED: ; %bb.0: +; GFX8-UNPACKED-NEXT: s_mov_b32 s0, s2 +; GFX8-UNPACKED-NEXT: s_mov_b32 s1, s3 +; GFX8-UNPACKED-NEXT: s_mov_b32 s2, s4 +; GFX8-UNPACKED-NEXT: s_mov_b32 s3, s5 +; GFX8-UNPACKED-NEXT: s_mov_b32 s4, s6 +; GFX8-UNPACKED-NEXT: s_mov_b32 s5, s7 +; GFX8-UNPACKED-NEXT: s_mov_b32 s6, s8 +; GFX8-UNPACKED-NEXT: s_mov_b32 s7, s9 +; GFX8-UNPACKED-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x3 unorm d16 +; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) +; GFX8-UNPACKED-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX8-UNPACKED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-UNPACKED-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-UNPACKED-NEXT: ; return to shader part epilog +; +; GFX8-PACKED-LABEL: load_1d_v2f16_xy: +; GFX8-PACKED: ; %bb.0: +; GFX8-PACKED-NEXT: s_mov_b32 s0, s2 +; GFX8-PACKED-NEXT: s_mov_b32 s1, s3 +; GFX8-PACKED-NEXT: s_mov_b32 s2, s4 +; GFX8-PACKED-NEXT: s_mov_b32 s3, s5 +; GFX8-PACKED-NEXT: s_mov_b32 s4, s6 +; GFX8-PACKED-NEXT: s_mov_b32 s5, s7 +; GFX8-PACKED-NEXT: s_mov_b32 s6, s8 +; GFX8-PACKED-NEXT: s_mov_b32 s7, s9 +; GFX8-PACKED-NEXT: s_nop 0 +; GFX8-PACKED-NEXT: s_nop 0 +; GFX8-PACKED-NEXT: image_load v0, v0, s[0:7] dmask:0x3 unorm d16 +; GFX8-PACKED-NEXT: s_waitcnt vmcnt(0) +; GFX8-PACKED-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: load_1d_v2f16_xy: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: 
s_mov_b32 s2, s4 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: image_load v0, v0, s[0:7] dmask:0x3 unorm d16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: load_1d_v2f16_xy: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_load v0, v0, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm d16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %v = call <2 x half> @llvm.amdgcn.image.load.1d.v2f16.i32(i32 3, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret <2 x half> %v +} + +define amdgpu_ps <2 x half> @load_1d_v2f16_xz(<8 x i32> inreg %rsrc, i32 %s) { +; GFX8-UNPACKED-LABEL: load_1d_v2f16_xz: +; GFX8-UNPACKED: ; %bb.0: +; GFX8-UNPACKED-NEXT: s_mov_b32 s0, s2 +; GFX8-UNPACKED-NEXT: s_mov_b32 s1, s3 +; GFX8-UNPACKED-NEXT: s_mov_b32 s2, s4 +; GFX8-UNPACKED-NEXT: s_mov_b32 s3, s5 +; GFX8-UNPACKED-NEXT: s_mov_b32 s4, s6 +; GFX8-UNPACKED-NEXT: s_mov_b32 s5, s7 +; GFX8-UNPACKED-NEXT: s_mov_b32 s6, s8 +; GFX8-UNPACKED-NEXT: s_mov_b32 s7, s9 +; GFX8-UNPACKED-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x5 unorm d16 +; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) +; GFX8-UNPACKED-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX8-UNPACKED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-UNPACKED-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-UNPACKED-NEXT: ; return to shader part epilog +; +; GFX8-PACKED-LABEL: load_1d_v2f16_xz: +; GFX8-PACKED: ; %bb.0: +; GFX8-PACKED-NEXT: s_mov_b32 s0, s2 +; GFX8-PACKED-NEXT: s_mov_b32 s1, s3 +; GFX8-PACKED-NEXT: s_mov_b32 s2, s4 +; GFX8-PACKED-NEXT: s_mov_b32 s3, s5 +; GFX8-PACKED-NEXT: s_mov_b32 s4, s6 +; GFX8-PACKED-NEXT: s_mov_b32 s5, s7 +; GFX8-PACKED-NEXT: s_mov_b32 s6, s8 +; GFX8-PACKED-NEXT: s_mov_b32 s7, s9 +; GFX8-PACKED-NEXT: s_nop 0 +; GFX8-PACKED-NEXT: s_nop 0 +; GFX8-PACKED-NEXT: image_load v0, v0, s[0:7] dmask:0x5 unorm d16 +; GFX8-PACKED-NEXT: s_waitcnt vmcnt(0) +; GFX8-PACKED-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: load_1d_v2f16_xz: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: image_load v0, v0, s[0:7] dmask:0x5 unorm d16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: load_1d_v2f16_xz: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_load v0, v0, s[0:7] dmask:0x5 dim:SQ_RSRC_IMG_1D unorm d16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %v = call <2 x half> @llvm.amdgcn.image.load.1d.v2f16.i32(i32 5, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret <2 x half> %v +} + +define amdgpu_ps <2 x half> @load_1d_v2f16_xw(<8 x i32> inreg %rsrc, i32 
%s) { +; GFX8-UNPACKED-LABEL: load_1d_v2f16_xw: +; GFX8-UNPACKED: ; %bb.0: +; GFX8-UNPACKED-NEXT: s_mov_b32 s0, s2 +; GFX8-UNPACKED-NEXT: s_mov_b32 s1, s3 +; GFX8-UNPACKED-NEXT: s_mov_b32 s2, s4 +; GFX8-UNPACKED-NEXT: s_mov_b32 s3, s5 +; GFX8-UNPACKED-NEXT: s_mov_b32 s4, s6 +; GFX8-UNPACKED-NEXT: s_mov_b32 s5, s7 +; GFX8-UNPACKED-NEXT: s_mov_b32 s6, s8 +; GFX8-UNPACKED-NEXT: s_mov_b32 s7, s9 +; GFX8-UNPACKED-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x9 unorm d16 +; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) +; GFX8-UNPACKED-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX8-UNPACKED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-UNPACKED-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-UNPACKED-NEXT: ; return to shader part epilog +; +; GFX8-PACKED-LABEL: load_1d_v2f16_xw: +; GFX8-PACKED: ; %bb.0: +; GFX8-PACKED-NEXT: s_mov_b32 s0, s2 +; GFX8-PACKED-NEXT: s_mov_b32 s1, s3 +; GFX8-PACKED-NEXT: s_mov_b32 s2, s4 +; GFX8-PACKED-NEXT: s_mov_b32 s3, s5 +; GFX8-PACKED-NEXT: s_mov_b32 s4, s6 +; GFX8-PACKED-NEXT: s_mov_b32 s5, s7 +; GFX8-PACKED-NEXT: s_mov_b32 s6, s8 +; GFX8-PACKED-NEXT: s_mov_b32 s7, s9 +; GFX8-PACKED-NEXT: s_nop 0 +; GFX8-PACKED-NEXT: s_nop 0 +; GFX8-PACKED-NEXT: image_load v0, v0, s[0:7] dmask:0x9 unorm d16 +; GFX8-PACKED-NEXT: s_waitcnt vmcnt(0) +; GFX8-PACKED-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: load_1d_v2f16_xw: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: image_load v0, v0, s[0:7] dmask:0x9 unorm d16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: load_1d_v2f16_xw: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_load v0, v0, s[0:7] dmask:0x9 dim:SQ_RSRC_IMG_1D unorm d16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %v = call <2 x half> @llvm.amdgcn.image.load.1d.v2f16.i32(i32 9, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret <2 x half> %v +} + +define amdgpu_ps <2 x half> @load_1d_v2f16_yz(<8 x i32> inreg %rsrc, i32 %s) { +; GFX8-UNPACKED-LABEL: load_1d_v2f16_yz: +; GFX8-UNPACKED: ; %bb.0: +; GFX8-UNPACKED-NEXT: s_mov_b32 s0, s2 +; GFX8-UNPACKED-NEXT: s_mov_b32 s1, s3 +; GFX8-UNPACKED-NEXT: s_mov_b32 s2, s4 +; GFX8-UNPACKED-NEXT: s_mov_b32 s3, s5 +; GFX8-UNPACKED-NEXT: s_mov_b32 s4, s6 +; GFX8-UNPACKED-NEXT: s_mov_b32 s5, s7 +; GFX8-UNPACKED-NEXT: s_mov_b32 s6, s8 +; GFX8-UNPACKED-NEXT: s_mov_b32 s7, s9 +; GFX8-UNPACKED-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x6 unorm d16 +; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) +; GFX8-UNPACKED-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX8-UNPACKED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-UNPACKED-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-UNPACKED-NEXT: ; return to shader part epilog +; +; GFX8-PACKED-LABEL: load_1d_v2f16_yz: +; GFX8-PACKED: ; %bb.0: +; GFX8-PACKED-NEXT: s_mov_b32 s0, s2 +; GFX8-PACKED-NEXT: s_mov_b32 s1, s3 +; GFX8-PACKED-NEXT: s_mov_b32 s2, s4 +; GFX8-PACKED-NEXT: s_mov_b32 s3, s5 +; GFX8-PACKED-NEXT: 
s_mov_b32 s4, s6 +; GFX8-PACKED-NEXT: s_mov_b32 s5, s7 +; GFX8-PACKED-NEXT: s_mov_b32 s6, s8 +; GFX8-PACKED-NEXT: s_mov_b32 s7, s9 +; GFX8-PACKED-NEXT: s_nop 0 +; GFX8-PACKED-NEXT: s_nop 0 +; GFX8-PACKED-NEXT: image_load v0, v0, s[0:7] dmask:0x6 unorm d16 +; GFX8-PACKED-NEXT: s_waitcnt vmcnt(0) +; GFX8-PACKED-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: load_1d_v2f16_yz: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: image_load v0, v0, s[0:7] dmask:0x6 unorm d16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: load_1d_v2f16_yz: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_load v0, v0, s[0:7] dmask:0x6 dim:SQ_RSRC_IMG_1D unorm d16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %v = call <2 x half> @llvm.amdgcn.image.load.1d.v2f16.i32(i32 6, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret <2 x half> %v +} + +; FIXME: +; define amdgpu_ps <3 x half> @load_1d_v3f16_xyz(<8 x i32> inreg %rsrc, i32 %s) { +; %v = call <3 x half> @llvm.amdgcn.image.load.1d.v3f16.i32(i32 7, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) +; ret <3 x half> %v +; } + +define amdgpu_ps <4 x half> @load_1d_v4f16_xyzw(<8 x i32> inreg %rsrc, i32 %s) { +; GFX8-UNPACKED-LABEL: load_1d_v4f16_xyzw: +; GFX8-UNPACKED: ; %bb.0: +; GFX8-UNPACKED-NEXT: s_mov_b32 s0, s2 +; GFX8-UNPACKED-NEXT: s_mov_b32 s1, s3 +; GFX8-UNPACKED-NEXT: s_mov_b32 s2, s4 +; GFX8-UNPACKED-NEXT: s_mov_b32 s3, s5 +; GFX8-UNPACKED-NEXT: s_mov_b32 s4, s6 +; GFX8-UNPACKED-NEXT: s_mov_b32 s5, s7 +; GFX8-UNPACKED-NEXT: s_mov_b32 s6, s8 +; GFX8-UNPACKED-NEXT: s_mov_b32 s7, s9 +; GFX8-UNPACKED-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf unorm d16 +; GFX8-UNPACKED-NEXT: s_mov_b32 s0, 0xffff +; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) +; GFX8-UNPACKED-NEXT: v_and_b32_e32 v1, s0, v1 +; GFX8-UNPACKED-NEXT: v_and_b32_e32 v3, s0, v3 +; GFX8-UNPACKED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-UNPACKED-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX8-UNPACKED-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-UNPACKED-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-UNPACKED-NEXT: ; return to shader part epilog +; +; GFX8-PACKED-LABEL: load_1d_v4f16_xyzw: +; GFX8-PACKED: ; %bb.0: +; GFX8-PACKED-NEXT: s_mov_b32 s0, s2 +; GFX8-PACKED-NEXT: s_mov_b32 s1, s3 +; GFX8-PACKED-NEXT: s_mov_b32 s2, s4 +; GFX8-PACKED-NEXT: s_mov_b32 s3, s5 +; GFX8-PACKED-NEXT: s_mov_b32 s4, s6 +; GFX8-PACKED-NEXT: s_mov_b32 s5, s7 +; GFX8-PACKED-NEXT: s_mov_b32 s6, s8 +; GFX8-PACKED-NEXT: s_mov_b32 s7, s9 +; GFX8-PACKED-NEXT: s_nop 0 +; GFX8-PACKED-NEXT: s_nop 0 +; GFX8-PACKED-NEXT: image_load v[0:1], v0, s[0:7] dmask:0xf unorm d16 +; GFX8-PACKED-NEXT: s_waitcnt vmcnt(0) +; GFX8-PACKED-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: load_1d_v4f16_xyzw: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 +; GFX9-NEXT: s_mov_b32 s3, s5 +; 
GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: image_load v[0:1], v0, s[0:7] dmask:0xf unorm d16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: load_1d_v4f16_xyzw: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_load v[0:1], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm d16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %v = call <4 x half> @llvm.amdgcn.image.load.1d.v4f16.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret <4 x half> %v +} + +define amdgpu_ps float @load_1d_f16_tfe_dmask_x(<8 x i32> inreg %rsrc, i32 %s) { +; GFX8-UNPACKED-LABEL: load_1d_f16_tfe_dmask_x: +; GFX8-UNPACKED: ; %bb.0: +; GFX8-UNPACKED-NEXT: s_mov_b32 s0, s2 +; GFX8-UNPACKED-NEXT: s_mov_b32 s1, s3 +; GFX8-UNPACKED-NEXT: s_mov_b32 s2, s4 +; GFX8-UNPACKED-NEXT: s_mov_b32 s3, s5 +; GFX8-UNPACKED-NEXT: s_mov_b32 s4, s6 +; GFX8-UNPACKED-NEXT: s_mov_b32 s5, s7 +; GFX8-UNPACKED-NEXT: s_mov_b32 s6, s8 +; GFX8-UNPACKED-NEXT: s_mov_b32 s7, s9 +; GFX8-UNPACKED-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x1 unorm tfe d16 +; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) +; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v0, v1 +; GFX8-UNPACKED-NEXT: ; return to shader part epilog +; +; GFX8-PACKED-LABEL: load_1d_f16_tfe_dmask_x: +; GFX8-PACKED: ; %bb.0: +; GFX8-PACKED-NEXT: s_mov_b32 s0, s2 +; GFX8-PACKED-NEXT: s_mov_b32 s1, s3 +; GFX8-PACKED-NEXT: s_mov_b32 s2, s4 +; GFX8-PACKED-NEXT: s_mov_b32 s3, s5 +; GFX8-PACKED-NEXT: s_mov_b32 s4, s6 +; GFX8-PACKED-NEXT: s_mov_b32 s5, s7 +; GFX8-PACKED-NEXT: s_mov_b32 s6, s8 +; GFX8-PACKED-NEXT: s_mov_b32 s7, s9 +; GFX8-PACKED-NEXT: s_nop 0 +; GFX8-PACKED-NEXT: s_nop 0 +; GFX8-PACKED-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x1 unorm tfe d16 +; GFX8-PACKED-NEXT: s_waitcnt vmcnt(0) +; GFX8-PACKED-NEXT: v_mov_b32_e32 v0, v1 +; GFX8-PACKED-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: load_1d_f16_tfe_dmask_x: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x1 unorm tfe d16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: load_1d_f16_tfe_dmask_x: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm tfe d16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, v1 +; GFX10-NEXT: ; return to shader part epilog + %v = call { half, i32 } @llvm.amdgcn.image.load.1d.sl_f16i32s.i32(i32 1, i32 %s, <8 x i32> %rsrc, i32 1, i32 0) + %v.err = extractvalue { half, i32 } %v, 1 + %vv = bitcast i32 %v.err to float + ret float %vv +} + +define amdgpu_ps float 
@load_1d_v2f16_tfe_dmask_xy(<8 x i32> inreg %rsrc, i32 %s) { +; GFX8-UNPACKED-LABEL: load_1d_v2f16_tfe_dmask_xy: +; GFX8-UNPACKED: ; %bb.0: +; GFX8-UNPACKED-NEXT: s_mov_b32 s0, s2 +; GFX8-UNPACKED-NEXT: s_mov_b32 s1, s3 +; GFX8-UNPACKED-NEXT: s_mov_b32 s2, s4 +; GFX8-UNPACKED-NEXT: s_mov_b32 s3, s5 +; GFX8-UNPACKED-NEXT: s_mov_b32 s4, s6 +; GFX8-UNPACKED-NEXT: s_mov_b32 s5, s7 +; GFX8-UNPACKED-NEXT: s_mov_b32 s6, s8 +; GFX8-UNPACKED-NEXT: s_mov_b32 s7, s9 +; GFX8-UNPACKED-NEXT: image_load v[0:2], v0, s[0:7] dmask:0x3 unorm tfe d16 +; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) +; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v0, v2 +; GFX8-UNPACKED-NEXT: ; return to shader part epilog +; +; GFX8-PACKED-LABEL: load_1d_v2f16_tfe_dmask_xy: +; GFX8-PACKED: ; %bb.0: +; GFX8-PACKED-NEXT: s_mov_b32 s0, s2 +; GFX8-PACKED-NEXT: s_mov_b32 s1, s3 +; GFX8-PACKED-NEXT: s_mov_b32 s2, s4 +; GFX8-PACKED-NEXT: s_mov_b32 s3, s5 +; GFX8-PACKED-NEXT: s_mov_b32 s4, s6 +; GFX8-PACKED-NEXT: s_mov_b32 s5, s7 +; GFX8-PACKED-NEXT: s_mov_b32 s6, s8 +; GFX8-PACKED-NEXT: s_mov_b32 s7, s9 +; GFX8-PACKED-NEXT: s_nop 0 +; GFX8-PACKED-NEXT: s_nop 0 +; GFX8-PACKED-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x3 unorm tfe d16 +; GFX8-PACKED-NEXT: s_waitcnt vmcnt(0) +; GFX8-PACKED-NEXT: v_mov_b32_e32 v0, v1 +; GFX8-PACKED-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: load_1d_v2f16_tfe_dmask_xy: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x3 unorm tfe d16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: load_1d_v2f16_tfe_dmask_xy: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm tfe d16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, v1 +; GFX10-NEXT: ; return to shader part epilog + %v = call { <2 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v2f16i32s.i32(i32 3, i32 %s, <8 x i32> %rsrc, i32 1, i32 0) + %v.err = extractvalue { <2 x half>, i32 } %v, 1 + %vv = bitcast i32 %v.err to float + ret float %vv +} + +; FIXME: +; define amdgpu_ps float @load_1d_v3f16_tfe_dmask_xyz(<8 x i32> inreg %rsrc, i32 %s) { +; %v = call { <3 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v3f16i32s.i32(i32 7, i32 %s, <8 x i32> %rsrc, i32 1, i32 0) +; %v.err = extractvalue { <3 x half>, i32 } %v, 1 +; %vv = bitcast i32 %v.err to float +; ret float %vv +; } + +define amdgpu_ps float @load_1d_v4f16_tfe_dmask_xyzw(<8 x i32> inreg %rsrc, i32 %s) { +; GFX8-UNPACKED-LABEL: load_1d_v4f16_tfe_dmask_xyzw: +; GFX8-UNPACKED: ; %bb.0: +; GFX8-UNPACKED-NEXT: s_mov_b32 s0, s2 +; GFX8-UNPACKED-NEXT: s_mov_b32 s1, s3 +; GFX8-UNPACKED-NEXT: s_mov_b32 s2, s4 +; GFX8-UNPACKED-NEXT: s_mov_b32 s3, s5 +; GFX8-UNPACKED-NEXT: s_mov_b32 s4, s6 +; GFX8-UNPACKED-NEXT: s_mov_b32 s5, s7 +; GFX8-UNPACKED-NEXT: s_mov_b32 s6, s8 +; GFX8-UNPACKED-NEXT: s_mov_b32 s7, s9 +; GFX8-UNPACKED-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x10 unorm tfe d16 +; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) 
+; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v0, v1 +; GFX8-UNPACKED-NEXT: ; return to shader part epilog +; +; GFX8-PACKED-LABEL: load_1d_v4f16_tfe_dmask_xyzw: +; GFX8-PACKED: ; %bb.0: +; GFX8-PACKED-NEXT: s_mov_b32 s0, s2 +; GFX8-PACKED-NEXT: s_mov_b32 s1, s3 +; GFX8-PACKED-NEXT: s_mov_b32 s2, s4 +; GFX8-PACKED-NEXT: s_mov_b32 s3, s5 +; GFX8-PACKED-NEXT: s_mov_b32 s4, s6 +; GFX8-PACKED-NEXT: s_mov_b32 s5, s7 +; GFX8-PACKED-NEXT: s_mov_b32 s6, s8 +; GFX8-PACKED-NEXT: s_mov_b32 s7, s9 +; GFX8-PACKED-NEXT: s_nop 0 +; GFX8-PACKED-NEXT: s_nop 0 +; GFX8-PACKED-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x10 unorm tfe d16 +; GFX8-PACKED-NEXT: s_waitcnt vmcnt(0) +; GFX8-PACKED-NEXT: v_mov_b32_e32 v0, v1 +; GFX8-PACKED-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: load_1d_v4f16_tfe_dmask_xyzw: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x10 unorm tfe d16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: load_1d_v4f16_tfe_dmask_xyzw: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x10 dim:SQ_RSRC_IMG_1D unorm tfe d16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, v1 +; GFX10-NEXT: ; return to shader part epilog + %v = call { <4 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v4f16i32s.i32(i32 16, i32 %s, <8 x i32> %rsrc, i32 1, i32 0) + %v.err = extractvalue { <4 x half>, i32 } %v, 1 + %vv = bitcast i32 %v.err to float + ret float %vv +} + +declare half @llvm.amdgcn.image.load.1d.half.i32(i32 immarg, i32, <8 x i32>, i32 immarg, i32 immarg) #0 +declare <2 x half> @llvm.amdgcn.image.load.1d.v2f16.i32(i32 immarg, i32, <8 x i32>, i32 immarg, i32 immarg) #0 +declare <3 x half> @llvm.amdgcn.image.load.1d.v3f16.i32(i32 immarg, i32, <8 x i32>, i32 immarg, i32 immarg) #0 +declare <4 x half> @llvm.amdgcn.image.load.1d.v4f16.i32(i32 immarg, i32, <8 x i32>, i32 immarg, i32 immarg) #0 + +declare { half, i32 } @llvm.amdgcn.image.load.1d.sl_f16i32s.i32(i32 immarg, i32, <8 x i32>, i32 immarg, i32 immarg) #0 +declare { <2 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v2f16i32s.i32(i32 immarg, i32, <8 x i32>, i32 immarg, i32 immarg) #0 +declare { <3 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v3f16i32s.i32(i32 immarg, i32, <8 x i32>, i32 immarg, i32 immarg) #0 +declare { <4 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v4f16i32s.i32(i32 immarg, i32, <8 x i32>, i32 immarg, i32 immarg) #0 + +attributes #0 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.ll new file mode 100644 index 0000000..0062118 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.ll @@ -0,0 +1,746 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GFX6 %s +; RUN: llc -global-isel 
-mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s + +define amdgpu_ps float @load_1d_f32_x(<8 x i32> inreg %rsrc, i32 %s) { +; GFX6-LABEL: load_1d_f32_x: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_mov_b32 s0, s2 +; GFX6-NEXT: s_mov_b32 s1, s3 +; GFX6-NEXT: s_mov_b32 s2, s4 +; GFX6-NEXT: s_mov_b32 s3, s5 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s8 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: image_load v0, v0, s[0:7] dmask:0x1 unorm +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: load_1d_f32_x: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_mov_b32 s0, s2 +; GFX8-NEXT: s_mov_b32 s1, s3 +; GFX8-NEXT: s_mov_b32 s2, s4 +; GFX8-NEXT: s_mov_b32 s3, s5 +; GFX8-NEXT: s_mov_b32 s4, s6 +; GFX8-NEXT: s_mov_b32 s5, s7 +; GFX8-NEXT: s_mov_b32 s6, s8 +; GFX8-NEXT: s_mov_b32 s7, s9 +; GFX8-NEXT: image_load v0, v0, s[0:7] dmask:0x1 unorm +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: load_1d_f32_x: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_load v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %v = call float @llvm.amdgcn.image.load.1d.f32.i32(i32 1, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret float %v +} + +define amdgpu_ps float @load_1d_f32_y(<8 x i32> inreg %rsrc, i32 %s) { +; GFX6-LABEL: load_1d_f32_y: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_mov_b32 s0, s2 +; GFX6-NEXT: s_mov_b32 s1, s3 +; GFX6-NEXT: s_mov_b32 s2, s4 +; GFX6-NEXT: s_mov_b32 s3, s5 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s8 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: image_load v0, v0, s[0:7] dmask:0x2 unorm +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: load_1d_f32_y: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_mov_b32 s0, s2 +; GFX8-NEXT: s_mov_b32 s1, s3 +; GFX8-NEXT: s_mov_b32 s2, s4 +; GFX8-NEXT: s_mov_b32 s3, s5 +; GFX8-NEXT: s_mov_b32 s4, s6 +; GFX8-NEXT: s_mov_b32 s5, s7 +; GFX8-NEXT: s_mov_b32 s6, s8 +; GFX8-NEXT: s_mov_b32 s7, s9 +; GFX8-NEXT: image_load v0, v0, s[0:7] dmask:0x2 unorm +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: load_1d_f32_y: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_load v0, v0, s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_1D unorm +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %v = call float @llvm.amdgcn.image.load.1d.f32.i32(i32 2, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret float %v +} + +define amdgpu_ps float @load_1d_f32_z(<8 x i32> inreg %rsrc, i32 %s) { +; GFX6-LABEL: load_1d_f32_z: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_mov_b32 s0, s2 +; GFX6-NEXT: s_mov_b32 s1, s3 +; GFX6-NEXT: s_mov_b32 
s2, s4 +; GFX6-NEXT: s_mov_b32 s3, s5 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s8 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: image_load v0, v0, s[0:7] dmask:0x4 unorm +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: load_1d_f32_z: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_mov_b32 s0, s2 +; GFX8-NEXT: s_mov_b32 s1, s3 +; GFX8-NEXT: s_mov_b32 s2, s4 +; GFX8-NEXT: s_mov_b32 s3, s5 +; GFX8-NEXT: s_mov_b32 s4, s6 +; GFX8-NEXT: s_mov_b32 s5, s7 +; GFX8-NEXT: s_mov_b32 s6, s8 +; GFX8-NEXT: s_mov_b32 s7, s9 +; GFX8-NEXT: image_load v0, v0, s[0:7] dmask:0x4 unorm +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: load_1d_f32_z: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_load v0, v0, s[0:7] dmask:0x4 dim:SQ_RSRC_IMG_1D unorm +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %v = call float @llvm.amdgcn.image.load.1d.f32.i32(i32 4, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret float %v +} + +define amdgpu_ps float @load_1d_f32_w(<8 x i32> inreg %rsrc, i32 %s) { +; GFX6-LABEL: load_1d_f32_w: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_mov_b32 s0, s2 +; GFX6-NEXT: s_mov_b32 s1, s3 +; GFX6-NEXT: s_mov_b32 s2, s4 +; GFX6-NEXT: s_mov_b32 s3, s5 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s8 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: image_load v0, v0, s[0:7] dmask:0x8 unorm +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: load_1d_f32_w: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_mov_b32 s0, s2 +; GFX8-NEXT: s_mov_b32 s1, s3 +; GFX8-NEXT: s_mov_b32 s2, s4 +; GFX8-NEXT: s_mov_b32 s3, s5 +; GFX8-NEXT: s_mov_b32 s4, s6 +; GFX8-NEXT: s_mov_b32 s5, s7 +; GFX8-NEXT: s_mov_b32 s6, s8 +; GFX8-NEXT: s_mov_b32 s7, s9 +; GFX8-NEXT: image_load v0, v0, s[0:7] dmask:0x8 unorm +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: load_1d_f32_w: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_load v0, v0, s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_1D unorm +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %v = call float @llvm.amdgcn.image.load.1d.f32.i32(i32 8, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret float %v +} + +define amdgpu_ps <2 x float> @load_1d_v2f32_xy(<8 x i32> inreg %rsrc, i32 %s) { +; GFX6-LABEL: load_1d_v2f32_xy: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_mov_b32 s0, s2 +; GFX6-NEXT: s_mov_b32 s1, s3 +; GFX6-NEXT: s_mov_b32 s2, s4 +; GFX6-NEXT: s_mov_b32 s3, s5 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s8 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x3 unorm +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: load_1d_v2f32_xy: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_mov_b32 s0, s2 +; GFX8-NEXT: s_mov_b32 s1, s3 +; 
GFX8-NEXT: s_mov_b32 s2, s4 +; GFX8-NEXT: s_mov_b32 s3, s5 +; GFX8-NEXT: s_mov_b32 s4, s6 +; GFX8-NEXT: s_mov_b32 s5, s7 +; GFX8-NEXT: s_mov_b32 s6, s8 +; GFX8-NEXT: s_mov_b32 s7, s9 +; GFX8-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x3 unorm +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: load_1d_v2f32_xy: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %v = call <2 x float> @llvm.amdgcn.image.load.1d.v2f32.i32(i32 3, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret <2 x float> %v +} + +define amdgpu_ps <2 x float> @load_1d_v2f32_xz(<8 x i32> inreg %rsrc, i32 %s) { +; GFX6-LABEL: load_1d_v2f32_xz: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_mov_b32 s0, s2 +; GFX6-NEXT: s_mov_b32 s1, s3 +; GFX6-NEXT: s_mov_b32 s2, s4 +; GFX6-NEXT: s_mov_b32 s3, s5 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s8 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x5 unorm +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: load_1d_v2f32_xz: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_mov_b32 s0, s2 +; GFX8-NEXT: s_mov_b32 s1, s3 +; GFX8-NEXT: s_mov_b32 s2, s4 +; GFX8-NEXT: s_mov_b32 s3, s5 +; GFX8-NEXT: s_mov_b32 s4, s6 +; GFX8-NEXT: s_mov_b32 s5, s7 +; GFX8-NEXT: s_mov_b32 s6, s8 +; GFX8-NEXT: s_mov_b32 s7, s9 +; GFX8-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x5 unorm +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: load_1d_v2f32_xz: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x5 dim:SQ_RSRC_IMG_1D unorm +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %v = call <2 x float> @llvm.amdgcn.image.load.1d.v2f32.i32(i32 5, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret <2 x float> %v +} + +define amdgpu_ps <2 x float> @load_1d_v2f32_xw(<8 x i32> inreg %rsrc, i32 %s) { +; GFX6-LABEL: load_1d_v2f32_xw: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_mov_b32 s0, s2 +; GFX6-NEXT: s_mov_b32 s1, s3 +; GFX6-NEXT: s_mov_b32 s2, s4 +; GFX6-NEXT: s_mov_b32 s3, s5 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s8 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x9 unorm +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: load_1d_v2f32_xw: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_mov_b32 s0, s2 +; GFX8-NEXT: s_mov_b32 s1, s3 +; GFX8-NEXT: s_mov_b32 s2, s4 +; GFX8-NEXT: s_mov_b32 s3, s5 +; GFX8-NEXT: s_mov_b32 s4, s6 +; GFX8-NEXT: s_mov_b32 s5, s7 +; GFX8-NEXT: s_mov_b32 s6, s8 +; GFX8-NEXT: s_mov_b32 s7, s9 +; GFX8-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x9 unorm +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: 
load_1d_v2f32_xw: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x9 dim:SQ_RSRC_IMG_1D unorm +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %v = call <2 x float> @llvm.amdgcn.image.load.1d.v2f32.i32(i32 9, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret <2 x float> %v +} + +define amdgpu_ps <2 x float> @load_1d_v2f32_yz(<8 x i32> inreg %rsrc, i32 %s) { +; GFX6-LABEL: load_1d_v2f32_yz: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_mov_b32 s0, s2 +; GFX6-NEXT: s_mov_b32 s1, s3 +; GFX6-NEXT: s_mov_b32 s2, s4 +; GFX6-NEXT: s_mov_b32 s3, s5 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s8 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x6 unorm +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: load_1d_v2f32_yz: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_mov_b32 s0, s2 +; GFX8-NEXT: s_mov_b32 s1, s3 +; GFX8-NEXT: s_mov_b32 s2, s4 +; GFX8-NEXT: s_mov_b32 s3, s5 +; GFX8-NEXT: s_mov_b32 s4, s6 +; GFX8-NEXT: s_mov_b32 s5, s7 +; GFX8-NEXT: s_mov_b32 s6, s8 +; GFX8-NEXT: s_mov_b32 s7, s9 +; GFX8-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x6 unorm +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: load_1d_v2f32_yz: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x6 dim:SQ_RSRC_IMG_1D unorm +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %v = call <2 x float> @llvm.amdgcn.image.load.1d.v2f32.i32(i32 6, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret <2 x float> %v +} + +define amdgpu_ps <3 x float> @load_1d_v3f32_xyz(<8 x i32> inreg %rsrc, i32 %s) { +; GFX6-LABEL: load_1d_v3f32_xyz: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_mov_b32 s0, s2 +; GFX6-NEXT: s_mov_b32 s1, s3 +; GFX6-NEXT: s_mov_b32 s2, s4 +; GFX6-NEXT: s_mov_b32 s3, s5 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s8 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: image_load v[0:2], v0, s[0:7] dmask:0x7 unorm +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: load_1d_v3f32_xyz: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_mov_b32 s0, s2 +; GFX8-NEXT: s_mov_b32 s1, s3 +; GFX8-NEXT: s_mov_b32 s2, s4 +; GFX8-NEXT: s_mov_b32 s3, s5 +; GFX8-NEXT: s_mov_b32 s4, s6 +; GFX8-NEXT: s_mov_b32 s5, s7 +; GFX8-NEXT: s_mov_b32 s6, s8 +; GFX8-NEXT: s_mov_b32 s7, s9 +; GFX8-NEXT: image_load v[0:2], v0, s[0:7] dmask:0x7 unorm +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: load_1d_v3f32_xyz: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: 
image_load v[0:2], v0, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D unorm +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %v = call <3 x float> @llvm.amdgcn.image.load.1d.v3f32.i32(i32 7, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret <3 x float> %v +} + +define amdgpu_ps <4 x float> @load_1d_v4f32_xyzw(<8 x i32> inreg %rsrc, i32 %s) { +; GFX6-LABEL: load_1d_v4f32_xyzw: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_mov_b32 s0, s2 +; GFX6-NEXT: s_mov_b32 s1, s3 +; GFX6-NEXT: s_mov_b32 s2, s4 +; GFX6-NEXT: s_mov_b32 s3, s5 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s8 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf unorm +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: load_1d_v4f32_xyzw: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_mov_b32 s0, s2 +; GFX8-NEXT: s_mov_b32 s1, s3 +; GFX8-NEXT: s_mov_b32 s2, s4 +; GFX8-NEXT: s_mov_b32 s3, s5 +; GFX8-NEXT: s_mov_b32 s4, s6 +; GFX8-NEXT: s_mov_b32 s5, s7 +; GFX8-NEXT: s_mov_b32 s6, s8 +; GFX8-NEXT: s_mov_b32 s7, s9 +; GFX8-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf unorm +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: load_1d_v4f32_xyzw: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps float @load_1d_f32_tfe_dmask_x(<8 x i32> inreg %rsrc, i32 %s) { +; GFX6-LABEL: load_1d_f32_tfe_dmask_x: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_mov_b32 s0, s2 +; GFX6-NEXT: s_mov_b32 s1, s3 +; GFX6-NEXT: s_mov_b32 s2, s4 +; GFX6-NEXT: s_mov_b32 s3, s5 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s8 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x1 unorm tfe +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, v1 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: load_1d_f32_tfe_dmask_x: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_mov_b32 s0, s2 +; GFX8-NEXT: s_mov_b32 s1, s3 +; GFX8-NEXT: s_mov_b32 s2, s4 +; GFX8-NEXT: s_mov_b32 s3, s5 +; GFX8-NEXT: s_mov_b32 s4, s6 +; GFX8-NEXT: s_mov_b32 s5, s7 +; GFX8-NEXT: s_mov_b32 s6, s8 +; GFX8-NEXT: s_mov_b32 s7, s9 +; GFX8-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x1 unorm tfe +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, v1 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: load_1d_f32_tfe_dmask_x: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm tfe +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, v1 +; GFX10-NEXT: ; return to shader part epilog + %v = call { float, i32 } 
@llvm.amdgcn.image.load.1d.sl_f32i32s.i32(i32 1, i32 %s, <8 x i32> %rsrc, i32 1, i32 0) + %v.err = extractvalue { float, i32 } %v, 1 + %vv = bitcast i32 %v.err to float + ret float %vv +} + +define amdgpu_ps float @load_1d_v2f32_tfe_dmask_xy(<8 x i32> inreg %rsrc, i32 %s) { +; GFX6-LABEL: load_1d_v2f32_tfe_dmask_xy: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_mov_b32 s0, s2 +; GFX6-NEXT: s_mov_b32 s1, s3 +; GFX6-NEXT: s_mov_b32 s2, s4 +; GFX6-NEXT: s_mov_b32 s3, s5 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s8 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: image_load v[0:2], v0, s[0:7] dmask:0x3 unorm tfe +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, v2 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: load_1d_v2f32_tfe_dmask_xy: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_mov_b32 s0, s2 +; GFX8-NEXT: s_mov_b32 s1, s3 +; GFX8-NEXT: s_mov_b32 s2, s4 +; GFX8-NEXT: s_mov_b32 s3, s5 +; GFX8-NEXT: s_mov_b32 s4, s6 +; GFX8-NEXT: s_mov_b32 s5, s7 +; GFX8-NEXT: s_mov_b32 s6, s8 +; GFX8-NEXT: s_mov_b32 s7, s9 +; GFX8-NEXT: image_load v[0:2], v0, s[0:7] dmask:0x3 unorm tfe +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, v2 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: load_1d_v2f32_tfe_dmask_xy: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_load v[0:2], v0, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm tfe +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, v2 +; GFX10-NEXT: ; return to shader part epilog + %v = call { <2 x float>, i32 } @llvm.amdgcn.image.load.1d.sl_v2f32i32s.i32(i32 3, i32 %s, <8 x i32> %rsrc, i32 1, i32 0) + %v.err = extractvalue { <2 x float>, i32 } %v, 1 + %vv = bitcast i32 %v.err to float + ret float %vv +} + +define amdgpu_ps float @load_1d_v3f32_tfe_dmask_xyz(<8 x i32> inreg %rsrc, i32 %s) { +; GFX6-LABEL: load_1d_v3f32_tfe_dmask_xyz: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_mov_b32 s0, s2 +; GFX6-NEXT: s_mov_b32 s1, s3 +; GFX6-NEXT: s_mov_b32 s2, s4 +; GFX6-NEXT: s_mov_b32 s3, s5 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s8 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: image_load v[0:3], v0, s[0:7] dmask:0x7 unorm tfe +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, v3 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: load_1d_v3f32_tfe_dmask_xyz: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_mov_b32 s0, s2 +; GFX8-NEXT: s_mov_b32 s1, s3 +; GFX8-NEXT: s_mov_b32 s2, s4 +; GFX8-NEXT: s_mov_b32 s3, s5 +; GFX8-NEXT: s_mov_b32 s4, s6 +; GFX8-NEXT: s_mov_b32 s5, s7 +; GFX8-NEXT: s_mov_b32 s6, s8 +; GFX8-NEXT: s_mov_b32 s7, s9 +; GFX8-NEXT: image_load v[0:3], v0, s[0:7] dmask:0x7 unorm tfe +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: load_1d_v3f32_tfe_dmask_xyz: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_load v[0:3], v0, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D unorm tfe +; 
GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: ; return to shader part epilog + %v = call { <3 x float>, i32 } @llvm.amdgcn.image.load.1d.sl_v3f32i32s.i32(i32 7, i32 %s, <8 x i32> %rsrc, i32 1, i32 0) + %v.err = extractvalue { <3 x float>, i32 } %v, 1 + %vv = bitcast i32 %v.err to float + ret float %vv +} + +define amdgpu_ps float @load_1d_v4f32_tfe_dmask_xyzw(<8 x i32> inreg %rsrc, i32 %s) { +; GFX6-LABEL: load_1d_v4f32_tfe_dmask_xyzw: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_mov_b32 s0, s2 +; GFX6-NEXT: s_mov_b32 s1, s3 +; GFX6-NEXT: s_mov_b32 s2, s4 +; GFX6-NEXT: s_mov_b32 s3, s5 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s8 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x10 unorm tfe +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, v1 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: load_1d_v4f32_tfe_dmask_xyzw: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_mov_b32 s0, s2 +; GFX8-NEXT: s_mov_b32 s1, s3 +; GFX8-NEXT: s_mov_b32 s2, s4 +; GFX8-NEXT: s_mov_b32 s3, s5 +; GFX8-NEXT: s_mov_b32 s4, s6 +; GFX8-NEXT: s_mov_b32 s5, s7 +; GFX8-NEXT: s_mov_b32 s6, s8 +; GFX8-NEXT: s_mov_b32 s7, s9 +; GFX8-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x10 unorm tfe +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, v1 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: load_1d_v4f32_tfe_dmask_xyzw: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x10 dim:SQ_RSRC_IMG_1D unorm tfe +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, v1 +; GFX10-NEXT: ; return to shader part epilog + %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.1d.sl_v4f32i32s.i32(i32 16, i32 %s, <8 x i32> %rsrc, i32 1, i32 0) + %v.err = extractvalue { <4 x float>, i32 } %v, 1 + %vv = bitcast i32 %v.err to float + ret float %vv +} + +define amdgpu_ps float @load_1d_f32_tfe_dmask_0(<8 x i32> inreg %rsrc, i32 %s) { +; GFX6-LABEL: load_1d_f32_tfe_dmask_0: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_mov_b32 s0, s2 +; GFX6-NEXT: s_mov_b32 s1, s3 +; GFX6-NEXT: s_mov_b32 s2, s4 +; GFX6-NEXT: s_mov_b32 s3, s5 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s8 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x1 unorm tfe +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, v1 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: load_1d_f32_tfe_dmask_0: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_mov_b32 s0, s2 +; GFX8-NEXT: s_mov_b32 s1, s3 +; GFX8-NEXT: s_mov_b32 s2, s4 +; GFX8-NEXT: s_mov_b32 s3, s5 +; GFX8-NEXT: s_mov_b32 s4, s6 +; GFX8-NEXT: s_mov_b32 s5, s7 +; GFX8-NEXT: s_mov_b32 s6, s8 +; GFX8-NEXT: s_mov_b32 s7, s9 +; GFX8-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x1 unorm tfe +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, v1 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: load_1d_f32_tfe_dmask_0: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; 
GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm tfe +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, v1 +; GFX10-NEXT: ; return to shader part epilog + %v = call { float, i32 } @llvm.amdgcn.image.load.1d.sl_f32i32s.i32(i32 0, i32 %s, <8 x i32> %rsrc, i32 1, i32 0) + %v.err = extractvalue { float, i32 } %v, 1 + %vv = bitcast i32 %v.err to float + ret float %vv +} + +declare float @llvm.amdgcn.image.load.1d.f32.i32(i32 immarg, i32, <8 x i32>, i32 immarg, i32 immarg) #0 +declare <2 x float> @llvm.amdgcn.image.load.1d.v2f32.i32(i32 immarg, i32, <8 x i32>, i32 immarg, i32 immarg) #0 +declare <3 x float> @llvm.amdgcn.image.load.1d.v3f32.i32(i32 immarg, i32, <8 x i32>, i32 immarg, i32 immarg) #0 +declare <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 immarg, i32, <8 x i32>, i32 immarg, i32 immarg) #0 + +declare { float, i32 } @llvm.amdgcn.image.load.1d.sl_f32i32s.i32(i32 immarg, i32, <8 x i32>, i32 immarg, i32 immarg) #0 +declare { <2 x float>, i32 } @llvm.amdgcn.image.load.1d.sl_v2f32i32s.i32(i32 immarg, i32, <8 x i32>, i32 immarg, i32 immarg) #0 +declare { <3 x float>, i32 } @llvm.amdgcn.image.load.1d.sl_v3f32i32s.i32(i32 immarg, i32, <8 x i32>, i32 immarg, i32 immarg) #0 +declare { <4 x float>, i32 } @llvm.amdgcn.image.load.1d.sl_v4f32i32s.i32(i32 immarg, i32, <8 x i32>, i32 immarg, i32 immarg) #0 + +attributes #0 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2d.ll new file mode 100644 index 0000000..fdaca8f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2d.ll @@ -0,0 +1,133 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GFX6 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s + +define amdgpu_ps <4 x float> @load_2d_v4f32_xyzw(<8 x i32> inreg %rsrc, i32 %s, i32 %t) { +; GFX6-LABEL: load_2d_v4f32_xyzw: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_mov_b32 s0, s2 +; GFX6-NEXT: s_mov_b32 s1, s3 +; GFX6-NEXT: s_mov_b32 s2, s4 +; GFX6-NEXT: s_mov_b32 s3, s5 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s8 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: load_2d_v4f32_xyzw: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %v = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @load_2d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t) { +; GFX6-LABEL: load_2d_v4f32_xyzw_tfe: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_mov_b32 s0, s2 +; GFX6-NEXT: s_mov_b32 s1, s3 +; 
GFX6-NEXT: s_mov_b32 s2, s4 +; GFX6-NEXT: s_mov_b32 s3, s5 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s8 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: image_load v[0:4], v[0:1], s[0:7] dmask:0xf unorm tfe +; GFX6-NEXT: s_mov_b32 s8, s10 +; GFX6-NEXT: s_mov_b32 s9, s11 +; GFX6-NEXT: s_mov_b32 s10, -1 +; GFX6-NEXT: s_mov_b32 s11, 0xf000 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v4, off, s[8:11], 0 +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX6-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: load_2d_v4f32_xyzw_tfe: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: v_mov_b32_e32 v5, s10 +; GFX10-NEXT: v_mov_b32_e32 v6, s11 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_load v[0:4], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm tfe +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dword v[5:6], v4, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: ; return to shader part epilog + %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v4f32i32s.i32(i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0) + %v.vec = extractvalue { <4 x float>, i32 } %v, 0 + %v.err = extractvalue { <4 x float>, i32 } %v, 1 + store i32 %v.err, i32 addrspace(1)* %out, align 4 + ret <4 x float> %v.vec +} + +define amdgpu_ps <4 x float> @load_2d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t) { +; GFX6-LABEL: load_2d_v4f32_xyzw_tfe_lwe: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_mov_b32 s0, s2 +; GFX6-NEXT: s_mov_b32 s1, s3 +; GFX6-NEXT: s_mov_b32 s2, s4 +; GFX6-NEXT: s_mov_b32 s3, s5 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s8 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: image_load v[0:4], v[0:1], s[0:7] dmask:0xf unorm tfe lwe +; GFX6-NEXT: s_mov_b32 s8, s10 +; GFX6-NEXT: s_mov_b32 s9, s11 +; GFX6-NEXT: s_mov_b32 s10, -1 +; GFX6-NEXT: s_mov_b32 s11, 0xf000 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v4, off, s[8:11], 0 +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX6-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: load_2d_v4f32_xyzw_tfe_lwe: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: v_mov_b32_e32 v5, s10 +; GFX10-NEXT: v_mov_b32_e32 v6, s11 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_load v[0:4], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm tfe lwe +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dword v[5:6], v4, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: ; return to shader part epilog + %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v4f32i32s.i32(i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 3, i32 0) + %v.vec = extractvalue { <4 x float>, i32 } %v, 0 + %v.err = extractvalue { <4 x float>, i32 } %v, 1 + store i32 %v.err, i32 addrspace(1)* %out, align 4 + ret <4 x float> %v.vec +} + +declare <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0 +declare { <4 x float>, i32 } 
@llvm.amdgcn.image.load.2d.sl_v4f32i32s.i32(i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0 + +attributes #0 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll new file mode 100644 index 0000000..4a20c7d --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll @@ -0,0 +1,159 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s + +define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw(<8 x i32> inreg %rsrc, i16 %s, i16 %t, i16 %slice, i16 %fragid) { +; GFX9-LABEL: load_2darraymsaa_v4f32_xyzw: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_or_b32 v0, v0, v4, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: v_and_or_b32 v1, v2, v4, v1 +; GFX9-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm a16 da +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: load_2darraymsaa_v4f32_xyzw: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: v_and_or_b32 v0, v0, v4, v1 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: v_and_or_b32 v1, v2, v4, v3 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm a16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %v = call <4 x float> @llvm.amdgcn.image.load.2darraymsaa.v4f32.i16(i32 15, i16 %s, i16 %t, i16 %slice, i16 %fragid, <8 x i32> %rsrc, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i16 %s, i16 %t, i16 %slice, i16 %fragid) { +; GFX9-LABEL: load_2darraymsaa_v4f32_xyzw_tfe: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_or_b32 v0, v0, v4, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: v_and_or_b32 v1, v2, v4, v1 +; GFX9-NEXT: image_load v[0:4], v[0:1], s[0:7] dmask:0xf unorm a16 tfe da +; GFX9-NEXT: v_mov_b32_e32 v5, s10 +; GFX9-NEXT: v_mov_b32_e32 v6, s11 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_dword v[5:6], v4, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: 
load_2darraymsaa_v4f32_xyzw_tfe: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: v_and_or_b32 v0, v0, v4, v1 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: v_and_or_b32 v1, v2, v4, v3 +; GFX10-NEXT: v_mov_b32_e32 v5, s10 +; GFX10-NEXT: v_mov_b32_e32 v6, s11 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_load v[0:4], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm a16 tfe +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dword v[5:6], v4, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: ; return to shader part epilog + %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2darraymsaa.sl_v4f32i32s.i16(i32 15, i16 %s, i16 %t, i16 %slice, i16 %fragid, <8 x i32> %rsrc, i32 1, i32 0) + %v.vec = extractvalue { <4 x float>, i32 } %v, 0 + %v.err = extractvalue { <4 x float>, i32 } %v, 1 + store i32 %v.err, i32 addrspace(1)* %out, align 4 + ret <4 x float> %v.vec +} + +define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i16 %s, i16 %t, i16 %slice, i16 %fragid) { +; GFX9-LABEL: load_2darraymsaa_v4f32_xyzw_tfe_lwe: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_or_b32 v0, v0, v4, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: v_and_or_b32 v1, v2, v4, v1 +; GFX9-NEXT: image_load v[0:4], v[0:1], s[0:7] dmask:0xf unorm a16 tfe lwe da +; GFX9-NEXT: v_mov_b32_e32 v5, s10 +; GFX9-NEXT: v_mov_b32_e32 v6, s11 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_dword v[5:6], v4, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: load_2darraymsaa_v4f32_xyzw_tfe_lwe: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: v_and_or_b32 v0, v0, v4, v1 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: v_and_or_b32 v1, v2, v4, v3 +; GFX10-NEXT: v_mov_b32_e32 v5, s10 +; GFX10-NEXT: v_mov_b32_e32 v6, s11 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_load v[0:4], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm a16 tfe lwe +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dword v[5:6], v4, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: ; return to shader part epilog + %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2darraymsaa.sl_v4f32i32s.i16(i32 15, i16 %s, i16 %t, i16 %slice, i16 %fragid, <8 x i32> %rsrc, i32 3, i32 0) + %v.vec = extractvalue { <4 x float>, i32 } %v, 0 + %v.err = extractvalue { <4 x float>, i32 } %v, 1 + store i32 %v.err, i32 addrspace(1)* %out, align 4 + ret <4 x float> %v.vec +} + +declare 
<4 x float> @llvm.amdgcn.image.load.2darraymsaa.v4f32.i16(i32 immarg, i16, i16, i16, i16, <8 x i32>, i32 immarg, i32 immarg) #0 +declare { <4 x float>, i32 } @llvm.amdgcn.image.load.2darraymsaa.sl_v4f32i32s.i16(i32 immarg, i16, i16, i16, i16, <8 x i32>, i32 immarg, i32 immarg) #0 + +attributes #0 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.ll new file mode 100644 index 0000000..38845b4 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.ll @@ -0,0 +1,133 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GFX6 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s + +define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice, i32 %fragid) { +; GFX6-LABEL: load_2darraymsaa_v4f32_xyzw: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_mov_b32 s0, s2 +; GFX6-NEXT: s_mov_b32 s1, s3 +; GFX6-NEXT: s_mov_b32 s2, s4 +; GFX6-NEXT: s_mov_b32 s3, s5 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s8 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm da +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: load_2darraymsaa_v4f32_xyzw: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_load v[0:3], v[0:3], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %v = call <4 x float> @llvm.amdgcn.image.load.2darraymsaa.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %slice, i32 %fragid) { +; GFX6-LABEL: load_2darraymsaa_v4f32_xyzw_tfe: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_mov_b32 s0, s2 +; GFX6-NEXT: s_mov_b32 s1, s3 +; GFX6-NEXT: s_mov_b32 s2, s4 +; GFX6-NEXT: s_mov_b32 s3, s5 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s8 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: image_load v[0:4], v[0:3], s[0:7] dmask:0xf unorm tfe da +; GFX6-NEXT: s_mov_b32 s8, s10 +; GFX6-NEXT: s_mov_b32 s9, s11 +; GFX6-NEXT: s_mov_b32 s10, -1 +; GFX6-NEXT: s_mov_b32 s11, 0xf000 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v4, off, s[8:11], 0 +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX6-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: load_2darraymsaa_v4f32_xyzw_tfe: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: v_mov_b32_e32 v5, s10 +; GFX10-NEXT: v_mov_b32_e32 v6, s11 +; GFX10-NEXT: ; implicit-def: 
$vcc_hi +; GFX10-NEXT: image_load v[0:4], v[0:3], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dword v[5:6], v4, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: ; return to shader part epilog + %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2darraymsaa.sl_v4f32i32s.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 1, i32 0) + %v.vec = extractvalue { <4 x float>, i32 } %v, 0 + %v.err = extractvalue { <4 x float>, i32 } %v, 1 + store i32 %v.err, i32 addrspace(1)* %out, align 4 + ret <4 x float> %v.vec +} + +define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %slice, i32 %fragid) { +; GFX6-LABEL: load_2darraymsaa_v4f32_xyzw_tfe_lwe: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_mov_b32 s0, s2 +; GFX6-NEXT: s_mov_b32 s1, s3 +; GFX6-NEXT: s_mov_b32 s2, s4 +; GFX6-NEXT: s_mov_b32 s3, s5 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s8 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: image_load v[0:4], v[0:3], s[0:7] dmask:0xf unorm tfe lwe da +; GFX6-NEXT: s_mov_b32 s8, s10 +; GFX6-NEXT: s_mov_b32 s9, s11 +; GFX6-NEXT: s_mov_b32 s10, -1 +; GFX6-NEXT: s_mov_b32 s11, 0xf000 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v4, off, s[8:11], 0 +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX6-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: load_2darraymsaa_v4f32_xyzw_tfe_lwe: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: v_mov_b32_e32 v5, s10 +; GFX10-NEXT: v_mov_b32_e32 v6, s11 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_load v[0:4], v[0:3], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe lwe +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dword v[5:6], v4, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: ; return to shader part epilog + %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2darraymsaa.sl_v4f32i32s.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 3, i32 0) + %v.vec = extractvalue { <4 x float>, i32 } %v, 0 + %v.err = extractvalue { <4 x float>, i32 } %v, 1 + store i32 %v.err, i32 addrspace(1)* %out, align 4 + ret <4 x float> %v.vec +} + +declare <4 x float> @llvm.amdgcn.image.load.2darraymsaa.v4f32.i32(i32 immarg, i32, i32, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0 +declare { <4 x float>, i32 } @llvm.amdgcn.image.load.2darraymsaa.sl_v4f32i32s.i32(i32 immarg, i32, i32, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0 + +attributes #0 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll new file mode 100644 index 0000000..cc21bab --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll @@ -0,0 +1,159 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s + +define amdgpu_ps <4 x 
float> @load_3d_v4f32_xyzw(<8 x i32> inreg %rsrc, i16 %s, i16 %t, i16 %r) { +; GFX9-LABEL: load_3d_v4f32_xyzw: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: s_mov_b32 s2, s4 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: s_lshl_b32 s8, s0, 16 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: v_and_or_b32 v0, v0, v3, v1 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: v_and_or_b32 v1, v2, v3, s8 +; GFX9-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm a16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: load_3d_v4f32_xyzw: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: s_lshl_b32 s8, s0, 16 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: v_and_or_b32 v0, v0, v3, v1 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: v_and_or_b32 v1, v2, v3, s8 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %v = call <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i16(i32 15, i16 %s, i16 %t, i16 %r, <8 x i32> %rsrc, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i16 %s, i16 %t, i16 %r) { +; GFX9-LABEL: load_3d_v4f32_xyzw_tfe: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: s_mov_b32 s2, s4 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: s_lshl_b32 s8, s0, 16 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: v_and_or_b32 v0, v0, v3, v1 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: v_and_or_b32 v1, v2, v3, s8 +; GFX9-NEXT: image_load v[0:4], v[0:1], s[0:7] dmask:0xf unorm a16 tfe +; GFX9-NEXT: v_mov_b32_e32 v5, s10 +; GFX9-NEXT: v_mov_b32_e32 v6, s11 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_dword v[5:6], v4, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: load_3d_v4f32_xyzw_tfe: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: s_lshl_b32 s8, s0, 16 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: v_and_or_b32 v0, v0, v3, v1 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: v_and_or_b32 v1, v2, v3, s8 +; GFX10-NEXT: v_mov_b32_e32 v5, s10 +; GFX10-NEXT: v_mov_b32_e32 v6, s11 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_load v[0:4], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 tfe +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dword v[5:6], v4, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: ; return to shader part epilog + %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.3d.sl_v4f32i32s.i16(i32 15, i16 %s, i16 %t, i16 
%r, <8 x i32> %rsrc, i32 1, i32 0) + %v.vec = extractvalue { <4 x float>, i32 } %v, 0 + %v.err = extractvalue { <4 x float>, i32 } %v, 1 + store i32 %v.err, i32 addrspace(1)* %out, align 4 + ret <4 x float> %v.vec +} + +define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i16 %s, i16 %t, i16 %r) { +; GFX9-LABEL: load_3d_v4f32_xyzw_tfe_lwe: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: s_mov_b32 s2, s4 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: s_lshl_b32 s8, s0, 16 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: v_and_or_b32 v0, v0, v3, v1 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: v_and_or_b32 v1, v2, v3, s8 +; GFX9-NEXT: image_load v[0:4], v[0:1], s[0:7] dmask:0xf unorm a16 tfe lwe +; GFX9-NEXT: v_mov_b32_e32 v5, s10 +; GFX9-NEXT: v_mov_b32_e32 v6, s11 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_dword v[5:6], v4, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: load_3d_v4f32_xyzw_tfe_lwe: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: s_lshl_b32 s8, s0, 16 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: v_and_or_b32 v0, v0, v3, v1 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: v_and_or_b32 v1, v2, v3, s8 +; GFX10-NEXT: v_mov_b32_e32 v5, s10 +; GFX10-NEXT: v_mov_b32_e32 v6, s11 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_load v[0:4], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 tfe lwe +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dword v[5:6], v4, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: ; return to shader part epilog + %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.3d.sl_v4f32i32s.i16(i32 15, i16 %s, i16 %t, i16 %r, <8 x i32> %rsrc, i32 3, i32 0) + %v.vec = extractvalue { <4 x float>, i32 } %v, 0 + %v.err = extractvalue { <4 x float>, i32 } %v, 1 + store i32 %v.err, i32 addrspace(1)* %out, align 4 + ret <4 x float> %v.vec +} + +declare <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i16(i32 immarg, i16, i16, i16, <8 x i32>, i32 immarg, i32 immarg) #0 +declare { <4 x float>, i32 } @llvm.amdgcn.image.load.3d.sl_v4f32i32s.i16(i32 immarg, i16, i16, i16, <8 x i32>, i32 immarg, i32 immarg) #0 + +attributes #0 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll new file mode 100644 index 0000000..e3da987 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll @@ -0,0 +1,133 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GFX6 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s + +define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %r) { +; GFX6-LABEL: load_3d_v4f32_xyzw: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_mov_b32 s0, s2 +; GFX6-NEXT: s_mov_b32 s1, s3 
+; GFX6-NEXT: s_mov_b32 s2, s4 +; GFX6-NEXT: s_mov_b32 s3, s5 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s8 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: image_load v[0:3], v[0:2], s[0:7] dmask:0xf unorm +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: load_3d_v4f32_xyzw: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_load v[0:3], v[0:2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %v = call <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %r) { +; GFX6-LABEL: load_3d_v4f32_xyzw_tfe: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_mov_b32 s0, s2 +; GFX6-NEXT: s_mov_b32 s1, s3 +; GFX6-NEXT: s_mov_b32 s2, s4 +; GFX6-NEXT: s_mov_b32 s3, s5 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s8 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: image_load v[0:4], v[0:2], s[0:7] dmask:0xf unorm tfe +; GFX6-NEXT: s_mov_b32 s8, s10 +; GFX6-NEXT: s_mov_b32 s9, s11 +; GFX6-NEXT: s_mov_b32 s10, -1 +; GFX6-NEXT: s_mov_b32 s11, 0xf000 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v4, off, s[8:11], 0 +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX6-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: load_3d_v4f32_xyzw_tfe: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: v_mov_b32_e32 v5, s10 +; GFX10-NEXT: v_mov_b32_e32 v6, s11 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_load v[0:4], v[0:2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm tfe +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dword v[5:6], v4, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: ; return to shader part epilog + %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.3d.sl_v4f32i32s.i32(i32 15, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 1, i32 0) + %v.vec = extractvalue { <4 x float>, i32 } %v, 0 + %v.err = extractvalue { <4 x float>, i32 } %v, 1 + store i32 %v.err, i32 addrspace(1)* %out, align 4 + ret <4 x float> %v.vec +} + +define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %r) { +; GFX6-LABEL: load_3d_v4f32_xyzw_tfe_lwe: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_mov_b32 s0, s2 +; GFX6-NEXT: s_mov_b32 s1, s3 +; GFX6-NEXT: s_mov_b32 s2, s4 +; GFX6-NEXT: s_mov_b32 s3, s5 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s8 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: image_load v[0:4], v[0:2], s[0:7] dmask:0xf unorm tfe lwe +; GFX6-NEXT: s_mov_b32 s8, s10 +; GFX6-NEXT: s_mov_b32 s9, s11 +; GFX6-NEXT: s_mov_b32 s10, -1 +; GFX6-NEXT: s_mov_b32 s11, 0xf000 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v4, off, s[8:11], 
0 +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX6-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: load_3d_v4f32_xyzw_tfe_lwe: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: v_mov_b32_e32 v5, s10 +; GFX10-NEXT: v_mov_b32_e32 v6, s11 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_load v[0:4], v[0:2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm tfe lwe +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dword v[5:6], v4, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: ; return to shader part epilog + %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.3d.sl_v4f32i32s.i32(i32 15, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 3, i32 0) + %v.vec = extractvalue { <4 x float>, i32 } %v, 0 + %v.err = extractvalue { <4 x float>, i32 } %v, 1 + store i32 %v.err, i32 addrspace(1)* %out, align 4 + ret <4 x float> %v.vec +} + +declare <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i32(i32 immarg, i32, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0 +declare { <4 x float>, i32 } @llvm.amdgcn.image.load.3d.sl_v4f32i32s.i32(i32 immarg, i32, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0 + +attributes #0 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.ltolz.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.ltolz.a16.ll index c9af25c..f6d16e8 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.ltolz.a16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.ltolz.a16.ll @@ -1,821 +1,566 @@ -; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -stop-after=legalizer -o - %s | FileCheck -check-prefix=GFX9 %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -stop-after=legalizer -o - %s | FileCheck -check-prefix=GFX10 %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -o - %s | FileCheck -check-prefix=GFX10 %s define amdgpu_ps <4 x float> @sample_l_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %lod) { - ; GFX9-LABEL: name: sample_l_1d - ; GFX9: bb.1.main_body: - ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0 - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), 
[[COPY7]](s32) - ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32) - ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[DEF]](s32) - ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.l.1d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), 0, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9: $vgpr0 = COPY [[UV]](s32) - ; GFX9: $vgpr1 = COPY [[UV1]](s32) - ; GFX9: $vgpr2 = COPY [[UV2]](s32) - ; GFX9: $vgpr3 = COPY [[UV3]](s32) - ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - ; GFX10-LABEL: name: sample_l_1d - ; GFX10: bb.1.main_body: - ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0 - ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32) - ; GFX10: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[DEF]](s32) - ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.l.1d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), 0, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10: $vgpr0 = COPY [[UV]](s32) - ; GFX10: $vgpr1 = COPY [[UV1]](s32) - ; GFX10: $vgpr2 = COPY [[UV2]](s32) - ; GFX10: $vgpr3 = COPY [[UV3]](s32) - ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +; GFX9-LABEL: sample_l_1d: +; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: s_mov_b32 s2, s4 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: s_lshl_b32 s12, s0, 16 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s3, s5 
+; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: v_and_or_b32 v0, v0, v1, s12 +; GFX9-NEXT: image_sample_lz v[0:3], v0, s[0:7], s[8:11] dmask:0xf a16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: sample_l_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s8, s10 +; GFX10-NEXT: s_mov_b32 s10, s12 +; GFX10-NEXT: s_lshl_b32 s12, s0, 16 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: s_mov_b32 s9, s11 +; GFX10-NEXT: s_mov_b32 s11, s13 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xffff, s12 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_sample_lz v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.l.1d.v4f32.f16(i32 15, half %s, half 0.000000e+00, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) ret <4 x float> %v } define amdgpu_ps <4 x float> @sample_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %lod) { - ; GFX9-LABEL: name: sample_l_2d - ; GFX9: bb.1.main_body: - ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1 - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32) - ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) - ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32) - ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.l.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9: $vgpr0 = COPY [[UV]](s32) - ; GFX9: $vgpr1 = COPY [[UV1]](s32) - ; GFX9: $vgpr2 = COPY [[UV2]](s32) - ; GFX9: $vgpr3 = COPY [[UV3]](s32) - ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - ; GFX10-LABEL: name: 
sample_l_2d - ; GFX10: bb.1.main_body: - ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1 - ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32) - ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) - ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32) - ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.l.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10: $vgpr0 = COPY [[UV]](s32) - ; GFX10: $vgpr1 = COPY [[UV1]](s32) - ; GFX10: $vgpr2 = COPY [[UV2]](s32) - ; GFX10: $vgpr3 = COPY [[UV3]](s32) - ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +; GFX9-LABEL: sample_l_2d: +; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v1 +; GFX9-NEXT: image_sample_lz v[0:3], v0, s[0:7], s[8:11] dmask:0xf a16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: sample_l_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: s_mov_b32 s8, s10 +; GFX10-NEXT: s_mov_b32 s9, s11 +; GFX10-NEXT: s_mov_b32 s10, s12 +; GFX10-NEXT: s_mov_b32 s11, s13 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xffff, v1 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_sample_lz v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; 
GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.l.2d.v4f32.f16(i32 15, half %s, half %t, half -0.000000e+00, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) ret <4 x float> %v } define amdgpu_ps <4 x float> @sample_c_l_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %lod) { - ; GFX9-LABEL: name: sample_c_l_1d - ; GFX9: bb.1.main_body: - ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1 - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX9: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) - ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) - ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[DEF]](s32) - ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>) - ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.l.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9: $vgpr0 = COPY [[UV]](s32) - ; GFX9: $vgpr1 = COPY [[UV1]](s32) - ; GFX9: $vgpr2 = COPY [[UV2]](s32) - ; GFX9: $vgpr3 = COPY [[UV3]](s32) - ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - ; GFX10-LABEL: name: sample_c_l_1d - ; GFX10: bb.1.main_body: - ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1 - ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; 
GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX10: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) - ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) - ; GFX10: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[DEF]](s32) - ; GFX10: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>) - ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.l.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10: $vgpr0 = COPY [[UV]](s32) - ; GFX10: $vgpr1 = COPY [[UV1]](s32) - ; GFX10: $vgpr2 = COPY [[UV2]](s32) - ; GFX10: $vgpr3 = COPY [[UV3]](s32) - ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +; GFX9-LABEL: sample_c_l_1d: +; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: s_mov_b32 s2, s4 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX9-NEXT: s_lshl_b32 s12, s0, 16 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: v_and_or_b32 v1, v1, v2, s12 +; GFX9-NEXT: image_sample_c_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf a16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: sample_c_l_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s8, s10 +; GFX10-NEXT: s_mov_b32 s10, s12 +; GFX10-NEXT: s_lshl_b32 s12, s0, 16 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: s_mov_b32 s9, s11 +; GFX10-NEXT: s_mov_b32 s11, s13 +; GFX10-NEXT: v_and_or_b32 v1, v1, 0xffff, s12 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_sample_c_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.l.1d.v4f32.f16(i32 15, float %zcompare, half %s, half -2.000000e+00, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) ret <4 x float> %v } define amdgpu_ps <4 x float> @sample_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %t, half %lod) { - ; GFX9-LABEL: name: sample_c_l_2d - ; GFX9: bb.1.main_body: - ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, 
$sgpr13, $vgpr0, $vgpr1, $vgpr2 - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX9: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) - ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) - ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) - ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32) - ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>) - ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.l.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9: $vgpr0 = COPY [[UV]](s32) - ; GFX9: $vgpr1 = COPY [[UV1]](s32) - ; GFX9: $vgpr2 = COPY [[UV2]](s32) - ; GFX9: $vgpr3 = COPY [[UV3]](s32) - ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - ; GFX10-LABEL: name: sample_c_l_2d - ; GFX10: bb.1.main_body: - ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 - ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX10: 
[[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) - ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) - ; GFX10: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) - ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32) - ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.l.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10: $vgpr0 = COPY [[UV]](s32) - ; GFX10: $vgpr1 = COPY [[UV1]](s32) - ; GFX10: $vgpr2 = COPY [[UV2]](s32) - ; GFX10: $vgpr3 = COPY [[UV3]](s32) - ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +; GFX9-LABEL: sample_c_l_2d: +; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: v_and_or_b32 v1, v1, v3, v2 +; GFX9-NEXT: image_sample_c_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf a16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: sample_c_l_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: s_mov_b32 s8, s10 +; GFX10-NEXT: s_mov_b32 s9, s11 +; GFX10-NEXT: s_mov_b32 s10, s12 +; GFX10-NEXT: s_mov_b32 s11, s13 +; GFX10-NEXT: v_and_or_b32 v1, v1, 0xffff, v2 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_sample_c_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.l.2d.v4f32.f16(i32 15, float %zcompare, half %s, half %t, half 0.000000e+00, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) ret <4 x float> %v } define amdgpu_ps <4 x float> @sample_l_o_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, half %s, half %lod) { - ; GFX9-LABEL: name: sample_l_o_1d - ; GFX9: bb.1.main_body: - ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1 - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; 
GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX9: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) - ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) - ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[DEF]](s32) - ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>) - ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.l.o.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9: $vgpr0 = COPY [[UV]](s32) - ; GFX9: $vgpr1 = COPY [[UV1]](s32) - ; GFX9: $vgpr2 = COPY [[UV2]](s32) - ; GFX9: $vgpr3 = COPY [[UV3]](s32) - ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - ; GFX10-LABEL: name: sample_l_o_1d - ; GFX10: bb.1.main_body: - ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1 - ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX10: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) - ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) - ; GFX10: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[DEF]](s32) - ; GFX10: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>) - ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.l.o.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: 
(dereferenceable load 16 from custom "TargetCustom8") - ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10: $vgpr0 = COPY [[UV]](s32) - ; GFX10: $vgpr1 = COPY [[UV1]](s32) - ; GFX10: $vgpr2 = COPY [[UV2]](s32) - ; GFX10: $vgpr3 = COPY [[UV3]](s32) - ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +; GFX9-LABEL: sample_l_o_1d: +; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: s_mov_b32 s2, s4 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX9-NEXT: s_lshl_b32 s12, s0, 16 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: v_and_or_b32 v1, v1, v2, s12 +; GFX9-NEXT: image_sample_lz_o v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf a16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: sample_l_o_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s8, s10 +; GFX10-NEXT: s_mov_b32 s10, s12 +; GFX10-NEXT: s_lshl_b32 s12, s0, 16 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: s_mov_b32 s9, s11 +; GFX10-NEXT: s_mov_b32 s11, s13 +; GFX10-NEXT: v_and_or_b32 v1, v1, 0xffff, s12 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_sample_lz_o v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.l.o.1d.v4f32.f16(i32 15, i32 %offset, half %s, half 0.000000e+00, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) ret <4 x float> %v } define amdgpu_ps <4 x float> @sample_l_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, half %s, half %t, half %lod) { - ; GFX9-LABEL: name: sample_l_o_2d - ; GFX9: bb.1.main_body: - ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; 
GFX9: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) - ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) - ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) - ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32) - ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>) - ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.l.o.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9: $vgpr0 = COPY [[UV]](s32) - ; GFX9: $vgpr1 = COPY [[UV1]](s32) - ; GFX9: $vgpr2 = COPY [[UV2]](s32) - ; GFX9: $vgpr3 = COPY [[UV3]](s32) - ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - ; GFX10-LABEL: name: sample_l_o_2d - ; GFX10: bb.1.main_body: - ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 - ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX10: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) - ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) - ; GFX10: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) - ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32) - ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.l.o.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10: $vgpr0 = COPY [[UV]](s32) - ; GFX10: $vgpr1 = COPY [[UV1]](s32) - ; GFX10: $vgpr2 = COPY [[UV2]](s32) - ; GFX10: $vgpr3 = COPY [[UV3]](s32) - ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +; GFX9-LABEL: sample_l_o_2d: +; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: 
v_mov_b32_e32 v3, 0xffff +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: v_and_or_b32 v1, v1, v3, v2 +; GFX9-NEXT: image_sample_lz_o v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf a16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: sample_l_o_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: s_mov_b32 s8, s10 +; GFX10-NEXT: s_mov_b32 s9, s11 +; GFX10-NEXT: s_mov_b32 s10, s12 +; GFX10-NEXT: s_mov_b32 s11, s13 +; GFX10-NEXT: v_and_or_b32 v1, v1, 0xffff, v2 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_sample_lz_o v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.l.o.2d.v4f32.f16(i32 15, i32 %offset, half %s, half %t, half 0.000000e+00, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) ret <4 x float> %v } define amdgpu_ps <4 x float> @sample_c_l_o_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %s, half %lod) { - ; GFX9-LABEL: name: sample_c_l_o_1d - ; GFX9: bb.1.main_body: - ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX9: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) - ; GFX9: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY13]](s32) - ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) - ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[DEF]](s32) - ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>) - ; 
GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.l.o.1d), 15, [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9: $vgpr0 = COPY [[UV]](s32) - ; GFX9: $vgpr1 = COPY [[UV1]](s32) - ; GFX9: $vgpr2 = COPY [[UV2]](s32) - ; GFX9: $vgpr3 = COPY [[UV3]](s32) - ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - ; GFX10-LABEL: name: sample_c_l_o_1d - ; GFX10: bb.1.main_body: - ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 - ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX10: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) - ; GFX10: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY13]](s32) - ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) - ; GFX10: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[DEF]](s32) - ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.l.o.1d), 15, [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), 0, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10: $vgpr0 = COPY [[UV]](s32) - ; GFX10: $vgpr1 = COPY [[UV1]](s32) - ; GFX10: $vgpr2 = COPY [[UV2]](s32) - ; GFX10: $vgpr3 = COPY [[UV3]](s32) - ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +; GFX9-LABEL: sample_c_l_o_1d: +; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: s_mov_b32 s2, s4 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX9-NEXT: s_lshl_b32 s12, s0, 16 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s5, 
s7 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: v_and_or_b32 v2, v2, v3, s12 +; GFX9-NEXT: image_sample_c_lz_o v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf a16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: sample_c_l_o_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s8, s10 +; GFX10-NEXT: s_mov_b32 s10, s12 +; GFX10-NEXT: s_lshl_b32 s12, s0, 16 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: s_mov_b32 s9, s11 +; GFX10-NEXT: s_mov_b32 s11, s13 +; GFX10-NEXT: v_and_or_b32 v2, v2, 0xffff, s12 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_sample_c_lz_o v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.l.o.1d.v4f32.f16(i32 15, i32 %offset, float %zcompare, half %s, half 0.000000e+00, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) ret <4 x float> %v } define amdgpu_ps <4 x float> @sample_c_l_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %s, half %t, half %lod) { - ; GFX9-LABEL: name: sample_c_l_o_2d - ; GFX9: bb.1.main_body: - ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX9: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) - ; GFX9: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY13]](s32) - ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) - ; GFX9: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32) - ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32) - ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>) - ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.l.o.2d), 15, [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 
0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9: $vgpr0 = COPY [[UV]](s32) - ; GFX9: $vgpr1 = COPY [[UV1]](s32) - ; GFX9: $vgpr2 = COPY [[UV2]](s32) - ; GFX9: $vgpr3 = COPY [[UV3]](s32) - ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - ; GFX10-LABEL: name: sample_c_l_o_2d - ; GFX10: bb.1.main_body: - ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX10: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) - ; GFX10: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY13]](s32) - ; GFX10: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) - ; GFX10: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32) - ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32) - ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.l.o.2d), 15, [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10: $vgpr0 = COPY [[UV]](s32) - ; GFX10: $vgpr1 = COPY [[UV1]](s32) - ; GFX10: $vgpr2 = COPY [[UV2]](s32) - ; GFX10: $vgpr3 = COPY [[UV3]](s32) - ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +; GFX9-LABEL: sample_c_l_o_2d: +; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: v_and_or_b32 v2, v2, v4, v3 +; GFX9-NEXT: 
image_sample_c_lz_o v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf a16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: sample_c_l_o_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: s_mov_b32 s8, s10 +; GFX10-NEXT: s_mov_b32 s9, s11 +; GFX10-NEXT: s_mov_b32 s10, s12 +; GFX10-NEXT: s_mov_b32 s11, s13 +; GFX10-NEXT: v_and_or_b32 v2, v2, 0xffff, v3 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_sample_c_lz_o v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.l.o.2d.v4f32.f16(i32 15, i32 %offset, float %zcompare, half %s, half %t, half 0.000000e+00, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) ret <4 x float> %v } define amdgpu_ps <4 x float> @gather4_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %lod) { - ; GFX9-LABEL: name: gather4_l_2d - ; GFX9: bb.1.main_body: - ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1 - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32) - ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) - ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32) - ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.l.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9: $vgpr0 = COPY [[UV]](s32) - ; GFX9: $vgpr1 = COPY [[UV1]](s32) - ; GFX9: $vgpr2 = COPY [[UV2]](s32) - ; GFX9: $vgpr3 = COPY [[UV3]](s32) - ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - ; GFX10-LABEL: name: gather4_l_2d - ; GFX10: bb.1.main_body: - ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, 
$sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1 - ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32) - ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) - ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32) - ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.l.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10: $vgpr0 = COPY [[UV]](s32) - ; GFX10: $vgpr1 = COPY [[UV1]](s32) - ; GFX10: $vgpr2 = COPY [[UV2]](s32) - ; GFX10: $vgpr3 = COPY [[UV3]](s32) - ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +; GFX9-LABEL: gather4_l_2d: +; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v1 +; GFX9-NEXT: image_gather4_lz v[0:3], v0, s[0:7], s[8:11] dmask:0xf a16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: gather4_l_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: s_mov_b32 s8, s10 +; GFX10-NEXT: s_mov_b32 s9, s11 +; GFX10-NEXT: s_mov_b32 s10, s12 +; GFX10-NEXT: s_mov_b32 s11, s13 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xffff, v1 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_gather4_lz v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> 
@llvm.amdgcn.image.gather4.l.2d.v4f32.f16(i32 15, half %s, half %t, half 0.000000e+00, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) ret <4 x float> %v } define amdgpu_ps <4 x float> @gather4_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %t, half %lod) { - ; GFX9-LABEL: name: gather4_c_l_2d - ; GFX9: bb.1.main_body: - ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX9: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) - ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) - ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) - ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32) - ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>) - ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.c.l.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9: $vgpr0 = COPY [[UV]](s32) - ; GFX9: $vgpr1 = COPY [[UV1]](s32) - ; GFX9: $vgpr2 = COPY [[UV2]](s32) - ; GFX9: $vgpr3 = COPY [[UV3]](s32) - ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - ; GFX10-LABEL: name: gather4_c_l_2d - ; GFX10: bb.1.main_body: - ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 - ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GFX10: 
[[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX10: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) - ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) - ; GFX10: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) - ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32) - ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.c.l.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10: $vgpr0 = COPY [[UV]](s32) - ; GFX10: $vgpr1 = COPY [[UV1]](s32) - ; GFX10: $vgpr2 = COPY [[UV2]](s32) - ; GFX10: $vgpr3 = COPY [[UV3]](s32) - ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +; GFX9-LABEL: gather4_c_l_2d: +; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: v_and_or_b32 v1, v1, v3, v2 +; GFX9-NEXT: image_gather4_c_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf a16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: gather4_c_l_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: s_mov_b32 s8, s10 +; GFX10-NEXT: s_mov_b32 s9, s11 +; GFX10-NEXT: s_mov_b32 s10, s12 +; GFX10-NEXT: s_mov_b32 s11, s13 +; GFX10-NEXT: v_and_or_b32 v1, v1, 0xffff, v2 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_gather4_c_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.c.l.2d.v4f32.f16(i32 15, float %zcompare, half %s, half %t, half 0.000000e+00, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) ret <4 x float> %v } define amdgpu_ps <4 x float> @gather4_l_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, half %s, half %t, half %lod) { - ; GFX9-LABEL: name: gather4_l_o_2d - ; GFX9: bb.1.main_body: - ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, 
$vgpr2
- ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
- ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
- ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
- ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
- ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
- ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
- ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
- ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
- ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
- ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
- ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
- ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
- ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
- ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
- ; GFX9: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
- ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
- ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
- ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32)
- ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>)
- ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.l.o.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8")
- ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX9: $vgpr0 = COPY [[UV]](s32)
- ; GFX9: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9: $vgpr3 = COPY [[UV3]](s32)
- ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
- ; GFX10-LABEL: name: gather4_l_o_2d
- ; GFX10: bb.1.main_body:
- ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2
- ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
- ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
- ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
- ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
- ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
- ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
- ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
- ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
- ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
- ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
- ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
- ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
- ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
- ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
- ; GFX10: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
- ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
- ; GFX10: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
- ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32)
- ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.l.o.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8")
- ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10: $vgpr0 = COPY [[UV]](s32)
- ; GFX10: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10: $vgpr3 = COPY [[UV3]](s32)
- ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+; GFX9-LABEL: gather4_l_o_2d:
+; GFX9: ; %bb.0: ; %main_body
+; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: s_mov_b32 s0, s2
+; GFX9-NEXT: s_mov_b32 s1, s3
+; GFX9-NEXT: s_mov_b32 s2, s4
+; GFX9-NEXT: s_mov_b32 s3, s5
+; GFX9-NEXT: s_mov_b32 s4, s6
+; GFX9-NEXT: s_mov_b32 s5, s7
+; GFX9-NEXT: s_mov_b32 s6, s8
+; GFX9-NEXT: s_mov_b32 s7, s9
+; GFX9-NEXT: s_mov_b32 s8, s10
+; GFX9-NEXT: s_mov_b32 s9, s11
+; GFX9-NEXT: s_mov_b32 s10, s12
+; GFX9-NEXT: s_mov_b32 s11, s13
+; GFX9-NEXT: v_and_or_b32 v1, v1, v3, v2
+; GFX9-NEXT: image_gather4_lz_o v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf a16
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: gather4_l_o_2d:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: s_mov_b32 s0, s2
+; GFX10-NEXT: s_mov_b32 s1, s3
+; GFX10-NEXT: s_mov_b32 s2, s4
+; GFX10-NEXT: s_mov_b32 s3, s5
+; GFX10-NEXT: s_mov_b32 s4, s6
+; GFX10-NEXT: s_mov_b32 s5, s7
+; GFX10-NEXT: s_mov_b32 s6, s8
+; GFX10-NEXT: s_mov_b32 s7, s9
+; GFX10-NEXT: s_mov_b32 s8, s10
+; GFX10-NEXT: s_mov_b32 s9, s11
+; GFX10-NEXT: s_mov_b32 s10, s12
+; GFX10-NEXT: s_mov_b32 s11, s13
+; GFX10-NEXT: v_and_or_b32 v1, v1, 0xffff, v2
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: image_gather4_lz_o v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: ; return to shader part epilog
 main_body:
 %v = call <4 x float> @llvm.amdgcn.image.gather4.l.o.2d.v4f32.f16(i32 15, i32 %offset, half %s, half %t, half 0.000000e+00, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
 ret <4 x float> %v
 }
 
 define amdgpu_ps <4 x float> @gather4_c_l_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %s, half %t, half %lod) {
- ; GFX9-LABEL: name: gather4_c_l_o_2d
- ; GFX9: bb.1.main_body:
- ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
- ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
- ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
- ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
- ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
- ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
- ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
- ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
- ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
- ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
- ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
- ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
- ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
- ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
- ; GFX9: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
- ; GFX9: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY13]](s32)
- ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
- ; GFX9: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32)
- ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32)
- ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>)
- ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.c.l.o.2d), 15, [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8")
- ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX9: $vgpr0 = COPY [[UV]](s32)
- ; GFX9: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9: $vgpr3 = COPY [[UV3]](s32)
- ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
- ; GFX10-LABEL: name: gather4_c_l_o_2d
- ; GFX10: bb.1.main_body:
- ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
- ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
- ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
- ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
- ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
- ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
- ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
- ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
- ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
- ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
- ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
- ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
- ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
- ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
- ; GFX10: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
- ; GFX10: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY13]](s32)
- ; GFX10: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
- ; GFX10: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32)
- ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32)
- ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.c.l.o.2d), 15, [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8")
- ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GFX10: $vgpr0 = COPY [[UV]](s32)
- ; GFX10: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10: $vgpr3 = COPY [[UV3]](s32)
- ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+; GFX9-LABEL: gather4_c_l_o_2d:
+; GFX9: ; %bb.0: ; %main_body
+; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: s_mov_b32 s0, s2
+; GFX9-NEXT: s_mov_b32 s1, s3
+; GFX9-NEXT: s_mov_b32 s2, s4
+; GFX9-NEXT: s_mov_b32 s3, s5
+; GFX9-NEXT: s_mov_b32 s4, s6
+; GFX9-NEXT: s_mov_b32 s5, s7
+; GFX9-NEXT: s_mov_b32 s6, s8
+; GFX9-NEXT: s_mov_b32 s7, s9
+; GFX9-NEXT: s_mov_b32 s8, s10
+; GFX9-NEXT: s_mov_b32 s9, s11
+; GFX9-NEXT: s_mov_b32 s10, s12
+; GFX9-NEXT: s_mov_b32 s11, s13
+; GFX9-NEXT: v_and_or_b32 v2, v2, v4, v3
+; GFX9-NEXT: image_gather4_c_lz_o v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf a16
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: gather4_c_l_o_2d:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: s_mov_b32 s0, s2
+; GFX10-NEXT: s_mov_b32 s1, s3
+; GFX10-NEXT: s_mov_b32 s2, s4
+; GFX10-NEXT: s_mov_b32 s3, s5
+; GFX10-NEXT: s_mov_b32 s4, s6
+; GFX10-NEXT: s_mov_b32 s5, s7
+; GFX10-NEXT: s_mov_b32 s6, s8
+; GFX10-NEXT: s_mov_b32 s7, s9
+; GFX10-NEXT: s_mov_b32 s8, s10
+; GFX10-NEXT: s_mov_b32 s9, s11
+; GFX10-NEXT: s_mov_b32 s10, s12
+; GFX10-NEXT: s_mov_b32 s11, s13
+; GFX10-NEXT: v_and_or_b32 v2, v2, 0xffff, v3
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: image_gather4_c_lz_o v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: ; return to shader part epilog
 main_body:
 %v = call <4 x float> @llvm.amdgcn.image.gather4.c.l.o.2d.v4f32.f16(i32 15, i32 %offset, float %zcompare, half %s, half %t, half 0.000000e+00, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
 ret <4 x float> %v
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.d16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.d16.ll
new file mode 100644
index 0000000..a192b37
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.d16.ll
@@ -0,0 +1,118 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga -o - %s | FileCheck -check-prefix=UNPACKED %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -o - %s | FileCheck -check-prefix=PACKED %s
+
+define amdgpu_ps void @image_store_f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, half %data) {
+; UNPACKED-LABEL: image_store_f16:
+; UNPACKED: ; %bb.0:
+; UNPACKED-NEXT: s_mov_b32 s0, s2
+; UNPACKED-NEXT: s_mov_b32 s1, s3
+; UNPACKED-NEXT: s_mov_b32 s2, s4
+; UNPACKED-NEXT: s_mov_b32 s3, s5
+; UNPACKED-NEXT: s_mov_b32 s4, s6
+; UNPACKED-NEXT: s_mov_b32 s5, s7
+; UNPACKED-NEXT: s_mov_b32 s6, s8
+; UNPACKED-NEXT: s_mov_b32 s7, s9
+; UNPACKED-NEXT: image_store v2, v[0:1], s[0:7] dmask:0x1 unorm
+; UNPACKED-NEXT: s_endpgm
+;
+; PACKED-LABEL: image_store_f16:
+; PACKED: ; %bb.0:
+; PACKED-NEXT: s_mov_b32 s0, s2
+; PACKED-NEXT: s_mov_b32 s1, s3
+; PACKED-NEXT: s_mov_b32 s2, s4
+; PACKED-NEXT: s_mov_b32 s3, s5
+; PACKED-NEXT: s_mov_b32 s4, s6
+; PACKED-NEXT: s_mov_b32 s5, s7
+; PACKED-NEXT: s_mov_b32 s6, s8
+; PACKED-NEXT: s_mov_b32 s7, s9
+; PACKED-NEXT: s_nop 0
+; PACKED-NEXT: s_nop 0
+; PACKED-NEXT: image_store v2, v[0:1], s[0:7] dmask:0x1 unorm
+; PACKED-NEXT: s_endpgm
+ call void @llvm.amdgcn.image.store.2d.f16.i32(half %data, i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @image_store_v2f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, <2 x half> %in) {
+; UNPACKED-LABEL: image_store_v2f16:
+; UNPACKED: ; %bb.0:
+; UNPACKED-NEXT: s_mov_b32 s0, s2
+; UNPACKED-NEXT: s_mov_b32 s1, s3
+; UNPACKED-NEXT: s_mov_b32 s2, s4
+; UNPACKED-NEXT: s_mov_b32 s3, s5
+; UNPACKED-NEXT: s_mov_b32 s4, s6
+; UNPACKED-NEXT: s_mov_b32 s5, s7
+; UNPACKED-NEXT: s_mov_b32 s6, s8
+; UNPACKED-NEXT: s_mov_b32 s7, s9
+; UNPACKED-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; UNPACKED-NEXT: image_store v[2:3], v[0:1], s[0:7] dmask:0x3 unorm
+; UNPACKED-NEXT: s_endpgm
+;
+; PACKED-LABEL: image_store_v2f16:
+; PACKED: ; %bb.0:
+; PACKED-NEXT: s_mov_b32 s0, s2
+; PACKED-NEXT: s_mov_b32 s1, s3
+; PACKED-NEXT: s_mov_b32 s2, s4
+; PACKED-NEXT: s_mov_b32 s3, s5
+; PACKED-NEXT: s_mov_b32 s4, s6
+; PACKED-NEXT: s_mov_b32 s5, s7
+; PACKED-NEXT: s_mov_b32 s6, s8
+; PACKED-NEXT: s_mov_b32 s7, s9
+; PACKED-NEXT: s_nop 0
+; PACKED-NEXT: s_nop 0
+; PACKED-NEXT: image_store v2, v[0:1], s[0:7] dmask:0x3 unorm
+; PACKED-NEXT: s_endpgm
+ call void @llvm.amdgcn.image.store.2d.v2f16.i32(<2 x half> %in, i32 3, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
+ ret void
+}
+
+; FIXME: Broken
+; define amdgpu_ps void @image_store_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, <3 x half> %in) {
+; call void @llvm.amdgcn.image.store.2d.v3f16.i32(<3 x half> %in, i32 7, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
+; ret void
+; }
+
+define amdgpu_ps void @image_store_v4f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, <4 x half> %in) {
+; UNPACKED-LABEL: image_store_v4f16:
+; UNPACKED: ; %bb.0:
+; UNPACKED-NEXT: v_mov_b32_e32 v6, v1
+; UNPACKED-NEXT: v_mov_b32_e32 v1, v2
+; UNPACKED-NEXT: s_mov_b32 s0, s2
+; UNPACKED-NEXT: s_mov_b32 s1, s3
+; UNPACKED-NEXT: s_mov_b32 s2, s4
+; UNPACKED-NEXT: s_mov_b32 s3, s5
+; UNPACKED-NEXT: s_mov_b32 s4, s6
+; UNPACKED-NEXT: s_mov_b32 s5, s7
+; UNPACKED-NEXT: s_mov_b32 s6, s8
+; UNPACKED-NEXT: s_mov_b32 s7, s9
+; UNPACKED-NEXT: v_mov_b32_e32 v5, v0
+; UNPACKED-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; UNPACKED-NEXT: v_lshrrev_b32_e32 v4, 16, v3
+; UNPACKED-NEXT: image_store v[1:4], v[5:6], s[0:7] dmask:0xf unorm
+; UNPACKED-NEXT: s_endpgm
+;
+; PACKED-LABEL: image_store_v4f16:
+; PACKED: ; %bb.0:
+; PACKED-NEXT: s_mov_b32 s0, s2
+; PACKED-NEXT: s_mov_b32 s1, s3
+; PACKED-NEXT: s_mov_b32 s2, s4
+; PACKED-NEXT: s_mov_b32 s3, s5
+; PACKED-NEXT: s_mov_b32 s4, s6
+; PACKED-NEXT: s_mov_b32 s5, s7
+; PACKED-NEXT: s_mov_b32 s6, s8
+; PACKED-NEXT: s_mov_b32 s7, s9
+; PACKED-NEXT: s_nop 0
+; PACKED-NEXT: s_nop 0
+; PACKED-NEXT: image_store v[2:3], v[0:1], s[0:7] dmask:0xf unorm
+; PACKED-NEXT: s_endpgm
+ call void @llvm.amdgcn.image.store.2d.v4f16.i32(<4 x half> %in, i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
+ ret void
+}
+
+declare void @llvm.amdgcn.image.store.2d.f16.i32(half, i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0
+declare void @llvm.amdgcn.image.store.2d.v2f16.i32(<2 x half>, i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0
+declare void @llvm.amdgcn.image.store.2d.v3f16.i32(<3 x half>, i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0
+declare void @llvm.amdgcn.image.store.2d.v4f16.i32(<4 x half>, i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0
+
+attributes #0 = { nounwind writeonly }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.ll
new file mode 100644
index 0000000..ed2ed40
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.ll
@@ -0,0 +1,451 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -o - %s | FileCheck -check-prefix=GFX6 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -o - %s | FileCheck -check-prefix=GFX8 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -o - %s | FileCheck -check-prefix=GFX10 %s
+
+define amdgpu_ps void @image_store_f32(<8 x i32> inreg %rsrc, i32 %s, i32 %t, float %data) {
+; GFX6-LABEL: image_store_f32:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_mov_b32 s0, s2
+; GFX6-NEXT: s_mov_b32 s1, s3
+; GFX6-NEXT: s_mov_b32 s2, s4
+; GFX6-NEXT: s_mov_b32 s3, s5
+; GFX6-NEXT: s_mov_b32 s4, s6
+; GFX6-NEXT: s_mov_b32 s5, s7
+; GFX6-NEXT: s_mov_b32 s6, s8
+; GFX6-NEXT: s_mov_b32 s7, s9
+; GFX6-NEXT: image_store v2, v[0:1], s[0:7] dmask:0x1 unorm
+; GFX6-NEXT: s_endpgm
+;
+; GFX8-LABEL: image_store_f32:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_mov_b32 s0, s2
+; GFX8-NEXT: s_mov_b32 s1, s3
+; GFX8-NEXT: s_mov_b32 s2, s4
+; GFX8-NEXT: s_mov_b32 s3, s5
+; GFX8-NEXT: s_mov_b32 s4, s6
+; GFX8-NEXT: s_mov_b32 s5, s7
+; GFX8-NEXT: s_mov_b32 s6, s8
+; GFX8-NEXT: s_mov_b32 s7, s9
+; GFX8-NEXT: image_store v2, v[0:1], s[0:7] dmask:0x1 unorm
+; GFX8-NEXT: s_endpgm
+;
+; GFX10-LABEL: image_store_f32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_mov_b32 s0, s2
+; GFX10-NEXT: s_mov_b32 s1, s3
+; GFX10-NEXT: s_mov_b32 s2, s4
+; GFX10-NEXT: s_mov_b32 s3, s5
+; GFX10-NEXT: s_mov_b32 s4, s6
+; GFX10-NEXT: s_mov_b32 s5, s7
+; GFX10-NEXT: s_mov_b32 s6, s8
+; GFX10-NEXT: s_mov_b32 s7, s9
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: image_store v2, v[0:1], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D unorm
+; GFX10-NEXT: s_endpgm
+ call void @llvm.amdgcn.image.store.2d.f32.i32(float %data, i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @image_store_v2f32(<8 x i32> inreg %rsrc, i32 %s, i32 %t, <2 x float> %in) {
+; GFX6-LABEL: image_store_v2f32:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_mov_b32 s0, s2
+; GFX6-NEXT: s_mov_b32 s1, s3
+; GFX6-NEXT: s_mov_b32 s2, s4
+; GFX6-NEXT: s_mov_b32 s3, s5
+; GFX6-NEXT: s_mov_b32 s4, s6
+; GFX6-NEXT: s_mov_b32 s5, s7
+; GFX6-NEXT: s_mov_b32 s6, s8
+; GFX6-NEXT: s_mov_b32 s7, s9
+; GFX6-NEXT: image_store v[2:3], v[0:1], s[0:7] dmask:0x3 unorm
+; GFX6-NEXT: s_endpgm
+;
+; GFX8-LABEL: image_store_v2f32:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_mov_b32 s0, s2
+; GFX8-NEXT: s_mov_b32 s1, s3
+; GFX8-NEXT: s_mov_b32 s2, s4
+; GFX8-NEXT: s_mov_b32 s3, s5
+; GFX8-NEXT: s_mov_b32 s4, s6
+; GFX8-NEXT: s_mov_b32 s5, s7
+; GFX8-NEXT: s_mov_b32 s6, s8
+; GFX8-NEXT: s_mov_b32 s7, s9
+; GFX8-NEXT: image_store v[2:3], v[0:1], s[0:7] dmask:0x3 unorm
+; GFX8-NEXT: s_endpgm
+;
+; GFX10-LABEL: image_store_v2f32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_mov_b32 s0, s2
+; GFX10-NEXT: s_mov_b32 s1, s3
+; GFX10-NEXT: s_mov_b32 s2, s4
+; GFX10-NEXT: s_mov_b32 s3, s5
+; GFX10-NEXT: s_mov_b32 s4, s6
+; GFX10-NEXT: s_mov_b32 s5, s7
+; GFX10-NEXT: s_mov_b32 s6, s8
+; GFX10-NEXT: s_mov_b32 s7, s9
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: image_store v[2:3], v[0:1], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D unorm
+; GFX10-NEXT: s_endpgm
+ call void @llvm.amdgcn.image.store.2d.v2f32.i32(<2 x float> %in, i32 3, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @image_store_v3f32(<8 x i32> inreg %rsrc, i32 %s, i32 %t, <3 x float> %in) {
+; GFX6-LABEL: image_store_v3f32:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_mov_b32 s0, s2
+; GFX6-NEXT: s_mov_b32 s1, s3
+; GFX6-NEXT: s_mov_b32 s2, s4
+; GFX6-NEXT: s_mov_b32 s3, s5
+; GFX6-NEXT: s_mov_b32 s4, s6
+; GFX6-NEXT: s_mov_b32 s5, s7
+; GFX6-NEXT: s_mov_b32 s6, s8
+; GFX6-NEXT: s_mov_b32 s7, s9
+; GFX6-NEXT: image_store v[2:4], v[0:1], s[0:7] dmask:0x7 unorm
+; GFX6-NEXT: s_endpgm
+;
+; GFX8-LABEL: image_store_v3f32:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_mov_b32 s0, s2
+; GFX8-NEXT: s_mov_b32 s1, s3
+; GFX8-NEXT: s_mov_b32 s2, s4
+; GFX8-NEXT: s_mov_b32 s3, s5
+; GFX8-NEXT: s_mov_b32 s4, s6
+; GFX8-NEXT: s_mov_b32 s5, s7
+; GFX8-NEXT: s_mov_b32 s6, s8
+; GFX8-NEXT: s_mov_b32 s7, s9
+; GFX8-NEXT: image_store v[2:4], v[0:1], s[0:7] dmask:0x7 unorm
+; GFX8-NEXT: s_endpgm
+;
+; GFX10-LABEL: image_store_v3f32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_mov_b32 s0, s2
+; GFX10-NEXT: s_mov_b32 s1, s3
+; GFX10-NEXT: s_mov_b32 s2, s4
+; GFX10-NEXT: s_mov_b32 s3, s5
+; GFX10-NEXT: s_mov_b32 s4, s6
+; GFX10-NEXT: s_mov_b32 s5, s7
+; GFX10-NEXT: s_mov_b32 s6, s8
+; GFX10-NEXT: s_mov_b32 s7, s9
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: image_store v[2:4], v[0:1], s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_2D unorm
+; GFX10-NEXT: s_endpgm
+ call void @llvm.amdgcn.image.store.2d.v3f32.i32(<3 x float> %in, i32 7, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @image_store_v4f32(<8 x i32> inreg %rsrc, i32 %s, i32 %t, <4 x float> %in) {
+; GFX6-LABEL: image_store_v4f32:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_mov_b32 s0, s2
+; GFX6-NEXT: s_mov_b32 s1, s3
+; GFX6-NEXT: s_mov_b32 s2, s4
+; GFX6-NEXT: s_mov_b32 s3, s5
+; GFX6-NEXT: s_mov_b32 s4, s6
+; GFX6-NEXT: s_mov_b32 s5, s7
+; GFX6-NEXT: s_mov_b32 s6, s8
+; GFX6-NEXT: s_mov_b32 s7, s9
+; GFX6-NEXT: image_store v[2:5], v[0:1], s[0:7] dmask:0xf unorm
+; GFX6-NEXT: s_endpgm
+;
+; GFX8-LABEL: image_store_v4f32:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_mov_b32 s0, s2
+; GFX8-NEXT: s_mov_b32 s1, s3
+; GFX8-NEXT: s_mov_b32 s2, s4
+; GFX8-NEXT: s_mov_b32 s3, s5
+; GFX8-NEXT: s_mov_b32 s4, s6
+; GFX8-NEXT: s_mov_b32 s5, s7
+; GFX8-NEXT: s_mov_b32 s6, s8
+; GFX8-NEXT: s_mov_b32 s7, s9
+; GFX8-NEXT: image_store v[2:5], v[0:1], s[0:7] dmask:0xf unorm
+; GFX8-NEXT: s_endpgm
+;
+; GFX10-LABEL: image_store_v4f32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_mov_b32 s0, s2
+; GFX10-NEXT: s_mov_b32 s1, s3
+; GFX10-NEXT: s_mov_b32 s2, s4
+; GFX10-NEXT: s_mov_b32 s3, s5
+; GFX10-NEXT: s_mov_b32 s4, s6
+; GFX10-NEXT: s_mov_b32 s5, s7
+; GFX10-NEXT: s_mov_b32 s6, s8
+; GFX10-NEXT: s_mov_b32 s7, s9
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: image_store v[2:5], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
+; GFX10-NEXT: s_endpgm
+ call void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float> %in, i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @image_store_v4f32_dmask_0001(<8 x i32> inreg %rsrc, i32 %s, i32 %t, <4 x float> %in) {
+; GFX6-LABEL: image_store_v4f32_dmask_0001:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_mov_b32 s0, s2
+; GFX6-NEXT: s_mov_b32 s1, s3
+; GFX6-NEXT: s_mov_b32 s2, s4
+; GFX6-NEXT: s_mov_b32 s3, s5
+; GFX6-NEXT: s_mov_b32 s4, s6
+; GFX6-NEXT: s_mov_b32 s5, s7
+; GFX6-NEXT: s_mov_b32 s6, s8
+; GFX6-NEXT: s_mov_b32 s7, s9
+; GFX6-NEXT: image_store v[2:5], v[0:1], s[0:7] dmask:0x1 unorm
+; GFX6-NEXT: s_endpgm
+;
+; GFX8-LABEL: image_store_v4f32_dmask_0001:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_mov_b32 s0, s2
+; GFX8-NEXT: s_mov_b32 s1, s3
+; GFX8-NEXT: s_mov_b32 s2, s4
+; GFX8-NEXT: s_mov_b32 s3, s5
+; GFX8-NEXT: s_mov_b32 s4, s6
+; GFX8-NEXT: s_mov_b32 s5, s7
+; GFX8-NEXT: s_mov_b32 s6, s8
+; GFX8-NEXT: s_mov_b32 s7, s9
+; GFX8-NEXT: image_store v[2:5], v[0:1], s[0:7] dmask:0x1 unorm
+; GFX8-NEXT: s_endpgm
+;
+; GFX10-LABEL: image_store_v4f32_dmask_0001:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_mov_b32 s0, s2
+; GFX10-NEXT: s_mov_b32 s1, s3
+; GFX10-NEXT: s_mov_b32 s2, s4
+; GFX10-NEXT: s_mov_b32 s3, s5
+; GFX10-NEXT: s_mov_b32 s4, s6
+; GFX10-NEXT: s_mov_b32 s5, s7
+; GFX10-NEXT: s_mov_b32 s6, s8
+; GFX10-NEXT: s_mov_b32 s7, s9
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: image_store v[2:5], v[0:1], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D unorm
+; GFX10-NEXT: s_endpgm
+ call void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float> %in, i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @image_store_v4f32_dmask_0010(<8 x i32> inreg %rsrc, i32 %s, i32 %t, <4 x float> %in) {
+; GFX6-LABEL: image_store_v4f32_dmask_0010:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_mov_b32 s0, s2
+; GFX6-NEXT: s_mov_b32 s1, s3
+; GFX6-NEXT: s_mov_b32 s2, s4
+; GFX6-NEXT: s_mov_b32 s3, s5
+; GFX6-NEXT: s_mov_b32 s4, s6
+; GFX6-NEXT: s_mov_b32 s5, s7
+; GFX6-NEXT: s_mov_b32 s6, s8
+; GFX6-NEXT: s_mov_b32 s7, s9
+; GFX6-NEXT: image_store v[2:5], v[0:1], s[0:7] dmask:0x2 unorm
+; GFX6-NEXT: s_endpgm
+;
+; GFX8-LABEL: image_store_v4f32_dmask_0010:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_mov_b32 s0, s2
+; GFX8-NEXT: s_mov_b32 s1, s3
+; GFX8-NEXT: s_mov_b32 s2, s4
+; GFX8-NEXT: s_mov_b32 s3, s5
+; GFX8-NEXT: s_mov_b32 s4, s6
+; GFX8-NEXT: s_mov_b32 s5, s7
+; GFX8-NEXT: s_mov_b32 s6, s8
+; GFX8-NEXT: s_mov_b32 s7, s9
+; GFX8-NEXT: image_store v[2:5], v[0:1], s[0:7] dmask:0x2 unorm
+; GFX8-NEXT: s_endpgm
+;
+; GFX10-LABEL: image_store_v4f32_dmask_0010:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_mov_b32 s0, s2
+; GFX10-NEXT: s_mov_b32 s1, s3
+; GFX10-NEXT: s_mov_b32 s2, s4
+; GFX10-NEXT: s_mov_b32 s3, s5
+; GFX10-NEXT: s_mov_b32 s4, s6
+; GFX10-NEXT: s_mov_b32 s5, s7
+; GFX10-NEXT: s_mov_b32 s6, s8
+; GFX10-NEXT: s_mov_b32 s7, s9
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: image_store v[2:5], v[0:1], s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_2D unorm
+; GFX10-NEXT: s_endpgm
+ call void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float> %in, i32 2, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @image_store_v4f32_dmask_0100(<8 x i32> inreg %rsrc, i32 %s, i32 %t, <4 x float> %in) {
+; GFX6-LABEL: image_store_v4f32_dmask_0100:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_mov_b32 s0, s2
+; GFX6-NEXT: s_mov_b32 s1, s3
+; GFX6-NEXT: s_mov_b32 s2, s4
+; GFX6-NEXT: s_mov_b32 s3, s5
+; GFX6-NEXT: s_mov_b32 s4, s6
+; GFX6-NEXT: s_mov_b32 s5, s7
+; GFX6-NEXT: s_mov_b32 s6, s8
+; GFX6-NEXT: s_mov_b32 s7, s9
+; GFX6-NEXT: image_store v[2:5], v[0:1], s[0:7] dmask:0x4 unorm
+; GFX6-NEXT: s_endpgm
+;
+; GFX8-LABEL: image_store_v4f32_dmask_0100:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_mov_b32 s0, s2
+; GFX8-NEXT: s_mov_b32 s1, s3
+; GFX8-NEXT: s_mov_b32 s2, s4
+; GFX8-NEXT: s_mov_b32 s3, s5
+; GFX8-NEXT: s_mov_b32 s4, s6
+; GFX8-NEXT: s_mov_b32 s5, s7
+; GFX8-NEXT: s_mov_b32 s6, s8
+; GFX8-NEXT: s_mov_b32 s7, s9
+; GFX8-NEXT: image_store v[2:5], v[0:1], s[0:7] dmask:0x4 unorm
+; GFX8-NEXT: s_endpgm
+;
+; GFX10-LABEL: image_store_v4f32_dmask_0100:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_mov_b32 s0, s2
+; GFX10-NEXT: s_mov_b32 s1, s3
+; GFX10-NEXT: s_mov_b32 s2, s4
+; GFX10-NEXT: s_mov_b32 s3, s5
+; GFX10-NEXT: s_mov_b32 s4, s6
+; GFX10-NEXT: s_mov_b32 s5, s7
+; GFX10-NEXT: s_mov_b32 s6, s8
+; GFX10-NEXT: s_mov_b32 s7, s9
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: image_store v[2:5], v[0:1], s[0:7] dmask:0x4 dim:SQ_RSRC_IMG_2D unorm
+; GFX10-NEXT: s_endpgm
+ call void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float> %in, i32 4, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @image_store_v4f32_dmask_1000(<8 x i32> inreg %rsrc, i32 %s, i32 %t, <4 x float> %in) {
+; GFX6-LABEL: image_store_v4f32_dmask_1000:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_mov_b32 s0, s2
+; GFX6-NEXT: s_mov_b32 s1, s3
+; GFX6-NEXT: s_mov_b32 s2, s4
+; GFX6-NEXT: s_mov_b32 s3, s5
+; GFX6-NEXT: s_mov_b32 s4, s6
+; GFX6-NEXT: s_mov_b32 s5, s7
+; GFX6-NEXT: s_mov_b32 s6, s8
+; GFX6-NEXT: s_mov_b32 s7, s9
+; GFX6-NEXT: image_store v[2:5], v[0:1], s[0:7] dmask:0x8 unorm
+; GFX6-NEXT: s_endpgm
+;
+; GFX8-LABEL: image_store_v4f32_dmask_1000:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_mov_b32 s0, s2
+; GFX8-NEXT: s_mov_b32 s1, s3
+; GFX8-NEXT: s_mov_b32 s2, s4
+; GFX8-NEXT: s_mov_b32 s3, s5
+; GFX8-NEXT: s_mov_b32 s4, s6
+; GFX8-NEXT: s_mov_b32 s5, s7
+; GFX8-NEXT: s_mov_b32 s6, s8
+; GFX8-NEXT: s_mov_b32 s7, s9
+; GFX8-NEXT: image_store v[2:5], v[0:1], s[0:7] dmask:0x8 unorm
+; GFX8-NEXT: s_endpgm
+;
+; GFX10-LABEL: image_store_v4f32_dmask_1000:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_mov_b32 s0, s2
+; GFX10-NEXT: s_mov_b32 s1, s3
+; GFX10-NEXT: s_mov_b32 s2, s4
+; GFX10-NEXT: s_mov_b32 s3, s5
+; GFX10-NEXT: s_mov_b32 s4, s6
+; GFX10-NEXT: s_mov_b32 s5, s7
+; GFX10-NEXT: s_mov_b32 s6, s8
+; GFX10-NEXT: s_mov_b32 s7, s9
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: image_store v[2:5], v[0:1], s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_2D unorm
+; GFX10-NEXT: s_endpgm
+ call void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float> %in, i32 8, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @image_store_v4f32_dmask_0011(<8 x i32> inreg %rsrc, i32 %s, i32 %t, <4 x float> %in) {
+; GFX6-LABEL: image_store_v4f32_dmask_0011:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_mov_b32 s0, s2
+; GFX6-NEXT: s_mov_b32 s1, s3
+; GFX6-NEXT: s_mov_b32 s2, s4
+; GFX6-NEXT: s_mov_b32 s3, s5
+; GFX6-NEXT: s_mov_b32 s4, s6
+; GFX6-NEXT: s_mov_b32 s5, s7
+; GFX6-NEXT: s_mov_b32 s6, s8
+; GFX6-NEXT: s_mov_b32 s7, s9
+; GFX6-NEXT: image_store v[2:5], v[0:1], s[0:7] dmask:0x3 unorm
+; GFX6-NEXT: s_endpgm
+;
+; GFX8-LABEL: image_store_v4f32_dmask_0011:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_mov_b32 s0, s2
+; GFX8-NEXT: s_mov_b32 s1, s3
+; GFX8-NEXT: s_mov_b32 s2, s4
+; GFX8-NEXT: s_mov_b32 s3, s5
+; GFX8-NEXT: s_mov_b32 s4, s6
+; GFX8-NEXT: s_mov_b32 s5, s7
+; GFX8-NEXT: s_mov_b32 s6, s8
+; GFX8-NEXT: s_mov_b32 s7, s9
+; GFX8-NEXT: image_store v[2:5], v[0:1], s[0:7] dmask:0x3 unorm
+; GFX8-NEXT: s_endpgm
+;
+; GFX10-LABEL: image_store_v4f32_dmask_0011:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_mov_b32 s0, s2
+; GFX10-NEXT: s_mov_b32 s1, s3
+; GFX10-NEXT: s_mov_b32 s2, s4
+; GFX10-NEXT: s_mov_b32 s3, s5
+; GFX10-NEXT: s_mov_b32 s4, s6
+; GFX10-NEXT: s_mov_b32 s5, s7
+; GFX10-NEXT: s_mov_b32 s6, s8
+; GFX10-NEXT: s_mov_b32 s7, s9
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: image_store v[2:5], v[0:1], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D unorm
+; GFX10-NEXT: s_endpgm
+ call void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float> %in, i32 3, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @image_store_v4f32_dmask_0110(<8 x i32> inreg %rsrc, i32 %s, i32 %t, <4 x float> %in) {
+; GFX6-LABEL: image_store_v4f32_dmask_0110:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_mov_b32 s0, s2
+; GFX6-NEXT: s_mov_b32 s1, s3
+; GFX6-NEXT: s_mov_b32 s2, s4
+; GFX6-NEXT: s_mov_b32 s3, s5
+; GFX6-NEXT: s_mov_b32 s4, s6
+; GFX6-NEXT: s_mov_b32 s5, s7
+; GFX6-NEXT: s_mov_b32 s6, s8
+; GFX6-NEXT: s_mov_b32 s7, s9
+; GFX6-NEXT: image_store v[2:5], v[0:1], s[0:7] dmask:0x6 unorm
+; GFX6-NEXT: s_endpgm
+;
+; GFX8-LABEL: image_store_v4f32_dmask_0110:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_mov_b32 s0, s2
+; GFX8-NEXT: s_mov_b32 s1, s3
+; GFX8-NEXT: s_mov_b32 s2, s4
+; GFX8-NEXT: s_mov_b32 s3, s5
+; GFX8-NEXT: s_mov_b32 s4, s6
+; GFX8-NEXT: s_mov_b32 s5, s7
+; GFX8-NEXT: s_mov_b32 s6, s8
+; GFX8-NEXT: s_mov_b32 s7, s9
+; GFX8-NEXT: image_store v[2:5], v[0:1], s[0:7] dmask:0x6 unorm
+; GFX8-NEXT: s_endpgm
+;
+; GFX10-LABEL: image_store_v4f32_dmask_0110:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_mov_b32 s0, s2
+; GFX10-NEXT: s_mov_b32 s1, s3
+; GFX10-NEXT: s_mov_b32 s2, s4
+; GFX10-NEXT: s_mov_b32 s3, s5
+; GFX10-NEXT: s_mov_b32 s4, s6
+; GFX10-NEXT: s_mov_b32 s5, s7
+; GFX10-NEXT: s_mov_b32 s6, s8
+; GFX10-NEXT: s_mov_b32 s7, s9
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: image_store v[2:5], v[0:1], s[0:7] dmask:0x6 dim:SQ_RSRC_IMG_2D unorm
+; GFX10-NEXT: s_endpgm
+ call void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float> %in, i32 6, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
+ ret void
+}
+
+declare void @llvm.amdgcn.image.store.2d.f32.i32(float, i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0
+declare void @llvm.amdgcn.image.store.2d.v2f32.i32(<2 x float>, i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0
+declare void @llvm.amdgcn.image.store.2d.v3f32.i32(<3 x float>, i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0
+declare void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float>, i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0
+
+attributes #0 = { nounwind writeonly }