return SDValue(NewNode, 0);
}
+SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
+ SDValue Offset, SDValue GLC,
+ SelectionDAG &DAG) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo(),
+ MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
+ MachineMemOperand::MOInvariant,
+ VT.getStoreSize(), VT.getStoreSize());
+
+ if (!Offset->isDivergent()) {
+ SDValue Ops[] = {
+ Rsrc,
+ Offset, // Offset
+ GLC // glc
+ };
+ return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
+ DAG.getVTList(VT), Ops, VT, MMO);
+ }
+
+ // We have a divergent offset. Emit a MUBUF buffer load instead. We can
+ // assume that the buffer is unswizzled.
+ SmallVector<SDValue, 4> Loads;
+ unsigned NumLoads = 1;
+ MVT LoadVT = VT.getSimpleVT();
+
+ assert(LoadVT == MVT::i32 || LoadVT == MVT::v2i32 || LoadVT == MVT::v4i32 ||
+ LoadVT == MVT::v8i32 || LoadVT == MVT::v16i32);
+
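+  // Loads wider than 4 dwords are split into dwordx4 pieces and recombined
+  // with CONCAT_VECTORS below, since BUFFER_LOAD_DWORDX4 is the widest MUBUF
+  // load emitted here.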
+ if (VT == MVT::v8i32 || VT == MVT::v16i32) {
+ NumLoads = VT == MVT::v16i32 ? 4 : 2;
+ LoadVT = MVT::v4i32;
+ }
+
+ SDVTList VTList = DAG.getVTList({LoadVT, MVT::Glue});
+ unsigned CachePolicy = cast<ConstantSDNode>(GLC)->getZExtValue();
+ SDValue Ops[] = {
+ DAG.getEntryNode(), // Chain
+ Rsrc, // rsrc
+ DAG.getConstant(0, DL, MVT::i32), // vindex
+ {}, // voffset
+ {}, // soffset
+ {}, // offset
+ DAG.getConstant(CachePolicy, DL, MVT::i32), // cachepolicy
+ DAG.getConstant(0, DL, MVT::i1), // idxen
+ };
+
+ // Use the alignment to ensure that the required offsets will fit into the
+ // immediate offsets.
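+  // Each sub-load emitted below adds 16 * i to the immediate offset, so for
+  // split loads request 16 * NumLoads alignment to leave room for these
+  // additional offsets.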
+ setBufferOffsets(Offset, DAG, &Ops[3], NumLoads > 1 ? 16 * NumLoads : 4);
+
+ uint64_t InstOffset = cast<ConstantSDNode>(Ops[5])->getZExtValue();
+ for (unsigned i = 0; i < NumLoads; ++i) {
+ Ops[5] = DAG.getConstant(InstOffset + 16 * i, DL, MVT::i32);
+ Loads.push_back(DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList,
+ Ops, LoadVT, MMO));
+ }
+
+ if (VT == MVT::v8i32 || VT == MVT::v16i32)
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
+
+ return Loads[0];
+}
+
SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
SDLoc(DAG.getEntryNode()),
MFI->getArgInfo().WorkItemIDZ);
case AMDGPUIntrinsic::SI_load_const: {
- SDValue Ops[] = {
- Op.getOperand(1), // Ptr
- Op.getOperand(2), // Offset
- DAG.getTargetConstant(0, DL, MVT::i1) // glc
- };
-
- MachineMemOperand *MMO = MF.getMachineMemOperand(
- MachinePointerInfo(),
- MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
- MachineMemOperand::MOInvariant,
- VT.getStoreSize(), 4);
- SDVTList VTList = DAG.getVTList(MVT::i32);
- SDValue Load = DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
- VTList, Ops, MVT::i32, MMO);
-
+ SDValue Load =
+ lowerSBuffer(MVT::i32, DL, Op.getOperand(1), Op.getOperand(2),
+ DAG.getTargetConstant(0, DL, MVT::i1), DAG);
return DAG.getNode(ISD::BITCAST, DL, MVT::f32, Load);
}
case Intrinsic::amdgcn_s_buffer_load: {
unsigned Cache = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
- SDValue Ops[] = {
- Op.getOperand(1), // Ptr
- Op.getOperand(2), // Offset
- DAG.getTargetConstant(Cache & 1, DL, MVT::i1) // glc
- };
-
- MachineMemOperand *MMO = MF.getMachineMemOperand(
- MachinePointerInfo(),
- MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
- MachineMemOperand::MOInvariant,
- VT.getStoreSize(), VT.getStoreSize());
- return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
- Op->getVTList(), Ops, VT, MMO);
+ return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
+ DAG.getTargetConstant(Cache & 1, DL, MVT::i1), DAG);
}
case Intrinsic::amdgcn_fdiv_fast:
return lowerFDIV_FAST(Op, DAG);
// three offsets (voffset, soffset and instoffset) into the SDValue[3] array
// pointed to by Offsets.
void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
- SelectionDAG &DAG,
- SDValue *Offsets) const {
+ SelectionDAG &DAG, SDValue *Offsets,
+ unsigned Align) const {
SDLoc DL(CombinedOffset);
if (auto C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
uint32_t Imm = C->getZExtValue();
uint32_t SOffset, ImmOffset;
- if (AMDGPU::splitMUBUFOffset(Imm, SOffset, ImmOffset, Subtarget)) {
+ if (AMDGPU::splitMUBUFOffset(Imm, SOffset, ImmOffset, Subtarget, Align)) {
Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
Offsets[2] = DAG.getConstant(ImmOffset, DL, MVT::i32);
SDValue N1 = CombinedOffset.getOperand(1);
uint32_t SOffset, ImmOffset;
int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
- if (Offset >= 0
- && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset, Subtarget)) {
+ if (Offset >= 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset,
+ Subtarget, Align)) {
Offsets[0] = N0;
Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
Offsets[2] = DAG.getConstant(ImmOffset, DL, MVT::i32);
MVT VT, unsigned Offset) const;
SDValue lowerImage(SDValue Op, const AMDGPU::ImageDimIntrinsicInfo *Intr,
SelectionDAG &DAG) const;
+ SDValue lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc, SDValue Offset,
+ SDValue GLC, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
// three offsets (voffset, soffset and instoffset) into the SDValue[3] array
// pointed to by Offsets.
void setBufferOffsets(SDValue CombinedOffset, SelectionDAG &DAG,
- SDValue *Offsets) const;
+ SDValue *Offsets, unsigned Align = 4) const;
public:
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI);
// pointer value is uniform.
MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
- unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
- SBase->setReg(SGPR);
+ unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
+ SBase->setReg(SGPR);
+ }
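+  // The soff operand (the SGPR offset of the *_SGPR forms) must likewise be
+  // an SGPR; copy it out of a VGPR with v_readfirstlane if needed.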
+ MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soff);
+ if (SOff && !RI.isSGPRClass(MRI.getRegClass(SOff->getReg()))) {
+ unsigned SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
+ SOff->setReg(SGPR);
}
}
splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
Inst.eraseFromParent();
continue;
-
- case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR:
- case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
- case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
- case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
- case AMDGPU::S_BUFFER_LOAD_DWORDX16_SGPR: {
- unsigned VDst;
- unsigned NewOpcode;
-
- switch(Opcode) {
- case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR:
- NewOpcode = AMDGPU::BUFFER_LOAD_DWORD_OFFEN;
- VDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- break;
- case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
- NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN;
- VDst = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
- break;
- case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
- NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN;
- VDst = MRI.createVirtualRegister(&AMDGPU::VReg_128RegClass);
- break;
- case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
- case AMDGPU::S_BUFFER_LOAD_DWORDX16_SGPR:
- splitScalarBuffer(Worklist, Inst);
- Inst.eraseFromParent();
- continue;
- }
-
- const MachineOperand *VAddr = getNamedOperand(Inst, AMDGPU::OpName::soff);
- auto Add = MRI.getUniqueVRegDef(VAddr->getReg());
- unsigned Offset = 0;
-
- // FIXME: This isn't safe because the addressing mode doesn't work
- // correctly if vaddr is negative.
- //
- // FIXME: Should probably be done somewhere else, maybe SIFoldOperands.
- //
- // See if we can extract an immediate offset by recognizing one of these:
- // V_ADD_I32_e32 dst, imm, src1
- // V_ADD_I32_e32 dst, (S_MOV_B32 imm), src1
- // V_ADD will be removed by "Remove dead machine instructions".
- if (Add &&
- (Add->getOpcode() == AMDGPU::V_ADD_I32_e32 ||
- Add->getOpcode() == AMDGPU::V_ADD_U32_e32 ||
- Add->getOpcode() == AMDGPU::V_ADD_U32_e64)) {
- static const unsigned SrcNames[2] = {
- AMDGPU::OpName::src0,
- AMDGPU::OpName::src1,
- };
-
- // Find a literal offset in one of source operands.
- for (int i = 0; i < 2; i++) {
- const MachineOperand *Src =
- getNamedOperand(*Add, SrcNames[i]);
-
- if (Src->isReg()) {
- MachineInstr *Def = MRI.getUniqueVRegDef(Src->getReg());
- if (Def) {
- if (Def->isMoveImmediate())
- Src = &Def->getOperand(1);
- else if (Def->isCopy()) {
- auto Mov = MRI.getUniqueVRegDef(Def->getOperand(1).getReg());
- if (Mov && Mov->isMoveImmediate()) {
- Src = &Mov->getOperand(1);
- }
- }
- }
- }
-
- if (Src) {
- if (Src->isImm())
- Offset = Src->getImm();
- else if (Src->isCImm())
- Offset = Src->getCImm()->getZExtValue();
- }
-
- if (Offset && isLegalMUBUFImmOffset(Offset)) {
- VAddr = getNamedOperand(*Add, SrcNames[!i]);
- break;
- }
-
- Offset = 0;
- }
- }
-
- MachineInstr *NewInstr =
- BuildMI(*MBB, Inst, Inst.getDebugLoc(),
- get(NewOpcode), VDst)
- .add(*VAddr) // vaddr
- .add(*getNamedOperand(Inst, AMDGPU::OpName::sbase)) // srsrc
- .addImm(0) // soffset
- .addImm(Offset) // offset
- .addImm(getNamedOperand(Inst, AMDGPU::OpName::glc)->getImm())
- .addImm(0) // slc
- .addImm(0) // tfe
- .cloneMemRefs(Inst)
- .getInstr();
-
- MRI.replaceRegWith(getNamedOperand(Inst, AMDGPU::OpName::sdst)->getReg(),
- VDst);
- addUsersToMoveToVALUWorklist(VDst, MRI, Worklist);
- Inst.eraseFromParent();
-
- // Legalize all operands other than the offset. Notably, convert the srsrc
- // into SGPRs using v_readfirstlane if needed.
- legalizeOperands(*NewInstr, MDT);
- continue;
- }
}
if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}
-void SIInstrInfo::splitScalarBuffer(SetVectorType &Worklist,
- MachineInstr &Inst) const {
- MachineBasicBlock &MBB = *Inst.getParent();
- MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
-
- MachineBasicBlock::iterator MII = Inst;
- auto &DL = Inst.getDebugLoc();
-
- MachineOperand &Dest = *getNamedOperand(Inst, AMDGPU::OpName::sdst);;
- MachineOperand &Rsrc = *getNamedOperand(Inst, AMDGPU::OpName::sbase);
- MachineOperand &Offset = *getNamedOperand(Inst, AMDGPU::OpName::soff);
- MachineOperand &Glc = *getNamedOperand(Inst, AMDGPU::OpName::glc);
-
- unsigned Opcode = Inst.getOpcode();
- unsigned NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN;
- unsigned Count = 0;
- const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
- const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
-
- switch(Opcode) {
- default:
- return;
- case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
- Count = 2;
- break;
- case AMDGPU::S_BUFFER_LOAD_DWORDX16_SGPR:
- Count = 4;
- break;
- }
-
- // FIXME: Should also attempt to build VAddr and Offset like the non-split
- // case (see call site for this function)
-
- // Create a vector of result registers
- SmallVector<unsigned, 8> ResultRegs;
- for (unsigned i = 0; i < Count ; ++i) {
- unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_128RegClass);
- MachineInstr &NewMI = *BuildMI(MBB, MII, DL, get(NewOpcode), ResultReg)
- .addReg(Offset.getReg()) // offset
- .addReg(Rsrc.getReg()) // rsrc
- .addImm(0) // soffset
- .addImm(i << 4) // inst_offset
- .addImm(Glc.getImm()) // glc
- .addImm(0) // slc
- .addImm(0) // tfe
- .addMemOperand(*Inst.memoperands_begin());
- // Extract the 4 32 bit sub-registers from the result to add into the final REG_SEQUENCE
- auto &NewDestOp = NewMI.getOperand(0);
- for (unsigned i = 0 ; i < 4 ; i++)
- ResultRegs.push_back(buildExtractSubReg(MII, MRI, NewDestOp, &AMDGPU::VReg_128RegClass,
- RI.getSubRegFromChannel(i), &AMDGPU::VGPR_32RegClass));
- }
- // Create a new combined result to replace original with
- unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
- MachineInstrBuilder CombinedResBuilder = BuildMI(MBB, MII, DL,
- get(TargetOpcode::REG_SEQUENCE), FullDestReg);
-
- for (unsigned i = 0 ; i < Count * 4 ; ++i) {
- CombinedResBuilder
- .addReg(ResultRegs[i])
- .addImm(RI.getSubRegFromChannel(i));
- }
-
- MRI.replaceRegWith(Dest.getReg(), FullDestReg);
- addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
-}
-
void SIInstrInfo::addUsersToMoveToVALUWorklist(
unsigned DstReg,
MachineRegisterInfo &MRI,
MachineInstr &Inst) const;
void splitScalar64BitBFE(SetVectorType &Worklist,
MachineInstr &Inst) const;
- void splitScalarBuffer(SetVectorType &Worklist,
- MachineInstr &Inst) const;
void movePackToVALU(SetVectorType &Worklist,
MachineRegisterInfo &MRI,
MachineInstr &Inst) const;
// Given Imm, split it into the values to put into the SOffset and ImmOffset
// fields in an MUBUF instruction. Return false if it is not possible (due to a
// hardware bug needing a workaround).
+//
+// The required alignment ensures that individual address components remain
+// aligned if they are aligned to begin with. It also ensures that additional
+// offsets within the given alignment can be added to the resulting ImmOffset.
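+//
+// For example, with Align = 64 (a 16-dword load split into four dwordx4
+// loads), a combined offset of 4088 splits into SOffset = 56 and
+// ImmOffset = 4032, and 4100 splits into SOffset = 4032 and ImmOffset = 68.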
bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset,
- const GCNSubtarget *Subtarget) {
- const uint32_t Align = 4;
+ const GCNSubtarget *Subtarget, uint32_t Align) {
const uint32_t MaxImm = alignDown(4095, Align);
uint32_t Overflow = 0;
/// not the encoded offset.
bool isLegalSMRDImmOffset(const MCSubtargetInfo &ST, int64_t ByteOffset);
-// Given Imm, split it into the values to put into the SOffset and ImmOffset
-// fields in an MUBUF instruction. Return false if it is not possible (due to a
-// hardware bug needing a workaround).
bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset,
- const GCNSubtarget *Subtarget);
+ const GCNSubtarget *Subtarget, uint32_t Align = 4);
/// \returns true if the intrinsic is divergent
bool isIntrinsicSourceOfDivergence(unsigned IntrID);
# RUN: llc -march=amdgcn -run-pass si-fix-sgpr-copies -o - %s | FileCheck -check-prefix=GCN %s
-# GCN: BUFFER_LOAD_DWORD_OFFEN %{{[0-9]+}}, killed %{{[0-9]+}}, 0, 4095
+# GCN-LABEL: name: smrd_vgpr_offset_imm
+# GCN: V_READFIRSTLANE_B32
+# GCN: S_BUFFER_LOAD_DWORD_SGPR
---
name: smrd_vgpr_offset_imm
body: |
SI_RETURN_TO_EPILOG $vgpr0
...
-# GCN: BUFFER_LOAD_DWORD_OFFEN %{{[0-9]+}}, killed %{{[0-9]+}}, 0, 4095
+# GCN-LABEL: name: smrd_vgpr_offset_imm_add_u32
+# GCN: V_READFIRSTLANE_B32
+# GCN: S_BUFFER_LOAD_DWORD_SGPR
---
name: smrd_vgpr_offset_imm_add_u32
body: |
; GCN-LABEL: {{^}}smrd_vgpr_offset_imm:
; GCN-NEXT: %bb.
-; GCN-NEXT: buffer_load_dword v{{[0-9]}}, v0, s[0:3], 0 offen offset:4095 ;
+; GCN-NEXT: buffer_load_dword v{{[0-9]}}, v0, s[0:3], 0 offen offset:4092 ;
define amdgpu_ps float @smrd_vgpr_offset_imm(<4 x i32> inreg %desc, i32 %offset) #0 {
main_body:
- %off = add i32 %offset, 4095
+ %off = add i32 %offset, 4092
%r = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 %off)
ret float %r
}
; GCN-LABEL: {{^}}smrd_vgpr_offset_imm_too_large:
; GCN-NEXT: %bb.
-; GCN-NEXT: v_add_{{i|u}}32_e32 v0, {{(vcc, )?}}0x1000, v0
-; GCN-NEXT: buffer_load_dword v{{[0-9]}}, v0, s[0:3], 0 offen ;
+; SICI-NEXT: v_add_{{i|u}}32_e32 v0, {{(vcc, )?}}0x1000, v0
+; SICI-NEXT: buffer_load_dword v{{[0-9]}}, v0, s[0:3], 0 offen ;
+; VIGFX9-NEXT: buffer_load_dword v{{[0-9]}}, v0, s[0:3], 4 offen offset:4092 ;
define amdgpu_ps float @smrd_vgpr_offset_imm_too_large(<4 x i32> inreg %desc, i32 %offset) #0 {
main_body:
%off = add i32 %offset, 4096
ret void
}
+; SMRD load of more than 4 dwords with a non-const, non-uniform offset (requires splitting into multiple buffer loads)
+; GCN-LABEL: {{^}}smrd_load_nonconst3:
+; GCN-DAG: buffer_load_dwordx4 v[0:3], v{{[0-9]+}}, s[0:3], 0 offen ;
+; GCN-DAG: buffer_load_dwordx4 v[4:7], v{{[0-9]+}}, s[0:3], 0 offen offset:16 ;
+; GCN-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], 0 offen offset:32 ;
+; GCN-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], 0 offen offset:48 ;
+; GCN: ; return to shader part epilog
+define amdgpu_ps <16 x float> @smrd_load_nonconst3(<4 x i32> inreg %rsrc, i32 %off) #0 {
+main_body:
+ %ld = call <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(<4 x i32> %rsrc, i32 %off, i32 0)
+ %bc = bitcast <16 x i32> %ld to <16 x float>
+ ret <16 x float> %bc
+}
+
+; GCN-LABEL: {{^}}smrd_load_nonconst4:
+; SICI: v_add_i32_e32 v{{[0-9]+}}, vcc, 0xff8, v0 ;
+; SICI-DAG: buffer_load_dwordx4 v[0:3], v{{[0-9]+}}, s[0:3], 0 offen ;
+; SICI-DAG: buffer_load_dwordx4 v[4:7], v{{[0-9]+}}, s[0:3], 0 offen offset:16 ;
+; SICI-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], 0 offen offset:32 ;
+; SICI-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], 0 offen offset:48 ;
+; VIGFX9-DAG: buffer_load_dwordx4 v[0:3], v{{[0-9]+}}, s[0:3], 56 offen offset:4032 ;
+; VIGFX9-DAG: buffer_load_dwordx4 v[4:7], v{{[0-9]+}}, s[0:3], 56 offen offset:4048 ;
+; VIGFX9-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], 56 offen offset:4064 ;
+; VIGFX9-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], 56 offen offset:4080 ;
+; GCN: ; return to shader part epilog
+define amdgpu_ps <16 x float> @smrd_load_nonconst4(<4 x i32> inreg %rsrc, i32 %off) #0 {
+main_body:
+ %off.2 = add i32 %off, 4088
+ %ld = call <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(<4 x i32> %rsrc, i32 %off.2, i32 0)
+ %bc = bitcast <16 x i32> %ld to <16 x float>
+ ret <16 x float> %bc
+}
+
+; GCN-LABEL: {{^}}smrd_load_nonconst5:
+; SICI: v_add_i32_e32 v{{[0-9]+}}, vcc, 0x1004, v0
+; SICI-DAG: buffer_load_dwordx4 v[0:3], v{{[0-9]+}}, s[0:3], 0 offen ;
+; SICI-DAG: buffer_load_dwordx4 v[4:7], v{{[0-9]+}}, s[0:3], 0 offen offset:16 ;
+; SICI-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], 0 offen offset:32 ;
+; SICI-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], 0 offen offset:48 ;
+; VIGFX9: s_movk_i32 s4, 0xfc0
+; VIGFX9-DAG: buffer_load_dwordx4 v[0:3], v{{[0-9]+}}, s[0:3], s4 offen offset:68 ;
+; VIGFX9-DAG: buffer_load_dwordx4 v[4:7], v{{[0-9]+}}, s[0:3], s4 offen offset:84 ;
+; VIGFX9-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], s4 offen offset:100 ;
+; VIGFX9-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], s4 offen offset:116 ;
+; GCN: ; return to shader part epilog
+define amdgpu_ps <16 x float> @smrd_load_nonconst5(<4 x i32> inreg %rsrc, i32 %off) #0 {
+main_body:
+ %off.2 = add i32 %off, 4100
+ %ld = call <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(<4 x i32> %rsrc, i32 %off.2, i32 0)
+ %bc = bitcast <16 x i32> %ld to <16 x float>
+ ret <16 x float> %bc
+}
+
; SMRD load dwordx2
; GCN-LABEL: {{^}}smrd_load_dwordx2:
; SIVIGFX9: s_buffer_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
; GCN-LABEL: {{^}}smrd_uniform_loop:
;
-; TODO: this should use an s_buffer_load
+; TODO: we should keep the loop counter in an SGPR
;
-; GCN: buffer_load_dword
+; GCN: v_readfirstlane_b32
+; GCN: s_buffer_load_dword
define amdgpu_ps float @smrd_uniform_loop(<4 x i32> inreg %desc, i32 %bound) #0 {
main_body:
br label %loop