bool SelectSMRDBufferImm(SDValue Addr, SDValue &Offset) const;
bool SelectSMRDBufferImm32(SDValue Addr, SDValue &Offset) const;
bool SelectSMRDBufferSgpr(SDValue Addr, SDValue &Offset) const;
+ bool selectMOVRELOffsetImpl(SDValue Index, SDValue &Base,
+ SDValue &Offset, bool IsInsert) const; // common implementation; IsInsert picks the insert vs. extract rules
+ bool selectMOVRELSOffset(SDValue Index, SDValue &Base, SDValue &Offset) const; // forwards with IsInsert = false
+ bool selectMOVRELDOffset(SDValue Index, SDValue &Base, SDValue &Offset) const; // forwards with IsInsert = true
bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
bool SelectVOP3NoMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
bool SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods,
!isa<ConstantSDNode>(Offset);
}
+bool AMDGPUDAGToDAGISel::selectMOVRELOffsetImpl(SDValue Index, // index value to split
+ SDValue &Base, // out: register part of the index
+ SDValue &Offset, // out: i32 target-constant part of the index
+ bool IsInsert) const { // true when selecting for the insert (MOVRELD) pattern
+ SDLoc DL(Index);
+ // Splits Index into Base + Offset; fails only for a constant extract index.
+ if (CurDAG->isBaseWithConstantOffset(Index)) {
+ SDValue N0 = Index.getOperand(0);
+ SDValue N1 = Index.getOperand(1);
+ ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
+ // cast<> relies on isBaseWithConstantOffset() having matched a constant N1.
+ // (add n0, c0): n0 becomes the base register, c0 the immediate offset.
+ Base = N0;
+ Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
+ return true;
+ }
+ // Constant index: insert uses NoRegister as the base, extract is rejected.
+ if (IsInsert) {
+ if (ConstantSDNode *CBase = dyn_cast<ConstantSDNode>(Index)) {
+ Base = CurDAG->getRegister(AMDGPU::NoRegister, MVT::i32);
+ Offset = CurDAG->getTargetConstant(CBase->getZExtValue(), DL, MVT::i32);
+ return true;
+ }
+ } else {
+ if (isa<ConstantSDNode>(Index))
+ return false;
+ }
+ // Otherwise the whole index is the base register, with a zero offset.
+ Base = Index;
+ Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
+ return true;
+}
+
+bool AMDGPUDAGToDAGISel::selectMOVRELSOffset(SDValue Index,
+ SDValue &Base,
+ SDValue &Offset) const {
+ return selectMOVRELOffsetImpl(Index, Base, Offset, /*IsInsert=*/false);
+}
+
+bool AMDGPUDAGToDAGISel::selectMOVRELDOffset(SDValue Index,
+ SDValue &Base,
+ SDValue &Offset) const {
+ return selectMOVRELOffsetImpl(Index, Base, Offset, /*IsInsert=*/true);
+}
+
SDNode *AMDGPUDAGToDAGISel::getS_BFE(unsigned Opcode, const SDLoc &DL,
SDValue Val, uint32_t Offset,
uint32_t Width) {
if (RegClass != -1) {
unsigned Reg = MI.getOperand(i).getReg();
- if (TargetRegisterInfo::isVirtualRegister(Reg))
+ if (Reg == AMDGPU::NoRegister ||
+ TargetRegisterInfo::isVirtualRegister(Reg))
continue;
const TargetRegisterClass *RC = RI.getRegClass(RegClass);
def SMRDBufferImm32 : ComplexPattern<i32, 1, "SelectSMRDBufferImm32">;
def SMRDBufferSgpr : ComplexPattern<i32, 1, "SelectSMRDBufferSgpr">;
+def MOVRELSOffset : ComplexPattern<i32, 2, "selectMOVRELSOffset">; // extract index -> (base, offset)
+def MOVRELDOffset : ComplexPattern<i32, 2, "selectMOVRELDOffset">; // insert index -> (base, offset)
+
def VOP3Mods0 : ComplexPattern<untyped, 4, "SelectVOP3Mods0">;
def VOP3NoMods0 : ComplexPattern<untyped, 4, "SelectVOP3NoMods0">;
def VOP3Mods0Clamp : ComplexPattern<untyped, 3, "SelectVOP3Mods0Clamp">;
class SI_INDIRECT_SRC<RegisterClass rc> : InstSI <
(outs VGPR_32:$vdst, SReg_64:$sdst),
- (ins rc:$src, VSrc_32:$idx, i32imm:$offset)
->;
+ (ins rc:$src, VS_32:$idx, i32imm:$offset)>; // selected for extractelt of $src at $idx + $offset
class SI_INDIRECT_DST<RegisterClass rc> : InstSI <
(outs rc:$vdst, SReg_64:$sdst),
- (ins unknown:$src, VSrc_32:$idx, i32imm:$offset, VGPR_32:$val)> {
+ (ins unknown:$src, VS_32:$idx, i32imm:$offset, VGPR_32:$val)> { // selected for insertelt of $val into $src at $idx + $offset
let Constraints = "$src = $vdst";
}
/********** ====================== **********/
multiclass SI_INDIRECT_Pattern <ValueType vt, ValueType eltvt, string VecSize> {
-
- // 1. Extract with offset
- def : Pat<
- (eltvt (extractelt vt:$vec, (add i32:$idx, imm:$off))),
- (!cast<Instruction>("SI_INDIRECT_SRC_"#VecSize) $vec, $idx, imm:$off)
- >;
-
- // 2. Extract without offset
- def : Pat<
- (eltvt (extractelt vt:$vec, i32:$idx)),
- (!cast<Instruction>("SI_INDIRECT_SRC_"#VecSize) $vec, $idx, 0)
- >;
-
- // 3. Insert with offset
+ // Extract: MOVRELSOffset splits the index into a register ($idx) and an immediate ($offset, possibly 0).
def : Pat<
- (insertelt vt:$vec, eltvt:$val, (add i32:$idx, imm:$off)),
- (!cast<Instruction>("SI_INDIRECT_DST_"#VecSize) $vec, $idx, imm:$off, $val)
+ (eltvt (extractelt vt:$src, (MOVRELSOffset i32:$idx, (i32 imm:$offset)))),
+ (!cast<Instruction>("SI_INDIRECT_SRC_"#VecSize) $src, $idx, imm:$offset)
>;
- // 4. Insert without offset
+ // Insert: MOVRELDOffset does the same split and additionally accepts a fully constant index.
def : Pat<
- (insertelt vt:$vec, eltvt:$val, i32:$idx),
- (!cast<Instruction>("SI_INDIRECT_DST_"#VecSize) $vec, $idx, 0, $val)
+ (insertelt vt:$src, eltvt:$val, (MOVRELDOffset i32:$idx, (i32 imm:$offset))),
+ (!cast<Instruction>("SI_INDIRECT_DST_"#VecSize) $src, $idx, imm:$offset, $val)
>;
}
BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC)
.addReg(AMDGPU::VCC);
- if (Offset) {
+ if (Offset != 0) {
BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
.addReg(AMDGPU::M0)
.addImm(Offset);
const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
if (AMDGPU::SReg_32RegClass.contains(Idx->getReg())) {
- if (Offset) {
+ if (Offset != 0) {
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
.addReg(Idx->getReg(), getUndefRegState(Idx->isUndef()))
.addImm(Offset);
return true;
}
-/// \param @VecReg The register which holds element zero of the vector
-/// being addressed into.
-/// \param[out] @Reg The base register to use in the indirect addressing instruction.
-/// \param[in,out] @Offset As an input, this is the constant offset part of the
-// indirect Index. e.g. v0 = v[VecReg + Offset]
-// As an output, this is a constant value that needs
-// to be added to the value stored in M0.
+/// \param @VecReg The register which holds element zero of the vector being
+/// addressed into.
+///
+/// \param[in] @Offset The constant offset part of the indirect index, e.g.
+/// v0 = v[VecReg + Offset].
+///
+/// \returns A pair of the base register to use in the indirect addressing
+/// instruction and the constant value that needs to be added to the value
+/// stored in M0.
+///
+/// \note The index operand of the movrel instruction is not passed in here. It
+/// must be a register, but may be NoRegister, which the callers check for.
std::pair<unsigned, int>
-SILowerControlFlow::computeIndirectRegAndOffset(unsigned VecReg,
- int Offset) const {
+SILowerControlFlow::computeIndirectRegAndOffset(unsigned VecReg, int Offset) const {
unsigned SubReg = TRI->getSubReg(VecReg, AMDGPU::sub0);
if (!SubReg)
SubReg = VecReg;
// Return true if a new block was inserted.
bool SILowerControlFlow::indirectSrc(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
- DebugLoc DL = MI.getDebugLoc();
+ const DebugLoc &DL = MI.getDebugLoc();
unsigned Dst = MI.getOperand(0).getReg();
const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
- int Off = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
+ int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
unsigned Reg;
- std::tie(Reg, Off) = computeIndirectRegAndOffset(SrcVec->getReg(), Off);
+ std::tie(Reg, Offset) = computeIndirectRegAndOffset(SrcVec->getReg(), Offset);
+ // NOTE(review): selectMOVRELSOffset rejects constant indices, so confirm what produces a NoRegister index here.
+ const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
+ if (Idx->getReg() == AMDGPU::NoRegister) {
+ // Only had a constant offset: copy Reg straight into Dst and return before loadM0(), so no new block is inserted.
+ BuildMI(MBB, MI.getIterator(), DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
+ .addReg(Reg, getUndefRegState(SrcVec->isUndef()));
+ MI.eraseFromParent();
+ return false;
+ }
MachineInstr *MovRel =
BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
.addReg(Reg, getUndefRegState(SrcVec->isUndef()))
.addReg(SrcVec->getReg(), RegState::Implicit);
- return loadM0(MI, MovRel, Off);
+ return loadM0(MI, MovRel, Offset);
}
// Return true if a new block was inserted.
bool SILowerControlFlow::indirectDst(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
- DebugLoc DL = MI.getDebugLoc();
+ const DebugLoc &DL = MI.getDebugLoc();
unsigned Dst = MI.getOperand(0).getReg();
- int Off = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
- MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
+ int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
unsigned Reg;
- std::tie(Reg, Off) = computeIndirectRegAndOffset(Dst, Off);
+ const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
+ std::tie(Reg, Offset) = computeIndirectRegAndOffset(Dst, Offset);
+ // The index may be NoRegister: selectMOVRELOffsetImpl uses it as the base for a constant insert index.
+ MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
+ if (Idx->getReg() == AMDGPU::NoRegister) {
+ // Only had a constant offset: move Val straight into Reg and return before loadM0(), so no new block is inserted.
+ BuildMI(MBB, MI.getIterator(), DL, TII->get(AMDGPU::V_MOV_B32_e32), Reg)
+ .addOperand(*Val);
+ MI.eraseFromParent();
+ return false;
+ }
MachineInstr *MovRel =
- BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELD_B32_e32))
- .addReg(Reg, RegState::Define)
+ BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELD_B32_e32), Reg)
.addReg(Val->getReg(), getUndefRegState(Val->isUndef()))
.addReg(Dst, RegState::Implicit);
- return loadM0(MI, MovRel, Offset);
+ return loadM0(MI, MovRel, Offset);
}
bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
ret void
}
+; Test that the or is folded into the base address register instead of
+; added to m0
+
+; An extract is lowered to the relative-source move (v_movrels); v_movreld is
+; only produced for inserts. m0 must hold the shifted index, without the +1.
+; GCN-LABEL: {{^}}extractelement_v4i32_or_index:
+; GCN: s_load_dword [[IDX_IN:s[0-9]+]]
+; GCN: s_lshl_b32 [[IDX_SHL:s[0-9]+]], [[IDX_IN]]
+; GCN-NOT: [[IDX_SHL]]
+; GCN: s_mov_b32 m0, [[IDX_SHL]]
+; GCN: v_movrels_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
+define void @extractelement_v4i32_or_index(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx.in) {
+entry:
+ %ld = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in
+ %idx.shl = shl i32 %idx.in, 2
+ %idx = or i32 %idx.shl, 1 ; low bits of %idx.shl are zero, so this 'or' acts as an add of 1
+ %value = extractelement <4 x i32> %ld, i32 %idx
+ store i32 %value, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}insertelement_v4f32_or_index:
+; GCN: s_load_dword [[IDX_IN:s[0-9]+]]
+; GCN: s_lshl_b32 [[IDX_SHL:s[0-9]+]], [[IDX_IN]]
+; GCN-NOT: [[IDX_SHL]]
+; GCN: s_mov_b32 m0, [[IDX_SHL]]
+; GCN: v_movreld_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
+define void @insertelement_v4f32_or_index(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %idx.in) nounwind {
+ %idx.shl = shl i32 %idx.in, 2
+ %idx = or i32 %idx.shl, 1 ; low bits of %idx.shl are zero, so this 'or' acts as an add of 1
+ %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 %idx
+ store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
+ ret void
+}
+
declare i32 @llvm.amdgcn.workitem.id.x() #1
attributes #0 = { nounwind }
; FIXME: Should be able to manipulate m0 directly instead of add and
; copy.
-; GCN: s_or_b32 [[IDX1:s[0-9]+]], [[SCALEDIDX]], 1
+; FIXME: Should avoid resetting m0 to same value
; GCN-DAG: v_mov_b32_e32 [[ELT1:v[0-9]+]], 0x40200000
-; GCN-DAG: s_mov_b32 m0, [[IDX1]]
+; GCN-DAG: s_mov_b32 m0, [[SCALEDIDX]]
; GCN: v_movreld_b32_e32 v{{[0-9]+}}, [[ELT1]]
; GCN: buffer_store_dwordx4