const TargetRegisterClass *getOperandRegClass(SDNode *N, unsigned OpNo) const;
virtual bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset);
virtual bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset);
- bool isDSOffsetLegal(SDValue Base, unsigned Offset,
- unsigned OffsetBits) const;
+ bool isDSOffsetLegal(SDValue Base, unsigned Offset) const;
+ bool isDSOffset2Legal(SDValue Base, unsigned Offset0, unsigned Offset1,
+ unsigned Size) const;
bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const;
bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0,
SDValue &Offset1) const;
bool SelectDS128Bit8ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0,
SDValue &Offset1) const;
bool SelectDSReadWrite2(SDValue Ptr, SDValue &Base, SDValue &Offset0,
- SDValue &Offset1, bool IsDS128) const;
+ SDValue &Offset1, unsigned Size) const;
bool SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
SDValue &SOffset, SDValue &Offset, SDValue &Offen,
SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC,
CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
}
-bool AMDGPUDAGToDAGISel::isDSOffsetLegal(SDValue Base, unsigned Offset,
- unsigned OffsetBits) const {
- if ((OffsetBits == 16 && !isUInt<16>(Offset)) ||
- (OffsetBits == 8 && !isUInt<8>(Offset)))
+bool AMDGPUDAGToDAGISel::isDSOffsetLegal(SDValue Base, unsigned Offset) const {
+ if (!isUInt<16>(Offset))
return false;
- if (Subtarget->hasUsableDSOffset() ||
+ if (!Base || Subtarget->hasUsableDSOffset() ||
Subtarget->unsafeDSOffsetFoldingEnabled())
return true;
SDValue N0 = Addr.getOperand(0);
SDValue N1 = Addr.getOperand(1);
ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
- if (isDSOffsetLegal(N0, C1->getSExtValue(), 16)) {
+ if (isDSOffsetLegal(N0, C1->getSExtValue())) {
// (add n0, c0)
Base = N0;
Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
// sub C, x -> add (sub 0, x), C
if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
int64_t ByteOffset = C->getSExtValue();
- if (isUInt<16>(ByteOffset)) {
+ if (isDSOffsetLegal(SDValue(), ByteOffset)) {
SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
// XXX - This is kind of hacky. Create a dummy sub node so we can check
SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32,
Zero, Addr.getOperand(1));
- if (isDSOffsetLegal(Sub, ByteOffset, 16)) {
+ if (isDSOffsetLegal(Sub, ByteOffset)) {
SmallVector<SDValue, 3> Opnds;
Opnds.push_back(Zero);
Opnds.push_back(Addr.getOperand(1));
SDLoc DL(Addr);
- if (isUInt<16>(CAddr->getZExtValue())) {
+ if (isDSOffsetLegal(SDValue(), CAddr->getZExtValue())) {
SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
DL, MVT::i32, Zero);
return true;
}
+bool AMDGPUDAGToDAGISel::isDSOffset2Legal(SDValue Base, unsigned Offset0,
+ unsigned Offset1,
+ unsigned Size) const { // Legality check for a pair of byte offsets scaled by Size.
+ if (Offset0 % Size != 0 || Offset1 % Size != 0)
+ return false; // Both byte offsets must be exact multiples of Size.
+ if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
+ return false; // Each Size-scaled offset must fit in an unsigned 8-bit value.
+ // A null Base (callers pass SDValue() to test the offsets alone) skips the sign-bit check.
+ if (!Base || Subtarget->hasUsableDSOffset() ||
+ Subtarget->unsafeDSOffsetFoldingEnabled())
+ return true;
+
+ // On Southern Islands, instructions with a negative base value and an offset
+ // don't seem to work.
+ return CurDAG->SignBitIsZero(Base);
+}
+
// TODO: If offset is too big, put low 16-bit into offset.
bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
SDValue &Offset0,
SDValue &Offset1) const {
- return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, false);
+ return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 4); // Size = 4: byte offsets are checked against and divided by 4.
}
bool AMDGPUDAGToDAGISel::SelectDS128Bit8ByteAligned(SDValue Addr, SDValue &Base,
SDValue &Offset0,
SDValue &Offset1) const {
- return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, true);
+ return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 8); // Size = 8: byte offsets are checked against and divided by 8.
}
bool AMDGPUDAGToDAGISel::SelectDSReadWrite2(SDValue Addr, SDValue &Base,
SDValue &Offset0, SDValue &Offset1,
- bool IsDS128) const {
+ unsigned Size) const {
SDLoc DL(Addr);
- unsigned Align = IsDS128 ? 8 : 4;
if (CurDAG->isBaseWithConstantOffset(Addr)) {
SDValue N0 = Addr.getOperand(0);
SDValue N1 = Addr.getOperand(1);
ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
- unsigned OffsetValue0 = C1->getZExtValue() / Align;
- unsigned OffsetValue1 = OffsetValue0 + 1;
+ unsigned OffsetValue0 = C1->getZExtValue();
+ unsigned OffsetValue1 = OffsetValue0 + Size;
+
// (add n0, c0)
- if (isDSOffsetLegal(N0, OffsetValue1, 8)) {
+ if (isDSOffset2Legal(N0, OffsetValue0, OffsetValue1, Size)) {
Base = N0;
- Offset0 = CurDAG->getTargetConstant(OffsetValue0, DL, MVT::i8);
- Offset1 = CurDAG->getTargetConstant(OffsetValue1, DL, MVT::i8);
+ Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8);
+ Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8);
return true;
}
} else if (Addr.getOpcode() == ISD::SUB) {
// sub C, x -> add (sub 0, x), C
if (const ConstantSDNode *C =
dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
- unsigned OffsetValue0 = C->getZExtValue() / Align;
- unsigned OffsetValue1 = OffsetValue0 + 1;
+ unsigned OffsetValue0 = C->getZExtValue();
+ unsigned OffsetValue1 = OffsetValue0 + Size;
- if (isUInt<8>(OffsetValue0)) {
+ if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
SDLoc DL(Addr);
SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
SDValue Sub =
CurDAG->getNode(ISD::SUB, DL, MVT::i32, Zero, Addr.getOperand(1));
- if (isDSOffsetLegal(Sub, OffsetValue1, 8)) {
+ if (isDSOffset2Legal(Sub, OffsetValue0, OffsetValue1, Size)) {
SmallVector<SDValue, 3> Opnds;
Opnds.push_back(Zero);
Opnds.push_back(Addr.getOperand(1));
}
MachineSDNode *MachineSub = CurDAG->getMachineNode(
- SubOp, DL, (IsDS128 ? MVT::i64 : MVT::i32), Opnds);
+ SubOp, DL, MVT::getIntegerVT(Size * 8), Opnds);
Base = SDValue(MachineSub, 0);
- Offset0 = CurDAG->getTargetConstant(OffsetValue0, DL, MVT::i8);
- Offset1 = CurDAG->getTargetConstant(OffsetValue1, DL, MVT::i8);
+ Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8);
+ Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8);
return true;
}
}
}
} else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
- unsigned OffsetValue0 = CAddr->getZExtValue() / Align;
- unsigned OffsetValue1 = OffsetValue0 + 1;
- bool OffsetIsAligned = Align * OffsetValue0 == CAddr->getZExtValue();
+ unsigned OffsetValue0 = CAddr->getZExtValue();
+ unsigned OffsetValue1 = OffsetValue0 + Size;
- if (isUInt<8>(OffsetValue0) && isUInt<8>(OffsetValue1) && OffsetIsAligned) {
+ if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
MachineSDNode *MovZero =
CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32, Zero);
Base = SDValue(MovZero, 0);
- Offset0 = CurDAG->getTargetConstant(OffsetValue0, DL, MVT::i8);
- Offset1 = CurDAG->getTargetConstant(OffsetValue1, DL, MVT::i8);
+ Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8);
+ Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8);
return true;
}
}
SDValue PtrOffset = Ptr.getOperand(1);
const APInt &OffsetVal = cast<ConstantSDNode>(PtrOffset)->getAPIntValue();
- if (isDSOffsetLegal(PtrBase, OffsetVal.getZExtValue(), 16)) {
+ if (isDSOffsetLegal(PtrBase, OffsetVal.getZExtValue())) {
N = glueCopyToM0(N, PtrBase);
Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32);
}
std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));
// TODO: Should this try to look through readfirstlane like GWS?
- if (!isDSOffsetLegal(PtrBase, Offset, 16)) {
+ if (!isDSOffsetLegal(PtrBase, Offset)) {
PtrBase = MI.getOperand(2).getReg();
Offset = 0;
}
}
bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
- int64_t Offset,
- unsigned OffsetBits) const {
- if ((OffsetBits == 16 && !isUInt<16>(Offset)) ||
- (OffsetBits == 8 && !isUInt<8>(Offset)))
+ int64_t Offset) const {
+ if (!isUInt<16>(Offset))
+ return false; // Offset must be representable as an unsigned 16-bit value.
+ // With usable DS offsets, or with unsafe folding enabled, Base is not inspected.
+ if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
+ return true;
+
+ // On Southern Islands, instructions with a negative base value and an offset
+ // don't seem to work.
+ return KnownBits->signBitIsZero(Base);
+}
+
+bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0,
+ int64_t Offset1,
+ unsigned Size) const {
+ if (Offset0 % Size != 0 || Offset1 % Size != 0)
+ return false;
+ if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
return false;
if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
if (Offset) {
- if (isDSOffsetLegal(PtrBase, Offset, 16)) {
+ if (isDSOffsetLegal(PtrBase, Offset)) {
// (add n0, c0)
return std::make_pair(PtrBase, Offset);
}
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
- return selectDSReadWrite2(Root, false);
+ return selectDSReadWrite2(Root, 4); // Size = 4: forwarded as the unit the byte offset is divided by.
}
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectDS128Bit8ByteAligned(MachineOperand &Root) const {
- return selectDSReadWrite2(Root, true);
+ return selectDSReadWrite2(Root, 8); // Size = 8: forwarded as the unit the byte offset is divided by.
}
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectDSReadWrite2(MachineOperand &Root,
- bool IsDS128) const {
+ unsigned Size) const {
Register Reg;
unsigned Offset;
- std::tie(Reg, Offset) = selectDSReadWrite2Impl(Root, IsDS128);
+ std::tie(Reg, Offset) = selectDSReadWrite2Impl(Root, Size);
return {{
[=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
[=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
std::pair<Register, unsigned>
AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root,
- bool IsDS128) const {
+ unsigned Size) const {
const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
if (!RootDef)
return std::make_pair(Root.getReg(), 0);
getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
if (Offset) {
- int64_t OffsetValue0 = Offset / (IsDS128 ? 8 : 4);
- int64_t OffsetValue1 = OffsetValue0 + 1;
- if (isDSOffsetLegal(PtrBase, OffsetValue1, (IsDS128 ? 16 : 8))) {
+ int64_t OffsetValue0 = Offset;
+ int64_t OffsetValue1 = Offset + Size;
+ if (isDSOffset2Legal(PtrBase, OffsetValue0, OffsetValue1, Size)) {
// (add n0, c0)
- return std::make_pair(PtrBase, OffsetValue0);
+ return std::make_pair(PtrBase, OffsetValue0 / Size);
}
} else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
// TODO
InstructionSelector::ComplexRendererFns
selectMUBUFScratchOffset(MachineOperand &Root) const;
- bool isDSOffsetLegal(Register Base, int64_t Offset,
- unsigned OffsetBits) const;
+ bool isDSOffsetLegal(Register Base, int64_t Offset) const;
+ bool isDSOffset2Legal(Register Base, int64_t Offset0, int64_t Offset1,
+ unsigned Size) const;
std::pair<Register, unsigned>
selectDS1Addr1OffsetImpl(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
selectDS128Bit8ByteAligned(MachineOperand &Root) const;
- std::pair<Register, unsigned>
- selectDSReadWrite2Impl(MachineOperand &Root, bool IsDS128) const;
+ /// Returns the base register and the first offset, expressed in units of
+ /// \p Size bytes, for a two-offset DS access rooted at \p Root.
+ std::pair<Register, unsigned> selectDSReadWrite2Impl(MachineOperand &Root,
+ unsigned Size) const;
+ /// Builds the operand renderers for a two-offset DS access rooted at
+ /// \p Root; \p Size is the byte size the offsets are scaled by.
InstructionSelector::ComplexRendererFns
- selectDSReadWrite2(MachineOperand &Root, bool IsDS128) const;
+ selectDSReadWrite2(MachineOperand &Root, unsigned Size) const;
std::pair<Register, int64_t>
getPtrBaseWithConstantOffset(Register Root,
; GFX7-LABEL: name: load_local_v4s32_align_8_offset_320
; GFX7: liveins: $vgpr0
; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX7: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4000, implicit $exec
+ ; GFX7: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
; GFX7: $m0 = S_MOV_B32 -1
- ; GFX7: [[DS_READ2_B64_:%[0-9]+]]:vreg_128 = DS_READ2_B64 [[COPY]], 500, 501, 0, implicit $m0, implicit $exec :: (load 16, align 8, addrspace 3)
+ ; GFX7: [[DS_READ2_B64_:%[0-9]+]]:vreg_128 = DS_READ2_B64 %2, 0, 1, 0, implicit $m0, implicit $exec :: (load 16, align 8, addrspace 3)
; GFX7: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[DS_READ2_B64_]]
; GFX9-LABEL: name: load_local_v4s32_align_8_offset_320
; GFX9: liveins: $vgpr0
; CI-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset:
; CI: ; %bb.0:
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; CI-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
+; CI-NEXT: v_sub_i32_e32 v0, vcc, 0x3fb, v0
; CI-NEXT: v_mov_b32_e32 v1, 0x7b
; CI-NEXT: v_mov_b32_e32 v2, 0
; CI-NEXT: s_mov_b32 m0, -1
-; CI-NEXT: ds_write2_b32 v0, v1, v2 offset0:254 offset1:255
+; CI-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0
+; GFX9-NEXT: v_sub_u32_e32 v0, 0x3fb, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x7b
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset0:254 offset1:255
+; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
; GFX9-NEXT: s_endpgm
%x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
%neg = sub i32 0, %x.i
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s0, s[0:1], 0x9
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; CI-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
+; CI-NEXT: v_sub_i32_e32 v0, vcc, 0x3fb, v0
; CI-NEXT: s_mov_b64 vcc, 0
; CI-NEXT: v_mov_b32_e32 v2, 0x7b
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, -1
; CI-NEXT: s_mov_b32 s1, s0
-; CI-NEXT: ds_write2_b32 v0, v2, v3 offset0:254 offset1:255
+; CI-NEXT: ds_write2_b32 v0, v2, v3 offset1:1
; CI-NEXT: buffer_store_dword v1, off, s[0:3], 0
; CI-NEXT: s_endpgm
;
; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24
; GFX9-NEXT: s_mov_b64 vcc, 0
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0
+; GFX9-NEXT: v_sub_u32_e32 v0, 0x3fb, v0
; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_div_fmas_f32 v2, v1, v1, v1
; GFX9-NEXT: v_mov_b32_e32 v1, 0x7b
-; GFX9-NEXT: ds_write2_b32 v0, v1, v3 offset0:254 offset1:255
+; GFX9-NEXT: ds_write2_b32 v0, v1, v3 offset1:1
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-UNALIGNED-NEXT: s_load_dword s0, s[0:1], 0x2c
; GFX9-UNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v0, s0, v2
-; GFX9-UNALIGNED-NEXT: ds_read2_b32 v[0:1], v0 offset0:1 offset1:2
+; GFX9-UNALIGNED-NEXT: v_add3_u32 v0, s0, v2, 5
+; GFX9-UNALIGNED-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-UNALIGNED-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-UNALIGNED-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-UNALIGNED-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v2, s0, v2
+; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v3, 5, v2
+; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v2, 9, v2
; GFX9-UNALIGNED-NEXT: s_waitcnt vmcnt(0)
-; GFX9-UNALIGNED-NEXT: ds_write2_b32 v2, v0, v1 offset0:1 offset1:2
-; GFX9-UNALIGNED-NEXT: ds_write2_b32 v2, v0, v1 offset0:2 offset1:3
+; GFX9-UNALIGNED-NEXT: ds_write2_b32 v3, v0, v1 offset1:1
+; GFX9-UNALIGNED-NEXT: ds_write2_b32 v2, v0, v1 offset1:1
; GFX9-UNALIGNED-NEXT: s_endpgm
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i