SDValue visitZERO_EXTEND(SDNode *N);
SDValue visitANY_EXTEND(SDNode *N);
SDValue visitAssertExt(SDNode *N);
+ SDValue visitAssertAlign(SDNode *N);
SDValue visitSIGN_EXTEND_INREG(SDNode *N);
SDValue visitSIGN_EXTEND_VECTOR_INREG(SDNode *N);
SDValue visitZERO_EXTEND_VECTOR_INREG(SDNode *N);
case ISD::ANY_EXTEND: return visitANY_EXTEND(N);
case ISD::AssertSext:
case ISD::AssertZext: return visitAssertExt(N);
+ case ISD::AssertAlign: return visitAssertAlign(N);
case ISD::SIGN_EXTEND_INREG: return visitSIGN_EXTEND_INREG(N);
case ISD::SIGN_EXTEND_VECTOR_INREG: return visitSIGN_EXTEND_VECTOR_INREG(N);
case ISD::ZERO_EXTEND_VECTOR_INREG: return visitZERO_EXTEND_VECTOR_INREG(N);
return SDValue();
}
+SDValue DAGCombiner::visitAssertAlign(SDNode *N) {
+ SDLoc DL(N);
+
+ Align AL = cast<AssertAlignSDNode>(N)->getAlign();
+ SDValue N0 = N->getOperand(0);
+
+ // Fold (assertalign (assertalign x, AL0), AL1) ->
+ // (assertalign x, max(AL0, AL1))
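+ // For example, (assertalign (assertalign x, 4), 8) folds to (assertalign x, 8).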
+ if (auto *AAN = dyn_cast<AssertAlignSDNode>(N0))
+ return DAG.getAssertAlign(DL, N0.getOperand(0),
+ std::max(AL, AAN->getAlign()));
+
+ // In rare cases, there are trivial arithmetic ops in the source operands. Sink
+ // this assert down to the source operands so that those arithmetic ops can be
+ // exposed to further DAG combining.
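+ // For example, given (assertalign (add x, 32), 16), the constant 32 already
+ // has at least Log2(16) known trailing zero bits, so only x needs the
+ // assertion and the node becomes (add (assertalign x, 16), 32).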
+ switch (N0.getOpcode()) {
+ default:
+ break;
+ case ISD::ADD:
+ case ISD::SUB: {
+ unsigned AlignShift = Log2(AL);
+ SDValue LHS = N0.getOperand(0);
+ SDValue RHS = N0.getOperand(1);
+ unsigned LHSAlignShift = DAG.computeKnownBits(LHS).countMinTrailingZeros();
+ unsigned RHSAlignShift = DAG.computeKnownBits(RHS).countMinTrailingZeros();
+ if (LHSAlignShift >= AlignShift || RHSAlignShift >= AlignShift) {
+ if (LHSAlignShift < AlignShift)
+ LHS = DAG.getAssertAlign(DL, LHS, AL);
+ if (RHSAlignShift < AlignShift)
+ RHS = DAG.getAssertAlign(DL, RHS, AL);
+ return DAG.getNode(N0.getOpcode(), DL, N0.getValueType(), LHS, RHS);
+ }
+ break;
+ }
+ }
+
+ return SDValue();
+}
+
/// If the result of a wider load is shifted right by N bits and then truncated
/// to a narrower type, where N is a multiple of the number of bits of the
/// narrower type, transform it to a narrower load from address + N / num of
llvm_unreachable("cannot find MemSDNode in the pattern!");
}
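+// Match the case where a 64-bit address was split into two 32-bit halves and
+// the constant offset was folded into the low half as an `or`. On success, N0
+// is set to the base the split halves originated from and N1 to the constant
+// offset node.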
+static bool getBaseWithOffsetUsingSplitOR(SelectionDAG &DAG, SDValue Addr,
+ SDValue &N0, SDValue &N1) {
+ if (Addr.getValueType() == MVT::i64 && Addr.getOpcode() == ISD::BITCAST &&
+ Addr.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
+ // As we split 64-bit `or` earlier, it's a complicated pattern to match, i.e.
+ // (i64 (bitcast (v2i32 (build_vector
+ // (or (extract_vector_elt V, 0), OFFSET),
+ // (extract_vector_elt V, 1)))))
+ SDValue Lo = Addr.getOperand(0).getOperand(0);
+ if (Lo.getOpcode() == ISD::OR && DAG.isBaseWithConstantOffset(Lo)) {
+ SDValue BaseLo = Lo.getOperand(0);
+ SDValue BaseHi = Addr.getOperand(0).getOperand(1);
+ // Check that the split base halves (Lo and Hi) come from the same vector.
+ if (BaseLo.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ BaseHi.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ BaseLo.getOperand(0) == BaseHi.getOperand(0) &&
+ // Lo is statically extracted from index 0.
+ isa<ConstantSDNode>(BaseLo.getOperand(1)) &&
+ BaseLo.getConstantOperandVal(1) == 0 &&
+ // Hi is statically extracted from index 1.
+ isa<ConstantSDNode>(BaseHi.getOperand(1)) &&
+ BaseHi.getConstantOperandVal(1) == 1) {
+ N0 = BaseLo.getOperand(0).getOperand(0);
+ N1 = Lo.getOperand(1);
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
template <bool IsSigned>
bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N,
SDValue Addr,
if (Subtarget->hasFlatInstOffsets() &&
(!Subtarget->hasFlatSegmentOffsetBug() ||
- findMemSDNode(N)->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS) &&
- CurDAG->isBaseWithConstantOffset(Addr)) {
- SDValue N0 = Addr.getOperand(0);
- SDValue N1 = Addr.getOperand(1);
- uint64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
-
- const SIInstrInfo *TII = Subtarget->getInstrInfo();
- unsigned AS = findMemSDNode(N)->getAddressSpace();
- if (TII->isLegalFLATOffset(COffsetVal, AS, IsSigned)) {
- Addr = N0;
- OffsetVal = COffsetVal;
- } else {
- // If the offset doesn't fit, put the low bits into the offset field and
- // add the rest.
-
- SDLoc DL(N);
- uint64_t ImmField;
- const unsigned NumBits = TII->getNumFlatOffsetBits(AS, IsSigned);
- if (IsSigned) {
- ImmField = SignExtend64(COffsetVal, NumBits);
-
- // Don't use a negative offset field if the base offset is positive.
- // Since the scheduler currently relies on the offset field, doing so
- // could result in strange scheduling decisions.
-
- // TODO: Should we not do this in the opposite direction as well?
- if (static_cast<int64_t>(COffsetVal) > 0) {
- if (static_cast<int64_t>(ImmField) < 0) {
- const uint64_t OffsetMask = maskTrailingOnes<uint64_t>(NumBits - 1);
- ImmField = COffsetVal & OffsetMask;
+ findMemSDNode(N)->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS)) {
+ SDValue N0, N1;
+ if (CurDAG->isBaseWithConstantOffset(Addr)) {
+ N0 = Addr.getOperand(0);
+ N1 = Addr.getOperand(1);
+ } else if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, N0, N1)) {
+ assert(N0 && N1 && isa<ConstantSDNode>(N1));
+ }
+ if (N0 && N1) {
+ uint64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
+
+ const SIInstrInfo *TII = Subtarget->getInstrInfo();
+ unsigned AS = findMemSDNode(N)->getAddressSpace();
+ if (TII->isLegalFLATOffset(COffsetVal, AS, IsSigned)) {
+ Addr = N0;
+ OffsetVal = COffsetVal;
+ } else {
+ // If the offset doesn't fit, put the low bits into the offset field and
+ // add the rest.
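+ // For example, with a hypothetical 12-bit unsigned offset field, an offset
+ // of 0x1234 would be split into ImmField = 0x234 and RemainderOffset =
+ // 0x1000, with the remainder added back via the 64-bit add built below.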
+
+ SDLoc DL(N);
+ uint64_t ImmField;
+ const unsigned NumBits = TII->getNumFlatOffsetBits(AS, IsSigned);
+ if (IsSigned) {
+ ImmField = SignExtend64(COffsetVal, NumBits);
+
+ // Don't use a negative offset field if the base offset is positive.
+ // Since the scheduler currently relies on the offset field, doing so
+ // could result in strange scheduling decisions.
+
+ // TODO: Should we not do this in the opposite direction as well?
+ if (static_cast<int64_t>(COffsetVal) > 0) {
+ if (static_cast<int64_t>(ImmField) < 0) {
+ const uint64_t OffsetMask =
+ maskTrailingOnes<uint64_t>(NumBits - 1);
+ ImmField = COffsetVal & OffsetMask;
+ }
}
+ } else {
+ // TODO: Should we do this for a negative offset?
+ const uint64_t OffsetMask = maskTrailingOnes<uint64_t>(NumBits);
+ ImmField = COffsetVal & OffsetMask;
}
- } else {
- // TODO: Should we do this for a negative offset?
- const uint64_t OffsetMask = maskTrailingOnes<uint64_t>(NumBits);
- ImmField = COffsetVal & OffsetMask;
- }
- uint64_t RemainderOffset = COffsetVal - ImmField;
+ uint64_t RemainderOffset = COffsetVal - ImmField;
- assert(TII->isLegalFLATOffset(ImmField, AS, IsSigned));
- assert(RemainderOffset + ImmField == COffsetVal);
+ assert(TII->isLegalFLATOffset(ImmField, AS, IsSigned));
+ assert(RemainderOffset + ImmField == COffsetVal);
- OffsetVal = ImmField;
+ OffsetVal = ImmField;
- // TODO: Should this try to use a scalar add pseudo if the base address is
- // uniform and saddr is usable?
- SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
- SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
+ // TODO: Should this try to use a scalar add pseudo if the base address
+ // is uniform and saddr is usable?
+ SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
+ SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
- SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
- DL, MVT::i32, N0, Sub0);
- SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
- DL, MVT::i32, N0, Sub1);
+ SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
+ MVT::i32, N0, Sub0);
+ SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
+ MVT::i32, N0, Sub1);
- SDValue AddOffsetLo
- = getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
- SDValue AddOffsetHi
- = getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);
+ SDValue AddOffsetLo =
+ getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
+ SDValue AddOffsetHi =
+ getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);
- SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);
- SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
+ SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);
+ SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
- SDNode *Add = CurDAG->getMachineNode(
- AMDGPU::V_ADD_I32_e64, DL, VTs,
- {AddOffsetLo, SDValue(N0Lo, 0), Clamp});
+ SDNode *Add =
+ CurDAG->getMachineNode(AMDGPU::V_ADD_I32_e64, DL, VTs,
+ {AddOffsetLo, SDValue(N0Lo, 0), Clamp});
- SDNode *Addc = CurDAG->getMachineNode(
- AMDGPU::V_ADDC_U32_e64, DL, VTs,
- {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});
+ SDNode *Addc = CurDAG->getMachineNode(
+ AMDGPU::V_ADDC_U32_e64, DL, VTs,
+ {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});
- SDValue RegSequenceArgs[] = {
- CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32),
- SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1
- };
+ SDValue RegSequenceArgs[] = {
+ CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32),
+ SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1};
- Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
- MVT::i64, RegSequenceArgs), 0);
+ Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
+ MVT::i64, RegSequenceArgs),
+ 0);
+ }
}
}
// A 32-bit (address + offset) should not cause unsigned 32-bit integer
// wraparound, because s_load instructions perform the addition in 64 bits.
if ((Addr.getValueType() != MVT::i32 ||
- Addr->getFlags().hasNoUnsignedWrap()) &&
- (CurDAG->isBaseWithConstantOffset(Addr) ||
- Addr.getOpcode() == ISD::ADD)) {
- SDValue N0 = Addr.getOperand(0);
- SDValue N1 = Addr.getOperand(1);
-
- if (SelectSMRDOffset(N1, Offset, Imm)) {
- SBase = Expand32BitAddress(N0);
- return true;
+ Addr->getFlags().hasNoUnsignedWrap())) {
+ SDValue N0, N1;
+ // Extract the base and offset if possible.
+ if (CurDAG->isBaseWithConstantOffset(Addr) ||
+ Addr.getOpcode() == ISD::ADD) {
+ N0 = Addr.getOperand(0);
+ N1 = Addr.getOperand(1);
+ } else if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, N0, N1)) {
+ assert(N0 && N1 && isa<ConstantSDNode>(N1));
+ }
+ if (N0 && N1) {
+ if (SelectSMRDOffset(N1, Offset, Imm)) {
+ SBase = Expand32BitAddress(N0);
+ return true;
+ }
}
}
SBase = Expand32BitAddress(Addr);
define amdgpu_kernel void @local_store_i55(i55 addrspace(3)* %ptr, i55 %arg) #0 {
; HAWAII-LABEL: local_store_i55:
; HAWAII: ; %bb.0:
-; HAWAII-NEXT: s_add_u32 s0, s4, 14
-; HAWAII-NEXT: s_addc_u32 s1, s5, 0
+; HAWAII-NEXT: s_or_b32 s0, s4, 14
; HAWAII-NEXT: v_mov_b32_e32 v0, s0
-; HAWAII-NEXT: v_mov_b32_e32 v1, s1
+; HAWAII-NEXT: v_mov_b32_e32 v1, s5
; HAWAII-NEXT: flat_load_ubyte v0, v[0:1]
; HAWAII-NEXT: s_load_dword s0, s[4:5], 0x0
; HAWAII-NEXT: s_load_dword s1, s[4:5], 0x2
;
; FIJI-LABEL: local_store_i55:
; FIJI: ; %bb.0:
+; FIJI-NEXT: s_or_b32 s0, s4, 14
+; FIJI-NEXT: v_mov_b32_e32 v0, s0
+; FIJI-NEXT: v_mov_b32_e32 v1, s5
+; FIJI-NEXT: flat_load_ubyte v0, v[0:1]
; FIJI-NEXT: s_load_dword s0, s[4:5], 0x0
-; FIJI-NEXT: s_load_dword s2, s[4:5], 0x8
-; FIJI-NEXT: s_load_dword s1, s[4:5], 0xc
+; FIJI-NEXT: s_load_dword s1, s[4:5], 0x8
+; FIJI-NEXT: s_load_dword s2, s[4:5], 0xc
; FIJI-NEXT: s_mov_b32 m0, -1
; FIJI-NEXT: s_waitcnt lgkmcnt(0)
-; FIJI-NEXT: v_mov_b32_e32 v2, s0
-; FIJI-NEXT: s_and_b32 s3, s1, 0xffff
-; FIJI-NEXT: s_add_u32 s0, s4, 14
+; FIJI-NEXT: v_mov_b32_e32 v1, s0
; FIJI-NEXT: v_mov_b32_e32 v3, s1
-; FIJI-NEXT: s_addc_u32 s1, s5, 0
-; FIJI-NEXT: v_mov_b32_e32 v0, s0
-; FIJI-NEXT: v_mov_b32_e32 v1, s1
-; FIJI-NEXT: flat_load_ubyte v0, v[0:1]
-; FIJI-NEXT: ds_write_b16 v2, v3 offset:4
-; FIJI-NEXT: v_mov_b32_e32 v3, s2
-; FIJI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1)
+; FIJI-NEXT: s_and_b32 s3, s2, 0xffff
+; FIJI-NEXT: v_mov_b32_e32 v2, s2
+; FIJI-NEXT: ds_write_b16 v1, v2 offset:4
+; FIJI-NEXT: s_waitcnt vmcnt(0)
; FIJI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; FIJI-NEXT: v_or_b32_e32 v0, s3, v0
; FIJI-NEXT: v_bfe_u32 v0, v0, 16, 7
-; FIJI-NEXT: ds_write_b8 v2, v0 offset:6
-; FIJI-NEXT: ds_write_b32 v2, v3
+; FIJI-NEXT: ds_write_b8 v1, v0 offset:6
+; FIJI-NEXT: ds_write_b32 v1, v3
; FIJI-NEXT: s_endpgm
;
; GFX9-LABEL: local_store_i55: