CurDAG->RemoveDeadNodes();
}
+// Returns true if N is a MachineSDNode that has a reg+simm12 memory operand.
+// The operand indices of the base pointer and the offset are returned in
+// BaseOpIdx and OffsetOpIdx.
+static bool hasMemOffset(SDNode *N, unsigned &BaseOpIdx,
+ unsigned &OffsetOpIdx) {
+ switch (N->getMachineOpcode()) {
+ case RISCV::LB:
+ case RISCV::LH:
+ case RISCV::LW:
+ case RISCV::LBU:
+ case RISCV::LHU:
+ case RISCV::LWU:
+ case RISCV::LD:
+ case RISCV::FLH:
+ case RISCV::FLW:
+ case RISCV::FLD:
+ BaseOpIdx = 0;
+ OffsetOpIdx = 1;
+ return true;
+ case RISCV::SB:
+ case RISCV::SH:
+ case RISCV::SW:
+ case RISCV::SD:
+ case RISCV::FSH:
+ case RISCV::FSW:
+ case RISCV::FSD:
+ BaseOpIdx = 1;
+ OffsetOpIdx = 2;
+ return true;
+ }
+
+ return false;
+}
+
static SDNode *selectImmWithConstantPool(SelectionDAG *CurDAG, const SDLoc &DL,
const MVT VT, int64_t Imm,
const RISCVSubtarget &Subtarget) {
ReplaceNode(Node, CurDAG->getMachineNode(RISCV::ADDI, DL, VT, TFI, Imm));
return;
}
+ case ISD::ADD: {
+ // Try to select an ADD with an immediate operand that is used as a memory
+ // address to (ADDI (ADD X, Imm-Lo12), Lo12) if doing so allows the ADDI to
+ // be removed by doPeepholeLoadStoreADDI.
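+ //
+ // For example (the same numbers show up in the store tests updated below):
+ // Offset = 80000 (0x13880) gives Lo12 = SignExtend64<12>(Offset) = -1920, so
+ // we emit (ADDI (ADD X, 81920), -1920). 81920 fits a single LUI, and the
+ // -1920 can then be folded into every load/store that uses this address.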
+
+ // The RHS should be an immediate.
+ auto *N1C = dyn_cast<ConstantSDNode>(Node->getOperand(1));
+ if (!N1C)
+ break;
+
+ int64_t Offset = N1C->getSExtValue();
+ int64_t Lo12 = SignExtend64<12>(Offset);
+
+ // Don't do this if the lower 12 bits are 0 or we could use ADDI directly.
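+ // (e.g. Offset = 0x2000 has no low bits to fold, and Offset = 2047 can
+ // already be handled with a single ADDI.)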
+ if (Lo12 == 0 || isInt<12>(Offset))
+ break;
+
+ // Don't do this if we can use a pair of ADDIs.
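+ // (e.g. Offset = 3000 can be built as (ADDI (ADDI X, 1500), 1500) with no
+ // LUI at all.)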
+ if (isInt<12>(Offset / 2) && isInt<12>(Offset - Offset / 2))
+ break;
+
+ bool AllPointerUses = true;
+ for (auto UI = Node->use_begin(), UE = Node->use_end(); UI != UE; ++UI) {
+ SDNode *User = *UI;
+
+ // Is this user a memory instruction with a reg+simm12 address operand that
+ // uses this ADD as its base pointer?
+ unsigned BaseOpIdx, OffsetOpIdx;
+ if (!User->isMachineOpcode() ||
+ !hasMemOffset(User, BaseOpIdx, OffsetOpIdx) ||
+ UI.getOperandNo() != BaseOpIdx) {
+ AllPointerUses = false;
+ break;
+ }
+
+ // If the memory instruction already has an offset, make sure the combined
+ // offset is foldable.
+ int64_t MemOffs =
+ cast<ConstantSDNode>(User->getOperand(OffsetOpIdx))->getSExtValue();
+ MemOffs += Lo12;
+ if (!isInt<12>(MemOffs)) {
+ AllPointerUses = false;
+ break;
+ }
+ }
+
+ if (!AllPointerUses)
+ break;
+
+ Offset -= Lo12;
+ // Restore sign bits for RV32.
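+ // (e.g. in the RV32 lw_sw_really_far_local test below, Offset = 0x7FFFF800
+ // and Lo12 = -2048 give Offset - Lo12 = 0x80000000, which must be
+ // reinterpreted as INT32_MIN before it is materialized for RV32.)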
+ if (!Subtarget->is64Bit())
+ Offset = SignExtend64<32>(Offset);
+
+ // Emit (ADDI (ADD X, Hi), Lo)
+ SDNode *Imm = selectImm(CurDAG, DL, VT, Offset, *Subtarget);
+ SDNode *ADD = CurDAG->getMachineNode(RISCV::ADD, DL, VT,
+ Node->getOperand(0), SDValue(Imm, 0));
+ SDNode *ADDI =
+ CurDAG->getMachineNode(RISCV::ADDI, DL, VT, SDValue(ADD, 0),
+ CurDAG->getTargetConstant(Lo12, DL, VT));
+ ReplaceNode(Node, ADDI);
+ return;
+ }
case ISD::SRL: {
// Optimize (srl (and X, C2), C) ->
// (srli (slli X, XLen-C3), (XLen-C3) + C)
// -> (store val, (add base, src), off1+off2)
// This is possible when off1+off2 fits a 12-bit immediate.
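// For example, with the ISD::ADD change above, lw_sw_far_local (see the test
// diff below) selects its address as (ADDI (ADD a0, 16384), -4); this peephole
// then folds the ADDI into the memory instructions, yielding lw/sw with a -4
// offset from the ADD result.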
bool RISCVDAGToDAGISel::doPeepholeLoadStoreADDI(SDNode *N) {
- int OffsetOpIdx;
- int BaseOpIdx;
-
- // Only attempt this optimisation for I-type loads and S-type stores.
- switch (N->getMachineOpcode()) {
- default:
+ unsigned OffsetOpIdx, BaseOpIdx;
+ if (!hasMemOffset(N, BaseOpIdx, OffsetOpIdx))
return false;
- case RISCV::LB:
- case RISCV::LH:
- case RISCV::LW:
- case RISCV::LBU:
- case RISCV::LHU:
- case RISCV::LWU:
- case RISCV::LD:
- case RISCV::FLH:
- case RISCV::FLW:
- case RISCV::FLD:
- BaseOpIdx = 0;
- OffsetOpIdx = 1;
- break;
- case RISCV::SB:
- case RISCV::SH:
- case RISCV::SW:
- case RISCV::SD:
- case RISCV::FSH:
- case RISCV::FSW:
- case RISCV::FSD:
- BaseOpIdx = 1;
- OffsetOpIdx = 2;
- break;
- }
if (!isa<ConstantSDNode>(N->getOperand(OffsetOpIdx)))
return false;
if (!Base.isMachineOpcode())
return false;
- // There is a ADD between ADDI and load/store. We can only fold ADDI that
- // do not have a FrameIndex operand.
- SDValue Add;
- unsigned AddBaseIdx;
- if (Base.getMachineOpcode() == RISCV::ADD && Base.hasOneUse()) {
- Add = Base;
- SDValue Op0 = Base.getOperand(0);
- SDValue Op1 = Base.getOperand(1);
- if (Op0.isMachineOpcode() && Op0.getMachineOpcode() == RISCV::ADDI &&
- !isa<FrameIndexSDNode>(Op0.getOperand(0)) &&
- isa<ConstantSDNode>(Op0.getOperand(1))) {
- AddBaseIdx = 1;
- Base = Op0;
- } else if (Op1.isMachineOpcode() && Op1.getMachineOpcode() == RISCV::ADDI &&
- !isa<FrameIndexSDNode>(Op1.getOperand(0)) &&
- isa<ConstantSDNode>(Op1.getOperand(1))) {
- AddBaseIdx = 0;
- Base = Op1;
- } else if (Op1.isMachineOpcode() &&
- Op1.getMachineOpcode() == RISCV::ADDIW &&
- isa<ConstantSDNode>(Op1.getOperand(1)) &&
- Op1.getOperand(0).isMachineOpcode() &&
- Op1.getOperand(0).getMachineOpcode() == RISCV::LUI) {
- // We found an LUI+ADDIW constant materialization. We might be able to
- // fold the ADDIW offset if it could be treated as ADDI.
- // Emulate the constant materialization to see if the result would be
- // a simm32 if ADDI was used instead of ADDIW.
-
- // First the LUI.
- uint64_t Imm = Op1.getOperand(0).getConstantOperandVal(0);
- Imm <<= 12;
- Imm = SignExtend64<32>(Imm);
-
- // Then the ADDI.
- uint64_t LoImm = cast<ConstantSDNode>(Op1.getOperand(1))->getSExtValue();
- Imm += LoImm;
-
- // If the result isn't a simm32, we can't do the optimization.
- if (!isInt<32>(Imm))
- return false;
-
- AddBaseIdx = 0;
- Base = Op1;
- } else
- return false;
- } else if (Base.getMachineOpcode() == RISCV::ADDI) {
- // If the base is an ADDI, we can merge it in to the load/store.
- } else
+ // If the base is an ADDI, we can merge it into the load/store.
+ if (Base.getMachineOpcode() != RISCV::ADDI)
return false;
SDValue ImmOperand = Base.getOperand(1);
LLVM_DEBUG(N->dump(CurDAG));
LLVM_DEBUG(dbgs() << "\n");
- if (Add)
- Add = SDValue(CurDAG->UpdateNodeOperands(Add.getNode(),
- Add.getOperand(AddBaseIdx),
- Base.getOperand(0)),
- 0);
-
// Modify the offset operand of the load/store.
if (BaseOpIdx == 0) { // Load
- if (Add)
- N = CurDAG->UpdateNodeOperands(N, Add, ImmOperand, N->getOperand(2));
- else
- N = CurDAG->UpdateNodeOperands(N, Base.getOperand(0), ImmOperand,
- N->getOperand(2));
+ N = CurDAG->UpdateNodeOperands(N, Base.getOperand(0), ImmOperand,
+ N->getOperand(2));
} else { // Store
- if (Add)
- N = CurDAG->UpdateNodeOperands(N, N->getOperand(0), Add, ImmOperand,
- N->getOperand(3));
- else
- N = CurDAG->UpdateNodeOperands(N, N->getOperand(0), Base.getOperand(0),
- ImmOperand, N->getOperand(3));
+ N = CurDAG->UpdateNodeOperands(N, N->getOperand(0), Base.getOperand(0),
+ ImmOperand, N->getOperand(3));
}
return true;
; RV32I-LABEL: lw_sw_far_local:
; RV32I: # %bb.0:
; RV32I-NEXT: lui a2, 4
-; RV32I-NEXT: addi a2, a2, -4
; RV32I-NEXT: add a2, a0, a2
-; RV32I-NEXT: lw a0, 0(a2)
-; RV32I-NEXT: sw a1, 0(a2)
+; RV32I-NEXT: lw a0, -4(a2)
+; RV32I-NEXT: sw a1, -4(a2)
; RV32I-NEXT: ret
%1 = getelementptr inbounds i32, i32* %a, i64 4095
%2 = load volatile i32, i32* %1
; RV32I-LABEL: lw_sw_really_far_local:
; RV32I: # %bb.0:
; RV32I-NEXT: lui a2, 524288
-; RV32I-NEXT: addi a2, a2, -2048
; RV32I-NEXT: add a2, a0, a2
-; RV32I-NEXT: lw a0, 0(a2)
-; RV32I-NEXT: sw a1, 0(a2)
+; RV32I-NEXT: lw a0, -2048(a2)
+; RV32I-NEXT: sw a1, -2048(a2)
; RV32I-NEXT: ret
%1 = getelementptr inbounds i32, i32* %a, i32 536870400
%2 = load volatile i32, i32* %1
; RV64I-LABEL: lw_sw_far_local:
; RV64I: # %bb.0:
; RV64I-NEXT: lui a2, 8
-; RV64I-NEXT: addiw a2, a2, -8
; RV64I-NEXT: add a2, a0, a2
-; RV64I-NEXT: ld a0, 0(a2)
-; RV64I-NEXT: sd a1, 0(a2)
+; RV64I-NEXT: ld a0, -8(a2)
+; RV64I-NEXT: sd a1, -8(a2)
; RV64I-NEXT: ret
%1 = getelementptr inbounds i64, i64* %a, i64 4095
%2 = load volatile i64, i64* %1
define i64 @lw_really_far_local(i64* %a) {
; RV64I-LABEL: lw_really_far_local:
; RV64I: # %bb.0:
-; RV64I-NEXT: lui a1, 524288
-; RV64I-NEXT: addiw a1, a1, -2048
+; RV64I-NEXT: li a1, 1
+; RV64I-NEXT: slli a1, a1, 31
; RV64I-NEXT: add a0, a0, a1
-; RV64I-NEXT: ld a0, 0(a0)
+; RV64I-NEXT: ld a0, -2048(a0)
; RV64I-NEXT: ret
%1 = getelementptr inbounds i64, i64* %a, i64 268435200
%2 = load volatile i64, i64* %1
define void @st_really_far_local(i64* %a, i64 %b) {
; RV64I-LABEL: st_really_far_local:
; RV64I: # %bb.0:
-; RV64I-NEXT: lui a2, 524288
-; RV64I-NEXT: addiw a2, a2, -2048
+; RV64I-NEXT: li a2, 1
+; RV64I-NEXT: slli a2, a2, 31
; RV64I-NEXT: add a0, a0, a2
-; RV64I-NEXT: sd a1, 0(a0)
+; RV64I-NEXT: sd a1, -2048(a0)
; RV64I-NEXT: ret
%1 = getelementptr inbounds i64, i64* %a, i64 268435200
store i64 %b, i64* %1
define i64 @lw_sw_really_far_local(i64* %a, i64 %b) {
; RV64I-LABEL: lw_sw_really_far_local:
; RV64I: # %bb.0:
-; RV64I-NEXT: lui a2, 524288
-; RV64I-NEXT: addiw a2, a2, -2048
+; RV64I-NEXT: li a2, 1
+; RV64I-NEXT: slli a2, a2, 31
; RV64I-NEXT: add a2, a0, a2
-; RV64I-NEXT: ld a0, 0(a2)
-; RV64I-NEXT: sd a1, 0(a2)
+; RV64I-NEXT: ld a0, -2048(a2)
+; RV64I-NEXT: sd a1, -2048(a2)
; RV64I-NEXT: ret
%1 = getelementptr inbounds i64, i64* %a, i64 268435200
%2 = load volatile i64, i64* %1
; RV32I: # %bb.0: # %entry
; RV32I-NEXT: lw a0, 0(a0)
; RV32I-NEXT: lui a2, 20
-; RV32I-NEXT: addi a2, a2, -1920
; RV32I-NEXT: add a1, a1, a2
; RV32I-NEXT: add a0, a0, a2
; RV32I-NEXT: li a2, 2
-; RV32I-NEXT: sw a2, 0(a0)
+; RV32I-NEXT: sw a2, -1920(a0)
; RV32I-NEXT: li a3, 1
-; RV32I-NEXT: sw a3, 4(a0)
-; RV32I-NEXT: sw a3, 0(a1)
-; RV32I-NEXT: sw a2, 4(a1)
+; RV32I-NEXT: sw a3, -1916(a0)
+; RV32I-NEXT: sw a3, -1920(a1)
+; RV32I-NEXT: sw a2, -1916(a1)
; RV32I-NEXT: ret
;
; RV64I-LABEL: test1:
; RV64I: # %bb.0: # %entry
; RV64I-NEXT: ld a0, 0(a0)
; RV64I-NEXT: lui a2, 20
-; RV64I-NEXT: addiw a2, a2, -1920
; RV64I-NEXT: add a1, a1, a2
; RV64I-NEXT: add a0, a0, a2
; RV64I-NEXT: li a2, 2
-; RV64I-NEXT: sw a2, 0(a0)
+; RV64I-NEXT: sw a2, -1920(a0)
; RV64I-NEXT: li a3, 1
-; RV64I-NEXT: sw a3, 4(a0)
-; RV64I-NEXT: sw a3, 0(a1)
-; RV64I-NEXT: sw a2, 4(a1)
+; RV64I-NEXT: sw a3, -1916(a0)
+; RV64I-NEXT: sw a3, -1920(a1)
+; RV64I-NEXT: sw a2, -1916(a1)
; RV64I-NEXT: ret
entry:
%s = load [65536 x i32]*, [65536 x i32]** %sp
}
; GEPs have been manually split so the base GEP does not get used by any memory
-; instructions. Make sure we use a small offset in each of the stores.
+; instructions. Make sure we use a common base and fold an offset into each of
+; the stores.
define void @test3([65536 x i32]* %t) {
; RV32I-LABEL: test3:
; RV32I: # %bb.0: # %entry
; RV32I-NEXT: lui a1, 20
-; RV32I-NEXT: addi a1, a1, -1920
; RV32I-NEXT: add a0, a0, a1
; RV32I-NEXT: li a1, 2
-; RV32I-NEXT: sw a1, 4(a0)
+; RV32I-NEXT: sw a1, -1916(a0)
; RV32I-NEXT: li a1, 3
-; RV32I-NEXT: sw a1, 8(a0)
+; RV32I-NEXT: sw a1, -1912(a0)
; RV32I-NEXT: ret
;
; RV64I-LABEL: test3:
; RV64I: # %bb.0: # %entry
; RV64I-NEXT: lui a1, 20
-; RV64I-NEXT: addiw a1, a1, -1920
; RV64I-NEXT: add a0, a0, a1
; RV64I-NEXT: li a1, 2
-; RV64I-NEXT: sw a1, 4(a0)
+; RV64I-NEXT: sw a1, -1916(a0)
; RV64I-NEXT: li a1, 3
-; RV64I-NEXT: sw a1, 8(a0)
+; RV64I-NEXT: sw a1, -1912(a0)
; RV64I-NEXT: ret
entry:
%0 = bitcast [65536 x i32]* %t to i8*