return isUnscaledLdSt(MI->getOpcode());
}
+// Is this a candidate for ld/st merging or pairing? For example, we don't
+// touch volatiles or load/stores that have a hint to avoid pair formation.
+bool AArch64InstrInfo::isCandidateToMergeOrPair(MachineInstr *MI) const {
+ // If this is a volatile load/store, don't mess with it.
+ if (MI->hasOrderedMemoryRef())
+ return false;
+
+ // Make sure this is a reg+imm (as opposed to an address reloc).
+ assert(MI->getOperand(1).isReg() && "Expected a reg operand.");
+ if (!MI->getOperand(2).isImm())
+ return false;
+
+ // Can't merge/pair if the instruction modifies the base register.
+ // e.g., ldr x0, [x0]
+ unsigned BaseReg = MI->getOperand(1).getReg();
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ if (MI->modifiesRegister(BaseReg, TRI))
+ return false;
+
+ // Check if this load/store has a hint to avoid pair formation.
+ // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
+ if (isLdStPairSuppressed(MI))
+ return false;
+
+ return true;
+}
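
To make the four filters concrete, here is a minimal standalone sketch; the MemAccess struct and its field names are invented stand-ins for the MachineInstr queries above (hasOrderedMemoryRef, the operand checks, modifiesRegister, and isLdStPairSuppressed), not LLVM API:

// Invented stand-ins for the state the real checks read off a MachineInstr.
struct MemAccess {
  bool Ordered;        // volatile/atomic -> hasOrderedMemoryRef()
  bool HasImmOffset;   // reg+imm form, not an address relocation
  unsigned BaseReg;    // base register (operand 1)
  unsigned DefReg;     // a register the instruction writes, 0 if none
  bool PairSuppressed; // hint set by AArch64StorePairSuppress
};

static bool isCandidate(const MemAccess &A) {
  if (A.Ordered)
    return false; // Don't touch volatiles.
  if (!A.HasImmOffset)
    return false; // Need a reg+imm address.
  if (A.DefReg == A.BaseReg)
    return false; // e.g., ldr x0, [x0] modifies its own base.
  if (A.PairSuppressed)
    return false; // Pairing was hinted to be unprofitable.
  return true;
}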
+
bool AArch64InstrInfo::getMemOpBaseRegImmOfs(
MachineInstr *LdSt, unsigned &BaseReg, int64_t &Offset,
const TargetRegisterInfo *TRI) const {
case AArch64::LDRQui:
case AArch64::LDRXui:
case AArch64::LDRWui:
+ case AArch64::LDRSWui:
+ // Unscaled instructions.
+ case AArch64::LDURSi:
+ case AArch64::LDURDi:
+ case AArch64::LDURQi:
+ case AArch64::LDURWi:
+ case AArch64::LDURXi:
+ case AArch64::LDURSWi:
unsigned Width;
return getMemOpBaseRegImmOfsWidth(LdSt, BaseReg, Offset, Width, TRI);
};
break;
case AArch64::LDRWui:
case AArch64::LDRSui:
+ case AArch64::LDRSWui:
case AArch64::STRWui:
case AArch64::STRSui:
Scale = Width = 4;
return true;
}
+// Scale the unscaled offsets. Returns false if the unscaled offset can't be
+// scaled.
+static bool scaleOffset(unsigned Opc, int64_t &Offset) {
+ unsigned OffsetStride = 1;
+ switch (Opc) {
+ default:
+ return false;
+ case AArch64::LDURQi:
+ OffsetStride = 16;
+ break;
+ case AArch64::LDURXi:
+ case AArch64::LDURDi:
+ OffsetStride = 8;
+ break;
+ case AArch64::LDURWi:
+ case AArch64::LDURSi:
+ case AArch64::LDURSWi:
+ OffsetStride = 4;
+ break;
+ }
+ // If the byte-offset isn't a multiple of the stride, we can't scale this
+ // offset.
+ if (Offset % OffsetStride != 0)
+ return false;
+
+ // Convert the byte offset used by the unscaled instructions into an
+ // "element" offset used by the scaled pair load/store instructions.
+ Offset /= OffsetStride;
+ return true;
+}
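
As a worked example of the scaling, assuming the 8-byte stride of the LDURXi/LDURDi case above (scaleOffsetX is a hypothetical standalone mirror of that switch arm, not the patch's function):

#include <cassert>
#include <cstdint>

// Hypothetical mirror of the 8-byte-stride case (LDURXi/LDURDi).
static bool scaleOffsetX(int64_t &Offset) {
  if (Offset % 8 != 0)
    return false; // Byte offset isn't a multiple of the element size.
  Offset /= 8;    // Byte offset -> element offset.
  return true;
}

int main() {
  int64_t Off = -16;          // as in: ldur x1, [x0, #-16]
  assert(scaleOffsetX(Off));
  assert(Off == -2);          // -16 bytes == element offset -2
  int64_t Bad = 12;           // not 8-byte aligned
  assert(!scaleOffsetX(Bad)); // can't be expressed as an element count
  return 0;
}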
+
+static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) {
+ if (FirstOpc == SecondOpc)
+ return true;
+ // We can also pair sign-extend and zero-extend instructions.
+ switch (FirstOpc) {
+ default:
+ // These instructions can't be paired based on their opcodes.
+ return false;
+ case AArch64::LDRWui:
+ case AArch64::LDURWi:
+ return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi;
+ case AArch64::LDRSWui:
+ case AArch64::LDURSWi:
+ return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi;
+ }
+}
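
The practical effect is that a zero-extending 32-bit load (LDRWui/LDURWi) and a sign-extending one (LDRSWui/LDURSWi) can still become a single pair. A standalone sketch with a stand-in Opc enum (illustrative only; the real function switches over AArch64's generated opcode enum):

// Stand-in opcode enum; the real code uses AArch64's opcodes.
enum Opc { LDRWui, LDURWi, LDRSWui, LDURSWi, LDRXui };

static bool canPair(Opc First, Opc Second) {
  if (First == Second)
    return true;
  switch (First) {
  case LDRWui:  // zero-extending 32-bit loads pair with
  case LDURWi:  // their sign-extending counterparts...
    return Second == LDRSWui || Second == LDURSWi;
  case LDRSWui: // ...and vice versa.
  case LDURSWi:
    return Second == LDRWui || Second == LDURWi;
  default:
    return false; // Any other mixed combination can't be paired.
  }
}

The ldp_half_sext_zext_int and ldp_half_zext_sext_int tests below exercise exactly these mixed combinations.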
+
/// Detect opportunities for ldp/stp formation.
///
/// Only called for LdSt for which getMemOpBaseRegImmOfs returns true.
// Only cluster up to a single pair.
if (NumLoads > 1)
return false;
- if (FirstLdSt->getOpcode() != SecondLdSt->getOpcode())
+
+ // Can we pair these instructions based on their opcodes?
+ unsigned FirstOpc = FirstLdSt->getOpcode();
+ unsigned SecondOpc = SecondLdSt->getOpcode();
+ if (!canPairLdStOpc(FirstOpc, SecondOpc))
+ return false;
+
+ // For example, we can't merge volatiles or load/stores that have a hint
+ // to avoid pair formation.
+ if (!isCandidateToMergeOrPair(FirstLdSt) ||
+ !isCandidateToMergeOrPair(SecondLdSt))
+ return false;
+
+ // isCandidateToMergeOrPair guarantees that operand 2 is an immediate.
+ int64_t Offset1 = FirstLdSt->getOperand(2).getImm();
+ if (isUnscaledLdSt(FirstOpc) && !scaleOffset(FirstOpc, Offset1))
return false;
- // getMemOpBaseRegImmOfs guarantees that oper 2 isImm.
- unsigned Ofs1 = FirstLdSt->getOperand(2).getImm();
- // Allow 6 bits of positive range.
- if (Ofs1 > 64)
+
+ int64_t Offset2 = SecondLdSt->getOperand(2).getImm();
+ if (isUnscaledLdSt(SecondOpc) && !scaleOffset(SecondOpc, Offset2))
return false;
+
+ // Pairwise instructions have a 7-bit signed offset field.
+ if (Offset1 > 63 || Offset1 < -64)
+ return false;
+
// The caller should already have ordered First/SecondLdSt by offset.
- unsigned Ofs2 = SecondLdSt->getOperand(2).getImm();
- return Ofs1 + 1 == Ofs2;
+ assert(Offset1 <= Offset2 && "Caller should have ordered offsets.");
+ return Offset1 + 1 == Offset2;
}
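
Tracing the numbers for the unscaled case: two LDURWi loads at byte offsets -8 and -4 scale (stride 4) to element offsets -2 and -1; only the lower offset is encoded in the pair instruction, -2 fits the signed 7-bit field, and the offsets are adjacent, so the loads cluster (the ldur_int test below). A hypothetical standalone recap of just the numeric checks:

#include <cassert>
#include <cstdint>

// Recap of the numeric checks. Only the lower offset is encoded in the
// pair instruction, so only Offset1 is range-checked.
static bool offsetsAllowPairing(int64_t Offset1, int64_t Offset2) {
  if (Offset1 > 63 || Offset1 < -64) // signed 7-bit immediate
    return false;
  return Offset1 + 1 == Offset2;     // elements must be adjacent
}

int main() {
  assert(offsetsAllowPairing(-2, -1));  // the ldur_int case (bytes -8/-4)
  assert(!offsetsAllowPairing(70, 71)); // outside the 7-bit range
  assert(!offsetsAllowPairing(0, 2));   // not adjacent
  return 0;
}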
bool AArch64InstrInfo::shouldScheduleAdjacent(MachineInstr *First,
/// Return true if this is an unscaled load/store.
bool isUnscaledLdSt(MachineInstr *MI) const;
+ /// Return true if this is a load/store that can potentially be paired/merged.
+ bool isCandidateToMergeOrPair(MachineInstr *MI) const;
+
/// Hint that pairing the given load or store is unprofitable.
void suppressLdStPair(MachineInstr *MI) const;
mergeUpdateInsn(MachineBasicBlock::iterator I,
MachineBasicBlock::iterator Update, bool IsPreIdx);
- // Is this a candidate for ld/st merging or pairing? For example, we don't
- // touch volatiles or load/stores that have a hint to avoid pair formation.
- bool isCandidateToMergeOrPair(MachineInstr *MI);
-
// Find and merge foldable ldr/str instructions.
bool tryToMergeLdStInst(MachineBasicBlock::iterator &MBBI);
return false;
}
-bool AArch64LoadStoreOpt::isCandidateToMergeOrPair(MachineInstr *MI) {
- // If this is a volatile load/store, don't mess with it.
- if (MI->hasOrderedMemoryRef())
- return false;
-
- // Make sure this is a reg+imm (as opposed to an address reloc).
- if (!getLdStOffsetOp(MI).isImm())
- return false;
-
- // Can't merge/pair if the instruction modifies the base register.
- // e.g., ldr x0, [x0]
- unsigned BaseReg = getLdStBaseOp(MI).getReg();
- if (MI->modifiesRegister(BaseReg, TRI))
- return false;
-
- // Check if this load/store has a hint to avoid pair formation.
- // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
- if (TII->isLdStPairSuppressed(MI))
- return false;
-
- return true;
-}
-
// Find narrow loads that can be converted into a single wider load with
// bitfield extract instructions. Also merge adjacent zero stores into a wider
// store.
MachineInstr *MI = MBBI;
MachineBasicBlock::iterator E = MI->getParent()->end();
- if (!isCandidateToMergeOrPair(MI))
+ if (!TII->isCandidateToMergeOrPair(MI))
return false;
// For promotable zero stores, the stored value should be WZR.
MachineInstr *MI = MBBI;
MachineBasicBlock::iterator E = MI->getParent()->end();
- if (!isCandidateToMergeOrPair(MI))
+ if (!TII->isCandidateToMergeOrPair(MI))
return false;
// Early exit if the offset is not possible to match. (6 bits of positive
--- /dev/null
+; REQUIRES: asserts
+; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a57 -verify-misched -debug-only=misched -o - 2>&1 > /dev/null | FileCheck %s
+
+; Test ldr clustering.
+; CHECK: ********** MI Scheduling **********
+; CHECK-LABEL: ldr_int:BB#0
+; CHECK: Cluster loads SU(1) - SU(2)
+; CHECK: SU(1): %vreg{{[0-9]+}}<def> = LDRWui
+; CHECK: SU(2): %vreg{{[0-9]+}}<def> = LDRWui
+define i32 @ldr_int(i32* %a) nounwind {
+ %p1 = getelementptr inbounds i32, i32* %a, i32 1
+ %tmp1 = load i32, i32* %p1, align 2
+ %p2 = getelementptr inbounds i32, i32* %a, i32 2
+ %tmp2 = load i32, i32* %p2, align 2
+ %tmp3 = add i32 %tmp1, %tmp2
+ ret i32 %tmp3
+}
+
+; Test ldpsw clustering.
+; CHECK: ********** MI Scheduling **********
+; CHECK-LABEL: ldp_sext_int:BB#0
+; CHECK: Cluster loads SU(1) - SU(2)
+; CHECK: SU(1): %vreg{{[0-9]+}}<def> = LDRSWui
+; CHECK: SU(2): %vreg{{[0-9]+}}<def> = LDRSWui
+define i64 @ldp_sext_int(i32* %p) nounwind {
+ %tmp = load i32, i32* %p, align 4
+ %add.ptr = getelementptr inbounds i32, i32* %p, i64 1
+ %tmp1 = load i32, i32* %add.ptr, align 4
+ %sexttmp = sext i32 %tmp to i64
+ %sexttmp1 = sext i32 %tmp1 to i64
+ %add = add nsw i64 %sexttmp1, %sexttmp
+ ret i64 %add
+}
+
+; Test ldur clustering.
+; CHECK: ********** MI Scheduling **********
+; CHECK-LABEL: ldur_int:BB#0
+; CHECK: Cluster loads SU(2) - SU(1)
+; CHECK: SU(1): %vreg{{[0-9]+}}<def> = LDURWi
+; CHECK: SU(2): %vreg{{[0-9]+}}<def> = LDURWi
+define i32 @ldur_int(i32* %a) nounwind {
+ %p1 = getelementptr inbounds i32, i32* %a, i32 -1
+ %tmp1 = load i32, i32* %p1, align 2
+ %p2 = getelementptr inbounds i32, i32* %a, i32 -2
+ %tmp2 = load i32, i32* %p2, align 2
+ %tmp3 = add i32 %tmp1, %tmp2
+ ret i32 %tmp3
+}
+
+; Test sext + zext clustering.
+; CHECK: ********** MI Scheduling **********
+; CHECK-LABEL: ldp_half_sext_zext_int:BB#0
+; CHECK: Cluster loads SU(3) - SU(4)
+; CHECK: SU(3): %vreg{{[0-9]+}}<def> = LDRSWui
+; CHECK: SU(4): %vreg{{[0-9]+}}:sub_32<def,read-undef> = LDRWui
+define i64 @ldp_half_sext_zext_int(i64* %q, i32* %p) nounwind {
+ %tmp0 = load i64, i64* %q, align 4
+ %tmp = load i32, i32* %p, align 4
+ %add.ptr = getelementptr inbounds i32, i32* %p, i64 1
+ %tmp1 = load i32, i32* %add.ptr, align 4
+ %sexttmp = sext i32 %tmp to i64
+ %sexttmp1 = zext i32 %tmp1 to i64
+ %add = add nsw i64 %sexttmp1, %sexttmp
+ %add1 = add nsw i64 %add, %tmp0
+ ret i64 %add1
+}
+
+; Test zext + sext clustering.
+; CHECK: ********** MI Scheduling **********
+; CHECK-LABEL: ldp_half_zext_sext_int:BB#0
+; CHECK: Cluster loads SU(3) - SU(4)
+; CHECK: SU(3): %vreg{{[0-9]+}}:sub_32<def,read-undef> = LDRWui
+; CHECK: SU(4): %vreg{{[0-9]+}}<def> = LDRSWui
+define i64 @ldp_half_zext_sext_int(i64* %q, i32* %p) nounwind {
+ %tmp0 = load i64, i64* %q, align 4
+ %tmp = load i32, i32* %p, align 4
+ %add.ptr = getelementptr inbounds i32, i32* %p, i64 1
+ %tmp1 = load i32, i32* %add.ptr, align 4
+ %sexttmp = zext i32 %tmp to i64
+ %sexttmp1 = sext i32 %tmp1 to i64
+ %add = add nsw i64 %sexttmp1, %sexttmp
+ %add1 = add nsw i64 %add, %tmp0
+ ret i64 %add1
+}
+
+; Verify we don't cluster volatile loads.
+; CHECK: ********** MI Scheduling **********
+; CHECK-LABEL: ldr_int_volatile:BB#0
+; CHECK-NOT: Cluster loads
+; CHECK: SU(1): %vreg{{[0-9]+}}<def> = LDRWui
+; CHECK: SU(2): %vreg{{[0-9]+}}<def> = LDRWui
+define i32 @ldr_int_volatile(i32* %a) nounwind {
+ %p1 = getelementptr inbounds i32, i32* %a, i32 1
+ %tmp1 = load volatile i32, i32* %p1, align 2
+ %p2 = getelementptr inbounds i32, i32* %a, i32 2
+ %tmp2 = load volatile i32, i32* %p2, align 2
+ %tmp3 = add i32 %tmp1, %tmp2
+ ret i32 %tmp3
+}