return CI->isTailCall();
}
-bool AArch64TargetLowering::getIndexedAddressParts(SDNode *Op, SDValue &Base,
- SDValue &Offset,
- ISD::MemIndexedMode &AM,
- bool &IsInc,
- SelectionDAG &DAG) const {
+bool AArch64TargetLowering::getIndexedAddressParts(
+ SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset,
+ ISD::MemIndexedMode &AM, bool &IsInc, SelectionDAG &DAG) const {
if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB)
return false;
+ // Non-null if there is exactly one non-chain use of the loaded value (a
+ // second non-chain use resets this to null below).
+ SDNode *ValOnlyUser = nullptr;
+ for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); UI != UE;
+ ++UI) {
+ if (UI.getUse().getResNo() == 1)
+ continue; // Ignore chain.
+ if (ValOnlyUser == nullptr)
+ ValOnlyUser = *UI;
+ else {
+ ValOnlyUser = nullptr; // Multiple non-chain uses, bail out.
+ break;
+ }
+ }
+
+ auto IsUndefOrZero = [](SDValue V) {
+ return V.isUndef() || isNullOrNullSplat(V, /*AllowUndefs*/ true);
+ };
+
+ // If the only user of the value is a scalable vector splat, it is
+ // preferable to do a replicating load (ld1r*).
+ if (ValOnlyUser && ValOnlyUser->getValueType(0).isScalableVector() &&
+ (ValOnlyUser->getOpcode() == ISD::SPLAT_VECTOR ||
+ (ValOnlyUser->getOpcode() == AArch64ISD::DUP_MERGE_PASSTHRU &&
+ IsUndefOrZero(ValOnlyUser->getOperand(2)))))
+ return false;
+
Base = Op->getOperand(0);
// All of the indexed addressing mode instructions take a signed
// 9 bit immediate offset.
return false;
bool IsInc;
- if (!getIndexedAddressParts(Ptr.getNode(), Base, Offset, AM, IsInc, DAG))
+ if (!getIndexedAddressParts(N, Ptr.getNode(), Base, Offset, AM, IsInc, DAG))
return false;
AM = IsInc ? ISD::PRE_INC : ISD::PRE_DEC;
return true;
return false;
bool IsInc;
- if (!getIndexedAddressParts(Op, Base, Offset, AM, IsInc, DAG))
+ if (!getIndexedAddressParts(N, Op, Base, Offset, AM, IsInc, DAG))
return false;
// Post-indexing updates the base, so it's not a valid transform
// if that's not the same as the load's pointer.
ret <vscale x 2 x double> %res
}
+
+; Check that a load consumed by a scalable splat prefers a replicating load
+; over a pre-indexed load.
+define i8* @avoid_preindex_load(i8* %src, <vscale x 2 x i64>* %out) {
+; CHECK-LABEL: avoid_preindex_load:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: add x8, x0, #1
+; CHECK-NEXT: ld1rsb { z0.d }, p0/z, [x0, #1]
+; CHECK-NEXT: mov x0, x8
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
+; The splat is the only non-chain use of the loaded value, so the load keeps
+; its immediate-offset form (feeding ld1rsb) and the returned pointer is
+; produced by a separate add instead of a pre-indexed scalar load.
+ %ptr = getelementptr inbounds i8, i8* %src, i64 1
+ %tmp = load i8, i8* %ptr, align 4
+ %ext = sext i8 %tmp to i64
+ %ins = insertelement <vscale x 2 x i64> undef, i64 %ext, i32 0
+ %dup = shufflevector <vscale x 2 x i64> %ins, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
+ store <vscale x 2 x i64> %dup, <vscale x 2 x i64>* %out
+ ret i8* %ptr
+}
+
+; Check that a load consumed by a scalable splat prefers a replicating
+; load over a pre-indexed load.
+define i8* @avoid_preindex_load_dup(i8* %src, <vscale x 2 x i1> %pg, <vscale x 2 x i64>* %out) {
+; CHECK-LABEL: avoid_preindex_load_dup:
+; CHECK: // %bb.0:
+; CHECK-NEXT: add x8, x0, #1
+; CHECK-NEXT: ld1rsb { z0.d }, p0/z, [x0, #1]
+; CHECK-NEXT: mov x0, x8
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
+; The sve.dup has an undef passthru, so it behaves like a plain splat and the
+; replicating load (ld1rsb, predicated by the incoming %pg in p0) is preferred
+; over a pre-indexed scalar load.
+ %ptr = getelementptr inbounds i8, i8* %src, i64 1
+ %tmp = load i8, i8* %ptr, align 4
+ %ext = sext i8 %tmp to i64
+ %dup = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, i64 %ext)
+ store <vscale x 2 x i64> %dup, <vscale x 2 x i64>* %out
+ ret i8* %ptr
+}
+
+; Same as avoid_preindex_load_dup, but with a zero passthru, which also
+; qualifies for the replicating load.
+define i8* @avoid_preindex_load_dup_passthru_zero(i8* %src, <vscale x 2 x i1> %pg, <vscale x 2 x i64>* %out) {
+; CHECK-LABEL: avoid_preindex_load_dup_passthru_zero:
+; CHECK: // %bb.0:
+; CHECK-NEXT: add x8, x0, #1
+; CHECK-NEXT: ld1rsb { z0.d }, p0/z, [x0, #1]
+; CHECK-NEXT: mov x0, x8
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
+; A zeroinitializer passthru is treated the same as undef (zeroing predication
+; matches ld1r's /z behavior), so codegen is identical to the undef case above.
+ %ptr = getelementptr inbounds i8, i8* %src, i64 1
+ %tmp = load i8, i8* %ptr, align 4
+ %ext = sext i8 %tmp to i64
+ %dup = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> %pg, i64 %ext)
+ store <vscale x 2 x i64> %dup, <vscale x 2 x i64>* %out
+ ret i8* %ptr
+}
+
+; If a dup has a passthru that is neither undef nor zero, stick with the
+; pre-indexed load.
+define i8* @preindex_load_dup_passthru(<vscale x 2 x i64> %passthru, i8* %src, <vscale x 2 x i1> %pg, <vscale x 2 x i64>* %out) {
+; CHECK-LABEL: preindex_load_dup_passthru:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldrsb x8, [x0, #1]!
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: mov z0.d, p0/m, x8
+; CHECK-NEXT: st1d { z0.d }, p1, [x1]
+; CHECK-NEXT: ret
+; The merging dup (p0/m into the live %passthru value) cannot be expressed as a
+; zeroing ld1r, so the pre-indexed ldrsb is still formed here.
+ %ptr = getelementptr inbounds i8, i8* %src, i64 1
+ %tmp = load i8, i8* %ptr, align 4
+ %ext = sext i8 %tmp to i64
+ %dup = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.nxv2i64(<vscale x 2 x i64> %passthru, <vscale x 2 x i1> %pg, i64 %ext)
+ store <vscale x 2 x i64> %dup, <vscale x 2 x i64>* %out
+ ret i8* %ptr
+}
+
+; Show that a second (non-chain) use of the loaded value disables the
+; replicating-load preference, so the pre-indexed load is formed as before.
+define i8* @preidx8sext64_instead_of_ld1r(i8* %src, <vscale x 2 x i64>* %out, i64* %dst) {
+; CHECK-LABEL: preidx8sext64_instead_of_ld1r:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldrsb x8, [x0, #1]!
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: mov z0.d, x8
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-NEXT: str x8, [x2]
+; CHECK-NEXT: ret
+; %ext feeds both the splat and a scalar store, so the loaded value has two
+; non-chain uses; the splat-only heuristic does not apply and the pre-indexed
+; ldrsb (with the value reused from x8) is formed.
+ %ptr = getelementptr inbounds i8, i8* %src, i64 1
+ %tmp = load i8, i8* %ptr, align 4
+ %ext = sext i8 %tmp to i64
+ %ins = insertelement <vscale x 2 x i64> undef, i64 %ext, i32 0
+ %dup = shufflevector <vscale x 2 x i64> %ins, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
+ store <vscale x 2 x i64> %dup, <vscale x 2 x i64>* %out
+ store i64 %ext, i64* %dst
+ ret i8* %ptr
+}
+
+; NOTE(review): the new tests call @llvm.aarch64.sve.dup.nxv2i64, but only the
+; dupq.lane declares are visible in this hunk — confirm the file declares that
+; intrinsic, since the IR parser rejects calls to undeclared functions.
+
declare <vscale x 16 x i8> @llvm.aarch64.sve.dupq.lane.nxv16i8(<vscale x 16 x i8>, i64)
declare <vscale x 8 x i16> @llvm.aarch64.sve.dupq.lane.nxv8i16(<vscale x 8 x i16>, i64)
declare <vscale x 4 x i32> @llvm.aarch64.sve.dupq.lane.nxv4i32(<vscale x 4 x i32>, i64)