ProfileSummaryInfo *PSI = nullptr;
BlockFrequencyInfo *BFI = nullptr;
+ /// List of non-single value types.
+ FoldingSet<SDVTListNode> VTListMap;
+
+ /// Pool allocation for misc. objects that are created once per SelectionDAG.
+ BumpPtrAllocator Allocator;
+
/// The starting token.
SDNode EntryNode;
BumpPtrAllocator OperandAllocator;
ArrayRecycler<SDUse> OperandRecycler;
- /// Pool allocation for misc. objects that are created once per SelectionDAG.
- BumpPtrAllocator Allocator;
-
/// Tracks dbg_value and dbg_label information through SDISel.
SDDbgInfo *DbgInfo;
SDNode *FindNodeOrInsertPos(const FoldingSetNodeID &ID, const SDLoc &DL,
void *&InsertPos);
- /// List of non-single value types.
- FoldingSet<SDVTListNode> VTListMap;
-
/// Maps to auto-CSE operations.
std::vector<CondCodeSDNode*> CondCodeNodes;
#endif
llvm_unreachable("This target-independent node should have been selected!");
case ISD::EntryToken:
- llvm_unreachable("EntryToken should have been excluded from the schedule!");
case ISD::MERGE_VALUES:
case ISD::TokenFactor: // fall thru
break;
// EntryNode could meaningfully have debug info if we can find it...
SelectionDAG::SelectionDAG(const TargetMachine &tm, CodeGenOpt::Level OL)
: TM(tm), OptLevel(OL),
- EntryNode(ISD::EntryToken, 0, DebugLoc(), getVTList(MVT::Other)),
+ EntryNode(ISD::EntryToken, 0, DebugLoc(), getVTList(MVT::Other, MVT::Glue)),
Root(getEntryNode()) {
InsertNode(&EntryNode);
DbgInfo = new SDDbgInfo();
(void)Res;
}
+ SMEAttrs Attrs(MF.getFunction());
+ bool IsLocallyStreaming =
+ !Attrs.hasStreamingInterface() && Attrs.hasStreamingBody();
+ assert(Chain.getOpcode() == ISD::EntryToken && "Unexpected Chain value");
+ SDValue Glue = Chain.getValue(1);
+
+ SmallVector<SDValue, 16> ArgValues;
unsigned ExtraArgLocs = 0;
for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
// Transform the arguments in physical registers into virtual ones.
Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
- ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
+
+ if (IsLocallyStreaming) {
+ // LocallyStreamingFunctions must insert the SMSTART in the correct
+ // position, so we use Glue to ensure no instructions can be scheduled
+ // between the chain of:
+ // t0: ch,glue = EntryNode
+ // t1: res,ch,glue = CopyFromReg
+ // ...
+ // tn: res,ch,glue = CopyFromReg t(n-1), ..
+ // t(n+1): ch, glue = SMSTART t0:0, ...., tn:2
+ // ^^^^^^
+ // This will be the new Chain/Root node.
+ ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT, Glue);
+ Glue = ArgValue.getValue(2);
+ } else
+ ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
// If this is an 8, 16 or 32-bit value, it is really passed promoted
// to 64 bits. Insert an assert[sz]ext to capture this, then
}
assert((ArgLocs.size() + ExtraArgLocs) == Ins.size());
+ // Insert the SMSTART if this is a locally streaming function and
+ // make sure it is Glued to the last CopyFromReg value.
+ if (IsLocallyStreaming) {
+ const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
+ Chain = DAG.getNode(
+ AArch64ISD::SMSTART, DL, DAG.getVTList(MVT::Other, MVT::Glue),
+ {DAG.getRoot(),
+ DAG.getTargetConstant((int32_t)AArch64SVCR::SVCRSM, DL, MVT::i32),
+ DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64),
+ DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask()), Glue});
+      // Copy the result values through fresh virtual registers using the new
+      // Chain, so that the SMSTART's chain result is used by the arguments.
+ for (unsigned I=0; I<InVals.size(); ++I) {
+ Register Reg = MF.getRegInfo().createVirtualRegister(
+ getRegClassFor(InVals[I].getValueType().getSimpleVT()));
+ SDValue X = DAG.getCopyToReg(Chain, DL, Reg, InVals[I]);
+ InVals[I] = DAG.getCopyFromReg(X, DL, Reg,
+ InVals[I].getValueType());
+ }
+ }
+
// varargs
if (isVarArg) {
if (!Subtarget->isTargetDarwin() || IsWin64) {
}
}
+ const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
+
+ // Emit SMSTOP before returning from a locally streaming function
+ SMEAttrs FuncAttrs(MF.getFunction());
+ if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface()) {
+ Chain = DAG.getNode(
+ AArch64ISD::SMSTOP, DL, DAG.getVTList(MVT::Other, MVT::Glue), Chain,
+ DAG.getTargetConstant((int32_t)AArch64SVCR::SVCRSM, DL, MVT::i32),
+ DAG.getConstant(1, DL, MVT::i64), DAG.getConstant(0, DL, MVT::i64),
+ DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask()));
+ Flag = Chain.getValue(1);
+ }
+
SmallVector<SDValue, 4> RetOps(1, Chain);
for (auto &RetVal : RetVals) {
Chain = DAG.getCopyToReg(Chain, DL, RetVal.first, RetVal.second, Flag);
DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
}
- const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
const MCPhysReg *I = TRI->getCalleeSavedRegsViaCopy(&MF);
if (I) {
for (; *I; ++I) {
break;
case AArch64::ADDVL_XXI:
case AArch64::ADDPL_XXI:
+ case AArch64::ADDSVL_XXI:
+ case AArch64::ADDSPL_XXI:
MaxEncoding = 31;
ShiftSize = 0;
if (Offset < 0) {
// `Offset` can be in bytes or in "scalable bytes".
int VScale = 1;
- if (Opc == AArch64::ADDVL_XXI)
+ if (Opc == AArch64::ADDVL_XXI || Opc == AArch64::ADDSVL_XXI)
VScale = 16;
- else if (Opc == AArch64::ADDPL_XXI)
+ else if (Opc == AArch64::ADDPL_XXI || Opc == AArch64::ADDSPL_XXI)
VScale = 2;
// FIXME: If the offset won't fit in 24-bits, compute the offset into a
bool NeedsWinCFI, bool *HasWinCFI,
bool EmitCFAOffset, StackOffset CFAOffset,
unsigned FrameReg) {
+ // If a function is marked as arm_locally_streaming, then the runtime value of
+  // vscale in the prologue/epilogue is different from the runtime value of
+  // vscale
+ // in the function's body. To avoid having to consider multiple vscales,
+ // we can use `addsvl` to allocate any scalable stack-slots, which under
+ // most circumstances will be only locals, not callee-save slots.
+ const Function &F = MBB.getParent()->getFunction();
+ bool UseSVL = F.hasFnAttribute("aarch64_pstate_sm_body");
+
int64_t Bytes, NumPredicateVectors, NumDataVectors;
AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
Offset, Bytes, NumPredicateVectors, NumDataVectors);
if (NumDataVectors) {
emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumDataVectors,
- AArch64::ADDVL_XXI, TII, Flag, NeedsWinCFI, nullptr,
- EmitCFAOffset, CFAOffset, FrameReg);
+ UseSVL ? AArch64::ADDSVL_XXI : AArch64::ADDVL_XXI,
+ TII, Flag, NeedsWinCFI, nullptr, EmitCFAOffset,
+ CFAOffset, FrameReg);
CFAOffset += StackOffset::getScalable(-NumDataVectors * 16);
SrcReg = DestReg;
}
if (NumPredicateVectors) {
assert(DestReg != AArch64::SP && "Unaligned access to SP");
emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumPredicateVectors,
- AArch64::ADDPL_XXI, TII, Flag, NeedsWinCFI, nullptr,
- EmitCFAOffset, CFAOffset, FrameReg);
+ UseSVL ? AArch64::ADDSPL_XXI : AArch64::ADDPL_XXI,
+ TII, Flag, NeedsWinCFI, nullptr, EmitCFAOffset,
+ CFAOffset, FrameReg);
}
}
define i64 @get_pstatesm_locally_streaming() nounwind "aarch64_pstate_sm_body" {
; CHECK-LABEL: get_pstatesm_locally_streaming:
; CHECK: // %bb.0:
+; CHECK-NEXT: stp d15, d14, [sp, #-64]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: mov w0, #1
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload
; CHECK-NEXT: ret
%pstate = call i64 @llvm.aarch64.sme.get.pstatesm()
ret i64 %pstate
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -start-after=simplifycfg -enable-tail-merge=false -verify-machineinstrs < %s | FileCheck %s
+
+declare void @normal_callee();
+declare void @streaming_callee() "aarch64_pstate_sm_enabled";
+declare void @streaming_compatible_callee() "aarch64_pstate_sm_compatible";
+
+define void @locally_streaming_caller_streaming_callee() "aarch64_pstate_sm_body" nounwind {
+; CHECK-LABEL: locally_streaming_caller_streaming_callee:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: bl streaming_compatible_callee
+; CHECK-NEXT: bl streaming_compatible_callee
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+
+ call void @streaming_compatible_callee();
+ call void @streaming_compatible_callee();
+ ret void;
+}
+
+; Test that for a function with both a streaming body and a streaming
+; interface, no smstart/smstop are emitted, because the function is already
+; in streaming mode upon entry.
+define void @streaming_and_locally_streaming_caller_streaming_callee() "aarch64_pstate_sm_enabled" "aarch64_pstate_sm_body" nounwind {
+; CHECK-LABEL: streaming_and_locally_streaming_caller_streaming_callee:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: bl streaming_callee
+; CHECK-NEXT: bl streaming_callee
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ call void @streaming_callee();
+ call void @streaming_callee();
+ ret void;
+}
+
+define void @locally_streaming_multiple_exit(i64 %cond) "aarch64_pstate_sm_body" nounwind {
+; CHECK-LABEL: locally_streaming_multiple_exit:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: stp d15, d14, [sp, #-64]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: cmp x0, #1
+; CHECK-NEXT: b.ne .LBB2_2
+; CHECK-NEXT: // %bb.1: // %if.else
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB2_2: // %if.end
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+
+entry:
+ %tobool = icmp eq i64 %cond, 1
+ br i1 %tobool, label %if.else, label %if.end
+
+if.else:
+ ret void;
+
+if.end:
+ ret void;
+}
+
+; Do a fixed-width vector add on a NEON vector.
+; This tests that:
+; * Incoming vector in v0.d isn't clobbered by the change in streaming mode.
+; * Result vector is correctly preserved after smstop.
+define <2 x i64> @locally_streaming_caller_no_callee(<2 x i64> %a) "aarch64_pstate_sm_body" nounwind {
+; CHECK-LABEL: locally_streaming_caller_no_callee:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sub sp, sp, #80
+; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: adrp x8, .LCPI3_0
+; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI3_0]
+; CHECK-NEXT: add v0.2d, v1.2d, v0.2d
+; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #80
+; CHECK-NEXT: ret
+
+ %add = add <2 x i64> %a, <i64 41, i64 42>;
+ ret <2 x i64> %add;
+}
+
+; Test that we use the interface (not the function's body) to determine what
+; streaming-mode to enter the callee. In this case the interface is normal, so
+; pstate.sm must be 0 on entry and is 0 upon return from the callee.
+define void @locally_streaming_caller_locally_streaming_callee() "aarch64_pstate_sm_body" nounwind {
+; CHECK-LABEL: locally_streaming_caller_locally_streaming_callee:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: bl locally_streaming_caller_streaming_callee
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+
+ call void @locally_streaming_caller_streaming_callee();
+ ret void;
+}
+
+;
+; Test that a locally streaming function correctly retains the
+; argument/result registers, because smstart/smstop instructions that are
+; inserted to implement the arm_locally_streaming attribute thrash the
+; vector register contents.
+;
+
+define <2 x i64> @locally_streaming_caller_compatible_callee_vec_args_ret(<2 x i64> %a) "aarch64_pstate_sm_body" nounwind {
+; CHECK-LABEL: locally_streaming_caller_compatible_callee_vec_args_ret:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: str x30, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: bl streaming_compatible_callee_vec_args_ret
+; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #96
+; CHECK-NEXT: ret
+ %res = call <2 x i64> @streaming_compatible_callee_vec_args_ret(<2 x i64> %a) "aarch64_pstate_sm_compatible"
+ ret <2 x i64> %res;
+}
+
+declare <2 x i64> @streaming_compatible_callee_vec_args_ret(<2 x i64>) "aarch64_pstate_sm_compatible"
+
+define {<2 x i64>, <2 x i64>} @locally_streaming_caller_compatible_callee_struct_arg_ret({<2 x i64>, <2 x i64>} %arg) "aarch64_pstate_sm_body" nounwind {
+; CHECK-LABEL: locally_streaming_caller_compatible_callee_struct_arg_ret:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sub sp, sp, #112
+; CHECK-NEXT: stp d15, d14, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: str x30, [sp, #96] // 8-byte Folded Spill
+; CHECK-NEXT: str q1, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: bl streaming_compatible_callee_vec_arg_struct_ret
+; CHECK-NEXT: stp q1, q0, [sp] // 32-byte Folded Spill
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: ldp q1, q0, [sp] // 32-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #96] // 8-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #112
+; CHECK-NEXT: ret
+ %v1.arg = extractvalue {<2 x i64>, <2 x i64>} %arg, 1
+ %res = call {<2 x i64>, <2 x i64>} @streaming_compatible_callee_vec_arg_struct_ret(<2 x i64> %v1.arg) "aarch64_pstate_sm_compatible"
+ ret {<2 x i64>, <2 x i64>} %res;
+}
+
+declare {<2 x i64>, <2 x i64>} @streaming_compatible_callee_vec_arg_struct_ret(<2 x i64>) "aarch64_pstate_sm_compatible"
+
+; Test that we use `addsvl` for allocating any stack space for locals before `smstart`,
+; such that the correct amount of stack space is allocated.
+define void @locally_streaming_caller_alloca() nounwind "aarch64_pstate_sm_body" {
+; CHECK-LABEL: locally_streaming_caller_alloca:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: addsvl sp, sp, #-1
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: mov x0, sp
+; CHECK-NEXT: bl use_ptr
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: addsvl sp, sp, #1
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+ %alloca = alloca <vscale x 4 x i32>
+ call void @use_ptr(ptr %alloca) "aarch64_pstate_sm_compatible"
+ ret void
+}
+
+declare void @use_ptr(ptr) "aarch64_pstate_sm_compatible"
+
+define double @call_to_intrinsic_without_chain(double %x) nounwind "aarch64_pstate_sm_body" {
+; CHECK-LABEL: call_to_intrinsic_without_chain:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: str d0, [sp, #72] // 8-byte Folded Spill
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: ldr d0, [sp, #72] // 8-byte Folded Reload
+; CHECK-NEXT: bl cos
+; CHECK-NEXT: str d0, [sp, #72] // 8-byte Folded Spill
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldr d0, [sp, #72] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %0 = call fast double @llvm.cos.f64(double %x)
+ ret double %0
+}
+
+declare double @llvm.cos.f64(double)
define void @foo(<8 x i64>* %a) #0 {
; CHECK-LABEL: foo:
; CHECK: SelectionDAG has 14 nodes:
-; CHECK-NEXT: t0: ch = EntryToken
+; CHECK-NEXT: t0: ch,glue = EntryToken
; CHECK-NEXT: t12: nxv2i1 = PTRUE_D TargetConstant:i32<31>
; CHECK-NEXT: t2: i64,ch = CopyFromReg t0, Register:i64 %0
; CHECK-NEXT: t18: nxv2i64,ch = LD1D_IMM<Mem:(volatile load (s512) from %ir.a)> t12, t2, TargetConstant:i64<0>, t0
; GCN: Initial selection DAG: %bb.0 'test_sdag_dump:entry'
; GCN: SelectionDAG has 10 nodes:
-; GCN-DEFAULT: t0: ch = EntryToken
+; GCN-DEFAULT: t0: ch,glue = EntryToken
; GCN-DEFAULT: t2: f32,ch = CopyFromReg t0, Register:f32 %0
; GCN-DEFAULT: t5: f32 = fadd t2, t2
; GCN-DEFAULT: t4: f32,ch = CopyFromReg # D:1 t0, Register:f32 %1
; GCN-DEFAULT: t8: ch,glue = CopyToReg # D:1 t0, Register:f32 $vgpr0, t6
; GCN-DEFAULT: t9: ch = RETURN_TO_EPILOG # D:1 t8, Register:f32 $vgpr0, t8:1
-; GCN-VERBOSE: t0: ch = EntryToken # D:0
+; GCN-VERBOSE: t0: ch,glue = EntryToken # D:0
; GCN-VERBOSE: t2: f32,ch = CopyFromReg [ORD=1] # D:0 t0, Register:f32 %0 # D:0
; GCN-VERBOSE: t5: f32 = fadd [ORD=2] # D:0 t2, t2
; GCN-VERBOSE: t4: f32,ch = CopyFromReg [ORD=1] # D:1 t0, Register:f32 %1 # D:0
; inlineasm_br. Not sure how to get a MachineIR change so this reads the debug
; output from SelectionDAG.
-; CHECK: t0: ch = EntryToken
+; CHECK: t0: ch,glue = EntryToken
; CHECK-NEXT: t4: i32,ch = CopyFromReg t0, Register:i32 %3
; CHECK-NEXT: t10: i32 = add t4, Constant:i32<1>
; CHECK-NEXT: t12: ch = CopyToReg t0, Register:i32 %0, t10
; X86-NEXT: retq
; DBGDAG-LABEL: Optimized legalized selection DAG: %bb.0 'merge_store_partial_overlap_load:'
-; DBGDAG: [[ENTRYTOKEN:t[0-9]+]]: ch = EntryToken
+; DBGDAG: [[ENTRYTOKEN:t[0-9]+]]: ch,glue = EntryToken
; DBGDAG-DAG: [[BASEPTR:t[0-9]+]]: i64,ch = CopyFromReg [[ENTRYTOKEN]],
; DBGDAG-DAG: [[ADDPTR:t[0-9]+]]: i64 = add {{(nuw )?}}[[BASEPTR]], Constant:i64<2>
define i64 @i64_test(i64 %i) nounwind readnone {
; CHECK-LABEL: i64_test:
; CHECK: SelectionDAG has 9 nodes:
-; CHECK-NEXT: t0: ch = EntryToken
+; CHECK-NEXT: t0: ch,glue = EntryToken
; CHECK-NEXT: t11: ch,glue = CopyToReg t0, Register:i32 $vgpr0, IMPLICIT_DEF:i32
; CHECK-NEXT: t17: i32 = V_MOV_B32_e32 TargetConstant:i32<0>
; CHECK-NEXT: t13: ch,glue = CopyToReg t11, Register:i32 $vgpr1, t17, t11:1
; CHECK-LABEL: i32_test:
; CHECK: SelectionDAG has 8 nodes:
; CHECK-NEXT: t5: i32 = V_MOV_B32_e32 TargetConstant:i32<0>
-; CHECK-NEXT: t0: ch = EntryToken
+; CHECK-NEXT: t0: ch,glue = EntryToken
; CHECK-NEXT: t7: ch,glue = CopyToReg t0, Register:i32 $vgpr0, t5
; CHECK-NEXT: t9: ch,glue = CopyToReg t7, Register:i32 $vgpr1, t5, t7:1
; CHECK-NEXT: t10: ch = SI_RETURN Register:i32 $vgpr0, Register:i32 $vgpr1, t9, t9:1
; CHECK-LABEL: i16_test:
; CHECK: SelectionDAG has 8 nodes:
; CHECK-NEXT: t5: i32 = V_MOV_B32_e32 TargetConstant:i32<0>
-; CHECK-NEXT: t0: ch = EntryToken
+; CHECK-NEXT: t0: ch,glue = EntryToken
; CHECK-NEXT: t7: ch,glue = CopyToReg t0, Register:i32 $vgpr0, t5
; CHECK-NEXT: t9: ch,glue = CopyToReg t7, Register:i32 $vgpr1, t5, t7:1
; CHECK-NEXT: t10: ch = SI_RETURN Register:i32 $vgpr0, Register:i32 $vgpr1, t9, t9:1
; CHECK-LABEL: i8_test:
; CHECK: SelectionDAG has 8 nodes:
; CHECK-NEXT: t5: i32 = V_MOV_B32_e32 TargetConstant:i32<0>
-; CHECK-NEXT: t0: ch = EntryToken
+; CHECK-NEXT: t0: ch,glue = EntryToken
; CHECK-NEXT: t7: ch,glue = CopyToReg t0, Register:i32 $vgpr0, t5
; CHECK-NEXT: t9: ch,glue = CopyToReg t7, Register:i32 $vgpr1, t5, t7:1
; CHECK-NEXT: t10: ch = SI_RETURN Register:i32 $vgpr0, Register:i32 $vgpr1, t9, t9:1
define i64 @i64_test(i64 %i) nounwind readnone {
; CHECK-LABEL: i64_test:
; CHECK: SelectionDAG has 22 nodes:
-; CHECK-NEXT: t0: ch = EntryToken
+; CHECK-NEXT: t0: ch,glue = EntryToken
; CHECK-NEXT: t5: i32,ch = LDW_RI<Mem:(load (s32) from %fixed-stack.0)> TargetFrameIndex:i32<-2>, TargetConstant:i32<0>, TargetConstant:i32<0>, t0
; CHECK-NEXT: t7: i32 = ADD_I_LO TargetFrameIndex:i32<0>, TargetConstant:i32<0>
; CHECK-NEXT: t29: i32 = OR_I_LO t7, TargetConstant:i32<4>
define i64 @i32_test(i32 %i) nounwind readnone {
; CHECK-LABEL: i32_test:
; CHECK: SelectionDAG has 14 nodes:
-; CHECK-NEXT: t0: ch = EntryToken
+; CHECK-NEXT: t0: ch,glue = EntryToken
; CHECK-NEXT: t21: i32,ch = CopyFromReg t0, Register:i32 $r0
; CHECK-NEXT: t13: ch,glue = CopyToReg t0, Register:i32 $rv, t21
; CHECK-NEXT: t3: i32,ch = LDW_RI<Mem:(load (s32) from %fixed-stack.0, align 8)> TargetFrameIndex:i32<-1>, TargetConstant:i32<0>, TargetConstant:i32<0>, t0
define i64 @i16_test(i16 %i) nounwind readnone {
; CHECK-LABEL: i16_test:
; CHECK: SelectionDAG has 19 nodes:
-; CHECK-NEXT: t0: ch = EntryToken
+; CHECK-NEXT: t0: ch,glue = EntryToken
; CHECK-NEXT: t33: i32,ch = CopyFromReg t0, Register:i32 $r0
; CHECK-NEXT: t14: ch,glue = CopyToReg t0, Register:i32 $rv, t33
; CHECK-NEXT: t1: i32 = ADD_I_LO TargetFrameIndex:i32<-1>, TargetConstant:i32<0>
define i64 @i8_test(i8 %i) nounwind readnone {
; CHECK-LABEL: i8_test:
; CHECK: SelectionDAG has 20 nodes:
-; CHECK-NEXT: t0: ch = EntryToken
+; CHECK-NEXT: t0: ch,glue = EntryToken
; CHECK-NEXT: t33: i32,ch = CopyFromReg t0, Register:i32 $r0
; CHECK-NEXT: t14: ch,glue = CopyToReg t0, Register:i32 $rv, t33
; CHECK-NEXT: t1: i32 = ADD_I_LO TargetFrameIndex:i32<-1>, TargetConstant:i32<0>
define i64 @i64_test(i64 %i) nounwind readnone {
; PIC-LABEL: i64_test:
; PIC: SelectionDAG has 12 nodes:
-; PIC-NEXT: t0: ch = EntryToken
+; PIC-NEXT: t0: ch,glue = EntryToken
; PIC-NEXT: t2: i64,ch = CopyFromReg t0, Register:i64 %0
; PIC-NEXT: t7: i64,i32,ch = ADD64rm<Mem:(dereferenceable load (s64) from %ir.loc)> t2, TargetFrameIndex:i64<0>, TargetConstant:i8<1>, Register:i64 $noreg, TargetConstant:i32<0>, Register:i16 $noreg, t0
; PIC-NEXT: t10: ch,glue = CopyToReg t0, Register:i64 $rax, t7
;
; WIN-LABEL: i64_test:
; WIN: SelectionDAG has 12 nodes:
-; WIN-NEXT: t0: ch = EntryToken
+; WIN-NEXT: t0: ch,glue = EntryToken
; WIN-NEXT: t2: i64,ch = CopyFromReg t0, Register:i64 %0
; WIN-NEXT: t7: i64,i32,ch = ADD64rm<Mem:(dereferenceable load (s64) from %ir.loc)> t2, TargetFrameIndex:i64<0>, TargetConstant:i8<1>, Register:i64 $noreg, TargetConstant:i32<0>, Register:i16 $noreg, t0
; WIN-NEXT: t10: ch,glue = CopyToReg t0, Register:i64 $rax, t7
define i64 @i32_test(i32 %i) nounwind readnone {
; PIC-LABEL: i32_test:
; PIC: SelectionDAG has 15 nodes:
-; PIC-NEXT: t0: ch = EntryToken
+; PIC-NEXT: t0: ch,glue = EntryToken
; PIC-NEXT: t2: i32,ch = CopyFromReg t0, Register:i32 %0
; PIC-NEXT: t7: i32,i32,ch = ADD32rm<Mem:(dereferenceable load (s32) from %ir.loc)> t2, TargetFrameIndex:i64<0>, TargetConstant:i8<1>, Register:i64 $noreg, TargetConstant:i32<0>, Register:i16 $noreg, t0
; PIC-NEXT: t8: i64 = SUBREG_TO_REG TargetConstant:i64<0>, t7, TargetConstant:i32<6>
;
; WIN-LABEL: i32_test:
; WIN: SelectionDAG has 15 nodes:
-; WIN-NEXT: t0: ch = EntryToken
+; WIN-NEXT: t0: ch,glue = EntryToken
; WIN-NEXT: t2: i32,ch = CopyFromReg t0, Register:i32 %0
; WIN-NEXT: t7: i32,i32,ch = ADD32rm<Mem:(dereferenceable load (s32) from %ir.loc)> t2, TargetFrameIndex:i64<0>, TargetConstant:i8<1>, Register:i64 $noreg, TargetConstant:i32<0>, Register:i16 $noreg, t0
; WIN-NEXT: t8: i64 = SUBREG_TO_REG TargetConstant:i64<0>, t7, TargetConstant:i32<6>
define i64 @i16_test(i16 %i) nounwind readnone {
; PIC-LABEL: i16_test:
; PIC: SelectionDAG has 18 nodes:
-; PIC-NEXT: t0: ch = EntryToken
+; PIC-NEXT: t0: ch,glue = EntryToken
; PIC-NEXT: t2: i32,ch = CopyFromReg t0, Register:i32 %0
; PIC-NEXT: t3: i16 = EXTRACT_SUBREG t2, TargetConstant:i32<4>
; PIC-NEXT: t8: i16,i32,ch = ADD16rm<Mem:(dereferenceable load (s16) from %ir.loc)> t3, TargetFrameIndex:i64<0>, TargetConstant:i8<1>, Register:i64 $noreg, TargetConstant:i32<0>, Register:i16 $noreg, t0
;
; WIN-LABEL: i16_test:
; WIN: SelectionDAG has 16 nodes:
-; WIN-NEXT: t0: ch = EntryToken
+; WIN-NEXT: t0: ch,glue = EntryToken
; WIN-NEXT: t2: i16,ch = CopyFromReg t0, Register:i16 %0
; WIN-NEXT: t7: i16,i32,ch = ADD16rm<Mem:(dereferenceable load (s16) from %ir.loc)> t2, TargetFrameIndex:i64<0>, TargetConstant:i8<1>, Register:i64 $noreg, TargetConstant:i32<0>, Register:i16 $noreg, t0
; WIN-NEXT: t14: i32 = MOVZX32rr16 t7
define i64 @i8_test(i8 %i) nounwind readnone {
; PIC-LABEL: i8_test:
; PIC: SelectionDAG has 18 nodes:
-; PIC-NEXT: t0: ch = EntryToken
+; PIC-NEXT: t0: ch,glue = EntryToken
; PIC-NEXT: t2: i32,ch = CopyFromReg t0, Register:i32 %0
; PIC-NEXT: t3: i8 = EXTRACT_SUBREG t2, TargetConstant:i32<1>
; PIC-NEXT: t8: i8,i32,ch = ADD8rm<Mem:(dereferenceable load (s8) from %ir.loc)> t3, TargetFrameIndex:i64<0>, TargetConstant:i8<1>, Register:i64 $noreg, TargetConstant:i32<0>, Register:i16 $noreg, t0
;
; WIN-LABEL: i8_test:
; WIN: SelectionDAG has 16 nodes:
-; WIN-NEXT: t0: ch = EntryToken
+; WIN-NEXT: t0: ch,glue = EntryToken
; WIN-NEXT: t2: i8,ch = CopyFromReg t0, Register:i8 %0
; WIN-NEXT: t7: i8,i32,ch = ADD8rm<Mem:(dereferenceable load (s8) from %ir.loc)> t2, TargetFrameIndex:i64<0>, TargetConstant:i8<1>, Register:i64 $noreg, TargetConstant:i32<0>, Register:i16 $noreg, t0
; WIN-NEXT: t14: i32 = MOVZX32rr8 t7