From 02df03c5b7ae9fa2e8b55369dd5ebd3871a60017 Mon Sep 17 00:00:00 2001
From: Sander de Smalen
Date: Fri, 14 Oct 2022 08:51:13 +0000
Subject: [PATCH] [AArch64][SME] Add support for arm_locally_streaming
 functions.

Functions with `aarch64_pstate_sm_body` will emit an SMSTART at the start of
the function, and an SMSTOP at the end of the function, such that all
operations use the right value for vscale.

Because the placement of these nodes is critically important (i.e. no
vscale-dependent operations should be done before SMSTART has been issued),
we require gluing the CopyFromReg to the Entry node such that we can insert
the SMSTART as part of that glued chain.

More details about the SME attributes and design can be found in D131562.

Reviewed By: aemerson

Differential Revision: https://reviews.llvm.org/D131582
---
 llvm/include/llvm/CodeGen/SelectionDAG.h           |  12 +-
 llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp     |   1 -
 llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp     |   2 +-
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp    |  59 ++++-
 llvm/lib/Target/AArch64/AArch64InstrInfo.cpp       |  24 +-
 llvm/test/CodeGen/AArch64/sme-get-pstatesm.ll      |  10 +
 llvm/test/CodeGen/AArch64/sme-streaming-body.ll    | 265 +++++++++++++++++++++
 .../AArch64/sve-fixed-length-frame-offests.ll      |   2 +-
 llvm/test/CodeGen/AMDGPU/sdag-print-divergence.ll  |   4 +-
 llvm/test/CodeGen/X86/callbr-asm-bb-exports.ll     |   2 +-
 .../X86/merge-store-partially-alias-loads.ll       |   2 +-
 .../Inputs/amdgpu_isel.ll.expected                 |   8 +-
 .../Inputs/lanai_isel.ll.expected                  |   8 +-
 .../Inputs/x86_isel.ll.expected                    |  16 +-
 14 files changed, 378 insertions(+), 37 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/sme-streaming-body.ll

diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index 5c83ead..7259e62 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -237,6 +237,12 @@ class SelectionDAG {
   ProfileSummaryInfo *PSI = nullptr;
   BlockFrequencyInfo *BFI = nullptr;

+  /// List of non-single value types.
+  FoldingSet VTListMap;
+
+  /// Pool allocation for misc. objects that are created once per SelectionDAG.
+  BumpPtrAllocator Allocator;
+
   /// The starting token.
   SDNode EntryNode;
@@ -263,9 +269,6 @@ class SelectionDAG {
   BumpPtrAllocator OperandAllocator;
   ArrayRecycler OperandRecycler;

-  /// Pool allocation for misc. objects that are created once per SelectionDAG.
-  BumpPtrAllocator Allocator;
-
   /// Tracks dbg_value and dbg_label information through SDISel.
   SDDbgInfo *DbgInfo;
@@ -2281,9 +2284,6 @@ private:
   SDNode *FindNodeOrInsertPos(const FoldingSetNodeID &ID, const SDLoc &DL,
                               void *&InsertPos);

-  /// List of non-single value types.
-  FoldingSet VTListMap;
-
   /// Maps to auto-CSE operations.
   std::vector CondCodeNodes;
diff --git a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
index 7d9bd0a..9d225ad4 100644
--- a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
@@ -1162,7 +1162,6 @@ EmitSpecialNode(SDNode *Node, bool IsClone, bool IsCloned,
 #endif
     llvm_unreachable("This target-independent node should have been selected!");
   case ISD::EntryToken:
-    llvm_unreachable("EntryToken should have been excluded from the schedule!");
   case ISD::MERGE_VALUES:
   case ISD::TokenFactor: // fall thru
     break;
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 93a74b7..9f01fa5 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -1275,7 +1275,7 @@ Align SelectionDAG::getEVTAlign(EVT VT) const {
 // EntryNode could meaningfully have debug info if we can find it...
 SelectionDAG::SelectionDAG(const TargetMachine &tm, CodeGenOpt::Level OL)
     : TM(tm), OptLevel(OL),
-      EntryNode(ISD::EntryToken, 0, DebugLoc(), getVTList(MVT::Other)),
+      EntryNode(ISD::EntryToken, 0, DebugLoc(), getVTList(MVT::Other, MVT::Glue)),
       Root(getEntryNode()) {
   InsertNode(&EntryNode);
   DbgInfo = new SDDbgInfo();
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 949dc47..3d12cf5 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -6037,6 +6037,13 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
     (void)Res;
   }

+  SMEAttrs Attrs(MF.getFunction());
+  bool IsLocallyStreaming =
+      !Attrs.hasStreamingInterface() && Attrs.hasStreamingBody();
+  assert(Chain.getOpcode() == ISD::EntryToken && "Unexpected Chain value");
+  SDValue Glue = Chain.getValue(1);
+
+  SmallVector ArgValues;
   unsigned ExtraArgLocs = 0;
   for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
     CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
@@ -6091,7 +6098,22 @@ SDValue AArch64TargetLowering::LowerFormalArguments(

       // Transform the arguments in physical registers into virtual ones.
       Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
-      ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
+
+      if (IsLocallyStreaming) {
+        // LocallyStreamingFunctions must insert the SMSTART in the correct
+        // position, so we use Glue to ensure no instructions can be scheduled
+        // between the chain of:
+        //        t0: ch,glue = EntryNode
+        //        t1: res,ch,glue = CopyFromReg
+        //        ...
+        //        tn: res,ch,glue = CopyFromReg t(n-1), ..
+        //        t(n+1): ch, glue = SMSTART t0:0, ...., tn:2
+        //        ^^^^^^
+        //         This will be the new Chain/Root node.
+        ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT, Glue);
+        Glue = ArgValue.getValue(2);
+      } else
+        ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);

       // If this is an 8, 16 or 32-bit value, it is really passed promoted
       // to 64 bits. Insert an assert[sz]ext to capture this, then
@@ -6245,6 +6267,27 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
   }
   assert((ArgLocs.size() + ExtraArgLocs) == Ins.size());

+  // Insert the SMSTART if this is a locally streaming function and
+  // make sure it is Glued to the last CopyFromReg value.
+  if (IsLocallyStreaming) {
+    const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
+    Chain = DAG.getNode(
+        AArch64ISD::SMSTART, DL, DAG.getVTList(MVT::Other, MVT::Glue),
+        {DAG.getRoot(),
+         DAG.getTargetConstant((int32_t)AArch64SVCR::SVCRSM, DL, MVT::i32),
+         DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64),
+         DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask()), Glue});
+    // Ensure that the SMSTART happens after the CopyWithChain such that its
+    // chain result is used.
+    for (unsigned I=0; IisTargetDarwin() || IsWin64) {
@@ -7485,6 +7528,19 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
     }
   }

+  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
+
+  // Emit SMSTOP before returning from a locally streaming function
+  SMEAttrs FuncAttrs(MF.getFunction());
+  if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface()) {
+    Chain = DAG.getNode(
+        AArch64ISD::SMSTOP, DL, DAG.getVTList(MVT::Other, MVT::Glue), Chain,
+        DAG.getTargetConstant((int32_t)AArch64SVCR::SVCRSM, DL, MVT::i32),
+        DAG.getConstant(1, DL, MVT::i64), DAG.getConstant(0, DL, MVT::i64),
+        DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask()));
+    Flag = Chain.getValue(1);
+  }
+
   SmallVector RetOps(1, Chain);
   for (auto &RetVal : RetVals) {
     Chain = DAG.getCopyToReg(Chain, DL, RetVal.first, RetVal.second, Flag);
@@ -7509,7 +7565,6 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
         DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
   }

-  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
   const MCPhysReg *I = TRI->getCalleeSavedRegsViaCopy(&MF);
   if (I) {
     for (; *I; ++I) {
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index f4e807f..adc03b7 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -4256,6 +4256,8 @@ static void emitFrameOffsetAdj(MachineBasicBlock &MBB,
     break;
   case AArch64::ADDVL_XXI:
   case AArch64::ADDPL_XXI:
+  case AArch64::ADDSVL_XXI:
+  case AArch64::ADDSPL_XXI:
     MaxEncoding = 31;
     ShiftSize = 0;
     if (Offset < 0) {
@@ -4270,9 +4272,9 @@ static void emitFrameOffsetAdj(MachineBasicBlock &MBB,

   // `Offset` can be in bytes or in "scalable bytes".
   int VScale = 1;
-  if (Opc == AArch64::ADDVL_XXI)
+  if (Opc == AArch64::ADDVL_XXI || Opc == AArch64::ADDSVL_XXI)
     VScale = 16;
-  else if (Opc == AArch64::ADDPL_XXI)
+  else if (Opc == AArch64::ADDPL_XXI || Opc == AArch64::ADDSPL_XXI)
     VScale = 2;

   // FIXME: If the offset won't fit in 24-bits, compute the offset into a
@@ -4369,6 +4371,14 @@ void llvm::emitFrameOffset(MachineBasicBlock &MBB,
                            bool NeedsWinCFI, bool *HasWinCFI,
                            bool EmitCFAOffset, StackOffset CFAOffset,
                            unsigned FrameReg) {
+  // If a function is marked as arm_locally_streaming, then the runtime value of
+  // vscale in the prologue/epilogue is different from the runtime value of vscale
+  // in the function's body. To avoid having to consider multiple vscales,
+  // we can use `addsvl` to allocate any scalable stack-slots, which under
+  // most circumstances will be only locals, not callee-save slots.
+  const Function &F = MBB.getParent()->getFunction();
+  bool UseSVL = F.hasFnAttribute("aarch64_pstate_sm_body");
+
   int64_t Bytes, NumPredicateVectors, NumDataVectors;
   AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
       Offset, Bytes, NumPredicateVectors, NumDataVectors);
@@ -4399,8 +4409,9 @@ void llvm::emitFrameOffset(MachineBasicBlock &MBB,

   if (NumDataVectors) {
     emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumDataVectors,
-                       AArch64::ADDVL_XXI, TII, Flag, NeedsWinCFI, nullptr,
-                       EmitCFAOffset, CFAOffset, FrameReg);
+                       UseSVL ? AArch64::ADDSVL_XXI : AArch64::ADDVL_XXI,
+                       TII, Flag, NeedsWinCFI, nullptr, EmitCFAOffset,
+                       CFAOffset, FrameReg);
     CFAOffset += StackOffset::getScalable(-NumDataVectors * 16);
     SrcReg = DestReg;
   }
@@ -4408,8 +4419,9 @@ void llvm::emitFrameOffset(MachineBasicBlock &MBB,

   if (NumPredicateVectors) {
     assert(DestReg != AArch64::SP && "Unaligned access to SP");
     emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumPredicateVectors,
-                       AArch64::ADDPL_XXI, TII, Flag, NeedsWinCFI, nullptr,
-                       EmitCFAOffset, CFAOffset, FrameReg);
+                       UseSVL ? AArch64::ADDSPL_XXI : AArch64::ADDPL_XXI,
+                       TII, Flag, NeedsWinCFI, nullptr, EmitCFAOffset,
+                       CFAOffset, FrameReg);
   }
 }
diff --git a/llvm/test/CodeGen/AArch64/sme-get-pstatesm.ll b/llvm/test/CodeGen/AArch64/sme-get-pstatesm.ll
index a20abc8..d92290f 100644
--- a/llvm/test/CodeGen/AArch64/sme-get-pstatesm.ll
+++ b/llvm/test/CodeGen/AArch64/sme-get-pstatesm.ll
@@ -22,7 +22,17 @@ define i64 @get_pstatesm_streaming() nounwind "aarch64_pstate_sm_enabled" {
 define i64 @get_pstatesm_locally_streaming() nounwind "aarch64_pstate_sm_body" {
 ; CHECK-LABEL: get_pstatesm_locally_streaming:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    stp d15, d14, [sp, #-64]! // 16-byte Folded Spill
+; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    smstart sm
+; CHECK-NEXT:    smstop sm
+; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    mov w0, #1
+; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp], #64 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
   %pstate = call i64 @llvm.aarch64.sme.get.pstatesm()
   ret i64 %pstate
diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-body.ll b/llvm/test/CodeGen/AArch64/sme-streaming-body.ll
new file mode 100644
index 0000000..450f42d
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-body.ll
@@ -0,0 +1,265 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -start-after=simplifycfg -enable-tail-merge=false -verify-machineinstrs < %s | FileCheck %s
+
+declare void @normal_callee();
+declare void @streaming_callee() "aarch64_pstate_sm_enabled";
+declare void @streaming_compatible_callee() "aarch64_pstate_sm_compatible";
+
+define void @locally_streaming_caller_streaming_callee() "aarch64_pstate_sm_body" nounwind {
+; CHECK-LABEL: locally_streaming_caller_streaming_callee:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    smstart sm
+; CHECK-NEXT:    bl streaming_compatible_callee
+; CHECK-NEXT:    bl streaming_compatible_callee
+; CHECK-NEXT:    smstop sm
+; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT:    ret
+
+  call void @streaming_compatible_callee();
+  call void @streaming_compatible_callee();
+  ret void;
+}
+
+; Test that with a streaming body and a streaming interface, no smstart/smstop
+; are emitted, because the function is already in streaming mode upon entry.
+define void @streaming_and_locally_streaming_caller_streaming_callee() "aarch64_pstate_sm_enabled" "aarch64_pstate_sm_body" nounwind {
+; CHECK-LABEL: streaming_and_locally_streaming_caller_streaming_callee:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    bl streaming_callee
+; CHECK-NEXT:    bl streaming_callee
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  call void @streaming_callee();
+  call void @streaming_callee();
+  ret void;
+}
+
+define void @locally_streaming_multiple_exit(i64 %cond) "aarch64_pstate_sm_body" nounwind {
+; CHECK-LABEL: locally_streaming_multiple_exit:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    stp d15, d14, [sp, #-64]! // 16-byte Folded Spill
+; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    smstart sm
+; CHECK-NEXT:    cmp x0, #1
+; CHECK-NEXT:    b.ne .LBB2_2
+; CHECK-NEXT:  // %bb.1: // %if.else
+; CHECK-NEXT:    smstop sm
+; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp], #64 // 16-byte Folded Reload
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB2_2: // %if.end
+; CHECK-NEXT:    smstop sm
+; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp], #64 // 16-byte Folded Reload
+; CHECK-NEXT:    ret
+
+entry:
+  %tobool = icmp eq i64 %cond, 1
+  br i1 %tobool, label %if.else, label %if.end
+
+if.else:
+  ret void;
+
+if.end:
+  ret void;
+}
+
+; Do a fixed-width vector add on a NEON vector.
+; This tests that:
+; * Incoming vector in v0.d isn't clobbered by the change in streaming mode.
+; * Result vector is correctly preserved after smstop.
+define <2 x i64> @locally_streaming_caller_no_callee(<2 x i64> %a) "aarch64_pstate_sm_body" nounwind {
+; CHECK-LABEL: locally_streaming_caller_no_callee:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #80
+; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    smstart sm
+; CHECK-NEXT:    adrp x8, .LCPI3_0
+; CHECK-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI3_0]
+; CHECK-NEXT:    add v0.2d, v1.2d, v0.2d
+; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    smstop sm
+; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    add sp, sp, #80
+; CHECK-NEXT:    ret
+
+  %add = add <2 x i64> %a, ;
+  ret <2 x i64> %add;
+}
+
+; Test that we use the interface (not the function's body) to determine which
+; streaming mode to enter the callee in. In this case the interface is normal,
+; so pstate.sm must be 0 on entry and is 0 upon return from the callee.
+define void @locally_streaming_caller_locally_streaming_callee() "aarch64_pstate_sm_body" nounwind {
+; CHECK-LABEL: locally_streaming_caller_locally_streaming_callee:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    smstart sm
+; CHECK-NEXT:    smstop sm
+; CHECK-NEXT:    bl locally_streaming_caller_streaming_callee
+; CHECK-NEXT:    smstart sm
+; CHECK-NEXT:    smstop sm
+; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT:    ret
+
+  call void @locally_streaming_caller_streaming_callee();
+  ret void;
+}
+
+;
+; Test that a locally streaming function correctly preserves its
+; argument/result registers, because the smstart/smstop instructions that are
+; inserted to implement the arm_locally_streaming attribute clobber the
+; vector register contents.
+; + +define <2 x i64> @locally_streaming_caller_compatible_callee_vec_args_ret(<2 x i64> %a) "aarch64_pstate_sm_body" nounwind { +; CHECK-LABEL: locally_streaming_caller_compatible_callee_vec_args_ret: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #80] // 8-byte Folded Spill +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: bl streaming_compatible_callee_vec_args_ret +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + %res = call <2 x i64> @streaming_compatible_callee_vec_args_ret(<2 x i64> %a) "aarch64_pstate_sm_compatible" + ret <2 x i64> %res; +} + +declare <2 x i64> @streaming_compatible_callee_vec_args_ret(<2 x i64>) "aarch64_pstate_sm_compatible" + +define {<2 x i64>, <2 x i64>} @locally_streaming_caller_compatible_callee_struct_arg_ret({<2 x i64>, <2 x i64>} %arg) "aarch64_pstate_sm_body" nounwind { +; CHECK-LABEL: locally_streaming_caller_compatible_callee_struct_arg_ret: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #112 +; CHECK-NEXT: stp d15, d14, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #96] // 8-byte Folded Spill +; CHECK-NEXT: str q1, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: bl streaming_compatible_callee_vec_arg_struct_ret +; CHECK-NEXT: stp q1, q0, [sp] // 32-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldp q1, q0, [sp] // 32-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #96] // 8-byte Folded Reload +; CHECK-NEXT: add sp, sp, #112 +; CHECK-NEXT: ret + %v1.arg = extractvalue {<2 x i64>, <2 x i64>} %arg, 1 + %res = call {<2 x i64>, <2 x i64>} @streaming_compatible_callee_vec_arg_struct_ret(<2 x i64> %v1.arg) "aarch64_pstate_sm_compatible" + ret {<2 x i64>, <2 x i64>} %res; +} + +declare {<2 x i64>, <2 x i64>} @streaming_compatible_callee_vec_arg_struct_ret(<2 x i64>) "aarch64_pstate_sm_compatible" + +; Test that we use `addsvl` for allocating any stack space for locals before `smstart`, +; such that the correct amount of stack space is allocated. +define void @locally_streaming_caller_alloca() nounwind "aarch64_pstate_sm_body" { +; CHECK-LABEL: locally_streaming_caller_alloca: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-80]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: addsvl sp, sp, #-1 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: mov x0, sp +; CHECK-NEXT: bl use_ptr +; CHECK-NEXT: smstop sm +; CHECK-NEXT: addsvl sp, sp, #1 +; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-NEXT: ret + %alloca = alloca + call void @use_ptr(ptr %alloca) "aarch64_pstate_sm_compatible" + ret void +} + +declare void @use_ptr(ptr) "aarch64_pstate_sm_compatible" + +define double @call_to_intrinsic_without_chain(double %x) nounwind "aarch64_pstate_sm_body" { +; CHECK-LABEL: call_to_intrinsic_without_chain: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: str d0, [sp, #72] // 8-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr d0, [sp, #72] // 8-byte Folded Reload +; CHECK-NEXT: bl cos +; CHECK-NEXT: str d0, [sp, #72] // 8-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr d0, [sp, #72] // 8-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-NEXT: ret +entry: + %0 = call fast double @llvm.cos.f64(double %x) + ret double %0 +} + +declare double @llvm.cos.f64(double) diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-frame-offests.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-frame-offests.ll index 98ae39c..62681cd 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-frame-offests.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-frame-offests.ll @@ -10,7 +10,7 @@ target triple = "aarch64-unknown-linux-gnu" define void @foo(<8 x i64>* %a) #0 { ; CHECK-LABEL: foo: ; CHECK: SelectionDAG has 14 nodes: -; CHECK-NEXT: t0: ch = EntryToken +; CHECK-NEXT: t0: ch,glue = EntryToken ; CHECK-NEXT: t12: nxv2i1 = PTRUE_D TargetConstant:i32<31> ; CHECK-NEXT: t2: i64,ch = CopyFromReg t0, Register:i64 %0 ; CHECK-NEXT: t18: nxv2i64,ch = LD1D_IMM t12, t2, TargetConstant:i64<0>, t0 diff --git a/llvm/test/CodeGen/AMDGPU/sdag-print-divergence.ll b/llvm/test/CodeGen/AMDGPU/sdag-print-divergence.ll index 38d6533..25c93bd 100644 --- a/llvm/test/CodeGen/AMDGPU/sdag-print-divergence.ll +++ b/llvm/test/CodeGen/AMDGPU/sdag-print-divergence.ll @@ -7,7 +7,7 @@ ; GCN: Initial selection DAG: %bb.0 'test_sdag_dump:entry' ; GCN: SelectionDAG has 10 nodes: -; GCN-DEFAULT: t0: ch = EntryToken +; GCN-DEFAULT: t0: ch,glue = EntryToken ; GCN-DEFAULT: t2: f32,ch = CopyFromReg t0, Register:f32 %0 ; GCN-DEFAULT: t5: f32 = fadd t2, t2 ; GCN-DEFAULT: t4: f32,ch = CopyFromReg # D:1 t0, Register:f32 %1 @@ 
-15,7 +15,7 @@ ; GCN-DEFAULT: t8: ch,glue = CopyToReg # D:1 t0, Register:f32 $vgpr0, t6 ; GCN-DEFAULT: t9: ch = RETURN_TO_EPILOG # D:1 t8, Register:f32 $vgpr0, t8:1 -; GCN-VERBOSE: t0: ch = EntryToken # D:0 +; GCN-VERBOSE: t0: ch,glue = EntryToken # D:0 ; GCN-VERBOSE: t2: f32,ch = CopyFromReg [ORD=1] # D:0 t0, Register:f32 %0 # D:0 ; GCN-VERBOSE: t5: f32 = fadd [ORD=2] # D:0 t2, t2 ; GCN-VERBOSE: t4: f32,ch = CopyFromReg [ORD=1] # D:1 t0, Register:f32 %1 # D:0 diff --git a/llvm/test/CodeGen/X86/callbr-asm-bb-exports.ll b/llvm/test/CodeGen/X86/callbr-asm-bb-exports.ll index 4e8bd2f..876bf78 100644 --- a/llvm/test/CodeGen/X86/callbr-asm-bb-exports.ll +++ b/llvm/test/CodeGen/X86/callbr-asm-bb-exports.ll @@ -5,7 +5,7 @@ ; inlineasm_br. Not sure how to get a MachineIR change so this reads the debug ; output from SelectionDAG. -; CHECK: t0: ch = EntryToken +; CHECK: t0: ch,glue = EntryToken ; CHECK-NEXT: t4: i32,ch = CopyFromReg t0, Register:i32 %3 ; CHECK-NEXT: t10: i32 = add t4, Constant:i32<1> ; CHECK-NEXT: t12: ch = CopyToReg t0, Register:i32 %0, t10 diff --git a/llvm/test/CodeGen/X86/merge-store-partially-alias-loads.ll b/llvm/test/CodeGen/X86/merge-store-partially-alias-loads.ll index d48c59a..4d0d05e 100644 --- a/llvm/test/CodeGen/X86/merge-store-partially-alias-loads.ll +++ b/llvm/test/CodeGen/X86/merge-store-partially-alias-loads.ll @@ -14,7 +14,7 @@ ; X86-NEXT: retq ; DBGDAG-LABEL: Optimized legalized selection DAG: %bb.0 'merge_store_partial_overlap_load:' -; DBGDAG: [[ENTRYTOKEN:t[0-9]+]]: ch = EntryToken +; DBGDAG: [[ENTRYTOKEN:t[0-9]+]]: ch,glue = EntryToken ; DBGDAG-DAG: [[BASEPTR:t[0-9]+]]: i64,ch = CopyFromReg [[ENTRYTOKEN]], ; DBGDAG-DAG: [[ADDPTR:t[0-9]+]]: i64 = add {{(nuw )?}}[[BASEPTR]], Constant:i64<2> diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected index 0cc5ede..38c2040 100644 --- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected @@ -4,7 +4,7 @@ define i64 @i64_test(i64 %i) nounwind readnone { ; CHECK-LABEL: i64_test: ; CHECK: SelectionDAG has 9 nodes: -; CHECK-NEXT: t0: ch = EntryToken +; CHECK-NEXT: t0: ch,glue = EntryToken ; CHECK-NEXT: t11: ch,glue = CopyToReg t0, Register:i32 $vgpr0, IMPLICIT_DEF:i32 ; CHECK-NEXT: t17: i32 = V_MOV_B32_e32 TargetConstant:i32<0> ; CHECK-NEXT: t13: ch,glue = CopyToReg t11, Register:i32 $vgpr1, t17, t11:1 @@ -20,7 +20,7 @@ define i64 @i32_test(i32 %i) nounwind readnone { ; CHECK-LABEL: i32_test: ; CHECK: SelectionDAG has 8 nodes: ; CHECK-NEXT: t5: i32 = V_MOV_B32_e32 TargetConstant:i32<0> -; CHECK-NEXT: t0: ch = EntryToken +; CHECK-NEXT: t0: ch,glue = EntryToken ; CHECK-NEXT: t7: ch,glue = CopyToReg t0, Register:i32 $vgpr0, t5 ; CHECK-NEXT: t9: ch,glue = CopyToReg t7, Register:i32 $vgpr1, t5, t7:1 ; CHECK-NEXT: t10: ch = SI_RETURN Register:i32 $vgpr0, Register:i32 $vgpr1, t9, t9:1 @@ -36,7 +36,7 @@ define i64 @i16_test(i16 %i) nounwind readnone { ; CHECK-LABEL: i16_test: ; CHECK: SelectionDAG has 8 nodes: ; CHECK-NEXT: t5: i32 = V_MOV_B32_e32 TargetConstant:i32<0> -; CHECK-NEXT: t0: ch = EntryToken +; CHECK-NEXT: t0: ch,glue = EntryToken ; CHECK-NEXT: t7: ch,glue = CopyToReg t0, Register:i32 $vgpr0, t5 ; CHECK-NEXT: t9: ch,glue = CopyToReg t7, Register:i32 $vgpr1, t5, t7:1 ; CHECK-NEXT: t10: ch = SI_RETURN Register:i32 $vgpr0, Register:i32 $vgpr1, t9, t9:1 @@ -52,7 
+52,7 @@ define i64 @i8_test(i8 %i) nounwind readnone { ; CHECK-LABEL: i8_test: ; CHECK: SelectionDAG has 8 nodes: ; CHECK-NEXT: t5: i32 = V_MOV_B32_e32 TargetConstant:i32<0> -; CHECK-NEXT: t0: ch = EntryToken +; CHECK-NEXT: t0: ch,glue = EntryToken ; CHECK-NEXT: t7: ch,glue = CopyToReg t0, Register:i32 $vgpr0, t5 ; CHECK-NEXT: t9: ch,glue = CopyToReg t7, Register:i32 $vgpr1, t5, t7:1 ; CHECK-NEXT: t10: ch = SI_RETURN Register:i32 $vgpr0, Register:i32 $vgpr1, t9, t9:1 diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/lanai_isel.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/lanai_isel.ll.expected index 62aa8da..7d152d9 100644 --- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/lanai_isel.ll.expected +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/lanai_isel.ll.expected @@ -4,7 +4,7 @@ define i64 @i64_test(i64 %i) nounwind readnone { ; CHECK-LABEL: i64_test: ; CHECK: SelectionDAG has 22 nodes: -; CHECK-NEXT: t0: ch = EntryToken +; CHECK-NEXT: t0: ch,glue = EntryToken ; CHECK-NEXT: t5: i32,ch = LDW_RI TargetFrameIndex:i32<-2>, TargetConstant:i32<0>, TargetConstant:i32<0>, t0 ; CHECK-NEXT: t7: i32 = ADD_I_LO TargetFrameIndex:i32<0>, TargetConstant:i32<0> ; CHECK-NEXT: t29: i32 = OR_I_LO t7, TargetConstant:i32<4> @@ -29,7 +29,7 @@ define i64 @i64_test(i64 %i) nounwind readnone { define i64 @i32_test(i32 %i) nounwind readnone { ; CHECK-LABEL: i32_test: ; CHECK: SelectionDAG has 14 nodes: -; CHECK-NEXT: t0: ch = EntryToken +; CHECK-NEXT: t0: ch,glue = EntryToken ; CHECK-NEXT: t21: i32,ch = CopyFromReg t0, Register:i32 $r0 ; CHECK-NEXT: t13: ch,glue = CopyToReg t0, Register:i32 $rv, t21 ; CHECK-NEXT: t3: i32,ch = LDW_RI TargetFrameIndex:i32<-1>, TargetConstant:i32<0>, TargetConstant:i32<0>, t0 @@ -48,7 +48,7 @@ define i64 @i32_test(i32 %i) nounwind readnone { define i64 @i16_test(i16 %i) nounwind readnone { ; CHECK-LABEL: i16_test: ; CHECK: SelectionDAG has 19 nodes: -; CHECK-NEXT: t0: ch = EntryToken +; CHECK-NEXT: t0: ch,glue = EntryToken ; CHECK-NEXT: t33: i32,ch = CopyFromReg t0, Register:i32 $r0 ; CHECK-NEXT: t14: ch,glue = CopyToReg t0, Register:i32 $rv, t33 ; CHECK-NEXT: t1: i32 = ADD_I_LO TargetFrameIndex:i32<-1>, TargetConstant:i32<0> @@ -71,7 +71,7 @@ define i64 @i16_test(i16 %i) nounwind readnone { define i64 @i8_test(i8 %i) nounwind readnone { ; CHECK-LABEL: i8_test: ; CHECK: SelectionDAG has 20 nodes: -; CHECK-NEXT: t0: ch = EntryToken +; CHECK-NEXT: t0: ch,glue = EntryToken ; CHECK-NEXT: t33: i32,ch = CopyFromReg t0, Register:i32 $r0 ; CHECK-NEXT: t14: ch,glue = CopyToReg t0, Register:i32 $rv, t33 ; CHECK-NEXT: t1: i32 = ADD_I_LO TargetFrameIndex:i32<-1>, TargetConstant:i32<0> diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_isel.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_isel.ll.expected index 279c8ee..0175556 100644 --- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_isel.ll.expected +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_isel.ll.expected @@ -5,7 +5,7 @@ define i64 @i64_test(i64 %i) nounwind readnone { ; PIC-LABEL: i64_test: ; PIC: SelectionDAG has 12 nodes: -; PIC-NEXT: t0: ch = EntryToken +; PIC-NEXT: t0: ch,glue = EntryToken ; PIC-NEXT: t2: i64,ch = CopyFromReg t0, Register:i64 %0 ; PIC-NEXT: t7: i64,i32,ch = ADD64rm t2, TargetFrameIndex:i64<0>, TargetConstant:i8<1>, Register:i64 $noreg, TargetConstant:i32<0>, Register:i16 $noreg, t0 ; PIC-NEXT: t10: ch,glue = 
CopyToReg t0, Register:i64 $rax, t7 @@ -14,7 +14,7 @@ define i64 @i64_test(i64 %i) nounwind readnone { ; ; WIN-LABEL: i64_test: ; WIN: SelectionDAG has 12 nodes: -; WIN-NEXT: t0: ch = EntryToken +; WIN-NEXT: t0: ch,glue = EntryToken ; WIN-NEXT: t2: i64,ch = CopyFromReg t0, Register:i64 %0 ; WIN-NEXT: t7: i64,i32,ch = ADD64rm t2, TargetFrameIndex:i64<0>, TargetConstant:i8<1>, Register:i64 $noreg, TargetConstant:i32<0>, Register:i16 $noreg, t0 ; WIN-NEXT: t10: ch,glue = CopyToReg t0, Register:i64 $rax, t7 @@ -29,7 +29,7 @@ define i64 @i64_test(i64 %i) nounwind readnone { define i64 @i32_test(i32 %i) nounwind readnone { ; PIC-LABEL: i32_test: ; PIC: SelectionDAG has 15 nodes: -; PIC-NEXT: t0: ch = EntryToken +; PIC-NEXT: t0: ch,glue = EntryToken ; PIC-NEXT: t2: i32,ch = CopyFromReg t0, Register:i32 %0 ; PIC-NEXT: t7: i32,i32,ch = ADD32rm t2, TargetFrameIndex:i64<0>, TargetConstant:i8<1>, Register:i64 $noreg, TargetConstant:i32<0>, Register:i16 $noreg, t0 ; PIC-NEXT: t8: i64 = SUBREG_TO_REG TargetConstant:i64<0>, t7, TargetConstant:i32<6> @@ -39,7 +39,7 @@ define i64 @i32_test(i32 %i) nounwind readnone { ; ; WIN-LABEL: i32_test: ; WIN: SelectionDAG has 15 nodes: -; WIN-NEXT: t0: ch = EntryToken +; WIN-NEXT: t0: ch,glue = EntryToken ; WIN-NEXT: t2: i32,ch = CopyFromReg t0, Register:i32 %0 ; WIN-NEXT: t7: i32,i32,ch = ADD32rm t2, TargetFrameIndex:i64<0>, TargetConstant:i8<1>, Register:i64 $noreg, TargetConstant:i32<0>, Register:i16 $noreg, t0 ; WIN-NEXT: t8: i64 = SUBREG_TO_REG TargetConstant:i64<0>, t7, TargetConstant:i32<6> @@ -56,7 +56,7 @@ define i64 @i32_test(i32 %i) nounwind readnone { define i64 @i16_test(i16 %i) nounwind readnone { ; PIC-LABEL: i16_test: ; PIC: SelectionDAG has 18 nodes: -; PIC-NEXT: t0: ch = EntryToken +; PIC-NEXT: t0: ch,glue = EntryToken ; PIC-NEXT: t2: i32,ch = CopyFromReg t0, Register:i32 %0 ; PIC-NEXT: t3: i16 = EXTRACT_SUBREG t2, TargetConstant:i32<4> ; PIC-NEXT: t8: i16,i32,ch = ADD16rm t3, TargetFrameIndex:i64<0>, TargetConstant:i8<1>, Register:i64 $noreg, TargetConstant:i32<0>, Register:i16 $noreg, t0 @@ -68,7 +68,7 @@ define i64 @i16_test(i16 %i) nounwind readnone { ; ; WIN-LABEL: i16_test: ; WIN: SelectionDAG has 16 nodes: -; WIN-NEXT: t0: ch = EntryToken +; WIN-NEXT: t0: ch,glue = EntryToken ; WIN-NEXT: t2: i16,ch = CopyFromReg t0, Register:i16 %0 ; WIN-NEXT: t7: i16,i32,ch = ADD16rm t2, TargetFrameIndex:i64<0>, TargetConstant:i8<1>, Register:i64 $noreg, TargetConstant:i32<0>, Register:i16 $noreg, t0 ; WIN-NEXT: t14: i32 = MOVZX32rr16 t7 @@ -86,7 +86,7 @@ define i64 @i16_test(i16 %i) nounwind readnone { define i64 @i8_test(i8 %i) nounwind readnone { ; PIC-LABEL: i8_test: ; PIC: SelectionDAG has 18 nodes: -; PIC-NEXT: t0: ch = EntryToken +; PIC-NEXT: t0: ch,glue = EntryToken ; PIC-NEXT: t2: i32,ch = CopyFromReg t0, Register:i32 %0 ; PIC-NEXT: t3: i8 = EXTRACT_SUBREG t2, TargetConstant:i32<1> ; PIC-NEXT: t8: i8,i32,ch = ADD8rm t3, TargetFrameIndex:i64<0>, TargetConstant:i8<1>, Register:i64 $noreg, TargetConstant:i32<0>, Register:i16 $noreg, t0 @@ -98,7 +98,7 @@ define i64 @i8_test(i8 %i) nounwind readnone { ; ; WIN-LABEL: i8_test: ; WIN: SelectionDAG has 16 nodes: -; WIN-NEXT: t0: ch = EntryToken +; WIN-NEXT: t0: ch,glue = EntryToken ; WIN-NEXT: t2: i8,ch = CopyFromReg t0, Register:i8 %0 ; WIN-NEXT: t7: i8,i32,ch = ADD8rm t2, TargetFrameIndex:i64<0>, TargetConstant:i8<1>, Register:i64 $noreg, TargetConstant:i32<0>, Register:i16 $noreg, t0 ; WIN-NEXT: t14: i32 = MOVZX32rr8 t7 -- 2.7.4
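
Background note (not part of the patch above): a minimal IR sketch of a locally
streaming function, along the lines of the tests added in sme-streaming-body.ll.
The function and callee names here are illustrative only.

; The "aarch64_pstate_sm_body" attribute gives @example a normal (non-streaming)
; interface but a streaming body: per this patch, codegen emits an SMSTART glued
; to the incoming CopyFromReg nodes on entry and an SMSTOP before the return.
define void @example() "aarch64_pstate_sm_body" nounwind {
  call void @compatible_callee() "aarch64_pstate_sm_compatible"
  ret void
}

declare void @compatible_callee() "aarch64_pstate_sm_compatible"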