From: Tim Northover
Date: Mon, 1 Feb 2021 10:08:40 +0000 (+0000)
Subject: ARM: support mandatory tail calls for tailcc & swifttailcc
X-Git-Tag: llvmorg-14-init~5355
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=d88f96dff3f192fc0c1bf57f7810b95a709b3591;p=platform%2Fupstream%2Fllvm.git

ARM: support mandatory tail calls for tailcc & swifttailcc

This adds support for callee-pop conventions to the ARM backend so that it
can ensure a call marked "tail" is actually a tail call.
---

diff --git a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
index 33d5aaf..2f7f3ee 100644
--- a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
@@ -79,6 +79,11 @@ ARMBaseRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
     return CSR_NoRegs_SaveList;
   } else if (F.getCallingConv() == CallingConv::CFGuard_Check) {
     return CSR_Win_AAPCS_CFGuard_Check_SaveList;
+  } else if (F.getCallingConv() == CallingConv::SwiftTail) {
+    return STI.isTargetDarwin()
+               ? CSR_iOS_SwiftTail_SaveList
+               : (UseSplitPush ? CSR_AAPCS_SplitPush_SwiftTail_SaveList
+                               : CSR_AAPCS_SwiftTail_SaveList);
   } else if (F.hasFnAttribute("interrupt")) {
     if (STI.isMClass()) {
       // M-class CPUs have hardware which saves the registers needed to allow a
@@ -129,6 +134,10 @@ ARMBaseRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
     return CSR_NoRegs_RegMask;
   if (CC == CallingConv::CFGuard_Check)
     return CSR_Win_AAPCS_CFGuard_Check_RegMask;
+  if (CC == CallingConv::SwiftTail) {
+    return STI.isTargetDarwin() ? CSR_iOS_SwiftTail_RegMask
+                                : CSR_AAPCS_SwiftTail_RegMask;
+  }
   if (STI.getTargetLowering()->supportSwiftError() &&
       MF.getFunction().getAttributes().hasAttrSomewhere(Attribute::SwiftError))
     return STI.isTargetDarwin() ? CSR_iOS_SwiftError_RegMask
diff --git a/llvm/lib/Target/ARM/ARMCallingConv.td b/llvm/lib/Target/ARM/ARMCallingConv.td
index 3517274..a6dbe56 100644
--- a/llvm/lib/Target/ARM/ARMCallingConv.td
+++ b/llvm/lib/Target/ARM/ARMCallingConv.td
@@ -278,6 +278,9 @@ def CSR_Win_AAPCS_CFGuard_Check : CalleeSavedRegs<(add LR, R11, R10, R9, R8, R7,
 // R8 is used to pass swifterror, remove it from CSR.
 def CSR_AAPCS_SwiftError : CalleeSavedRegs<(sub CSR_AAPCS, R8)>;
 
+// R10 is used to pass swiftself, remove it from CSR.
+def CSR_AAPCS_SwiftTail : CalleeSavedRegs<(sub CSR_AAPCS, R10)>;
+
 // The order of callee-saved registers needs to match the order we actually push
 // them in FrameLowering, because this order is what's used by
 // PrologEpilogInserter to allocate frame index slots. So when R7 is the frame
@@ -290,6 +293,10 @@ def CSR_AAPCS_SplitPush : CalleeSavedRegs<(add LR, R7, R6, R5, R4,
 def CSR_AAPCS_SplitPush_SwiftError : CalleeSavedRegs<(sub CSR_AAPCS_SplitPush,
                                                       R8)>;
 
+// R10 is used to pass swiftself, remove it from CSR.
+def CSR_AAPCS_SplitPush_SwiftTail : CalleeSavedRegs<(sub CSR_AAPCS_SplitPush,
+                                                     R10)>;
+
 // Constructors and destructors return 'this' in the ARM C++ ABI; since 'this'
 // and the pointer return value are both passed in R0 in these cases, this can
 // be partially modelled by treating R0 as a callee-saved register
@@ -305,6 +312,9 @@ def CSR_iOS : CalleeSavedRegs<(add LR, R7, R6, R5, R4, (sub CSR_AAPCS, R9))>;
 // R8 is used to pass swifterror, remove it from CSR.
 def CSR_iOS_SwiftError : CalleeSavedRegs<(sub CSR_iOS, R8)>;
 
+// R10 is used to pass swiftself, remove it from CSR.
+def CSR_iOS_SwiftTail : CalleeSavedRegs<(sub CSR_iOS, R10)>; + def CSR_iOS_ThisReturn : CalleeSavedRegs<(add LR, R7, R6, R5, R4, (sub CSR_AAPCS_ThisReturn, R9))>; diff --git a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp index deea297..8d6bc06 100644 --- a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -2037,7 +2037,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, } auto NewMI = std::prev(MBBI); - for (unsigned i = 1, e = MBBI->getNumOperands(); i != e; ++i) + for (unsigned i = 2, e = MBBI->getNumOperands(); i != e; ++i) NewMI->addOperand(MBBI->getOperand(i)); diff --git a/llvm/lib/Target/ARM/ARMFastISel.cpp b/llvm/lib/Target/ARM/ARMFastISel.cpp index 73cb3a2..32c54e8 100644 --- a/llvm/lib/Target/ARM/ARMFastISel.cpp +++ b/llvm/lib/Target/ARM/ARMFastISel.cpp @@ -1849,6 +1849,7 @@ CCAssignFn *ARMFastISel::CCAssignFnForCall(CallingConv::ID CC, } case CallingConv::ARM_AAPCS_VFP: case CallingConv::Swift: + case CallingConv::SwiftTail: if (!isVarArg) return (Return ? RetCC_ARM_AAPCS_VFP: CC_ARM_AAPCS_VFP); // Fall through to soft float variant, variadic functions don't @@ -3014,6 +3015,7 @@ bool ARMFastISel::fastLowerArguments() { case CallingConv::ARM_AAPCS: case CallingConv::ARM_APCS: case CallingConv::Swift: + case CallingConv::SwiftTail: break; } diff --git a/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/llvm/lib/Target/ARM/ARMFrameLowering.cpp index fea7718..c115cc1 100644 --- a/llvm/lib/Target/ARM/ARMFrameLowering.cpp +++ b/llvm/lib/Target/ARM/ARMFrameLowering.cpp @@ -237,6 +237,41 @@ ARMFrameLowering::canSimplifyCallFramePseudos(const MachineFunction &MF) const { return hasReservedCallFrame(MF) || MF.getFrameInfo().hasVarSizedObjects(); } +// Returns how much of the incoming argument stack area we should clean up in an +// epilogue. For the C calling convention this will be 0, for guaranteed tail +// call conventions it can be positive (a normal return or a tail call to a +// function that uses less stack space for arguments) or negative (for a tail +// call to a function that needs more stack space than us for arguments). +static int getArgumentStackToRestore(MachineFunction &MF, + MachineBasicBlock &MBB) { + MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); + bool IsTailCallReturn = false; + if (MBB.end() != MBBI) { + unsigned RetOpcode = MBBI->getOpcode(); + IsTailCallReturn = RetOpcode == ARM::TCRETURNdi || + RetOpcode == ARM::TCRETURNri; + } + ARMFunctionInfo *AFI = MF.getInfo(); + + int ArgumentPopSize = 0; + if (IsTailCallReturn) { + MachineOperand &StackAdjust = MBBI->getOperand(1); + + // For a tail-call in a callee-pops-arguments environment, some or all of + // the stack may actually be in use for the call's arguments, this is + // calculated during LowerCall and consumed here... + ArgumentPopSize = StackAdjust.getImm(); + } else { + // ... otherwise the amount to pop is *all* of the argument space, + // conveniently stored in the MachineFunctionInfo by + // LowerFormalArguments. This will, of course, be zero for the C calling + // convention. 
+ ArgumentPopSize = AFI->getArgumentStackToRestore(); + } + + return ArgumentPopSize; +} + static void emitRegPlusImmediate( bool isARM, MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, const DebugLoc &dl, const ARMBaseInstrInfo &TII, unsigned DestReg, @@ -868,7 +903,13 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF, "This emitEpilogue does not support Thumb1!"); bool isARM = !AFI->isThumbFunction(); - unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize(); + // Amount of stack space we reserved next to incoming args for either + // varargs registers or stack arguments in tail calls made by this function. + unsigned ReservedArgStack = AFI->getArgRegsSaveSize(); + + // How much of the stack used by incoming arguments this function is expected + // to restore in this particular epilogue. + int IncomingArgStackToRestore = getArgumentStackToRestore(MF, MBB); int NumBytes = (int)MFI.getStackSize(); Register FramePtr = RegInfo->getFrameRegister(MF); @@ -882,8 +923,8 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF, DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); if (!AFI->hasStackFrame()) { - if (NumBytes - ArgRegsSaveSize != 0) - emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes - ArgRegsSaveSize, + if (NumBytes - ReservedArgStack != 0) + emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes - ReservedArgStack, MachineInstr::FrameDestroy); } else { // Unwind MBBI to point to first LDR / VLDRD. @@ -897,7 +938,7 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF, } // Move SP to start of FP callee save spill area. - NumBytes -= (ArgRegsSaveSize + + NumBytes -= (ReservedArgStack + AFI->getFPCXTSaveAreaSize() + AFI->getGPRCalleeSavedArea1Size() + AFI->getGPRCalleeSavedArea2Size() + @@ -969,9 +1010,13 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF, if (AFI->getFPCXTSaveAreaSize()) MBBI++; } - if (ArgRegsSaveSize) - emitSPUpdate(isARM, MBB, MBBI, dl, TII, ArgRegsSaveSize, + if (ReservedArgStack || IncomingArgStackToRestore) { + assert(ReservedArgStack + IncomingArgStackToRestore >= 0 && + "attempting to restore negative stack amount"); + emitSPUpdate(isARM, MBB, MBBI, dl, TII, + ReservedArgStack + IncomingArgStackToRestore, MachineInstr::FrameDestroy); + } } /// getFrameIndexReference - Provide a base+offset reference to an FI slot for @@ -2288,31 +2333,37 @@ MachineBasicBlock::iterator ARMFrameLowering::eliminateCallFramePseudoInstr( MachineBasicBlock::iterator I) const { const ARMBaseInstrInfo &TII = *static_cast(MF.getSubtarget().getInstrInfo()); + ARMFunctionInfo *AFI = MF.getInfo(); + bool isARM = !AFI->isThumbFunction(); + DebugLoc dl = I->getDebugLoc(); + unsigned Opc = I->getOpcode(); + bool IsDestroy = Opc == TII.getCallFrameDestroyOpcode(); + unsigned CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0; + + assert(!AFI->isThumb1OnlyFunction() && + "This eliminateCallFramePseudoInstr does not support Thumb1!"); + + int PIdx = I->findFirstPredOperandIdx(); + ARMCC::CondCodes Pred = (PIdx == -1) + ? ARMCC::AL + : (ARMCC::CondCodes)I->getOperand(PIdx).getImm(); + unsigned PredReg = TII.getFramePred(*I); + if (!hasReservedCallFrame(MF)) { + // Bail early if the callee is expected to do the adjustment. 
+ if (IsDestroy && CalleePopAmount != -1U) + return MBB.erase(I); + // If we have alloca, convert as follows: // ADJCALLSTACKDOWN -> sub, sp, sp, amount // ADJCALLSTACKUP -> add, sp, sp, amount - MachineInstr &Old = *I; - DebugLoc dl = Old.getDebugLoc(); - unsigned Amount = TII.getFrameSize(Old); + unsigned Amount = TII.getFrameSize(*I); if (Amount != 0) { // We need to keep the stack aligned properly. To do this, we round the // amount of space needed for the outgoing arguments up to the next // alignment boundary. Amount = alignSPAdjust(Amount); - ARMFunctionInfo *AFI = MF.getInfo(); - assert(!AFI->isThumb1OnlyFunction() && - "This eliminateCallFramePseudoInstr does not support Thumb1!"); - bool isARM = !AFI->isThumbFunction(); - - // Replace the pseudo instruction with a new instruction... - unsigned Opc = Old.getOpcode(); - int PIdx = Old.findFirstPredOperandIdx(); - ARMCC::CondCodes Pred = - (PIdx == -1) ? ARMCC::AL - : (ARMCC::CondCodes)Old.getOperand(PIdx).getImm(); - unsigned PredReg = TII.getFramePred(Old); if (Opc == ARM::ADJCALLSTACKDOWN || Opc == ARM::tADJCALLSTACKDOWN) { emitSPUpdate(isARM, MBB, I, dl, TII, -Amount, MachineInstr::NoFlags, Pred, PredReg); @@ -2322,6 +2373,11 @@ MachineBasicBlock::iterator ARMFrameLowering::eliminateCallFramePseudoInstr( Pred, PredReg); } } + } else if (CalleePopAmount != -1U) { + // If the calling convention demands that the callee pops arguments from the + // stack, we want to add it back if we have a reserved call frame. + emitSPUpdate(isARM, MBB, I, dl, TII, -CalleePopAmount, + MachineInstr::NoFlags, Pred, PredReg); } return MBB.erase(I); } diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 3f4321b..4dee743 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -2008,6 +2008,7 @@ ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC, case CallingConv::SwiftTail: return isVarArg ? CallingConv::ARM_AAPCS : CallingConv::ARM_AAPCS_VFP; case CallingConv::C: + case CallingConv::Tail: if (!Subtarget->isAAPCS_ABI()) return CallingConv::ARM_APCS; else if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() && @@ -2184,19 +2185,31 @@ SDValue ARMTargetLowering::LowerCallResult( return Chain; } -/// LowerMemOpCallTo - Store the argument to the stack. 
-SDValue ARMTargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, - SDValue Arg, const SDLoc &dl, - SelectionDAG &DAG, - const CCValAssign &VA, - ISD::ArgFlagsTy Flags) const { - unsigned LocMemOffset = VA.getLocMemOffset(); - SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); - PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), - StackPtr, PtrOff); - return DAG.getStore( - Chain, dl, Arg, PtrOff, - MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset)); +std::pair ARMTargetLowering::computeAddrForCallArg( + const SDLoc &dl, SelectionDAG &DAG, const CCValAssign &VA, SDValue StackPtr, + bool IsTailCall, int SPDiff) const { + SDValue DstAddr; + MachinePointerInfo DstInfo; + int32_t Offset = VA.getLocMemOffset(); + MachineFunction &MF = DAG.getMachineFunction(); + + if (IsTailCall) { + Offset += SPDiff; + auto PtrVT = getPointerTy(DAG.getDataLayout()); + int Size = VA.getLocVT().getFixedSizeInBits() / 8; + int FI = MF.getFrameInfo().CreateFixedObject(Size, Offset, true); + DstAddr = DAG.getFrameIndex(FI, PtrVT); + DstInfo = + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI); + } else { + SDValue PtrOff = DAG.getIntPtrConstant(Offset, dl); + DstAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), + StackPtr, PtrOff); + DstInfo = + MachinePointerInfo::getStack(DAG.getMachineFunction(), Offset); + } + + return std::make_pair(DstAddr, DstInfo); } void ARMTargetLowering::PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG, @@ -2205,7 +2218,8 @@ void ARMTargetLowering::PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG, CCValAssign &VA, CCValAssign &NextVA, SDValue &StackPtr, SmallVectorImpl &MemOpChains, - ISD::ArgFlagsTy Flags) const { + bool IsTailCall, + int SPDiff) const { SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), Arg); unsigned id = Subtarget->isLittle() ? 0 : 1; @@ -2219,12 +2233,20 @@ void ARMTargetLowering::PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG, StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout())); - MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, fmrrd.getValue(1-id), - dl, DAG, NextVA, - Flags)); + SDValue DstAddr; + MachinePointerInfo DstInfo; + std::tie(DstAddr, DstInfo) = + computeAddrForCallArg(dl, DAG, NextVA, StackPtr, IsTailCall, SPDiff); + MemOpChains.push_back( + DAG.getStore(Chain, dl, fmrrd.getValue(1 - id), DstAddr, DstInfo)); } } +static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) { + return (CC == CallingConv::Fast && GuaranteeTailCalls) || + CC == CallingConv::Tail || CC == CallingConv::SwiftTail; +} + /// LowerCall - Lowering a call into a callseq_start <- /// ARMISD:CALL <- callseq_end chain. Also add input and output parameter /// nodes. @@ -2249,6 +2271,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet(); bool isThisReturn = false; bool isCmseNSCall = false; + bool isSibCall = false; bool PreferIndirect = false; // Determine whether this is a non-secure function call. 
@@ -2288,6 +2311,11 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, if (!isTailCall && CLI.CB && CLI.CB->isMustTailCall()) report_fatal_error("failed to perform tail call elimination on a call " "site marked musttail"); + + if (isTailCall && !getTargetMachine().Options.GuaranteedTailCallOpt && + CallConv != CallingConv::Tail && CallConv != CallingConv::SwiftTail) + isSibCall = true; + // We don't support GuaranteedTailCallOpt for ARM, only automatically // detected sibcalls. if (isTailCall) @@ -2303,13 +2331,40 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Get a count of how many bytes are to be pushed on the stack. unsigned NumBytes = CCInfo.getNextStackOffset(); - if (isTailCall) { - // For tail calls, memory operands are available in our caller's stack. + // SPDiff is the byte offset of the call's argument area from the callee's. + // Stores to callee stack arguments will be placed in FixedStackSlots offset + // by this amount for a tail call. In a sibling call it must be 0 because the + // caller will deallocate the entire stack and the callee still expects its + // arguments to begin at SP+0. Completely unused for non-tail calls. + int SPDiff = 0; + + if (isTailCall && !isSibCall) { + auto FuncInfo = MF.getInfo(); + unsigned NumReusableBytes = FuncInfo->getArgumentStackSize(); + + // Since callee will pop argument stack as a tail call, we must keep the + // popped size 16-byte aligned. + Align StackAlign = DAG.getDataLayout().getStackAlignment(); + NumBytes = alignTo(NumBytes, StackAlign); + + // SPDiff will be negative if this tail call requires more space than we + // would automatically have in our incoming argument space. Positive if we + // can actually shrink the stack. + SPDiff = NumReusableBytes - NumBytes; + + // If this call requires more stack than we have available from + // LowerFormalArguments, tell FrameLowering to reserve space for it. + if (SPDiff < 0 && AFI->getArgRegsSaveSize() < (unsigned)-SPDiff) + AFI->setArgRegsSaveSize(-SPDiff); + } + + if (isSibCall) { + // For sibling tail calls, memory operands are available in our caller's stack. NumBytes = 0; } else { // Adjust the stack pointer for the new arguments... // These operations are automatically eliminated by the prolog/epilog pass - Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl); + Chain = DAG.getCALLSEQ_START(Chain, isTailCall ? 0 : NumBytes, 0, dl); } SDValue StackPtr = @@ -2318,6 +2373,13 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, RegsToPassVector RegsToPass; SmallVector MemOpChains; + // During a tail call, stores to the argument area must happen after all of + // the function's incoming arguments have been loaded because they may alias. + // This is done by folding in a TokenFactor from LowerFormalArguments, but + // there's no point in doing so repeatedly so this tracks whether that's + // happened yet. + bool AfterFormalArgLoads = false; + // Walk the register/memloc assignments, inserting copies/loads. In the case // of tail call optimization, arguments are handled later. for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); @@ -2346,6 +2408,11 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, break; } + if (isTailCall && VA.isMemLoc() && !AfterFormalArgLoads) { + Chain = DAG.getStackArgumentTokenFactor(Chain); + AfterFormalArgLoads = true; + } + // f16 arguments have their size extended to 4 bytes and passed as if they // had been copied to the LSBs of a 32-bit register. 
// For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI) @@ -2375,21 +2442,23 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, DAG.getConstant(1, dl, MVT::i32)); PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass, VA, ArgLocs[++i], - StackPtr, MemOpChains, Flags); + StackPtr, MemOpChains, isTailCall, SPDiff); VA = ArgLocs[++i]; // skip ahead to next loc if (VA.isRegLoc()) { PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass, VA, ArgLocs[++i], - StackPtr, MemOpChains, Flags); + StackPtr, MemOpChains, isTailCall, SPDiff); } else { assert(VA.isMemLoc()); - - MemOpChains.push_back( - LowerMemOpCallTo(Chain, StackPtr, Op1, dl, DAG, VA, Flags)); + SDValue DstAddr; + MachinePointerInfo DstInfo; + std::tie(DstAddr, DstInfo) = + computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff); + MemOpChains.push_back(DAG.getStore(Chain, dl, Op1, DstAddr, DstInfo)); } } else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) { PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i], - StackPtr, MemOpChains, Flags); + StackPtr, MemOpChains, isTailCall, SPDiff); } else if (VA.isRegLoc()) { if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() && Outs[0].VT == MVT::i32) { @@ -2439,9 +2508,10 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, if (Flags.getByValSize() > 4*offset) { auto PtrVT = getPointerTy(DAG.getDataLayout()); - unsigned LocMemOffset = VA.getLocMemOffset(); - SDValue StkPtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); - SDValue Dst = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, StkPtrOff); + SDValue Dst; + MachinePointerInfo DstInfo; + std::tie(Dst, DstInfo) = + computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff); SDValue SrcOffset = DAG.getIntPtrConstant(4*offset, dl); SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, SrcOffset); SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl, @@ -2454,11 +2524,15 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs, Ops)); } - } else if (!isTailCall) { + } else { assert(VA.isMemLoc()); + SDValue DstAddr; + MachinePointerInfo DstInfo; + std::tie(DstAddr, DstInfo) = + computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff); - MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, - dl, DAG, VA, Flags)); + SDValue Store = DAG.getStore(Chain, dl, Arg, DstAddr, DstInfo); + MemOpChains.push_back(Store); } } @@ -2622,10 +2696,24 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, CallOpc = isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL; } + // We don't usually want to end the call-sequence here because we would tidy + // the frame up *after* the call, however in the ABI-changing tail-call case + // we've carefully laid out the parameters so that when sp is reset they'll be + // in the correct location. + if (isTailCall && !isSibCall) { + Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true), + DAG.getIntPtrConstant(0, dl, true), InFlag, dl); + InFlag = Chain.getValue(1); + } + std::vector Ops; Ops.push_back(Chain); Ops.push_back(Callee); + if (isTailCall) { + Ops.push_back(DAG.getTargetConstant(SPDiff, dl, MVT::i32)); + } + // Add argument registers to the end of the list so that they are known live // into the call. 
for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) @@ -2670,8 +2758,16 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, InFlag = Chain.getValue(1); DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo)); + // If we're guaranteeing tail-calls will be honoured, the callee must + // pop its own argument stack on return. But this call is *not* a tail call so + // we need to undo that after it returns to restore the status-quo. + bool TailCallOpt = getTargetMachine().Options.GuaranteedTailCallOpt; + uint64_t CalleePopBytes = + canGuaranteeTCO(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : -1ULL; + Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true), - DAG.getIntPtrConstant(0, dl, true), InFlag, dl); + DAG.getIntPtrConstant(CalleePopBytes, dl, true), + InFlag, dl); if (!Ins.empty()) InFlag = Chain.getValue(1); @@ -2812,6 +2908,9 @@ bool ARMTargetLowering::IsEligibleForTailCallOptimization( if (CallerF.hasFnAttribute("interrupt")) return false; + if (canGuaranteeTCO(CalleeCC, getTargetMachine().Options.GuaranteedTailCallOpt)) + return CalleeCC == CallerCC; + // Also avoid sibcall optimization if either caller or callee uses struct // return semantics. if (isCalleeStructRet || isCallerStructRet) @@ -4460,7 +4559,17 @@ SDValue ARMTargetLowering::LowerFormalArguments( } } - AFI->setArgumentStackSize(CCInfo.getNextStackOffset()); + unsigned StackArgSize = CCInfo.getNextStackOffset(); + bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; + if (canGuaranteeTCO(CallConv, TailCallOpt)) { + // The only way to guarantee a tail call is if the callee restores its + // argument area, but it must also keep the stack aligned when doing so. + const DataLayout &DL = DAG.getDataLayout(); + StackArgSize = alignTo(StackArgSize, DL.getStackAlignment()); + + AFI->setArgumentStackToRestore(StackArgSize); + } + AFI->setArgumentStackSize(StackArgSize); if (CCInfo.getNextStackOffset() > 0 && AFI->isCmseNSEntryFunction()) { DiagnosticInfoUnsupported Diag( diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index 80e4e12..5b4a96d 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -756,7 +756,8 @@ class VectorType; CCValAssign &VA, CCValAssign &NextVA, SDValue &StackPtr, SmallVectorImpl &MemOpChains, - ISD::ArgFlagsTy Flags) const; + bool IsTailCall, + int SPDiff) const; SDValue GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA, SDValue &Root, SelectionDAG &DAG, const SDLoc &dl) const; @@ -765,10 +766,10 @@ class VectorType; bool isVarArg) const; CCAssignFn *CCAssignFnForNode(CallingConv::ID CC, bool Return, bool isVarArg) const; - SDValue LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg, - const SDLoc &dl, SelectionDAG &DAG, - const CCValAssign &VA, - ISD::ArgFlagsTy Flags) const; + std::pair + computeAddrForCallArg(const SDLoc &dl, SelectionDAG &DAG, + const CCValAssign &VA, SDValue StackPtr, + bool IsTailCall, int SPDiff) const; SDValue LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.td b/llvm/lib/Target/ARM/ARMInstrInfo.td index 6b61c59..65c7fb5 100644 --- a/llvm/lib/Target/ARM/ARMInstrInfo.td +++ b/llvm/lib/Target/ARM/ARMInstrInfo.td @@ -66,7 +66,7 @@ def SDT_ARMMEMBARRIER : SDTypeProfile<0, 1, [SDTCisInt<0>]>; def SDT_ARMPREFETCH : SDTypeProfile<0, 3, 
[SDTCisPtrTy<0>, SDTCisSameAs<1, 2>, SDTCisInt<1>]>; -def SDT_ARMTCRET : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>; +def SDT_ARMTCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>]>; def SDT_ARMBFI : SDTypeProfile<1, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisVT<2, i32>, SDTCisVT<3, i32>]>; @@ -2629,10 +2629,10 @@ def BXJ : ABI<0b0001, (outs), (ins GPR:$func), NoItinerary, "bxj", "\t$func", // Tail calls. let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [SP] in { - def TCRETURNdi : PseudoInst<(outs), (ins i32imm:$dst), IIC_Br, []>, + def TCRETURNdi : PseudoInst<(outs), (ins i32imm:$dst, i32imm:$SPDiff), IIC_Br, []>, Sched<[WriteBr]>; - def TCRETURNri : PseudoInst<(outs), (ins tcGPR:$dst), IIC_Br, []>, + def TCRETURNri : PseudoInst<(outs), (ins tcGPR:$dst, i32imm:$SPDiff), IIC_Br, []>, Sched<[WriteBr]>; def TAILJMPd : ARMPseudoExpand<(outs), (ins arm_br_target:$dst), @@ -6003,9 +6003,12 @@ def : ARMPat<(ARMWrapperJT tjumptable:$dst), // TODO: add,sub,and, 3-instr forms? // Tail calls. These patterns also apply to Thumb mode. -def : Pat<(ARMtcret tcGPR:$dst), (TCRETURNri tcGPR:$dst)>; -def : Pat<(ARMtcret (i32 tglobaladdr:$dst)), (TCRETURNdi texternalsym:$dst)>; -def : Pat<(ARMtcret (i32 texternalsym:$dst)), (TCRETURNdi texternalsym:$dst)>; +def : Pat<(ARMtcret tcGPR:$dst, (i32 timm:$SPDiff)), + (TCRETURNri tcGPR:$dst, timm:$SPDiff)>; +def : Pat<(ARMtcret (i32 tglobaladdr:$dst), (i32 timm:$SPDiff)), + (TCRETURNdi texternalsym:$dst, (i32 timm:$SPDiff))>; +def : Pat<(ARMtcret (i32 texternalsym:$dst), (i32 timm:$SPDiff)), + (TCRETURNdi texternalsym:$dst, i32imm:$SPDiff)>; // Direct calls def : ARMPat<(ARMcall texternalsym:$func), (BL texternalsym:$func)>; diff --git a/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h b/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h index 298c8a2..8516552 100644 --- a/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h +++ b/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h @@ -43,7 +43,9 @@ class ARMFunctionInfo : public MachineFunctionInfo { /// "attach" GPR-part to the part that was passed via stack. unsigned StByValParamsPadding = 0; - /// VarArgsRegSaveSize - Size of the register save area for vararg functions. + /// ArgsRegSaveSize - Size of the register save area for vararg functions or + /// those making guaranteed tail calls that need more stack argument space + /// than is provided by this functions incoming parameters. /// unsigned ArgRegsSaveSize = 0; @@ -118,6 +120,10 @@ class ARMFunctionInfo : public MachineFunctionInfo { /// being passed on the stack unsigned ArgumentStackSize = 0; + /// ArgumentStackToRestore - amount of bytes on stack consumed that we must + /// restore on return. + unsigned ArgumentStackToRestore = 0; + /// CoalescedWeights - mapping of basic blocks to the rolling counter of /// coalesced weights. 
DenseMap CoalescedWeights; @@ -195,6 +201,9 @@ public: unsigned getArgumentStackSize() const { return ArgumentStackSize; } void setArgumentStackSize(unsigned size) { ArgumentStackSize = size; } + unsigned getArgumentStackToRestore() const { return ArgumentStackToRestore; } + void setArgumentStackToRestore(unsigned v) { ArgumentStackToRestore = v; } + void initPICLabelUId(unsigned UId) { PICLabelUId = UId; } diff --git a/llvm/lib/Target/ARM/ARMSubtarget.cpp b/llvm/lib/Target/ARM/ARMSubtarget.cpp index 5cb608b..90f1b693 100644 --- a/llvm/lib/Target/ARM/ARMSubtarget.cpp +++ b/llvm/lib/Target/ARM/ARMSubtarget.cpp @@ -230,7 +230,7 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { // registers are the 4 used for parameters. We don't currently do this // case. - SupportsTailCall = !isThumb() || hasV8MBaselineOps(); + SupportsTailCall = !isThumb1Only() || hasV8MBaselineOps(); if (isTargetMachO() && isTargetIOS() && getTargetTriple().isOSVersionLT(5, 0)) SupportsTailCall = false; diff --git a/llvm/test/CodeGen/ARM/dbg-tcreturn.ll b/llvm/test/CodeGen/ARM/dbg-tcreturn.ll index d4061be..037fda1 100644 --- a/llvm/test/CodeGen/ARM/dbg-tcreturn.ll +++ b/llvm/test/CodeGen/ARM/dbg-tcreturn.ll @@ -12,7 +12,7 @@ target triple = "thumbv7-apple-ios7.0.0" ; CHECK-NEXT: $r0 = COPY %0 ; CHECK-NEXT: $r1 = COPY %1 ; CHECK-NEXT: DBG_VALUE $noreg, $noreg, !13, !DIExpression(), debug-location !16 -; CHECK-NEXT: TCRETURNdi &__divsi3, implicit $sp, implicit $r0, implicit $r1 +; CHECK-NEXT: TCRETURNdi &__divsi3, 0, implicit $sp, implicit $r0, implicit $r1 define i32 @test(i32 %a1, i32 %a2) !dbg !5 { entry: diff --git a/llvm/test/CodeGen/ARM/fastcc-tailcall.ll b/llvm/test/CodeGen/ARM/fastcc-tailcall.ll new file mode 100644 index 0000000..fc07172 --- /dev/null +++ b/llvm/test/CodeGen/ARM/fastcc-tailcall.ll @@ -0,0 +1,193 @@ +; RUN: llc -verify-machineinstrs < %s -mtriple=thumbv7k-apple-watchos -tailcallopt | FileCheck %s + +declare fastcc void @callee_stack0() +declare fastcc void @callee_stack4([4 x i32], i32) +declare fastcc void @callee_stack20([4 x i32], [5 x i32]) +declare extern_weak fastcc void @callee_weak() + +define fastcc void @caller_to0_from0() nounwind { +; CHECK-LABEL: _caller_to0_from0: + + tail call fastcc void @callee_stack0() + ret void +; CHECK-NOT: add +; CHECK-NOT: sub +; CHECK: b.w _callee_stack0 +} + +define fastcc void @caller_to0_from4([4 x i32], i32) { +; CHECK-LABEL: _caller_to0_from4: + + tail call fastcc void @callee_stack0() + ret void + +; CHECK: add sp, #16 +; CHECK-NEXT: b.w _callee_stack0 +} + +define fastcc void @caller_to4_from0() { +; Key point is that the "42" should go #16 below incoming stack +; pointer (we didn't have arg space to reuse). + tail call fastcc void @callee_stack4([4 x i32] undef, i32 42) + ret void + +; CHECK-LABEL: _caller_to4_from0: +; CHECK: sub sp, #16 +; CHECK: movs [[TMP:r[0-9]+]], #42 +; CHECK: str [[TMP]], [sp] +; CHECK-NOT: add sp +; CHECK: b.w _callee_stack4 + +} + +define fastcc void @caller_to4_from4([4 x i32], i32 %a) { +; CHECK-LABEL: _caller_to4_from4: +; CHECK-NOT: sub sp +; Key point is that the "%a" should go where at SP on entry. + tail call fastcc void @callee_stack4([4 x i32] undef, i32 42) + ret void + +; CHECK: str {{r[0-9]+}}, [sp] +; CHECK-NOT: add sp +; CHECK-NEXT: b.w _callee_stack4 +} + +define fastcc void @caller_to20_from4([4 x i32], i32 %a) { +; CHECK-LABEL: _caller_to20_from4: +; CHECK: sub sp, #16 + +; Important point is that the call reuses the "dead" argument space +; above %a on the stack. 
If it tries to go below incoming-SP then the +; _callee will not deallocate the space, even in fastcc. + tail call fastcc void @callee_stack20([4 x i32] undef, [5 x i32] [i32 1, i32 2, i32 3, i32 4, i32 5]) + +; CHECK: str {{.*}}, [sp] +; CHECK: str {{.*}}, [sp, #4] +; CHECK: str {{.*}}, [sp, #8] +; CHECK: str {{.*}}, [sp, #12] +; CHECK: str {{.*}}, [sp, #16] +; CHECK-NOT: add sp +; CHECK-NOT: sub sp +; CHECK: b.w _callee_stack20 + ret void +} + + +define fastcc void @caller_to4_from24([4 x i32], i64 %a, i64 %b, i64 %c) { +; CHECK-LABEL: _caller_to4_from24: + + +; Key point is that the "%a" should go where at #16 above SP on entry. + tail call fastcc void @callee_stack4([4 x i32] undef, i32 42) + ret void + +; CHECK: str {{.*}}, [sp, #16] +; CHECK: add sp, #16 +; CHECK-NEXT: b.w _callee_stack4 +} + + +define fastcc void @caller_to20_from20([4 x i32], [5 x i32] %a) { +; CHECK-LABEL: _caller_to20_from20: +; CHECK-NOT: add sp, +; CHECK-NOT: sub sp, + +; Here we want to make sure that both loads happen before the stores: +; otherwise either %a or %b.w will be wrongly clobbered. + tail call fastcc void @callee_stack20([4 x i32] undef, [5 x i32] %a) + ret void + + ; If these ever get interleaved make sure aliasing slots don't clobber each + ; other. +; CHECK: ldrd {{.*}}, {{.*}}, [sp, #12] +; CHECK: ldm.w sp, +; CHECK: stm.w +; CHECK: strd +; CHECK-NEXT: b.w _callee_stack20 +} + +define fastcc void @disable_tail_calls() nounwind "disable-tail-calls"="true" { +; CHECK-LABEL: disable_tail_calls: + + tail call fastcc void @callee_stack0() + ret void + +; CHECK: bl _callee_stack0 +; CHECK: ret +} + +define fastcc void @normal_ret_with_stack([4 x i32], i32 %a) { +; CHECK: _normal_ret_with_stack: +; CHECK: add sp, #16 +; CHECK: bx lr + ret void +} + +declare { [2 x float] } @get_vec2() + +define void @fromC_totail() { +; COMMON-LABEL: fromC_totail: +; COMMON: puch {r4, lr} +; COMMON: sub sp, #8 + +; COMMON-NOT: sub sp, +; COMMON: movs [[TMP:r[0-9]+]], #42 +; COMMON: str [[TMP]], [sp] +; COMMON: bl _callee_stack4 + ; We must reset the stack to where it was before the call by undoing its extra stack pop. +; COMMON: sub sp, #16 +; COMMON: str [[TMP]], [sp] +; COMMON: bl callee_stack4 +; COMMON: sub sp, #16 + + call fastcc void @callee_stack4([4 x i32] undef, i32 42) + call fastcc void @callee_stack4([4 x i32] undef, i32 42) + ret void +} + +define void @fromC_totail_noreservedframe(i32 %len) { +; COMMON-LABEL: fromC_totail_noreservedframe: +; COMMON: sub.w sp, sp, r{{.*}} + +; COMMON: movs [[TMP:r[0-9]+]], #42 + ; Note stack is subtracted here to allocate space for arg +; COMMON: sub.w sp, #16 +; COMMON: str [[TMP]], [sp] +; COMMON: bl _callee_stack4 + ; And here. +; COMMON: sub sp, #16 +; COMMON: str [[TMP]], [sp] +; COMMON: bl _callee_stack4 + ; But not restored here because callee_stack8 did that for us. +; COMMON-NOT: sub sp, + + ; Variable sized allocation prevents reserving frame at start of function so each call must allocate any stack space it needs. + %var = alloca i32, i32 %len + + call fastcc void @callee_stack4([4 x i32] undef, i32 42) + call fastcc void @callee_stack4([4 x i32] undef, i32 42) + ret void +} + +declare void @Ccallee_stack4([4 x i32], i32) + +define fastcc void @fromtail_toC() { +; COMMON-LABEL: fromtail_toC: +; COMMON: push {r4, lr} +; COMMON: sub sp, #8 + +; COMMON-NOT: sub sp, +; COMMON: movs [[TMP:r[0-9]+]], #42 +; COMMON: str [[TMP]], [sp] +; COMMON: bl _Ccallee_stack4 + ; C callees will return with the stack exactly where we left it, so we mustn't try to fix anything. 
+; COMMON-NOT: add sp, +; COMMON-NOT: sub sp, +; COMMON: str [[TMP]], [sp]{{$}} +; COMMON: bl _Ccallee_stack4 +; COMMON-NOT: sub sp, + + call void @Ccallee_stack4([4 x i32] undef, i32 42) + call void @Ccallee_stack4([4 x i32] undef, i32 42) + ret void +} diff --git a/llvm/test/CodeGen/ARM/peephole-callee-save-regalloc.mir b/llvm/test/CodeGen/ARM/peephole-callee-save-regalloc.mir index 104c887..8ee4a80 100644 --- a/llvm/test/CodeGen/ARM/peephole-callee-save-regalloc.mir +++ b/llvm/test/CodeGen/ARM/peephole-callee-save-regalloc.mir @@ -41,5 +41,5 @@ body: | $r1 = COPY %1 $r2 = COPY %2 $r3 = COPY %3 - TCRETURNri killed %5, implicit $sp, implicit $r0, implicit $r1, implicit $r2, implicit $r3 + TCRETURNri killed %5, 0, implicit $sp, implicit $r0, implicit $r1, implicit $r2, implicit $r3 ... diff --git a/llvm/test/CodeGen/ARM/swifttailcc-call.ll b/llvm/test/CodeGen/ARM/swifttailcc-call.ll new file mode 100644 index 0000000..2514e26 --- /dev/null +++ b/llvm/test/CodeGen/ARM/swifttailcc-call.ll @@ -0,0 +1,201 @@ +; RUN: llc -verify-machineinstrs < %s -mtriple=thumbv7k-apple-watchos | FileCheck %s + +declare swifttailcc void @callee_stack0() +declare swifttailcc void @callee_stack4([4 x i32], i32) +declare swifttailcc void @callee_stack20([4 x i32], [5 x i32]) +declare extern_weak swifttailcc void @callee_weak() + +define swifttailcc void @caller_to0_from0() nounwind { +; CHECK-LABEL: _caller_to0_from0: + + tail call swifttailcc void @callee_stack0() + ret void +; CHECK-NOT: add +; CHECK-NOT: sub +; CHECK: b.w _callee_stack0 +} + +define swifttailcc void @caller_to0_from4([4 x i32], i32) { +; CHECK-LABEL: _caller_to0_from4: + + tail call swifttailcc void @callee_stack0() + ret void + +; CHECK: add sp, #16 +; CHECK-NEXT: b.w _callee_stack0 +} + +define swifttailcc void @caller_to4_from0() { +; Key point is that the "42" should go #16 below incoming stack +; pointer (we didn't have arg space to reuse). + tail call swifttailcc void @callee_stack4([4 x i32] undef, i32 42) + ret void + +; CHECK-LABEL: _caller_to4_from0: +; CHECK: sub sp, #16 +; CHECK: movs [[TMP:r[0-9]+]], #42 +; CHECK: str [[TMP]], [sp] +; CHECK-NOT: add sp +; CHECK: b.w _callee_stack4 + +} + +define swifttailcc void @caller_to4_from4([4 x i32], i32 %a) { +; CHECK-LABEL: _caller_to4_from4: +; CHECK-NOT: sub sp +; Key point is that the "%a" should go where at SP on entry. + tail call swifttailcc void @callee_stack4([4 x i32] undef, i32 42) + ret void + +; CHECK: str {{r[0-9]+}}, [sp] +; CHECK-NOT: add sp +; CHECK-NEXT: b.w _callee_stack4 +} + +define swifttailcc void @caller_to20_from4([4 x i32], i32 %a) { +; CHECK-LABEL: _caller_to20_from4: +; CHECK: sub sp, #16 + +; Important point is that the call reuses the "dead" argument space +; above %a on the stack. If it tries to go below incoming-SP then the +; _callee will not deallocate the space, even in swifttailcc. + tail call swifttailcc void @callee_stack20([4 x i32] undef, [5 x i32] [i32 1, i32 2, i32 3, i32 4, i32 5]) + +; CHECK: str {{.*}}, [sp] +; CHECK: str {{.*}}, [sp, #4] +; CHECK: str {{.*}}, [sp, #8] +; CHECK: str {{.*}}, [sp, #12] +; CHECK: str {{.*}}, [sp, #16] +; CHECK-NOT: add sp +; CHECK-NOT: sub sp +; CHECK: b.w _callee_stack20 + ret void +} + + +define swifttailcc void @caller_to4_from24([4 x i32], i64 %a, i64 %b, i64 %c) { +; CHECK-LABEL: _caller_to4_from24: + + +; Key point is that the "%a" should go where at #16 above SP on entry. 
+ tail call swifttailcc void @callee_stack4([4 x i32] undef, i32 42) + ret void + +; CHECK: str {{.*}}, [sp, #16] +; CHECK: add sp, #16 +; CHECK-NEXT: b.w _callee_stack4 +} + + +define swifttailcc void @caller_to20_from20([4 x i32], [5 x i32] %a) { +; CHECK-LABEL: _caller_to20_from20: +; CHECK-NOT: add sp, +; CHECK-NOT: sub sp, + +; Here we want to make sure that both loads happen before the stores: +; otherwise either %a or %b.w will be wrongly clobbered. + tail call swifttailcc void @callee_stack20([4 x i32] undef, [5 x i32] %a) + ret void + + ; If these ever get interleaved make sure aliasing slots don't clobber each + ; other. +; CHECK: ldrd {{.*}}, {{.*}}, [sp, #12] +; CHECK: ldm.w sp, +; CHECK: stm.w +; CHECK: strd +; CHECK-NEXT: b.w _callee_stack20 +} + +define swifttailcc void @disable_tail_calls() nounwind "disable-tail-calls"="true" { +; CHECK-LABEL: disable_tail_calls: + + tail call swifttailcc void @callee_stack0() + ret void + +; CHECK: bl _callee_stack0 +; CHECK: ret +} + +define swifttailcc void @normal_ret_with_stack([4 x i32], i32 %a) { +; CHECK: _normal_ret_with_stack: +; CHECK: add sp, #16 +; CHECK: bx lr + ret void +} + +declare { [2 x float] } @get_vec2() + +define void @fromC_totail() { +; COMMON-LABEL: fromC_totail: +; COMMON: puch {r4, lr} +; COMMON: sub sp, #8 + +; COMMON-NOT: sub sp, +; COMMON: movs [[TMP:r[0-9]+]], #42 +; COMMON: str [[TMP]], [sp] +; COMMON: bl _callee_stack4 + ; We must reset the stack to where it was before the call by undoing its extra stack pop. +; COMMON: sub sp, #16 +; COMMON: str [[TMP]], [sp] +; COMMON: bl callee_stack4 +; COMMON: sub sp, #16 + + call swifttailcc void @callee_stack4([4 x i32] undef, i32 42) + call swifttailcc void @callee_stack4([4 x i32] undef, i32 42) + ret void +} + +define void @fromC_totail_noreservedframe(i32 %len) { +; COMMON-LABEL: fromC_totail_noreservedframe: +; COMMON: sub.w sp, sp, r{{.*}} + +; COMMON: movs [[TMP:r[0-9]+]], #42 + ; Note stack is subtracted here to allocate space for arg +; COMMON: sub.w sp, #16 +; COMMON: str [[TMP]], [sp] +; COMMON: bl _callee_stack4 + ; And here. +; COMMON: sub sp, #16 +; COMMON: str [[TMP]], [sp] +; COMMON: bl _callee_stack4 + ; But not restored here because callee_stack8 did that for us. +; COMMON-NOT: sub sp, + + ; Variable sized allocation prevents reserving frame at start of function so each call must allocate any stack space it needs. + %var = alloca i32, i32 %len + + call swifttailcc void @callee_stack4([4 x i32] undef, i32 42) + call swifttailcc void @callee_stack4([4 x i32] undef, i32 42) + ret void +} + +declare void @Ccallee_stack4([4 x i32], i32) + +define swifttailcc void @fromtail_toC() { +; COMMON-LABEL: fromtail_toC: +; COMMON: push {r4, lr} +; COMMON: sub sp, #8 + +; COMMON-NOT: sub sp, +; COMMON: movs [[TMP:r[0-9]+]], #42 +; COMMON: str [[TMP]], [sp] +; COMMON: bl _Ccallee_stack4 + ; C callees will return with the stack exactly where we left it, so we mustn't try to fix anything. 
+; COMMON-NOT: add sp, +; COMMON-NOT: sub sp, +; COMMON: str [[TMP]], [sp]{{$}} +; COMMON: bl _Ccallee_stack4 +; COMMON-NOT: sub sp, + + call void @Ccallee_stack4([4 x i32] undef, i32 42) + call void @Ccallee_stack4([4 x i32] undef, i32 42) + ret void +} + +declare swifttailcc i8* @SwiftSelf(i8 * swiftasync %context, i8* swiftself %closure) +define swiftcc i8* @CallSwiftSelf(i8* swiftself %closure, i8* %context) { +; CHECK-LABEL: CallSwiftSelf: +; CHECK: push{{.*}}r10 + %res = call swifttailcc i8* @SwiftSelf(i8 * swiftasync %context, i8* swiftself %closure) + ret i8* %res +} diff --git a/llvm/test/CodeGen/ARM/swifttailcc-fastisel.ll b/llvm/test/CodeGen/ARM/swifttailcc-fastisel.ll new file mode 100644 index 0000000..7d6af2d --- /dev/null +++ b/llvm/test/CodeGen/ARM/swifttailcc-fastisel.ll @@ -0,0 +1,11 @@ +; RUN: llc -mtriple=thumbv7-apple-ios -O0 -fast-isel %s -o - | FileCheck %s + +declare swifttailcc i8* @SwiftSelf(i8 * swiftasync %context, i8* swiftself %closure) + +define swifttailcc i8* @CallSwiftSelf(i8* swiftself %closure, i8* %context) { +; CHECK-LABEL: CallSwiftSelf: +; CHECK: bl _SwiftSelf +; CHECK: pop {r7, pc} + %res = call swifttailcc i8* @SwiftSelf(i8 * swiftasync %context, i8* swiftself null) + ret i8* %res +} diff --git a/llvm/test/CodeGen/ARM/tailcc-call.ll b/llvm/test/CodeGen/ARM/tailcc-call.ll new file mode 100644 index 0000000..ced6f02 --- /dev/null +++ b/llvm/test/CodeGen/ARM/tailcc-call.ll @@ -0,0 +1,193 @@ +; RUN: llc -verify-machineinstrs < %s -mtriple=thumbv7k-apple-watchos | FileCheck %s + +declare tailcc void @callee_stack0() +declare tailcc void @callee_stack4([4 x i32], i32) +declare tailcc void @callee_stack20([4 x i32], [5 x i32]) +declare extern_weak tailcc void @callee_weak() + +define tailcc void @caller_to0_from0() nounwind { +; CHECK-LABEL: _caller_to0_from0: + + tail call tailcc void @callee_stack0() + ret void +; CHECK-NOT: add +; CHECK-NOT: sub +; CHECK: b.w _callee_stack0 +} + +define tailcc void @caller_to0_from4([4 x i32], i32) { +; CHECK-LABEL: _caller_to0_from4: + + tail call tailcc void @callee_stack0() + ret void + +; CHECK: add sp, #16 +; CHECK-NEXT: b.w _callee_stack0 +} + +define tailcc void @caller_to4_from0() { +; Key point is that the "42" should go #16 below incoming stack +; pointer (we didn't have arg space to reuse). + tail call tailcc void @callee_stack4([4 x i32] undef, i32 42) + ret void + +; CHECK-LABEL: _caller_to4_from0: +; CHECK: sub sp, #16 +; CHECK: movs [[TMP:r[0-9]+]], #42 +; CHECK: str [[TMP]], [sp] +; CHECK-NOT: add sp +; CHECK: b.w _callee_stack4 + +} + +define tailcc void @caller_to4_from4([4 x i32], i32 %a) { +; CHECK-LABEL: _caller_to4_from4: +; CHECK-NOT: sub sp +; Key point is that the "%a" should go where at SP on entry. + tail call tailcc void @callee_stack4([4 x i32] undef, i32 42) + ret void + +; CHECK: str {{r[0-9]+}}, [sp] +; CHECK-NOT: add sp +; CHECK-NEXT: b.w _callee_stack4 +} + +define tailcc void @caller_to20_from4([4 x i32], i32 %a) { +; CHECK-LABEL: _caller_to20_from4: +; CHECK: sub sp, #16 + +; Important point is that the call reuses the "dead" argument space +; above %a on the stack. If it tries to go below incoming-SP then the +; _callee will not deallocate the space, even in tailcc. 
+ tail call tailcc void @callee_stack20([4 x i32] undef, [5 x i32] [i32 1, i32 2, i32 3, i32 4, i32 5]) + +; CHECK: str {{.*}}, [sp] +; CHECK: str {{.*}}, [sp, #4] +; CHECK: str {{.*}}, [sp, #8] +; CHECK: str {{.*}}, [sp, #12] +; CHECK: str {{.*}}, [sp, #16] +; CHECK-NOT: add sp +; CHECK-NOT: sub sp +; CHECK: b.w _callee_stack20 + ret void +} + + +define tailcc void @caller_to4_from24([4 x i32], i64 %a, i64 %b, i64 %c) { +; CHECK-LABEL: _caller_to4_from24: + + +; Key point is that the "%a" should go where at #16 above SP on entry. + tail call tailcc void @callee_stack4([4 x i32] undef, i32 42) + ret void + +; CHECK: str {{.*}}, [sp, #16] +; CHECK: add sp, #16 +; CHECK-NEXT: b.w _callee_stack4 +} + + +define tailcc void @caller_to20_from20([4 x i32], [5 x i32] %a) { +; CHECK-LABEL: _caller_to20_from20: +; CHECK-NOT: add sp, +; CHECK-NOT: sub sp, + +; Here we want to make sure that both loads happen before the stores: +; otherwise either %a or %b.w will be wrongly clobbered. + tail call tailcc void @callee_stack20([4 x i32] undef, [5 x i32] %a) + ret void + + ; If these ever get interleaved make sure aliasing slots don't clobber each + ; other. +; CHECK: ldrd {{.*}}, {{.*}}, [sp, #12] +; CHECK: ldm.w sp, +; CHECK: stm.w +; CHECK: strd +; CHECK-NEXT: b.w _callee_stack20 +} + +define tailcc void @disable_tail_calls() nounwind "disable-tail-calls"="true" { +; CHECK-LABEL: disable_tail_calls: + + tail call tailcc void @callee_stack0() + ret void + +; CHECK: bl _callee_stack0 +; CHECK: ret +} + +define tailcc void @normal_ret_with_stack([4 x i32], i32 %a) { +; CHECK: _normal_ret_with_stack: +; CHECK: add sp, #16 +; CHECK: bx lr + ret void +} + +declare { [2 x float] } @get_vec2() + +define void @fromC_totail() { +; COMMON-LABEL: fromC_totail: +; COMMON: puch {r4, lr} +; COMMON: sub sp, #8 + +; COMMON-NOT: sub sp, +; COMMON: movs [[TMP:r[0-9]+]], #42 +; COMMON: str [[TMP]], [sp] +; COMMON: bl _callee_stack4 + ; We must reset the stack to where it was before the call by undoing its extra stack pop. +; COMMON: sub sp, #16 +; COMMON: str [[TMP]], [sp] +; COMMON: bl callee_stack4 +; COMMON: sub sp, #16 + + call tailcc void @callee_stack4([4 x i32] undef, i32 42) + call tailcc void @callee_stack4([4 x i32] undef, i32 42) + ret void +} + +define void @fromC_totail_noreservedframe(i32 %len) { +; COMMON-LABEL: fromC_totail_noreservedframe: +; COMMON: sub.w sp, sp, r{{.*}} + +; COMMON: movs [[TMP:r[0-9]+]], #42 + ; Note stack is subtracted here to allocate space for arg +; COMMON: sub.w sp, #16 +; COMMON: str [[TMP]], [sp] +; COMMON: bl _callee_stack4 + ; And here. +; COMMON: sub sp, #16 +; COMMON: str [[TMP]], [sp] +; COMMON: bl _callee_stack4 + ; But not restored here because callee_stack8 did that for us. +; COMMON-NOT: sub sp, + + ; Variable sized allocation prevents reserving frame at start of function so each call must allocate any stack space it needs. + %var = alloca i32, i32 %len + + call tailcc void @callee_stack4([4 x i32] undef, i32 42) + call tailcc void @callee_stack4([4 x i32] undef, i32 42) + ret void +} + +declare void @Ccallee_stack4([4 x i32], i32) + +define tailcc void @fromtail_toC() { +; COMMON-LABEL: fromtail_toC: +; COMMON: push {r4, lr} +; COMMON: sub sp, #8 + +; COMMON-NOT: sub sp, +; COMMON: movs [[TMP:r[0-9]+]], #42 +; COMMON: str [[TMP]], [sp] +; COMMON: bl _Ccallee_stack4 + ; C callees will return with the stack exactly where we left it, so we mustn't try to fix anything. 
+; COMMON-NOT: add sp, +; COMMON-NOT: sub sp, +; COMMON: str [[TMP]], [sp]{{$}} +; COMMON: bl _Ccallee_stack4 +; COMMON-NOT: sub sp, + + call void @Ccallee_stack4([4 x i32] undef, i32 42) + call void @Ccallee_stack4([4 x i32] undef, i32 42) + ret void +} diff --git a/llvm/test/CodeGen/ARM/v8m-tail-call.ll b/llvm/test/CodeGen/ARM/v8m-tail-call.ll index 7ee80d4..c683230 100644 --- a/llvm/test/CodeGen/ARM/v8m-tail-call.ll +++ b/llvm/test/CodeGen/ARM/v8m-tail-call.ll @@ -41,25 +41,30 @@ declare i32 @h2(i32, i32, i32, i32, i32) define hidden i32 @f2(i32, i32, i32, i32, i32) { ; CHECK-LABEL: f2: ; CHECK: @ %bb.0: -; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, lr} +; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: mov r4, r3 ; CHECK-NEXT: mov r5, r2 ; CHECK-NEXT: mov r6, r1 +; CHECK-NEXT: ldr r7, [sp, #24] ; CHECK-NEXT: bl g ; CHECK-NEXT: cbz r0, .LBB2_2 ; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: str r7, [sp, #24] ; CHECK-NEXT: mov r1, r6 ; CHECK-NEXT: mov r2, r5 ; CHECK-NEXT: mov r3, r4 -; CHECK-NEXT: ldr r4, [sp, #12] +; CHECK-NEXT: add sp, #4 +; CHECK-NEXT: ldr r4, [sp, #16] ; CHECK-NEXT: mov lr, r4 -; CHECK-NEXT: pop {r4, r5, r6} +; CHECK-NEXT: pop {r4, r5, r6, r7} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: b h2 ; CHECK-NEXT: .LBB2_2: ; CHECK-NEXT: movs r0, #0 ; CHECK-NEXT: mvns r0, r0 -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: add sp, #4 +; CHECK-NEXT: pop {r4, r5, r6, r7, pc} %6 = tail call i32 bitcast (i32 (...)* @g to i32 ()*)() %7 = icmp eq i32 %6, 0 br i1 %7, label %10, label %8
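
Editorial sketch (not part of the patch): the arithmetic behind the caller_to20_from4 tests above, using the SPDiff logic this change adds to LowerCall. The caller has only 4 bytes of incoming stack arguments to reuse while the tail call needs 20 bytes of outgoing stack arguments, so once both areas are rounded to the target's stack alignment SPDiff comes out at -16; the prologue therefore reserves 16 extra bytes (the "sub sp, #16" the tests check for) and the callee, not the caller, pops the whole argument area when it eventually returns. The function names below are invented for illustration and do not appear in the patch.

; Minimal stand-alone version of the same situation, assuming a tailcc pair.
declare tailcc void @callee_needs_20([4 x i32], [5 x i32])

define tailcc void @caller_has_4([4 x i32], i32 %a) {
  ; Reusable incoming argument area: 4 bytes (just %a).
  ; Outgoing argument area needed by the tail call: 20 bytes.
  ; SPDiff = reusable - needed, negative here, so the caller grows its
  ; argument area in the prologue and the callee pops it on return.
  tail call tailcc void @callee_needs_20([4 x i32] undef, [5 x i32] zeroinitializer)
  ret void
}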
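
A second sketch for the opposite direction, exercised by the fromC_totail tests: an ordinary (non-tail) call from a C function into a callee-pop convention. Because the callee pops its own argument area on return, the caller's call-frame teardown has to move sp back down by that amount instead of up, which is what the CalleePopBytes value passed to CALLSEQ_END and the CalleePopAmount handling in eliminateCallFramePseudoInstr arrange. The names below are again illustrative only.

declare tailcc void @callee_pops_16([4 x i32], i32)

define void @plain_caller() {
  ; A normal call: the callee still pops its argument area (4 bytes rounded up
  ; to 16), so after the call the caller must re-reserve those 16 bytes,
  ; matching the "sub sp, #16" checks in the fromC_totail tests.
  call tailcc void @callee_pops_16([4 x i32] undef, i32 42)
  ret void
}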