From: Sander de Smalen
Date: Tue, 5 Nov 2019 16:54:54 +0000 (+0000)
Subject: [AArch64][SVE] Spilling/filling of SVE callee-saves.
X-Git-Tag: llvmorg-11-init~4575
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=84a0c8e3ae92829c4f04ba995b4b6283d397f65d;p=platform%2Fupstream%2Fllvm.git

[AArch64][SVE] Spilling/filling of SVE callee-saves.

Implement the spills/fills of callee-saved SVE registers using STR and
LDR instructions.

Also adds the `aarch64_sve_vector_pcs` calling convention to specify the
callee-saved registers to be used for functions that return SVE vectors
or take SVE vectors as arguments. The callee-saved registers are vector
registers z8-z23 and predicate registers p4-p15.

The overall frame layout with SVE will be as follows:

   +-------------+
   | stack args  |
   +-------------+
   | Callee Saves|
   |   X29, X30  |
   |-------------| <- FP
   | SVE Callee  | < //////////////
   | saved regs  | < //////////////
   |     z23     | < //////////////
   |      :      | < // SCALABLE //
   |     z8      | < //////////////
   |     p15     | < /// STACK ////
   |      :      | < //////////////
   |     p4      | < //// AREA ////
   +-------------+ < //////////////
   |      :      | < //////////////
   |  SVE locals | < //////////////
   |      :      | < //////////////
   +-------------+
   |/////////////| alignment gap
   |      :      |
   | Stack objs  |
   |      :      |
   +-------------+ <- SP after call and frame-setup

Reviewers: cameron.mcinally, efriedma, greened, thegameg, ostannard, rengolin

Reviewed By: ostannard

Differential Revision: https://reviews.llvm.org/D68996
---

diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp
index 887a931..847ca04 100644
--- a/llvm/lib/AsmParser/LLLexer.cpp
+++ b/llvm/lib/AsmParser/LLLexer.cpp
@@ -594,6 +594,7 @@ lltok::Kind LLLexer::LexIdentifier() {
   KEYWORD(arm_aapcscc);
   KEYWORD(arm_aapcs_vfpcc);
   KEYWORD(aarch64_vector_pcs);
+  KEYWORD(aarch64_sve_vector_pcs);
   KEYWORD(msp430_intrcc);
   KEYWORD(avr_intrcc);
   KEYWORD(avr_signalcc);
diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp
index e9d2dfc..bb2c65f 100644
--- a/llvm/lib/AsmParser/LLParser.cpp
+++ b/llvm/lib/AsmParser/LLParser.cpp
@@ -1931,6 +1931,7 @@ void LLParser::ParseOptionalDLLStorageClass(unsigned &Res) {
 ///   ::= 'arm_aapcscc'
 ///   ::= 'arm_aapcs_vfpcc'
 ///   ::= 'aarch64_vector_pcs'
+///   ::= 'aarch64_sve_vector_pcs'
 ///   ::= 'msp430_intrcc'
 ///   ::= 'avr_intrcc'
 ///   ::= 'avr_signalcc'
@@ -1977,6 +1978,9 @@ bool LLParser::ParseOptionalCallingConv(unsigned &CC) {
   case lltok::kw_arm_aapcscc:    CC = CallingConv::ARM_AAPCS; break;
   case lltok::kw_arm_aapcs_vfpcc:CC = CallingConv::ARM_AAPCS_VFP; break;
   case lltok::kw_aarch64_vector_pcs:CC = CallingConv::AArch64_VectorCall; break;
+  case lltok::kw_aarch64_sve_vector_pcs:
+    CC = CallingConv::AArch64_SVE_VectorCall;
+    break;
   case lltok::kw_msp430_intrcc:  CC = CallingConv::MSP430_INTR; break;
   case lltok::kw_avr_intrcc:     CC = CallingConv::AVR_INTR; break;
   case lltok::kw_avr_signalcc:   CC = CallingConv::AVR_SIGNAL; break;
diff --git a/llvm/lib/AsmParser/LLToken.h b/llvm/lib/AsmParser/LLToken.h
index 9153b49..9029e15 100644
--- a/llvm/lib/AsmParser/LLToken.h
+++ b/llvm/lib/AsmParser/LLToken.h
@@ -142,6 +142,7 @@ enum Kind {
   kw_arm_aapcscc,
   kw_arm_aapcs_vfpcc,
   kw_aarch64_vector_pcs,
+  kw_aarch64_sve_vector_pcs,
   kw_msp430_intrcc,
   kw_avr_intrcc,
   kw_avr_signalcc,
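The lexer/parser/token hunks above, together with the AsmWriter change that
follows, let the new convention round-trip through textual IR. A minimal
sketch (not part of the patch) of tagging a function with it through the C++
API; the function and module names are illustrative. The printed IR carries
the new keyword, e.g. `define aarch64_sve_vector_pcs void @callee()`:

    #include "llvm/IR/DerivedTypes.h"
    #include "llvm/IR/Function.h"
    #include "llvm/IR/Module.h"
    using namespace llvm;

    Function *makeSVECallee(Module &M) {
      // void callee(), externally visible.
      FunctionType *FT =
          FunctionType::get(Type::getVoidTy(M.getContext()), false);
      Function *F = Function::Create(FT, Function::ExternalLinkage, "callee", M);
      // The enumerator already existed; this patch gives it an IR spelling.
      F->setCallingConv(CallingConv::AArch64_SVE_VectorCall);
      return F;
    }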
diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp
index 107b32e..5ee0d52 100644
--- a/llvm/lib/IR/AsmWriter.cpp
+++ b/llvm/lib/IR/AsmWriter.cpp
@@ -364,6 +364,9 @@ static void PrintCallingConv(unsigned cc, raw_ostream &Out) {
   case CallingConv::ARM_AAPCS:     Out << "arm_aapcscc"; break;
   case CallingConv::ARM_AAPCS_VFP: Out << "arm_aapcs_vfpcc"; break;
   case CallingConv::AArch64_VectorCall: Out << "aarch64_vector_pcs"; break;
+  case CallingConv::AArch64_SVE_VectorCall:
+    Out << "aarch64_sve_vector_pcs";
+    break;
   case CallingConv::MSP430_INTR:   Out << "msp430_intrcc"; break;
   case CallingConv::AVR_INTR:      Out << "avr_intrcc "; break;
   case CallingConv::AVR_SIGNAL:    Out << "avr_signalcc "; break;
diff --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.td b/llvm/lib/Target/AArch64/AArch64CallingConvention.td
index 3c4121b..3c179ae 100644
--- a/llvm/lib/Target/AArch64/AArch64CallingConvention.td
+++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.td
@@ -405,10 +405,10 @@ def CSR_AArch64_AAVPCS : CalleeSavedRegs<(add X19, X20, X21, X22, X23, X24,
 // Functions taking SVE arguments or returning an SVE type
 // must (additionally) preserve full Z8-Z23 and predicate registers P4-P15
-def CSR_AArch64_SVE_AAPCS : CalleeSavedRegs<(add X19, X20, X21, X22, X23, X24,
-                                             X25, X26, X27, X28, LR, FP,
-                                             (sequence "Z%u", 8, 23),
-                                             (sequence "P%u", 4, 15))>;
+def CSR_AArch64_SVE_AAPCS : CalleeSavedRegs<(add (sequence "Z%u", 8, 23),
+                                             (sequence "P%u", 4, 15),
+                                             X19, X20, X21, X22, X23, X24,
+                                             X25, X26, X27, X28, LR, FP)>;
 
 // Constructors and destructors return 'this' in the iOS 64-bit C++ ABI; since
 // 'this' and the pointer return value are both passed in X0 in these cases,
@@ -486,5 +486,7 @@ def CSR_AArch64_RT_MostRegs_SCS
   : CalleeSavedRegs<(add CSR_AArch64_RT_MostRegs, X18)>;
 def CSR_AArch64_AAVPCS_SCS
   : CalleeSavedRegs<(add CSR_AArch64_AAVPCS, X18)>;
+def CSR_AArch64_SVE_AAPCS_SCS
+  : CalleeSavedRegs<(add CSR_AArch64_SVE_AAPCS, X18)>;
 def CSR_AArch64_AAPCS_SCS
   : CalleeSavedRegs<(add CSR_AArch64_AAPCS, X18)>;
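In the frame-lowering changes below, emitPrologue splits the total SVE stack
size into two allocations around the callee-save stores: AllocateBefore covers
the SVE callee-save slots so the STR_ZXI/STR_PXI spills can land in them, and
AllocateAfter covers the remaining SVE locals. A standalone model of that
split (an illustrative sketch, with plain int64_t standing in for the scalable
StackOffset type; values are in "scalable bytes", one byte per 16-byte granule
of vector length):

    #include <cassert>
    #include <cstdint>

    struct SVEAllocSplit {
      int64_t Before; // SVE callee-save slots, allocated before the spills
      int64_t After;  // remaining SVE locals, allocated after the spills
    };

    SVEAllocSplit splitSVEAllocation(int64_t SVEStackSize,
                                     int64_t OffsetToFirstCalleeSaveFromSP) {
      // Frame-object offsets are negative (the stack grows down), so the
      // split works out to Before == -OffsetToFirstCalleeSaveFromSP.
      int64_t OffsetToCalleeSavesFromSP =
          OffsetToFirstCalleeSaveFromSP + SVEStackSize;
      int64_t Before = SVEStackSize - OffsetToCalleeSavesFromSP;
      assert(Before >= 0 && Before <= SVEStackSize && "bad callee-save offset");
      return {Before, SVEStackSize - Before};
    }

In the save_restore_sve test at the end of this patch there are no SVE locals,
so AllocateAfter is zero and the whole area is allocated by the single
"$sp = frame-setup ADDVL_XXI $sp, -19" matched in the CHECK lines.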
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index bbd7c51..afe4f14 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -834,6 +834,20 @@ static bool isTargetDarwin(const MachineFunction &MF) {
   return MF.getSubtarget<AArch64Subtarget>().isTargetDarwin();
 }
 
+// Convenience function to determine whether I is an SVE callee save.
+bool IsSVECalleeSave(MachineBasicBlock::iterator I) {
+  switch (I->getOpcode()) {
+  default:
+    return false;
+  case AArch64::STR_ZXI:
+  case AArch64::STR_PXI:
+  case AArch64::LDR_ZXI:
+  case AArch64::LDR_PXI:
+    return I->getFlag(MachineInstr::FrameSetup) ||
+           I->getFlag(MachineInstr::FrameDestroy);
+  }
+}
+
 void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
                                         MachineBasicBlock &MBB) const {
   MachineBasicBlock::iterator MBBI = MBB.begin();
@@ -965,7 +979,8 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
   // and pre-inc if we decided to combine the callee-save and local stack
   // pointer bump above.
   MachineBasicBlock::iterator End = MBB.end();
-  while (MBBI != End && MBBI->getFlag(MachineInstr::FrameSetup)) {
+  while (MBBI != End && MBBI->getFlag(MachineInstr::FrameSetup) &&
+         !IsSVECalleeSave(MBBI)) {
     if (CombineSPBump)
       fixupCalleeSaveRestoreStackOffset(*MBBI, AFI->getLocalStackSize(),
                                         NeedsWinCFI, &HasWinCFI);
@@ -1107,7 +1122,35 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
     NumBytes = 0;
   }
 
-  emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -SVEStackSize, TII,
+  StackOffset AllocateBefore = SVEStackSize, AllocateAfter = {};
+  MachineBasicBlock::iterator CalleeSavesBegin = MBBI, CalleeSavesEnd = MBBI;
+
+  // Process the SVE callee-saves to determine what space needs to be
+  // allocated.
+  if (AFI->getSVECalleeSavedStackSize()) {
+    // Find callee save instructions in frame.
+    CalleeSavesBegin = MBBI;
+    assert(IsSVECalleeSave(CalleeSavesBegin) && "Unexpected instruction");
+    while (IsSVECalleeSave(MBBI) && MBBI != MBB.getFirstTerminator())
+      ++MBBI;
+    CalleeSavesEnd = MBBI;
+
+    int64_t OffsetToFirstCalleeSaveFromSP =
+        MFI.getObjectOffset(AFI->getMaxSVECSFrameIndex());
+    StackOffset OffsetToCalleeSavesFromSP =
+        StackOffset(OffsetToFirstCalleeSaveFromSP, MVT::nxv1i8) + SVEStackSize;
+    AllocateBefore -= OffsetToCalleeSavesFromSP;
+    AllocateAfter = SVEStackSize - AllocateBefore;
+  }
+
+  // Allocate space for the callee saves (if any).
+  emitFrameOffset(MBB, CalleeSavesBegin, DL, AArch64::SP, AArch64::SP,
+                  -AllocateBefore, TII,
+                  MachineInstr::FrameSetup);
+
+  // Finally allocate remaining SVE stack space.
+  emitFrameOffset(MBB, CalleeSavesEnd, DL, AArch64::SP, AArch64::SP,
+                  -AllocateAfter, TII,
                   MachineInstr::FrameSetup);
 
   // Allocate space for the rest of the frame.
@@ -1444,7 +1487,8 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
   MachineBasicBlock::iterator Begin = MBB.begin();
   while (LastPopI != Begin) {
     --LastPopI;
-    if (!LastPopI->getFlag(MachineInstr::FrameDestroy)) {
+    if (!LastPopI->getFlag(MachineInstr::FrameDestroy) ||
+        IsSVECalleeSave(LastPopI)) {
       ++LastPopI;
       break;
     } else if (CombineSPBump)
@@ -1476,11 +1520,53 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
   NumBytes -= PrologueSaveSize;
   assert(NumBytes >= 0 && "Negative stack allocation size!?");
 
+  // Process the SVE callee-saves to determine what space needs to be
+  // deallocated.
+  StackOffset DeallocateBefore = {}, DeallocateAfter = SVEStackSize;
+  MachineBasicBlock::iterator RestoreBegin = LastPopI, RestoreEnd = LastPopI;
+  if (AFI->getSVECalleeSavedStackSize()) {
+    RestoreBegin = std::prev(RestoreEnd);
+    while (IsSVECalleeSave(RestoreBegin) &&
+           RestoreBegin != MBB.begin())
+      --RestoreBegin;
+    ++RestoreBegin;
+
+    assert(IsSVECalleeSave(RestoreBegin) &&
+           IsSVECalleeSave(std::prev(RestoreEnd)) && "Unexpected instruction");
+
+    int64_t OffsetToFirstCalleeSaveFromSP =
+        MFI.getObjectOffset(AFI->getMaxSVECSFrameIndex());
+    StackOffset OffsetToCalleeSavesFromSP =
+        StackOffset(OffsetToFirstCalleeSaveFromSP, MVT::nxv1i8) + SVEStackSize;
+    DeallocateBefore = OffsetToCalleeSavesFromSP;
+    DeallocateAfter = SVEStackSize - DeallocateBefore;
+  }
+
   // Deallocate the SVE area.
-  if (SVEStackSize)
-    if (!AFI->isStackRealigned())
-      emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, SVEStackSize,
-                      TII, MachineInstr::FrameDestroy);
+  if (SVEStackSize) {
+    if (AFI->isStackRealigned()) {
+      if (AFI->getSVECalleeSavedStackSize())
+        // Set SP to start of SVE area, from which the callee-save reloads
+        // can be done. The code below will deallocate the stack space
+        // by moving FP -> SP.
+        emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::FP,
+                        -SVEStackSize, TII, MachineInstr::FrameDestroy);
+    } else {
+      if (AFI->getSVECalleeSavedStackSize()) {
+        // Deallocate the non-SVE locals first before we can deallocate (and
+        // restore callee saves) from the SVE area.
+        emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP,
+                        {NumBytes, MVT::i8}, TII, MachineInstr::FrameDestroy);
+        NumBytes = 0;
+      }
+
+      emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP,
+                      DeallocateBefore, TII, MachineInstr::FrameDestroy);
+
+      emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP,
+                      DeallocateAfter, TII, MachineInstr::FrameDestroy);
+    }
+  }
 
   if (!hasFP(MF)) {
     bool RedZone = canUseRedZone(MF);
@@ -1813,11 +1899,28 @@ struct RegPairInfo {
   unsigned Reg2 = AArch64::NoRegister;
   int FrameIdx;
   int Offset;
-  enum RegType { GPR, FPR64, FPR128 } Type;
+  enum RegType { GPR, FPR64, FPR128, PPR, ZPR } Type;
 
   RegPairInfo() = default;
 
   bool isPaired() const { return Reg2 != AArch64::NoRegister; }
+
+  unsigned getScale() const {
+    switch (Type) {
+    case PPR:
+      return 2;
+    case GPR:
+    case FPR64:
+      return 8;
+    case ZPR:
+    case FPR128:
+      return 16;
+    default:
+      llvm_unreachable("Unsupported type");
+    }
+  }
+
+  bool isScalable() const { return Type == PPR || Type == ZPR; }
 };
 
 } // end anonymous namespace
@@ -1842,7 +1945,8 @@ static void computeCalleeSaveRegisterPairs(
           CC == CallingConv::PreserveMost || (Count & 1) == 0) &&
          "Odd number of callee-saved regs to spill!");
-  int Offset = AFI->getCalleeSavedStackSize();
+  int ByteOffset = AFI->getCalleeSavedStackSize();
+  int ScalableByteOffset = AFI->getSVECalleeSavedStackSize();
   // On Linux, we will have either one or zero non-paired register. On Windows
   // with CFI, we can have multiple unpaired registers in order to utilize the
   // available unwind codes. This flag assures that the alignment fixup is done
@@ -1858,6 +1962,10 @@ static void computeCalleeSaveRegisterPairs(
       RPI.Type = RegPairInfo::FPR64;
     else if (AArch64::FPR128RegClass.contains(RPI.Reg1))
       RPI.Type = RegPairInfo::FPR128;
+    else if (AArch64::ZPRRegClass.contains(RPI.Reg1))
+      RPI.Type = RegPairInfo::ZPR;
+    else if (AArch64::PPRRegClass.contains(RPI.Reg1))
+      RPI.Type = RegPairInfo::PPR;
     else
       llvm_unreachable("Unsupported register class.");
 
@@ -1880,6 +1988,9 @@ static void computeCalleeSaveRegisterPairs(
       if (AArch64::FPR128RegClass.contains(NextReg))
         RPI.Reg2 = NextReg;
       break;
+    case RegPairInfo::PPR:
+    case RegPairInfo::ZPR:
+      break;
     }
   }
 
@@ -1917,23 +2028,33 @@ static void computeCalleeSaveRegisterPairs(
 
     RPI.FrameIdx = CSI[i].getFrameIdx();
 
-    int Scale = RPI.Type == RegPairInfo::FPR128 ? 16 : 8;
-    Offset -= RPI.isPaired() ? 2 * Scale : Scale;
+    int Scale = RPI.getScale();
+    if (RPI.isScalable())
+      ScalableByteOffset -= Scale;
+    else
+      ByteOffset -= RPI.isPaired() ? 2 * Scale : Scale;
+
+    assert(!(RPI.isScalable() && RPI.isPaired()) &&
+           "Paired spill/fill instructions don't exist for SVE vectors");
 
     // Round up size of non-pair to pair size if we need to pad the
     // callee-save area to ensure 16-byte alignment.
     if (AFI->hasCalleeSaveStackFreeSpace() && !FixupDone &&
-        RPI.Type != RegPairInfo::FPR128 && !RPI.isPaired()) {
+        !RPI.isScalable() && RPI.Type != RegPairInfo::FPR128 &&
+        !RPI.isPaired()) {
      FixupDone = true;
-      Offset -= 8;
-      assert(Offset % 16 == 0);
+      ByteOffset -= 8;
+      assert(ByteOffset % 16 == 0);
       assert(MFI.getObjectAlignment(RPI.FrameIdx) <= 16);
       MFI.setObjectAlignment(RPI.FrameIdx, 16);
     }
+    int Offset = RPI.isScalable() ? ScalableByteOffset : ByteOffset;
     assert(Offset % Scale == 0);
     RPI.Offset = Offset / Scale;
-    assert((RPI.Offset >= -64 && RPI.Offset <= 63) &&
+
+    assert(((!RPI.isScalable() && RPI.Offset >= -64 && RPI.Offset <= 63) ||
+            (RPI.isScalable() && RPI.Offset >= -256 && RPI.Offset <= 255)) &&
            "Offset out of bounds for LDP/STP immediate");
 
     RegPairs.push_back(RPI);
@@ -2025,6 +2146,16 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
       Size = 16;
       Align = 16;
       break;
+    case RegPairInfo::ZPR:
+      StrOpc = AArch64::STR_ZXI;
+      Size = 16;
+      Align = 16;
+      break;
+    case RegPairInfo::PPR:
+      StrOpc = AArch64::STR_PXI;
+      Size = 2;
+      Align = 2;
+      break;
     }
     LLVM_DEBUG(dbgs() << "CSR spill: (" << printReg(Reg1, TRI);
                if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI);
@@ -2065,6 +2196,11 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
 
     if (NeedsWinCFI)
       InsertSEH(MIB, TII, MachineInstr::FrameSetup);
+
+    // Update the StackIDs of the SVE stack slots.
+    MachineFrameInfo &MFI = MF.getFrameInfo();
+    if (RPI.Type == RegPairInfo::ZPR || RPI.Type == RegPairInfo::PPR)
+      MFI.setStackID(RPI.FrameIdx, TargetStackID::SVEVector);
+
   }
   return true;
 }
@@ -2116,6 +2252,16 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
       Size = 16;
       Align = 16;
       break;
+    case RegPairInfo::ZPR:
+      LdrOpc = AArch64::LDR_ZXI;
+      Size = 16;
+      Align = 16;
+      break;
+    case RegPairInfo::PPR:
+      LdrOpc = AArch64::LDR_PXI;
+      Size = 2;
+      Align = 2;
+      break;
     }
     LLVM_DEBUG(dbgs() << "CSR restore: (" << printReg(Reg1, TRI);
                if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI);
@@ -2150,12 +2296,20 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
     if (NeedsWinCFI)
       InsertSEH(MIB, TII, MachineInstr::FrameDestroy);
   };
-  if (ReverseCSRRestoreSeq)
-    for (const RegPairInfo &RPI : reverse(RegPairs))
+
+  // SVE objects are always restored in reverse order.
+  for (const RegPairInfo &RPI : reverse(RegPairs))
+    if (RPI.isScalable())
       EmitMI(RPI);
-  else
+
+  if (ReverseCSRRestoreSeq) {
+    for (const RegPairInfo &RPI : reverse(RegPairs))
+      if (!RPI.isScalable())
+        EmitMI(RPI);
+  } else
     for (const RegPairInfo &RPI : RegPairs)
-      EmitMI(RPI);
+      if (!RPI.isScalable())
+        EmitMI(RPI);
 
   if (NeedShadowCallStackProlog) {
     // Shadow call stack epilog: ldr x30, [x18, #-8]!
@@ -2202,7 +2356,12 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
       SavedRegs.set(Reg);
 
     bool RegUsed = SavedRegs.test(Reg);
-    unsigned PairedReg = CSRegs[i ^ 1];
+    unsigned PairedReg = AArch64::NoRegister;
+    if (AArch64::GPR64RegClass.contains(Reg) ||
+        AArch64::FPR64RegClass.contains(Reg) ||
+        AArch64::FPR128RegClass.contains(Reg))
+      PairedReg = CSRegs[i ^ 1];
+
     if (!RegUsed) {
       if (AArch64::GPR64RegClass.contains(Reg) &&
           !RegInfo->isReservedReg(MF, Reg)) {
@@ -2226,10 +2385,17 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
 
   // Calculates the callee saved stack size.
   unsigned CSStackSize = 0;
+  unsigned SVECSStackSize = 0;
   const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
   const MachineRegisterInfo &MRI = MF.getRegInfo();
-  for (unsigned Reg : SavedRegs.set_bits())
-    CSStackSize += TRI->getRegSizeInBits(Reg, MRI) / 8;
+  for (unsigned Reg : SavedRegs.set_bits()) {
+    auto RegSize = TRI->getRegSizeInBits(Reg, MRI) / 8;
+    if (AArch64::PPRRegClass.contains(Reg) ||
+        AArch64::ZPRRegClass.contains(Reg))
+      SVECSStackSize += RegSize;
+    else
+      CSStackSize += RegSize;
+  }
 
   // Save number of saved regs, so we can easily update CSStackSize later.
   unsigned NumSavedRegs = SavedRegs.count();
@@ -2249,10 +2415,8 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
                dbgs() << "\n";);
 
   // If any callee-saved registers are used, the frame cannot be eliminated.
-  unsigned MaxAlign = getStackAlignment();
   int64_t SVEStackSize =
-      alignTo(determineSVEStackSize(MFI, MaxAlign), MaxAlign);
-  assert(MaxAlign <= 16 && "Cannot align scalable vectors more than 16 bytes");
+      alignTo(SVECSStackSize + estimateSVEStackObjectOffsets(MFI), 16);
   bool CanEliminateFrame = (SavedRegs.count() == 0) && !SVEStackSize;
 
   // The CSR spill slots have not been allocated yet, so estimateStackSize
@@ -2313,6 +2477,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
   // instructions.
   AFI->setCalleeSavedStackSize(AlignedCSStackSize);
   AFI->setCalleeSaveStackHasFreeSpace(AlignedCSStackSize != CSStackSize);
+  AFI->setSVECalleeSavedStackSize(alignTo(SVECSStackSize, 16));
 }
 
 bool AArch64FrameLowering::enableStackSlotScavenging(
@@ -2321,9 +2486,39 @@ bool AArch64FrameLowering::enableStackSlotScavenging(
   return AFI->hasCalleeSaveStackFreeSpace();
 }
 
-int64_t AArch64FrameLowering::determineSVEStackSize(MachineFrameInfo &MFI,
-                                                    unsigned &MaxAlign) const {
-  // Process all fixed stack objects.
+/// Returns true if there are any SVE callee saves.
+static bool getSVECalleeSaveSlotRange(const MachineFrameInfo &MFI,
+                                      int &Min, int &Max) {
+  if (!MFI.isCalleeSavedInfoValid())
+    return false;
+
+  Min = std::numeric_limits<int>::max();
+  Max = std::numeric_limits<int>::min();
+  const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
+  for (auto &CS : CSI) {
+    if (AArch64::ZPRRegClass.contains(CS.getReg()) ||
+        AArch64::PPRRegClass.contains(CS.getReg())) {
+      assert((Max == std::numeric_limits<int>::min() ||
+              Max + 1 == CS.getFrameIdx()) &&
+             "SVE CalleeSaves are not consecutive");
+
+      Min = std::min(Min, CS.getFrameIdx());
+      Max = std::max(Max, CS.getFrameIdx());
+    }
+  }
+  return Min != std::numeric_limits<int>::max();
+}
+
+// Process all the SVE stack objects and determine offsets for each
+// object. If AssignOffsets is true, the offsets get assigned.
+// Fills in the first and last callee-saved frame indices into
+// Min/MaxCSFrameIndex, respectively.
+// Returns the size of the stack.
+static int64_t determineSVEStackObjectOffsets(MachineFrameInfo &MFI,
+                                              int &MinCSFrameIndex,
+                                              int &MaxCSFrameIndex,
+                                              bool AssignOffsets) {
+  // First process all fixed stack objects.
   int64_t Offset = 0;
   for (int I = MFI.getObjectIndexBegin(); I != 0; ++I)
     if (MFI.getStackID(I) == TargetStackID::SVEVector) {
@@ -2332,12 +2527,41 @@ int64_t AArch64FrameLowering::determineSVEStackSize(MachineFrameInfo &MFI,
       Offset = FixedOffset;
     }
 
+  // Then process all callee saved slots.
+  if (getSVECalleeSaveSlotRange(MFI, MinCSFrameIndex, MaxCSFrameIndex)) {
+    // Make sure to align the last callee save slot.
+    MFI.setObjectAlignment(MaxCSFrameIndex, 16U);
+
+    // Assign offsets to the callee save slots.
+    for (int I = MinCSFrameIndex; I <= MaxCSFrameIndex; ++I) {
+      Offset += MFI.getObjectSize(I);
+      Offset = alignTo(Offset, MFI.getObjectAlignment(I));
+      if (AssignOffsets) {
+        LLVM_DEBUG(dbgs() << "alloc FI(" << I << ") at SP[" << Offset
+                          << "]\n");
+        MFI.setObjectOffset(I, -Offset);
+      }
+    }
+  }
+
   // Note: We don't take allocatable stack objects into
   // account yet, because allocation for those is not yet
   // implemented.
   return Offset;
 }
 
+int64_t AArch64FrameLowering::estimateSVEStackObjectOffsets(
+    MachineFrameInfo &MFI) const {
+  int MinCSFrameIndex, MaxCSFrameIndex;
+  return determineSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex,
+                                        false);
+}
+
+int64_t AArch64FrameLowering::assignSVEStackObjectOffsets(
+    MachineFrameInfo &MFI, int &MinCSFrameIndex, int &MaxCSFrameIndex) const {
+  return determineSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex,
+                                        true);
+}
+
 void AArch64FrameLowering::processFunctionBeforeFrameFinalized(
     MachineFunction &MF, RegScavenger *RS) const {
   MachineFrameInfo &MFI = MF.getFrameInfo();
@@ -2345,12 +2569,13 @@ void AArch64FrameLowering::processFunctionBeforeFrameFinalized(
   assert(getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown &&
          "Upwards growing stack unsupported");
 
-  unsigned MaxAlign = getStackAlignment();
-  int64_t SVEStackSize = determineSVEStackSize(MFI, MaxAlign);
+  int MinCSFrameIndex, MaxCSFrameIndex;
+  int64_t SVEStackSize =
+      assignSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex);
 
   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
-  AFI->setStackSizeSVE(alignTo(SVEStackSize, MaxAlign));
-  assert(MaxAlign <= 16 && "Cannot align scalable vectors more than 16 bytes");
+  AFI->setStackSizeSVE(alignTo(SVEStackSize, 16U));
+  AFI->setMinMaxSVECSFrameIndex(MinCSFrameIndex, MaxCSFrameIndex);
 
   // If this function isn't doing Win64-style C++ EH, we don't need to do
   // anything.
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/llvm/lib/Target/AArch64/AArch64FrameLowering.h
index ac150e8..f84847d 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.h
@@ -101,7 +101,11 @@ public:
 private:
   bool shouldCombineCSRLocalStackBump(MachineFunction &MF,
                                       unsigned StackBumpBytes) const;
-  int64_t determineSVEStackSize(MachineFrameInfo &MF, unsigned &MaxAlign) const;
+
+  int64_t estimateSVEStackObjectOffsets(MachineFrameInfo &MF) const;
+  int64_t assignSVEStackObjectOffsets(MachineFrameInfo &MF,
+                                      int &MinCSFrameIndex,
+                                      int &MaxCSFrameIndex) const;
 };
 
 } // End llvm namespace
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 9705a1b..2a3b3a3 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -3118,6 +3118,9 @@ CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
   switch (CC) {
   default:
     report_fatal_error("Unsupported calling convention.");
+  case CallingConv::AArch64_SVE_VectorCall:
+    // Calling SVE functions is currently not yet supported.
+    report_fatal_error("Unsupported calling convention.");
   case CallingConv::WebKit_JS:
     return CC_AArch64_WebKit_JS;
   case CallingConv::GHC:
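computeCalleeSaveRegisterPairs earlier in this patch keeps two running
offsets, one in plain bytes for GPR/FPR saves and one in "scalable bytes" for
Z/P saves, and divides each slot's offset by the per-type scale to obtain the
VL-scaled immediate carried by STR_ZXI/STR_PXI. A self-contained model of
that bookkeeping (a sketch; the register list and driver are illustrative,
matching the save_restore_pregs_sve test at the end of the patch):

    #include <cassert>
    #include <cstdio>

    enum class RegType { GPR, FPR64, FPR128, PPR, ZPR };

    // Mirrors RegPairInfo::getScale(): the unit, in (scalable) bytes, of the
    // immediate on the corresponding spill/fill instruction.
    unsigned getScale(RegType T) {
      switch (T) {
      case RegType::PPR:                       return 2;  // predicate slot
      case RegType::GPR: case RegType::FPR64:  return 8;
      case RegType::ZPR: case RegType::FPR128: return 16;
      }
      return 0;
    }

    int main() {
      // Three predicate callee-saves: 3 * 2 = 6 scalable bytes of
      // callee-save area, padded to a 16-byte multiple.
      int ScalableByteOffset = 16;
      const char *Regs[] = {"p4", "p5", "p6"}; // CSI order
      for (const char *Reg : Regs) {
        ScalableByteOffset -= getScale(RegType::PPR);
        assert(ScalableByteOffset % (int)getScale(RegType::PPR) == 0);
        // STR_PXI <reg>, $sp, <imm>: imm is in predicate-sized units.
        std::printf("STR_PXI %s, $sp, %d\n", Reg,
                    ScalableByteOffset / (int)getScale(RegType::PPR));
      }
      return 0;
    }

This prints immediates 7, 6 and 5 for p4, p5 and p6, matching the STR_PXI and
LDR_PXI offsets in the CHECK lines of that test.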
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index 3266186..dc9ca27 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -53,8 +53,13 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
   /// Amount of stack frame size, not including callee-saved registers.
   unsigned LocalStackSize;
 
+  /// The start and end frame indices for the SVE callee saves.
+  int MinSVECSFrameIndex;
+  int MaxSVECSFrameIndex;
+
   /// Amount of stack frame size used for saving callee-saved registers.
   unsigned CalleeSavedStackSize;
+  unsigned SVECalleeSavedStackSize;
   bool HasCalleeSavedStackSize = false;
 
   /// Number of TLS accesses using the special (combinable)
@@ -161,7 +166,6 @@ public:
   void setCalleeSaveStackHasFreeSpace(bool s) {
     CalleeSaveStackHasFreeSpace = s;
   }
-
   bool isSplitCSR() const { return IsSplitCSR; }
   void setIsSplitCSR(bool s) { IsSplitCSR = s; }
@@ -218,6 +222,22 @@ public:
     return CalleeSavedStackSize;
   }
 
+  // Saves the CalleeSavedStackSize for SVE vectors in 'scalable bytes'
+  void setSVECalleeSavedStackSize(unsigned Size) {
+    SVECalleeSavedStackSize = Size;
+  }
+  unsigned getSVECalleeSavedStackSize() const {
+    return SVECalleeSavedStackSize;
+  }
+
+  void setMinMaxSVECSFrameIndex(int Min, int Max) {
+    MinSVECSFrameIndex = Min;
+    MaxSVECSFrameIndex = Max;
+  }
+
+  int getMinSVECSFrameIndex() const { return MinSVECSFrameIndex; }
+  int getMaxSVECSFrameIndex() const { return MaxSVECSFrameIndex; }
+
   void incNumLocalDynamicTLSAccesses() { ++NumLocalDynamicTLSAccesses; }
   unsigned getNumLocalDynamicTLSAccesses() const {
     return NumLocalDynamicTLSAccesses;
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
index 918e89f..47ccef5e 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -55,6 +55,8 @@ AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
     return CSR_AArch64_AllRegs_SaveList;
   if (MF->getFunction().getCallingConv() == CallingConv::AArch64_VectorCall)
     return CSR_AArch64_AAVPCS_SaveList;
+  if (MF->getFunction().getCallingConv() == CallingConv::AArch64_SVE_VectorCall)
+    return CSR_AArch64_SVE_AAPCS_SaveList;
   if (MF->getFunction().getCallingConv() == CallingConv::CXX_FAST_TLS)
     return MF->getInfo<AArch64FunctionInfo>()->isSplitCSR() ?
                CSR_AArch64_CXX_TLS_Darwin_PE_SaveList :
@@ -125,7 +127,8 @@ AArch64RegisterInfo::getCallPreservedMask(const MachineFunction &MF,
   if (CC == CallingConv::AArch64_VectorCall)
     return SCS ? CSR_AArch64_AAVPCS_SCS_RegMask : CSR_AArch64_AAVPCS_RegMask;
   if (CC == CallingConv::AArch64_SVE_VectorCall)
-    return CSR_AArch64_SVE_AAPCS_RegMask;
+    return SCS ? CSR_AArch64_SVE_AAPCS_SCS_RegMask
+               : CSR_AArch64_SVE_AAPCS_RegMask;
   if (CC == CallingConv::CFGuard_Check)
     return CSR_Win_AArch64_CFGuard_Check_RegMask;
   if (MF.getSubtarget<AArch64Subtarget>().getTargetLowering()
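For the save_restore_sve test in the MIR file below, the sizing performed by
determineCalleeSaves and determineSVEStackObjectOffsets can be reproduced by
hand. A worked example (a sketch, not taken from the patch; quantities are in
"scalable bytes", one byte per 16-byte granule of vector length):

    #include <cassert>
    #include <cstdint>

    int main() {
      int64_t Offset = 16;  // fixed-stack sve-vec object, 16 bytes
      Offset += 16 * 16;    // z8-z23: 16 vector registers, 16 bytes each
      Offset += 12 * 2;     // p4-p15: 12 predicate registers, 2 bytes each
      assert(Offset == 296);

      // The last callee-save slot is 16-byte aligned, padding the total to
      // 304 scalable bytes, i.e. 19 vector-length units. That is the single
      // "$sp = frame-setup ADDVL_XXI $sp, -19" in the CHECK lines below.
      int64_t SVEStackSize = (Offset + 15) / 16 * 16;
      assert(SVEStackSize == 304 && SVEStackSize / 16 == 19);
      return 0;
    }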
diff --git a/llvm/test/CodeGen/AArch64/framelayout-sve.mir b/llvm/test/CodeGen/AArch64/framelayout-sve.mir
index 452ae94..18d6796 100644
--- a/llvm/test/CodeGen/AArch64/framelayout-sve.mir
+++ b/llvm/test/CodeGen/AArch64/framelayout-sve.mir
@@ -30,6 +30,10 @@
   define void @test_address_sve_fp() nounwind { entry: unreachable }
   define void @test_stack_arg_sve() nounwind { entry: unreachable }
   define void @test_address_sve_out_of_range() nounwind { entry: unreachable }
+  define aarch64_sve_vector_pcs void @save_restore_pregs_sve() nounwind { entry: unreachable }
+  define aarch64_sve_vector_pcs void @save_restore_zregs_sve() nounwind { entry: unreachable }
+  define aarch64_sve_vector_pcs void @save_restore_sve() nounwind { entry: unreachable }
+  define aarch64_sve_vector_pcs void @save_restore_sve_realign() nounwind { entry: unreachable }
 ...
 
 # +----------+
@@ -328,3 +332,183 @@ body: |
     RET_ReallyLR
 ---
+...
+# CHECK-LABEL: name: save_restore_pregs_sve
+# CHECK: $sp = frame-setup ADDVL_XXI $sp, -1
+# CHECK: frame-setup STR_PXI killed $p6, $sp, 5
+# CHECK: frame-setup STR_PXI killed $p5, $sp, 6
+# CHECK: frame-setup STR_PXI killed $p4, $sp, 7
+# CHECK: $sp = frame-setup SUBXri $sp, 32, 0
+
+# CHECK: $sp = frame-destroy ADDXri $sp, 32, 0
+# CHECK: $p6 = frame-destroy LDR_PXI $sp, 5
+# CHECK: $p5 = frame-destroy LDR_PXI $sp, 6
+# CHECK: $p4 = frame-destroy LDR_PXI $sp, 7
+# CHECK: $sp = frame-destroy ADDVL_XXI $sp, 1
+# CHECK: RET_ReallyLR
+name: save_restore_pregs_sve
+stack:
+  - { id: 0, stack-id: default, size: 32, alignment: 16 }
+body: |
+  bb.0.entry:
+
+    $p4 = IMPLICIT_DEF
+    $p5 = IMPLICIT_DEF
+    $p6 = IMPLICIT_DEF
+
+    RET_ReallyLR
+---
+...
+# CHECK-LABEL: name: save_restore_zregs_sve
+# CHECK: $sp = frame-setup ADDVL_XXI $sp, -3
+# CHECK: frame-setup STR_ZXI killed $z10, $sp, 0
+# CHECK: frame-setup STR_ZXI killed $z9, $sp, 1
+# CHECK: frame-setup STR_ZXI killed $z8, $sp, 2
+# CHECK: $sp = frame-setup SUBXri $sp, 32, 0
+
+# CHECK: $sp = frame-destroy ADDXri $sp, 32, 0
+# CHECK: $z10 = frame-destroy LDR_ZXI $sp, 0
+# CHECK: $z9 = frame-destroy LDR_ZXI $sp, 1
+# CHECK: $z8 = frame-destroy LDR_ZXI $sp, 2
+# CHECK: $sp = frame-destroy ADDVL_XXI $sp, 3
+# CHECK: RET_ReallyLR
+name: save_restore_zregs_sve
+stack:
+  - { id: 0, stack-id: default, size: 32, alignment: 16 }
+body: |
+  bb.0.entry:
+
+    $z8 = IMPLICIT_DEF
+    $z9 = IMPLICIT_DEF
+    $z10 = IMPLICIT_DEF
+
+    RET_ReallyLR
+---
+...
+# Test allocation/deallocation of the stack frame together with the
+# saving/restoring of callee-save registers. Fixed-stack objects
+# are allocated before the callee-saves.
+# This also adds some non-SVE callee-saves, to ensure that those are
+# paired correctly.
+#
+# CHECK-LABEL: name: save_restore_sve
+# CHECK: $sp = frame-setup STPXpre killed ${{[a-z0-9]+}}, killed $x21, $sp, -4
+# CHECK: frame-setup STPXi killed $x20, killed $x19, $sp, 2
+# CHECK: $sp = frame-setup ADDVL_XXI $sp, -19
+# CHECK: frame-setup STR_PXI killed $p15, $sp, 4
+# CHECK: frame-setup STR_PXI killed $p14, $sp, 5
+# CHECK: frame-setup STR_PXI killed $p5, $sp, 14
+# CHECK: frame-setup STR_PXI killed $p4, $sp, 15
+# CHECK: frame-setup STR_ZXI killed $z23, $sp, 2
+# CHECK: frame-setup STR_ZXI killed $z22, $sp, 3
+# CHECK: frame-setup STR_ZXI killed $z9, $sp, 16
+# CHECK: frame-setup STR_ZXI killed $z8, $sp, 17
+# CHECK: $sp = frame-setup SUBXri $sp, 32, 0
+
+# CHECK: $sp = frame-destroy ADDXri $sp, 32, 0
+# CHECK: $p15 = frame-destroy LDR_PXI $sp, 4
+# CHECK: $p14 = frame-destroy LDR_PXI $sp, 5
+# CHECK: $p5 = frame-destroy LDR_PXI $sp, 14
+# CHECK: $p4 = frame-destroy LDR_PXI $sp, 15
+# CHECK: $z23 = frame-destroy LDR_ZXI $sp, 2
+# CHECK: $z22 = frame-destroy LDR_ZXI $sp, 3
+# CHECK: $z9 = frame-destroy LDR_ZXI $sp, 16
+# CHECK: $z8 = frame-destroy LDR_ZXI $sp, 17
+# CHECK: $sp = frame-destroy ADDVL_XXI $sp, 19
+# CHECK: $x20, $x19 = frame-destroy LDPXi $sp, 2
+# CHECK: $sp, ${{[a-z0-9]+}}, $x21 = frame-destroy LDPXpost $sp, 4
+# CHECK: RET_ReallyLR
+name: save_restore_sve
+fixedStack:
+  - { id: 0, stack-id: sve-vec, size: 16, alignment: 16, offset: -16 }
+stack:
+  - { id: 0, stack-id: default, size: 32, alignment: 16 }
+body: |
+  bb.0.entry:
+
+    $z8_z9_z10_z11 = IMPLICIT_DEF
+    $z12_z13_z14_z15 = IMPLICIT_DEF
+    $z16_z17_z18_z19 = IMPLICIT_DEF
+    $z20_z21_z22_z23 = IMPLICIT_DEF
+    $z24_z25_z26_z27 = IMPLICIT_DEF
+    $z28_z29_z30_z31 = IMPLICIT_DEF
+    $p4 = IMPLICIT_DEF
+    $p5 = IMPLICIT_DEF
+    $p6 = IMPLICIT_DEF
+    $p7 = IMPLICIT_DEF
+    $p8 = IMPLICIT_DEF
+    $p9 = IMPLICIT_DEF
+    $p10 = IMPLICIT_DEF
+    $p11 = IMPLICIT_DEF
+    $p12 = IMPLICIT_DEF
+    $p13 = IMPLICIT_DEF
+    $p14 = IMPLICIT_DEF
+    $p15 = IMPLICIT_DEF
+
+    $x19 = IMPLICIT_DEF
+    $x20 = IMPLICIT_DEF
+    $x21 = IMPLICIT_DEF
+
+    RET_ReallyLR
+---
+...
+# Test allocation/deallocation of the stack frame together with the
+# saving/restoring of callee-save registers. Fixed-stack objects
+# are allocated before the callee-saves.
+#
+# CHECK-LABEL: name: save_restore_sve_realign
+# CHECK: $sp = frame-setup STPXpre killed $fp, killed $lr, $sp, -2
+# CHECK-NEXT: $fp = frame-setup ADDXri $sp, 0, 0
+# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -19
+# CHECK-NEXT: STR_PXI killed $p15, $sp, 4
+# CHECK-NEXT: STR_PXI killed $p14, $sp, 5
+# CHECK: STR_PXI killed $p5, $sp, 14
+# CHECK-NEXT: STR_PXI killed $p4, $sp, 15
+# CHECK-NEXT: STR_ZXI killed $z23, $sp, 2
+# CHECK-NEXT: STR_ZXI killed $z22, $sp, 3
+# CHECK: STR_ZXI killed $z9, $sp, 16
+# CHECK-NEXT: STR_ZXI killed $z8, $sp, 17
+# CHECK-NEXT: $[[TMP:x[0-9]+]] = frame-setup SUBXri $sp, 16, 0
+# CHECK-NEXT: $sp = ANDXri killed $[[TMP]]
+
+# CHECK: $sp = frame-destroy ADDVL_XXI $fp, -19
+# CHECK-NEXT: $p15 = frame-destroy LDR_PXI $sp, 4
+# CHECK-NEXT: $p14 = frame-destroy LDR_PXI $sp, 5
+# CHECK: $p5 = frame-destroy LDR_PXI $sp, 14
+# CHECK-NEXT: $p4 = frame-destroy LDR_PXI $sp, 15
+# CHECK-NEXT: $z23 = frame-destroy LDR_ZXI $sp, 2
+# CHECK-NEXT: $z22 = frame-destroy LDR_ZXI $sp, 3
+# CHECK: $z9 = frame-destroy LDR_ZXI $sp, 16
+# CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 17
+# CHECK-NEXT: $sp = frame-destroy ADDXri $fp, 0, 0
+# CHECK-NEXT: $sp, $fp, $lr = frame-destroy LDPXpost $sp, 2
+# CHECK-NEXT: RET_ReallyLR
+name: save_restore_sve_realign
+fixedStack:
+  - { id: 0, stack-id: sve-vec, size: 16, alignment: 16, offset: -16 }
+stack:
+  - { id: 0, stack-id: default, size: 16, alignment: 32 }
+body: |
+  bb.0.entry:
+
+    $z8_z9_z10_z11 = IMPLICIT_DEF
+    $z12_z13_z14_z15 = IMPLICIT_DEF
+    $z16_z17_z18_z19 = IMPLICIT_DEF
+    $z20_z21_z22_z23 = IMPLICIT_DEF
+    $z24_z25_z26_z27 = IMPLICIT_DEF
+    $z28_z29_z30_z31 = IMPLICIT_DEF
+    $p4 = IMPLICIT_DEF
+    $p5 = IMPLICIT_DEF
+    $p6 = IMPLICIT_DEF
+    $p7 = IMPLICIT_DEF
+    $p8 = IMPLICIT_DEF
+    $p9 = IMPLICIT_DEF
+    $p10 = IMPLICIT_DEF
+    $p11 = IMPLICIT_DEF
+    $p12 = IMPLICIT_DEF
+    $p13 = IMPLICIT_DEF
+    $p14 = IMPLICIT_DEF
+    $p15 = IMPLICIT_DEF
+
+    RET_ReallyLR
+---
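The RUN line of framelayout-sve.mir sits outside the hunks shown here, so the
exact invocation is not visible in this patch. A typical driver line for a
MIR test that exercises only frame lowering (an assumption, not taken from
the diff) would be:

    # RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+sve \
    # RUN:     -run-pass=prologepilog %s -o - | FileCheck %s

Running just the prologepilog pass over the parsed MIR produces the
frame-setup and frame-destroy instructions that the CHECK lines above match.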