From a63a5b993ee90e7da6afc052c6702709f39dd9eb Mon Sep 17 00:00:00 2001 From: Martin Storsjo Date: Sat, 17 Feb 2018 14:26:32 +0000 Subject: [PATCH] [AArch64] Implement dynamic stack probing for windows This makes sure that alloca() function calls properly probe the stack as needed. Differential Revision: https://reviews.llvm.org/D42356 llvm-svn: 325433 --- llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp | 4 ++ .../lib/Target/AArch64/AArch64CallingConvention.td | 4 ++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 69 +++++++++++++++++++++- llvm/lib/Target/AArch64/AArch64ISelLowering.h | 4 ++ llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp | 4 ++ llvm/lib/Target/AArch64/AArch64RegisterInfo.h | 3 + llvm/test/CodeGen/AArch64/win-alloca.ll | 23 ++++++++ llvm/test/CodeGen/AArch64/win64_vararg.ll | 7 ++- 8 files changed, 114 insertions(+), 4 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/win-alloca.ll diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index bafca24..4f6b41d 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -1023,6 +1023,10 @@ bool IRTranslator::translateAlloca(const User &U, return true; } + // FIXME: support stack probing for Windows. + if (MF->getTarget().getTargetTriple().isOSWindows()) + return false; + // Now we're in the harder dynamic case. 
Type *Ty = AI.getAllocatedType(); unsigned Align = diff --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.td b/llvm/lib/Target/AArch64/AArch64CallingConvention.td index 93a6844..e576731 100644 --- a/llvm/lib/Target/AArch64/AArch64CallingConvention.td +++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.td @@ -345,3 +345,7 @@ def CSR_AArch64_NoRegs : CalleeSavedRegs<(add)>; def CSR_AArch64_RT_MostRegs : CalleeSavedRegs<(add CSR_AArch64_AAPCS, (sequence "X%u", 9, 15))>; +def CSR_AArch64_StackProbe_Windows + : CalleeSavedRegs<(add (sequence "X%u", 0, 15), + (sequence "X%u", 18, 28), FP, SP, + (sequence "Q%u", 0, 31))>; diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index fe21610..9330d7c 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -253,7 +253,11 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, // Variable-sized objects. setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); - setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand); + + if (Subtarget->isTargetWindows()) + setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom); + else + setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand); // Constant pool entries setOperationAction(ISD::ConstantPool, MVT::i64, Custom); @@ -2687,6 +2691,8 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, return LowerATOMIC_LOAD_SUB(Op, DAG); case ISD::ATOMIC_LOAD_AND: return LowerATOMIC_LOAD_AND(Op, DAG); + case ISD::DYNAMIC_STACKALLOC: + return LowerDYNAMIC_STACKALLOC(Op, DAG); } } @@ -7415,6 +7421,67 @@ SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op, AN->getMemOperand()); } +SDValue AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC( + SDValue Op, SDValue Chain, SDValue &Size, SelectionDAG &DAG) const { + SDLoc dl(Op); + EVT PtrVT = 
getPointerTy(DAG.getDataLayout()); + SDValue Callee = DAG.getTargetExternalSymbol("__chkstk", PtrVT, 0); + + const uint32_t *Mask = + Subtarget->getRegisterInfo()->getWindowsStackProbePreservedMask(); + + Size = DAG.getNode(ISD::SRL, dl, MVT::i64, Size, + DAG.getConstant(4, dl, MVT::i64)); + Chain = DAG.getCopyToReg(Chain, dl, AArch64::X15, Size, SDValue()); + Chain = + DAG.getNode(AArch64ISD::CALL, dl, DAG.getVTList(MVT::Other, MVT::Glue), + Chain, Callee, DAG.getRegister(AArch64::X15, MVT::i64), + DAG.getRegisterMask(Mask), Chain.getValue(1)); + // To match the actual intent better, we should read the output from X15 here + // again (instead of potentially spilling it to the stack), but rereading Size + // from X15 here doesn't work at -O0, since it thinks that X15 is undefined + // here. + + Size = DAG.getNode(ISD::SHL, dl, MVT::i64, Size, + DAG.getConstant(4, dl, MVT::i64)); + return Chain; +} + +SDValue +AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, + SelectionDAG &DAG) const { + assert(Subtarget->isTargetWindows() && + "Only Windows alloca probing supported"); + SDLoc dl(Op); + // Get the inputs. 
+ SDNode *Node = Op.getNode(); + SDValue Chain = Op.getOperand(0); + SDValue Size = Op.getOperand(1); + unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); + EVT VT = Node->getValueType(0); + + Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl); + + Chain = LowerWindowsDYNAMIC_STACKALLOC(Op, Chain, Size, DAG); + + SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64); + Chain = SP.getValue(1); + SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size); + Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP); + + if (Align) { + SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), + DAG.getConstant(-(uint64_t)Align, dl, VT)); + Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP); + } + + Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true), + DAG.getIntPtrConstant(0, dl, true), SDValue(), dl); + + SDValue Ops[2] = {SP, Chain}; + return DAG.getMergeValues(Ops, dl); +} + /// getTgtMemIntrinsic - Represent NEON load and store intrinsics as /// MemIntrinsicNodes. The associated MachineMemOperands record the alignment /// specified in the intrinsic calls. 
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 2003196..23bf4fe 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -598,6 +598,10 @@ private: SDValue LowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerATOMIC_LOAD_SUB(SDValue Op, SelectionDAG &DAG) const; SDValue LowerATOMIC_LOAD_AND(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, SDValue Chain, + SDValue &Size, + SelectionDAG &DAG) const; SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, std::vector<SDNode *> *Created) const override; diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp index 360b391..933cd81 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -114,6 +114,10 @@ AArch64RegisterInfo::getThisReturnPreservedMask(const MachineFunction &MF, return CSR_AArch64_AAPCS_ThisReturn_RegMask; } +const uint32_t *AArch64RegisterInfo::getWindowsStackProbePreservedMask() const { + return CSR_AArch64_StackProbe_Windows_RegMask; +} + BitVector AArch64RegisterInfo::getReservedRegs(const MachineFunction &MF) const { const AArch64FrameLowering *TFI = getFrameLowering(MF); diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h index 37d65e6..799d8c8 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h @@ -61,6 +61,9 @@ public: const uint32_t *getThisReturnPreservedMask(const MachineFunction &MF, CallingConv::ID) const; + /// Stack probing calls preserve different CSRs to the normal CC. 
+ const uint32_t *getWindowsStackProbePreservedMask() const; + BitVector getReservedRegs(const MachineFunction &MF) const override; bool isConstantPhysReg(unsigned PhysReg) const override; const TargetRegisterClass * diff --git a/llvm/test/CodeGen/AArch64/win-alloca.ll b/llvm/test/CodeGen/AArch64/win-alloca.ll new file mode 100644 index 0000000..04467b9 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/win-alloca.ll @@ -0,0 +1,23 @@ +; RUN: llc -mtriple aarch64-windows -verify-machineinstrs -filetype asm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-OPT +; RUN: llc -mtriple aarch64-windows -verify-machineinstrs -filetype asm -o - %s -O0 | FileCheck %s + +define void @func(i64 %a) { +entry: + %0 = alloca i8, i64 %a, align 16 + call void @func2(i8* nonnull %0) + ret void +} + +declare void @func2(i8*) + +; The -O0 version here ends up much less elegant, so just check the +; details of the optimized form, but check that -O0 at least emits the +; call to __chkstk. + +; CHECK: add [[REG1:x[0-9]+]], x0, #15 +; CHECK-OPT: lsr x15, [[REG1]], #4 +; CHECK: bl __chkstk +; CHECK: mov [[REG2:x[0-9]+]], sp +; CHECK-OPT: sub [[REG3:x[0-9]+]], [[REG2]], x15, lsl #4 +; CHECK-OPT: mov sp, [[REG3:x[0-9]+]] +; CHECK: bl func2 diff --git a/llvm/test/CodeGen/AArch64/win64_vararg.ll b/llvm/test/CodeGen/AArch64/win64_vararg.ll index 6b20275..9cc9f50ad 100644 --- a/llvm/test/CodeGen/AArch64/win64_vararg.ll +++ b/llvm/test/CodeGen/AArch64/win64_vararg.ll @@ -159,14 +159,15 @@ attributes #6 = { "no-frame-pointer-elim"="true" } ; CHECK: stur x8, [x29, #-40] ; CHECK: mov w8, w0 ; CHECK: add x8, x8, #15 -; CHECK: mov x9, sp -; CHECK: and x8, x8, #0x1fffffff0 -; CHECK: sub [[REG:x[0-9]+]], x9, x8 +; CHECK: lsr x15, x8, #4 ; CHECK: mov x19, x1 ; CHECK: mov [[REG2:x[0-9]+]], sp ; CHECK: stp x6, x7, [x29, #48] ; CHECK: stp x4, x5, [x29, #32] ; CHECK: stp x2, x3, [x29, #16] +; CHECK: bl __chkstk +; CHECK: mov x8, sp +; CHECK: sub [[REG:x[0-9]+]], x8, x15, lsl #4 ; CHECK: mov sp, [[REG]] ; CHECK: 
ldur [[REG3:x[0-9]+]], [x29, #-40] ; CHECK: sxtw [[REG4:x[0-9]+]], w0 -- 2.7.4