From a63a5b993ee90e7da6afc052c6702709f39dd9eb Mon Sep 17 00:00:00 2001
From: Martin Storsjo <martin@martin.st>
Date: Sat, 17 Feb 2018 14:26:32 +0000
Subject: [PATCH] [AArch64] Implement dynamic stack probing for windows

This makes sure that alloca() function calls properly probe the
stack as needed.

Differential Revision: https://reviews.llvm.org/D42356

llvm-svn: 325433
---
 llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp       |  4 ++
 .../lib/Target/AArch64/AArch64CallingConvention.td |  4 ++
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp    | 69 +++++++++++++++++++++-
 llvm/lib/Target/AArch64/AArch64ISelLowering.h      |  4 ++
 llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp    |  4 ++
 llvm/lib/Target/AArch64/AArch64RegisterInfo.h      |  3 +
 llvm/test/CodeGen/AArch64/win-alloca.ll            | 23 ++++++++
 llvm/test/CodeGen/AArch64/win64_vararg.ll          |  7 ++-
 8 files changed, 114 insertions(+), 4 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/win-alloca.ll

diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
index bafca24..4f6b41d 100644
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -1023,6 +1023,10 @@ bool IRTranslator::translateAlloca(const User &U,
     return true;
   }
 
+  // FIXME: support stack probing for Windows.
+  if (MF->getTarget().getTargetTriple().isOSWindows())
+    return false;
+
   // Now we're in the harder dynamic case.
   Type *Ty = AI.getAllocatedType();
   unsigned Align =
diff --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.td b/llvm/lib/Target/AArch64/AArch64CallingConvention.td
index 93a6844..e576731 100644
--- a/llvm/lib/Target/AArch64/AArch64CallingConvention.td
+++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.td
@@ -345,3 +345,7 @@ def CSR_AArch64_NoRegs : CalleeSavedRegs<(add)>;
 def CSR_AArch64_RT_MostRegs :  CalleeSavedRegs<(add CSR_AArch64_AAPCS,
                                                 (sequence "X%u", 9, 15))>;
 
+def CSR_AArch64_StackProbe_Windows
+    : CalleeSavedRegs<(add (sequence "X%u", 0, 15),
+                           (sequence "X%u", 18, 28), FP, SP,
+                           (sequence "Q%u", 0, 31))>;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index fe21610..9330d7c 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -253,7 +253,11 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
   // Variable-sized objects.
   setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
   setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
-  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
+
+  if (Subtarget->isTargetWindows())
+    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
+  else
+    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
 
   // Constant pool entries
   setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
@@ -2687,6 +2691,8 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
     return LowerATOMIC_LOAD_SUB(Op, DAG);
   case ISD::ATOMIC_LOAD_AND:
     return LowerATOMIC_LOAD_AND(Op, DAG);
+  case ISD::DYNAMIC_STACKALLOC:
+    return LowerDYNAMIC_STACKALLOC(Op, DAG);
   }
 }
 
@@ -7415,6 +7421,67 @@ SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op,
                        AN->getMemOperand());
 }
 
+SDValue AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(
+    SDValue Op, SDValue Chain, SDValue &Size, SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  EVT PtrVT = getPointerTy(DAG.getDataLayout());
+  SDValue Callee = DAG.getTargetExternalSymbol("__chkstk", PtrVT, 0);
+
+  const uint32_t *Mask =
+      Subtarget->getRegisterInfo()->getWindowsStackProbePreservedMask();
+
+  Size = DAG.getNode(ISD::SRL, dl, MVT::i64, Size,
+                     DAG.getConstant(4, dl, MVT::i64));
+  Chain = DAG.getCopyToReg(Chain, dl, AArch64::X15, Size, SDValue());
+  Chain =
+      DAG.getNode(AArch64ISD::CALL, dl, DAG.getVTList(MVT::Other, MVT::Glue),
+                  Chain, Callee, DAG.getRegister(AArch64::X15, MVT::i64),
+                  DAG.getRegisterMask(Mask), Chain.getValue(1));
+  // To match the actual intent better, we should read the output from X15 here
+  // again (instead of potentially spilling it to the stack), but rereading Size
+  // from X15 here doesn't work at -O0, since it thinks that X15 is undefined
+  // here.
+
+  Size = DAG.getNode(ISD::SHL, dl, MVT::i64, Size,
+                     DAG.getConstant(4, dl, MVT::i64));
+  return Chain;
+}
+
+SDValue
+AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  assert(Subtarget->isTargetWindows() &&
+         "Only Windows alloca probing supported");
+  SDLoc dl(Op);
+  // Get the inputs.
+  SDNode *Node = Op.getNode();
+  SDValue Chain = Op.getOperand(0);
+  SDValue Size = Op.getOperand(1);
+  unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
+  EVT VT = Node->getValueType(0);
+
+  Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
+
+  Chain = LowerWindowsDYNAMIC_STACKALLOC(Op, Chain, Size, DAG);
+
+  SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
+  Chain = SP.getValue(1);
+  SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
+  Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
+
+  if (Align) {
+    SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
+                     DAG.getConstant(-(uint64_t)Align, dl, VT));
+    Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
+  }
+
+  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
+                             DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
+
+  SDValue Ops[2] = {SP, Chain};
+  return DAG.getMergeValues(Ops, dl);
+}
+
 /// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
 /// MemIntrinsicNodes.  The associated MachineMemOperands record the alignment
 /// specified in the intrinsic calls.
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 2003196..23bf4fe 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -598,6 +598,10 @@ private:
   SDValue LowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerATOMIC_LOAD_SUB(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerATOMIC_LOAD_AND(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, SDValue Chain,
+                                         SDValue &Size,
+                                         SelectionDAG &DAG) const;
 
   SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
                         std::vector<SDNode *> *Created) const override;
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
index 360b391..933cd81 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -114,6 +114,10 @@ AArch64RegisterInfo::getThisReturnPreservedMask(const MachineFunction &MF,
   return CSR_AArch64_AAPCS_ThisReturn_RegMask;
 }
 
+const uint32_t *AArch64RegisterInfo::getWindowsStackProbePreservedMask() const {
+  return CSR_AArch64_StackProbe_Windows_RegMask;
+}
+
 BitVector
 AArch64RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   const AArch64FrameLowering *TFI = getFrameLowering(MF);
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
index 37d65e6..799d8c8 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
@@ -61,6 +61,9 @@ public:
   const uint32_t *getThisReturnPreservedMask(const MachineFunction &MF,
                                              CallingConv::ID) const;
 
+  /// Stack probing calls preserve different CSRs to the normal CC.
+  const uint32_t *getWindowsStackProbePreservedMask() const;
+
   BitVector getReservedRegs(const MachineFunction &MF) const override;
   bool isConstantPhysReg(unsigned PhysReg) const override;
   const TargetRegisterClass *
diff --git a/llvm/test/CodeGen/AArch64/win-alloca.ll b/llvm/test/CodeGen/AArch64/win-alloca.ll
new file mode 100644
index 0000000..04467b9
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/win-alloca.ll
@@ -0,0 +1,23 @@
+; RUN: llc -mtriple aarch64-windows -verify-machineinstrs -filetype asm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-OPT
+; RUN: llc -mtriple aarch64-windows -verify-machineinstrs -filetype asm -o - %s -O0 | FileCheck %s
+
+define void @func(i64 %a) {
+entry:
+  %0 = alloca i8, i64 %a, align 16
+  call void @func2(i8* nonnull %0)
+  ret void
+}
+
+declare void @func2(i8*)
+
+; The -O0 version here ends up much less elegant, so just check the
+; details of the optimized form, but check that -O0 at least emits the
+; call to __chkstk.
+
+; CHECK: add [[REG1:x[0-9]+]], x0, #15
+; CHECK-OPT: lsr x15, [[REG1]], #4
+; CHECK: bl __chkstk
+; CHECK: mov [[REG2:x[0-9]+]], sp
+; CHECK-OPT: sub [[REG3:x[0-9]+]], [[REG2]], x15, lsl #4
+; CHECK-OPT: mov sp, [[REG3:x[0-9]+]]
+; CHECK: bl func2
diff --git a/llvm/test/CodeGen/AArch64/win64_vararg.ll b/llvm/test/CodeGen/AArch64/win64_vararg.ll
index 6b20275..9cc9f50ad 100644
--- a/llvm/test/CodeGen/AArch64/win64_vararg.ll
+++ b/llvm/test/CodeGen/AArch64/win64_vararg.ll
@@ -159,14 +159,15 @@ attributes #6 = { "no-frame-pointer-elim"="true" }
 ; CHECK: stur    x8, [x29, #-40]
 ; CHECK: mov     w8, w0
 ; CHECK: add     x8, x8, #15
-; CHECK: mov     x9, sp
-; CHECK: and     x8, x8, #0x1fffffff0
-; CHECK: sub     [[REG:x[0-9]+]], x9, x8
+; CHECK: lsr     x15, x8, #4
 ; CHECK: mov     x19, x1
 ; CHECK: mov     [[REG2:x[0-9]+]], sp
 ; CHECK: stp     x6, x7, [x29, #48]
 ; CHECK: stp     x4, x5, [x29, #32]
 ; CHECK: stp     x2, x3, [x29, #16]
+; CHECK: bl      __chkstk
+; CHECK: mov     x8, sp
+; CHECK: sub     [[REG:x[0-9]+]], x8, x15, lsl #4
 ; CHECK: mov     sp, [[REG]]
 ; CHECK: ldur    [[REG3:x[0-9]+]], [x29, #-40]
 ; CHECK: sxtw    [[REG4:x[0-9]+]], w0
-- 
2.7.4