From 3225fcf11eb7741d4f0b7e891685d7133aae5c7b Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood@arm.com>
Date: Thu, 22 Oct 2020 09:37:33 +0100
Subject: [PATCH] [SVE] Deal with SVE tuple call arguments correctly when
 running out of registers

When passing SVE types as arguments to function calls we can run out of
hardware SVE registers. This is normally fine, since we switch to an
indirect mode where we pass a pointer to an SVE stack object in a GPR.
However, if we switch over part-way through processing an SVE tuple then
part of it will be in registers and the other part will be on the stack.
I've fixed this by ensuring that:

1. When we don't have enough registers to allocate the whole block we
   mark any remaining SVE registers temporarily as allocated.
2. We temporarily remove the InConsecutiveRegs flags from the last tuple
   part argument and reinvoke the autogenerated calling convention
   handler. Doing this prevents the code from entering an infinite
   recursion and, in combination with 1), ensures we switch over to the
   Indirect mode.
3. After allocating a GPR for the pointer to the tuple we then
   deallocate any SVE registers we marked as allocated in 1). We also
   set the InConsecutiveRegs flags back to how they were before.
4. I've changed the AArch64ISelLowering LowerCall and
   LowerFormalArguments functions to detect the start of a tuple, which
   involves allocating a single stack object and doing the correct
   number of legal loads and stores.

Differential Revision: https://reviews.llvm.org/D90219
---
 llvm/include/llvm/CodeGen/CallingConvLower.h     |   7 +
 llvm/include/llvm/CodeGen/TargetCallingConv.h    |   6 +-
 llvm/lib/CodeGen/CallingConvLower.cpp            |   5 +
 .../Target/AArch64/AArch64CallingConvention.cpp  |  57 +++++-
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp  | 100 ++++++++--
 .../AArch64/sve-calling-convention-mixed.ll      | 201 +++++++++++++++++++++
 .../sve-calling-convention-tuples-broken.ll      |  23 ---
 7 files changed, 347 insertions(+), 52 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll
 delete mode 100644 llvm/test/CodeGen/AArch64/sve-calling-convention-tuples-broken.ll

diff --git a/llvm/include/llvm/CodeGen/CallingConvLower.h b/llvm/include/llvm/CodeGen/CallingConvLower.h
index 52c88d8..2fe4e37 100644
--- a/llvm/include/llvm/CodeGen/CallingConvLower.h
+++ b/llvm/include/llvm/CodeGen/CallingConvLower.h
@@ -340,6 +340,11 @@ public:
     return Regs.size();
   }
 
+  void DeallocateReg(MCPhysReg Reg) {
+    assert(isAllocated(Reg) && "Trying to deallocate an unallocated register");
+    MarkUnallocated(Reg);
+  }
+
   /// AllocateReg - Attempt to allocate one register. If it is not available,
   /// return zero. Otherwise, return the register, marking it and any aliases
   /// as allocated.
@@ -570,6 +575,8 @@ public:
 private:
   /// MarkAllocated - Mark a register and all of its aliases as allocated.
  void MarkAllocated(MCPhysReg Reg);
+
+  void MarkUnallocated(MCPhysReg Reg);
 };
 
 } // end namespace llvm

diff --git a/llvm/include/llvm/CodeGen/TargetCallingConv.h b/llvm/include/llvm/CodeGen/TargetCallingConv.h
index 7baf5b2..df974b4 100644
--- a/llvm/include/llvm/CodeGen/TargetCallingConv.h
+++ b/llvm/include/llvm/CodeGen/TargetCallingConv.h
@@ -122,10 +122,12 @@ namespace ISD {
     void setReturned() { IsReturned = 1; }
 
     bool isInConsecutiveRegs() const { return IsInConsecutiveRegs; }
-    void setInConsecutiveRegs() { IsInConsecutiveRegs = 1; }
+    void setInConsecutiveRegs(bool Flag = true) { IsInConsecutiveRegs = Flag; }
 
     bool isInConsecutiveRegsLast() const { return IsInConsecutiveRegsLast; }
-    void setInConsecutiveRegsLast() { IsInConsecutiveRegsLast = 1; }
+    void setInConsecutiveRegsLast(bool Flag = true) {
+      IsInConsecutiveRegsLast = Flag;
+    }
 
     bool isSplit() const { return IsSplit; }
     void setSplit() { IsSplit = 1; }

diff --git a/llvm/lib/CodeGen/CallingConvLower.cpp b/llvm/lib/CodeGen/CallingConvLower.cpp
index 9afaf95..c9246f6 100644
--- a/llvm/lib/CodeGen/CallingConvLower.cpp
+++ b/llvm/lib/CodeGen/CallingConvLower.cpp
@@ -63,6 +63,11 @@ void CCState::MarkAllocated(MCPhysReg Reg) {
     UsedRegs[*AI / 32] |= 1 << (*AI & 31);
 }
 
+void CCState::MarkUnallocated(MCPhysReg Reg) {
+  for (MCRegAliasIterator AI(Reg, &TRI, true); AI.isValid(); ++AI)
+    UsedRegs[*AI / 32] &= ~(1 << (*AI & 31));
+}
+
 bool CCState::IsShadowAllocatedReg(MCRegister Reg) const {
   if (!isAllocated(Reg))
     return false;

diff --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp b/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp
index 9ae2b46..c51dd48 100644
--- a/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp
+++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp
@@ -42,6 +42,51 @@ static const MCPhysReg ZRegList[] = {AArch64::Z0, AArch64::Z1, AArch64::Z2,
 static bool finishStackBlock(SmallVectorImpl<CCValAssign> &PendingMembers,
                              MVT LocVT, ISD::ArgFlagsTy &ArgFlags,
                              CCState &State, Align SlotAlign) {
+  if (LocVT.isScalableVector()) {
+    const AArch64Subtarget &Subtarget = static_cast<const AArch64Subtarget &>(
+        State.getMachineFunction().getSubtarget());
+    const AArch64TargetLowering *TLI = Subtarget.getTargetLowering();
+
+    // We are about to reinvoke the CCAssignFn auto-generated handler. If we
+    // don't unset these flags we will get stuck in an infinite loop forever
+    // invoking the custom handler.
+    ArgFlags.setInConsecutiveRegs(false);
+    ArgFlags.setInConsecutiveRegsLast(false);
+
+    // The calling convention for passing SVE tuples states that in the event
+    // we cannot allocate enough registers for the tuple we should still leave
+    // any remaining registers unallocated. However, when we call the
+    // CCAssignFn again we want it to behave as if all remaining registers are
+    // allocated. This will force the code to pass the tuple indirectly in
+    // accordance with the PCS.
+    bool RegsAllocated[8];
+    for (int I = 0; I < 8; I++) {
+      RegsAllocated[I] = State.isAllocated(ZRegList[I]);
+      State.AllocateReg(ZRegList[I]);
+    }
+
+    auto &It = PendingMembers[0];
+    CCAssignFn *AssignFn =
+        TLI->CCAssignFnForCall(State.getCallingConv(), /*IsVarArg=*/false);
+    if (AssignFn(It.getValNo(), It.getValVT(), It.getValVT(), CCValAssign::Full,
+                 ArgFlags, State))
+      llvm_unreachable("Call operand has unhandled type");
+
+    // Return the flags to how they were before.
+    ArgFlags.setInConsecutiveRegs(true);
+    ArgFlags.setInConsecutiveRegsLast(true);
+
+    // Return the register state back to how it was before, leaving any
+    // unallocated registers available for other smaller types.
+    for (int I = 0; I < 8; I++)
+      if (!RegsAllocated[I])
+        State.DeallocateReg(ZRegList[I]);
+
+    // All pending members have now been allocated
+    PendingMembers.clear();
+    return true;
+  }
+
   unsigned Size = LocVT.getSizeInBits() / 8;
   const Align StackAlign =
       State.getMachineFunction().getDataLayout().getStackAlignment();
@@ -146,13 +191,11 @@ static bool CC_AArch64_Custom_Block(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
     return true;
   }
 
-  if (LocVT.isScalableVector())
-    report_fatal_error(
-        "Passing consecutive scalable vector registers unsupported");
-
-  // Mark all regs in the class as unavailable
-  for (auto Reg : RegList)
-    State.AllocateReg(Reg);
+  if (!LocVT.isScalableVector()) {
+    // Mark all regs in the class as unavailable
+    for (auto Reg : RegList)
+      State.AllocateReg(Reg);
+  }
 
   const Align SlotAlign = Subtarget.isTargetDarwin() ? Align(1) : Align(8);
 
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 15c67b4..f2613a8 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -4277,10 +4277,10 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
     assert(!Res && "Call operand has unhandled type");
     (void)Res;
   }
-  assert(ArgLocs.size() == Ins.size());
   SmallVector<SDValue, 16> ArgValues;
-  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
-    CCValAssign &VA = ArgLocs[i];
+  unsigned ExtraArgLocs = 0;
+  for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
 
     if (Ins[i].Flags.isByVal()) {
       // Byval is used for HFAs in the PCS, but the system should work in a
@@ -4408,16 +4408,44 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
     if (VA.getLocInfo() == CCValAssign::Indirect) {
       assert(VA.getValVT().isScalableVector() &&
              "Only scalable vectors can be passed indirectly");
-      // If value is passed via pointer - do a load.
-      ArgValue =
-          DAG.getLoad(VA.getValVT(), DL, Chain, ArgValue, MachinePointerInfo());
-    }
-
-    if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer())
-      ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(),
-                             ArgValue, DAG.getValueType(MVT::i32));
-
-    InVals.push_back(ArgValue);
+
+      uint64_t PartSize = VA.getValVT().getStoreSize().getKnownMinSize();
+      unsigned NumParts = 1;
+      if (Ins[i].Flags.isInConsecutiveRegs()) {
+        assert(!Ins[i].Flags.isInConsecutiveRegsLast());
+        while (!Ins[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
+          ++NumParts;
+      }
+
+      MVT PartLoad = VA.getValVT();
+      SDValue Ptr = ArgValue;
+
+      // Ensure we generate all loads for each tuple part, whilst updating the
+      // pointer after each load correctly using vscale.
+      while (NumParts > 0) {
+        ArgValue = DAG.getLoad(PartLoad, DL, Chain, Ptr, MachinePointerInfo());
+        InVals.push_back(ArgValue);
+        NumParts--;
+        if (NumParts > 0) {
+          SDValue BytesIncrement = DAG.getVScale(
+              DL, Ptr.getValueType(),
+              APInt(Ptr.getValueSizeInBits().getFixedSize(), PartSize));
+          SDNodeFlags Flags;
+          Flags.setNoUnsignedWrap(true);
+          Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
+                            BytesIncrement, Flags);
+          ExtraArgLocs++;
+          i++;
+        }
+      }
+    } else {
+      if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer())
+        ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(),
+                               ArgValue, DAG.getValueType(MVT::i32));
+      InVals.push_back(ArgValue);
+    }
   }
+  assert((ArgLocs.size() + ExtraArgLocs) == Ins.size());
 
   // varargs
@@ -5015,8 +5043,9 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
   }
 
   // Walk the register/memloc assignments, inserting copies/loads.
-  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
-    CCValAssign &VA = ArgLocs[i];
+  unsigned ExtraArgLocs = 0;
+  for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
     SDValue Arg = OutVals[i];
     ISD::ArgFlagsTy Flags = Outs[i].Flags;
 
@@ -5058,18 +5087,49 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
     case CCValAssign::Indirect:
       assert(VA.getValVT().isScalableVector() &&
             "Only scalable vectors can be passed indirectly");
+
+      uint64_t StoreSize = VA.getValVT().getStoreSize().getKnownMinSize();
+      uint64_t PartSize = StoreSize;
+      unsigned NumParts = 1;
+      if (Outs[i].Flags.isInConsecutiveRegs()) {
+        assert(!Outs[i].Flags.isInConsecutiveRegsLast());
+        while (!Outs[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
+          ++NumParts;
+        StoreSize *= NumParts;
+      }
+
       MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
       Type *Ty = EVT(VA.getValVT()).getTypeForEVT(*DAG.getContext());
       Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty);
-      int FI = MFI.CreateStackObject(
-          VA.getValVT().getStoreSize().getKnownMinSize(), Alignment, false);
+      int FI = MFI.CreateStackObject(StoreSize, Alignment, false);
       MFI.setStackID(FI, TargetStackID::SVEVector);
 
-      SDValue SpillSlot = DAG.getFrameIndex(
+      MachinePointerInfo MPI =
+          MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI);
+      SDValue Ptr = DAG.getFrameIndex(
           FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
-      Chain = DAG.getStore(
-          Chain, DL, Arg, SpillSlot,
-          MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
+      SDValue SpillSlot = Ptr;
+
+      // Ensure we generate all stores for each tuple part, whilst updating the
+      // pointer after each store correctly using vscale.
+      while (NumParts) {
+        Chain = DAG.getStore(Chain, DL, OutVals[i], Ptr, MPI);
+        NumParts--;
+        if (NumParts > 0) {
+          SDValue BytesIncrement = DAG.getVScale(
+              DL, Ptr.getValueType(),
+              APInt(Ptr.getValueSizeInBits().getFixedSize(), PartSize));
+          SDNodeFlags Flags;
+          Flags.setNoUnsignedWrap(true);
+
+          MPI = MachinePointerInfo(MPI.getAddrSpace());
+          Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
+                            BytesIncrement, Flags);
+          ExtraArgLocs++;
+          i++;
+        }
+      }
+
       Arg = SpillSlot;
       break;
     }
@@ -5121,7 +5181,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
       uint32_t BEAlign = 0;
       unsigned OpSize;
       if (VA.getLocInfo() == CCValAssign::Indirect)
-        OpSize = VA.getLocVT().getSizeInBits();
+        OpSize = VA.getLocVT().getFixedSizeInBits();
       else
         OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
                                  : VA.getValVT().getSizeInBits();
diff --git a/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll b/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll
new file mode 100644
index 0000000..806dd7e
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll
@@ -0,0 +1,201 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple aarch64-linux-gnu -mattr=+sve | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
+
+; Make sure callers set up the arguments correctly - tests AArch64ISelLowering::LowerCALL
+
+define float @foo1(double* %x0, double* %x1, double* %x2) nounwind {
+; CHECK-LABEL: foo1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-4
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    ld4d { z1.d, z2.d, z3.d, z4.d }, p0/z, [x0]
+; CHECK-NEXT:    ld4d { z16.d, z17.d, z18.d, z19.d }, p0/z, [x1]
+; CHECK-NEXT:    ld1d { z5.d }, p0/z, [x2]
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    fmov s0, #1.00000000
+; CHECK-NEXT:    mov x0, sp
+; CHECK-NEXT:    st1d { z16.d }, p0, [sp]
+; CHECK-NEXT:    st1d { z17.d }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    st1d { z18.d }, p0, [x8, #2, mul vl]
+; CHECK-NEXT:    st1d { z19.d }, p0, [x8, #3, mul vl]
+; CHECK-NEXT:    bl callee1
+; CHECK-NEXT:    addvl sp, sp, #4
+; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  %0 = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %1 = call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %0)
+  %2 = call <vscale x 8 x double> @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1(<vscale x 2 x i1> %1, double* %x0)
+  %3 = call <vscale x 8 x double> @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1(<vscale x 2 x i1> %1, double* %x1)
+  %4 = call <vscale x 2 x double> @llvm.aarch64.sve.ld1.nxv2f64(<vscale x 2 x i1> %1, double* %x2)
+  %call = call float @callee1(float 1.000000e+00, <vscale x 8 x double> %2, <vscale x 8 x double> %3, <vscale x 2 x double> %4)
+  ret float %call
+}
+
+define float @foo2(double* %x0, double* %x1) nounwind {
+; CHECK-LABEL: foo2:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-4
+; CHECK-NEXT:    sub sp, sp, #16 // =16
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    ld4d { z1.d, z2.d, z3.d, z4.d }, p0/z, [x0]
+; CHECK-NEXT:    ld4d { z16.d, z17.d, z18.d, z19.d }, p0/z, [x1]
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    add x8, sp, #16 // =16
+; CHECK-NEXT:    add x9, sp, #16 // =16
+; CHECK-NEXT:    fmov s0, #1.00000000
+; CHECK-NEXT:    mov w1, #1
+; CHECK-NEXT:    mov w2, #2
+; CHECK-NEXT:    mov w3, #3
+; CHECK-NEXT:    mov w4, #4
+; CHECK-NEXT:    mov w5, #5
+; CHECK-NEXT:    mov w6, #6
+; CHECK-NEXT:    mov w7, #7
+; CHECK-NEXT:    mov w0, wzr
+; CHECK-NEXT:    st1d { z16.d }, p0, [x9]
+; CHECK-NEXT:    st1d { z17.d }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    st1d { z18.d }, p0, [x8, #2, mul vl]
+; CHECK-NEXT:    st1d { z19.d }, p0, [x8, #3, mul vl]
+; CHECK-NEXT:    str x8, [sp]
+; CHECK-NEXT:    bl callee2
+; CHECK-NEXT:    addvl sp, sp, #4
+; CHECK-NEXT:    add sp, sp, #16 // =16
+; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  %0 = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %1 = call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %0)
+  %2 = call <vscale x 8 x double> @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1(<vscale x 2 x i1> %1, double* %x0)
+  %3 = call <vscale x 8 x double> @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1(<vscale x 2 x i1> %1, double* %x1)
+  %call = call float @callee2(i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, float 1.000000e+00, <vscale x 8 x double> %2, <vscale x 8 x double> %3)
+  ret float %call
+}
+
+define float @foo3(double* %x0, double* %x1, double* %x2) nounwind {
+; CHECK-LABEL: foo3:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-3
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    ld4d { z2.d, z3.d, z4.d, z5.d }, p0/z, [x0]
+; CHECK-NEXT:    ld3d { z16.d, z17.d, z18.d }, p0/z, [x1]
+; CHECK-NEXT:    ld1d { z6.d }, p0/z, [x2]
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    fmov s0, #1.00000000
+; CHECK-NEXT:    fmov s1, #2.00000000
+; CHECK-NEXT:    mov x0, sp
+; CHECK-NEXT:    st1d { z16.d }, p0, [sp]
+; CHECK-NEXT:    st1d { z17.d }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    st1d { z18.d }, p0, [x8, #2, mul vl]
+; CHECK-NEXT:    bl callee3
+; CHECK-NEXT:    addvl sp, sp, #3
+; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  %0 = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %1 = call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %0)
+  %2 = call <vscale x 8 x double> @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1(<vscale x 2 x i1> %1, double* %x0)
+  %3 = call <vscale x 6 x double> @llvm.aarch64.sve.ld3.nxv6f64.nxv2i1(<vscale x 2 x i1> %1, double* %x1)
+  %4 = call <vscale x 2 x double> @llvm.aarch64.sve.ld1.nxv2f64(<vscale x 2 x i1> %1, double* %x2)
+  %call = call float @callee3(float 1.000000e+00, float 2.000000e+00, <vscale x 8 x double> %2, <vscale x 6 x double> %3, <vscale x 2 x double> %4)
+  ret float %call
+}
+
+; Make sure callees read the arguments correctly - tests AArch64ISelLowering::LowerFormalArguments
+
+define double @foo4(double %x0, double * %ptr1, double * %ptr2, double * %ptr3, <vscale x 8 x double> %x1, <vscale x 8 x double> %x2, <vscale x 2 x double> %x3) nounwind {
+; CHECK-LABEL: foo4:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld1d { z6.d }, p0/z, [x3, #1, mul vl]
+; CHECK-NEXT:    ld1d { z7.d }, p0/z, [x3]
+; CHECK-NEXT:    ld1d { z24.d }, p0/z, [x3, #3, mul vl]
+; CHECK-NEXT:    ld1d { z25.d }, p0/z, [x3, #2, mul vl]
+; CHECK-NEXT:    st1d { z4.d }, p0, [x0, #3, mul vl]
+; CHECK-NEXT:    st1d { z3.d }, p0, [x0, #2, mul vl]
+; CHECK-NEXT:    st1d { z2.d }, p0, [x0, #1, mul vl]
+; CHECK-NEXT:    st1d { z1.d }, p0, [x0]
+; CHECK-NEXT:    st1d { z25.d }, p0, [x1, #2, mul vl]
+; CHECK-NEXT:    st1d { z24.d }, p0, [x1, #3, mul vl]
+; CHECK-NEXT:    st1d { z7.d }, p0, [x1]
+; CHECK-NEXT:    st1d { z6.d }, p0, [x1, #1, mul vl]
+; CHECK-NEXT:    st1d { z5.d }, p0, [x2]
+; CHECK-NEXT:    ret
+entry:
+  %ptr1.bc = bitcast double * %ptr1 to <vscale x 8 x double> *
+  store volatile <vscale x 8 x double> %x1, <vscale x 8 x double> * %ptr1.bc
+  %ptr2.bc = bitcast double * %ptr2 to <vscale x 8 x double> *
+  store volatile <vscale x 8 x double> %x2, <vscale x 8 x double> * %ptr2.bc
+  %ptr3.bc = bitcast double * %ptr3 to <vscale x 2 x double> *
+  store volatile <vscale x 2 x double> %x3, <vscale x 2 x double> * %ptr3.bc
+  ret double %x0
+}
+
+define double @foo5(i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, double * %ptr1, double * %ptr2, double %x0, <vscale x 8 x double> %x1, <vscale x 8 x double> %x2) nounwind {
+; CHECK-LABEL: foo5:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldr x8, [sp]
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld1d { z5.d }, p0/z, [x8, #1, mul vl]
+; CHECK-NEXT:    ld1d { z6.d }, p0/z, [x8]
+; CHECK-NEXT:    ld1d { z7.d }, p0/z, [x8, #3, mul vl]
+; CHECK-NEXT:    ld1d { z24.d }, p0/z, [x8, #2, mul vl]
+; CHECK-NEXT:    st1d { z4.d }, p0, [x6, #3, mul vl]
+; CHECK-NEXT:    st1d { z3.d }, p0, [x6, #2, mul vl]
+; CHECK-NEXT:    st1d { z2.d }, p0, [x6, #1, mul vl]
+; CHECK-NEXT:    st1d { z1.d }, p0, [x6]
+; CHECK-NEXT:    st1d { z24.d }, p0, [x7, #2, mul vl]
+; CHECK-NEXT:    st1d { z7.d }, p0, [x7, #3, mul vl]
+; CHECK-NEXT:    st1d { z6.d }, p0, [x7]
+; CHECK-NEXT:    st1d { z5.d }, p0, [x7, #1, mul vl]
+; CHECK-NEXT:    ret
+entry:
+  %ptr1.bc = bitcast double * %ptr1 to <vscale x 8 x double> *
+  store volatile <vscale x 8 x double> %x1, <vscale x 8 x double> * %ptr1.bc
+  %ptr2.bc = bitcast double * %ptr2 to <vscale x 8 x double> *
+  store volatile <vscale x 8 x double> %x2, <vscale x 8 x double> * %ptr2.bc
+  ret double %x0
+}
+
+define double @foo6(double %x0, double %x1, double * %ptr1, double * %ptr2, <vscale x 8 x double> %x2, <vscale x 6 x double> %x3) nounwind {
+; CHECK-LABEL: foo6:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x2]
+; CHECK-NEXT:    ld1d { z6.d }, p0/z, [x2, #2, mul vl]
+; CHECK-NEXT:    ld1d { z7.d }, p0/z, [x2, #1, mul vl]
+; CHECK-NEXT:    st1d { z5.d }, p0, [x0, #3, mul vl]
+; CHECK-NEXT:    st1d { z4.d }, p0, [x0, #2, mul vl]
+; CHECK-NEXT:    st1d { z3.d }, p0, [x0, #1, mul vl]
+; CHECK-NEXT:    st1d { z2.d }, p0, [x0]
+; CHECK-NEXT:    st1d { z7.d }, p0, [x1, #1, mul vl]
+; CHECK-NEXT:    st1d { z6.d }, p0, [x1, #2, mul vl]
+; CHECK-NEXT:    st1d { z1.d }, p0, [x1]
+; CHECK-NEXT:    ret
+entry:
+  %ptr1.bc = bitcast double * %ptr1 to <vscale x 8 x double> *
+  store volatile <vscale x 8 x double> %x2, <vscale x 8 x double> * %ptr1.bc
+  %ptr2.bc = bitcast double * %ptr2 to <vscale x 6 x double> *
+  store volatile <vscale x 6 x double> %x3, <vscale x 6 x double> * %ptr2.bc
+  ret double %x0
+}
+
+declare float @callee1(float, <vscale x 8 x double>, <vscale x 8 x double>, <vscale x 2 x double>)
+declare float @callee2(i32, i32, i32, i32, i32, i32, i32, i32, float, <vscale x 8 x double>, <vscale x 8 x double>)
+declare float @callee3(float, float, <vscale x 8 x double>, <vscale x 6 x double>, <vscale x 2 x double>)
+
+declare <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 immarg)
+declare <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1>)
+declare <vscale x 8 x double> @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1(<vscale x 2 x i1>, double*)
+declare <vscale x 6 x double> @llvm.aarch64.sve.ld3.nxv6f64.nxv2i1(<vscale x 2 x i1>, double*)
+declare <vscale x 2 x double> @llvm.aarch64.sve.ld1.nxv2f64(<vscale x 2 x i1>, double*)
+declare double @llvm.aarch64.sve.faddv.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>)
+declare <vscale x 2 x double> @llvm.aarch64.sve.tuple.get.nxv2f64.nxv8f64(<vscale x 8 x double>, i32 immarg)
+declare <vscale x 2 x double> @llvm.aarch64.sve.tuple.get.nxv2f64.nxv6f64(<vscale x 6 x double>, i32 immarg)
diff --git a/llvm/test/CodeGen/AArch64/sve-calling-convention-tuples-broken.ll b/llvm/test/CodeGen/AArch64/sve-calling-convention-tuples-broken.ll
deleted file mode 100644
index ee88f0b..0000000
--- a/llvm/test/CodeGen/AArch64/sve-calling-convention-tuples-broken.ll
+++ /dev/null
@@ -1,23 +0,0 @@
-; RUN: not --crash llc < %s -mtriple aarch64-linux-gnu -mattr=+sve >/dev/null 2>%t
-; RUN: FileCheck %s < %t
-
-; CHECK: Passing consecutive scalable vector registers unsupported
-
-target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
-target triple = "aarch64-unknown-linux-gnu"
-
-define float @foo(double* %x0, double* %x1) {
-entry:
-  %0 = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
-  %1 = call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %0)
-  %2 = call <vscale x 8 x double> @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1(<vscale x 2 x i1> %1, double* %x0)
-  %3 = call <vscale x 8 x double> @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1(<vscale x 2 x i1> %1, double* %x1)
-  %call = call float @callee(float 1.000000e+00, <vscale x 8 x double> %2, <vscale x 8 x double> %3)
-  ret float %call
-}
-
-declare float @callee(float, <vscale x 8 x double>, <vscale x 8 x double>)
-
-declare <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 immarg)
-declare <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1>)
-declare <vscale x 8 x double> @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1(<vscale x 2 x i1>, double*)
-- 
2.7.4
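
For readers coming to this patch from the calling-convention side, the situation the commit message describes is easiest to see from C/C++ source using the ACLE SVE types. The sketch below is illustrative only and is not part of the patch: the function names use_tuples and make_call are invented, and it assumes a compiler with SVE support (e.g. -march=armv8-a+sve). The float argument lands in s0 (which aliases z0), the first svfloat64x4_t tuple takes z1-z4, and the second tuple would need four more Z registers when only z5-z7 remain; with this change the whole second tuple is stored to one stack object and passed indirectly through a GPR instead of being split between registers and the stack, mirroring the foo1/callee1 test above.

  // Illustrative sketch only (not part of the patch); names are invented.
  #include <arm_sve.h>

  float use_tuples(float s, svfloat64x4_t a, svfloat64x4_t b);

  float make_call(const double *p, const double *q) {
    svbool_t pg = svptrue_b64();
    svfloat64x4_t a = svld4_f64(pg, p); // fits in z1-z4 (s occupies s0/z0)
    svfloat64x4_t b = svld4_f64(pg, q); // needs four registers but only z5-z7
                                        // are left, so it is now passed via a
                                        // pointer to a stack copy (here in x0)
    return use_tuples(1.0f, a, b);
  }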