assert(!Res && "Call operand has unhandled type");
(void)Res;
}
- assert(ArgLocs.size() == Ins.size());
SmallVector<SDValue, 16> ArgValues;
- for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
- CCValAssign &VA = ArgLocs[i];
+ unsigned ExtraArgLocs = 0;
+ for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
if (Ins[i].Flags.isByVal()) {
// Byval is used for HFAs in the PCS, but the system should work in a
// non-compliant manner for larger structs.
if (VA.getLocInfo() == CCValAssign::Indirect) {
assert(VA.getValVT().isScalableVector() &&
"Only scalable vectors can be passed indirectly");
- // If value is passed via pointer - do a load.
- ArgValue =
- DAG.getLoad(VA.getValVT(), DL, Chain, ArgValue, MachinePointerInfo());
- }
- if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer())
- ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(),
- ArgValue, DAG.getValueType(MVT::i32));
- InVals.push_back(ArgValue);
+ uint64_t PartSize = VA.getValVT().getStoreSize().getKnownMinSize();
+ unsigned NumParts = 1;
+ if (Ins[i].Flags.isInConsecutiveRegs()) {
+ assert(!Ins[i].Flags.isInConsecutiveRegsLast());
+ while (!Ins[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
+ ++NumParts;
+ }
+
+ MVT PartLoad = VA.getValVT();
+ SDValue Ptr = ArgValue;
+
+ // Ensure we generate all loads for each tuple part, whilst updating the
+ // pointer after each load correctly using vscale.
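+ // Illustrative example (matching the tests below): an <vscale x 8 x double>
+ // argument arrives as four <vscale x 2 x double> parts, so NumParts is 4,
+ // PartSize is 16 (the known-minimum store size of one part), and each
+ // BytesIncrement below adds vscale * 16 bytes to the pointer.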
+ while (NumParts > 0) {
+ ArgValue = DAG.getLoad(PartLoad, DL, Chain, Ptr, MachinePointerInfo());
+ InVals.push_back(ArgValue);
+ NumParts--;
+ if (NumParts > 0) {
+ SDValue BytesIncrement = DAG.getVScale(
+ DL, Ptr.getValueType(),
+ APInt(Ptr.getValueSizeInBits().getFixedSize(), PartSize));
+ SDNodeFlags Flags;
+ Flags.setNoUnsignedWrap(true);
+ Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
+ BytesIncrement, Flags);
+ ExtraArgLocs++;
+ i++;
+ }
+ }
+ } else {
+ if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer())
+ ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(),
+ ArgValue, DAG.getValueType(MVT::i32));
+ InVals.push_back(ArgValue);
+ }
}
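+ // An indirectly-passed tuple consumes a single ArgLoc (the pointer) but
+ // NumParts entries in Ins; ExtraArgLocs accounts for the difference.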
+ assert((ArgLocs.size() + ExtraArgLocs) == Ins.size());
// varargs
AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
}
// Walk the register/memloc assignments, inserting copies/loads.
- for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
- CCValAssign &VA = ArgLocs[i];
+ unsigned ExtraArgLocs = 0;
+ for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
SDValue Arg = OutVals[i];
ISD::ArgFlagsTy Flags = Outs[i].Flags;
case CCValAssign::Indirect:
assert(VA.getValVT().isScalableVector() &&
"Only scalable vectors can be passed indirectly");
+
+ uint64_t StoreSize = VA.getValVT().getStoreSize().getKnownMinSize();
+ uint64_t PartSize = StoreSize;
+ unsigned NumParts = 1;
+ if (Outs[i].Flags.isInConsecutiveRegs()) {
+ assert(!Outs[i].Flags.isInConsecutiveRegsLast());
+ while (!Outs[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
+ ++NumParts;
+ StoreSize *= NumParts;
+ }
+
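+ // The single stack object must hold every part of the tuple. Illustrative
+ // example (matching the tests below): four <vscale x 2 x double> parts give
+ // StoreSize == 64, i.e. vscale * 64 bytes once the SVE stack ID is applied.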
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
Type *Ty = EVT(VA.getValVT()).getTypeForEVT(*DAG.getContext());
Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty);
- int FI = MFI.CreateStackObject(
- VA.getValVT().getStoreSize().getKnownMinSize(), Alignment, false);
+ int FI = MFI.CreateStackObject(StoreSize, Alignment, false);
MFI.setStackID(FI, TargetStackID::SVEVector);
- SDValue SpillSlot = DAG.getFrameIndex(
+ MachinePointerInfo MPI =
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI);
+ SDValue Ptr = DAG.getFrameIndex(
FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
- Chain = DAG.getStore(
- Chain, DL, Arg, SpillSlot,
- MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
+ SDValue SpillSlot = Ptr;
+
+ // Ensure we generate all stores for each tuple part, whilst updating the
+ // pointer after each store correctly using vscale.
+ while (NumParts) {
+ Chain = DAG.getStore(Chain, DL, OutVals[i], Ptr, MPI);
+ NumParts--;
+ if (NumParts > 0) {
+ SDValue BytesIncrement = DAG.getVScale(
+ DL, Ptr.getValueType(),
+ APInt(Ptr.getValueSizeInBits().getFixedSize(), PartSize));
+ SDNodeFlags Flags;
+ Flags.setNoUnsignedWrap(true);
+
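+ // Later parts sit at vscale-scaled offsets that the fixed-stack
+ // MachinePointerInfo cannot describe, so fall back to one that only
+ // carries the address space.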
+ MPI = MachinePointerInfo(MPI.getAddrSpace());
+ Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
+ BytesIncrement, Flags);
+ ExtraArgLocs++;
+ i++;
+ }
+ }
+
Arg = SpillSlot;
break;
}
uint32_t BEAlign = 0;
unsigned OpSize;
if (VA.getLocInfo() == CCValAssign::Indirect)
- OpSize = VA.getLocVT().getSizeInBits();
+ OpSize = VA.getLocVT().getFixedSizeInBits();
else
OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
: VA.getValVT().getSizeInBits();
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple aarch64-linux-gnu -mattr=+sve | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
+
+; Make sure callers set up the arguments correctly - tests AArch64ISelLowering::LowerCall
+
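+; In foo1 the first <vscale x 8 x double> tuple fits in z1-z4, but the second
+; would need registers beyond z7, so the caller spills it with one st1d per
+; part at "#n, mul vl" offsets from sp and passes its address in x0 instead.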
+define float @foo1(double* %x0, double* %x1, double* %x2) nounwind {
+; CHECK-LABEL: foo1:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-4
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: ld4d { z1.d, z2.d, z3.d, z4.d }, p0/z, [x0]
+; CHECK-NEXT: ld4d { z16.d, z17.d, z18.d, z19.d }, p0/z, [x1]
+; CHECK-NEXT: ld1d { z5.d }, p0/z, [x2]
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: fmov s0, #1.00000000
+; CHECK-NEXT: mov x0, sp
+; CHECK-NEXT: st1d { z16.d }, p0, [sp]
+; CHECK-NEXT: st1d { z17.d }, p0, [x8, #1, mul vl]
+; CHECK-NEXT: st1d { z18.d }, p0, [x8, #2, mul vl]
+; CHECK-NEXT: st1d { z19.d }, p0, [x8, #3, mul vl]
+; CHECK-NEXT: bl callee1
+; CHECK-NEXT: addvl sp, sp, #4
+; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %0 = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+ %1 = call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %0)
+ %2 = call <vscale x 8 x double> @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1(<vscale x 2 x i1> %1, double* %x0)
+ %3 = call <vscale x 8 x double> @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1(<vscale x 2 x i1> %1, double* %x1)
+ %4 = call <vscale x 2 x double> @llvm.aarch64.sve.ld1.nxv2f64(<vscale x 2 x i1> %1, double* %x2)
+ %call = call float @callee1(float 1.000000e+00, <vscale x 8 x double> %2, <vscale x 8 x double> %3, <vscale x 2 x double> %4)
+ ret float %call
+}
+
+define float @foo2(double* %x0, double* %x1) nounwind {
+; CHECK-LABEL: foo2:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-4
+; CHECK-NEXT: sub sp, sp, #16 // =16
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: ld4d { z1.d, z2.d, z3.d, z4.d }, p0/z, [x0]
+; CHECK-NEXT: ld4d { z16.d, z17.d, z18.d, z19.d }, p0/z, [x1]
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: add x8, sp, #16 // =16
+; CHECK-NEXT: add x9, sp, #16 // =16
+; CHECK-NEXT: fmov s0, #1.00000000
+; CHECK-NEXT: mov w1, #1
+; CHECK-NEXT: mov w2, #2
+; CHECK-NEXT: mov w3, #3
+; CHECK-NEXT: mov w4, #4
+; CHECK-NEXT: mov w5, #5
+; CHECK-NEXT: mov w6, #6
+; CHECK-NEXT: mov w7, #7
+; CHECK-NEXT: mov w0, wzr
+; CHECK-NEXT: st1d { z16.d }, p0, [x9]
+; CHECK-NEXT: st1d { z17.d }, p0, [x8, #1, mul vl]
+; CHECK-NEXT: st1d { z18.d }, p0, [x8, #2, mul vl]
+; CHECK-NEXT: st1d { z19.d }, p0, [x8, #3, mul vl]
+; CHECK-NEXT: str x8, [sp]
+; CHECK-NEXT: bl callee2
+; CHECK-NEXT: addvl sp, sp, #4
+; CHECK-NEXT: add sp, sp, #16 // =16
+; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %0 = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+ %1 = call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %0)
+ %2 = call <vscale x 8 x double> @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1(<vscale x 2 x i1> %1, double* %x0)
+ %3 = call <vscale x 8 x double> @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1(<vscale x 2 x i1> %1, double* %x1)
+ %call = call float @callee2(i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, float 1.000000e+00, <vscale x 8 x double> %2, <vscale x 8 x double> %3)
+ ret float %call
+}
+
+define float @foo3(double* %x0, double* %x1, double* %x2) nounwind {
+; CHECK-LABEL: foo3:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: ld4d { z2.d, z3.d, z4.d, z5.d }, p0/z, [x0]
+; CHECK-NEXT: ld3d { z16.d, z17.d, z18.d }, p0/z, [x1]
+; CHECK-NEXT: ld1d { z6.d }, p0/z, [x2]
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: fmov s0, #1.00000000
+; CHECK-NEXT: fmov s1, #2.00000000
+; CHECK-NEXT: mov x0, sp
+; CHECK-NEXT: st1d { z16.d }, p0, [sp]
+; CHECK-NEXT: st1d { z17.d }, p0, [x8, #1, mul vl]
+; CHECK-NEXT: st1d { z18.d }, p0, [x8, #2, mul vl]
+; CHECK-NEXT: bl callee3
+; CHECK-NEXT: addvl sp, sp, #3
+; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %0 = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+ %1 = call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %0)
+ %2 = call <vscale x 8 x double> @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1(<vscale x 2 x i1> %1, double* %x0)
+ %3 = call <vscale x 6 x double> @llvm.aarch64.sve.ld3.nxv6f64.nxv2i1(<vscale x 2 x i1> %1, double* %x1)
+ %4 = call <vscale x 2 x double> @llvm.aarch64.sve.ld1.nxv2f64(<vscale x 2 x i1> %1, double* %x2)
+ %call = call float @callee3(float 1.000000e+00, float 2.000000e+00, <vscale x 8 x double> %2, <vscale x 6 x double> %3, <vscale x 2 x double> %4)
+ ret float %call
+}
+
+; Make sure callees read the arguments correctly - tests AArch64ISelLowering::LowerFormalArguments
+
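+; In foo4 the second <vscale x 8 x double> tuple is received indirectly: its
+; address arrives in x3 and the callee reloads the four parts with one ld1d
+; per part at "#n, mul vl" offsets before storing them out.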
+define double @foo4(double %x0, double * %ptr1, double * %ptr2, double * %ptr3, <vscale x 8 x double> %x1, <vscale x 8 x double> %x2, <vscale x 2 x double> %x3) nounwind {
+; CHECK-LABEL: foo4:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1d { z6.d }, p0/z, [x3, #1, mul vl]
+; CHECK-NEXT: ld1d { z7.d }, p0/z, [x3]
+; CHECK-NEXT: ld1d { z24.d }, p0/z, [x3, #3, mul vl]
+; CHECK-NEXT: ld1d { z25.d }, p0/z, [x3, #2, mul vl]
+; CHECK-NEXT: st1d { z4.d }, p0, [x0, #3, mul vl]
+; CHECK-NEXT: st1d { z3.d }, p0, [x0, #2, mul vl]
+; CHECK-NEXT: st1d { z2.d }, p0, [x0, #1, mul vl]
+; CHECK-NEXT: st1d { z1.d }, p0, [x0]
+; CHECK-NEXT: st1d { z25.d }, p0, [x1, #2, mul vl]
+; CHECK-NEXT: st1d { z24.d }, p0, [x1, #3, mul vl]
+; CHECK-NEXT: st1d { z7.d }, p0, [x1]
+; CHECK-NEXT: st1d { z6.d }, p0, [x1, #1, mul vl]
+; CHECK-NEXT: st1d { z5.d }, p0, [x2]
+; CHECK-NEXT: ret
+entry:
+ %ptr1.bc = bitcast double * %ptr1 to <vscale x 8 x double> *
+ store volatile <vscale x 8 x double> %x1, <vscale x 8 x double>* %ptr1.bc
+ %ptr2.bc = bitcast double * %ptr2 to <vscale x 8 x double> *
+ store volatile <vscale x 8 x double> %x2, <vscale x 8 x double>* %ptr2.bc
+ %ptr3.bc = bitcast double * %ptr3 to <vscale x 2 x double> *
+ store volatile <vscale x 2 x double> %x3, <vscale x 2 x double>* %ptr3.bc
+ ret double %x0
+}
+
+define double @foo5(i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, double * %ptr1, double * %ptr2, double %x0, <vscale x 8 x double> %x1, <vscale x 8 x double> %x2) nounwind {
+; CHECK-LABEL: foo5:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldr x8, [sp]
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1d { z5.d }, p0/z, [x8, #1, mul vl]
+; CHECK-NEXT: ld1d { z6.d }, p0/z, [x8]
+; CHECK-NEXT: ld1d { z7.d }, p0/z, [x8, #3, mul vl]
+; CHECK-NEXT: ld1d { z24.d }, p0/z, [x8, #2, mul vl]
+; CHECK-NEXT: st1d { z4.d }, p0, [x6, #3, mul vl]
+; CHECK-NEXT: st1d { z3.d }, p0, [x6, #2, mul vl]
+; CHECK-NEXT: st1d { z2.d }, p0, [x6, #1, mul vl]
+; CHECK-NEXT: st1d { z1.d }, p0, [x6]
+; CHECK-NEXT: st1d { z24.d }, p0, [x7, #2, mul vl]
+; CHECK-NEXT: st1d { z7.d }, p0, [x7, #3, mul vl]
+; CHECK-NEXT: st1d { z6.d }, p0, [x7]
+; CHECK-NEXT: st1d { z5.d }, p0, [x7, #1, mul vl]
+; CHECK-NEXT: ret
+entry:
+ %ptr1.bc = bitcast double * %ptr1 to <vscale x 8 x double> *
+ store volatile <vscale x 8 x double> %x1, <vscale x 8 x double>* %ptr1.bc
+ %ptr2.bc = bitcast double * %ptr2 to <vscale x 8 x double> *
+ store volatile <vscale x 8 x double> %x2, <vscale x 8 x double>* %ptr2.bc
+ ret double %x0
+}
+
+define double @foo6(double %x0, double %x1, double * %ptr1, double * %ptr2, <vscale x 8 x double> %x2, <vscale x 6 x double> %x3) nounwind {
+; CHECK-LABEL: foo6:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x2]
+; CHECK-NEXT: ld1d { z6.d }, p0/z, [x2, #2, mul vl]
+; CHECK-NEXT: ld1d { z7.d }, p0/z, [x2, #1, mul vl]
+; CHECK-NEXT: st1d { z5.d }, p0, [x0, #3, mul vl]
+; CHECK-NEXT: st1d { z4.d }, p0, [x0, #2, mul vl]
+; CHECK-NEXT: st1d { z3.d }, p0, [x0, #1, mul vl]
+; CHECK-NEXT: st1d { z2.d }, p0, [x0]
+; CHECK-NEXT: st1d { z7.d }, p0, [x1, #1, mul vl]
+; CHECK-NEXT: st1d { z6.d }, p0, [x1, #2, mul vl]
+; CHECK-NEXT: st1d { z1.d }, p0, [x1]
+; CHECK-NEXT: ret
+entry:
+ %ptr1.bc = bitcast double * %ptr1 to <vscale x 8 x double> *
+ store volatile <vscale x 8 x double> %x2, <vscale x 8 x double>* %ptr1.bc
+ %ptr2.bc = bitcast double * %ptr2 to <vscale x 6 x double> *
+ store volatile <vscale x 6 x double> %x3, <vscale x 6 x double>* %ptr2.bc
+ ret double %x0
+}
+
+declare float @callee1(float, <vscale x 8 x double>, <vscale x 8 x double>, <vscale x 2 x double>)
+declare float @callee2(i32, i32, i32, i32, i32, i32, i32, i32, float, <vscale x 8 x double>, <vscale x 8 x double>)
+declare float @callee3(float, float, <vscale x 8 x double>, <vscale x 6 x double>, <vscale x 2 x double>)
+
+declare <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 immarg)
+declare <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1>)
+declare <vscale x 8 x double> @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1(<vscale x 2 x i1>, double*)
+declare <vscale x 6 x double> @llvm.aarch64.sve.ld3.nxv6f64.nxv2i1(<vscale x 2 x i1>, double*)
+declare <vscale x 2 x double> @llvm.aarch64.sve.ld1.nxv2f64(<vscale x 2 x i1>, double*)
+declare double @llvm.aarch64.sve.faddv.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>)
+declare <vscale x 2 x double> @llvm.aarch64.sve.tuple.get.nxv2f64.nxv8f64(<vscale x 8 x double>, i32 immarg)
+declare <vscale x 2 x double> @llvm.aarch64.sve.tuple.get.nxv2f64.nxv6f64(<vscale x 6 x double>, i32 immarg)