From: Fraser Cormack Date: Tue, 2 Mar 2021 11:11:18 +0000 (+0000) Subject: [RISCV] Support fixed-length vectors in the calling convention X-Git-Tag: llvmorg-14-init~12377 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=0c5b789c7342ee8384507c3242fc256e23248c4d;p=platform%2Fupstream%2Fllvm.git [RISCV] Support fixed-length vectors in the calling convention This patch adds fixed-length vector support to the calling convention when RVV is used to lower fixed-length vectors. The scheme follows the regular vector calling convention for the argument/return registers, but uses scalable vector container types as the LocVTs, and converts to/from the fixed-length vector value types as required. Fixed-length vector types may be split when the combination of minimum VLEN and the maximum allowable LMUL is not large enough to fully contain the vector. In this case the behaviour differs between fixed-length vectors passed as parameters and as return values: 1. For return values, vectors must be passed entirely via registers or via the stack. 2. For parameters, unlike scalar values, split vectors continue to be passed by value, and are split across multiple registers until there are no remaining registers. Thus vector parameters may be found partly in registers and partly on the stack. As with scalable vectors, the first fixed-length mask vector is passed via v0. Split mask fixed-length vectors are passed first via v0 and then via the next available vector register: v8,v9,etc. The handling of vector return values uses all available argument registers v8-v23 which does not adhere to the calling convention we're supposedly implementing, but since this issue affects both fixed-length and scalable-vector values, it was left as-is. Reviewed By: craig.topper Differential Revision: https://reviews.llvm.org/D97954 --- diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index aa4863f..069e90c 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -145,7 +145,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, auto addRegClassForFixedVectors = [this](MVT VT) { unsigned LMul = Subtarget.getLMULForFixedLengthVector(VT); const TargetRegisterClass *RC; - if (LMul == 1) + if (LMul == 1 || VT.getVectorElementType() == MVT::i1) RC = &RISCV::VRRegClass; else if (LMul == 2) RC = &RISCV::VRM2RegClass; @@ -4939,8 +4939,8 @@ static bool CC_RISCV(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo, MVT XLenVT = XLen == 32 ? MVT::i32 : MVT::i64; // Any return value split in to more than two values can't be returned - // directly. - if (IsRet && ValNo > 1) + // directly. Vectors are returned via the available vector registers. + if (!LocVT.isVector() && IsRet && ValNo > 1) return true; // UseGPRForF16_F32 if targeting one of the soft-float ABIs, if passing a @@ -5031,9 +5031,15 @@ static bool CC_RISCV(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo, return false; } + // Fixed-length vectors are located in the corresponding scalable-vector + // container types. + if (ValVT.isFixedLengthVector()) + LocVT = TLI.getContainerForFixedLengthVector(LocVT); + // Split arguments might be passed indirectly, so keep track of the pending - // values. - if (ArgFlags.isSplit() || !PendingLocs.empty()) { + // values. Split vectors are passed via a mix of registers and indirectly, so + // treat them as we would any other argument. 
+ if (!LocVT.isVector() && (ArgFlags.isSplit() || !PendingLocs.empty())) { LocVT = XLenVT; LocInfo = CCValAssign::Indirect; PendingLocs.push_back( @@ -5046,7 +5052,7 @@ static bool CC_RISCV(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo, // If the split argument only had two elements, it should be passed directly // in registers or on the stack. - if (ArgFlags.isSplitEnd() && PendingLocs.size() <= 2) { + if (!LocVT.isVector() && ArgFlags.isSplitEnd() && PendingLocs.size() <= 2) { assert(PendingLocs.size() == 2 && "Unexpected PendingLocs.size()"); // Apply the normal calling convention rules to the first half of the // split argument. @@ -5066,7 +5072,7 @@ static bool CC_RISCV(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo, Reg = State.AllocateReg(ArgFPR32s); else if (ValVT == MVT::f64 && !UseGPRForF64) Reg = State.AllocateReg(ArgFPR64s); - else if (ValVT.isScalableVector()) { + else if (ValVT.isVector()) { const TargetRegisterClass *RC = TLI.getRegClassFor(ValVT); if (RC == &RISCV::VRRegClass) { // Assign the first mask argument to V0. @@ -5088,6 +5094,12 @@ static bool CC_RISCV(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo, llvm_unreachable("Unhandled class register for ValueType"); } if (!Reg) { + // For return values, the vector must be passed fully via registers or + // via the stack. + // FIXME: The proposed vector ABI only mandates v8-v15 for return values, + // but we're using all of them. + if (IsRet) + return true; LocInfo = CCValAssign::Indirect; // Try using a GPR to pass the address Reg = State.AllocateReg(ArgGPRs); @@ -5117,8 +5129,8 @@ static bool CC_RISCV(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo, } assert((!UseGPRForF16_F32 || !UseGPRForF64 || LocVT == XLenVT || - (TLI.getSubtarget().hasStdExtV() && ValVT.isScalableVector())) && - "Expected an XLenVT or scalable vector types at this stage"); + (TLI.getSubtarget().hasStdExtV() && ValVT.isVector())) && + "Expected an XLenVT or vector types at this stage"); if (Reg) { State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); @@ -5139,8 +5151,7 @@ template static Optional preAssignMask(const ArgTy &Args) { for (const auto &ArgIdx : enumerate(Args)) { MVT ArgVT = ArgIdx.value().VT; - if (ArgVT.isScalableVector() && - ArgVT.getVectorElementType().SimpleTy == MVT::i1) + if (ArgVT.isVector() && ArgVT.getVectorElementType() == MVT::i1) return ArgIdx.index(); } return None; @@ -5206,11 +5217,14 @@ void RISCVTargetLowering::analyzeOutputArgs( // Convert Val to a ValVT. Should not be called for CCValAssign::Indirect // values. 
static SDValue convertLocVTToValVT(SelectionDAG &DAG, SDValue Val, - const CCValAssign &VA, const SDLoc &DL) { + const CCValAssign &VA, const SDLoc &DL, + const RISCVSubtarget &Subtarget) { switch (VA.getLocInfo()) { default: llvm_unreachable("Unexpected CCValAssign::LocInfo"); case CCValAssign::Full: + if (VA.getValVT().isFixedLengthVector() && VA.getLocVT().isScalableVector()) + Val = convertFromScalableVector(VA.getValVT(), Val, DAG, Subtarget); break; case CCValAssign::BCvt: if (VA.getLocVT().isInteger() && VA.getValVT() == MVT::f16) @@ -5241,17 +5255,20 @@ static SDValue unpackFromRegLoc(SelectionDAG &DAG, SDValue Chain, if (VA.getLocInfo() == CCValAssign::Indirect) return Val; - return convertLocVTToValVT(DAG, Val, VA, DL); + return convertLocVTToValVT(DAG, Val, VA, DL, TLI.getSubtarget()); } static SDValue convertValVTToLocVT(SelectionDAG &DAG, SDValue Val, - const CCValAssign &VA, const SDLoc &DL) { + const CCValAssign &VA, const SDLoc &DL, + const RISCVSubtarget &Subtarget) { EVT LocVT = VA.getLocVT(); switch (VA.getLocInfo()) { default: llvm_unreachable("Unexpected CCValAssign::LocInfo"); case CCValAssign::Full: + if (VA.getValVT().isFixedLengthVector() && LocVT.isScalableVector()) + Val = convertToScalableVector(LocVT, Val, DAG, Subtarget); break; case CCValAssign::BCvt: if (VA.getLocVT().isInteger() && VA.getValVT() == MVT::f16) @@ -5512,14 +5529,17 @@ SDValue RISCVTargetLowering::LowerFormalArguments( if (VA.getLocInfo() == CCValAssign::Indirect) { // If the original argument was split and passed by reference (e.g. i128 // on RV32), we need to load all parts of it here (using the same - // address). + // address). Vectors may be partly split to registers and partly to the + // stack, in which case the base address is partly offset and subsequent + // stores are relative to that. InVals.push_back(DAG.getLoad(VA.getValVT(), DL, Chain, ArgValue, MachinePointerInfo())); unsigned ArgIndex = Ins[i].OrigArgIndex; - assert(Ins[i].PartOffset == 0); + unsigned ArgPartOffset = Ins[i].PartOffset; + assert(VA.getValVT().isVector() || ArgPartOffset == 0); while (i + 1 != e && Ins[i + 1].OrigArgIndex == ArgIndex) { CCValAssign &PartVA = ArgLocs[i + 1]; - unsigned PartOffset = Ins[i + 1].PartOffset; + unsigned PartOffset = Ins[i + 1].PartOffset - ArgPartOffset; SDValue Address = DAG.getNode(ISD::ADD, DL, PtrVT, ArgValue, DAG.getIntPtrConstant(PartOffset, DL)); InVals.push_back(DAG.getLoad(PartVA.getValVT(), DL, Chain, Address, @@ -5789,12 +5809,16 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI, DAG.getStore(Chain, DL, ArgValue, SpillSlot, MachinePointerInfo::getFixedStack(MF, FI))); // If the original argument was split (e.g. i128), we need - // to store all parts of it here (and pass just one address). + // to store the required parts of it here (and pass just one address). + // Vectors may be partly split to registers and partly to the stack, in + // which case the base address is partly offset and subsequent stores are + // relative to that. 
unsigned ArgIndex = Outs[i].OrigArgIndex; - assert(Outs[i].PartOffset == 0); + unsigned ArgPartOffset = Outs[i].PartOffset; + assert(VA.getValVT().isVector() || ArgPartOffset == 0); while (i + 1 != e && Outs[i + 1].OrigArgIndex == ArgIndex) { SDValue PartValue = OutVals[i + 1]; - unsigned PartOffset = Outs[i + 1].PartOffset; + unsigned PartOffset = Outs[i + 1].PartOffset - ArgPartOffset; SDValue Address = DAG.getNode(ISD::ADD, DL, PtrVT, SpillSlot, DAG.getIntPtrConstant(PartOffset, DL)); MemOpChains.push_back( @@ -5804,7 +5828,7 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI, } ArgValue = SpillSlot; } else { - ArgValue = convertValVTToLocVT(DAG, ArgValue, VA, DL); + ArgValue = convertValVTToLocVT(DAG, ArgValue, VA, DL, Subtarget); } // Use local copy if it is a byval arg. @@ -5940,7 +5964,7 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI, RetValue2); } - RetValue = convertLocVTToValVT(DAG, RetValue, VA, DL); + RetValue = convertLocVTToValVT(DAG, RetValue, VA, DL, Subtarget); InVals.push_back(RetValue); } @@ -6026,7 +6050,7 @@ RISCVTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, RetOps.push_back(DAG.getRegister(RegHi, MVT::i32)); } else { // Handle a 'normal' return. - Val = convertValVTToLocVT(DAG, Val, VA, DL); + Val = convertValVTToLocVT(DAG, Val, VA, DL, Subtarget); Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Val, Glue); if (STI.isRegisterReservedByUser(VA.getLocReg())) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv.ll new file mode 100644 index 0000000..b5041b6 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv.ll @@ -0,0 +1,1170 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv64 -mattr=+d,+experimental-zfh,+experimental-v -verify-machineinstrs -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=8 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX8 +; RUN: llc -mtriple=riscv64 -mattr=+d,+experimental-zfh,+experimental-v -verify-machineinstrs -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=4 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX4 +; RUN: llc -mtriple=riscv64 -mattr=+d,+experimental-zfh,+experimental-v -verify-machineinstrs -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX2 +; RUN: llc -mtriple=riscv64 -mattr=+d,+experimental-zfh,+experimental-v -verify-machineinstrs -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX1 + +define <4 x i8> @ret_v4i8(<4 x i8>* %p) { +; CHECK-LABEL: ret_v4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli a1, 4, e8,m1,ta,mu +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: ret + %v = load <4 x i8>, <4 x i8>* %p + ret <4 x i8> %v +} + +define <4 x i32> @ret_v4i32(<4 x i32>* %p) { +; CHECK-LABEL: ret_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli a1, 4, e32,m1,ta,mu +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: ret + %v = load <4 x i32>, <4 x i32>* %p + ret <4 x i32> %v +} + +define <8 x i32> @ret_v8i32(<8 x i32>* %p) { +; LMULMAX8-LABEL: ret_v8i32: +; LMULMAX8: # %bb.0: +; LMULMAX8-NEXT: vsetivli a1, 8, e32,m2,ta,mu +; LMULMAX8-NEXT: vle32.v v8, (a0) +; LMULMAX8-NEXT: ret +; +; LMULMAX4-LABEL: ret_v8i32: +; LMULMAX4: # %bb.0: +; LMULMAX4-NEXT: vsetivli a1, 8, e32,m2,ta,mu 
+; LMULMAX4-NEXT: vle32.v v8, (a0) +; LMULMAX4-NEXT: ret +; +; LMULMAX2-LABEL: ret_v8i32: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: vsetivli a1, 8, e32,m2,ta,mu +; LMULMAX2-NEXT: vle32.v v8, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-LABEL: ret_v8i32: +; LMULMAX1: # %bb.0: +; LMULMAX1-NEXT: vsetivli a1, 4, e32,m1,ta,mu +; LMULMAX1-NEXT: vle32.v v8, (a0) +; LMULMAX1-NEXT: addi a0, a0, 16 +; LMULMAX1-NEXT: vle32.v v9, (a0) +; LMULMAX1-NEXT: ret + %v = load <8 x i32>, <8 x i32>* %p + ret <8 x i32> %v +} + +define <16 x i64> @ret_v16i64(<16 x i64>* %p) { +; LMULMAX8-LABEL: ret_v16i64: +; LMULMAX8: # %bb.0: +; LMULMAX8-NEXT: vsetivli a1, 16, e64,m8,ta,mu +; LMULMAX8-NEXT: vle64.v v8, (a0) +; LMULMAX8-NEXT: ret +; +; LMULMAX4-LABEL: ret_v16i64: +; LMULMAX4: # %bb.0: +; LMULMAX4-NEXT: vsetivli a1, 8, e64,m4,ta,mu +; LMULMAX4-NEXT: vle64.v v8, (a0) +; LMULMAX4-NEXT: addi a0, a0, 64 +; LMULMAX4-NEXT: vle64.v v12, (a0) +; LMULMAX4-NEXT: ret +; +; LMULMAX2-LABEL: ret_v16i64: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: vsetivli a1, 4, e64,m2,ta,mu +; LMULMAX2-NEXT: vle64.v v8, (a0) +; LMULMAX2-NEXT: addi a1, a0, 32 +; LMULMAX2-NEXT: vle64.v v10, (a1) +; LMULMAX2-NEXT: addi a1, a0, 64 +; LMULMAX2-NEXT: vle64.v v12, (a1) +; LMULMAX2-NEXT: addi a0, a0, 96 +; LMULMAX2-NEXT: vle64.v v14, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-LABEL: ret_v16i64: +; LMULMAX1: # %bb.0: +; LMULMAX1-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; LMULMAX1-NEXT: vle64.v v8, (a0) +; LMULMAX1-NEXT: addi a1, a0, 16 +; LMULMAX1-NEXT: vle64.v v9, (a1) +; LMULMAX1-NEXT: addi a1, a0, 32 +; LMULMAX1-NEXT: vle64.v v10, (a1) +; LMULMAX1-NEXT: addi a1, a0, 48 +; LMULMAX1-NEXT: vle64.v v11, (a1) +; LMULMAX1-NEXT: addi a1, a0, 64 +; LMULMAX1-NEXT: vle64.v v12, (a1) +; LMULMAX1-NEXT: addi a1, a0, 80 +; LMULMAX1-NEXT: vle64.v v13, (a1) +; LMULMAX1-NEXT: addi a1, a0, 96 +; LMULMAX1-NEXT: vle64.v v14, (a1) +; LMULMAX1-NEXT: addi a0, a0, 112 +; LMULMAX1-NEXT: vle64.v v15, (a0) +; LMULMAX1-NEXT: ret + %v = load <16 x i64>, <16 x i64>* %p + ret <16 x i64> %v +} + +define <8 x i1> @ret_mask_v8i1(<8 x i1>* %p) { +; CHECK-LABEL: ret_mask_v8i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli a1, 8, e8,m1,ta,mu +; CHECK-NEXT: vle1.v v0, (a0) +; CHECK-NEXT: ret + %v = load <8 x i1>, <8 x i1>* %p + ret <8 x i1> %v +} + +define <32 x i1> @ret_mask_v32i1(<32 x i1>* %p) { +; LMULMAX8-LABEL: ret_mask_v32i1: +; LMULMAX8: # %bb.0: +; LMULMAX8-NEXT: addi a1, zero, 32 +; LMULMAX8-NEXT: vsetvli a1, a1, e8,m2,ta,mu +; LMULMAX8-NEXT: vle1.v v0, (a0) +; LMULMAX8-NEXT: ret +; +; LMULMAX4-LABEL: ret_mask_v32i1: +; LMULMAX4: # %bb.0: +; LMULMAX4-NEXT: addi a1, zero, 32 +; LMULMAX4-NEXT: vsetvli a1, a1, e8,m2,ta,mu +; LMULMAX4-NEXT: vle1.v v0, (a0) +; LMULMAX4-NEXT: ret +; +; LMULMAX2-LABEL: ret_mask_v32i1: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi a1, zero, 32 +; LMULMAX2-NEXT: vsetvli a1, a1, e8,m2,ta,mu +; LMULMAX2-NEXT: vle1.v v0, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-LABEL: ret_mask_v32i1: +; LMULMAX1: # %bb.0: +; LMULMAX1-NEXT: vsetivli a1, 16, e8,m1,ta,mu +; LMULMAX1-NEXT: vle1.v v0, (a0) +; LMULMAX1-NEXT: addi a0, a0, 2 +; LMULMAX1-NEXT: vle1.v v8, (a0) +; LMULMAX1-NEXT: ret + %v = load <32 x i1>, <32 x i1>* %p + ret <32 x i1> %v +} + +; Return the vector via registers v8-v23 +define <64 x i32> @ret_split_v64i32(<64 x i32>* %x) { +; LMULMAX8-LABEL: ret_split_v64i32: +; LMULMAX8: # %bb.0: +; LMULMAX8-NEXT: addi a1, zero, 32 +; LMULMAX8-NEXT: vsetvli a1, a1, e32,m8,ta,mu +; LMULMAX8-NEXT: vle32.v v8, (a0) +; LMULMAX8-NEXT: addi a0, a0, 128 +; LMULMAX8-NEXT: vle32.v v16, (a0) 
+; LMULMAX8-NEXT: ret +; +; LMULMAX4-LABEL: ret_split_v64i32: +; LMULMAX4: # %bb.0: +; LMULMAX4-NEXT: vsetivli a1, 16, e32,m4,ta,mu +; LMULMAX4-NEXT: vle32.v v8, (a0) +; LMULMAX4-NEXT: addi a1, a0, 64 +; LMULMAX4-NEXT: vle32.v v12, (a1) +; LMULMAX4-NEXT: addi a1, a0, 128 +; LMULMAX4-NEXT: vle32.v v16, (a1) +; LMULMAX4-NEXT: addi a0, a0, 192 +; LMULMAX4-NEXT: vle32.v v20, (a0) +; LMULMAX4-NEXT: ret +; +; LMULMAX2-LABEL: ret_split_v64i32: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: vsetivli a1, 8, e32,m2,ta,mu +; LMULMAX2-NEXT: vle32.v v8, (a0) +; LMULMAX2-NEXT: addi a1, a0, 32 +; LMULMAX2-NEXT: vle32.v v10, (a1) +; LMULMAX2-NEXT: addi a1, a0, 64 +; LMULMAX2-NEXT: vle32.v v12, (a1) +; LMULMAX2-NEXT: addi a1, a0, 96 +; LMULMAX2-NEXT: vle32.v v14, (a1) +; LMULMAX2-NEXT: addi a1, a0, 128 +; LMULMAX2-NEXT: vle32.v v16, (a1) +; LMULMAX2-NEXT: addi a1, a0, 160 +; LMULMAX2-NEXT: vle32.v v18, (a1) +; LMULMAX2-NEXT: addi a1, a0, 192 +; LMULMAX2-NEXT: vle32.v v20, (a1) +; LMULMAX2-NEXT: addi a0, a0, 224 +; LMULMAX2-NEXT: vle32.v v22, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-LABEL: ret_split_v64i32: +; LMULMAX1: # %bb.0: +; LMULMAX1-NEXT: vsetivli a1, 4, e32,m1,ta,mu +; LMULMAX1-NEXT: vle32.v v8, (a0) +; LMULMAX1-NEXT: addi a1, a0, 16 +; LMULMAX1-NEXT: vle32.v v9, (a1) +; LMULMAX1-NEXT: addi a1, a0, 32 +; LMULMAX1-NEXT: vle32.v v10, (a1) +; LMULMAX1-NEXT: addi a1, a0, 48 +; LMULMAX1-NEXT: vle32.v v11, (a1) +; LMULMAX1-NEXT: addi a1, a0, 64 +; LMULMAX1-NEXT: vle32.v v12, (a1) +; LMULMAX1-NEXT: addi a1, a0, 80 +; LMULMAX1-NEXT: vle32.v v13, (a1) +; LMULMAX1-NEXT: addi a1, a0, 96 +; LMULMAX1-NEXT: vle32.v v14, (a1) +; LMULMAX1-NEXT: addi a1, a0, 112 +; LMULMAX1-NEXT: vle32.v v15, (a1) +; LMULMAX1-NEXT: addi a1, a0, 128 +; LMULMAX1-NEXT: vle32.v v16, (a1) +; LMULMAX1-NEXT: addi a1, a0, 144 +; LMULMAX1-NEXT: vle32.v v17, (a1) +; LMULMAX1-NEXT: addi a1, a0, 160 +; LMULMAX1-NEXT: vle32.v v18, (a1) +; LMULMAX1-NEXT: addi a1, a0, 176 +; LMULMAX1-NEXT: vle32.v v19, (a1) +; LMULMAX1-NEXT: addi a1, a0, 192 +; LMULMAX1-NEXT: vle32.v v20, (a1) +; LMULMAX1-NEXT: addi a1, a0, 208 +; LMULMAX1-NEXT: vle32.v v21, (a1) +; LMULMAX1-NEXT: addi a1, a0, 224 +; LMULMAX1-NEXT: vle32.v v22, (a1) +; LMULMAX1-NEXT: addi a0, a0, 240 +; LMULMAX1-NEXT: vle32.v v23, (a0) +; LMULMAX1-NEXT: ret + %v = load <64 x i32>, <64 x i32>* %x + ret <64 x i32> %v +} + +; Return the vector fully via the stack +define <128 x i32> @ret_split_v128i32(<128 x i32>* %x) { +; LMULMAX8-LABEL: ret_split_v128i32: +; LMULMAX8: # %bb.0: +; LMULMAX8-NEXT: addi a2, zero, 32 +; LMULMAX8-NEXT: vsetvli a2, a2, e32,m8,ta,mu +; LMULMAX8-NEXT: vle32.v v8, (a1) +; LMULMAX8-NEXT: addi a2, a1, 128 +; LMULMAX8-NEXT: vle32.v v16, (a2) +; LMULMAX8-NEXT: addi a2, a1, 384 +; LMULMAX8-NEXT: vle32.v v24, (a2) +; LMULMAX8-NEXT: addi a1, a1, 256 +; LMULMAX8-NEXT: vle32.v v0, (a1) +; LMULMAX8-NEXT: addi a1, a0, 384 +; LMULMAX8-NEXT: vse32.v v24, (a1) +; LMULMAX8-NEXT: addi a1, a0, 256 +; LMULMAX8-NEXT: vse32.v v0, (a1) +; LMULMAX8-NEXT: addi a1, a0, 128 +; LMULMAX8-NEXT: vse32.v v16, (a1) +; LMULMAX8-NEXT: vse32.v v8, (a0) +; LMULMAX8-NEXT: ret +; +; LMULMAX4-LABEL: ret_split_v128i32: +; LMULMAX4: # %bb.0: +; LMULMAX4-NEXT: vsetivli a2, 16, e32,m4,ta,mu +; LMULMAX4-NEXT: vle32.v v28, (a1) +; LMULMAX4-NEXT: addi a2, a1, 64 +; LMULMAX4-NEXT: vle32.v v8, (a2) +; LMULMAX4-NEXT: addi a2, a1, 128 +; LMULMAX4-NEXT: vle32.v v12, (a2) +; LMULMAX4-NEXT: addi a2, a1, 192 +; LMULMAX4-NEXT: vle32.v v16, (a2) +; LMULMAX4-NEXT: addi a2, a1, 256 +; LMULMAX4-NEXT: vle32.v v20, (a2) +; LMULMAX4-NEXT: 
addi a2, a1, 320 +; LMULMAX4-NEXT: vle32.v v24, (a2) +; LMULMAX4-NEXT: addi a2, a1, 448 +; LMULMAX4-NEXT: vle32.v v0, (a2) +; LMULMAX4-NEXT: addi a1, a1, 384 +; LMULMAX4-NEXT: vle32.v v4, (a1) +; LMULMAX4-NEXT: addi a1, a0, 448 +; LMULMAX4-NEXT: vse32.v v0, (a1) +; LMULMAX4-NEXT: addi a1, a0, 384 +; LMULMAX4-NEXT: vse32.v v4, (a1) +; LMULMAX4-NEXT: addi a1, a0, 320 +; LMULMAX4-NEXT: vse32.v v24, (a1) +; LMULMAX4-NEXT: addi a1, a0, 256 +; LMULMAX4-NEXT: vse32.v v20, (a1) +; LMULMAX4-NEXT: addi a1, a0, 192 +; LMULMAX4-NEXT: vse32.v v16, (a1) +; LMULMAX4-NEXT: addi a1, a0, 128 +; LMULMAX4-NEXT: vse32.v v12, (a1) +; LMULMAX4-NEXT: addi a1, a0, 64 +; LMULMAX4-NEXT: vse32.v v8, (a1) +; LMULMAX4-NEXT: vse32.v v28, (a0) +; LMULMAX4-NEXT: ret +; +; LMULMAX2-LABEL: ret_split_v128i32: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: vsetivli a2, 8, e32,m2,ta,mu +; LMULMAX2-NEXT: vle32.v v26, (a1) +; LMULMAX2-NEXT: addi a2, a1, 32 +; LMULMAX2-NEXT: vle32.v v28, (a2) +; LMULMAX2-NEXT: addi a2, a1, 64 +; LMULMAX2-NEXT: vle32.v v30, (a2) +; LMULMAX2-NEXT: addi a2, a1, 96 +; LMULMAX2-NEXT: vle32.v v8, (a2) +; LMULMAX2-NEXT: addi a2, a1, 128 +; LMULMAX2-NEXT: vle32.v v10, (a2) +; LMULMAX2-NEXT: addi a2, a1, 160 +; LMULMAX2-NEXT: vle32.v v12, (a2) +; LMULMAX2-NEXT: addi a2, a1, 192 +; LMULMAX2-NEXT: vle32.v v14, (a2) +; LMULMAX2-NEXT: addi a2, a1, 224 +; LMULMAX2-NEXT: vle32.v v16, (a2) +; LMULMAX2-NEXT: addi a2, a1, 256 +; LMULMAX2-NEXT: vle32.v v18, (a2) +; LMULMAX2-NEXT: addi a2, a1, 288 +; LMULMAX2-NEXT: vle32.v v20, (a2) +; LMULMAX2-NEXT: addi a2, a1, 320 +; LMULMAX2-NEXT: vle32.v v22, (a2) +; LMULMAX2-NEXT: addi a2, a1, 352 +; LMULMAX2-NEXT: vle32.v v24, (a2) +; LMULMAX2-NEXT: addi a2, a1, 384 +; LMULMAX2-NEXT: vle32.v v0, (a2) +; LMULMAX2-NEXT: addi a2, a1, 416 +; LMULMAX2-NEXT: vle32.v v2, (a2) +; LMULMAX2-NEXT: addi a2, a1, 480 +; LMULMAX2-NEXT: vle32.v v4, (a2) +; LMULMAX2-NEXT: addi a1, a1, 448 +; LMULMAX2-NEXT: vle32.v v6, (a1) +; LMULMAX2-NEXT: addi a1, a0, 480 +; LMULMAX2-NEXT: vse32.v v4, (a1) +; LMULMAX2-NEXT: addi a1, a0, 448 +; LMULMAX2-NEXT: vse32.v v6, (a1) +; LMULMAX2-NEXT: addi a1, a0, 416 +; LMULMAX2-NEXT: vse32.v v2, (a1) +; LMULMAX2-NEXT: addi a1, a0, 384 +; LMULMAX2-NEXT: vse32.v v0, (a1) +; LMULMAX2-NEXT: addi a1, a0, 352 +; LMULMAX2-NEXT: vse32.v v24, (a1) +; LMULMAX2-NEXT: addi a1, a0, 320 +; LMULMAX2-NEXT: vse32.v v22, (a1) +; LMULMAX2-NEXT: addi a1, a0, 288 +; LMULMAX2-NEXT: vse32.v v20, (a1) +; LMULMAX2-NEXT: addi a1, a0, 256 +; LMULMAX2-NEXT: vse32.v v18, (a1) +; LMULMAX2-NEXT: addi a1, a0, 224 +; LMULMAX2-NEXT: vse32.v v16, (a1) +; LMULMAX2-NEXT: addi a1, a0, 192 +; LMULMAX2-NEXT: vse32.v v14, (a1) +; LMULMAX2-NEXT: addi a1, a0, 160 +; LMULMAX2-NEXT: vse32.v v12, (a1) +; LMULMAX2-NEXT: addi a1, a0, 128 +; LMULMAX2-NEXT: vse32.v v10, (a1) +; LMULMAX2-NEXT: addi a1, a0, 96 +; LMULMAX2-NEXT: vse32.v v8, (a1) +; LMULMAX2-NEXT: addi a1, a0, 64 +; LMULMAX2-NEXT: vse32.v v30, (a1) +; LMULMAX2-NEXT: addi a1, a0, 32 +; LMULMAX2-NEXT: vse32.v v28, (a1) +; LMULMAX2-NEXT: vse32.v v26, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-LABEL: ret_split_v128i32: +; LMULMAX1: # %bb.0: +; LMULMAX1-NEXT: vsetivli a2, 4, e32,m1,ta,mu +; LMULMAX1-NEXT: vle32.v v25, (a1) +; LMULMAX1-NEXT: addi a2, a1, 16 +; LMULMAX1-NEXT: vle32.v v26, (a2) +; LMULMAX1-NEXT: addi a2, a1, 32 +; LMULMAX1-NEXT: vle32.v v27, (a2) +; LMULMAX1-NEXT: addi a2, a1, 48 +; LMULMAX1-NEXT: vle32.v v28, (a2) +; LMULMAX1-NEXT: addi a2, a1, 64 +; LMULMAX1-NEXT: vle32.v v29, (a2) +; LMULMAX1-NEXT: addi a2, a1, 80 +; LMULMAX1-NEXT: vle32.v v30, 
(a2) +; LMULMAX1-NEXT: addi a2, a1, 96 +; LMULMAX1-NEXT: vle32.v v31, (a2) +; LMULMAX1-NEXT: addi a2, a1, 112 +; LMULMAX1-NEXT: vle32.v v8, (a2) +; LMULMAX1-NEXT: addi a2, a1, 128 +; LMULMAX1-NEXT: vle32.v v9, (a2) +; LMULMAX1-NEXT: addi a2, a1, 144 +; LMULMAX1-NEXT: vle32.v v10, (a2) +; LMULMAX1-NEXT: addi a2, a1, 160 +; LMULMAX1-NEXT: vle32.v v11, (a2) +; LMULMAX1-NEXT: addi a2, a1, 176 +; LMULMAX1-NEXT: vle32.v v12, (a2) +; LMULMAX1-NEXT: addi a2, a1, 192 +; LMULMAX1-NEXT: vle32.v v13, (a2) +; LMULMAX1-NEXT: addi a2, a1, 208 +; LMULMAX1-NEXT: vle32.v v14, (a2) +; LMULMAX1-NEXT: addi a2, a1, 224 +; LMULMAX1-NEXT: vle32.v v15, (a2) +; LMULMAX1-NEXT: addi a2, a1, 240 +; LMULMAX1-NEXT: vle32.v v16, (a2) +; LMULMAX1-NEXT: addi a2, a1, 256 +; LMULMAX1-NEXT: vle32.v v17, (a2) +; LMULMAX1-NEXT: addi a2, a1, 272 +; LMULMAX1-NEXT: vle32.v v18, (a2) +; LMULMAX1-NEXT: addi a2, a1, 288 +; LMULMAX1-NEXT: vle32.v v19, (a2) +; LMULMAX1-NEXT: addi a2, a1, 304 +; LMULMAX1-NEXT: vle32.v v20, (a2) +; LMULMAX1-NEXT: addi a2, a1, 320 +; LMULMAX1-NEXT: vle32.v v21, (a2) +; LMULMAX1-NEXT: addi a2, a1, 336 +; LMULMAX1-NEXT: vle32.v v22, (a2) +; LMULMAX1-NEXT: addi a2, a1, 352 +; LMULMAX1-NEXT: vle32.v v23, (a2) +; LMULMAX1-NEXT: addi a2, a1, 368 +; LMULMAX1-NEXT: vle32.v v24, (a2) +; LMULMAX1-NEXT: addi a2, a1, 384 +; LMULMAX1-NEXT: vle32.v v0, (a2) +; LMULMAX1-NEXT: addi a2, a1, 400 +; LMULMAX1-NEXT: vle32.v v1, (a2) +; LMULMAX1-NEXT: addi a2, a1, 416 +; LMULMAX1-NEXT: vle32.v v2, (a2) +; LMULMAX1-NEXT: addi a2, a1, 432 +; LMULMAX1-NEXT: vle32.v v3, (a2) +; LMULMAX1-NEXT: addi a2, a1, 448 +; LMULMAX1-NEXT: vle32.v v4, (a2) +; LMULMAX1-NEXT: addi a2, a1, 464 +; LMULMAX1-NEXT: vle32.v v5, (a2) +; LMULMAX1-NEXT: addi a2, a1, 496 +; LMULMAX1-NEXT: vle32.v v6, (a2) +; LMULMAX1-NEXT: addi a1, a1, 480 +; LMULMAX1-NEXT: vle32.v v7, (a1) +; LMULMAX1-NEXT: addi a1, a0, 496 +; LMULMAX1-NEXT: vse32.v v6, (a1) +; LMULMAX1-NEXT: addi a1, a0, 480 +; LMULMAX1-NEXT: vse32.v v7, (a1) +; LMULMAX1-NEXT: addi a1, a0, 464 +; LMULMAX1-NEXT: vse32.v v5, (a1) +; LMULMAX1-NEXT: addi a1, a0, 448 +; LMULMAX1-NEXT: vse32.v v4, (a1) +; LMULMAX1-NEXT: addi a1, a0, 432 +; LMULMAX1-NEXT: vse32.v v3, (a1) +; LMULMAX1-NEXT: addi a1, a0, 416 +; LMULMAX1-NEXT: vse32.v v2, (a1) +; LMULMAX1-NEXT: addi a1, a0, 400 +; LMULMAX1-NEXT: vse32.v v1, (a1) +; LMULMAX1-NEXT: addi a1, a0, 384 +; LMULMAX1-NEXT: vse32.v v0, (a1) +; LMULMAX1-NEXT: addi a1, a0, 368 +; LMULMAX1-NEXT: vse32.v v24, (a1) +; LMULMAX1-NEXT: addi a1, a0, 352 +; LMULMAX1-NEXT: vse32.v v23, (a1) +; LMULMAX1-NEXT: addi a1, a0, 336 +; LMULMAX1-NEXT: vse32.v v22, (a1) +; LMULMAX1-NEXT: addi a1, a0, 320 +; LMULMAX1-NEXT: vse32.v v21, (a1) +; LMULMAX1-NEXT: addi a1, a0, 304 +; LMULMAX1-NEXT: vse32.v v20, (a1) +; LMULMAX1-NEXT: addi a1, a0, 288 +; LMULMAX1-NEXT: vse32.v v19, (a1) +; LMULMAX1-NEXT: addi a1, a0, 272 +; LMULMAX1-NEXT: vse32.v v18, (a1) +; LMULMAX1-NEXT: addi a1, a0, 256 +; LMULMAX1-NEXT: vse32.v v17, (a1) +; LMULMAX1-NEXT: addi a1, a0, 240 +; LMULMAX1-NEXT: vse32.v v16, (a1) +; LMULMAX1-NEXT: addi a1, a0, 224 +; LMULMAX1-NEXT: vse32.v v15, (a1) +; LMULMAX1-NEXT: addi a1, a0, 208 +; LMULMAX1-NEXT: vse32.v v14, (a1) +; LMULMAX1-NEXT: addi a1, a0, 192 +; LMULMAX1-NEXT: vse32.v v13, (a1) +; LMULMAX1-NEXT: addi a1, a0, 176 +; LMULMAX1-NEXT: vse32.v v12, (a1) +; LMULMAX1-NEXT: addi a1, a0, 160 +; LMULMAX1-NEXT: vse32.v v11, (a1) +; LMULMAX1-NEXT: addi a1, a0, 144 +; LMULMAX1-NEXT: vse32.v v10, (a1) +; LMULMAX1-NEXT: addi a1, a0, 128 +; LMULMAX1-NEXT: vse32.v v9, (a1) +; LMULMAX1-NEXT: 
addi a1, a0, 112 +; LMULMAX1-NEXT: vse32.v v8, (a1) +; LMULMAX1-NEXT: addi a1, a0, 96 +; LMULMAX1-NEXT: vse32.v v31, (a1) +; LMULMAX1-NEXT: addi a1, a0, 80 +; LMULMAX1-NEXT: vse32.v v30, (a1) +; LMULMAX1-NEXT: addi a1, a0, 64 +; LMULMAX1-NEXT: vse32.v v29, (a1) +; LMULMAX1-NEXT: addi a1, a0, 48 +; LMULMAX1-NEXT: vse32.v v28, (a1) +; LMULMAX1-NEXT: addi a1, a0, 32 +; LMULMAX1-NEXT: vse32.v v27, (a1) +; LMULMAX1-NEXT: addi a1, a0, 16 +; LMULMAX1-NEXT: vse32.v v26, (a1) +; LMULMAX1-NEXT: vse32.v v25, (a0) +; LMULMAX1-NEXT: ret + %v = load <128 x i32>, <128 x i32>* %x + ret <128 x i32> %v +} + +define <4 x i8> @ret_v8i8_param_v4i8(<4 x i8> %v) { +; CHECK-LABEL: ret_v8i8_param_v4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli a0, 4, e8,m1,ta,mu +; CHECK-NEXT: vadd.vi v8, v8, 2 +; CHECK-NEXT: ret + %r = add <4 x i8> %v, + ret <4 x i8> %r +} + +define <4 x i8> @ret_v4i8_param_v4i8_v4i8(<4 x i8> %v, <4 x i8> %w) { +; CHECK-LABEL: ret_v4i8_param_v4i8_v4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli a0, 4, e8,m1,ta,mu +; CHECK-NEXT: vadd.vv v8, v8, v9 +; CHECK-NEXT: ret + %r = add <4 x i8> %v, %w + ret <4 x i8> %r +} + +define <4 x i64> @ret_v4i64_param_v4i64_v4i64(<4 x i64> %v, <4 x i64> %w) { +; LMULMAX8-LABEL: ret_v4i64_param_v4i64_v4i64: +; LMULMAX8: # %bb.0: +; LMULMAX8-NEXT: vsetivli a0, 4, e64,m2,ta,mu +; LMULMAX8-NEXT: vadd.vv v8, v8, v10 +; LMULMAX8-NEXT: ret +; +; LMULMAX4-LABEL: ret_v4i64_param_v4i64_v4i64: +; LMULMAX4: # %bb.0: +; LMULMAX4-NEXT: vsetivli a0, 4, e64,m2,ta,mu +; LMULMAX4-NEXT: vadd.vv v8, v8, v10 +; LMULMAX4-NEXT: ret +; +; LMULMAX2-LABEL: ret_v4i64_param_v4i64_v4i64: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: vsetivli a0, 4, e64,m2,ta,mu +; LMULMAX2-NEXT: vadd.vv v8, v8, v10 +; LMULMAX2-NEXT: ret +; +; LMULMAX1-LABEL: ret_v4i64_param_v4i64_v4i64: +; LMULMAX1: # %bb.0: +; LMULMAX1-NEXT: vsetivli a0, 2, e64,m1,ta,mu +; LMULMAX1-NEXT: vadd.vv v8, v8, v10 +; LMULMAX1-NEXT: vadd.vv v9, v9, v11 +; LMULMAX1-NEXT: ret + %r = add <4 x i64> %v, %w + ret <4 x i64> %r +} + +define <8 x i1> @ret_v8i1_param_v8i1_v8i1(<8 x i1> %v, <8 x i1> %w) { +; CHECK-LABEL: ret_v8i1_param_v8i1_v8i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli a0, 8, e8,m1,ta,mu +; CHECK-NEXT: vmxor.mm v0, v0, v8 +; CHECK-NEXT: ret + %r = xor <8 x i1> %v, %w + ret <8 x i1> %r +} + +define <32 x i1> @ret_v32i1_param_v32i1_v32i1(<32 x i1> %v, <32 x i1> %w) { +; LMULMAX8-LABEL: ret_v32i1_param_v32i1_v32i1: +; LMULMAX8: # %bb.0: +; LMULMAX8-NEXT: addi a0, zero, 32 +; LMULMAX8-NEXT: vsetvli a0, a0, e8,m2,ta,mu +; LMULMAX8-NEXT: vmand.mm v0, v0, v8 +; LMULMAX8-NEXT: ret +; +; LMULMAX4-LABEL: ret_v32i1_param_v32i1_v32i1: +; LMULMAX4: # %bb.0: +; LMULMAX4-NEXT: addi a0, zero, 32 +; LMULMAX4-NEXT: vsetvli a0, a0, e8,m2,ta,mu +; LMULMAX4-NEXT: vmand.mm v0, v0, v8 +; LMULMAX4-NEXT: ret +; +; LMULMAX2-LABEL: ret_v32i1_param_v32i1_v32i1: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi a0, zero, 32 +; LMULMAX2-NEXT: vsetvli a0, a0, e8,m2,ta,mu +; LMULMAX2-NEXT: vmand.mm v0, v0, v8 +; LMULMAX2-NEXT: ret +; +; LMULMAX1-LABEL: ret_v32i1_param_v32i1_v32i1: +; LMULMAX1: # %bb.0: +; LMULMAX1-NEXT: vsetivli a0, 16, e8,m1,ta,mu +; LMULMAX1-NEXT: vmand.mm v0, v0, v9 +; LMULMAX1-NEXT: vmand.mm v8, v8, v10 +; LMULMAX1-NEXT: ret + %r = and <32 x i1> %v, %w + ret <32 x i1> %r +} + +define <32 x i32> @ret_v32i32_param_v32i32_v32i32_v32i32_i32(<32 x i32> %x, <32 x i32> %y, <32 x i32> %z, i32 %w) { +; LMULMAX8-LABEL: ret_v32i32_param_v32i32_v32i32_v32i32_i32: +; LMULMAX8: # %bb.0: +; LMULMAX8-NEXT: addi a2, zero, 32 +; LMULMAX8-NEXT: vsetvli a2, a2, 
e32,m8,ta,mu +; LMULMAX8-NEXT: vle32.v v24, (a0) +; LMULMAX8-NEXT: vadd.vv v8, v8, v16 +; LMULMAX8-NEXT: vadd.vv v8, v8, v24 +; LMULMAX8-NEXT: vadd.vx v8, v8, a1 +; LMULMAX8-NEXT: ret +; +; LMULMAX4-LABEL: ret_v32i32_param_v32i32_v32i32_v32i32_i32: +; LMULMAX4: # %bb.0: +; LMULMAX4-NEXT: vsetivli a1, 16, e32,m4,ta,mu +; LMULMAX4-NEXT: addi a1, a0, 64 +; LMULMAX4-NEXT: vle32.v v28, (a1) +; LMULMAX4-NEXT: vle32.v v24, (a0) +; LMULMAX4-NEXT: vadd.vv v8, v8, v16 +; LMULMAX4-NEXT: vadd.vv v12, v12, v20 +; LMULMAX4-NEXT: vadd.vv v28, v12, v28 +; LMULMAX4-NEXT: vadd.vv v8, v8, v24 +; LMULMAX4-NEXT: vadd.vx v8, v8, a2 +; LMULMAX4-NEXT: vadd.vx v12, v28, a2 +; LMULMAX4-NEXT: ret +; +; LMULMAX2-LABEL: ret_v32i32_param_v32i32_v32i32_v32i32_i32: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: vsetivli a1, 8, e32,m2,ta,mu +; LMULMAX2-NEXT: vle32.v v26, (a0) +; LMULMAX2-NEXT: addi a1, a0, 32 +; LMULMAX2-NEXT: vle32.v v28, (a1) +; LMULMAX2-NEXT: addi a1, a0, 64 +; LMULMAX2-NEXT: vle32.v v30, (a1) +; LMULMAX2-NEXT: addi a0, a0, 96 +; LMULMAX2-NEXT: vle32.v v24, (a0) +; LMULMAX2-NEXT: vadd.vv v8, v8, v16 +; LMULMAX2-NEXT: vadd.vv v10, v10, v18 +; LMULMAX2-NEXT: vadd.vv v12, v12, v20 +; LMULMAX2-NEXT: vadd.vv v14, v14, v22 +; LMULMAX2-NEXT: vadd.vv v14, v14, v24 +; LMULMAX2-NEXT: vadd.vv v30, v12, v30 +; LMULMAX2-NEXT: vadd.vv v28, v10, v28 +; LMULMAX2-NEXT: vadd.vv v26, v8, v26 +; LMULMAX2-NEXT: vadd.vx v8, v26, a4 +; LMULMAX2-NEXT: vadd.vx v10, v28, a4 +; LMULMAX2-NEXT: vadd.vx v12, v30, a4 +; LMULMAX2-NEXT: vadd.vx v14, v14, a4 +; LMULMAX2-NEXT: ret +; +; LMULMAX1-LABEL: ret_v32i32_param_v32i32_v32i32_v32i32_i32: +; LMULMAX1: # %bb.0: +; LMULMAX1-NEXT: vsetivli a1, 4, e32,m1,ta,mu +; LMULMAX1-NEXT: vle32.v v25, (a0) +; LMULMAX1-NEXT: addi a1, a0, 16 +; LMULMAX1-NEXT: vle32.v v26, (a1) +; LMULMAX1-NEXT: addi a1, a0, 32 +; LMULMAX1-NEXT: vle32.v v27, (a1) +; LMULMAX1-NEXT: addi a1, a0, 48 +; LMULMAX1-NEXT: vle32.v v28, (a1) +; LMULMAX1-NEXT: addi a1, a0, 64 +; LMULMAX1-NEXT: vle32.v v29, (a1) +; LMULMAX1-NEXT: addi a1, a0, 80 +; LMULMAX1-NEXT: vle32.v v30, (a1) +; LMULMAX1-NEXT: addi a1, a0, 96 +; LMULMAX1-NEXT: vle32.v v31, (a1) +; LMULMAX1-NEXT: addi a0, a0, 112 +; LMULMAX1-NEXT: vle32.v v24, (a0) +; LMULMAX1-NEXT: lw a0, 0(sp) +; LMULMAX1-NEXT: vadd.vv v8, v8, v16 +; LMULMAX1-NEXT: vadd.vv v9, v9, v17 +; LMULMAX1-NEXT: vadd.vv v10, v10, v18 +; LMULMAX1-NEXT: vadd.vv v11, v11, v19 +; LMULMAX1-NEXT: vadd.vv v12, v12, v20 +; LMULMAX1-NEXT: vadd.vv v13, v13, v21 +; LMULMAX1-NEXT: vadd.vv v14, v14, v22 +; LMULMAX1-NEXT: vadd.vv v15, v15, v23 +; LMULMAX1-NEXT: vadd.vv v15, v15, v24 +; LMULMAX1-NEXT: vadd.vv v31, v14, v31 +; LMULMAX1-NEXT: vadd.vv v30, v13, v30 +; LMULMAX1-NEXT: vadd.vv v29, v12, v29 +; LMULMAX1-NEXT: vadd.vv v28, v11, v28 +; LMULMAX1-NEXT: vadd.vv v27, v10, v27 +; LMULMAX1-NEXT: vadd.vv v26, v9, v26 +; LMULMAX1-NEXT: vadd.vv v25, v8, v25 +; LMULMAX1-NEXT: vadd.vx v8, v25, a0 +; LMULMAX1-NEXT: vadd.vx v9, v26, a0 +; LMULMAX1-NEXT: vadd.vx v10, v27, a0 +; LMULMAX1-NEXT: vadd.vx v11, v28, a0 +; LMULMAX1-NEXT: vadd.vx v12, v29, a0 +; LMULMAX1-NEXT: vadd.vx v13, v30, a0 +; LMULMAX1-NEXT: vadd.vx v14, v31, a0 +; LMULMAX1-NEXT: vadd.vx v15, v15, a0 +; LMULMAX1-NEXT: ret + %r = add <32 x i32> %x, %y + %s = add <32 x i32> %r, %z + %head = insertelement <32 x i32> undef, i32 %w, i32 0 + %splat = shufflevector <32 x i32> %head, <32 x i32> undef, <32 x i32> zeroinitializer + %t = add <32 x i32> %s, %splat + ret <32 x i32> %t +} + +declare <32 x i32> @ext2(<32 x i32>, <32 x i32>, i32, i32) +declare <32 x i32> 
@ext3(<32 x i32>, <32 x i32>, <32 x i32>, i32, i32) + +define <32 x i32> @ret_v32i32_call_v32i32_v32i32_i32(<32 x i32> %x, <32 x i32> %y, i32 %w) { +; LMULMAX8-LABEL: ret_v32i32_call_v32i32_v32i32_i32: +; LMULMAX8: # %bb.0: +; LMULMAX8-NEXT: addi sp, sp, -16 +; LMULMAX8-NEXT: .cfi_def_cfa_offset 16 +; LMULMAX8-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; LMULMAX8-NEXT: .cfi_offset ra, -8 +; LMULMAX8-NEXT: vmv8r.v v24, v8 +; LMULMAX8-NEXT: addi a1, zero, 2 +; LMULMAX8-NEXT: vmv8r.v v8, v16 +; LMULMAX8-NEXT: vmv8r.v v16, v24 +; LMULMAX8-NEXT: call ext2@plt +; LMULMAX8-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; LMULMAX8-NEXT: addi sp, sp, 16 +; LMULMAX8-NEXT: ret +; +; LMULMAX4-LABEL: ret_v32i32_call_v32i32_v32i32_i32: +; LMULMAX4: # %bb.0: +; LMULMAX4-NEXT: addi sp, sp, -16 +; LMULMAX4-NEXT: .cfi_def_cfa_offset 16 +; LMULMAX4-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; LMULMAX4-NEXT: .cfi_offset ra, -8 +; LMULMAX4-NEXT: vmv4r.v v28, v12 +; LMULMAX4-NEXT: vmv4r.v v24, v8 +; LMULMAX4-NEXT: addi a1, zero, 2 +; LMULMAX4-NEXT: vmv4r.v v8, v16 +; LMULMAX4-NEXT: vmv4r.v v12, v20 +; LMULMAX4-NEXT: vmv4r.v v16, v24 +; LMULMAX4-NEXT: vmv4r.v v20, v28 +; LMULMAX4-NEXT: call ext2@plt +; LMULMAX4-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; LMULMAX4-NEXT: addi sp, sp, 16 +; LMULMAX4-NEXT: ret +; +; LMULMAX2-LABEL: ret_v32i32_call_v32i32_v32i32_i32: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi sp, sp, -16 +; LMULMAX2-NEXT: .cfi_def_cfa_offset 16 +; LMULMAX2-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; LMULMAX2-NEXT: .cfi_offset ra, -8 +; LMULMAX2-NEXT: vmv2r.v v26, v14 +; LMULMAX2-NEXT: vmv2r.v v28, v12 +; LMULMAX2-NEXT: vmv2r.v v30, v10 +; LMULMAX2-NEXT: vmv2r.v v24, v8 +; LMULMAX2-NEXT: addi a1, zero, 2 +; LMULMAX2-NEXT: vmv2r.v v8, v16 +; LMULMAX2-NEXT: vmv2r.v v10, v18 +; LMULMAX2-NEXT: vmv2r.v v12, v20 +; LMULMAX2-NEXT: vmv2r.v v14, v22 +; LMULMAX2-NEXT: vmv2r.v v16, v24 +; LMULMAX2-NEXT: vmv2r.v v18, v30 +; LMULMAX2-NEXT: vmv2r.v v20, v28 +; LMULMAX2-NEXT: vmv2r.v v22, v26 +; LMULMAX2-NEXT: call ext2@plt +; LMULMAX2-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; LMULMAX2-NEXT: addi sp, sp, 16 +; LMULMAX2-NEXT: ret +; +; LMULMAX1-LABEL: ret_v32i32_call_v32i32_v32i32_i32: +; LMULMAX1: # %bb.0: +; LMULMAX1-NEXT: addi sp, sp, -16 +; LMULMAX1-NEXT: .cfi_def_cfa_offset 16 +; LMULMAX1-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; LMULMAX1-NEXT: .cfi_offset ra, -8 +; LMULMAX1-NEXT: vmv1r.v v25, v15 +; LMULMAX1-NEXT: vmv1r.v v26, v14 +; LMULMAX1-NEXT: vmv1r.v v27, v13 +; LMULMAX1-NEXT: vmv1r.v v28, v12 +; LMULMAX1-NEXT: vmv1r.v v29, v11 +; LMULMAX1-NEXT: vmv1r.v v30, v10 +; LMULMAX1-NEXT: vmv1r.v v31, v9 +; LMULMAX1-NEXT: vmv1r.v v24, v8 +; LMULMAX1-NEXT: addi a1, zero, 2 +; LMULMAX1-NEXT: vmv1r.v v8, v16 +; LMULMAX1-NEXT: vmv1r.v v9, v17 +; LMULMAX1-NEXT: vmv1r.v v10, v18 +; LMULMAX1-NEXT: vmv1r.v v11, v19 +; LMULMAX1-NEXT: vmv1r.v v12, v20 +; LMULMAX1-NEXT: vmv1r.v v13, v21 +; LMULMAX1-NEXT: vmv1r.v v14, v22 +; LMULMAX1-NEXT: vmv1r.v v15, v23 +; LMULMAX1-NEXT: vmv1r.v v16, v24 +; LMULMAX1-NEXT: vmv1r.v v17, v31 +; LMULMAX1-NEXT: vmv1r.v v18, v30 +; LMULMAX1-NEXT: vmv1r.v v19, v29 +; LMULMAX1-NEXT: vmv1r.v v20, v28 +; LMULMAX1-NEXT: vmv1r.v v21, v27 +; LMULMAX1-NEXT: vmv1r.v v22, v26 +; LMULMAX1-NEXT: vmv1r.v v23, v25 +; LMULMAX1-NEXT: call ext2@plt +; LMULMAX1-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; LMULMAX1-NEXT: addi sp, sp, 16 +; LMULMAX1-NEXT: ret + %t = call <32 x i32> @ext2(<32 x i32> %y, <32 x i32> %x, i32 %w, i32 2) + ret <32 x i32> %t +} + +define <32 x i32> 
@ret_v32i32_call_v32i32_v32i32_v32i32_i32(<32 x i32> %x, <32 x i32> %y, <32 x i32> %z, i32 %w) { +; LMULMAX8-LABEL: ret_v32i32_call_v32i32_v32i32_v32i32_i32: +; LMULMAX8: # %bb.0: +; LMULMAX8-NEXT: addi sp, sp, -256 +; LMULMAX8-NEXT: .cfi_def_cfa_offset 256 +; LMULMAX8-NEXT: sd ra, 248(sp) # 8-byte Folded Spill +; LMULMAX8-NEXT: sd s0, 240(sp) # 8-byte Folded Spill +; LMULMAX8-NEXT: .cfi_offset ra, -8 +; LMULMAX8-NEXT: .cfi_offset s0, -16 +; LMULMAX8-NEXT: addi s0, sp, 256 +; LMULMAX8-NEXT: .cfi_def_cfa s0, 0 +; LMULMAX8-NEXT: andi sp, sp, -128 +; LMULMAX8-NEXT: addi a2, zero, 32 +; LMULMAX8-NEXT: vsetvli a2, a2, e32,m8,ta,mu +; LMULMAX8-NEXT: vle32.v v24, (a0) +; LMULMAX8-NEXT: mv a0, sp +; LMULMAX8-NEXT: addi a2, zero, 42 +; LMULMAX8-NEXT: vse32.v v8, (sp) +; LMULMAX8-NEXT: vmv8r.v v8, v24 +; LMULMAX8-NEXT: call ext3@plt +; LMULMAX8-NEXT: addi sp, s0, -256 +; LMULMAX8-NEXT: ld s0, 240(sp) # 8-byte Folded Reload +; LMULMAX8-NEXT: ld ra, 248(sp) # 8-byte Folded Reload +; LMULMAX8-NEXT: addi sp, sp, 256 +; LMULMAX8-NEXT: ret +; +; LMULMAX4-LABEL: ret_v32i32_call_v32i32_v32i32_v32i32_i32: +; LMULMAX4: # %bb.0: +; LMULMAX4-NEXT: addi sp, sp, -256 +; LMULMAX4-NEXT: .cfi_def_cfa_offset 256 +; LMULMAX4-NEXT: sd ra, 248(sp) # 8-byte Folded Spill +; LMULMAX4-NEXT: sd s0, 240(sp) # 8-byte Folded Spill +; LMULMAX4-NEXT: .cfi_offset ra, -8 +; LMULMAX4-NEXT: .cfi_offset s0, -16 +; LMULMAX4-NEXT: addi s0, sp, 256 +; LMULMAX4-NEXT: .cfi_def_cfa s0, 0 +; LMULMAX4-NEXT: andi sp, sp, -128 +; LMULMAX4-NEXT: vsetivli a1, 16, e32,m4,ta,mu +; LMULMAX4-NEXT: vle32.v v28, (a0) +; LMULMAX4-NEXT: addi a0, a0, 64 +; LMULMAX4-NEXT: vle32.v v24, (a0) +; LMULMAX4-NEXT: addi a0, sp, 64 +; LMULMAX4-NEXT: vse32.v v12, (a0) +; LMULMAX4-NEXT: mv a0, sp +; LMULMAX4-NEXT: addi a3, zero, 42 +; LMULMAX4-NEXT: vse32.v v8, (sp) +; LMULMAX4-NEXT: vmv4r.v v8, v28 +; LMULMAX4-NEXT: vmv4r.v v12, v24 +; LMULMAX4-NEXT: call ext3@plt +; LMULMAX4-NEXT: addi sp, s0, -256 +; LMULMAX4-NEXT: ld s0, 240(sp) # 8-byte Folded Reload +; LMULMAX4-NEXT: ld ra, 248(sp) # 8-byte Folded Reload +; LMULMAX4-NEXT: addi sp, sp, 256 +; LMULMAX4-NEXT: ret +; +; LMULMAX2-LABEL: ret_v32i32_call_v32i32_v32i32_v32i32_i32: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi sp, sp, -256 +; LMULMAX2-NEXT: .cfi_def_cfa_offset 256 +; LMULMAX2-NEXT: sd ra, 248(sp) # 8-byte Folded Spill +; LMULMAX2-NEXT: sd s0, 240(sp) # 8-byte Folded Spill +; LMULMAX2-NEXT: .cfi_offset ra, -8 +; LMULMAX2-NEXT: .cfi_offset s0, -16 +; LMULMAX2-NEXT: addi s0, sp, 256 +; LMULMAX2-NEXT: .cfi_def_cfa s0, 0 +; LMULMAX2-NEXT: andi sp, sp, -128 +; LMULMAX2-NEXT: vsetivli a1, 8, e32,m2,ta,mu +; LMULMAX2-NEXT: vle32.v v26, (a0) +; LMULMAX2-NEXT: addi a1, a0, 32 +; LMULMAX2-NEXT: vle32.v v28, (a1) +; LMULMAX2-NEXT: addi a1, a0, 64 +; LMULMAX2-NEXT: vle32.v v30, (a1) +; LMULMAX2-NEXT: addi a0, a0, 96 +; LMULMAX2-NEXT: vle32.v v24, (a0) +; LMULMAX2-NEXT: addi a0, sp, 96 +; LMULMAX2-NEXT: vse32.v v14, (a0) +; LMULMAX2-NEXT: addi a0, sp, 64 +; LMULMAX2-NEXT: vse32.v v12, (a0) +; LMULMAX2-NEXT: addi a0, sp, 32 +; LMULMAX2-NEXT: vse32.v v10, (a0) +; LMULMAX2-NEXT: mv a0, sp +; LMULMAX2-NEXT: addi a5, zero, 42 +; LMULMAX2-NEXT: vse32.v v8, (sp) +; LMULMAX2-NEXT: vmv2r.v v8, v26 +; LMULMAX2-NEXT: vmv2r.v v10, v28 +; LMULMAX2-NEXT: vmv2r.v v12, v30 +; LMULMAX2-NEXT: vmv2r.v v14, v24 +; LMULMAX2-NEXT: call ext3@plt +; LMULMAX2-NEXT: addi sp, s0, -256 +; LMULMAX2-NEXT: ld s0, 240(sp) # 8-byte Folded Reload +; LMULMAX2-NEXT: ld ra, 248(sp) # 8-byte Folded Reload +; LMULMAX2-NEXT: addi sp, sp, 256 +; 
LMULMAX2-NEXT: ret +; +; LMULMAX1-LABEL: ret_v32i32_call_v32i32_v32i32_v32i32_i32: +; LMULMAX1: # %bb.0: +; LMULMAX1-NEXT: addi sp, sp, -384 +; LMULMAX1-NEXT: .cfi_def_cfa_offset 384 +; LMULMAX1-NEXT: sd ra, 376(sp) # 8-byte Folded Spill +; LMULMAX1-NEXT: sd s0, 368(sp) # 8-byte Folded Spill +; LMULMAX1-NEXT: .cfi_offset ra, -8 +; LMULMAX1-NEXT: .cfi_offset s0, -16 +; LMULMAX1-NEXT: addi s0, sp, 384 +; LMULMAX1-NEXT: .cfi_def_cfa s0, 0 +; LMULMAX1-NEXT: andi sp, sp, -128 +; LMULMAX1-NEXT: vsetivli a1, 4, e32,m1,ta,mu +; LMULMAX1-NEXT: vle32.v v25, (a0) +; LMULMAX1-NEXT: addi a1, a0, 16 +; LMULMAX1-NEXT: vle32.v v26, (a1) +; LMULMAX1-NEXT: addi a1, a0, 32 +; LMULMAX1-NEXT: vle32.v v27, (a1) +; LMULMAX1-NEXT: addi a1, a0, 48 +; LMULMAX1-NEXT: vle32.v v28, (a1) +; LMULMAX1-NEXT: addi a1, a0, 64 +; LMULMAX1-NEXT: vle32.v v29, (a1) +; LMULMAX1-NEXT: addi a1, a0, 80 +; LMULMAX1-NEXT: vle32.v v30, (a1) +; LMULMAX1-NEXT: addi a1, a0, 96 +; LMULMAX1-NEXT: vle32.v v31, (a1) +; LMULMAX1-NEXT: addi a0, a0, 112 +; LMULMAX1-NEXT: vle32.v v24, (a0) +; LMULMAX1-NEXT: ld a1, 0(s0) +; LMULMAX1-NEXT: addi a0, sp, 240 +; LMULMAX1-NEXT: vse32.v v15, (a0) +; LMULMAX1-NEXT: addi a0, sp, 224 +; LMULMAX1-NEXT: vse32.v v14, (a0) +; LMULMAX1-NEXT: addi a0, sp, 208 +; LMULMAX1-NEXT: vse32.v v13, (a0) +; LMULMAX1-NEXT: addi a0, sp, 192 +; LMULMAX1-NEXT: vse32.v v12, (a0) +; LMULMAX1-NEXT: addi a0, sp, 176 +; LMULMAX1-NEXT: vse32.v v11, (a0) +; LMULMAX1-NEXT: addi a0, sp, 160 +; LMULMAX1-NEXT: vse32.v v10, (a0) +; LMULMAX1-NEXT: addi a0, sp, 144 +; LMULMAX1-NEXT: vse32.v v9, (a0) +; LMULMAX1-NEXT: addi a0, sp, 128 +; LMULMAX1-NEXT: vse32.v v8, (a0) +; LMULMAX1-NEXT: addi a0, zero, 42 +; LMULMAX1-NEXT: sd a0, 8(sp) +; LMULMAX1-NEXT: addi a0, sp, 128 +; LMULMAX1-NEXT: sd a1, 0(sp) +; LMULMAX1-NEXT: vmv1r.v v8, v25 +; LMULMAX1-NEXT: vmv1r.v v9, v26 +; LMULMAX1-NEXT: vmv1r.v v10, v27 +; LMULMAX1-NEXT: vmv1r.v v11, v28 +; LMULMAX1-NEXT: vmv1r.v v12, v29 +; LMULMAX1-NEXT: vmv1r.v v13, v30 +; LMULMAX1-NEXT: vmv1r.v v14, v31 +; LMULMAX1-NEXT: vmv1r.v v15, v24 +; LMULMAX1-NEXT: call ext3@plt +; LMULMAX1-NEXT: addi sp, s0, -384 +; LMULMAX1-NEXT: ld s0, 368(sp) # 8-byte Folded Reload +; LMULMAX1-NEXT: ld ra, 376(sp) # 8-byte Folded Reload +; LMULMAX1-NEXT: addi sp, sp, 384 +; LMULMAX1-NEXT: ret + %t = call <32 x i32> @ext3(<32 x i32> %z, <32 x i32> %y, <32 x i32> %x, i32 %w, i32 42) + ret <32 x i32> %t +} + +; Test various configurations of split vector types where the values are split +; across both registers and the stack. 
+; LMUL8: Ins: v8,v9,v10,v11,v12, v16m8 y[0:31], a0+0 z[0:31] +; LMUL4: Ins: v8,v9,v10,v11,v12, v16m4 y[0:15], v20m4 y[16:31], a0+0 z[0:15], +; a0+64 z[16:31] +; LMUL2: Ins: v8,v9,v10,v11,v12, v14m2 y[0:7], v16m2 y[8:15], v18m2 y[16:23], +; v20m2 y[24:31], v22m2 z[0:7], a1+0 z[8:15], a1+32 z[16:23], +; a1+64 z[24:31] +; LMUL1: Ins: v8,v9,v10,v11,v12, v13 y[0:3], v14 y[4:7], v15 y[8:11], +; v16 y[12:15], v17 y[16:19], v18 y[20:23], v19 y[24:27], +; v20 y[28:31], v21 z[0:3], v22 z[4:7], v23 z[8:11], +; a1+0 z[12:15], a1+16 z[16:19], a1+32 z[20:23], a1+48 z[24:27], +; a1+64 z[28:31] +define <32 x i32> @split_vector_args(<2 x i32>,<2 x i32>,<2 x i32>,<2 x i32>,<2 x i32>, <32 x i32> %y, <32 x i32> %z) { +; LMULMAX8-LABEL: split_vector_args: +; LMULMAX8: # %bb.0: +; LMULMAX8-NEXT: addi a1, zero, 32 +; LMULMAX8-NEXT: vsetvli a1, a1, e32,m8,ta,mu +; LMULMAX8-NEXT: vle32.v v8, (a0) +; LMULMAX8-NEXT: vadd.vv v8, v16, v8 +; LMULMAX8-NEXT: ret +; +; LMULMAX4-LABEL: split_vector_args: +; LMULMAX4: # %bb.0: +; LMULMAX4-NEXT: addi a1, a0, 64 +; LMULMAX4-NEXT: vsetivli a2, 16, e32,m4,ta,mu +; LMULMAX4-NEXT: vle32.v v28, (a0) +; LMULMAX4-NEXT: vle32.v v12, (a1) +; LMULMAX4-NEXT: vadd.vv v8, v16, v28 +; LMULMAX4-NEXT: vadd.vv v12, v20, v12 +; LMULMAX4-NEXT: ret +; +; LMULMAX2-LABEL: split_vector_args: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi a1, a0, 64 +; LMULMAX2-NEXT: vsetivli a2, 8, e32,m2,ta,mu +; LMULMAX2-NEXT: addi a2, a0, 32 +; LMULMAX2-NEXT: vle32.v v26, (a0) +; LMULMAX2-NEXT: vle32.v v28, (a2) +; LMULMAX2-NEXT: vle32.v v30, (a1) +; LMULMAX2-NEXT: vadd.vv v8, v14, v22 +; LMULMAX2-NEXT: vadd.vv v10, v16, v26 +; LMULMAX2-NEXT: vadd.vv v12, v18, v28 +; LMULMAX2-NEXT: vadd.vv v14, v20, v30 +; LMULMAX2-NEXT: ret +; +; LMULMAX1-LABEL: split_vector_args: +; LMULMAX1: # %bb.0: +; LMULMAX1-NEXT: addi a1, a0, 64 +; LMULMAX1-NEXT: vsetivli a2, 4, e32,m1,ta,mu +; LMULMAX1-NEXT: vle32.v v25, (a1) +; LMULMAX1-NEXT: addi a1, a0, 48 +; LMULMAX1-NEXT: vle32.v v26, (a1) +; LMULMAX1-NEXT: addi a1, a0, 32 +; LMULMAX1-NEXT: vle32.v v27, (a1) +; LMULMAX1-NEXT: addi a1, a0, 16 +; LMULMAX1-NEXT: vle32.v v28, (a1) +; LMULMAX1-NEXT: vle32.v v29, (a0) +; LMULMAX1-NEXT: vadd.vv v8, v13, v21 +; LMULMAX1-NEXT: vadd.vv v9, v14, v22 +; LMULMAX1-NEXT: vadd.vv v10, v15, v23 +; LMULMAX1-NEXT: vadd.vv v11, v16, v29 +; LMULMAX1-NEXT: vadd.vv v12, v17, v28 +; LMULMAX1-NEXT: vadd.vv v13, v18, v27 +; LMULMAX1-NEXT: vadd.vv v14, v19, v26 +; LMULMAX1-NEXT: vadd.vv v15, v20, v25 +; LMULMAX1-NEXT: ret + %v0 = add <32 x i32> %y, %z + ret <32 x i32> %v0 +} + +define <32 x i32> @call_split_vector_args(<2 x i32>* %pa, <32 x i32>* %pb) { +; LMULMAX8-LABEL: call_split_vector_args: +; LMULMAX8: # %bb.0: +; LMULMAX8-NEXT: addi sp, sp, -256 +; LMULMAX8-NEXT: .cfi_def_cfa_offset 256 +; LMULMAX8-NEXT: sd ra, 248(sp) # 8-byte Folded Spill +; LMULMAX8-NEXT: sd s0, 240(sp) # 8-byte Folded Spill +; LMULMAX8-NEXT: .cfi_offset ra, -8 +; LMULMAX8-NEXT: .cfi_offset s0, -16 +; LMULMAX8-NEXT: addi s0, sp, 256 +; LMULMAX8-NEXT: .cfi_def_cfa s0, 0 +; LMULMAX8-NEXT: andi sp, sp, -128 +; LMULMAX8-NEXT: vsetivli a2, 2, e32,m1,ta,mu +; LMULMAX8-NEXT: vle32.v v8, (a0) +; LMULMAX8-NEXT: addi a0, zero, 32 +; LMULMAX8-NEXT: vsetvli a0, a0, e32,m8,ta,mu +; LMULMAX8-NEXT: vle32.v v16, (a1) +; LMULMAX8-NEXT: mv a0, sp +; LMULMAX8-NEXT: vse32.v v16, (sp) +; LMULMAX8-NEXT: vmv1r.v v9, v8 +; LMULMAX8-NEXT: vmv1r.v v10, v8 +; LMULMAX8-NEXT: vmv1r.v v11, v8 +; LMULMAX8-NEXT: vmv1r.v v12, v8 +; LMULMAX8-NEXT: call split_vector_args@plt +; LMULMAX8-NEXT: addi sp, s0, -256 +; 
LMULMAX8-NEXT: ld s0, 240(sp) # 8-byte Folded Reload +; LMULMAX8-NEXT: ld ra, 248(sp) # 8-byte Folded Reload +; LMULMAX8-NEXT: addi sp, sp, 256 +; LMULMAX8-NEXT: ret +; +; LMULMAX4-LABEL: call_split_vector_args: +; LMULMAX4: # %bb.0: +; LMULMAX4-NEXT: addi sp, sp, -256 +; LMULMAX4-NEXT: .cfi_def_cfa_offset 256 +; LMULMAX4-NEXT: sd ra, 248(sp) # 8-byte Folded Spill +; LMULMAX4-NEXT: sd s0, 240(sp) # 8-byte Folded Spill +; LMULMAX4-NEXT: .cfi_offset ra, -8 +; LMULMAX4-NEXT: .cfi_offset s0, -16 +; LMULMAX4-NEXT: addi s0, sp, 256 +; LMULMAX4-NEXT: .cfi_def_cfa s0, 0 +; LMULMAX4-NEXT: andi sp, sp, -128 +; LMULMAX4-NEXT: vsetivli a2, 2, e32,m1,ta,mu +; LMULMAX4-NEXT: vle32.v v8, (a0) +; LMULMAX4-NEXT: vsetivli a0, 16, e32,m4,ta,mu +; LMULMAX4-NEXT: vle32.v v16, (a1) +; LMULMAX4-NEXT: addi a0, a1, 64 +; LMULMAX4-NEXT: vle32.v v20, (a0) +; LMULMAX4-NEXT: addi a0, sp, 64 +; LMULMAX4-NEXT: vse32.v v20, (a0) +; LMULMAX4-NEXT: mv a0, sp +; LMULMAX4-NEXT: vse32.v v16, (sp) +; LMULMAX4-NEXT: vmv1r.v v9, v8 +; LMULMAX4-NEXT: vmv1r.v v10, v8 +; LMULMAX4-NEXT: vmv1r.v v11, v8 +; LMULMAX4-NEXT: vmv1r.v v12, v8 +; LMULMAX4-NEXT: call split_vector_args@plt +; LMULMAX4-NEXT: addi sp, s0, -256 +; LMULMAX4-NEXT: ld s0, 240(sp) # 8-byte Folded Reload +; LMULMAX4-NEXT: ld ra, 248(sp) # 8-byte Folded Reload +; LMULMAX4-NEXT: addi sp, sp, 256 +; LMULMAX4-NEXT: ret +; +; LMULMAX2-LABEL: call_split_vector_args: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi sp, sp, -256 +; LMULMAX2-NEXT: .cfi_def_cfa_offset 256 +; LMULMAX2-NEXT: sd ra, 248(sp) # 8-byte Folded Spill +; LMULMAX2-NEXT: sd s0, 240(sp) # 8-byte Folded Spill +; LMULMAX2-NEXT: .cfi_offset ra, -8 +; LMULMAX2-NEXT: .cfi_offset s0, -16 +; LMULMAX2-NEXT: addi s0, sp, 256 +; LMULMAX2-NEXT: .cfi_def_cfa s0, 0 +; LMULMAX2-NEXT: andi sp, sp, -128 +; LMULMAX2-NEXT: vsetivli a2, 2, e32,m1,ta,mu +; LMULMAX2-NEXT: vle32.v v8, (a0) +; LMULMAX2-NEXT: vsetivli a0, 8, e32,m2,ta,mu +; LMULMAX2-NEXT: vle32.v v14, (a1) +; LMULMAX2-NEXT: addi a0, a1, 32 +; LMULMAX2-NEXT: vle32.v v16, (a0) +; LMULMAX2-NEXT: addi a0, a1, 64 +; LMULMAX2-NEXT: vle32.v v18, (a0) +; LMULMAX2-NEXT: addi a0, a1, 96 +; LMULMAX2-NEXT: vle32.v v20, (a0) +; LMULMAX2-NEXT: addi a0, sp, 64 +; LMULMAX2-NEXT: vse32.v v20, (a0) +; LMULMAX2-NEXT: addi a0, sp, 32 +; LMULMAX2-NEXT: vse32.v v18, (a0) +; LMULMAX2-NEXT: mv a0, sp +; LMULMAX2-NEXT: vse32.v v16, (sp) +; LMULMAX2-NEXT: vmv1r.v v9, v8 +; LMULMAX2-NEXT: vmv1r.v v10, v8 +; LMULMAX2-NEXT: vmv1r.v v11, v8 +; LMULMAX2-NEXT: vmv1r.v v12, v8 +; LMULMAX2-NEXT: vmv2r.v v22, v14 +; LMULMAX2-NEXT: call split_vector_args@plt +; LMULMAX2-NEXT: addi sp, s0, -256 +; LMULMAX2-NEXT: ld s0, 240(sp) # 8-byte Folded Reload +; LMULMAX2-NEXT: ld ra, 248(sp) # 8-byte Folded Reload +; LMULMAX2-NEXT: addi sp, sp, 256 +; LMULMAX2-NEXT: ret +; +; LMULMAX1-LABEL: call_split_vector_args: +; LMULMAX1: # %bb.0: +; LMULMAX1-NEXT: addi sp, sp, -256 +; LMULMAX1-NEXT: .cfi_def_cfa_offset 256 +; LMULMAX1-NEXT: sd ra, 248(sp) # 8-byte Folded Spill +; LMULMAX1-NEXT: sd s0, 240(sp) # 8-byte Folded Spill +; LMULMAX1-NEXT: .cfi_offset ra, -8 +; LMULMAX1-NEXT: .cfi_offset s0, -16 +; LMULMAX1-NEXT: addi s0, sp, 256 +; LMULMAX1-NEXT: .cfi_def_cfa s0, 0 +; LMULMAX1-NEXT: andi sp, sp, -128 +; LMULMAX1-NEXT: vsetivli a2, 2, e32,m1,ta,mu +; LMULMAX1-NEXT: vle32.v v8, (a0) +; LMULMAX1-NEXT: vsetivli a0, 4, e32,m1,ta,mu +; LMULMAX1-NEXT: vle32.v v13, (a1) +; LMULMAX1-NEXT: addi a0, a1, 16 +; LMULMAX1-NEXT: vle32.v v14, (a0) +; LMULMAX1-NEXT: addi a0, a1, 32 +; LMULMAX1-NEXT: vle32.v v15, (a0) +; 
LMULMAX1-NEXT: addi a0, a1, 48 +; LMULMAX1-NEXT: vle32.v v16, (a0) +; LMULMAX1-NEXT: addi a0, a1, 64 +; LMULMAX1-NEXT: vle32.v v17, (a0) +; LMULMAX1-NEXT: addi a0, a1, 80 +; LMULMAX1-NEXT: vle32.v v18, (a0) +; LMULMAX1-NEXT: addi a0, a1, 96 +; LMULMAX1-NEXT: vle32.v v19, (a0) +; LMULMAX1-NEXT: addi a0, a1, 112 +; LMULMAX1-NEXT: vle32.v v20, (a0) +; LMULMAX1-NEXT: addi a0, sp, 64 +; LMULMAX1-NEXT: vse32.v v20, (a0) +; LMULMAX1-NEXT: addi a0, sp, 48 +; LMULMAX1-NEXT: vse32.v v19, (a0) +; LMULMAX1-NEXT: addi a0, sp, 32 +; LMULMAX1-NEXT: vse32.v v18, (a0) +; LMULMAX1-NEXT: addi a0, sp, 16 +; LMULMAX1-NEXT: vse32.v v17, (a0) +; LMULMAX1-NEXT: mv a0, sp +; LMULMAX1-NEXT: vse32.v v16, (sp) +; LMULMAX1-NEXT: vmv1r.v v9, v8 +; LMULMAX1-NEXT: vmv1r.v v10, v8 +; LMULMAX1-NEXT: vmv1r.v v11, v8 +; LMULMAX1-NEXT: vmv1r.v v12, v8 +; LMULMAX1-NEXT: vmv1r.v v21, v13 +; LMULMAX1-NEXT: vmv1r.v v22, v14 +; LMULMAX1-NEXT: vmv1r.v v23, v15 +; LMULMAX1-NEXT: call split_vector_args@plt +; LMULMAX1-NEXT: addi sp, s0, -256 +; LMULMAX1-NEXT: ld s0, 240(sp) # 8-byte Folded Reload +; LMULMAX1-NEXT: ld ra, 248(sp) # 8-byte Folded Reload +; LMULMAX1-NEXT: addi sp, sp, 256 +; LMULMAX1-NEXT: ret + %a = load <2 x i32>, <2 x i32>* %pa + %b = load <32 x i32>, <32 x i32>* %pb + %r = call <32 x i32> @split_vector_args(<2 x i32> %a, <2 x i32> %a, <2 x i32> %a, <2 x i32> %a, <2 x i32> %a, <32 x i32> %b, <32 x i32> %b) + ret <32 x i32> %r +}
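As an illustration of the scheme described in the commit message, below is a minimal IR sketch, assuming riscv64 with +experimental-v, -riscv-v-vector-bits-min=128 and an LMUL cap of at least 2 (matching the LMULMAX8/LMULMAX4/LMULMAX2 RUN lines above). The function name and the register placement noted in the comments are illustrative, not autogenerated CHECK output from the patch.

; Under this scheme, the first fixed-length mask argument %m is passed in v0.
; %a and %b are located in their <vscale x 4 x i32> container (LMUL=2 at
; VLEN>=128), so they take the next data argument registers, v8 and v10 (as
; m2 register groups), and the <8 x i32> result is returned in v8.
define <8 x i32> @masked_add_v8i32(<8 x i1> %m, <8 x i32> %a, <8 x i32> %b) {
  %sum = add <8 x i32> %a, %b
  %r = select <8 x i1> %m, <8 x i32> %sum, <8 x i32> %b
  ret <8 x i32> %r
}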