From fa8bb224661dfb38cb2a246f7d98dc61fd45602e Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Thu, 20 Jan 2022 14:16:37 -0800
Subject: [PATCH] [RISCV] Optimize vector_shuffles that are interleaving the
 lowest elements of two vectors.

RISCV only has a unary shuffle that requires placing the indices in a
register. Interleaving two vectors therefore needs at least two vrgathers
and a vmerge.

This patch teaches shuffle lowering to use a widening addu followed by a
widening vmaccu to implement the interleave. First we extract the low half
of both V1 and V2. Then we compute
(zext(V1) + zext(V2)) + (zext(V2) * (2^eltbits - 1)), which simplifies to
zext(V1) + zext(V2) * 2^eltbits, i.e. zext(V1) + (zext(V2) << eltbits).
Finally we bitcast the result back to the original type, splitting each
wide element in half.

We can only do this if we have a type with wider elements available.
Because we're using extends we also have to be careful with fractional
LMULs.

Floating point types are supported by bitcasting to/from integer.

The tests cover a varied combination of LMULs split across VLEN>=128 and
VLEN>=512 runs. There are a few tests with the shuffle indices commuted,
as well as tests with undef indices. There's one test for a vXi64/vXf64
vector which we can't optimize, but which verifies that we don't crash.

Reviewed By: rogfer01

Differential Revision: https://reviews.llvm.org/D117743
---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp        | 122 +++++-
 llvm/lib/Target/RISCV/RISCVISelLowering.h          |   1 +
 llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td |  10 +-
 .../RISCV/rvv/fixed-vectors-fp-interleave.ll       | 378 ++++++++++++++++
 .../RISCV/rvv/fixed-vectors-int-interleave.ll      | 484 +++++++++++++++++++++
 .../RISCV/rvv/fixed-vectors-int-shuffles.ll        |  15 +-
 6 files changed, 995 insertions(+), 15 deletions(-)
 create mode 100644 llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll
 create mode 100644 llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index f942f39..507a21b 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -2328,6 +2328,48 @@ static int matchShuffleAsSlideDown(ArrayRef<int> Mask) {
   return -1;
 }
 
+static bool isInterleaveShuffle(ArrayRef<int> Mask, MVT VT, bool &SwapSources,
+                                const RISCVSubtarget &Subtarget) {
+  // We need to be able to widen elements to the next larger integer type.
+  if (VT.getScalarSizeInBits() >= Subtarget.getMaxELENForFixedLengthVectors())
+    return false;
+
+  int Size = Mask.size();
+  assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
+
+  int Srcs[] = {-1, -1};
+  for (int i = 0; i != Size; ++i) {
+    // Ignore undef elements.
+    if (Mask[i] < 0)
+      continue;
+
+    // Is this an even or odd element?
+    int Pol = i % 2;
+
+    // Ensure we consistently use the same source for this element polarity.
+    int Src = Mask[i] / Size;
+    if (Srcs[Pol] < 0)
+      Srcs[Pol] = Src;
+    if (Srcs[Pol] != Src)
+      return false;
+
+    // Make sure the element within the source is appropriate for this element
+    // in the destination.
+    int Elt = Mask[i] % Size;
+    if (Elt != i / 2)
+      return false;
+  }
+
+  // We need to find a source for each polarity and they can't be the same.
+  if (Srcs[0] < 0 || Srcs[1] < 0 || Srcs[0] == Srcs[1])
+    return false;
+
+  // Swap the sources if the second source was in the even polarity.
+ SwapSources = Srcs[0] > Srcs[1]; + + return true; +} + static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, const RISCVSubtarget &Subtarget) { SDValue V1 = Op.getOperand(0); @@ -2413,8 +2455,10 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, } } + ArrayRef Mask = SVN->getMask(); + // Try to match as a slidedown. - int SlideAmt = matchShuffleAsSlideDown(SVN->getMask()); + int SlideAmt = matchShuffleAsSlideDown(Mask); if (SlideAmt >= 0) { // TODO: Should we reduce the VL to account for the upper undef elements? // Requires additional vsetvlis, but might be faster to execute. @@ -2427,10 +2471,81 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, return convertFromScalableVector(VT, SlideDown, DAG, Subtarget); } + // Detect an interleave shuffle and lower to + // (vmaccu.vx (vwaddu.vx lohalf(V1), lohalf(V2)), lohalf(V2), (2^eltbits - 1)) + bool SwapSources; + if (isInterleaveShuffle(Mask, VT, SwapSources, Subtarget)) { + // Swap sources if needed. + if (SwapSources) + std::swap(V1, V2); + + // Extract the lower half of the vectors. + MVT HalfVT = VT.getHalfNumVectorElementsVT(); + V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1, + DAG.getConstant(0, DL, XLenVT)); + V2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V2, + DAG.getConstant(0, DL, XLenVT)); + + // Double the element width and halve the number of elements in an int type. + unsigned EltBits = VT.getScalarSizeInBits(); + MVT WideIntEltVT = MVT::getIntegerVT(EltBits * 2); + MVT WideIntVT = + MVT::getVectorVT(WideIntEltVT, VT.getVectorNumElements() / 2); + // Convert this to a scalable vector. We need to base this on the + // destination size to ensure there's always a type with a smaller LMUL. + MVT WideIntContainerVT = + getContainerForFixedLengthVector(DAG, WideIntVT, Subtarget); + + // Convert sources to scalable vectors with the same element count as the + // larger type. + MVT HalfContainerVT = MVT::getVectorVT( + VT.getVectorElementType(), WideIntContainerVT.getVectorElementCount()); + V1 = convertToScalableVector(HalfContainerVT, V1, DAG, Subtarget); + V2 = convertToScalableVector(HalfContainerVT, V2, DAG, Subtarget); + + // Cast sources to integer. + MVT IntEltVT = MVT::getIntegerVT(EltBits); + MVT IntHalfVT = + MVT::getVectorVT(IntEltVT, HalfContainerVT.getVectorElementCount()); + V1 = DAG.getBitcast(IntHalfVT, V1); + V2 = DAG.getBitcast(IntHalfVT, V2); + + // Freeze V2 since we use it twice and we need to be sure that the add and + // multiply see the same value. + V2 = DAG.getNode(ISD::FREEZE, DL, IntHalfVT, V2); + + // Recreate TrueMask using the widened type's element count. + MVT MaskVT = + MVT::getVectorVT(MVT::i1, HalfContainerVT.getVectorElementCount()); + TrueMask = DAG.getNode(RISCVISD::VMSET_VL, DL, MaskVT, VL); + + // Widen V1 and V2 with 0s and add one copy of V2 to V1. + SDValue Add = DAG.getNode(RISCVISD::VWADDU_VL, DL, WideIntContainerVT, V1, + V2, TrueMask, VL); + // Create 2^eltbits - 1 copies of V2 by multiplying by the largest integer. + SDValue Multiplier = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, IntHalfVT, + DAG.getAllOnesConstant(DL, XLenVT)); + SDValue WidenMul = DAG.getNode(RISCVISD::VWMULU_VL, DL, WideIntContainerVT, + V2, Multiplier, TrueMask, VL); + // Add the new copies to our previous addition giving us 2^eltbits copies of + // V2. This is equivalent to shifting V2 left by eltbits. This should + // combine with the vwmulu.vv above to form vwmaccu.vv. 
+ Add = DAG.getNode(RISCVISD::ADD_VL, DL, WideIntContainerVT, Add, WidenMul, + TrueMask, VL); + // Cast back to ContainerVT. We need to re-create a new ContainerVT in case + // WideIntContainerVT is a larger fractional LMUL than implied by the fixed + // vector VT. + ContainerVT = + MVT::getVectorVT(VT.getVectorElementType(), + WideIntContainerVT.getVectorElementCount() * 2); + Add = DAG.getBitcast(ContainerVT, Add); + return convertFromScalableVector(VT, Add, DAG, Subtarget); + } + // Detect shuffles which can be re-expressed as vector selects; these are // shuffles in which each element in the destination is taken from an element // at the corresponding index in either source vectors. - bool IsSelect = all_of(enumerate(SVN->getMask()), [&](const auto &MaskIdx) { + bool IsSelect = all_of(enumerate(Mask), [&](const auto &MaskIdx) { int MaskIndex = MaskIdx.value(); return MaskIndex < 0 || MaskIdx.index() == (unsigned)MaskIndex % NumElts; }); @@ -2456,7 +2571,7 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, // Now construct the mask that will be used by the vselect or blended // vrgather operation. For vrgathers, construct the appropriate indices into // each vector. - for (int MaskIndex : SVN->getMask()) { + for (int MaskIndex : Mask) { bool SelectMaskVal = (MaskIndex < (int)NumElts) ^ InvertMask; MaskVals.push_back(DAG.getConstant(SelectMaskVal, DL, XLenVT)); if (!IsSelect) { @@ -9941,6 +10056,7 @@ const char *RISCVTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(FP_ROUND_VL) NODE_NAME_CASE(VWMUL_VL) NODE_NAME_CASE(VWMULU_VL) + NODE_NAME_CASE(VWADDU_VL) NODE_NAME_CASE(SETCC_VL) NODE_NAME_CASE(VSELECT_VL) NODE_NAME_CASE(VMAND_VL) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index 76b7788..23857f9 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -245,6 +245,7 @@ enum NodeType : unsigned { // Widening instructions VWMUL_VL, VWMULU_VL, + VWADDU_VL, // Vector compare producing a mask. Fourth operand is input mask. Fifth // operand is VL. diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td index f646a46..9745c138 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td @@ -221,14 +221,15 @@ def riscv_trunc_vector_vl : SDNode<"RISCVISD::TRUNCATE_VECTOR_VL", SDTCVecEltisVT<2, i1>, SDTCisVT<3, XLenVT>]>>; -def SDT_RISCVVWMUL_VL : SDTypeProfile<1, 4, [SDTCisVec<0>, +def SDT_RISCVVWBinOp_VL : SDTypeProfile<1, 4, [SDTCisVec<0>, SDTCisSameNumEltsAs<0, 1>, SDTCisSameAs<1, 2>, SDTCisSameNumEltsAs<1, 3>, SDTCVecEltisVT<3, i1>, SDTCisVT<4, XLenVT>]>; -def riscv_vwmul_vl : SDNode<"RISCVISD::VWMUL_VL", SDT_RISCVVWMUL_VL, [SDNPCommutative]>; -def riscv_vwmulu_vl : SDNode<"RISCVISD::VWMULU_VL", SDT_RISCVVWMUL_VL, [SDNPCommutative]>; +def riscv_vwmul_vl : SDNode<"RISCVISD::VWMUL_VL", SDT_RISCVVWBinOp_VL, [SDNPCommutative]>; +def riscv_vwmulu_vl : SDNode<"RISCVISD::VWMULU_VL", SDT_RISCVVWBinOp_VL, [SDNPCommutative]>; +def riscv_vwaddu_vl : SDNode<"RISCVISD::VWADDU_VL", SDT_RISCVVWBinOp_VL, [SDNPCommutative]>; def SDTRVVVecReduce : SDTypeProfile<1, 5, [ SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisSameAs<0, 3>, @@ -712,6 +713,9 @@ foreach vti = AllIntegerVectors in { (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; } +// 12.2. Vector Widening Integer Add/Subtract +defm : VPatBinaryWVL_VV_VX; + // 12.3. 
Vector Integer Extension defm : VPatExtendSDNode_V_VL; diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll new file mode 100644 index 0000000..3f57fce --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll @@ -0,0 +1,378 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+experimental-v,+zfh -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,V128,RV32-V128 +; RUN: llc -mtriple=riscv64 -mattr=+experimental-v,+zfh -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,V128,RV64-V128 +; RUN: llc -mtriple=riscv32 -mattr=+experimental-v,+zfh -riscv-v-vector-bits-min=512 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,V512,RV32-V512 +; RUN: llc -mtriple=riscv64 -mattr=+experimental-v,+zfh -riscv-v-vector-bits-min=512 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,V512,RV64-V512 + +; Test optimizing interleaves to widening arithmetic. + +define <4 x half> @interleave_v2f16(<2 x half> %x, <2 x half> %y) { +; CHECK-LABEL: interleave_v2f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e16, mf4, ta, mu +; CHECK-NEXT: vwaddu.vv v10, v8, v9 +; CHECK-NEXT: li a0, -1 +; CHECK-NEXT: vwmaccu.vx v10, a0, v9 +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret + %a = shufflevector <2 x half> %x, <2 x half> %y, <4 x i32> + ret <4 x half> %a +} + +; Vector order switched for coverage. +define <4 x float> @interleave_v2f32(<2 x float> %x, <2 x float> %y) { +; CHECK-LABEL: interleave_v2f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, mf2, ta, mu +; CHECK-NEXT: vwaddu.vv v10, v9, v8 +; CHECK-NEXT: li a0, -1 +; CHECK-NEXT: vwmaccu.vx v10, a0, v8 +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret + %a = shufflevector <2 x float> %x, <2 x float> %y, <4 x i32> + ret <4 x float> %a +} + +; One vXf64 test case to very that we don't optimize it. +; FIXME: Is there better codegen we can do here? 
+define <4 x double> @interleave_v2f64(<2 x double> %x, <2 x double> %y) { +; RV32-V128-LABEL: interleave_v2f64: +; RV32-V128: # %bb.0: +; RV32-V128-NEXT: vmv1r.v v12, v9 +; RV32-V128-NEXT: # kill: def $v8 killed $v8 def $v8m2 +; RV32-V128-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; RV32-V128-NEXT: vid.v v10 +; RV32-V128-NEXT: vsrl.vi v14, v10, 1 +; RV32-V128-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32-V128-NEXT: vrgatherei16.vv v10, v8, v14 +; RV32-V128-NEXT: li a0, 10 +; RV32-V128-NEXT: vsetivli zero, 1, e8, mf8, ta, mu +; RV32-V128-NEXT: vmv.s.x v0, a0 +; RV32-V128-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; RV32-V128-NEXT: vrgatherei16.vv v10, v12, v14, v0.t +; RV32-V128-NEXT: vmv.v.v v8, v10 +; RV32-V128-NEXT: ret +; +; RV64-V128-LABEL: interleave_v2f64: +; RV64-V128: # %bb.0: +; RV64-V128-NEXT: vmv1r.v v12, v9 +; RV64-V128-NEXT: # kill: def $v8 killed $v8 def $v8m2 +; RV64-V128-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; RV64-V128-NEXT: vid.v v10 +; RV64-V128-NEXT: vsrl.vi v14, v10, 1 +; RV64-V128-NEXT: vrgather.vv v10, v8, v14 +; RV64-V128-NEXT: li a0, 10 +; RV64-V128-NEXT: vsetivli zero, 1, e8, mf8, ta, mu +; RV64-V128-NEXT: vmv.s.x v0, a0 +; RV64-V128-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; RV64-V128-NEXT: vrgather.vv v10, v12, v14, v0.t +; RV64-V128-NEXT: vmv.v.v v8, v10 +; RV64-V128-NEXT: ret +; +; RV32-V512-LABEL: interleave_v2f64: +; RV32-V512: # %bb.0: +; RV32-V512-NEXT: vsetivli zero, 4, e16, mf4, ta, mu +; RV32-V512-NEXT: vid.v v10 +; RV32-V512-NEXT: vsrl.vi v11, v10, 1 +; RV32-V512-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32-V512-NEXT: vrgatherei16.vv v10, v8, v11 +; RV32-V512-NEXT: li a0, 10 +; RV32-V512-NEXT: vsetivli zero, 1, e8, mf8, ta, mu +; RV32-V512-NEXT: vmv.s.x v0, a0 +; RV32-V512-NEXT: vsetivli zero, 4, e64, m1, ta, mu +; RV32-V512-NEXT: vrgatherei16.vv v10, v9, v11, v0.t +; RV32-V512-NEXT: vmv.v.v v8, v10 +; RV32-V512-NEXT: ret +; +; RV64-V512-LABEL: interleave_v2f64: +; RV64-V512: # %bb.0: +; RV64-V512-NEXT: vsetivli zero, 4, e64, m1, ta, mu +; RV64-V512-NEXT: vid.v v10 +; RV64-V512-NEXT: vsrl.vi v11, v10, 1 +; RV64-V512-NEXT: vrgather.vv v10, v8, v11 +; RV64-V512-NEXT: li a0, 10 +; RV64-V512-NEXT: vsetivli zero, 1, e8, mf8, ta, mu +; RV64-V512-NEXT: vmv.s.x v0, a0 +; RV64-V512-NEXT: vsetivli zero, 4, e64, m1, ta, mu +; RV64-V512-NEXT: vrgather.vv v10, v9, v11, v0.t +; RV64-V512-NEXT: vmv.v.v v8, v10 +; RV64-V512-NEXT: ret + %a = shufflevector <2 x double> %x, <2 x double> %y, <4 x i32> + ret <4 x double> %a +} + +; Undef elements for coverage +define <8 x half> @interleave_v4f16(<4 x half> %x, <4 x half> %y) { +; V128-LABEL: interleave_v4f16: +; V128: # %bb.0: +; V128-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; V128-NEXT: vwaddu.vv v10, v8, v9 +; V128-NEXT: li a0, -1 +; V128-NEXT: vwmaccu.vx v10, a0, v9 +; V128-NEXT: vmv1r.v v8, v10 +; V128-NEXT: ret +; +; V512-LABEL: interleave_v4f16: +; V512: # %bb.0: +; V512-NEXT: vsetivli zero, 8, e16, mf4, ta, mu +; V512-NEXT: vwaddu.vv v10, v8, v9 +; V512-NEXT: li a0, -1 +; V512-NEXT: vwmaccu.vx v10, a0, v9 +; V512-NEXT: vmv1r.v v8, v10 +; V512-NEXT: ret + %a = shufflevector <4 x half> %x, <4 x half> %y, <8 x i32> + ret <8 x half> %a +} + +define <8 x float> @interleave_v4f32(<4 x float> %x, <4 x float> %y) { +; V128-LABEL: interleave_v4f32: +; V128: # %bb.0: +; V128-NEXT: vsetivli zero, 8, e32, m1, ta, mu +; V128-NEXT: vwaddu.vv v10, v8, v9 +; V128-NEXT: li a0, -1 +; V128-NEXT: vwmaccu.vx v10, a0, v9 +; V128-NEXT: vmv2r.v v8, v10 +; V128-NEXT: ret +; +; V512-LABEL: interleave_v4f32: +; V512: # %bb.0: +; 
V512-NEXT: vsetivli zero, 8, e32, mf2, ta, mu +; V512-NEXT: vwaddu.vv v10, v8, v9 +; V512-NEXT: li a0, -1 +; V512-NEXT: vwmaccu.vx v10, a0, v9 +; V512-NEXT: vmv1r.v v8, v10 +; V512-NEXT: ret + %a = shufflevector <4 x float> %x, <4 x float> %y, <8 x i32> + ret <8 x float> %a +} + +; Vector order switched for coverage. +define <16 x half> @interleave_v8f16(<8 x half> %x, <8 x half> %y) { +; V128-LABEL: interleave_v8f16: +; V128: # %bb.0: +; V128-NEXT: vsetivli zero, 16, e16, m1, ta, mu +; V128-NEXT: vwaddu.vv v10, v9, v8 +; V128-NEXT: li a0, -1 +; V128-NEXT: vwmaccu.vx v10, a0, v8 +; V128-NEXT: vmv2r.v v8, v10 +; V128-NEXT: ret +; +; V512-LABEL: interleave_v8f16: +; V512: # %bb.0: +; V512-NEXT: vsetivli zero, 16, e16, mf4, ta, mu +; V512-NEXT: vwaddu.vv v10, v9, v8 +; V512-NEXT: li a0, -1 +; V512-NEXT: vwmaccu.vx v10, a0, v8 +; V512-NEXT: vmv1r.v v8, v10 +; V512-NEXT: ret + %a = shufflevector <8 x half> %x, <8 x half> %y, <16 x i32> + ret <16 x half> %a +} + +define <16 x float> @interleave_v8f32(<8 x float> %x, <8 x float> %y) { +; V128-LABEL: interleave_v8f32: +; V128: # %bb.0: +; V128-NEXT: vsetivli zero, 16, e32, m2, ta, mu +; V128-NEXT: vwaddu.vv v12, v8, v10 +; V128-NEXT: li a0, -1 +; V128-NEXT: vwmaccu.vx v12, a0, v10 +; V128-NEXT: vmv4r.v v8, v12 +; V128-NEXT: ret +; +; V512-LABEL: interleave_v8f32: +; V512: # %bb.0: +; V512-NEXT: vsetivli zero, 16, e32, mf2, ta, mu +; V512-NEXT: vwaddu.vv v10, v8, v9 +; V512-NEXT: li a0, -1 +; V512-NEXT: vwmaccu.vx v10, a0, v9 +; V512-NEXT: vmv1r.v v8, v10 +; V512-NEXT: ret + %a = shufflevector <8 x float> %x, <8 x float> %y, <16 x i32> + ret <16 x float> %a +} + +define <32 x half> @interleave_v16f16(<16 x half> %x, <16 x half> %y) { +; V128-LABEL: interleave_v16f16: +; V128: # %bb.0: +; V128-NEXT: li a0, 32 +; V128-NEXT: vsetvli zero, a0, e16, m2, ta, mu +; V128-NEXT: vwaddu.vv v12, v8, v10 +; V128-NEXT: li a0, -1 +; V128-NEXT: vwmaccu.vx v12, a0, v10 +; V128-NEXT: vmv4r.v v8, v12 +; V128-NEXT: ret +; +; V512-LABEL: interleave_v16f16: +; V512: # %bb.0: +; V512-NEXT: li a0, 32 +; V512-NEXT: vsetvli zero, a0, e16, mf2, ta, mu +; V512-NEXT: vwaddu.vv v10, v8, v9 +; V512-NEXT: li a0, -1 +; V512-NEXT: vwmaccu.vx v10, a0, v9 +; V512-NEXT: vmv1r.v v8, v10 +; V512-NEXT: ret + %a = shufflevector <16 x half> %x, <16 x half> %y, <32 x i32> + ret <32 x half> %a +} + +define <32 x float> @interleave_v16f32(<16 x float> %x, <16 x float> %y) { +; V128-LABEL: interleave_v16f32: +; V128: # %bb.0: +; V128-NEXT: li a0, 32 +; V128-NEXT: vsetvli zero, a0, e32, m4, ta, mu +; V128-NEXT: vwaddu.vv v16, v8, v12 +; V128-NEXT: li a0, -1 +; V128-NEXT: vwmaccu.vx v16, a0, v12 +; V128-NEXT: vmv8r.v v8, v16 +; V128-NEXT: ret +; +; V512-LABEL: interleave_v16f32: +; V512: # %bb.0: +; V512-NEXT: li a0, 32 +; V512-NEXT: vsetvli zero, a0, e32, m1, ta, mu +; V512-NEXT: vwaddu.vv v10, v8, v9 +; V512-NEXT: li a0, -1 +; V512-NEXT: vwmaccu.vx v10, a0, v9 +; V512-NEXT: vmv2r.v v8, v10 +; V512-NEXT: ret + %a = shufflevector <16 x float> %x, <16 x float> %y, <32 x i32> + ret <32 x float> %a +} + +define <64 x half> @interleave_v32f16(<32 x half> %x, <32 x half> %y) { +; V128-LABEL: interleave_v32f16: +; V128: # %bb.0: +; V128-NEXT: li a0, 64 +; V128-NEXT: vsetvli zero, a0, e16, m4, ta, mu +; V128-NEXT: vwaddu.vv v16, v8, v12 +; V128-NEXT: li a0, -1 +; V128-NEXT: vwmaccu.vx v16, a0, v12 +; V128-NEXT: vmv8r.v v8, v16 +; V128-NEXT: ret +; +; V512-LABEL: interleave_v32f16: +; V512: # %bb.0: +; V512-NEXT: li a0, 64 +; V512-NEXT: vsetvli zero, a0, e16, m1, ta, mu +; V512-NEXT: vwaddu.vv v10, v8, 
v9 +; V512-NEXT: li a0, -1 +; V512-NEXT: vwmaccu.vx v10, a0, v9 +; V512-NEXT: vmv2r.v v8, v10 +; V512-NEXT: ret + %a = shufflevector <32 x half> %x, <32 x half> %y, <64 x i32> + ret <64 x half> %a +} + +define <64 x float> @interleave_v32f32(<32 x float> %x, <32 x float> %y) { +; RV32-V128-LABEL: interleave_v32f32: +; RV32-V128: # %bb.0: +; RV32-V128-NEXT: addi sp, sp, -16 +; RV32-V128-NEXT: .cfi_def_cfa_offset 16 +; RV32-V128-NEXT: csrr a0, vlenb +; RV32-V128-NEXT: slli a0, a0, 4 +; RV32-V128-NEXT: sub sp, sp, a0 +; RV32-V128-NEXT: lui a0, %hi(.LCPI10_0) +; RV32-V128-NEXT: addi a0, a0, %lo(.LCPI10_0) +; RV32-V128-NEXT: li a1, 32 +; RV32-V128-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; RV32-V128-NEXT: vle32.v v0, (a0) +; RV32-V128-NEXT: vmv8r.v v24, v8 +; RV32-V128-NEXT: addi a0, sp, 16 +; RV32-V128-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32-V128-NEXT: vrgather.vv v8, v24, v0 +; RV32-V128-NEXT: lui a0, %hi(.LCPI10_1) +; RV32-V128-NEXT: addi a0, a0, %lo(.LCPI10_1) +; RV32-V128-NEXT: vle32.v v24, (a0) +; RV32-V128-NEXT: csrr a0, vlenb +; RV32-V128-NEXT: slli a0, a0, 3 +; RV32-V128-NEXT: add a0, sp, a0 +; RV32-V128-NEXT: addi a0, a0, 16 +; RV32-V128-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; RV32-V128-NEXT: lui a0, 699051 +; RV32-V128-NEXT: addi a0, a0, -1366 +; RV32-V128-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-V128-NEXT: vmv.s.x v0, a0 +; RV32-V128-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; RV32-V128-NEXT: csrr a0, vlenb +; RV32-V128-NEXT: slli a0, a0, 3 +; RV32-V128-NEXT: add a0, sp, a0 +; RV32-V128-NEXT: addi a0, a0, 16 +; RV32-V128-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload +; RV32-V128-NEXT: vrgather.vv v8, v16, v24, v0.t +; RV32-V128-NEXT: vmv.v.v v24, v8 +; RV32-V128-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; RV32-V128-NEXT: addi a0, sp, 16 +; RV32-V128-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload +; RV32-V128-NEXT: vwaddu.vv v0, v8, v16 +; RV32-V128-NEXT: li a0, -1 +; RV32-V128-NEXT: vwmaccu.vx v0, a0, v16 +; RV32-V128-NEXT: vmv8r.v v8, v0 +; RV32-V128-NEXT: vmv8r.v v16, v24 +; RV32-V128-NEXT: csrr a0, vlenb +; RV32-V128-NEXT: slli a0, a0, 4 +; RV32-V128-NEXT: add sp, sp, a0 +; RV32-V128-NEXT: addi sp, sp, 16 +; RV32-V128-NEXT: ret +; +; RV64-V128-LABEL: interleave_v32f32: +; RV64-V128: # %bb.0: +; RV64-V128-NEXT: addi sp, sp, -16 +; RV64-V128-NEXT: .cfi_def_cfa_offset 16 +; RV64-V128-NEXT: csrr a0, vlenb +; RV64-V128-NEXT: slli a0, a0, 4 +; RV64-V128-NEXT: sub sp, sp, a0 +; RV64-V128-NEXT: lui a0, %hi(.LCPI10_0) +; RV64-V128-NEXT: addi a0, a0, %lo(.LCPI10_0) +; RV64-V128-NEXT: li a1, 32 +; RV64-V128-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; RV64-V128-NEXT: vle32.v v0, (a0) +; RV64-V128-NEXT: vmv8r.v v24, v8 +; RV64-V128-NEXT: addi a0, sp, 16 +; RV64-V128-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV64-V128-NEXT: vrgather.vv v8, v24, v0 +; RV64-V128-NEXT: lui a0, %hi(.LCPI10_1) +; RV64-V128-NEXT: addi a0, a0, %lo(.LCPI10_1) +; RV64-V128-NEXT: vle32.v v24, (a0) +; RV64-V128-NEXT: csrr a0, vlenb +; RV64-V128-NEXT: slli a0, a0, 3 +; RV64-V128-NEXT: add a0, sp, a0 +; RV64-V128-NEXT: addi a0, a0, 16 +; RV64-V128-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; RV64-V128-NEXT: lui a0, 699051 +; RV64-V128-NEXT: addiw a0, a0, -1366 +; RV64-V128-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV64-V128-NEXT: vmv.s.x v0, a0 +; RV64-V128-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; RV64-V128-NEXT: csrr a0, vlenb +; RV64-V128-NEXT: slli a0, a0, 3 +; RV64-V128-NEXT: add a0, sp, a0 +; RV64-V128-NEXT: addi a0, a0, 16 +; RV64-V128-NEXT: 
vl8re8.v v24, (a0) # Unknown-size Folded Reload +; RV64-V128-NEXT: vrgather.vv v8, v16, v24, v0.t +; RV64-V128-NEXT: vmv.v.v v24, v8 +; RV64-V128-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; RV64-V128-NEXT: addi a0, sp, 16 +; RV64-V128-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload +; RV64-V128-NEXT: vwaddu.vv v0, v8, v16 +; RV64-V128-NEXT: li a0, -1 +; RV64-V128-NEXT: vwmaccu.vx v0, a0, v16 +; RV64-V128-NEXT: vmv8r.v v8, v0 +; RV64-V128-NEXT: vmv8r.v v16, v24 +; RV64-V128-NEXT: csrr a0, vlenb +; RV64-V128-NEXT: slli a0, a0, 4 +; RV64-V128-NEXT: add sp, sp, a0 +; RV64-V128-NEXT: addi sp, sp, 16 +; RV64-V128-NEXT: ret +; +; V512-LABEL: interleave_v32f32: +; V512: # %bb.0: +; V512-NEXT: li a0, 64 +; V512-NEXT: vsetvli zero, a0, e32, m2, ta, mu +; V512-NEXT: vwaddu.vv v12, v8, v10 +; V512-NEXT: li a0, -1 +; V512-NEXT: vwmaccu.vx v12, a0, v10 +; V512-NEXT: vmv4r.v v8, v12 +; V512-NEXT: ret + %a = shufflevector <32 x float> %x, <32 x float> %y, <64 x i32> + ret <64 x float> %a +} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll new file mode 100644 index 0000000..a17a831 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll @@ -0,0 +1,484 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+experimental-v -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,V128,RV32-V128 +; RUN: llc -mtriple=riscv64 -mattr=+experimental-v -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,V128,RV64-V128 +; RUN: llc -mtriple=riscv32 -mattr=+experimental-v -riscv-v-vector-bits-min=512 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,V512,RV32-V512 +; RUN: llc -mtriple=riscv64 -mattr=+experimental-v -riscv-v-vector-bits-min=512 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,V512,RV64-V512 + +; Test optimizing interleaves to widening arithmetic. + +define <4 x i8> @interleave_v2i8(<2 x i8> %x, <2 x i8> %y) { +; CHECK-LABEL: interleave_v2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e8, mf8, ta, mu +; CHECK-NEXT: vwaddu.vv v10, v8, v9 +; CHECK-NEXT: li a0, -1 +; CHECK-NEXT: vwmaccu.vx v10, a0, v9 +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret + %a = shufflevector <2 x i8> %x, <2 x i8> %y, <4 x i32> + ret <4 x i8> %a +} + +define <4 x i16> @interleave_v2i16(<2 x i16> %x, <2 x i16> %y) { +; CHECK-LABEL: interleave_v2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e16, mf4, ta, mu +; CHECK-NEXT: vwaddu.vv v10, v8, v9 +; CHECK-NEXT: li a0, -1 +; CHECK-NEXT: vwmaccu.vx v10, a0, v9 +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret + %a = shufflevector <2 x i16> %x, <2 x i16> %y, <4 x i32> + ret <4 x i16> %a +} + +; Vector order switched for coverage. +define <4 x i32> @interleave_v2i32(<2 x i32> %x, <2 x i32> %y) { +; CHECK-LABEL: interleave_v2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, mf2, ta, mu +; CHECK-NEXT: vwaddu.vv v10, v9, v8 +; CHECK-NEXT: li a0, -1 +; CHECK-NEXT: vwmaccu.vx v10, a0, v8 +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret + %a = shufflevector <2 x i32> %x, <2 x i32> %y, <4 x i32> + ret <4 x i32> %a +} + +; One vXi64 test case to very that we don't optimize it. +; FIXME: Is there better codegen we can do here? 
+define <4 x i64> @interleave_v2i64(<2 x i64> %x, <2 x i64> %y) { +; RV32-V128-LABEL: interleave_v2i64: +; RV32-V128: # %bb.0: +; RV32-V128-NEXT: vmv1r.v v12, v9 +; RV32-V128-NEXT: # kill: def $v8 killed $v8 def $v8m2 +; RV32-V128-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; RV32-V128-NEXT: vid.v v10 +; RV32-V128-NEXT: vsrl.vi v14, v10, 1 +; RV32-V128-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32-V128-NEXT: vrgatherei16.vv v10, v8, v14 +; RV32-V128-NEXT: li a0, 10 +; RV32-V128-NEXT: vsetivli zero, 1, e8, mf8, ta, mu +; RV32-V128-NEXT: vmv.s.x v0, a0 +; RV32-V128-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; RV32-V128-NEXT: vrgatherei16.vv v10, v12, v14, v0.t +; RV32-V128-NEXT: vmv.v.v v8, v10 +; RV32-V128-NEXT: ret +; +; RV64-V128-LABEL: interleave_v2i64: +; RV64-V128: # %bb.0: +; RV64-V128-NEXT: vmv1r.v v12, v9 +; RV64-V128-NEXT: # kill: def $v8 killed $v8 def $v8m2 +; RV64-V128-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; RV64-V128-NEXT: vid.v v10 +; RV64-V128-NEXT: vsrl.vi v14, v10, 1 +; RV64-V128-NEXT: vrgather.vv v10, v8, v14 +; RV64-V128-NEXT: li a0, 10 +; RV64-V128-NEXT: vsetivli zero, 1, e8, mf8, ta, mu +; RV64-V128-NEXT: vmv.s.x v0, a0 +; RV64-V128-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; RV64-V128-NEXT: vrgather.vv v10, v12, v14, v0.t +; RV64-V128-NEXT: vmv.v.v v8, v10 +; RV64-V128-NEXT: ret +; +; RV32-V512-LABEL: interleave_v2i64: +; RV32-V512: # %bb.0: +; RV32-V512-NEXT: vsetivli zero, 4, e16, mf4, ta, mu +; RV32-V512-NEXT: vid.v v10 +; RV32-V512-NEXT: vsrl.vi v11, v10, 1 +; RV32-V512-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32-V512-NEXT: vrgatherei16.vv v10, v8, v11 +; RV32-V512-NEXT: li a0, 10 +; RV32-V512-NEXT: vsetivli zero, 1, e8, mf8, ta, mu +; RV32-V512-NEXT: vmv.s.x v0, a0 +; RV32-V512-NEXT: vsetivli zero, 4, e64, m1, ta, mu +; RV32-V512-NEXT: vrgatherei16.vv v10, v9, v11, v0.t +; RV32-V512-NEXT: vmv.v.v v8, v10 +; RV32-V512-NEXT: ret +; +; RV64-V512-LABEL: interleave_v2i64: +; RV64-V512: # %bb.0: +; RV64-V512-NEXT: vsetivli zero, 4, e64, m1, ta, mu +; RV64-V512-NEXT: vid.v v10 +; RV64-V512-NEXT: vsrl.vi v11, v10, 1 +; RV64-V512-NEXT: vrgather.vv v10, v8, v11 +; RV64-V512-NEXT: li a0, 10 +; RV64-V512-NEXT: vsetivli zero, 1, e8, mf8, ta, mu +; RV64-V512-NEXT: vmv.s.x v0, a0 +; RV64-V512-NEXT: vsetivli zero, 4, e64, m1, ta, mu +; RV64-V512-NEXT: vrgather.vv v10, v9, v11, v0.t +; RV64-V512-NEXT: vmv.v.v v8, v10 +; RV64-V512-NEXT: ret + %a = shufflevector <2 x i64> %x, <2 x i64> %y, <4 x i32> + ret <4 x i64> %a +} + +; Vector order switched for coverage. 
+define <8 x i8> @interleave_v4i8(<4 x i8> %x, <4 x i8> %y) { +; V128-LABEL: interleave_v4i8: +; V128: # %bb.0: +; V128-NEXT: vsetivli zero, 8, e8, mf4, ta, mu +; V128-NEXT: vwaddu.vv v10, v9, v8 +; V128-NEXT: li a0, -1 +; V128-NEXT: vwmaccu.vx v10, a0, v8 +; V128-NEXT: vmv1r.v v8, v10 +; V128-NEXT: ret +; +; V512-LABEL: interleave_v4i8: +; V512: # %bb.0: +; V512-NEXT: vsetivli zero, 8, e8, mf8, ta, mu +; V512-NEXT: vwaddu.vv v10, v9, v8 +; V512-NEXT: li a0, -1 +; V512-NEXT: vwmaccu.vx v10, a0, v8 +; V512-NEXT: vmv1r.v v8, v10 +; V512-NEXT: ret + %a = shufflevector <4 x i8> %x, <4 x i8> %y, <8 x i32> + ret <8 x i8> %a +} + +; Undef elements for coverage +define <8 x i16> @interleave_v4i16(<4 x i16> %x, <4 x i16> %y) { +; V128-LABEL: interleave_v4i16: +; V128: # %bb.0: +; V128-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; V128-NEXT: vwaddu.vv v10, v8, v9 +; V128-NEXT: li a0, -1 +; V128-NEXT: vwmaccu.vx v10, a0, v9 +; V128-NEXT: vmv1r.v v8, v10 +; V128-NEXT: ret +; +; V512-LABEL: interleave_v4i16: +; V512: # %bb.0: +; V512-NEXT: vsetivli zero, 8, e16, mf4, ta, mu +; V512-NEXT: vwaddu.vv v10, v8, v9 +; V512-NEXT: li a0, -1 +; V512-NEXT: vwmaccu.vx v10, a0, v9 +; V512-NEXT: vmv1r.v v8, v10 +; V512-NEXT: ret + %a = shufflevector <4 x i16> %x, <4 x i16> %y, <8 x i32> + ret <8 x i16> %a +} + +define <8 x i32> @interleave_v4i32(<4 x i32> %x, <4 x i32> %y) { +; V128-LABEL: interleave_v4i32: +; V128: # %bb.0: +; V128-NEXT: vsetivli zero, 8, e32, m1, ta, mu +; V128-NEXT: vwaddu.vv v10, v8, v9 +; V128-NEXT: li a0, -1 +; V128-NEXT: vwmaccu.vx v10, a0, v9 +; V128-NEXT: vmv2r.v v8, v10 +; V128-NEXT: ret +; +; V512-LABEL: interleave_v4i32: +; V512: # %bb.0: +; V512-NEXT: vsetivli zero, 8, e32, mf2, ta, mu +; V512-NEXT: vwaddu.vv v10, v8, v9 +; V512-NEXT: li a0, -1 +; V512-NEXT: vwmaccu.vx v10, a0, v9 +; V512-NEXT: vmv1r.v v8, v10 +; V512-NEXT: ret + %a = shufflevector <4 x i32> %x, <4 x i32> %y, <8 x i32> + ret <8 x i32> %a +} + +define <16 x i8> @interleave_v8i8(<8 x i8> %x, <8 x i8> %y) { +; V128-LABEL: interleave_v8i8: +; V128: # %bb.0: +; V128-NEXT: vsetivli zero, 16, e8, mf2, ta, mu +; V128-NEXT: vwaddu.vv v10, v8, v9 +; V128-NEXT: li a0, -1 +; V128-NEXT: vwmaccu.vx v10, a0, v9 +; V128-NEXT: vmv1r.v v8, v10 +; V128-NEXT: ret +; +; V512-LABEL: interleave_v8i8: +; V512: # %bb.0: +; V512-NEXT: vsetivli zero, 16, e8, mf8, ta, mu +; V512-NEXT: vwaddu.vv v10, v8, v9 +; V512-NEXT: li a0, -1 +; V512-NEXT: vwmaccu.vx v10, a0, v9 +; V512-NEXT: vmv1r.v v8, v10 +; V512-NEXT: ret + %a = shufflevector <8 x i8> %x, <8 x i8> %y, <16 x i32> + ret <16 x i8> %a +} + +; Vector order switched for coverage. 
+define <16 x i16> @interleave_v8i16(<8 x i16> %x, <8 x i16> %y) { +; V128-LABEL: interleave_v8i16: +; V128: # %bb.0: +; V128-NEXT: vsetivli zero, 16, e16, m1, ta, mu +; V128-NEXT: vwaddu.vv v10, v9, v8 +; V128-NEXT: li a0, -1 +; V128-NEXT: vwmaccu.vx v10, a0, v8 +; V128-NEXT: vmv2r.v v8, v10 +; V128-NEXT: ret +; +; V512-LABEL: interleave_v8i16: +; V512: # %bb.0: +; V512-NEXT: vsetivli zero, 16, e16, mf4, ta, mu +; V512-NEXT: vwaddu.vv v10, v9, v8 +; V512-NEXT: li a0, -1 +; V512-NEXT: vwmaccu.vx v10, a0, v8 +; V512-NEXT: vmv1r.v v8, v10 +; V512-NEXT: ret + %a = shufflevector <8 x i16> %x, <8 x i16> %y, <16 x i32> + ret <16 x i16> %a +} + +define <16 x i32> @interleave_v8i32(<8 x i32> %x, <8 x i32> %y) { +; V128-LABEL: interleave_v8i32: +; V128: # %bb.0: +; V128-NEXT: vsetivli zero, 16, e32, m2, ta, mu +; V128-NEXT: vwaddu.vv v12, v8, v10 +; V128-NEXT: li a0, -1 +; V128-NEXT: vwmaccu.vx v12, a0, v10 +; V128-NEXT: vmv4r.v v8, v12 +; V128-NEXT: ret +; +; V512-LABEL: interleave_v8i32: +; V512: # %bb.0: +; V512-NEXT: vsetivli zero, 16, e32, mf2, ta, mu +; V512-NEXT: vwaddu.vv v10, v8, v9 +; V512-NEXT: li a0, -1 +; V512-NEXT: vwmaccu.vx v10, a0, v9 +; V512-NEXT: vmv1r.v v8, v10 +; V512-NEXT: ret + %a = shufflevector <8 x i32> %x, <8 x i32> %y, <16 x i32> + ret <16 x i32> %a +} + +define <32 x i8> @interleave_v16i8(<16 x i8> %x, <16 x i8> %y) { +; V128-LABEL: interleave_v16i8: +; V128: # %bb.0: +; V128-NEXT: li a0, 32 +; V128-NEXT: vsetvli zero, a0, e8, m1, ta, mu +; V128-NEXT: vwaddu.vv v10, v8, v9 +; V128-NEXT: li a0, -1 +; V128-NEXT: vwmaccu.vx v10, a0, v9 +; V128-NEXT: vmv2r.v v8, v10 +; V128-NEXT: ret +; +; V512-LABEL: interleave_v16i8: +; V512: # %bb.0: +; V512-NEXT: li a0, 32 +; V512-NEXT: vsetvli zero, a0, e8, mf4, ta, mu +; V512-NEXT: vwaddu.vv v10, v8, v9 +; V512-NEXT: li a0, -1 +; V512-NEXT: vwmaccu.vx v10, a0, v9 +; V512-NEXT: vmv1r.v v8, v10 +; V512-NEXT: ret + %a = shufflevector <16 x i8> %x, <16 x i8> %y, <32 x i32> + ret <32 x i8> %a +} + +define <32 x i16> @interleave_v16i16(<16 x i16> %x, <16 x i16> %y) { +; V128-LABEL: interleave_v16i16: +; V128: # %bb.0: +; V128-NEXT: li a0, 32 +; V128-NEXT: vsetvli zero, a0, e16, m2, ta, mu +; V128-NEXT: vwaddu.vv v12, v8, v10 +; V128-NEXT: li a0, -1 +; V128-NEXT: vwmaccu.vx v12, a0, v10 +; V128-NEXT: vmv4r.v v8, v12 +; V128-NEXT: ret +; +; V512-LABEL: interleave_v16i16: +; V512: # %bb.0: +; V512-NEXT: li a0, 32 +; V512-NEXT: vsetvli zero, a0, e16, mf2, ta, mu +; V512-NEXT: vwaddu.vv v10, v8, v9 +; V512-NEXT: li a0, -1 +; V512-NEXT: vwmaccu.vx v10, a0, v9 +; V512-NEXT: vmv1r.v v8, v10 +; V512-NEXT: ret + %a = shufflevector <16 x i16> %x, <16 x i16> %y, <32 x i32> + ret <32 x i16> %a +} + +define <32 x i32> @interleave_v16i32(<16 x i32> %x, <16 x i32> %y) { +; V128-LABEL: interleave_v16i32: +; V128: # %bb.0: +; V128-NEXT: li a0, 32 +; V128-NEXT: vsetvli zero, a0, e32, m4, ta, mu +; V128-NEXT: vwaddu.vv v16, v8, v12 +; V128-NEXT: li a0, -1 +; V128-NEXT: vwmaccu.vx v16, a0, v12 +; V128-NEXT: vmv8r.v v8, v16 +; V128-NEXT: ret +; +; V512-LABEL: interleave_v16i32: +; V512: # %bb.0: +; V512-NEXT: li a0, 32 +; V512-NEXT: vsetvli zero, a0, e32, m1, ta, mu +; V512-NEXT: vwaddu.vv v10, v8, v9 +; V512-NEXT: li a0, -1 +; V512-NEXT: vwmaccu.vx v10, a0, v9 +; V512-NEXT: vmv2r.v v8, v10 +; V512-NEXT: ret + %a = shufflevector <16 x i32> %x, <16 x i32> %y, <32 x i32> + ret <32 x i32> %a +} + +define <64 x i8> @interleave_v32i8(<32 x i8> %x, <32 x i8> %y) { +; V128-LABEL: interleave_v32i8: +; V128: # %bb.0: +; V128-NEXT: li a0, 64 +; V128-NEXT: vsetvli zero, 
a0, e8, m2, ta, mu +; V128-NEXT: vwaddu.vv v12, v8, v10 +; V128-NEXT: li a0, -1 +; V128-NEXT: vwmaccu.vx v12, a0, v10 +; V128-NEXT: vmv4r.v v8, v12 +; V128-NEXT: ret +; +; V512-LABEL: interleave_v32i8: +; V512: # %bb.0: +; V512-NEXT: li a0, 64 +; V512-NEXT: vsetvli zero, a0, e8, mf2, ta, mu +; V512-NEXT: vwaddu.vv v10, v8, v9 +; V512-NEXT: li a0, -1 +; V512-NEXT: vwmaccu.vx v10, a0, v9 +; V512-NEXT: vmv1r.v v8, v10 +; V512-NEXT: ret + %a = shufflevector <32 x i8> %x, <32 x i8> %y, <64 x i32> + ret <64 x i8> %a +} + +define <64 x i16> @interleave_v32i16(<32 x i16> %x, <32 x i16> %y) { +; V128-LABEL: interleave_v32i16: +; V128: # %bb.0: +; V128-NEXT: li a0, 64 +; V128-NEXT: vsetvli zero, a0, e16, m4, ta, mu +; V128-NEXT: vwaddu.vv v16, v8, v12 +; V128-NEXT: li a0, -1 +; V128-NEXT: vwmaccu.vx v16, a0, v12 +; V128-NEXT: vmv8r.v v8, v16 +; V128-NEXT: ret +; +; V512-LABEL: interleave_v32i16: +; V512: # %bb.0: +; V512-NEXT: li a0, 64 +; V512-NEXT: vsetvli zero, a0, e16, m1, ta, mu +; V512-NEXT: vwaddu.vv v10, v8, v9 +; V512-NEXT: li a0, -1 +; V512-NEXT: vwmaccu.vx v10, a0, v9 +; V512-NEXT: vmv2r.v v8, v10 +; V512-NEXT: ret + %a = shufflevector <32 x i16> %x, <32 x i16> %y, <64 x i32> + ret <64 x i16> %a +} + +define <64 x i32> @interleave_v32i32(<32 x i32> %x, <32 x i32> %y) { +; RV32-V128-LABEL: interleave_v32i32: +; RV32-V128: # %bb.0: +; RV32-V128-NEXT: addi sp, sp, -16 +; RV32-V128-NEXT: .cfi_def_cfa_offset 16 +; RV32-V128-NEXT: csrr a0, vlenb +; RV32-V128-NEXT: slli a0, a0, 4 +; RV32-V128-NEXT: sub sp, sp, a0 +; RV32-V128-NEXT: lui a0, %hi(.LCPI15_0) +; RV32-V128-NEXT: addi a0, a0, %lo(.LCPI15_0) +; RV32-V128-NEXT: li a1, 32 +; RV32-V128-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; RV32-V128-NEXT: vle32.v v0, (a0) +; RV32-V128-NEXT: vmv8r.v v24, v8 +; RV32-V128-NEXT: addi a0, sp, 16 +; RV32-V128-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32-V128-NEXT: vrgather.vv v8, v24, v0 +; RV32-V128-NEXT: lui a0, %hi(.LCPI15_1) +; RV32-V128-NEXT: addi a0, a0, %lo(.LCPI15_1) +; RV32-V128-NEXT: vle32.v v24, (a0) +; RV32-V128-NEXT: csrr a0, vlenb +; RV32-V128-NEXT: slli a0, a0, 3 +; RV32-V128-NEXT: add a0, sp, a0 +; RV32-V128-NEXT: addi a0, a0, 16 +; RV32-V128-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; RV32-V128-NEXT: lui a0, 699051 +; RV32-V128-NEXT: addi a0, a0, -1366 +; RV32-V128-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-V128-NEXT: vmv.s.x v0, a0 +; RV32-V128-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; RV32-V128-NEXT: csrr a0, vlenb +; RV32-V128-NEXT: slli a0, a0, 3 +; RV32-V128-NEXT: add a0, sp, a0 +; RV32-V128-NEXT: addi a0, a0, 16 +; RV32-V128-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload +; RV32-V128-NEXT: vrgather.vv v8, v16, v24, v0.t +; RV32-V128-NEXT: vmv.v.v v24, v8 +; RV32-V128-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; RV32-V128-NEXT: addi a0, sp, 16 +; RV32-V128-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload +; RV32-V128-NEXT: vwaddu.vv v0, v8, v16 +; RV32-V128-NEXT: li a0, -1 +; RV32-V128-NEXT: vwmaccu.vx v0, a0, v16 +; RV32-V128-NEXT: vmv8r.v v8, v0 +; RV32-V128-NEXT: vmv8r.v v16, v24 +; RV32-V128-NEXT: csrr a0, vlenb +; RV32-V128-NEXT: slli a0, a0, 4 +; RV32-V128-NEXT: add sp, sp, a0 +; RV32-V128-NEXT: addi sp, sp, 16 +; RV32-V128-NEXT: ret +; +; RV64-V128-LABEL: interleave_v32i32: +; RV64-V128: # %bb.0: +; RV64-V128-NEXT: addi sp, sp, -16 +; RV64-V128-NEXT: .cfi_def_cfa_offset 16 +; RV64-V128-NEXT: csrr a0, vlenb +; RV64-V128-NEXT: slli a0, a0, 4 +; RV64-V128-NEXT: sub sp, sp, a0 +; RV64-V128-NEXT: lui a0, %hi(.LCPI15_0) +; RV64-V128-NEXT: addi 
a0, a0, %lo(.LCPI15_0) +; RV64-V128-NEXT: li a1, 32 +; RV64-V128-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; RV64-V128-NEXT: vle32.v v0, (a0) +; RV64-V128-NEXT: vmv8r.v v24, v8 +; RV64-V128-NEXT: addi a0, sp, 16 +; RV64-V128-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV64-V128-NEXT: vrgather.vv v8, v24, v0 +; RV64-V128-NEXT: lui a0, %hi(.LCPI15_1) +; RV64-V128-NEXT: addi a0, a0, %lo(.LCPI15_1) +; RV64-V128-NEXT: vle32.v v24, (a0) +; RV64-V128-NEXT: csrr a0, vlenb +; RV64-V128-NEXT: slli a0, a0, 3 +; RV64-V128-NEXT: add a0, sp, a0 +; RV64-V128-NEXT: addi a0, a0, 16 +; RV64-V128-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; RV64-V128-NEXT: lui a0, 699051 +; RV64-V128-NEXT: addiw a0, a0, -1366 +; RV64-V128-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV64-V128-NEXT: vmv.s.x v0, a0 +; RV64-V128-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; RV64-V128-NEXT: csrr a0, vlenb +; RV64-V128-NEXT: slli a0, a0, 3 +; RV64-V128-NEXT: add a0, sp, a0 +; RV64-V128-NEXT: addi a0, a0, 16 +; RV64-V128-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload +; RV64-V128-NEXT: vrgather.vv v8, v16, v24, v0.t +; RV64-V128-NEXT: vmv.v.v v24, v8 +; RV64-V128-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; RV64-V128-NEXT: addi a0, sp, 16 +; RV64-V128-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload +; RV64-V128-NEXT: vwaddu.vv v0, v8, v16 +; RV64-V128-NEXT: li a0, -1 +; RV64-V128-NEXT: vwmaccu.vx v0, a0, v16 +; RV64-V128-NEXT: vmv8r.v v8, v0 +; RV64-V128-NEXT: vmv8r.v v16, v24 +; RV64-V128-NEXT: csrr a0, vlenb +; RV64-V128-NEXT: slli a0, a0, 4 +; RV64-V128-NEXT: add sp, sp, a0 +; RV64-V128-NEXT: addi sp, sp, 16 +; RV64-V128-NEXT: ret +; +; V512-LABEL: interleave_v32i32: +; V512: # %bb.0: +; V512-NEXT: li a0, 64 +; V512-NEXT: vsetvli zero, a0, e32, m2, ta, mu +; V512-NEXT: vwaddu.vv v12, v8, v10 +; V512-NEXT: li a0, -1 +; V512-NEXT: vwmaccu.vx v12, a0, v10 +; V512-NEXT: vmv4r.v v8, v12 +; V512-NEXT: ret + %a = shufflevector <32 x i32> %x, <32 x i32> %y, <64 x i32> + ret <64 x i32> %a +} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll index 97ec81c..1b8f1d2 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll @@ -314,16 +314,13 @@ define <8 x i64> @vrgather_shuffle_vx_v8i64(<8 x i64> %x) { define <4 x i8> @interleave_shuffles(<4 x i8> %x) { ; CHECK-LABEL: interleave_shuffles: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 0, e8, mf4, ta, mu -; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, mu -; CHECK-NEXT: vrgather.vi v9, v8, 1 -; CHECK-NEXT: li a1, 10 -; CHECK-NEXT: vmv.s.x v0, a1 -; CHECK-NEXT: vid.v v8 -; CHECK-NEXT: vsrl.vi v10, v8, 1 -; CHECK-NEXT: vmv.v.x v8, a0 -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vrgather.vi v9, v8, 0 +; CHECK-NEXT: vrgather.vi v10, v8, 1 +; CHECK-NEXT: vsetivli zero, 4, e8, mf8, ta, mu +; CHECK-NEXT: vwaddu.vv v8, v9, v10 +; CHECK-NEXT: li a0, -1 +; CHECK-NEXT: vwmaccu.vx v8, a0, v10 ; CHECK-NEXT: ret %y = shufflevector <4 x i8> %x, <4 x i8> undef, <4 x i32> %z = shufflevector <4 x i8> %x, <4 x i8> undef, <4 x i32> -- 2.7.4
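
For reference, the arithmetic identity the new lowering relies on is easy to
check with a small standalone sketch (not part of the patch). It models the
vwaddu.vv + vwmaccu.vx sequence with scalar C++ for 8-bit elements; the values
and the little-endian host assumption (used so that memcpy mirrors the element
ordering of the vector bitcast back to the narrow type) are illustrative only.

// Scalar model of one wide lane:
//   zext(V1[i]) + zext(V2[i]) + zext(V2[i]) * (2^8 - 1) == zext(V1[i]) + (zext(V2[i]) << 8)
// so splitting each 16-bit lane back into bytes yields {V1[0], V2[0], V1[1], V2[1], ...}.
#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  const uint8_t V1[4] = {0x10, 0x11, 0x12, 0x13};
  const uint8_t V2[4] = {0x20, 0x21, 0x22, 0x23};

  uint16_t Wide[4];
  for (int i = 0; i < 4; ++i) {
    // vwaddu.vv: widening unsigned add of the two narrow sources.
    unsigned Add = unsigned(V1[i]) + unsigned(V2[i]);
    // vwmaccu.vx with an all-ones scalar: add (2^8 - 1) more copies of V2[i].
    Wide[i] = uint16_t(Add + unsigned(V2[i]) * 0xFFu);
  }

  // Bitcast the wide result back to the narrow element type.
  uint8_t Result[8];
  std::memcpy(Result, Wide, sizeof(Wide));

  for (int i = 0; i < 8; ++i)
    std::printf("0x%02x ", Result[i]); // 0x10 0x20 0x11 0x21 0x12 0x22 0x13 0x23
  std::printf("\n");
  return 0;
}

This is the same simplification described in the commit message: one vwaddu.vv
plus a vwmaccu.vx by -1 (all ones in the narrow element type) produces
zext(V1) + (zext(V2) << eltbits), which is what the CHECK lines above expect
for the i8/i16/i32 and f16/f32 interleave tests.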