From 42944abf8583fc6efae5bbc39f092bf884f6d17c Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Fri, 17 Feb 2023 10:00:40 -0800
Subject: [PATCH] [RISCV] Improve isInterleaveShuffle to handle interleaving the high half and low half of the same source.

This is needed to support the new interleave intrinsics from D141924 for fixed vectors.

I've reworked the core loop to operate in terms of half of a source, making 4 possible half sources. The first element of the half is used to indicate which source it came from, using the same numbering as the shuffle, where the second source's elements are numbered after the first source's.

I've added restrictions to only match the first halves of two vectors or the first and second halves of a single vector. This was done to prevent regressions on the cases we have coverage for. I saw cases where generic DAG combine split a single interleave into 2 smaller interleaves and a concat. We can revisit this in the future.

Reviewed By: reames

Differential Revision: https://reviews.llvm.org/D144143
---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 90 +++++----
 .../RISCV/rvv/fixed-vectors-fp-interleave.ll | 149 +++++++++++++++
 .../RISCV/rvv/fixed-vectors-int-interleave.ll | 201 +++++++++++++++++++++
 3 files changed, 404 insertions(+), 36 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 087dd3b..9f053b9 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -2901,13 +2901,20 @@ static SDValue lowerScalarInsert(SDValue Scalar, SDValue VL, } -static bool isInterleaveShuffle(ArrayRef Mask, MVT VT, bool &SwapSources, - const RISCVSubtarget &Subtarget) { +/// Is this shuffle interleaving contiguous elements from one vector into the +/// even elements and contiguous elements from another vector into the odd +/// elements. \p Src1 will contain the element that should be in the first even +/// element. \p Src2 will contain the element that should be in the first odd +/// element. These can be the first element in a source or the element half +/// way through the source. +static bool isInterleaveShuffle(ArrayRef Mask, MVT VT, int &EvenSrc, + int &OddSrc, const RISCVSubtarget &Subtarget) { // We need to be able to widen elements to the next larger integer type. if (VT.getScalarSizeInBits() >= Subtarget.getELEN()) return false; int Size = Mask.size(); + int HalfSize = Size / 2; assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size"); int Srcs[] = {-1, -1};
@@ -2919,8 +2926,8 @@ static bool isInterleaveShuffle(ArrayRef Mask, MVT VT, bool &SwapSources, // Is this an even or odd element. int Pol = i % 2; - // Ensure we consistently use the same source for this element polarity. - int Src = Mask[i] / Size; + // Ensure we consistently use the same half source for this polarity. + int Src = alignDown(Mask[i], HalfSize); if (Srcs[Pol] < 0) Srcs[Pol] = Src; if (Srcs[Pol] != Src)
@@ -2928,17 +2935,24 @@ static bool isInterleaveShuffle(ArrayRef Mask, MVT VT, bool &SwapSources, // Make sure the element within the source is appropriate for this element // in the destination. - int Elt = Mask[i] % Size; + int Elt = Mask[i] % HalfSize; if (Elt != i / 2) return false; } - // We need to find a source for each polarity and they can't be the same. - if (Srcs[0] < 0 || Srcs[1] < 0 || Srcs[0] == Srcs[1]) + // One source should be low half of first vector.
+ if (Srcs[0] != 0 && Srcs[1] != 0) + return false; + + // Other source should be the upper half of the first source or the lower + // half of the second source. + // FIXME: This is only a heuristic to avoid regressions. + if (Srcs[0] != HalfSize && Srcs[0] != Size && Srcs[1] != HalfSize && + Srcs[1] != Size) return false; - // Swap the sources if the second source was in the even polarity. - SwapSources = Srcs[0] > Srcs[1]; + EvenSrc = Srcs[0]; + OddSrc = Srcs[1]; return true; } @@ -3338,18 +3352,22 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, // Detect an interleave shuffle and lower to // (vmaccu.vx (vwaddu.vx lohalf(V1), lohalf(V2)), lohalf(V2), (2^eltbits - 1)) - bool SwapSources; - if (isInterleaveShuffle(Mask, VT, SwapSources, Subtarget)) { - // Swap sources if needed. - if (SwapSources) - std::swap(V1, V2); - - // Extract the lower half of the vectors. + int EvenSrc, OddSrc; + if (isInterleaveShuffle(Mask, VT, EvenSrc, OddSrc, Subtarget)) { + // Extract the halves of the vectors. MVT HalfVT = VT.getHalfNumVectorElementsVT(); - V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1, - DAG.getConstant(0, DL, XLenVT)); - V2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V2, - DAG.getConstant(0, DL, XLenVT)); + + int Size = Mask.size(); + SDValue EvenV, OddV; + assert(EvenSrc >= 0 && "Undef source?"); + EvenV = (EvenSrc / Size) == 0 ? V1 : V2; + EvenV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, EvenV, + DAG.getConstant(EvenSrc % Size, DL, XLenVT)); + + assert(OddSrc >= 0 && "Undef source?"); + OddV = (OddSrc / Size) == 0 ? V1 : V2; + OddV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, OddV, + DAG.getConstant(OddSrc % Size, DL, XLenVT)); // Double the element width and halve the number of elements in an int type. unsigned EltBits = VT.getScalarSizeInBits(); @@ -3365,36 +3383,37 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, // larger type. MVT HalfContainerVT = MVT::getVectorVT( VT.getVectorElementType(), WideIntContainerVT.getVectorElementCount()); - V1 = convertToScalableVector(HalfContainerVT, V1, DAG, Subtarget); - V2 = convertToScalableVector(HalfContainerVT, V2, DAG, Subtarget); + EvenV = convertToScalableVector(HalfContainerVT, EvenV, DAG, Subtarget); + OddV = convertToScalableVector(HalfContainerVT, OddV, DAG, Subtarget); // Cast sources to integer. MVT IntEltVT = MVT::getIntegerVT(EltBits); MVT IntHalfVT = MVT::getVectorVT(IntEltVT, HalfContainerVT.getVectorElementCount()); - V1 = DAG.getBitcast(IntHalfVT, V1); - V2 = DAG.getBitcast(IntHalfVT, V2); + EvenV = DAG.getBitcast(IntHalfVT, EvenV); + OddV = DAG.getBitcast(IntHalfVT, OddV); - // Freeze V2 since we use it twice and we need to be sure that the add and + // Freeze OddV since we use it twice and we need to be sure that the add and // multiply see the same value. - V2 = DAG.getFreeze(V2); + OddV = DAG.getFreeze(OddV); // Recreate TrueMask using the widened type's element count. TrueMask = getAllOnesMask(HalfContainerVT, VL, DL, DAG); - // Widen V1 and V2 with 0s and add one copy of V2 to V1. + // Widen EvenV and OddV with 0s and add one copy of OddV to EvenV. SDValue Add = - DAG.getNode(RISCVISD::VWADDU_VL, DL, WideIntContainerVT, V1, V2, + DAG.getNode(RISCVISD::VWADDU_VL, DL, WideIntContainerVT, EvenV, OddV, DAG.getUNDEF(WideIntContainerVT), TrueMask, VL); - // Create 2^eltbits - 1 copies of V2 by multiplying by the largest integer. + // Create 2^eltbits - 1 copies of OddV by multiplying by the largest + // integer. 
SDValue Multiplier = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, IntHalfVT, DAG.getUNDEF(IntHalfVT), DAG.getAllOnesConstant(DL, XLenVT), VL); SDValue WidenMul = - DAG.getNode(RISCVISD::VWMULU_VL, DL, WideIntContainerVT, V2, Multiplier, - DAG.getUNDEF(WideIntContainerVT), TrueMask, VL); + DAG.getNode(RISCVISD::VWMULU_VL, DL, WideIntContainerVT, OddV, + Multiplier, DAG.getUNDEF(WideIntContainerVT), TrueMask, VL); // Add the new copies to our previous addition giving us 2^eltbits copies of - // V2. This is equivalent to shifting V2 left by eltbits. This should + // OddV. This is equivalent to shifting OddV left by eltbits. This should // combine with the vwmulu.vv above to form vwmaccu.vv. Add = DAG.getNode(RISCVISD::ADD_VL, DL, WideIntContainerVT, Add, WidenMul, DAG.getUNDEF(WideIntContainerVT), TrueMask, VL); @@ -3555,10 +3574,9 @@ bool RISCVTargetLowering::isShuffleMaskLegal(ArrayRef M, EVT VT) const { MVT SVT = VT.getSimpleVT(); - bool SwapSources; - int LoSrc, HiSrc; - return (isElementRotate(LoSrc, HiSrc, M) > 0) || - isInterleaveShuffle(M, SVT, SwapSources, Subtarget); + int Dummy1, Dummy2; + return (isElementRotate(Dummy1, Dummy2, M) > 0) || + isInterleaveShuffle(M, SVT, Dummy1, Dummy2, Subtarget); } // Lower CTLZ_ZERO_UNDEF or CTTZ_ZERO_UNDEF by converting to FP and extracting diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll index 334a3d2..b9050ae 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll @@ -370,3 +370,152 @@ define <64 x float> @interleave_v32f32(<32 x float> %x, <32 x float> %y) { %a = shufflevector <32 x float> %x, <32 x float> %y, <64 x i32> ret <64 x float> %a } + +define <4 x half> @unary_interleave_v4f16(<4 x half> %x) { +; V128-LABEL: unary_interleave_v4f16: +; V128: # %bb.0: +; V128-NEXT: vsetivli zero, 2, e16, mf2, ta, ma +; V128-NEXT: vslidedown.vi v10, v8, 2 +; V128-NEXT: vsetivli zero, 4, e16, mf4, ta, ma +; V128-NEXT: vwaddu.vv v9, v8, v10 +; V128-NEXT: li a0, -1 +; V128-NEXT: vwmaccu.vx v9, a0, v10 +; V128-NEXT: vmv1r.v v8, v9 +; V128-NEXT: ret +; +; V512-LABEL: unary_interleave_v4f16: +; V512: # %bb.0: +; V512-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; V512-NEXT: vslidedown.vi v10, v8, 2 +; V512-NEXT: vsetivli zero, 4, e16, mf4, ta, ma +; V512-NEXT: vwaddu.vv v9, v8, v10 +; V512-NEXT: li a0, -1 +; V512-NEXT: vwmaccu.vx v9, a0, v10 +; V512-NEXT: vmv1r.v v8, v9 +; V512-NEXT: ret + %a = shufflevector <4 x half> %x, <4 x half> poison, <4 x i32> + ret <4 x half> %a +} + +define <4 x float> @unary_interleave_v4f32(<4 x float> %x) { +; V128-LABEL: unary_interleave_v4f32: +; V128: # %bb.0: +; V128-NEXT: vsetivli zero, 2, e32, m1, ta, ma +; V128-NEXT: vslidedown.vi v10, v8, 2 +; V128-NEXT: vsetivli zero, 4, e32, mf2, ta, ma +; V128-NEXT: vwaddu.vv v9, v8, v10 +; V128-NEXT: li a0, -1 +; V128-NEXT: vwmaccu.vx v9, a0, v10 +; V128-NEXT: vmv1r.v v8, v9 +; V128-NEXT: ret +; +; V512-LABEL: unary_interleave_v4f32: +; V512: # %bb.0: +; V512-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; V512-NEXT: vslidedown.vi v10, v8, 2 +; V512-NEXT: vsetivli zero, 4, e32, mf2, ta, ma +; V512-NEXT: vwaddu.vv v9, v8, v10 +; V512-NEXT: li a0, -1 +; V512-NEXT: vwmaccu.vx v9, a0, v10 +; V512-NEXT: vmv1r.v v8, v9 +; V512-NEXT: ret + %a = shufflevector <4 x float> %x, <4 x float> poison, <4 x i32> + ret <4 x float> %a +} + +; FIXME: Is there better codegen we can do here? 
+define <4 x double> @unary_interleave_v4f64(<4 x double> %x) { +; RV32-V128-LABEL: unary_interleave_v4f64: +; RV32-V128: # %bb.0: +; RV32-V128-NEXT: lui a0, %hi(.LCPI13_0) +; RV32-V128-NEXT: addi a0, a0, %lo(.LCPI13_0) +; RV32-V128-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-V128-NEXT: vle16.v v12, (a0) +; RV32-V128-NEXT: vrgatherei16.vv v10, v8, v12 +; RV32-V128-NEXT: vmv.v.v v8, v10 +; RV32-V128-NEXT: ret +; +; RV64-V128-LABEL: unary_interleave_v4f64: +; RV64-V128: # %bb.0: +; RV64-V128-NEXT: lui a0, %hi(.LCPI13_0) +; RV64-V128-NEXT: addi a0, a0, %lo(.LCPI13_0) +; RV64-V128-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV64-V128-NEXT: vle64.v v12, (a0) +; RV64-V128-NEXT: vrgather.vv v10, v8, v12 +; RV64-V128-NEXT: vmv.v.v v8, v10 +; RV64-V128-NEXT: ret +; +; RV32-V512-LABEL: unary_interleave_v4f64: +; RV32-V512: # %bb.0: +; RV32-V512-NEXT: lui a0, %hi(.LCPI13_0) +; RV32-V512-NEXT: addi a0, a0, %lo(.LCPI13_0) +; RV32-V512-NEXT: vsetivli zero, 4, e64, m1, ta, ma +; RV32-V512-NEXT: vle16.v v10, (a0) +; RV32-V512-NEXT: vrgatherei16.vv v9, v8, v10 +; RV32-V512-NEXT: vmv.v.v v8, v9 +; RV32-V512-NEXT: ret +; +; RV64-V512-LABEL: unary_interleave_v4f64: +; RV64-V512: # %bb.0: +; RV64-V512-NEXT: lui a0, %hi(.LCPI13_0) +; RV64-V512-NEXT: addi a0, a0, %lo(.LCPI13_0) +; RV64-V512-NEXT: vsetivli zero, 4, e64, m1, ta, ma +; RV64-V512-NEXT: vle64.v v10, (a0) +; RV64-V512-NEXT: vrgather.vv v9, v8, v10 +; RV64-V512-NEXT: vmv.v.v v8, v9 +; RV64-V512-NEXT: ret + %a = shufflevector <4 x double> %x, <4 x double> poison, <4 x i32> + ret <4 x double> %a +} + +define <8 x half> @unary_interleave_v8f16(<8 x half> %x) { +; V128-LABEL: unary_interleave_v8f16: +; V128: # %bb.0: +; V128-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; V128-NEXT: vslidedown.vi v10, v8, 4 +; V128-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; V128-NEXT: vwaddu.vv v9, v8, v10 +; V128-NEXT: li a0, -1 +; V128-NEXT: vwmaccu.vx v9, a0, v10 +; V128-NEXT: vmv1r.v v8, v9 +; V128-NEXT: ret +; +; V512-LABEL: unary_interleave_v8f16: +; V512: # %bb.0: +; V512-NEXT: vsetivli zero, 4, e16, mf4, ta, ma +; V512-NEXT: vslidedown.vi v10, v8, 4 +; V512-NEXT: vsetivli zero, 8, e16, mf4, ta, ma +; V512-NEXT: vwaddu.vv v9, v8, v10 +; V512-NEXT: li a0, -1 +; V512-NEXT: vwmaccu.vx v9, a0, v10 +; V512-NEXT: vmv1r.v v8, v9 +; V512-NEXT: ret + %a = shufflevector <8 x half> %x, <8 x half> poison, <8 x i32> + ret <8 x half> %a +} + +define <8 x float> @unary_interleave_v8f32(<8 x float> %x) { +; V128-LABEL: unary_interleave_v8f32: +; V128: # %bb.0: +; V128-NEXT: vsetivli zero, 4, e32, m2, ta, ma +; V128-NEXT: vslidedown.vi v12, v8, 4 +; V128-NEXT: vsetivli zero, 8, e32, m1, ta, ma +; V128-NEXT: vwaddu.vv v10, v12, v8 +; V128-NEXT: li a0, -1 +; V128-NEXT: vwmaccu.vx v10, a0, v8 +; V128-NEXT: vmv2r.v v8, v10 +; V128-NEXT: ret +; +; V512-LABEL: unary_interleave_v8f32: +; V512: # %bb.0: +; V512-NEXT: vsetivli zero, 4, e32, mf2, ta, ma +; V512-NEXT: vslidedown.vi v10, v8, 4 +; V512-NEXT: vsetivli zero, 8, e32, mf2, ta, ma +; V512-NEXT: vwaddu.vv v9, v10, v8 +; V512-NEXT: li a0, -1 +; V512-NEXT: vwmaccu.vx v9, a0, v8 +; V512-NEXT: vmv1r.v v8, v9 +; V512-NEXT: ret + %a = shufflevector <8 x float> %x, <8 x float> poison, <8 x i32> + ret <8 x float> %a +} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll index 24e7c15..c148faf 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll @@ -476,3 +476,204 @@ define <64 
x i32> @interleave_v32i32(<32 x i32> %x, <32 x i32> %y) { %a = shufflevector <32 x i32> %x, <32 x i32> %y, <64 x i32> ret <64 x i32> %a } + +define <4 x i8> @unary_interleave_v4i8(<4 x i8> %x) { +; V128-LABEL: unary_interleave_v4i8: +; V128: # %bb.0: +; V128-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; V128-NEXT: vslidedown.vi v10, v8, 2 +; V128-NEXT: vsetivli zero, 4, e8, mf8, ta, ma +; V128-NEXT: vwaddu.vv v9, v8, v10 +; V128-NEXT: li a0, -1 +; V128-NEXT: vwmaccu.vx v9, a0, v10 +; V128-NEXT: vmv1r.v v8, v9 +; V128-NEXT: ret +; +; V512-LABEL: unary_interleave_v4i8: +; V512: # %bb.0: +; V512-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; V512-NEXT: vslidedown.vi v10, v8, 2 +; V512-NEXT: vsetivli zero, 4, e8, mf8, ta, ma +; V512-NEXT: vwaddu.vv v9, v8, v10 +; V512-NEXT: li a0, -1 +; V512-NEXT: vwmaccu.vx v9, a0, v10 +; V512-NEXT: vmv1r.v v8, v9 +; V512-NEXT: ret + %a = shufflevector <4 x i8> %x, <4 x i8> poison, <4 x i32> + ret <4 x i8> %a +} + +define <4 x i16> @unary_interleave_v4i16(<4 x i16> %x) { +; V128-LABEL: unary_interleave_v4i16: +; V128: # %bb.0: +; V128-NEXT: vsetivli zero, 2, e16, mf2, ta, ma +; V128-NEXT: vslidedown.vi v10, v8, 2 +; V128-NEXT: vsetivli zero, 4, e16, mf4, ta, ma +; V128-NEXT: vwaddu.vv v9, v8, v10 +; V128-NEXT: li a0, -1 +; V128-NEXT: vwmaccu.vx v9, a0, v10 +; V128-NEXT: vmv1r.v v8, v9 +; V128-NEXT: ret +; +; V512-LABEL: unary_interleave_v4i16: +; V512: # %bb.0: +; V512-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; V512-NEXT: vslidedown.vi v10, v8, 2 +; V512-NEXT: vsetivli zero, 4, e16, mf4, ta, ma +; V512-NEXT: vwaddu.vv v9, v8, v10 +; V512-NEXT: li a0, -1 +; V512-NEXT: vwmaccu.vx v9, a0, v10 +; V512-NEXT: vmv1r.v v8, v9 +; V512-NEXT: ret + %a = shufflevector <4 x i16> %x, <4 x i16> poison, <4 x i32> + ret <4 x i16> %a +} + +define <4 x i32> @unary_interleave_v4i32(<4 x i32> %x) { +; V128-LABEL: unary_interleave_v4i32: +; V128: # %bb.0: +; V128-NEXT: vsetivli zero, 2, e32, m1, ta, ma +; V128-NEXT: vslidedown.vi v10, v8, 2 +; V128-NEXT: vsetivli zero, 4, e32, mf2, ta, ma +; V128-NEXT: vwaddu.vv v9, v8, v10 +; V128-NEXT: li a0, -1 +; V128-NEXT: vwmaccu.vx v9, a0, v10 +; V128-NEXT: vmv1r.v v8, v9 +; V128-NEXT: ret +; +; V512-LABEL: unary_interleave_v4i32: +; V512: # %bb.0: +; V512-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; V512-NEXT: vslidedown.vi v10, v8, 2 +; V512-NEXT: vsetivli zero, 4, e32, mf2, ta, ma +; V512-NEXT: vwaddu.vv v9, v8, v10 +; V512-NEXT: li a0, -1 +; V512-NEXT: vwmaccu.vx v9, a0, v10 +; V512-NEXT: vmv1r.v v8, v9 +; V512-NEXT: ret + %a = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> + ret <4 x i32> %a +} + +; FIXME: Is there better codegen we can do here? 
+define <4 x i64> @unary_interleave_v4i64(<4 x i64> %x) { +; RV32-V128-LABEL: unary_interleave_v4i64: +; RV32-V128: # %bb.0: +; RV32-V128-NEXT: lui a0, %hi(.LCPI19_0) +; RV32-V128-NEXT: addi a0, a0, %lo(.LCPI19_0) +; RV32-V128-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-V128-NEXT: vle16.v v12, (a0) +; RV32-V128-NEXT: vrgatherei16.vv v10, v8, v12 +; RV32-V128-NEXT: vmv.v.v v8, v10 +; RV32-V128-NEXT: ret +; +; RV64-V128-LABEL: unary_interleave_v4i64: +; RV64-V128: # %bb.0: +; RV64-V128-NEXT: lui a0, %hi(.LCPI19_0) +; RV64-V128-NEXT: addi a0, a0, %lo(.LCPI19_0) +; RV64-V128-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV64-V128-NEXT: vle64.v v12, (a0) +; RV64-V128-NEXT: vrgather.vv v10, v8, v12 +; RV64-V128-NEXT: vmv.v.v v8, v10 +; RV64-V128-NEXT: ret +; +; RV32-V512-LABEL: unary_interleave_v4i64: +; RV32-V512: # %bb.0: +; RV32-V512-NEXT: lui a0, %hi(.LCPI19_0) +; RV32-V512-NEXT: addi a0, a0, %lo(.LCPI19_0) +; RV32-V512-NEXT: vsetivli zero, 4, e64, m1, ta, ma +; RV32-V512-NEXT: vle16.v v10, (a0) +; RV32-V512-NEXT: vrgatherei16.vv v9, v8, v10 +; RV32-V512-NEXT: vmv.v.v v8, v9 +; RV32-V512-NEXT: ret +; +; RV64-V512-LABEL: unary_interleave_v4i64: +; RV64-V512: # %bb.0: +; RV64-V512-NEXT: lui a0, %hi(.LCPI19_0) +; RV64-V512-NEXT: addi a0, a0, %lo(.LCPI19_0) +; RV64-V512-NEXT: vsetivli zero, 4, e64, m1, ta, ma +; RV64-V512-NEXT: vle64.v v10, (a0) +; RV64-V512-NEXT: vrgather.vv v9, v8, v10 +; RV64-V512-NEXT: vmv.v.v v8, v9 +; RV64-V512-NEXT: ret + %a = shufflevector <4 x i64> %x, <4 x i64> poison, <4 x i32> + ret <4 x i64> %a +} + +define <8 x i8> @unary_interleave_v8i8(<8 x i8> %x) { +; V128-LABEL: unary_interleave_v8i8: +; V128: # %bb.0: +; V128-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; V128-NEXT: vslidedown.vi v10, v8, 4 +; V128-NEXT: vsetivli zero, 8, e8, mf4, ta, ma +; V128-NEXT: vwaddu.vv v9, v8, v10 +; V128-NEXT: li a0, -1 +; V128-NEXT: vwmaccu.vx v9, a0, v10 +; V128-NEXT: vmv1r.v v8, v9 +; V128-NEXT: ret +; +; V512-LABEL: unary_interleave_v8i8: +; V512: # %bb.0: +; V512-NEXT: vsetivli zero, 4, e8, mf8, ta, ma +; V512-NEXT: vslidedown.vi v10, v8, 4 +; V512-NEXT: vsetivli zero, 8, e8, mf8, ta, ma +; V512-NEXT: vwaddu.vv v9, v8, v10 +; V512-NEXT: li a0, -1 +; V512-NEXT: vwmaccu.vx v9, a0, v10 +; V512-NEXT: vmv1r.v v8, v9 +; V512-NEXT: ret + %a = shufflevector <8 x i8> %x, <8 x i8> poison, <8 x i32> + ret <8 x i8> %a +} + +define <8 x i16> @unary_interleave_v8i16(<8 x i16> %x) { +; V128-LABEL: unary_interleave_v8i16: +; V128: # %bb.0: +; V128-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; V128-NEXT: vslidedown.vi v10, v8, 4 +; V128-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; V128-NEXT: vwaddu.vv v9, v10, v8 +; V128-NEXT: li a0, -1 +; V128-NEXT: vwmaccu.vx v9, a0, v8 +; V128-NEXT: vmv1r.v v8, v9 +; V128-NEXT: ret +; +; V512-LABEL: unary_interleave_v8i16: +; V512: # %bb.0: +; V512-NEXT: vsetivli zero, 4, e16, mf4, ta, ma +; V512-NEXT: vslidedown.vi v10, v8, 4 +; V512-NEXT: vsetivli zero, 8, e16, mf4, ta, ma +; V512-NEXT: vwaddu.vv v9, v10, v8 +; V512-NEXT: li a0, -1 +; V512-NEXT: vwmaccu.vx v9, a0, v8 +; V512-NEXT: vmv1r.v v8, v9 +; V512-NEXT: ret + %a = shufflevector <8 x i16> %x, <8 x i16> poison, <8 x i32> + ret <8 x i16> %a +} + +define <8 x i32> @unary_interleave_v8i32(<8 x i32> %x) { +; V128-LABEL: unary_interleave_v8i32: +; V128: # %bb.0: +; V128-NEXT: vsetivli zero, 4, e32, m2, ta, ma +; V128-NEXT: vslidedown.vi v12, v8, 4 +; V128-NEXT: vsetivli zero, 8, e32, m1, ta, ma +; V128-NEXT: vwaddu.vv v10, v8, v12 +; V128-NEXT: li a0, -1 +; V128-NEXT: vwmaccu.vx v10, a0, v12 +; V128-NEXT: vmv2r.v 
v8, v10 +; V128-NEXT: ret +; +; V512-LABEL: unary_interleave_v8i32: +; V512: # %bb.0: +; V512-NEXT: vsetivli zero, 4, e32, mf2, ta, ma +; V512-NEXT: vslidedown.vi v10, v8, 4 +; V512-NEXT: vsetivli zero, 8, e32, mf2, ta, ma +; V512-NEXT: vwaddu.vv v9, v8, v10 +; V512-NEXT: li a0, -1 +; V512-NEXT: vwmaccu.vx v9, a0, v10 +; V512-NEXT: vmv1r.v v8, v9 +; V512-NEXT: ret + %a = shufflevector <8 x i32> %x, <8 x i32> poison, <8 x i32> + ret <8 x i32> %a +} -- 2.7.4
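
The lowering in this patch builds the interleave arithmetically: it widen-adds the even-element half to the odd-element half (vwaddu.vv) and then accumulates (2^eltbits - 1) more copies of the odd half (vwmaccu.vx with an all-ones scalar), so each widened lane ends up holding EvenV[i] + 2^eltbits * OddV[i], i.e. the even element in its low half and the odd element in its high half. Below is a minimal standalone C++ sketch of that identity only; it is not code from the patch, and the 8-bit element width and sample values are arbitrary choices for illustration.

#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  // Stand-ins for the two extracted half-vectors (values are made up).
  const std::vector<uint8_t> EvenV = {1, 2, 3, 4};
  const std::vector<uint8_t> OddV = {10, 20, 30, 40};

  for (size_t i = 0; i < EvenV.size(); ++i) {
    // vwaddu.vv: widening unsigned add of the 8-bit elements into 16 bits.
    uint16_t Wide = uint16_t(EvenV[i]) + uint16_t(OddV[i]);
    // vwmaccu.vx with the all-ones scalar: add (2^8 - 1) * OddV[i], leaving
    // Wide == EvenV[i] + 256 * OddV[i].
    Wide = uint16_t(Wide + 255u * OddV[i]);
    // Viewed as two 8-bit lanes, the wide lane is the interleaved pair: the
    // even element sits in the low half, the odd element in the high half.
    assert((Wide & 0xFF) == EvenV[i]);
    assert((Wide >> 8) == OddV[i]);
  }
  return 0;
}

This is why the lowering only has to extract the two half-vectors, freeze OddV (it feeds both the add and the multiply), and rely on the vwaddu.vv/vwmulu.vv pair combining into vwmaccu.vv, as the in-code comments describe.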