From 42944abf8583fc6efae5bbc39f092bf884f6d17c Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Fri, 17 Feb 2023 10:00:40 -0800
Subject: [PATCH] [RISCV] Improve isInterleaveShuffle to handle interleaving the high half and low half of the same source.

This is needed to support the new interleave intrinsics from D141924 for fixed vectors.

I've reworked the core loop to operate in terms of half of a source, making 4 possible half sources. The first element of the half is used to indicate which source it came from, using the same numbering as the shuffle, where the second source's elements are numbered after the first source's.

I've added restrictions to only match the first halves of two vectors or the first and second halves of a single vector. This was done to prevent regressions on the cases we have coverage for. I saw cases where generic DAG combine split a single interleave into 2 smaller interleaves and a concat. We can revisit this in the future.

Reviewed By: reames

Differential Revision: https://reviews.llvm.org/D144143
---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 90 +++++----
 .../RISCV/rvv/fixed-vectors-fp-interleave.ll | 149 +++++++++++++++
 .../RISCV/rvv/fixed-vectors-int-interleave.ll | 201 +++++++++++++++++++++
 3 files changed, 404 insertions(+), 36 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 087dd3b..9f053b9 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -2901,13 +2901,20 @@ static SDValue lowerScalarInsert(SDValue Scalar, SDValue VL, } -static bool isInterleaveShuffle(ArrayRef Mask, MVT VT, bool &SwapSources, - const RISCVSubtarget &Subtarget) { +/// Is this shuffle interleaving contiguous elements from one vector into the +/// even elements and contiguous elements from another vector into the odd +/// elements. \p Src1 will contain the element that should be in the first even +/// element. \p Src2 will contain the element that should be in the first odd +/// element. These can be the first element in a source or the element half +/// way through the source. +static bool isInterleaveShuffle(ArrayRef Mask, MVT VT, int &EvenSrc, + int &OddSrc, const RISCVSubtarget &Subtarget) { // We need to be able to widen elements to the next larger integer type. if (VT.getScalarSizeInBits() >= Subtarget.getELEN()) return false; int Size = Mask.size(); + int HalfSize = Size / 2; assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size"); int Srcs[] = {-1, -1};
@@ -2919,8 +2926,8 @@ static bool isInterleaveShuffle(ArrayRef Mask, MVT VT, bool &SwapSources, // Is this an even or odd element. int Pol = i % 2; - // Ensure we consistently use the same source for this element polarity. - int Src = Mask[i] / Size; + // Ensure we consistently use the same half source for this polarity. + int Src = alignDown(Mask[i], HalfSize); if (Srcs[Pol] < 0) Srcs[Pol] = Src; if (Srcs[Pol] != Src)
@@ -2928,17 +2935,24 @@ static bool isInterleaveShuffle(ArrayRef Mask, MVT VT, bool &SwapSources, // Make sure the element within the source is appropriate for this element // in the destination. - int Elt = Mask[i] % Size; + int Elt = Mask[i] % HalfSize; if (Elt != i / 2) return false; } - // We need to find a source for each polarity and they can't be the same. - if (Srcs[0] < 0 || Srcs[1] < 0 || Srcs[0] == Srcs[1]) + // One source should be low half of first vector.
+ if (Srcs[0] != 0 && Srcs[1] != 0) + return false; + + // Other source should be the upper half of the first source or the lower + // half of the second source. + // FIXME: This is only a heuristic to avoid regressions. + if (Srcs[0] != HalfSize && Srcs[0] != Size && Srcs[1] != HalfSize && + Srcs[1] != Size) return false; - // Swap the sources if the second source was in the even polarity. - SwapSources = Srcs[0] > Srcs[1]; + EvenSrc = Srcs[0]; + OddSrc = Srcs[1]; return true; } @@ -3338,18 +3352,22 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, // Detect an interleave shuffle and lower to // (vmaccu.vx (vwaddu.vx lohalf(V1), lohalf(V2)), lohalf(V2), (2^eltbits - 1)) - bool SwapSources; - if (isInterleaveShuffle(Mask, VT, SwapSources, Subtarget)) { - // Swap sources if needed. - if (SwapSources) - std::swap(V1, V2); - - // Extract the lower half of the vectors. + int EvenSrc, OddSrc; + if (isInterleaveShuffle(Mask, VT, EvenSrc, OddSrc, Subtarget)) { + // Extract the halves of the vectors. MVT HalfVT = VT.getHalfNumVectorElementsVT(); - V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1, - DAG.getConstant(0, DL, XLenVT)); - V2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V2, - DAG.getConstant(0, DL, XLenVT)); + + int Size = Mask.size(); + SDValue EvenV, OddV; + assert(EvenSrc >= 0 && "Undef source?"); + EvenV = (EvenSrc / Size) == 0 ? V1 : V2; + EvenV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, EvenV, + DAG.getConstant(EvenSrc % Size, DL, XLenVT)); + + assert(OddSrc >= 0 && "Undef source?"); + OddV = (OddSrc / Size) == 0 ? V1 : V2; + OddV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, OddV, + DAG.getConstant(OddSrc % Size, DL, XLenVT)); // Double the element width and halve the number of elements in an int type. unsigned EltBits = VT.getScalarSizeInBits(); @@ -3365,36 +3383,37 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, // larger type. MVT HalfContainerVT = MVT::getVectorVT( VT.getVectorElementType(), WideIntContainerVT.getVectorElementCount()); - V1 = convertToScalableVector(HalfContainerVT, V1, DAG, Subtarget); - V2 = convertToScalableVector(HalfContainerVT, V2, DAG, Subtarget); + EvenV = convertToScalableVector(HalfContainerVT, EvenV, DAG, Subtarget); + OddV = convertToScalableVector(HalfContainerVT, OddV, DAG, Subtarget); // Cast sources to integer. MVT IntEltVT = MVT::getIntegerVT(EltBits); MVT IntHalfVT = MVT::getVectorVT(IntEltVT, HalfContainerVT.getVectorElementCount()); - V1 = DAG.getBitcast(IntHalfVT, V1); - V2 = DAG.getBitcast(IntHalfVT, V2); + EvenV = DAG.getBitcast(IntHalfVT, EvenV); + OddV = DAG.getBitcast(IntHalfVT, OddV); - // Freeze V2 since we use it twice and we need to be sure that the add and + // Freeze OddV since we use it twice and we need to be sure that the add and // multiply see the same value. - V2 = DAG.getFreeze(V2); + OddV = DAG.getFreeze(OddV); // Recreate TrueMask using the widened type's element count. TrueMask = getAllOnesMask(HalfContainerVT, VL, DL, DAG); - // Widen V1 and V2 with 0s and add one copy of V2 to V1. + // Widen EvenV and OddV with 0s and add one copy of OddV to EvenV. SDValue Add = - DAG.getNode(RISCVISD::VWADDU_VL, DL, WideIntContainerVT, V1, V2, + DAG.getNode(RISCVISD::VWADDU_VL, DL, WideIntContainerVT, EvenV, OddV, DAG.getUNDEF(WideIntContainerVT), TrueMask, VL); - // Create 2^eltbits - 1 copies of V2 by multiplying by the largest integer. + // Create 2^eltbits - 1 copies of OddV by multiplying by the largest + // integer. 
SDValue Multiplier = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, IntHalfVT, DAG.getUNDEF(IntHalfVT), DAG.getAllOnesConstant(DL, XLenVT), VL); SDValue WidenMul = - DAG.getNode(RISCVISD::VWMULU_VL, DL, WideIntContainerVT, V2, Multiplier, - DAG.getUNDEF(WideIntContainerVT), TrueMask, VL); + DAG.getNode(RISCVISD::VWMULU_VL, DL, WideIntContainerVT, OddV, + Multiplier, DAG.getUNDEF(WideIntContainerVT), TrueMask, VL); // Add the new copies to our previous addition giving us 2^eltbits copies of - // V2. This is equivalent to shifting V2 left by eltbits. This should + // OddV. This is equivalent to shifting OddV left by eltbits. This should // combine with the vwmulu.vv above to form vwmaccu.vv. Add = DAG.getNode(RISCVISD::ADD_VL, DL, WideIntContainerVT, Add, WidenMul, DAG.getUNDEF(WideIntContainerVT), TrueMask, VL); @@ -3555,10 +3574,9 @@ bool RISCVTargetLowering::isShuffleMaskLegal(ArrayRef M, EVT VT) const { MVT SVT = VT.getSimpleVT(); - bool SwapSources; - int LoSrc, HiSrc; - return (isElementRotate(LoSrc, HiSrc, M) > 0) || - isInterleaveShuffle(M, SVT, SwapSources, Subtarget); + int Dummy1, Dummy2; + return (isElementRotate(Dummy1, Dummy2, M) > 0) || + isInterleaveShuffle(M, SVT, Dummy1, Dummy2, Subtarget); } // Lower CTLZ_ZERO_UNDEF or CTTZ_ZERO_UNDEF by converting to FP and extracting diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll index 334a3d2..b9050ae 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll @@ -370,3 +370,152 @@ define <64 x float> @interleave_v32f32(<32 x float> %x, <32 x float> %y) { %a = shufflevector <32 x float> %x, <32 x float> %y, <64 x i32> ret <64 x float> %a } + +define <4 x half> @unary_interleave_v4f16(<4 x half> %x) { +; V128-LABEL: unary_interleave_v4f16: +; V128: # %bb.0: +; V128-NEXT: vsetivli zero, 2, e16, mf2, ta, ma +; V128-NEXT: vslidedown.vi v10, v8, 2 +; V128-NEXT: vsetivli zero, 4, e16, mf4, ta, ma +; V128-NEXT: vwaddu.vv v9, v8, v10 +; V128-NEXT: li a0, -1 +; V128-NEXT: vwmaccu.vx v9, a0, v10 +; V128-NEXT: vmv1r.v v8, v9 +; V128-NEXT: ret +; +; V512-LABEL: unary_interleave_v4f16: +; V512: # %bb.0: +; V512-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; V512-NEXT: vslidedown.vi v10, v8, 2 +; V512-NEXT: vsetivli zero, 4, e16, mf4, ta, ma +; V512-NEXT: vwaddu.vv v9, v8, v10 +; V512-NEXT: li a0, -1 +; V512-NEXT: vwmaccu.vx v9, a0, v10 +; V512-NEXT: vmv1r.v v8, v9 +; V512-NEXT: ret + %a = shufflevector <4 x half> %x, <4 x half> poison, <4 x i32> + ret <4 x half> %a +} + +define <4 x float> @unary_interleave_v4f32(<4 x float> %x) { +; V128-LABEL: unary_interleave_v4f32: +; V128: # %bb.0: +; V128-NEXT: vsetivli zero, 2, e32, m1, ta, ma +; V128-NEXT: vslidedown.vi v10, v8, 2 +; V128-NEXT: vsetivli zero, 4, e32, mf2, ta, ma +; V128-NEXT: vwaddu.vv v9, v8, v10 +; V128-NEXT: li a0, -1 +; V128-NEXT: vwmaccu.vx v9, a0, v10 +; V128-NEXT: vmv1r.v v8, v9 +; V128-NEXT: ret +; +; V512-LABEL: unary_interleave_v4f32: +; V512: # %bb.0: +; V512-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; V512-NEXT: vslidedown.vi v10, v8, 2 +; V512-NEXT: vsetivli zero, 4, e32, mf2, ta, ma +; V512-NEXT: vwaddu.vv v9, v8, v10 +; V512-NEXT: li a0, -1 +; V512-NEXT: vwmaccu.vx v9, a0, v10 +; V512-NEXT: vmv1r.v v8, v9 +; V512-NEXT: ret + %a = shufflevector <4 x float> %x, <4 x float> poison, <4 x i32> + ret <4 x float> %a +} + +; FIXME: Is there better codegen we can do here? 
+define <4 x double> @unary_interleave_v4f64(<4 x double> %x) { +; RV32-V128-LABEL: unary_interleave_v4f64: +; RV32-V128: # %bb.0: +; RV32-V128-NEXT: lui a0, %hi(.LCPI13_0) +; RV32-V128-NEXT: addi a0, a0, %lo(.LCPI13_0) +; RV32-V128-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-V128-NEXT: vle16.v v12, (a0) +; RV32-V128-NEXT: vrgatherei16.vv v10, v8, v12 +; RV32-V128-NEXT: vmv.v.v v8, v10 +; RV32-V128-NEXT: ret +; +; RV64-V128-LABEL: unary_interleave_v4f64: +; RV64-V128: # %bb.0: +; RV64-V128-NEXT: lui a0, %hi(.LCPI13_0) +; RV64-V128-NEXT: addi a0, a0, %lo(.LCPI13_0) +; RV64-V128-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV64-V128-NEXT: vle64.v v12, (a0) +; RV64-V128-NEXT: vrgather.vv v10, v8, v12 +; RV64-V128-NEXT: vmv.v.v v8, v10 +; RV64-V128-NEXT: ret +; +; RV32-V512-LABEL: unary_interleave_v4f64: +; RV32-V512: # %bb.0: +; RV32-V512-NEXT: lui a0, %hi(.LCPI13_0) +; RV32-V512-NEXT: addi a0, a0, %lo(.LCPI13_0) +; RV32-V512-NEXT: vsetivli zero, 4, e64, m1, ta, ma +; RV32-V512-NEXT: vle16.v v10, (a0) +; RV32-V512-NEXT: vrgatherei16.vv v9, v8, v10 +; RV32-V512-NEXT: vmv.v.v v8, v9 +; RV32-V512-NEXT: ret +; +; RV64-V512-LABEL: unary_interleave_v4f64: +; RV64-V512: # %bb.0: +; RV64-V512-NEXT: lui a0, %hi(.LCPI13_0) +; RV64-V512-NEXT: addi a0, a0, %lo(.LCPI13_0) +; RV64-V512-NEXT: vsetivli zero, 4, e64, m1, ta, ma +; RV64-V512-NEXT: vle64.v v10, (a0) +; RV64-V512-NEXT: vrgather.vv v9, v8, v10 +; RV64-V512-NEXT: vmv.v.v v8, v9 +; RV64-V512-NEXT: ret + %a = shufflevector <4 x double> %x, <4 x double> poison, <4 x i32> + ret <4 x double> %a +} + +define <8 x half> @unary_interleave_v8f16(<8 x half> %x) { +; V128-LABEL: unary_interleave_v8f16: +; V128: # %bb.0: +; V128-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; V128-NEXT: vslidedown.vi v10, v8, 4 +; V128-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; V128-NEXT: vwaddu.vv v9, v8, v10 +; V128-NEXT: li a0, -1 +; V128-NEXT: vwmaccu.vx v9, a0, v10 +; V128-NEXT: vmv1r.v v8, v9 +; V128-NEXT: ret +; +; V512-LABEL: unary_interleave_v8f16: +; V512: # %bb.0: +; V512-NEXT: vsetivli zero, 4, e16, mf4, ta, ma +; V512-NEXT: vslidedown.vi v10, v8, 4 +; V512-NEXT: vsetivli zero, 8, e16, mf4, ta, ma +; V512-NEXT: vwaddu.vv v9, v8, v10 +; V512-NEXT: li a0, -1 +; V512-NEXT: vwmaccu.vx v9, a0, v10 +; V512-NEXT: vmv1r.v v8, v9 +; V512-NEXT: ret + %a = shufflevector <8 x half> %x, <8 x half> poison, <8 x i32> + ret <8 x half> %a +} + +define <8 x float> @unary_interleave_v8f32(<8 x float> %x) { +; V128-LABEL: unary_interleave_v8f32: +; V128: # %bb.0: +; V128-NEXT: vsetivli zero, 4, e32, m2, ta, ma +; V128-NEXT: vslidedown.vi v12, v8, 4 +; V128-NEXT: vsetivli zero, 8, e32, m1, ta, ma +; V128-NEXT: vwaddu.vv v10, v12, v8 +; V128-NEXT: li a0, -1 +; V128-NEXT: vwmaccu.vx v10, a0, v8 +; V128-NEXT: vmv2r.v v8, v10 +; V128-NEXT: ret +; +; V512-LABEL: unary_interleave_v8f32: +; V512: # %bb.0: +; V512-NEXT: vsetivli zero, 4, e32, mf2, ta, ma +; V512-NEXT: vslidedown.vi v10, v8, 4 +; V512-NEXT: vsetivli zero, 8, e32, mf2, ta, ma +; V512-NEXT: vwaddu.vv v9, v10, v8 +; V512-NEXT: li a0, -1 +; V512-NEXT: vwmaccu.vx v9, a0, v8 +; V512-NEXT: vmv1r.v v8, v9 +; V512-NEXT: ret + %a = shufflevector <8 x float> %x, <8 x float> poison, <8 x i32> + ret <8 x float> %a +} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll index 24e7c15..c148faf 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll @@ -476,3 +476,204 @@ define <64 
x i32> @interleave_v32i32(<32 x i32> %x, <32 x i32> %y) { %a = shufflevector <32 x i32> %x, <32 x i32> %y, <64 x i32> ret <64 x i32> %a } + +define <4 x i8> @unary_interleave_v4i8(<4 x i8> %x) { +; V128-LABEL: unary_interleave_v4i8: +; V128: # %bb.0: +; V128-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; V128-NEXT: vslidedown.vi v10, v8, 2 +; V128-NEXT: vsetivli zero, 4, e8, mf8, ta, ma +; V128-NEXT: vwaddu.vv v9, v8, v10 +; V128-NEXT: li a0, -1 +; V128-NEXT: vwmaccu.vx v9, a0, v10 +; V128-NEXT: vmv1r.v v8, v9 +; V128-NEXT: ret +; +; V512-LABEL: unary_interleave_v4i8: +; V512: # %bb.0: +; V512-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; V512-NEXT: vslidedown.vi v10, v8, 2 +; V512-NEXT: vsetivli zero, 4, e8, mf8, ta, ma +; V512-NEXT: vwaddu.vv v9, v8, v10 +; V512-NEXT: li a0, -1 +; V512-NEXT: vwmaccu.vx v9, a0, v10 +; V512-NEXT: vmv1r.v v8, v9 +; V512-NEXT: ret + %a = shufflevector <4 x i8> %x, <4 x i8> poison, <4 x i32> + ret <4 x i8> %a +} + +define <4 x i16> @unary_interleave_v4i16(<4 x i16> %x) { +; V128-LABEL: unary_interleave_v4i16: +; V128: # %bb.0: +; V128-NEXT: vsetivli zero, 2, e16, mf2, ta, ma +; V128-NEXT: vslidedown.vi v10, v8, 2 +; V128-NEXT: vsetivli zero, 4, e16, mf4, ta, ma +; V128-NEXT: vwaddu.vv v9, v8, v10 +; V128-NEXT: li a0, -1 +; V128-NEXT: vwmaccu.vx v9, a0, v10 +; V128-NEXT: vmv1r.v v8, v9 +; V128-NEXT: ret +; +; V512-LABEL: unary_interleave_v4i16: +; V512: # %bb.0: +; V512-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; V512-NEXT: vslidedown.vi v10, v8, 2 +; V512-NEXT: vsetivli zero, 4, e16, mf4, ta, ma +; V512-NEXT: vwaddu.vv v9, v8, v10 +; V512-NEXT: li a0, -1 +; V512-NEXT: vwmaccu.vx v9, a0, v10 +; V512-NEXT: vmv1r.v v8, v9 +; V512-NEXT: ret + %a = shufflevector <4 x i16> %x, <4 x i16> poison, <4 x i32> + ret <4 x i16> %a +} + +define <4 x i32> @unary_interleave_v4i32(<4 x i32> %x) { +; V128-LABEL: unary_interleave_v4i32: +; V128: # %bb.0: +; V128-NEXT: vsetivli zero, 2, e32, m1, ta, ma +; V128-NEXT: vslidedown.vi v10, v8, 2 +; V128-NEXT: vsetivli zero, 4, e32, mf2, ta, ma +; V128-NEXT: vwaddu.vv v9, v8, v10 +; V128-NEXT: li a0, -1 +; V128-NEXT: vwmaccu.vx v9, a0, v10 +; V128-NEXT: vmv1r.v v8, v9 +; V128-NEXT: ret +; +; V512-LABEL: unary_interleave_v4i32: +; V512: # %bb.0: +; V512-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; V512-NEXT: vslidedown.vi v10, v8, 2 +; V512-NEXT: vsetivli zero, 4, e32, mf2, ta, ma +; V512-NEXT: vwaddu.vv v9, v8, v10 +; V512-NEXT: li a0, -1 +; V512-NEXT: vwmaccu.vx v9, a0, v10 +; V512-NEXT: vmv1r.v v8, v9 +; V512-NEXT: ret + %a = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> + ret <4 x i32> %a +} + +; FIXME: Is there better codegen we can do here? 
+define <4 x i64> @unary_interleave_v4i64(<4 x i64> %x) { +; RV32-V128-LABEL: unary_interleave_v4i64: +; RV32-V128: # %bb.0: +; RV32-V128-NEXT: lui a0, %hi(.LCPI19_0) +; RV32-V128-NEXT: addi a0, a0, %lo(.LCPI19_0) +; RV32-V128-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-V128-NEXT: vle16.v v12, (a0) +; RV32-V128-NEXT: vrgatherei16.vv v10, v8, v12 +; RV32-V128-NEXT: vmv.v.v v8, v10 +; RV32-V128-NEXT: ret +; +; RV64-V128-LABEL: unary_interleave_v4i64: +; RV64-V128: # %bb.0: +; RV64-V128-NEXT: lui a0, %hi(.LCPI19_0) +; RV64-V128-NEXT: addi a0, a0, %lo(.LCPI19_0) +; RV64-V128-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV64-V128-NEXT: vle64.v v12, (a0) +; RV64-V128-NEXT: vrgather.vv v10, v8, v12 +; RV64-V128-NEXT: vmv.v.v v8, v10 +; RV64-V128-NEXT: ret +; +; RV32-V512-LABEL: unary_interleave_v4i64: +; RV32-V512: # %bb.0: +; RV32-V512-NEXT: lui a0, %hi(.LCPI19_0) +; RV32-V512-NEXT: addi a0, a0, %lo(.LCPI19_0) +; RV32-V512-NEXT: vsetivli zero, 4, e64, m1, ta, ma +; RV32-V512-NEXT: vle16.v v10, (a0) +; RV32-V512-NEXT: vrgatherei16.vv v9, v8, v10 +; RV32-V512-NEXT: vmv.v.v v8, v9 +; RV32-V512-NEXT: ret +; +; RV64-V512-LABEL: unary_interleave_v4i64: +; RV64-V512: # %bb.0: +; RV64-V512-NEXT: lui a0, %hi(.LCPI19_0) +; RV64-V512-NEXT: addi a0, a0, %lo(.LCPI19_0) +; RV64-V512-NEXT: vsetivli zero, 4, e64, m1, ta, ma +; RV64-V512-NEXT: vle64.v v10, (a0) +; RV64-V512-NEXT: vrgather.vv v9, v8, v10 +; RV64-V512-NEXT: vmv.v.v v8, v9 +; RV64-V512-NEXT: ret + %a = shufflevector <4 x i64> %x, <4 x i64> poison, <4 x i32> + ret <4 x i64> %a +} + +define <8 x i8> @unary_interleave_v8i8(<8 x i8> %x) { +; V128-LABEL: unary_interleave_v8i8: +; V128: # %bb.0: +; V128-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; V128-NEXT: vslidedown.vi v10, v8, 4 +; V128-NEXT: vsetivli zero, 8, e8, mf4, ta, ma +; V128-NEXT: vwaddu.vv v9, v8, v10 +; V128-NEXT: li a0, -1 +; V128-NEXT: vwmaccu.vx v9, a0, v10 +; V128-NEXT: vmv1r.v v8, v9 +; V128-NEXT: ret +; +; V512-LABEL: unary_interleave_v8i8: +; V512: # %bb.0: +; V512-NEXT: vsetivli zero, 4, e8, mf8, ta, ma +; V512-NEXT: vslidedown.vi v10, v8, 4 +; V512-NEXT: vsetivli zero, 8, e8, mf8, ta, ma +; V512-NEXT: vwaddu.vv v9, v8, v10 +; V512-NEXT: li a0, -1 +; V512-NEXT: vwmaccu.vx v9, a0, v10 +; V512-NEXT: vmv1r.v v8, v9 +; V512-NEXT: ret + %a = shufflevector <8 x i8> %x, <8 x i8> poison, <8 x i32> + ret <8 x i8> %a +} + +define <8 x i16> @unary_interleave_v8i16(<8 x i16> %x) { +; V128-LABEL: unary_interleave_v8i16: +; V128: # %bb.0: +; V128-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; V128-NEXT: vslidedown.vi v10, v8, 4 +; V128-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; V128-NEXT: vwaddu.vv v9, v10, v8 +; V128-NEXT: li a0, -1 +; V128-NEXT: vwmaccu.vx v9, a0, v8 +; V128-NEXT: vmv1r.v v8, v9 +; V128-NEXT: ret +; +; V512-LABEL: unary_interleave_v8i16: +; V512: # %bb.0: +; V512-NEXT: vsetivli zero, 4, e16, mf4, ta, ma +; V512-NEXT: vslidedown.vi v10, v8, 4 +; V512-NEXT: vsetivli zero, 8, e16, mf4, ta, ma +; V512-NEXT: vwaddu.vv v9, v10, v8 +; V512-NEXT: li a0, -1 +; V512-NEXT: vwmaccu.vx v9, a0, v8 +; V512-NEXT: vmv1r.v v8, v9 +; V512-NEXT: ret + %a = shufflevector <8 x i16> %x, <8 x i16> poison, <8 x i32> + ret <8 x i16> %a +} + +define <8 x i32> @unary_interleave_v8i32(<8 x i32> %x) { +; V128-LABEL: unary_interleave_v8i32: +; V128: # %bb.0: +; V128-NEXT: vsetivli zero, 4, e32, m2, ta, ma +; V128-NEXT: vslidedown.vi v12, v8, 4 +; V128-NEXT: vsetivli zero, 8, e32, m1, ta, ma +; V128-NEXT: vwaddu.vv v10, v8, v12 +; V128-NEXT: li a0, -1 +; V128-NEXT: vwmaccu.vx v10, a0, v12 +; V128-NEXT: vmv2r.v 
v8, v10 +; V128-NEXT: ret +; +; V512-LABEL: unary_interleave_v8i32: +; V512: # %bb.0: +; V512-NEXT: vsetivli zero, 4, e32, mf2, ta, ma +; V512-NEXT: vslidedown.vi v10, v8, 4 +; V512-NEXT: vsetivli zero, 8, e32, mf2, ta, ma +; V512-NEXT: vwaddu.vv v9, v8, v10 +; V512-NEXT: li a0, -1 +; V512-NEXT: vwmaccu.vx v9, a0, v10 +; V512-NEXT: vmv1r.v v8, v9 +; V512-NEXT: ret + %a = shufflevector <8 x i32> %x, <8 x i32> poison, <8 x i32> + ret <8 x i32> %a +} -- 2.7.4
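
The lowering in this patch builds the interleave arithmetically: it widen-adds the even-element half to the odd-element half (vwaddu.vv) and then accumulates (2^eltbits - 1) more copies of the odd half (vwmaccu.vx with an all-ones scalar), so each widened lane ends up holding EvenV[i] + 2^eltbits * OddV[i], i.e. the even element in its low half and the odd element in its high half. Below is a minimal standalone C++ sketch of that identity only; it is not code from the patch, and the 8-bit element width and sample values are arbitrary choices for illustration.

#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  // Stand-ins for the two extracted half-vectors (values are made up).
  const std::vector<uint8_t> EvenV = {1, 2, 3, 4};
  const std::vector<uint8_t> OddV = {10, 20, 30, 40};

  for (size_t i = 0; i < EvenV.size(); ++i) {
    // vwaddu.vv: widening unsigned add of the 8-bit elements into 16 bits.
    uint16_t Wide = uint16_t(EvenV[i]) + uint16_t(OddV[i]);
    // vwmaccu.vx with the all-ones scalar: add (2^8 - 1) * OddV[i], leaving
    // Wide == EvenV[i] + 256 * OddV[i].
    Wide = uint16_t(Wide + 255u * OddV[i]);
    // Viewed as two 8-bit lanes, the wide lane is the interleaved pair: the
    // even element sits in the low half, the odd element in the high half.
    assert((Wide & 0xFF) == EvenV[i]);
    assert((Wide >> 8) == OddV[i]);
  }
  return 0;
}

This is why the lowering only has to extract the two half-vectors, freeze OddV (it feeds both the add and the multiply), and rely on the vwaddu.vv/vwmulu.vv pair combining into vwmaccu.vv, as the in-code comments describe.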