From 60d6be5dd3f411cfe1b5392cbbd6131d0ade2faa Mon Sep 17 00:00:00 2001
From: Bradley Smith <bradley.smith@arm.com>
Date: Wed, 29 Jun 2022 10:26:17 +0000
Subject: [PATCH] [LegalizeTypes] Replace vecreduce_xor/or/and with
 vecreduce_add/umax/umin if not legal

This is done during type legalization since the target representation of
these nodes may not be valid until after type legalization, and after
type legalization the fact that these are dealing with i1 types may be
lost.

Differential Revision: https://reviews.llvm.org/D128996
---
 .../SelectionDAG/LegalizeIntegerTypes.cpp     |  36 +++++-
 .../Target/AArch64/AArch64ISelLowering.cpp    |  10 --
 llvm/test/CodeGen/AArch64/reduce-and.ll       | 114 +++++++++++-------
 llvm/test/CodeGen/AArch64/reduce-or.ll        | 114 +++++++++++-------
 llvm/test/CodeGen/AArch64/reduce-xor.ll       |  61 ++++++++++
 .../AArch64/vecreduce-and-legalization.ll     |  10 +-
 .../AArch64/vecreduce-umax-legalization.ll    |  10 +-
 7 files changed, 233 insertions(+), 122 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index a2f559934bd3..343722a97c3c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -2259,16 +2259,40 @@ SDValue DAGTypeLegalizer::PromoteIntOp_VECREDUCE(SDNode *N) {
   SDLoc dl(N);
   SDValue Op = PromoteIntOpVectorReduction(N, N->getOperand(0));
 
-  EVT EltVT = Op.getValueType().getVectorElementType();
-  EVT VT = N->getValueType(0);
+  EVT OrigEltVT = N->getOperand(0).getValueType().getVectorElementType();
+  EVT InVT = Op.getValueType();
+  EVT EltVT = InVT.getVectorElementType();
+  EVT ResVT = N->getValueType(0);
+  unsigned Opcode = N->getOpcode();
 
-  if (VT.bitsGE(EltVT))
-    return DAG.getNode(N->getOpcode(), SDLoc(N), VT, Op);
+  // An i1 vecreduce_xor is equivalent to vecreduce_add, use that instead if
+  // vecreduce_xor is not legal
+  if (Opcode == ISD::VECREDUCE_XOR && OrigEltVT == MVT::i1 &&
+      !TLI.isOperationLegalOrCustom(ISD::VECREDUCE_XOR, InVT) &&
+      TLI.isOperationLegalOrCustom(ISD::VECREDUCE_ADD, InVT))
+    Opcode = ISD::VECREDUCE_ADD;
+
+  // An i1 vecreduce_or is equivalent to vecreduce_umax, use that instead if
+  // vecreduce_or is not legal
+  else if (Opcode == ISD::VECREDUCE_OR && OrigEltVT == MVT::i1 &&
+      !TLI.isOperationLegalOrCustom(ISD::VECREDUCE_OR, InVT) &&
+      TLI.isOperationLegalOrCustom(ISD::VECREDUCE_UMAX, InVT))
+    Opcode = ISD::VECREDUCE_UMAX;
+
+  // An i1 vecreduce_and is equivalent to vecreduce_umin, use that instead if
+  // vecreduce_and is not legal
+  else if (Opcode == ISD::VECREDUCE_AND && OrigEltVT == MVT::i1 &&
+      !TLI.isOperationLegalOrCustom(ISD::VECREDUCE_AND, InVT) &&
+      TLI.isOperationLegalOrCustom(ISD::VECREDUCE_UMIN, InVT))
+    Opcode = ISD::VECREDUCE_UMIN;
+
+  if (ResVT.bitsGE(EltVT))
+    return DAG.getNode(Opcode, SDLoc(N), ResVT, Op);
 
   // Result size must be >= element size. If this is not the case after
   // promotion, also promote the result type and then truncate.
-  SDValue Reduce = DAG.getNode(N->getOpcode(), dl, EltVT, Op);
-  return DAG.getNode(ISD::TRUNCATE, dl, VT, Reduce);
+  SDValue Reduce = DAG.getNode(Opcode, dl, EltVT, Op);
+  return DAG.getNode(ISD::TRUNCATE, dl, ResVT, Reduce);
 }
 
 SDValue DAGTypeLegalizer::PromoteIntOp_VP_REDUCE(SDNode *N, unsigned OpNo) {
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 105a7c868738..8ac99cfc4295 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -14863,16 +14863,6 @@ static SDValue performANDCombine(SDNode *N,
   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
     return SDValue();
 
-  // Although NEON has no EORV instruction, when only the least significant bit
-  // is required the operation is synonymous with ADDV.
-  if (LHS.getOpcode() == ISD::VECREDUCE_XOR && isOneConstant(RHS) &&
-      LHS.getOperand(0).getValueType().isFixedLengthVector() &&
-      LHS.hasOneUse()) {
-    SDLoc DL(N);
-    SDValue ADDV = DAG.getNode(ISD::VECREDUCE_ADD, DL, VT, LHS.getOperand(0));
-    return DAG.getNode(ISD::AND, DL, VT, ADDV, RHS);
-  }
-
   if (VT.isScalableVector())
     return performSVEAndCombine(N, DCI);
 
diff --git a/llvm/test/CodeGen/AArch64/reduce-and.ll b/llvm/test/CodeGen/AArch64/reduce-and.ll
index 60546cf128af..05b1d2946685 100644
--- a/llvm/test/CodeGen/AArch64/reduce-and.ll
+++ b/llvm/test/CodeGen/AArch64/reduce-and.ll
@@ -20,10 +20,8 @@ define i1 @test_redand_v1i1(<1 x i1> %a) {
 define i1 @test_redand_v2i1(<2 x i1> %a) {
 ; CHECK-LABEL: test_redand_v2i1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    mov w8, v0.s[1]
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    and w8, w9, w8
+; CHECK-NEXT:    uminp v0.2s, v0.2s, v0.2s
+; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    and w0, w8, #0x1
 ; CHECK-NEXT:    ret
 ;
@@ -43,14 +41,8 @@ define i1 @test_redand_v2i1(<2 x i1> %a) {
 define i1 @test_redand_v4i1(<4 x i1> %a) {
 ; CHECK-LABEL: test_redand_v4i1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    umov w8, v0.h[1]
-; CHECK-NEXT:    umov w9, v0.h[0]
-; CHECK-NEXT:    umov w10, v0.h[2]
-; CHECK-NEXT:    umov w11, v0.h[3]
-; CHECK-NEXT:    and w8, w9, w8
-; CHECK-NEXT:    and w8, w8, w10
-; CHECK-NEXT:    and w8, w8, w11
+; CHECK-NEXT:    uminv h0, v0.4h
+; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    and w0, w8, #0x1
 ; CHECK-NEXT:    ret
 ;
@@ -76,22 +68,8 @@ define i1 @test_redand_v4i1(<4 x i1> %a) {
 define i1 @test_redand_v8i1(<8 x i1> %a) {
 ; CHECK-LABEL: test_redand_v8i1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    umov w8, v0.b[1]
-; CHECK-NEXT:    umov w9, v0.b[0]
-; CHECK-NEXT:    umov w10, v0.b[2]
-; CHECK-NEXT:    umov w11, v0.b[3]
-; CHECK-NEXT:    umov w12, v0.b[4]
-; CHECK-NEXT:    umov w13, v0.b[5]
-; CHECK-NEXT:    and w8, w9, w8
-; CHECK-NEXT:    umov w9, v0.b[6]
-; CHECK-NEXT:    and w8, w8, w10
-; CHECK-NEXT:    umov w10, v0.b[7]
-; CHECK-NEXT:    and w8, w8, w11
-; CHECK-NEXT:    and w8, w8, w12
-; CHECK-NEXT:    and w8, w8, w13
-; CHECK-NEXT:    and w8, w8, w9
-; CHECK-NEXT:    and w8, w8, w10
+; CHECK-NEXT:    uminv b0, v0.8b
+; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    and w0, w8, #0x1
 ; CHECK-NEXT:    ret
 ;
@@ -129,23 +107,8 @@ define i1 @test_redand_v8i1(<8 x i1> %a) {
 define i1 @test_redand_v16i1(<16 x i1> %a) {
 ; CHECK-LABEL: test_redand_v16i1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT:    and v0.8b, v0.8b, v1.8b
-; CHECK-NEXT:    umov w8, v0.b[1]
-; CHECK-NEXT:    umov w9, v0.b[0]
-; CHECK-NEXT:    umov w10, v0.b[2]
-; CHECK-NEXT:    umov w11, v0.b[3]
-; CHECK-NEXT:    umov w12, v0.b[4]
-; CHECK-NEXT:    and w8, w9, w8
-; CHECK-NEXT:    umov w9, v0.b[5]
-; CHECK-NEXT:    and w8, w8, w10
-; CHECK-NEXT:    umov w10, v0.b[6]
-; CHECK-NEXT:    and w8, w8, w11
-; CHECK-NEXT:    umov w11, v0.b[7]
-; CHECK-NEXT:    and w8, w8, w12
-; CHECK-NEXT:    and w8, w8, w9
-; CHECK-NEXT:    and w8, w8, w10
-; CHECK-NEXT:    and w8, w8, w11
+; CHECK-NEXT:    uminv b0, v0.16b
+; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    and w0, w8, #0x1
 ; CHECK-NEXT:    ret
 ;
@@ -203,6 +166,67 @@ define i1 @test_redand_v16i1(<16 x i1> %a) {
   ret i1 %or_result
 }
 
+define <16 x i1> @test_redand_ins_v16i1(<16 x i1> %a) {
+; CHECK-LABEL: test_redand_ins_v16i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uminv b0, v0.16b
+; CHECK-NEXT:    ret
+;
+; GISEL-LABEL: test_redand_ins_v16i1:
+; GISEL:       // %bb.0:
+; GISEL-NEXT:    mov b1, v0.b[1]
+; GISEL-NEXT:    mov b2, v0.b[2]
+; GISEL-NEXT:    mov b3, v0.b[3]
+; GISEL-NEXT:    mov b4, v0.b[4]
+; GISEL-NEXT:    mov b5, v0.b[5]
+; GISEL-NEXT:    mov b6, v0.b[6]
+; GISEL-NEXT:    mov b7, v0.b[7]
+; GISEL-NEXT:    fmov w8, s0
+; GISEL-NEXT:    fmov w9, s1
+; GISEL-NEXT:    fmov w10, s2
+; GISEL-NEXT:    fmov w11, s3
+; GISEL-NEXT:    mov b16, v0.b[8]
+; GISEL-NEXT:    mov b17, v0.b[9]
+; GISEL-NEXT:    mov b18, v0.b[10]
+; GISEL-NEXT:    mov b19, v0.b[11]
+; GISEL-NEXT:    and w8, w8, w9
+; GISEL-NEXT:    and w9, w10, w11
+; GISEL-NEXT:    fmov w10, s4
+; GISEL-NEXT:    fmov w11, s5
+; GISEL-NEXT:    fmov w12, s6
+; GISEL-NEXT:    fmov w13, s7
+; GISEL-NEXT:    mov b20, v0.b[12]
+; GISEL-NEXT:    mov b21, v0.b[13]
+; GISEL-NEXT:    mov b22, v0.b[14]
+; GISEL-NEXT:    mov b23, v0.b[15]
+; GISEL-NEXT:    and w10, w10, w11
+; GISEL-NEXT:    and w11, w12, w13
+; GISEL-NEXT:    fmov w12, s16
+; GISEL-NEXT:    fmov w13, s17
+; GISEL-NEXT:    fmov w14, s18
+; GISEL-NEXT:    fmov w15, s19
+; GISEL-NEXT:    fmov w16, s22
+; GISEL-NEXT:    fmov w17, s23
+; GISEL-NEXT:    and w8, w8, w9
+; GISEL-NEXT:    and w12, w12, w13
+; GISEL-NEXT:    and w9, w10, w11
+; GISEL-NEXT:    and w13, w14, w15
+; GISEL-NEXT:    fmov w14, s20
+; GISEL-NEXT:    fmov w15, s21
+; GISEL-NEXT:    and w10, w12, w13
+; GISEL-NEXT:    and w8, w8, w9
+; GISEL-NEXT:    and w14, w14, w15
+; GISEL-NEXT:    and w15, w16, w17
+; GISEL-NEXT:    and w11, w14, w15
+; GISEL-NEXT:    and w9, w10, w11
+; GISEL-NEXT:    and w8, w8, w9
+; GISEL-NEXT:    fmov s0, w8
+; GISEL-NEXT:    ret
+  %and_result = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> %a)
+  %ins = insertelement <16 x i1> poison, i1 %and_result, i64 0
+  ret <16 x i1> %ins
+}
+
 define i8 @test_redand_v1i8(<1 x i8> %a) {
 ; CHECK-LABEL: test_redand_v1i8:
 ; CHECK:       // %bb.0:
diff --git a/llvm/test/CodeGen/AArch64/reduce-or.ll b/llvm/test/CodeGen/AArch64/reduce-or.ll
index 73f5b044186a..c5419a6630b5 100644
--- a/llvm/test/CodeGen/AArch64/reduce-or.ll
+++ b/llvm/test/CodeGen/AArch64/reduce-or.ll
@@ -20,10 +20,8 @@ define i1 @test_redor_v1i1(<1 x i1> %a) {
 define i1 @test_redor_v2i1(<2 x i1> %a) {
 ; CHECK-LABEL: test_redor_v2i1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    mov w8, v0.s[1]
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    orr w8, w9, w8
+; CHECK-NEXT:    umaxp v0.2s, v0.2s, v0.2s
+; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    and w0, w8, #0x1
 ; CHECK-NEXT:    ret
 ;
@@ -43,14 +41,8 @@ define i1 @test_redor_v2i1(<2 x i1> %a) {
 define i1 @test_redor_v4i1(<4 x i1> %a) {
 ; CHECK-LABEL: test_redor_v4i1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    umov w8, v0.h[1]
-; CHECK-NEXT:    umov w9, v0.h[0]
-; CHECK-NEXT:    umov w10, v0.h[2]
-; CHECK-NEXT:    umov w11, v0.h[3]
-; CHECK-NEXT:    orr w8, w9, w8
-; CHECK-NEXT:    orr w8, w8, w10
-; CHECK-NEXT:    orr w8, w8, w11
+; CHECK-NEXT:    umaxv h0, v0.4h
+; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    and w0, w8, #0x1
 ; CHECK-NEXT:    ret
 ;
@@ -76,22 +68,8 @@ define i1 @test_redor_v4i1(<4 x i1> %a) {
 define i1 @test_redor_v8i1(<8 x i1> %a) {
 ; CHECK-LABEL: test_redor_v8i1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    umov w8, v0.b[1]
-; CHECK-NEXT:    umov w9, v0.b[0]
-; CHECK-NEXT:    umov w10, v0.b[2]
-; CHECK-NEXT:    umov w11, v0.b[3]
-; CHECK-NEXT:    umov w12, v0.b[4]
-; CHECK-NEXT:    umov w13, v0.b[5]
-; CHECK-NEXT:    orr w8, w9, w8
-; CHECK-NEXT:    umov w9, v0.b[6]
-; CHECK-NEXT:    orr w8, w8, w10
-; CHECK-NEXT:    umov w10, v0.b[7]
-; CHECK-NEXT:    orr w8, w8, w11
-; CHECK-NEXT:    orr w8, w8, w12
-; CHECK-NEXT:    orr w8, w8, w13
-; CHECK-NEXT:    orr w8, w8, w9
-; CHECK-NEXT:    orr w8, w8, w10
+; CHECK-NEXT:    umaxv b0, v0.8b
+; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    and w0, w8, #0x1
 ; CHECK-NEXT:    ret
 ;
@@ -129,23 +107,8 @@ define i1 @test_redor_v8i1(<8 x i1> %a) {
 define i1 @test_redor_v16i1(<16 x i1> %a) {
 ; CHECK-LABEL: test_redor_v16i1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT:    orr v0.8b, v0.8b, v1.8b
-; CHECK-NEXT:    umov w8, v0.b[1]
-; CHECK-NEXT:    umov w9, v0.b[0]
-; CHECK-NEXT:    umov w10, v0.b[2]
-; CHECK-NEXT:    umov w11, v0.b[3]
-; CHECK-NEXT:    umov w12, v0.b[4]
-; CHECK-NEXT:    orr w8, w9, w8
-; CHECK-NEXT:    umov w9, v0.b[5]
-; CHECK-NEXT:    orr w8, w8, w10
-; CHECK-NEXT:    umov w10, v0.b[6]
-; CHECK-NEXT:    orr w8, w8, w11
-; CHECK-NEXT:    umov w11, v0.b[7]
-; CHECK-NEXT:    orr w8, w8, w12
-; CHECK-NEXT:    orr w8, w8, w9
-; CHECK-NEXT:    orr w8, w8, w10
-; CHECK-NEXT:    orr w8, w8, w11
+; CHECK-NEXT:    umaxv b0, v0.16b
+; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    and w0, w8, #0x1
 ; CHECK-NEXT:    ret
 ;
@@ -203,6 +166,67 @@ define i1 @test_redor_v16i1(<16 x i1> %a) {
   ret i1 %or_result
 }
 
+define <16 x i1> @test_redor_ins_v16i1(<16 x i1> %a) {
+; CHECK-LABEL: test_redor_ins_v16i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    umaxv b0, v0.16b
+; CHECK-NEXT:    ret
+;
+; GISEL-LABEL: test_redor_ins_v16i1:
+; GISEL:       // %bb.0:
+; GISEL-NEXT:    mov b1, v0.b[1]
+; GISEL-NEXT:    mov b2, v0.b[2]
+; GISEL-NEXT:    mov b3, v0.b[3]
+; GISEL-NEXT:    mov b4, v0.b[4]
+; GISEL-NEXT:    mov b5, v0.b[5]
+; GISEL-NEXT:    mov b6, v0.b[6]
+; GISEL-NEXT:    mov b7, v0.b[7]
+; GISEL-NEXT:    fmov w8, s0
+; GISEL-NEXT:    fmov w9, s1
+; GISEL-NEXT:    fmov w10, s2
+; GISEL-NEXT:    fmov w11, s3
+; GISEL-NEXT:    mov b16, v0.b[8]
+; GISEL-NEXT:    mov b17, v0.b[9]
+; GISEL-NEXT:    mov b18, v0.b[10]
+; GISEL-NEXT:    mov b19, v0.b[11]
+; GISEL-NEXT:    orr w8, w8, w9
+; GISEL-NEXT:    orr w9, w10, w11
+; GISEL-NEXT:    fmov w10, s4
+; GISEL-NEXT:    fmov w11, s5
+; GISEL-NEXT:    fmov w12, s6
+; GISEL-NEXT:    fmov w13, s7
+; GISEL-NEXT:    mov b20, v0.b[12]
+; GISEL-NEXT:    mov b21, v0.b[13]
+; GISEL-NEXT:    mov b22, v0.b[14]
+; GISEL-NEXT:    mov b23, v0.b[15]
+; GISEL-NEXT:    orr w10, w10, w11
+; GISEL-NEXT:    orr w11, w12, w13
+; GISEL-NEXT:    fmov w12, s16
+; GISEL-NEXT:    fmov w13, s17
+; GISEL-NEXT:    fmov w14, s18
+; GISEL-NEXT:    fmov w15, s19
+; GISEL-NEXT:    fmov w16, s22
+; GISEL-NEXT:    fmov w17, s23
+; GISEL-NEXT:    orr w8, w8, w9
+; GISEL-NEXT:    orr w12, w12, w13
+; GISEL-NEXT:    orr w9, w10, w11
+; GISEL-NEXT:    orr w13, w14, w15
+; GISEL-NEXT:    fmov w14, s20
+; GISEL-NEXT:    fmov w15, s21
+; GISEL-NEXT:    orr w10, w12, w13
+; GISEL-NEXT:    orr w8, w8, w9
+; GISEL-NEXT:    orr w14, w14, w15
+; GISEL-NEXT:    orr w15, w16, w17
+; GISEL-NEXT:    orr w11, w14, w15
+; GISEL-NEXT:    orr w9, w10, w11
+; GISEL-NEXT:    orr w8, w8, w9
+; GISEL-NEXT:    fmov s0, w8
+; GISEL-NEXT:    ret
+  %or_result = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> %a)
+  %ins = insertelement <16 x i1> poison, i1 %or_result, i64 0
+  ret <16 x i1> %ins
+}
+
 define i8 @test_redor_v1i8(<1 x i8> %a) {
 ; CHECK-LABEL: test_redor_v1i8:
 ; CHECK:       // %bb.0:
diff --git a/llvm/test/CodeGen/AArch64/reduce-xor.ll b/llvm/test/CodeGen/AArch64/reduce-xor.ll
index 8c3db972734d..4372dee4ab2f 100644
--- a/llvm/test/CodeGen/AArch64/reduce-xor.ll
+++ b/llvm/test/CodeGen/AArch64/reduce-xor.ll
@@ -165,6 +165,67 @@ define i1 @test_redxor_v16i1(<16 x i1> %a) {
   ret i1 %or_result
 }
 
+define <16 x i1> @test_redxor_ins_v16i1(<16 x i1> %a) {
+; CHECK-LABEL: test_redxor_ins_v16i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    addv b0, v0.16b
+; CHECK-NEXT:    ret
+;
+; GISEL-LABEL: test_redxor_ins_v16i1:
+; GISEL:       // %bb.0:
+; GISEL-NEXT:    mov b1, v0.b[1]
+; GISEL-NEXT:    mov b2, v0.b[2]
+; GISEL-NEXT:    mov b3, v0.b[3]
+; GISEL-NEXT:    mov b4, v0.b[4]
+; GISEL-NEXT:    mov b5, v0.b[5]
+; GISEL-NEXT:    mov b6, v0.b[6]
+; GISEL-NEXT:    mov b7, v0.b[7]
+; GISEL-NEXT:    fmov w8, s0
+; GISEL-NEXT:    fmov w9, s1
+; GISEL-NEXT:    fmov w10, s2
+; GISEL-NEXT:    fmov w11, s3
+; GISEL-NEXT:    mov b16, v0.b[8]
+; GISEL-NEXT:    mov b17, v0.b[9]
+; GISEL-NEXT:    mov b18, v0.b[10]
+; GISEL-NEXT:    mov b19, v0.b[11]
+; GISEL-NEXT:    eor w8, w8, w9
+; GISEL-NEXT:    eor w9, w10, w11
+; GISEL-NEXT:    fmov w10, s4
+; GISEL-NEXT:    fmov w11, s5
+; GISEL-NEXT:    fmov w12, s6
+; GISEL-NEXT:    fmov w13, s7
+; GISEL-NEXT:    mov b20, v0.b[12]
+; GISEL-NEXT:    mov b21, v0.b[13]
+; GISEL-NEXT:    mov b22, v0.b[14]
+; GISEL-NEXT:    mov b23, v0.b[15]
+; GISEL-NEXT:    eor w10, w10, w11
+; GISEL-NEXT:    eor w11, w12, w13
+; GISEL-NEXT:    fmov w12, s16
+; GISEL-NEXT:    fmov w13, s17
+; GISEL-NEXT:    fmov w14, s18
+; GISEL-NEXT:    fmov w15, s19
+; GISEL-NEXT:    fmov w16, s22
+; GISEL-NEXT:    fmov w17, s23
+; GISEL-NEXT:    eor w8, w8, w9
+; GISEL-NEXT:    eor w12, w12, w13
+; GISEL-NEXT:    eor w9, w10, w11
+; GISEL-NEXT:    eor w13, w14, w15
+; GISEL-NEXT:    fmov w14, s20
+; GISEL-NEXT:    fmov w15, s21
+; GISEL-NEXT:    eor w10, w12, w13
+; GISEL-NEXT:    eor w8, w8, w9
+; GISEL-NEXT:    eor w14, w14, w15
+; GISEL-NEXT:    eor w15, w16, w17
+; GISEL-NEXT:    eor w11, w14, w15
+; GISEL-NEXT:    eor w9, w10, w11
+; GISEL-NEXT:    eor w8, w8, w9
+; GISEL-NEXT:    fmov s0, w8
+; GISEL-NEXT:    ret
+  %xor_result = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> %a)
+  %ins = insertelement <16 x i1> poison, i1 %xor_result, i64 0
+  ret <16 x i1> %ins
+}
+
 define i8 @test_redxor_v1i8(<1 x i8> %a) {
 ; CHECK-LABEL: test_redxor_v1i8:
 ; CHECK:       // %bb.0:
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll
index 5d87506177e0..005b4e3afe80 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll
@@ -140,14 +140,8 @@ define i32 @test_v3i32(<3 x i32> %a) nounwind {
 define i1 @test_v4i1(<4 x i1> %a) nounwind {
 ; CHECK-LABEL: test_v4i1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    umov w8, v0.h[1]
-; CHECK-NEXT:    umov w9, v0.h[0]
-; CHECK-NEXT:    umov w10, v0.h[2]
-; CHECK-NEXT:    umov w11, v0.h[3]
-; CHECK-NEXT:    and w8, w9, w8
-; CHECK-NEXT:    and w8, w8, w10
-; CHECK-NEXT:    and w8, w8, w11
+; CHECK-NEXT:    uminv h0, v0.4h
+; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    and w0, w8, #0x1
 ; CHECK-NEXT:    ret
   %b = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> %a)
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll
index f4de4d98b087..4d9a1f84cedc 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll
@@ -142,14 +142,8 @@ define i32 @test_v3i32(<3 x i32> %a) nounwind {
 define i1 @test_v4i1(<4 x i1> %a) nounwind {
 ; CHECK-LABEL: test_v4i1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    umov w8, v0.h[1]
-; CHECK-NEXT:    umov w9, v0.h[0]
-; CHECK-NEXT:    umov w10, v0.h[2]
-; CHECK-NEXT:    umov w11, v0.h[3]
-; CHECK-NEXT:    orr w8, w9, w8
-; CHECK-NEXT:    orr w8, w8, w10
-; CHECK-NEXT:    orr w8, w8, w11
+; CHECK-NEXT:    umaxv h0, v0.4h
+; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    and w0, w8, #0x1
 ; CHECK-NEXT:    ret
   %b = call i1 @llvm.vector.reduce.umax.v4i1(<4 x i1> %a)
-- 
2.34.1