From 1206f72e31f6d67069f1c90c4871b229923008a4 Mon Sep 17 00:00:00 2001 From: David Green Date: Tue, 2 Aug 2022 13:01:59 +0100 Subject: [PATCH] [AArch64] Fold Mul(And(Srl(X, 15), 0x10001), 0xffff) to CMLTz This folds a v4i32 Mul(And(Srl(X, 15), 0x10001), 0xffff) into a v8i16 CMLTz instruction. The Srl and And extract the top bit of each i16 half (whether that half is negative when viewed as an i16), and the Mul then sets every bit of that half to 1 or 0 depending on whether its top bit was set. This is equivalent to a v8i16 CMLTz instruction. The same applies to other sizes with equivalent constants. Differential Revision: https://reviews.llvm.org/D130874 --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 37 +++++++ llvm/test/CodeGen/AArch64/insert-extend.ll | 66 +++++------- llvm/test/CodeGen/AArch64/mulcmle.ll | 54 ++-------- llvm/test/CodeGen/AArch64/reduce-shuffle.ll | 136 +++++++++--------------- 4 files changed, 126 insertions(+), 167 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index ef2edf2..fd845fa 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -14268,12 +14268,49 @@ static SDValue performMulVectorExtendCombine(SDNode *Mul, SelectionDAG &DAG) { Op1 ? Op1 : Mul->getOperand(1)); } +// Combine v4i32 Mul(And(Srl(X, 15), 0x10001), 0xffff) -> v8i16 CMLTz +// Same for other types with equivalent constants. 
+static SDValue performMulVectorCmpZeroCombine(SDNode *N, SelectionDAG &DAG) { /* Matches MUL(AND(SRL(X, V3), V2), V1), where V1, V2 and V3 are constant splats, and rewrites it as NVCAST(CMLTz(NVCAST(X))) computed on a vector with half-width elements and twice the element count. Returns an empty SDValue when the node does not match. */ + EVT VT = N->getValueType(0); + if (VT != MVT::v2i64 && VT != MVT::v1i64 && VT != MVT::v2i32 && + VT != MVT::v4i32 && VT != MVT::v4i16 && VT != MVT::v8i16) + return SDValue(); /* only these six vector result types are handled */ + if (N->getOperand(0).getOpcode() != ISD::AND || + N->getOperand(0).getOperand(0).getOpcode() != ISD::SRL) + return SDValue(); /* operand 0 must be an AND whose operand 0 is an SRL; the swapped operand order is not checked here */ + + SDValue And = N->getOperand(0); + SDValue Srl = And.getOperand(0); + + APInt V1, V2, V3; /* V1: MUL splat constant, V2: AND splat constant, V3: SRL splat shift amount */ + if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), V1) || + !ISD::isConstantSplatVector(And.getOperand(1).getNode(), V2) || + !ISD::isConstantSplatVector(Srl.getOperand(1).getNode(), V3)) + return SDValue(); + + unsigned HalfSize = VT.getScalarSizeInBits() / 2; + if (!V1.isMask(HalfSize) || V2 != (1ULL | 1ULL << HalfSize) || + V3 != (HalfSize - 1)) + return SDValue(); /* requires V1 to be a mask of the low HalfSize bits, V2 to have exactly bit 0 and bit HalfSize set, and V3 to equal HalfSize - 1 */ + + EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), + EVT::getIntegerVT(*DAG.getContext(), HalfSize), + VT.getVectorElementCount() * 2); /* integer elements of HalfSize bits, twice as many as in VT */ + + SDLoc DL(N); + SDValue In = DAG.getNode(AArch64ISD::NVCAST, DL, HalfVT, Srl.getOperand(0)); /* view the SRL input X as HalfVT */ + SDValue CM = DAG.getNode(AArch64ISD::CMLTz, DL, HalfVT, In); + return DAG.getNode(AArch64ISD::NVCAST, DL, VT, CM); /* cast the compare result back to the original type VT */ +} + static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget) { if (SDValue Ext = performMulVectorExtendCombine(N, DAG)) return Ext; + if (SDValue Ext = performMulVectorCmpZeroCombine(N, DAG)) + return Ext; if (DCI.isBeforeLegalizeOps()) return SDValue(); diff --git a/llvm/test/CodeGen/AArch64/insert-extend.ll b/llvm/test/CodeGen/AArch64/insert-extend.ll index 7cd4c65..c27ed21 100644 --- a/llvm/test/CodeGen/AArch64/insert-extend.ll +++ b/llvm/test/CodeGen/AArch64/insert-extend.ll @@ -118,58 +118,48 @@ define i32 @large(i8* nocapture noundef readonly %p1, i32 noundef %st1, i8* noca ; CHECK-NEXT: zip1 v16.4s, v6.4s, v6.4s ; CHECK-NEXT: sub v2.4s, v2.4s, v7.4s ; CHECK-NEXT: ext v17.16b, v1.16b, v3.16b, 
#8 -; CHECK-NEXT: ext v5.16b, v3.16b, v2.16b, #4 -; CHECK-NEXT: ext v7.16b, v6.16b, v4.16b, #4 +; CHECK-NEXT: ext v5.16b, v6.16b, v4.16b, #4 +; CHECK-NEXT: ext v7.16b, v3.16b, v2.16b, #4 ; CHECK-NEXT: ext v18.16b, v0.16b, v6.16b, #4 ; CHECK-NEXT: trn2 v0.4s, v16.4s, v0.4s ; CHECK-NEXT: ext v16.16b, v17.16b, v1.16b, #4 -; CHECK-NEXT: zip2 v5.4s, v5.4s, v3.4s -; CHECK-NEXT: zip2 v7.4s, v7.4s, v6.4s +; CHECK-NEXT: zip2 v7.4s, v7.4s, v3.4s +; CHECK-NEXT: zip2 v5.4s, v5.4s, v6.4s ; CHECK-NEXT: ext v18.16b, v18.16b, v18.16b, #4 ; CHECK-NEXT: mov v1.s[2], v3.s[1] -; CHECK-NEXT: ext v5.16b, v2.16b, v5.16b, #12 -; CHECK-NEXT: ext v7.16b, v4.16b, v7.16b, #12 +; CHECK-NEXT: uzp2 v16.4s, v17.4s, v16.4s +; CHECK-NEXT: ext v7.16b, v2.16b, v7.16b, #12 +; CHECK-NEXT: ext v5.16b, v4.16b, v5.16b, #12 ; CHECK-NEXT: mov v2.s[2], v3.s[3] ; CHECK-NEXT: mov v4.s[2], v6.s[3] -; CHECK-NEXT: uzp2 v16.4s, v17.4s, v16.4s -; CHECK-NEXT: sub v19.4s, v0.4s, v18.4s +; CHECK-NEXT: sub v17.4s, v0.4s, v18.4s ; CHECK-NEXT: mov v18.s[0], v6.s[1] -; CHECK-NEXT: sub v17.4s, v2.4s, v5.4s -; CHECK-NEXT: sub v20.4s, v4.4s, v7.4s -; CHECK-NEXT: sub v21.4s, v1.4s, v16.4s +; CHECK-NEXT: sub v19.4s, v1.4s, v16.4s +; CHECK-NEXT: sub v20.4s, v2.4s, v7.4s +; CHECK-NEXT: sub v21.4s, v4.4s, v5.4s +; CHECK-NEXT: mov v1.s[1], v3.s[0] ; CHECK-NEXT: mov v2.s[1], v3.s[2] ; CHECK-NEXT: mov v4.s[1], v6.s[2] -; CHECK-NEXT: mov v1.s[1], v3.s[0] ; CHECK-NEXT: add v0.4s, v0.4s, v18.4s -; CHECK-NEXT: add v2.4s, v2.4s, v5.4s -; CHECK-NEXT: add v3.4s, v4.4s, v7.4s ; CHECK-NEXT: add v1.4s, v1.4s, v16.4s -; CHECK-NEXT: mov v0.d[1], v19.d[1] -; CHECK-NEXT: mov v1.d[1], v21.d[1] -; CHECK-NEXT: mov v2.d[1], v17.d[1] -; CHECK-NEXT: mov v3.d[1], v20.d[1] -; CHECK-NEXT: movi v4.8h, #1 -; CHECK-NEXT: movi v17.2d, #0x00ffff0000ffff -; CHECK-NEXT: ushr v5.4s, v2.4s, #15 -; CHECK-NEXT: ushr v6.4s, v0.4s, #15 -; CHECK-NEXT: ushr v7.4s, v3.4s, #15 -; CHECK-NEXT: ushr v16.4s, v1.4s, #15 -; CHECK-NEXT: and v6.16b, v6.16b, v4.16b -; CHECK-NEXT: 
and v16.16b, v16.16b, v4.16b -; CHECK-NEXT: and v7.16b, v7.16b, v4.16b -; CHECK-NEXT: and v4.16b, v5.16b, v4.16b -; CHECK-NEXT: mul v5.4s, v6.4s, v17.4s -; CHECK-NEXT: mul v6.4s, v16.4s, v17.4s -; CHECK-NEXT: mul v4.4s, v4.4s, v17.4s -; CHECK-NEXT: mul v7.4s, v7.4s, v17.4s -; CHECK-NEXT: add v0.4s, v5.4s, v0.4s -; CHECK-NEXT: add v1.4s, v6.4s, v1.4s +; CHECK-NEXT: add v2.4s, v2.4s, v7.4s +; CHECK-NEXT: add v3.4s, v4.4s, v5.4s +; CHECK-NEXT: mov v2.d[1], v20.d[1] +; CHECK-NEXT: mov v3.d[1], v21.d[1] +; CHECK-NEXT: mov v0.d[1], v17.d[1] +; CHECK-NEXT: mov v1.d[1], v19.d[1] +; CHECK-NEXT: cmlt v4.8h, v2.8h, #0 +; CHECK-NEXT: cmlt v5.8h, v3.8h, #0 +; CHECK-NEXT: cmlt v6.8h, v0.8h, #0 +; CHECK-NEXT: cmlt v7.8h, v1.8h, #0 +; CHECK-NEXT: add v0.4s, v6.4s, v0.4s +; CHECK-NEXT: add v1.4s, v7.4s, v1.4s ; CHECK-NEXT: add v2.4s, v4.4s, v2.4s -; CHECK-NEXT: add v3.4s, v7.4s, v3.4s +; CHECK-NEXT: add v3.4s, v5.4s, v3.4s ; CHECK-NEXT: eor v2.16b, v2.16b, v4.16b -; CHECK-NEXT: eor v3.16b, v3.16b, v7.16b -; CHECK-NEXT: eor v1.16b, v1.16b, v6.16b -; CHECK-NEXT: eor v0.16b, v0.16b, v5.16b +; CHECK-NEXT: eor v3.16b, v3.16b, v5.16b +; CHECK-NEXT: eor v1.16b, v1.16b, v7.16b +; CHECK-NEXT: eor v0.16b, v0.16b, v6.16b ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: add v1.4s, v3.4s, v2.4s ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s diff --git a/llvm/test/CodeGen/AArch64/mulcmle.ll b/llvm/test/CodeGen/AArch64/mulcmle.ll index 3b2c09c..8a359df 100644 --- a/llvm/test/CodeGen/AArch64/mulcmle.ll +++ b/llvm/test/CodeGen/AArch64/mulcmle.ll @@ -4,13 +4,7 @@ define <1 x i64> @v1i64(<1 x i64> %a) { ; CHECK-LABEL: v1i64: ; CHECK: // %bb.0: -; CHECK-NEXT: movi v1.2s, #1 -; CHECK-NEXT: ushr d0, d0, #31 -; CHECK-NEXT: and v0.8b, v0.8b, v1.8b -; CHECK-NEXT: fmov x8, d0 -; CHECK-NEXT: lsl x9, x8, #32 -; CHECK-NEXT: sub x8, x9, x8 -; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: cmlt v0.2s, v0.2s, #0 ; CHECK-NEXT: ret %b = lshr <1 x i64> %a, %c = and <1 x i64> %b, @@ -21,17 +15,7 @@ define <1 x i64> @v1i64(<1 x 
i64> %a) { define <2 x i64> @v2i64(<2 x i64> %a) { ; CHECK-LABEL: v2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: ushr v0.2d, v0.2d, #31 -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b -; CHECK-NEXT: fmov x9, d0 -; CHECK-NEXT: mov x8, v0.d[1] -; CHECK-NEXT: lsl x10, x9, #32 -; CHECK-NEXT: sub x9, x10, x9 -; CHECK-NEXT: lsl x10, x8, #32 -; CHECK-NEXT: sub x8, x10, x8 -; CHECK-NEXT: fmov d0, x9 -; CHECK-NEXT: mov v0.d[1], x8 +; CHECK-NEXT: cmlt v0.4s, v0.4s, #0 ; CHECK-NEXT: ret %b = lshr <2 x i64> %a, %c = and <2 x i64> %b, @@ -42,11 +26,7 @@ define <2 x i64> @v2i64(<2 x i64> %a) { define <2 x i32> @v2i32(<2 x i32> %a) { ; CHECK-LABEL: v2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: movi v1.4h, #1 -; CHECK-NEXT: movi d2, #0x00ffff0000ffff -; CHECK-NEXT: ushr v0.2s, v0.2s, #15 -; CHECK-NEXT: and v0.8b, v0.8b, v1.8b -; CHECK-NEXT: mul v0.2s, v0.2s, v2.2s +; CHECK-NEXT: cmlt v0.4h, v0.4h, #0 ; CHECK-NEXT: ret %b = lshr <2 x i32> %a, %c = and <2 x i32> %b, @@ -57,11 +37,7 @@ define <2 x i32> @v2i32(<2 x i32> %a) { define <4 x i32> @v4i32(<4 x i32> %a) { ; CHECK-LABEL: v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: movi v1.8h, #1 -; CHECK-NEXT: ushr v0.4s, v0.4s, #15 -; CHECK-NEXT: movi v2.2d, #0x00ffff0000ffff -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b -; CHECK-NEXT: mul v0.4s, v0.4s, v2.4s +; CHECK-NEXT: cmlt v0.8h, v0.8h, #0 ; CHECK-NEXT: ret %b = lshr <4 x i32> %a, %c = and <4 x i32> %b, @@ -72,14 +48,8 @@ define <4 x i32> @v4i32(<4 x i32> %a) { define <8 x i32> @v8i32(<8 x i32> %a) { ; CHECK-LABEL: v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: movi v2.8h, #1 -; CHECK-NEXT: ushr v1.4s, v1.4s, #15 -; CHECK-NEXT: movi v3.2d, #0x00ffff0000ffff -; CHECK-NEXT: ushr v0.4s, v0.4s, #15 -; CHECK-NEXT: and v1.16b, v1.16b, v2.16b -; CHECK-NEXT: and v0.16b, v0.16b, v2.16b -; CHECK-NEXT: mul v0.4s, v0.4s, v3.4s -; CHECK-NEXT: mul v1.4s, v1.4s, v3.4s +; CHECK-NEXT: cmlt v0.8h, v0.8h, #0 +; CHECK-NEXT: cmlt v1.8h, v1.8h, #0 ; CHECK-NEXT: ret %b = lshr <8 x i32> %a, %c = and <8 x i32> 
%b, @@ -90,11 +60,7 @@ define <8 x i32> @v8i32(<8 x i32> %a) { define <4 x i16> @v4i16(<4 x i16> %a) { ; CHECK-LABEL: v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: movi v1.8b, #1 -; CHECK-NEXT: movi d2, #0xff00ff00ff00ff -; CHECK-NEXT: ushr v0.4h, v0.4h, #7 -; CHECK-NEXT: and v0.8b, v0.8b, v1.8b -; CHECK-NEXT: mul v0.4h, v0.4h, v2.4h +; CHECK-NEXT: cmlt v0.8b, v0.8b, #0 ; CHECK-NEXT: ret %b = lshr <4 x i16> %a, %c = and <4 x i16> %b, @@ -105,11 +71,7 @@ define <4 x i16> @v4i16(<4 x i16> %a) { define <8 x i16> @v8i16(<8 x i16> %a) { ; CHECK-LABEL: v8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: movi v1.16b, #1 -; CHECK-NEXT: ushr v0.8h, v0.8h, #7 -; CHECK-NEXT: movi v2.2d, #0xff00ff00ff00ff -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b -; CHECK-NEXT: mul v0.8h, v0.8h, v2.8h +; CHECK-NEXT: cmlt v0.16b, v0.16b, #0 ; CHECK-NEXT: ret %b = lshr <8 x i16> %a, %c = and <8 x i16> %b, diff --git a/llvm/test/CodeGen/AArch64/reduce-shuffle.ll b/llvm/test/CodeGen/AArch64/reduce-shuffle.ll index caba5a6..797f372 100644 --- a/llvm/test/CodeGen/AArch64/reduce-shuffle.ll +++ b/llvm/test/CodeGen/AArch64/reduce-shuffle.ll @@ -93,47 +93,37 @@ define i32 @v1(ptr nocapture noundef readonly %p1, i32 noundef %i1, ptr nocaptur ; CHECK-NEXT: ext v17.16b, v3.16b, v17.16b, #12 ; CHECK-NEXT: mov v0.s[2], v2.s[1] ; CHECK-NEXT: uzp2 v4.4s, v4.4s, v18.4s -; CHECK-NEXT: mov v7.s[2], v2.s[3] ; CHECK-NEXT: mov v3.s[2], v5.s[3] +; CHECK-NEXT: mov v7.s[2], v2.s[3] ; CHECK-NEXT: sub v18.4s, v1.4s, v6.4s ; CHECK-NEXT: mov v6.s[0], v5.s[1] ; CHECK-NEXT: sub v19.4s, v0.4s, v4.4s -; CHECK-NEXT: sub v20.4s, v7.4s, v16.4s -; CHECK-NEXT: sub v21.4s, v3.4s, v17.4s +; CHECK-NEXT: sub v20.4s, v3.4s, v17.4s +; CHECK-NEXT: sub v21.4s, v7.4s, v16.4s ; CHECK-NEXT: mov v0.s[1], v2.s[0] -; CHECK-NEXT: mov v7.s[1], v2.s[2] ; CHECK-NEXT: mov v3.s[1], v5.s[2] +; CHECK-NEXT: mov v7.s[1], v2.s[2] ; CHECK-NEXT: add v1.4s, v1.4s, v6.4s ; CHECK-NEXT: add v0.4s, v0.4s, v4.4s -; CHECK-NEXT: add v2.4s, v7.4s, v16.4s -; CHECK-NEXT: add v3.4s, 
v3.4s, v17.4s -; CHECK-NEXT: mov v2.d[1], v20.d[1] -; CHECK-NEXT: mov v3.d[1], v21.d[1] +; CHECK-NEXT: add v2.4s, v3.4s, v17.4s +; CHECK-NEXT: add v3.4s, v7.4s, v16.4s ; CHECK-NEXT: mov v1.d[1], v18.d[1] ; CHECK-NEXT: mov v0.d[1], v19.d[1] -; CHECK-NEXT: movi v4.8h, #1 -; CHECK-NEXT: movi v17.2d, #0x00ffff0000ffff -; CHECK-NEXT: ushr v5.4s, v1.4s, #15 -; CHECK-NEXT: ushr v6.4s, v2.4s, #15 -; CHECK-NEXT: ushr v7.4s, v0.4s, #15 -; CHECK-NEXT: ushr v16.4s, v3.4s, #15 -; CHECK-NEXT: and v6.16b, v6.16b, v4.16b -; CHECK-NEXT: and v16.16b, v16.16b, v4.16b -; CHECK-NEXT: and v7.16b, v7.16b, v4.16b -; CHECK-NEXT: and v4.16b, v5.16b, v4.16b -; CHECK-NEXT: mul v5.4s, v6.4s, v17.4s -; CHECK-NEXT: mul v6.4s, v16.4s, v17.4s -; CHECK-NEXT: mul v4.4s, v4.4s, v17.4s -; CHECK-NEXT: mul v7.4s, v7.4s, v17.4s -; CHECK-NEXT: add v2.4s, v5.4s, v2.4s +; CHECK-NEXT: mov v3.d[1], v21.d[1] +; CHECK-NEXT: mov v2.d[1], v20.d[1] +; CHECK-NEXT: cmlt v4.8h, v1.8h, #0 +; CHECK-NEXT: cmlt v5.8h, v0.8h, #0 +; CHECK-NEXT: cmlt v6.8h, v3.8h, #0 +; CHECK-NEXT: cmlt v7.8h, v2.8h, #0 ; CHECK-NEXT: add v3.4s, v6.4s, v3.4s +; CHECK-NEXT: add v2.4s, v7.4s, v2.4s ; CHECK-NEXT: add v1.4s, v4.4s, v1.4s -; CHECK-NEXT: add v0.4s, v7.4s, v0.4s +; CHECK-NEXT: add v0.4s, v5.4s, v0.4s ; CHECK-NEXT: eor v1.16b, v1.16b, v4.16b -; CHECK-NEXT: eor v0.16b, v0.16b, v7.16b +; CHECK-NEXT: eor v0.16b, v0.16b, v5.16b +; CHECK-NEXT: eor v2.16b, v2.16b, v7.16b ; CHECK-NEXT: eor v3.16b, v3.16b, v6.16b -; CHECK-NEXT: eor v2.16b, v2.16b, v5.16b -; CHECK-NEXT: add v2.4s, v3.4s, v2.4s +; CHECK-NEXT: add v2.4s, v2.4s, v3.4s ; CHECK-NEXT: add v0.4s, v1.4s, v0.4s ; CHECK-NEXT: add v0.4s, v0.4s, v2.4s ; CHECK-NEXT: addv s0, v0.4s @@ -316,50 +306,40 @@ define i32 @v2(ptr nocapture noundef readonly %p1, i32 noundef %i1, ptr nocaptur ; CHECK-NEXT: mov v5.d[1], v1.d[1] ; CHECK-NEXT: mov v7.d[1], v17.d[1] ; CHECK-NEXT: mov v0.d[1], v3.d[1] -; CHECK-NEXT: movi v1.8h, #1 -; CHECK-NEXT: add v2.4s, v7.4s, v5.4s -; CHECK-NEXT: add v3.4s, v0.4s, 
v4.4s -; CHECK-NEXT: sub v5.4s, v5.4s, v7.4s +; CHECK-NEXT: add v1.4s, v7.4s, v5.4s +; CHECK-NEXT: add v2.4s, v0.4s, v4.4s ; CHECK-NEXT: sub v0.4s, v4.4s, v0.4s -; CHECK-NEXT: ext v4.16b, v3.16b, v3.16b, #4 -; CHECK-NEXT: zip2 v6.4s, v0.4s, v3.4s -; CHECK-NEXT: zip2 v7.4s, v5.4s, v2.4s -; CHECK-NEXT: zip1 v16.4s, v2.4s, v5.4s -; CHECK-NEXT: zip2 v17.4s, v2.4s, v5.4s -; CHECK-NEXT: ext v2.16b, v2.16b, v2.16b, #4 -; CHECK-NEXT: add v6.4s, v6.4s, v7.4s -; CHECK-NEXT: zip2 v7.4s, v3.4s, v0.4s -; CHECK-NEXT: zip1 v3.4s, v3.4s, v0.4s +; CHECK-NEXT: ext v4.16b, v2.16b, v2.16b, #4 +; CHECK-NEXT: ext v16.16b, v1.16b, v1.16b, #4 +; CHECK-NEXT: sub v3.4s, v5.4s, v7.4s +; CHECK-NEXT: zip2 v5.4s, v0.4s, v2.4s +; CHECK-NEXT: zip1 v6.4s, v1.4s, v3.4s +; CHECK-NEXT: zip2 v7.4s, v1.4s, v3.4s +; CHECK-NEXT: zip2 v1.4s, v3.4s, v1.4s +; CHECK-NEXT: zip1 v17.4s, v2.4s, v0.4s +; CHECK-NEXT: zip2 v2.4s, v2.4s, v0.4s ; CHECK-NEXT: ext v0.16b, v4.16b, v0.16b, #8 -; CHECK-NEXT: ext v5.16b, v2.16b, v5.16b, #8 -; CHECK-NEXT: sub v7.4s, v17.4s, v7.4s -; CHECK-NEXT: sub v3.4s, v16.4s, v3.4s +; CHECK-NEXT: ext v3.16b, v16.16b, v3.16b, #8 +; CHECK-NEXT: add v1.4s, v5.4s, v1.4s +; CHECK-NEXT: sub v5.4s, v6.4s, v17.4s ; CHECK-NEXT: ext v0.16b, v0.16b, v4.16b, #4 -; CHECK-NEXT: ext v2.16b, v5.16b, v2.16b, #4 -; CHECK-NEXT: movi v17.2d, #0x00ffff0000ffff -; CHECK-NEXT: ushr v5.4s, v3.4s, #15 -; CHECK-NEXT: ushr v4.4s, v6.4s, #15 -; CHECK-NEXT: ushr v16.4s, v7.4s, #15 -; CHECK-NEXT: add v0.4s, v0.4s, v2.4s -; CHECK-NEXT: and v5.16b, v5.16b, v1.16b -; CHECK-NEXT: mul v2.4s, v5.4s, v17.4s -; CHECK-NEXT: ushr v5.4s, v0.4s, #15 -; CHECK-NEXT: and v4.16b, v4.16b, v1.16b -; CHECK-NEXT: and v16.16b, v16.16b, v1.16b -; CHECK-NEXT: and v1.16b, v5.16b, v1.16b -; CHECK-NEXT: mul v4.4s, v4.4s, v17.4s -; CHECK-NEXT: mul v16.4s, v16.4s, v17.4s -; CHECK-NEXT: mul v1.4s, v1.4s, v17.4s -; CHECK-NEXT: add v3.4s, v2.4s, v3.4s -; CHECK-NEXT: add v5.4s, v4.4s, v6.4s -; CHECK-NEXT: add v6.4s, v16.4s, v7.4s -; CHECK-NEXT: 
add v0.4s, v1.4s, v0.4s -; CHECK-NEXT: eor v2.16b, v3.16b, v2.16b -; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b -; CHECK-NEXT: eor v1.16b, v6.16b, v16.16b -; CHECK-NEXT: eor v3.16b, v5.16b, v4.16b -; CHECK-NEXT: add v1.4s, v3.4s, v1.4s -; CHECK-NEXT: add v0.4s, v0.4s, v2.4s +; CHECK-NEXT: ext v3.16b, v3.16b, v16.16b, #4 +; CHECK-NEXT: cmlt v6.8h, v5.8h, #0 +; CHECK-NEXT: sub v2.4s, v7.4s, v2.4s +; CHECK-NEXT: add v4.4s, v6.4s, v5.4s +; CHECK-NEXT: add v0.4s, v0.4s, v3.4s +; CHECK-NEXT: cmlt v7.8h, v2.8h, #0 +; CHECK-NEXT: cmlt v17.8h, v1.8h, #0 +; CHECK-NEXT: eor v3.16b, v4.16b, v6.16b +; CHECK-NEXT: cmlt v4.8h, v0.8h, #0 +; CHECK-NEXT: add v1.4s, v17.4s, v1.4s +; CHECK-NEXT: add v2.4s, v7.4s, v2.4s +; CHECK-NEXT: add v0.4s, v4.4s, v0.4s +; CHECK-NEXT: eor v2.16b, v2.16b, v7.16b +; CHECK-NEXT: eor v1.16b, v1.16b, v17.16b +; CHECK-NEXT: eor v0.16b, v0.16b, v4.16b +; CHECK-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-NEXT: add v0.4s, v0.4s, v3.4s ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w8, s0 @@ -557,25 +537,15 @@ define i32 @v3(ptr nocapture noundef readonly %p1, i32 noundef %i1, ptr nocaptur ; CHECK-NEXT: ext v2.16b, v6.16b, v2.16b, #8 ; CHECK-NEXT: ext v0.16b, v3.16b, v0.16b, #8 ; CHECK-NEXT: add v1.4s, v16.4s, v1.4s -; CHECK-NEXT: movi v16.8h, #1 +; CHECK-NEXT: sub v4.4s, v4.4s, v17.4s ; CHECK-NEXT: ext v2.16b, v2.16b, v6.16b, #4 ; CHECK-NEXT: ext v0.16b, v0.16b, v3.16b, #4 ; CHECK-NEXT: sub v3.4s, v5.4s, v7.4s -; CHECK-NEXT: sub v4.4s, v4.4s, v17.4s -; CHECK-NEXT: ushr v5.4s, v1.4s, #15 +; CHECK-NEXT: cmlt v5.8h, v4.8h, #0 +; CHECK-NEXT: cmlt v6.8h, v3.8h, #0 ; CHECK-NEXT: add v0.4s, v0.4s, v2.4s -; CHECK-NEXT: ushr v6.4s, v3.4s, #15 -; CHECK-NEXT: movi v17.2d, #0x00ffff0000ffff -; CHECK-NEXT: and v2.16b, v5.16b, v16.16b -; CHECK-NEXT: ushr v5.4s, v4.4s, #15 -; CHECK-NEXT: ushr v7.4s, v0.4s, #15 -; CHECK-NEXT: and v6.16b, v6.16b, v16.16b -; CHECK-NEXT: and v7.16b, v7.16b, v16.16b -; CHECK-NEXT: and v5.16b, v5.16b, v16.16b -; 
CHECK-NEXT: mul v2.4s, v2.4s, v17.4s -; CHECK-NEXT: mul v6.4s, v6.4s, v17.4s -; CHECK-NEXT: mul v5.4s, v5.4s, v17.4s -; CHECK-NEXT: mul v7.4s, v7.4s, v17.4s +; CHECK-NEXT: cmlt v2.8h, v1.8h, #0 +; CHECK-NEXT: cmlt v7.8h, v0.8h, #0 ; CHECK-NEXT: add v1.4s, v2.4s, v1.4s ; CHECK-NEXT: add v3.4s, v6.4s, v3.4s ; CHECK-NEXT: add v4.4s, v5.4s, v4.4s -- 2.7.4