From: David Green Date: Thu, 3 Feb 2022 11:05:48 +0000 (+0000) Subject: [AArch64] Reassociate integer extending reductions to pairwise addition. X-Git-Tag: upstream/15.0.7~17956 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=31373fb88a0a3464013e6ebc5773af27a0603275;p=platform%2Fupstream%2Fllvm.git [AArch64] Reassociate integer extending reductions to pairwise addition. Given an (integer) vecreduce, we know the order of the inputs does not matter. We can convert UADDV(add(zext(extract_lo(x)), zext(extract_hi(x)))) into UADDV(UADDLP(x)). This can also happen through an extra add, where we transform UADDV(add(y, add(zext(extract_lo(x)), zext(extract_hi(x))))). This makes sure the same thing happens for signed cases too, which requires adding a new SADDLP node. Differential Revision: https://reviews.llvm.org/D118107 --- diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 638bbd7..c48f5c4 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -2254,6 +2254,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { MAKE_CASE(AArch64ISD::CTPOP_MERGE_PASSTHRU) MAKE_CASE(AArch64ISD::DUP_MERGE_PASSTHRU) MAKE_CASE(AArch64ISD::INDEX_VECTOR) + MAKE_CASE(AArch64ISD::SADDLP) MAKE_CASE(AArch64ISD::UADDLP) MAKE_CASE(AArch64ISD::CALL_RVMARKER) MAKE_CASE(AArch64ISD::ASSERT_ZEXT_BOOL) @@ -4378,8 +4379,11 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); } + case Intrinsic::aarch64_neon_saddlp: case Intrinsic::aarch64_neon_uaddlp: { - unsigned Opcode = AArch64ISD::UADDLP; + unsigned Opcode = IntNo == Intrinsic::aarch64_neon_uaddlp + ? 
AArch64ISD::UADDLP + : AArch64ISD::SADDLP; return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1)); } case Intrinsic::aarch64_neon_sdot: @@ -13196,6 +13200,61 @@ static SDValue performVecReduceAddCombine(SDNode *N, SelectionDAG &DAG, return DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot); } +// Given an (integer) vecreduce, we know the order of the inputs does not +// matter. We can convert UADDV(add(zext(extract_lo(x)), zext(extract_hi(x)))) +// into UADDV(UADDLP(x)). This can also happen through an extra add, where we +// transform UADDV(add(y, add(zext(extract_lo(x)), zext(extract_hi(x))))). +static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG) { + auto DetectAddExtract = [&](SDValue A) { + // Look for add(zext(extract_lo(x)), zext(extract_hi(x))), returning + // UADDLP(x) if found. + if (A.getOpcode() != ISD::ADD) + return SDValue(); + EVT VT = A.getValueType(); + SDValue Op0 = A.getOperand(0); + SDValue Op1 = A.getOperand(1); + if (Op0.getOpcode() != Op1.getOpcode() || + (Op0.getOpcode() != ISD::ZERO_EXTEND && + Op0.getOpcode() != ISD::SIGN_EXTEND)) + return SDValue(); + SDValue Ext0 = Op0.getOperand(0); + SDValue Ext1 = Op1.getOperand(0); + if (Ext0.getOpcode() != ISD::EXTRACT_SUBVECTOR || + Ext1.getOpcode() != ISD::EXTRACT_SUBVECTOR || + Ext0.getOperand(0) != Ext1.getOperand(0)) + return SDValue(); + // Check that the type is twice the add types, and the extracts are from + // upper/lower parts of the same source. + if (Ext0.getOperand(0).getValueType().getVectorNumElements() != + VT.getVectorNumElements() * 2) + return SDValue(); + if ((Ext0.getConstantOperandVal(1) != 0 || + Ext1.getConstantOperandVal(1) != VT.getVectorNumElements()) && + (Ext1.getConstantOperandVal(1) != 0 || + Ext0.getConstantOperandVal(1) != VT.getVectorNumElements())) + return SDValue(); + unsigned Opcode = Op0.getOpcode() == ISD::ZERO_EXTEND ? 
AArch64ISD::UADDLP + : AArch64ISD::SADDLP; + return DAG.getNode(Opcode, SDLoc(A), VT, Ext0.getOperand(0)); + }; + + SDValue A = N->getOperand(0); + if (SDValue R = DetectAddExtract(A)) + return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), R); + if (A.getOpcode() == ISD::ADD) { + if (SDValue R = DetectAddExtract(A.getOperand(0))) + return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), + DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R, + A.getOperand(1))); + if (SDValue R = DetectAddExtract(A.getOperand(1))) + return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), + DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R, + A.getOperand(0))); + } + return SDValue(); +} + + static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget) { @@ -14722,7 +14781,7 @@ static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) { } // ADD(UADDV a, UADDV b) --> UADDV(ADD a, b) -static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG) { +static SDValue performAddUADDVCombine(SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); // Only scalar integer and vector types. if (N->getOpcode() != ISD::ADD || !VT.isScalarInteger()) @@ -14838,7 +14897,7 @@ static SDValue performAddSubCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { // Try to change sum of two reductions. 
- if (SDValue Val = performUADDVCombine(N, DAG)) + if (SDValue Val = performAddUADDVCombine(N, DAG)) return Val; if (SDValue Val = performAddDotCombine(N, DAG)) return Val; @@ -17805,6 +17864,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, return performExtractVectorEltCombine(N, DAG); case ISD::VECREDUCE_ADD: return performVecReduceAddCombine(N, DCI.DAG, Subtarget); + case AArch64ISD::UADDV: + return performUADDVCombine(N, DAG); case ISD::INTRINSIC_VOID: case ISD::INTRINSIC_W_CHAIN: switch (cast(N->getOperand(1))->getZExtValue()) { diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 03d0030..db77107 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -240,7 +240,8 @@ enum NodeType : unsigned { SRHADD, URHADD, - // Unsigned Add Long Pairwise + // Add Long Pairwise + SADDLP, UADDLP, // udot/sdot instructions diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 83bf89f..8dc91d6 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -643,9 +643,13 @@ def AArch64sabd : PatFrags<(ops node:$lhs, node:$rhs), (int_aarch64_neon_sabd node:$lhs, node:$rhs)]>; def AArch64uaddlp_n : SDNode<"AArch64ISD::UADDLP", SDT_AArch64uaddlp>; +def AArch64saddlp_n : SDNode<"AArch64ISD::SADDLP", SDT_AArch64uaddlp>; def AArch64uaddlp : PatFrags<(ops node:$src), [(AArch64uaddlp_n node:$src), (int_aarch64_neon_uaddlp node:$src)]>; +def AArch64saddlp : PatFrags<(ops node:$src), + [(AArch64saddlp_n node:$src), + (int_aarch64_neon_saddlp node:$src)]>; def SDT_AArch64SETTAG : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisPtrTy<1>]>; def AArch64stg : SDNode<"AArch64ISD::STG", SDT_AArch64SETTAG, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; @@ -4312,8 +4316,8 @@ defm REV16 : SIMDTwoVectorB<0, 0b00, 0b00001, "rev16", AArch64rev16>; defm REV32 : 
SIMDTwoVectorBH<1, 0b00000, "rev32", AArch64rev32>; defm REV64 : SIMDTwoVectorBHS<0, 0b00000, "rev64", AArch64rev64>; defm SADALP : SIMDLongTwoVectorTied<0, 0b00110, "sadalp", - BinOpFrag<(add node:$LHS, (int_aarch64_neon_saddlp node:$RHS))> >; -defm SADDLP : SIMDLongTwoVector<0, 0b00010, "saddlp", int_aarch64_neon_saddlp>; + BinOpFrag<(add node:$LHS, (AArch64saddlp node:$RHS))> >; +defm SADDLP : SIMDLongTwoVector<0, 0b00010, "saddlp", AArch64saddlp>; defm SCVTF : SIMDTwoVectorIntToFP<0, 0, 0b11101, "scvtf", sint_to_fp>; defm SHLL : SIMDVectorLShiftLongBySizeBHS; defm SQABS : SIMDTwoVectorBHSD<0, 0b00111, "sqabs", int_aarch64_neon_sqabs>; diff --git a/llvm/test/CodeGen/AArch64/aarch64-addv.ll b/llvm/test/CodeGen/AArch64/aarch64-addv.ll index 32d7beb..36b418b 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-addv.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-addv.ll @@ -65,9 +65,7 @@ define i32 @oversized_ADDV_256(i8* noalias nocapture readonly %arg1, i8* noalias ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: ldr d1, [x1] ; CHECK-NEXT: uabdl v0.8h, v0.8b, v1.8b -; CHECK-NEXT: ushll v1.4s, v0.4h, #0 -; CHECK-NEXT: uaddw2 v0.4s, v1.4s, v0.8h -; CHECK-NEXT: addv s0, v0.4s +; CHECK-NEXT: uaddlv s0, v0.8h ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/arm64-vabs.ll b/llvm/test/CodeGen/AArch64/arm64-vabs.ll index d1f1a39..da48edc 100644 --- a/llvm/test/CodeGen/AArch64/arm64-vabs.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vabs.ll @@ -198,9 +198,7 @@ define i16 @uabd16b_rdx(<16 x i8>* %a, <16 x i8>* %b) { ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ldr q1, [x1] ; CHECK-NEXT: uabd.16b v0, v0, v1 -; CHECK-NEXT: ushll.8h v1, v0, #0 -; CHECK-NEXT: uaddw2.8h v0, v1, v0 -; CHECK-NEXT: addv.8h h0, v0 +; CHECK-NEXT: uaddlv.16b h0, v0 ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %aload = load <16 x i8>, <16 x i8>* %a, align 1 @@ -261,9 +259,7 @@ define i32 @uabd8h_rdx(<8 x i16>* %a, <8 x i16>* %b) { ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ldr q1, [x1] ; 
CHECK-NEXT: uabd.8h v0, v0, v1 -; CHECK-NEXT: ushll.4s v1, v0, #0 -; CHECK-NEXT: uaddw2.4s v0, v1, v0 -; CHECK-NEXT: addv.4s s0, v0 +; CHECK-NEXT: uaddlv.8h s0, v0 ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %aload = load <8 x i16>, <8 x i16>* %a, align 1 @@ -282,9 +278,7 @@ define i32 @sabd8h_rdx(<8 x i16> %a, <8 x i16> %b) { ; CHECK-LABEL: sabd8h_rdx: ; CHECK: // %bb.0: ; CHECK-NEXT: sabd.8h v0, v0, v1 -; CHECK-NEXT: ushll.4s v1, v0, #0 -; CHECK-NEXT: uaddw2.4s v0, v1, v0 -; CHECK-NEXT: addv.4s s0, v0 +; CHECK-NEXT: uaddlv.8h s0, v0 ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %aext = sext <8 x i16> %a to <8 x i32> @@ -338,9 +332,7 @@ define i64 @uabd4s_rdx(<4 x i32>* %a, <4 x i32>* %b, i32 %h) { ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ldr q1, [x1] ; CHECK-NEXT: uabd.4s v0, v0, v1 -; CHECK-NEXT: ushll.2d v1, v0, #0 -; CHECK-NEXT: uaddw2.2d v0, v1, v0 -; CHECK-NEXT: addp.2d d0, v0 +; CHECK-NEXT: uaddlv.4s d0, v0 ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret %aload = load <4 x i32>, <4 x i32>* %a, align 1 @@ -359,9 +351,7 @@ define i64 @sabd4s_rdx(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: sabd4s_rdx: ; CHECK: // %bb.0: ; CHECK-NEXT: sabd.4s v0, v0, v1 -; CHECK-NEXT: ushll.2d v1, v0, #0 -; CHECK-NEXT: uaddw2.2d v0, v1, v0 -; CHECK-NEXT: addp.2d d0, v0 +; CHECK-NEXT: uaddlv.4s d0, v0 ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret %aext = sext <4 x i32> %a to <4 x i64> diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add.ll b/llvm/test/CodeGen/AArch64/vecreduce-add.ll index 0607707..273f2e0 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-add.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-add.ll @@ -16,9 +16,7 @@ entry: define i64 @add_v4i32_v4i64_zext(<4 x i32> %x) { ; CHECK-LABEL: add_v4i32_v4i64_zext: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ushll v1.2d, v0.2s, #0 -; CHECK-NEXT: uaddw2 v0.2d, v1.2d, v0.4s -; CHECK-NEXT: addp d0, v0.2d +; CHECK-NEXT: uaddlv d0, v0.4s ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret entry: @@ -30,8 +28,7 @@ entry: define i64 
@add_v4i32_v4i64_sext(<4 x i32> %x) { ; CHECK-LABEL: add_v4i32_v4i64_sext: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sshll v1.2d, v0.2s, #0 -; CHECK-NEXT: saddw2 v0.2d, v1.2d, v0.4s +; CHECK-NEXT: saddlp v0.2d, v0.4s ; CHECK-NEXT: addp d0, v0.2d ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret @@ -70,9 +67,7 @@ entry: define i32 @add_v8i16_v8i32_zext(<8 x i16> %x) { ; CHECK-LABEL: add_v8i16_v8i32_zext: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ushll v1.4s, v0.4h, #0 -; CHECK-NEXT: uaddw2 v0.4s, v1.4s, v0.8h -; CHECK-NEXT: addv s0, v0.4s +; CHECK-NEXT: uaddlv s0, v0.8h ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret entry: @@ -84,8 +79,7 @@ entry: define i32 @add_v8i16_v8i32_sext(<8 x i16> %x) { ; CHECK-LABEL: add_v8i16_v8i32_sext: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sshll v1.4s, v0.4h, #0 -; CHECK-NEXT: saddw2 v0.4s, v1.4s, v0.8h +; CHECK-NEXT: saddlp v0.4s, v0.8h ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret @@ -170,9 +164,7 @@ define i64 @add_v4i16_v4i64_zext(<4 x i16> %x) { ; CHECK-LABEL: add_v4i16_v4i64_zext: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-NEXT: ushll v1.2d, v0.2s, #0 -; CHECK-NEXT: uaddw2 v0.2d, v1.2d, v0.4s -; CHECK-NEXT: addp d0, v0.2d +; CHECK-NEXT: uaddlv d0, v0.4s ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret entry: @@ -185,8 +177,7 @@ define i64 @add_v4i16_v4i64_sext(<4 x i16> %x) { ; CHECK-LABEL: add_v4i16_v4i64_sext: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: sshll v0.4s, v0.4h, #0 -; CHECK-NEXT: sshll v1.2d, v0.2s, #0 -; CHECK-NEXT: saddw2 v0.2d, v1.2d, v0.4s +; CHECK-NEXT: saddlp v0.2d, v0.4s ; CHECK-NEXT: addp d0, v0.2d ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret @@ -282,9 +273,7 @@ define i32 @add_v8i8_v8i32_zext(<8 x i8> %x) { ; CHECK-BASE-LABEL: add_v8i8_v8i32_zext: ; CHECK-BASE: // %bb.0: // %entry ; CHECK-BASE-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-BASE-NEXT: ushll v1.4s, v0.4h, #0 -; CHECK-BASE-NEXT: uaddw2 v0.4s, v1.4s, v0.8h -; CHECK-BASE-NEXT: addv s0, v0.4s +; 
CHECK-BASE-NEXT: uaddlv s0, v0.8h ; CHECK-BASE-NEXT: fmov w0, s0 ; CHECK-BASE-NEXT: ret ; @@ -306,8 +295,7 @@ define i32 @add_v8i8_v8i32_sext(<8 x i8> %x) { ; CHECK-BASE-LABEL: add_v8i8_v8i32_sext: ; CHECK-BASE: // %bb.0: // %entry ; CHECK-BASE-NEXT: sshll v0.8h, v0.8b, #0 -; CHECK-BASE-NEXT: sshll v1.4s, v0.4h, #0 -; CHECK-BASE-NEXT: saddw2 v0.4s, v1.4s, v0.8h +; CHECK-BASE-NEXT: saddlp v0.4s, v0.8h ; CHECK-BASE-NEXT: addv s0, v0.4s ; CHECK-BASE-NEXT: fmov w0, s0 ; CHECK-BASE-NEXT: ret @@ -358,8 +346,7 @@ entry: define zeroext i16 @add_v16i8_v16i16_zext(<16 x i8> %x) { ; CHECK-LABEL: add_v16i8_v16i16_zext: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ushll v1.8h, v0.8b, #0 -; CHECK-NEXT: uaddw2 v0.8h, v1.8h, v0.16b +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: addv h0, v0.8h ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret @@ -372,8 +359,7 @@ entry: define signext i16 @add_v16i8_v16i16_sext(<16 x i8> %x) { ; CHECK-LABEL: add_v16i8_v16i16_sext: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sshll v1.8h, v0.8b, #0 -; CHECK-NEXT: saddw2 v0.8h, v1.8h, v0.16b +; CHECK-NEXT: saddlp v0.8h, v0.16b ; CHECK-NEXT: addv h0, v0.8h ; CHECK-NEXT: smov w0, v0.h[0] ; CHECK-NEXT: ret @@ -511,9 +497,7 @@ define i64 @add_v4i8_v4i64_zext(<4 x i8> %x) { ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: bic v0.4h, #255, lsl #8 ; CHECK-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-NEXT: ushll v1.2d, v0.2s, #0 -; CHECK-NEXT: uaddw2 v0.2d, v1.2d, v0.4s -; CHECK-NEXT: addp d0, v0.2d +; CHECK-NEXT: uaddlv d0, v0.4s ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret entry: @@ -598,9 +582,7 @@ entry: define i64 @add_v4i32_v4i64_acc_zext(<4 x i32> %x, i64 %a) { ; CHECK-LABEL: add_v4i32_v4i64_acc_zext: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ushll v1.2d, v0.2s, #0 -; CHECK-NEXT: uaddw2 v0.2d, v1.2d, v0.4s -; CHECK-NEXT: addp d0, v0.2d +; CHECK-NEXT: uaddlv d0, v0.4s ; CHECK-NEXT: fmov x8, d0 ; CHECK-NEXT: add x0, x8, x0 ; CHECK-NEXT: ret @@ -614,8 +596,7 @@ entry: define i64 @add_v4i32_v4i64_acc_sext(<4 x i32> %x, i64 
%a) { ; CHECK-LABEL: add_v4i32_v4i64_acc_sext: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sshll v1.2d, v0.2s, #0 -; CHECK-NEXT: saddw2 v0.2d, v1.2d, v0.4s +; CHECK-NEXT: saddlp v0.2d, v0.4s ; CHECK-NEXT: addp d0, v0.2d ; CHECK-NEXT: fmov x8, d0 ; CHECK-NEXT: add x0, x8, x0 @@ -660,9 +641,7 @@ entry: define i32 @add_v8i16_v8i32_acc_zext(<8 x i16> %x, i32 %a) { ; CHECK-LABEL: add_v8i16_v8i32_acc_zext: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ushll v1.4s, v0.4h, #0 -; CHECK-NEXT: uaddw2 v0.4s, v1.4s, v0.8h -; CHECK-NEXT: addv s0, v0.4s +; CHECK-NEXT: uaddlv s0, v0.8h ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: add w0, w8, w0 ; CHECK-NEXT: ret @@ -676,8 +655,7 @@ entry: define i32 @add_v8i16_v8i32_acc_sext(<8 x i16> %x, i32 %a) { ; CHECK-LABEL: add_v8i16_v8i32_acc_sext: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sshll v1.4s, v0.4h, #0 -; CHECK-NEXT: saddw2 v0.4s, v1.4s, v0.8h +; CHECK-NEXT: saddlp v0.4s, v0.8h ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: add w0, w8, w0 @@ -775,9 +753,7 @@ define i64 @add_v4i16_v4i64_acc_zext(<4 x i16> %x, i64 %a) { ; CHECK-LABEL: add_v4i16_v4i64_acc_zext: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-NEXT: ushll v1.2d, v0.2s, #0 -; CHECK-NEXT: uaddw2 v0.2d, v1.2d, v0.4s -; CHECK-NEXT: addp d0, v0.2d +; CHECK-NEXT: uaddlv d0, v0.4s ; CHECK-NEXT: fmov x8, d0 ; CHECK-NEXT: add x0, x8, x0 ; CHECK-NEXT: ret @@ -792,8 +768,7 @@ define i64 @add_v4i16_v4i64_acc_sext(<4 x i16> %x, i64 %a) { ; CHECK-LABEL: add_v4i16_v4i64_acc_sext: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: sshll v0.4s, v0.4h, #0 -; CHECK-NEXT: sshll v1.2d, v0.2s, #0 -; CHECK-NEXT: saddw2 v0.2d, v1.2d, v0.4s +; CHECK-NEXT: saddlp v0.2d, v0.4s ; CHECK-NEXT: addp d0, v0.2d ; CHECK-NEXT: fmov x8, d0 ; CHECK-NEXT: add x0, x8, x0 @@ -901,9 +876,7 @@ define i32 @add_v8i8_v8i32_acc_zext(<8 x i8> %x, i32 %a) { ; CHECK-BASE-LABEL: add_v8i8_v8i32_acc_zext: ; CHECK-BASE: // %bb.0: // %entry ; CHECK-BASE-NEXT: ushll v0.8h, v0.8b, #0 -; 
CHECK-BASE-NEXT: ushll v1.4s, v0.4h, #0 -; CHECK-BASE-NEXT: uaddw2 v0.4s, v1.4s, v0.8h -; CHECK-BASE-NEXT: addv s0, v0.4s +; CHECK-BASE-NEXT: uaddlv s0, v0.8h ; CHECK-BASE-NEXT: fmov w8, s0 ; CHECK-BASE-NEXT: add w0, w8, w0 ; CHECK-BASE-NEXT: ret @@ -928,8 +901,7 @@ define i32 @add_v8i8_v8i32_acc_sext(<8 x i8> %x, i32 %a) { ; CHECK-BASE-LABEL: add_v8i8_v8i32_acc_sext: ; CHECK-BASE: // %bb.0: // %entry ; CHECK-BASE-NEXT: sshll v0.8h, v0.8b, #0 -; CHECK-BASE-NEXT: sshll v1.4s, v0.4h, #0 -; CHECK-BASE-NEXT: saddw2 v0.4s, v1.4s, v0.8h +; CHECK-BASE-NEXT: saddlp v0.4s, v0.8h ; CHECK-BASE-NEXT: addv s0, v0.4s ; CHECK-BASE-NEXT: fmov w8, s0 ; CHECK-BASE-NEXT: add w0, w8, w0 @@ -987,9 +959,7 @@ entry: define zeroext i16 @add_v16i8_v16i16_acc_zext(<16 x i8> %x, i16 %a) { ; CHECK-LABEL: add_v16i8_v16i16_acc_zext: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ushll v1.8h, v0.8b, #0 -; CHECK-NEXT: uaddw2 v0.8h, v1.8h, v0.16b -; CHECK-NEXT: addv h0, v0.8h +; CHECK-NEXT: uaddlv h0, v0.16b ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: add w8, w8, w0 ; CHECK-NEXT: and w0, w8, #0xffff @@ -1004,8 +974,7 @@ entry: define signext i16 @add_v16i8_v16i16_acc_sext(<16 x i8> %x, i16 %a) { ; CHECK-LABEL: add_v16i8_v16i16_acc_sext: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sshll v1.8h, v0.8b, #0 -; CHECK-NEXT: saddw2 v0.8h, v1.8h, v0.16b +; CHECK-NEXT: saddlp v0.8h, v0.16b ; CHECK-NEXT: addv h0, v0.8h ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: add w8, w8, w0 @@ -1163,9 +1132,7 @@ define i64 @add_v4i8_v4i64_acc_zext(<4 x i8> %x, i64 %a) { ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: bic v0.4h, #255, lsl #8 ; CHECK-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-NEXT: ushll v1.2d, v0.2s, #0 -; CHECK-NEXT: uaddw2 v0.2d, v1.2d, v0.4s -; CHECK-NEXT: addp d0, v0.2d +; CHECK-NEXT: uaddlv d0, v0.4s ; CHECK-NEXT: fmov x8, d0 ; CHECK-NEXT: add x0, x8, x0 ; CHECK-NEXT: ret @@ -1261,11 +1228,8 @@ entry: define i64 @add_pair_v4i32_v4i64_zext(<4 x i32> %x, <4 x i32> %y) { ; CHECK-LABEL: add_pair_v4i32_v4i64_zext: ; CHECK: // 
%bb.0: // %entry -; CHECK-NEXT: ushll v2.2d, v0.2s, #0 -; CHECK-NEXT: ushll v3.2d, v1.2s, #0 -; CHECK-NEXT: uaddw2 v0.2d, v2.2d, v0.4s -; CHECK-NEXT: uaddw2 v1.2d, v3.2d, v1.4s -; CHECK-NEXT: add v0.2d, v0.2d, v1.2d +; CHECK-NEXT: uaddlp v0.2d, v0.4s +; CHECK-NEXT: uadalp v0.2d, v1.4s ; CHECK-NEXT: addp d0, v0.2d ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret @@ -1281,11 +1245,8 @@ entry: define i64 @add_pair_v4i32_v4i64_sext(<4 x i32> %x, <4 x i32> %y) { ; CHECK-LABEL: add_pair_v4i32_v4i64_sext: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sshll v2.2d, v0.2s, #0 -; CHECK-NEXT: sshll v3.2d, v1.2s, #0 -; CHECK-NEXT: saddw2 v0.2d, v2.2d, v0.4s -; CHECK-NEXT: saddw2 v1.2d, v3.2d, v1.4s -; CHECK-NEXT: add v0.2d, v0.2d, v1.2d +; CHECK-NEXT: saddlp v0.2d, v0.4s +; CHECK-NEXT: sadalp v0.2d, v1.4s ; CHECK-NEXT: addp d0, v0.2d ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret @@ -1333,11 +1294,8 @@ entry: define i32 @add_pair_v8i16_v8i32_zext(<8 x i16> %x, <8 x i16> %y) { ; CHECK-LABEL: add_pair_v8i16_v8i32_zext: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ushll v2.4s, v0.4h, #0 -; CHECK-NEXT: ushll v3.4s, v1.4h, #0 -; CHECK-NEXT: uaddw2 v0.4s, v2.4s, v0.8h -; CHECK-NEXT: uaddw2 v1.4s, v3.4s, v1.8h -; CHECK-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-NEXT: uaddlp v0.4s, v0.8h +; CHECK-NEXT: uadalp v0.4s, v1.8h ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret @@ -1353,11 +1311,8 @@ entry: define i32 @add_pair_v8i16_v8i32_sext(<8 x i16> %x, <8 x i16> %y) { ; CHECK-LABEL: add_pair_v8i16_v8i32_sext: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sshll v2.4s, v0.4h, #0 -; CHECK-NEXT: sshll v3.4s, v1.4h, #0 -; CHECK-NEXT: saddw2 v0.4s, v2.4s, v0.8h -; CHECK-NEXT: saddw2 v1.4s, v3.4s, v1.8h -; CHECK-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-NEXT: saddlp v0.4s, v0.8h +; CHECK-NEXT: sadalp v0.4s, v1.8h ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret @@ -1476,11 +1431,8 @@ define i64 @add_pair_v4i16_v4i64_zext(<4 x i16> %x, <4 x i16> %y) { ; CHECK: // 
%bb.0: // %entry ; CHECK-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-NEXT: ushll v2.2d, v0.2s, #0 -; CHECK-NEXT: ushll v3.2d, v1.2s, #0 -; CHECK-NEXT: uaddw2 v0.2d, v2.2d, v0.4s -; CHECK-NEXT: uaddw2 v1.2d, v3.2d, v1.4s -; CHECK-NEXT: add v0.2d, v0.2d, v1.2d +; CHECK-NEXT: uaddlp v0.2d, v0.4s +; CHECK-NEXT: uadalp v0.2d, v1.4s ; CHECK-NEXT: addp d0, v0.2d ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret @@ -1498,11 +1450,8 @@ define i64 @add_pair_v4i16_v4i64_sext(<4 x i16> %x, <4 x i16> %y) { ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: sshll v0.4s, v0.4h, #0 ; CHECK-NEXT: sshll v1.4s, v1.4h, #0 -; CHECK-NEXT: sshll v2.2d, v0.2s, #0 -; CHECK-NEXT: sshll v3.2d, v1.2s, #0 -; CHECK-NEXT: saddw2 v0.2d, v2.2d, v0.4s -; CHECK-NEXT: saddw2 v1.2d, v3.2d, v1.4s -; CHECK-NEXT: add v0.2d, v0.2d, v1.2d +; CHECK-NEXT: saddlp v0.2d, v0.4s +; CHECK-NEXT: sadalp v0.2d, v1.4s ; CHECK-NEXT: addp d0, v0.2d ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret @@ -1632,11 +1581,8 @@ define i32 @add_pair_v8i8_v8i32_zext(<8 x i8> %x, <8 x i8> %y) { ; CHECK-BASE: // %bb.0: // %entry ; CHECK-BASE-NEXT: ushll v0.8h, v0.8b, #0 ; CHECK-BASE-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-BASE-NEXT: ushll v2.4s, v0.4h, #0 -; CHECK-BASE-NEXT: ushll v3.4s, v1.4h, #0 -; CHECK-BASE-NEXT: uaddw2 v0.4s, v2.4s, v0.8h -; CHECK-BASE-NEXT: uaddw2 v1.4s, v3.4s, v1.8h -; CHECK-BASE-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-BASE-NEXT: uaddlp v0.4s, v0.8h +; CHECK-BASE-NEXT: uadalp v0.4s, v1.8h ; CHECK-BASE-NEXT: addv s0, v0.4s ; CHECK-BASE-NEXT: fmov w0, s0 ; CHECK-BASE-NEXT: ret @@ -1664,11 +1610,8 @@ define i32 @add_pair_v8i8_v8i32_sext(<8 x i8> %x, <8 x i8> %y) { ; CHECK-BASE: // %bb.0: // %entry ; CHECK-BASE-NEXT: sshll v0.8h, v0.8b, #0 ; CHECK-BASE-NEXT: sshll v1.8h, v1.8b, #0 -; CHECK-BASE-NEXT: sshll v2.4s, v0.4h, #0 -; CHECK-BASE-NEXT: sshll v3.4s, v1.4h, #0 -; CHECK-BASE-NEXT: saddw2 v0.4s, v2.4s, v0.8h -; CHECK-BASE-NEXT: saddw2 v1.4s, v3.4s, v1.8h -; CHECK-BASE-NEXT: add v0.4s, v0.4s, v1.4s +; 
CHECK-BASE-NEXT: saddlp v0.4s, v0.8h +; CHECK-BASE-NEXT: sadalp v0.4s, v1.8h ; CHECK-BASE-NEXT: addv s0, v0.4s ; CHECK-BASE-NEXT: fmov w0, s0 ; CHECK-BASE-NEXT: ret @@ -1733,12 +1676,8 @@ entry: define zeroext i16 @add_pair_v16i8_v16i16_zext(<16 x i8> %x, <16 x i8> %y) { ; CHECK-LABEL: add_pair_v16i8_v16i16_zext: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ushll v2.8h, v0.8b, #0 -; CHECK-NEXT: ushll v3.8h, v1.8b, #0 -; CHECK-NEXT: uaddw2 v0.8h, v2.8h, v0.16b -; CHECK-NEXT: uaddw2 v1.8h, v3.8h, v1.16b -; CHECK-NEXT: addv h0, v0.8h -; CHECK-NEXT: addv h1, v1.8h +; CHECK-NEXT: uaddlv h0, v0.16b +; CHECK-NEXT: uaddlv h1, v1.16b ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: fmov w9, s1 ; CHECK-NEXT: add w8, w8, w9 @@ -1756,10 +1695,8 @@ entry: define signext i16 @add_pair_v16i8_v16i16_sext(<16 x i8> %x, <16 x i8> %y) { ; CHECK-LABEL: add_pair_v16i8_v16i16_sext: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sshll v2.8h, v0.8b, #0 -; CHECK-NEXT: sshll v3.8h, v1.8b, #0 -; CHECK-NEXT: saddw2 v0.8h, v2.8h, v0.16b -; CHECK-NEXT: saddw2 v1.8h, v3.8h, v1.16b +; CHECK-NEXT: saddlp v0.8h, v0.16b +; CHECK-NEXT: saddlp v1.8h, v1.16b ; CHECK-NEXT: addv h0, v0.8h ; CHECK-NEXT: addv h1, v1.8h ; CHECK-NEXT: fmov w8, s0 @@ -1982,11 +1919,8 @@ define i64 @add_pair_v4i8_v4i64_zext(<4 x i8> %x, <4 x i8> %y) { ; CHECK-NEXT: bic v1.4h, #255, lsl #8 ; CHECK-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-NEXT: ushll v2.2d, v0.2s, #0 -; CHECK-NEXT: ushll v3.2d, v1.2s, #0 -; CHECK-NEXT: uaddw2 v0.2d, v2.2d, v0.4s -; CHECK-NEXT: uaddw2 v1.2d, v3.2d, v1.4s -; CHECK-NEXT: add v0.2d, v0.2d, v1.2d +; CHECK-NEXT: uaddlp v0.2d, v0.4s +; CHECK-NEXT: uadalp v0.2d, v1.4s ; CHECK-NEXT: addp d0, v0.2d ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret