def AArch64lsr_p : SDNode<"AArch64ISD::SRL_PRED", SDT_AArch64Arith>;
def AArch64mul_p : SDNode<"AArch64ISD::MUL_PRED", SDT_AArch64Arith>;
def AArch64sabd_p : SDNode<"AArch64ISD::ABDS_PRED", SDT_AArch64Arith>;
+def AArch64shadd_p : SDNode<"AArch64ISD::HADDS_PRED", SDT_AArch64Arith>;
+def AArch64srhadd_p : SDNode<"AArch64ISD::RHADDS_PRED", SDT_AArch64Arith>;
def AArch64sdiv_p : SDNode<"AArch64ISD::SDIV_PRED", SDT_AArch64Arith>;
def AArch64smax_p : SDNode<"AArch64ISD::SMAX_PRED", SDT_AArch64Arith>;
def AArch64smin_p : SDNode<"AArch64ISD::SMIN_PRED", SDT_AArch64Arith>;
def AArch64smulh_p : SDNode<"AArch64ISD::MULHS_PRED", SDT_AArch64Arith>;
def AArch64uabd_p : SDNode<"AArch64ISD::ABDU_PRED", SDT_AArch64Arith>;
+def AArch64uhadd_p : SDNode<"AArch64ISD::HADDU_PRED", SDT_AArch64Arith>;
+def AArch64urhadd_p : SDNode<"AArch64ISD::RHADDU_PRED", SDT_AArch64Arith>;
def AArch64udiv_p : SDNode<"AArch64ISD::UDIV_PRED", SDT_AArch64Arith>;
def AArch64umax_p : SDNode<"AArch64ISD::UMAX_PRED", SDT_AArch64Arith>;
def AArch64umin_p : SDNode<"AArch64ISD::UMIN_PRED", SDT_AArch64Arith>;
(AArch64fsub_p (SVEAllActive), node:$op1, (vselect node:$pg, node:$op2, (SVEDup0)))
]>;
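+// The PatFrags below let each halving-add pattern match either the existing
+// SVE2 intrinsic or the corresponding predicated node defined above
+// (presumably produced when generic halving/rounding-add idioms are lowered
+// for scalable vectors), so a single defm covers both forms.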
+def AArch64shadd : PatFrags<(ops node:$pg, node:$op1, node:$op2),
+ [(int_aarch64_sve_shadd node:$pg, node:$op1, node:$op2),
+ (AArch64shadd_p node:$pg, node:$op1, node:$op2)]>;
+def AArch64uhadd : PatFrags<(ops node:$pg, node:$op1, node:$op2),
+ [(int_aarch64_sve_uhadd node:$pg, node:$op1, node:$op2),
+ (AArch64uhadd_p node:$pg, node:$op1, node:$op2)]>;
+def AArch64srhadd : PatFrags<(ops node:$pg, node:$op1, node:$op2),
+ [(int_aarch64_sve_srhadd node:$pg, node:$op1, node:$op2),
+ (AArch64srhadd_p node:$pg, node:$op1, node:$op2)]>;
+def AArch64urhadd : PatFrags<(ops node:$pg, node:$op1, node:$op2),
+ [(int_aarch64_sve_urhadd node:$pg, node:$op1, node:$op2),
+ (AArch64urhadd_p node:$pg, node:$op1, node:$op2)]>;
+
def AArch64saba : PatFrags<(ops node:$op1, node:$op2, node:$op3),
[(int_aarch64_sve_saba node:$op1, node:$op2, node:$op3),
(add node:$op1, (AArch64sabd_p (SVEAllActive), node:$op2, node:$op3))]>;
defm SQDMLSLBT_ZZZ : sve2_int_mla_long<0b00011, "sqdmlslbt", int_aarch64_sve_sqdmlslbt>;
// SVE2 integer halving add/subtract (predicated)
- defm SHADD_ZPmZ : sve2_int_arith_pred<0b100000, "shadd", int_aarch64_sve_shadd>;
- defm UHADD_ZPmZ : sve2_int_arith_pred<0b100010, "uhadd", int_aarch64_sve_uhadd>;
+ defm SHADD_ZPmZ : sve2_int_arith_pred<0b100000, "shadd", AArch64shadd>;
+ defm UHADD_ZPmZ : sve2_int_arith_pred<0b100010, "uhadd", AArch64uhadd>;
defm SHSUB_ZPmZ : sve2_int_arith_pred<0b100100, "shsub", int_aarch64_sve_shsub>;
defm UHSUB_ZPmZ : sve2_int_arith_pred<0b100110, "uhsub", int_aarch64_sve_uhsub>;
- defm SRHADD_ZPmZ : sve2_int_arith_pred<0b101000, "srhadd", int_aarch64_sve_srhadd>;
- defm URHADD_ZPmZ : sve2_int_arith_pred<0b101010, "urhadd", int_aarch64_sve_urhadd>;
+ defm SRHADD_ZPmZ : sve2_int_arith_pred<0b101000, "srhadd", AArch64srhadd>;
+ defm URHADD_ZPmZ : sve2_int_arith_pred<0b101010, "urhadd", AArch64urhadd>;
defm SHSUBR_ZPmZ : sve2_int_arith_pred<0b101100, "shsubr", int_aarch64_sve_shsubr>;
defm UHSUBR_ZPmZ : sve2_int_arith_pred<0b101110, "uhsubr", int_aarch64_sve_uhsubr>;
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple aarch64-none-eabi -mattr=+sve2 -o - | FileCheck %s
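+; The tests below check that the generic IR idiom for a halving add (extend
+; both operands to twice the element width, add, logical shift right by one,
+; truncate back) is selected to the SVE2 SHADD/UHADD instructions, and that
+; the rounding variant (the same with an extra +1 before the shift) becomes
+; SRHADD/URHADD.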
+define <vscale x 2 x i64> @hadds_v2i64(<vscale x 2 x i64> %s0, <vscale x 2 x i64> %s1) {
+; CHECK-LABEL: hadds_v2i64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: shadd z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: ret
+entry:
+ %s0s = sext <vscale x 2 x i64> %s0 to <vscale x 2 x i128>
+ %s1s = sext <vscale x 2 x i64> %s1 to <vscale x 2 x i128>
+ %m = add <vscale x 2 x i128> %s0s, %s1s
+ %s = lshr <vscale x 2 x i128> %m, shufflevector (<vscale x 2 x i128> insertelement (<vscale x 2 x i128> poison, i128 1, i32 0), <vscale x 2 x i128> poison, <vscale x 2 x i32> zeroinitializer)
+ %s2 = trunc <vscale x 2 x i128> %s to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %s2
+}
+
+define <vscale x 2 x i64> @haddu_v2i64(<vscale x 2 x i64> %s0, <vscale x 2 x i64> %s1) {
+; CHECK-LABEL: haddu_v2i64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: uhadd z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: ret
+entry:
+ %s0s = zext <vscale x 2 x i64> %s0 to <vscale x 2 x i128>
+ %s1s = zext <vscale x 2 x i64> %s1 to <vscale x 2 x i128>
+ %m = add <vscale x 2 x i128> %s0s, %s1s
+ %s = lshr <vscale x 2 x i128> %m, shufflevector (<vscale x 2 x i128> insertelement (<vscale x 2 x i128> poison, i128 1, i32 0), <vscale x 2 x i128> poison, <vscale x 2 x i32> zeroinitializer)
+ %s2 = trunc <vscale x 2 x i128> %s to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %s2
+}
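+
+; For reference, the same halving add can also be requested directly through
+; the SVE2 intrinsic that the new AArch64shadd PatFrags matches as well
+; (illustrative sketch only, not part of the autogenerated checks):
+;   %p = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+;   %r = call <vscale x 2 x i64> @llvm.aarch64.sve.shadd.nxv2i64(<vscale x 2 x i1> %p, <vscale x 2 x i64> %s0, <vscale x 2 x i64> %s1)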
+
define <vscale x 2 x i32> @hadds_v2i32(<vscale x 2 x i32> %s0, <vscale x 2 x i32> %s1) {
; CHECK-LABEL: hadds_v2i32:
; CHECK: // %bb.0: // %entry
define <vscale x 4 x i32> @hadds_v4i32(<vscale x 4 x i32> %s0, <vscale x 4 x i32> %s1) {
; CHECK-LABEL: hadds_v4i32:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sunpkhi z2.d, z0.s
-; CHECK-NEXT: sunpklo z0.d, z0.s
-; CHECK-NEXT: sunpkhi z3.d, z1.s
-; CHECK-NEXT: sunpklo z1.d, z1.s
-; CHECK-NEXT: add z0.d, z0.d, z1.d
-; CHECK-NEXT: add z1.d, z2.d, z3.d
-; CHECK-NEXT: lsr z1.d, z1.d, #1
-; CHECK-NEXT: lsr z0.d, z0.d, #1
-; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: shadd z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: ret
entry:
%s0s = sext <vscale x 4 x i32> %s0 to <vscale x 4 x i64>
define <vscale x 4 x i32> @haddu_v4i32(<vscale x 4 x i32> %s0, <vscale x 4 x i32> %s1) {
; CHECK-LABEL: haddu_v4i32:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: uunpkhi z2.d, z0.s
-; CHECK-NEXT: uunpklo z0.d, z0.s
-; CHECK-NEXT: uunpkhi z3.d, z1.s
-; CHECK-NEXT: uunpklo z1.d, z1.s
-; CHECK-NEXT: add z0.d, z0.d, z1.d
-; CHECK-NEXT: add z1.d, z2.d, z3.d
-; CHECK-NEXT: lsr z1.d, z1.d, #1
-; CHECK-NEXT: lsr z0.d, z0.d, #1
-; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: uhadd z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: ret
entry:
%s0s = zext <vscale x 4 x i32> %s0 to <vscale x 4 x i64>
define <vscale x 8 x i16> @hadds_v8i16(<vscale x 8 x i16> %s0, <vscale x 8 x i16> %s1) {
; CHECK-LABEL: hadds_v8i16:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sunpkhi z2.s, z0.h
-; CHECK-NEXT: sunpklo z0.s, z0.h
-; CHECK-NEXT: sunpkhi z3.s, z1.h
-; CHECK-NEXT: sunpklo z1.s, z1.h
-; CHECK-NEXT: add z0.s, z0.s, z1.s
-; CHECK-NEXT: add z1.s, z2.s, z3.s
-; CHECK-NEXT: lsr z1.s, z1.s, #1
-; CHECK-NEXT: lsr z0.s, z0.s, #1
-; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: shadd z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT: ret
entry:
%s0s = sext <vscale x 8 x i16> %s0 to <vscale x 8 x i32>
define <vscale x 8 x i16> @haddu_v8i16(<vscale x 8 x i16> %s0, <vscale x 8 x i16> %s1) {
; CHECK-LABEL: haddu_v8i16:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: uunpkhi z2.s, z0.h
-; CHECK-NEXT: uunpklo z0.s, z0.h
-; CHECK-NEXT: uunpkhi z3.s, z1.h
-; CHECK-NEXT: uunpklo z1.s, z1.h
-; CHECK-NEXT: add z0.s, z0.s, z1.s
-; CHECK-NEXT: add z1.s, z2.s, z3.s
-; CHECK-NEXT: lsr z1.s, z1.s, #1
-; CHECK-NEXT: lsr z0.s, z0.s, #1
-; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: uhadd z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT: ret
entry:
%s0s = zext <vscale x 8 x i16> %s0 to <vscale x 8 x i32>
define <vscale x 16 x i8> @hadds_v16i8(<vscale x 16 x i8> %s0, <vscale x 16 x i8> %s1) {
; CHECK-LABEL: hadds_v16i8:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sunpkhi z2.h, z0.b
-; CHECK-NEXT: sunpklo z0.h, z0.b
-; CHECK-NEXT: sunpkhi z3.h, z1.b
-; CHECK-NEXT: sunpklo z1.h, z1.b
-; CHECK-NEXT: add z0.h, z0.h, z1.h
-; CHECK-NEXT: add z1.h, z2.h, z3.h
-; CHECK-NEXT: lsr z1.h, z1.h, #1
-; CHECK-NEXT: lsr z0.h, z0.h, #1
-; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: shadd z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT: ret
entry:
%s0s = sext <vscale x 16 x i8> %s0 to <vscale x 16 x i16>
define <vscale x 16 x i8> @haddu_v16i8(<vscale x 16 x i8> %s0, <vscale x 16 x i8> %s1) {
; CHECK-LABEL: haddu_v16i8:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: uunpkhi z2.h, z0.b
-; CHECK-NEXT: uunpklo z0.h, z0.b
-; CHECK-NEXT: uunpkhi z3.h, z1.b
-; CHECK-NEXT: uunpklo z1.h, z1.b
-; CHECK-NEXT: add z0.h, z0.h, z1.h
-; CHECK-NEXT: add z1.h, z2.h, z3.h
-; CHECK-NEXT: lsr z1.h, z1.h, #1
-; CHECK-NEXT: lsr z0.h, z0.h, #1
-; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: uhadd z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT: ret
entry:
%s0s = zext <vscale x 16 x i8> %s0 to <vscale x 16 x i16>
ret <vscale x 16 x i8> %s2
}
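+
+; Rounding halving adds: the widened sum gets an extra +1 before the shift;
+; for example, with unsigned i8 elements, uhadd(7, 6) = (7 + 6) >> 1 = 6
+; while urhadd(7, 6) = (7 + 6 + 1) >> 1 = 7.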
+define <vscale x 2 x i64> @rhadds_v2i64(<vscale x 2 x i64> %s0, <vscale x 2 x i64> %s1) {
+; CHECK-LABEL: rhadds_v2i64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: srhadd z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: ret
+entry:
+ %s0s = sext <vscale x 2 x i64> %s0 to <vscale x 2 x i128>
+ %s1s = sext <vscale x 2 x i64> %s1 to <vscale x 2 x i128>
+ %add = add <vscale x 2 x i128> %s0s, %s1s
+ %add2 = add <vscale x 2 x i128> %add, shufflevector (<vscale x 2 x i128> insertelement (<vscale x 2 x i128> poison, i128 1, i32 0), <vscale x 2 x i128> poison, <vscale x 2 x i32> zeroinitializer)
+ %s = lshr <vscale x 2 x i128> %add2, shufflevector (<vscale x 2 x i128> insertelement (<vscale x 2 x i128> poison, i128 1, i32 0), <vscale x 2 x i128> poison, <vscale x 2 x i32> zeroinitializer)
+ %result = trunc <vscale x 2 x i128> %s to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %result
+}
+
+define <vscale x 2 x i64> @rhaddu_v2i64(<vscale x 2 x i64> %s0, <vscale x 2 x i64> %s1) {
+; CHECK-LABEL: rhaddu_v2i64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: urhadd z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: ret
+entry:
+ %s0s = zext <vscale x 2 x i64> %s0 to <vscale x 2 x i128>
+ %s1s = zext <vscale x 2 x i64> %s1 to <vscale x 2 x i128>
+ %add = add <vscale x 2 x i128> %s0s, %s1s
+ %add2 = add <vscale x 2 x i128> %add, shufflevector (<vscale x 2 x i128> insertelement (<vscale x 2 x i128> poison, i128 1, i32 0), <vscale x 2 x i128> poison, <vscale x 2 x i32> zeroinitializer)
+ %s = lshr <vscale x 2 x i128> %add2, shufflevector (<vscale x 2 x i128> insertelement (<vscale x 2 x i128> poison, i128 1, i32 0), <vscale x 2 x i128> poison, <vscale x 2 x i32> zeroinitializer)
+ %result = trunc <vscale x 2 x i128> %s to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %result
+}
+
define <vscale x 2 x i32> @rhadds_v2i32(<vscale x 2 x i32> %s0, <vscale x 2 x i32> %s1) {
; CHECK-LABEL: rhadds_v2i32:
; CHECK: // %bb.0: // %entry
define <vscale x 4 x i32> @rhadds_v4i32(<vscale x 4 x i32> %s0, <vscale x 4 x i32> %s1) {
; CHECK-LABEL: rhadds_v4i32:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z2.d, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: sunpkhi z3.d, z0.s
-; CHECK-NEXT: sunpklo z0.d, z0.s
-; CHECK-NEXT: sunpkhi z4.d, z1.s
-; CHECK-NEXT: sunpklo z1.d, z1.s
-; CHECK-NEXT: eor z0.d, z0.d, z2.d
-; CHECK-NEXT: eor z2.d, z3.d, z2.d
-; CHECK-NEXT: sub z0.d, z1.d, z0.d
-; CHECK-NEXT: sub z1.d, z4.d, z2.d
-; CHECK-NEXT: lsr z0.d, z0.d, #1
-; CHECK-NEXT: lsr z1.d, z1.d, #1
-; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: srhadd z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: ret
entry:
%s0s = sext <vscale x 4 x i32> %s0 to <vscale x 4 x i64>
define <vscale x 4 x i32> @rhaddu_v4i32(<vscale x 4 x i32> %s0, <vscale x 4 x i32> %s1) {
; CHECK-LABEL: rhaddu_v4i32:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z2.d, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: uunpkhi z3.d, z0.s
-; CHECK-NEXT: uunpklo z0.d, z0.s
-; CHECK-NEXT: uunpkhi z4.d, z1.s
-; CHECK-NEXT: uunpklo z1.d, z1.s
-; CHECK-NEXT: eor z0.d, z0.d, z2.d
-; CHECK-NEXT: eor z2.d, z3.d, z2.d
-; CHECK-NEXT: sub z0.d, z1.d, z0.d
-; CHECK-NEXT: sub z1.d, z4.d, z2.d
-; CHECK-NEXT: lsr z0.d, z0.d, #1
-; CHECK-NEXT: lsr z1.d, z1.d, #1
-; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: urhadd z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: ret
entry:
%s0s = zext <vscale x 4 x i32> %s0 to <vscale x 4 x i64>
define <vscale x 8 x i16> @rhadds_v8i16(<vscale x 8 x i16> %s0, <vscale x 8 x i16> %s1) {
; CHECK-LABEL: rhadds_v8i16:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z2.s, #-1 // =0xffffffff
-; CHECK-NEXT: sunpkhi z3.s, z0.h
-; CHECK-NEXT: sunpklo z0.s, z0.h
-; CHECK-NEXT: sunpkhi z4.s, z1.h
-; CHECK-NEXT: sunpklo z1.s, z1.h
-; CHECK-NEXT: eor z0.d, z0.d, z2.d
-; CHECK-NEXT: eor z2.d, z3.d, z2.d
-; CHECK-NEXT: sub z0.s, z1.s, z0.s
-; CHECK-NEXT: sub z1.s, z4.s, z2.s
-; CHECK-NEXT: lsr z0.s, z0.s, #1
-; CHECK-NEXT: lsr z1.s, z1.s, #1
-; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: srhadd z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT: ret
entry:
%s0s = sext <vscale x 8 x i16> %s0 to <vscale x 8 x i32>
define <vscale x 8 x i16> @rhaddu_v8i16(<vscale x 8 x i16> %s0, <vscale x 8 x i16> %s1) {
; CHECK-LABEL: rhaddu_v8i16:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z2.s, #-1 // =0xffffffff
-; CHECK-NEXT: uunpkhi z3.s, z0.h
-; CHECK-NEXT: uunpklo z0.s, z0.h
-; CHECK-NEXT: uunpkhi z4.s, z1.h
-; CHECK-NEXT: uunpklo z1.s, z1.h
-; CHECK-NEXT: eor z0.d, z0.d, z2.d
-; CHECK-NEXT: eor z2.d, z3.d, z2.d
-; CHECK-NEXT: sub z0.s, z1.s, z0.s
-; CHECK-NEXT: sub z1.s, z4.s, z2.s
-; CHECK-NEXT: lsr z0.s, z0.s, #1
-; CHECK-NEXT: lsr z1.s, z1.s, #1
-; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: urhadd z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT: ret
entry:
%s0s = zext <vscale x 8 x i16> %s0 to <vscale x 8 x i32>
define <vscale x 16 x i8> @rhadds_v16i8(<vscale x 16 x i8> %s0, <vscale x 16 x i8> %s1) {
; CHECK-LABEL: rhadds_v16i8:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z2.h, #-1 // =0xffff
-; CHECK-NEXT: sunpkhi z3.h, z0.b
-; CHECK-NEXT: sunpklo z0.h, z0.b
-; CHECK-NEXT: sunpkhi z4.h, z1.b
-; CHECK-NEXT: sunpklo z1.h, z1.b
-; CHECK-NEXT: eor z0.d, z0.d, z2.d
-; CHECK-NEXT: eor z2.d, z3.d, z2.d
-; CHECK-NEXT: sub z0.h, z1.h, z0.h
-; CHECK-NEXT: sub z1.h, z4.h, z2.h
-; CHECK-NEXT: lsr z0.h, z0.h, #1
-; CHECK-NEXT: lsr z1.h, z1.h, #1
-; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: srhadd z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT: ret
entry:
%s0s = sext <vscale x 16 x i8> %s0 to <vscale x 16 x i16>
define <vscale x 16 x i8> @rhaddu_v16i8(<vscale x 16 x i8> %s0, <vscale x 16 x i8> %s1) {
; CHECK-LABEL: rhaddu_v16i8:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z2.h, #-1 // =0xffff
-; CHECK-NEXT: uunpkhi z3.h, z0.b
-; CHECK-NEXT: uunpklo z0.h, z0.b
-; CHECK-NEXT: uunpkhi z4.h, z1.b
-; CHECK-NEXT: uunpklo z1.h, z1.b
-; CHECK-NEXT: eor z0.d, z0.d, z2.d
-; CHECK-NEXT: eor z2.d, z3.d, z2.d
-; CHECK-NEXT: sub z0.h, z1.h, z0.h
-; CHECK-NEXT: sub z1.h, z4.h, z2.h
-; CHECK-NEXT: lsr z0.h, z0.h, #1
-; CHECK-NEXT: lsr z1.h, z1.h, #1
-; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: urhadd z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT: ret
entry:
%s0s = zext <vscale x 16 x i8> %s0 to <vscale x 16 x i16>