From b8b075d8d744a84de60171c6395fc7c6190d21ca Mon Sep 17 00:00:00 2001
From: Bradley Smith
Date: Tue, 13 Apr 2021 15:19:59 +0100
Subject: [PATCH] [AArch64][SVE] Lower MULHU/MULHS nodes to umulh/smulh
 instructions

Mark MULHS/MULHU nodes as legal for both scalable and fixed SVE types,
and lower them to the appropriate SVE instructions.

Additionally, now that the MULH nodes are legal, integer divides can be
expanded into a more performant code sequence.

Differential Revision: https://reviews.llvm.org/D100487
---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp    |   18 +-
 llvm/lib/Target/AArch64/AArch64ISelLowering.h      |    2 +
 llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td     |    8 +-
 llvm/test/CodeGen/AArch64/sve-expand-div.ll        |  144 +++
 .../CodeGen/AArch64/sve-fixed-length-int-div.ll    |    3 +-
 .../CodeGen/AArch64/sve-fixed-length-int-mulh.ll   | 1006 ++++++++++++++++++++
 llvm/test/CodeGen/AArch64/sve-int-arith-imm.ll     |    6 +-
 llvm/test/CodeGen/AArch64/sve-int-mulh-pred.ll     |  140 +++
 llvm/test/CodeGen/AArch64/sve2-int-mulh.ll         |  132 +++
 9 files changed, 1452 insertions(+), 7 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/sve-expand-div.ll
 create mode 100644 llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll
 create mode 100644 llvm/test/CodeGen/AArch64/sve-int-mulh-pred.ll
 create mode 100644 llvm/test/CodeGen/AArch64/sve2-int-mulh.ll

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 5b94a77..22e377e 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1126,6 +1126,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
       setOperationAction(ISD::MGATHER, VT, Custom);
       setOperationAction(ISD::MSCATTER, VT, Custom);
       setOperationAction(ISD::MUL, VT, Custom);
+      setOperationAction(ISD::MULHS, VT, Custom);
+      setOperationAction(ISD::MULHU, VT, Custom);
       setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
       setOperationAction(ISD::SELECT, VT, Custom);
       setOperationAction(ISD::SETCC, VT, Custom);
@@ -1149,8 +1151,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
       setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
       setOperationAction(ISD::STEP_VECTOR, VT, Custom);

-      setOperationAction(ISD::MULHU, VT, Expand);
-      setOperationAction(ISD::MULHS, VT, Expand);
       setOperationAction(ISD::UMUL_LOHI, VT, Expand);
       setOperationAction(ISD::SMUL_LOHI, VT, Expand);
     }
@@ -1259,6 +1259,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
     setOperationAction(ISD::MUL, MVT::v1i64, Custom);
     setOperationAction(ISD::MUL, MVT::v2i64, Custom);
+    setOperationAction(ISD::MULHS, MVT::v1i64, Custom);
+    setOperationAction(ISD::MULHS, MVT::v2i64, Custom);
+    setOperationAction(ISD::MULHU, MVT::v1i64, Custom);
+    setOperationAction(ISD::MULHU, MVT::v2i64, Custom);
     setOperationAction(ISD::SDIV, MVT::v8i8, Custom);
     setOperationAction(ISD::SDIV, MVT::v16i8, Custom);
     setOperationAction(ISD::SDIV, MVT::v4i16, Custom);
@@ -1453,6 +1457,8 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
   setOperationAction(ISD::FTRUNC, VT, Custom);
   setOperationAction(ISD::LOAD, VT, Custom);
   setOperationAction(ISD::MUL, VT, Custom);
+  setOperationAction(ISD::MULHS, VT, Custom);
+  setOperationAction(ISD::MULHU, VT, Custom);
   setOperationAction(ISD::OR, VT, Custom);
   setOperationAction(ISD::SDIV, VT, Custom);
   setOperationAction(ISD::SELECT, VT, Custom);
@@ -1799,6 +1805,8 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
     MAKE_CASE(AArch64ISD::TLSDESC_CALLSEQ)
     MAKE_CASE(AArch64ISD::ADD_PRED)
     MAKE_CASE(AArch64ISD::MUL_PRED)
+    MAKE_CASE(AArch64ISD::MULHS_PRED)
+    MAKE_CASE(AArch64ISD::MULHU_PRED)
     MAKE_CASE(AArch64ISD::SDIV_PRED)
     MAKE_CASE(AArch64ISD::SHL_PRED)
     MAKE_CASE(AArch64ISD::SMAX_PRED)
@@ -4519,6 +4527,12 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
     return LowerSET_ROUNDING(Op, DAG);
   case ISD::MUL:
     return LowerMUL(Op, DAG);
+  case ISD::MULHS:
+    return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHS_PRED,
+                               /*OverrideNEON=*/true);
+  case ISD::MULHU:
+    return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHU_PRED,
+                               /*OverrideNEON=*/true);
   case ISD::INTRINSIC_WO_CHAIN:
     return LowerINTRINSIC_WO_CHAIN(Op, DAG);
   case ISD::STORE:
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 63df223..a5530c8 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -89,6 +89,8 @@ enum NodeType : unsigned {
   FMUL_PRED,
   FSUB_PRED,
   MUL_PRED,
+  MULHS_PRED,
+  MULHU_PRED,
   SDIV_PRED,
   SHL_PRED,
   SMAX_PRED,
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 1bd9b1f..c2d4644 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -192,10 +192,12 @@ def AArch64mul_p : SDNode<"AArch64ISD::MUL_PRED", SDT_AArch64Arith>;
 def AArch64sdiv_p : SDNode<"AArch64ISD::SDIV_PRED", SDT_AArch64Arith>;
 def AArch64smax_p : SDNode<"AArch64ISD::SMAX_PRED", SDT_AArch64Arith>;
 def AArch64smin_p : SDNode<"AArch64ISD::SMIN_PRED", SDT_AArch64Arith>;
+def AArch64smulh_p : SDNode<"AArch64ISD::MULHS_PRED", SDT_AArch64Arith>;
 def AArch64sub_p : SDNode<"AArch64ISD::SUB_PRED", SDT_AArch64Arith>;
 def AArch64udiv_p : SDNode<"AArch64ISD::UDIV_PRED", SDT_AArch64Arith>;
 def AArch64umax_p : SDNode<"AArch64ISD::UMAX_PRED", SDT_AArch64Arith>;
 def AArch64umin_p : SDNode<"AArch64ISD::UMIN_PRED", SDT_AArch64Arith>;
+def AArch64umulh_p : SDNode<"AArch64ISD::MULHU_PRED", SDT_AArch64Arith>;

 def SDT_AArch64IntExtend : SDTypeProfile<1, 4, [
   SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisVT<3, OtherVT>, SDTCisVec<4>,
@@ -348,6 +350,8 @@ let Predicates = [HasSVE] in {
   defm UMULH_ZPmZ : sve_int_bin_pred_arit_2<0b011, "umulh", "UMULH_ZPZZ", int_aarch64_sve_umulh, DestructiveBinaryComm>;

   defm MUL_ZPZZ   : sve_int_bin_pred_bhsd<AArch64mul_p>;
+  defm SMULH_ZPZZ : sve_int_bin_pred_bhsd<AArch64smulh_p>;
+  defm UMULH_ZPZZ : sve_int_bin_pred_bhsd<AArch64umulh_p>;

   defm SDIV_ZPmZ : sve_int_bin_pred_arit_2_div<0b100, "sdiv", "SDIV_ZPZZ", int_aarch64_sve_sdiv, DestructiveBinaryCommWithRev, "SDIVR_ZPmZ">;
   defm UDIV_ZPmZ : sve_int_bin_pred_arit_2_div<0b101, "udiv", "UDIV_ZPZZ", int_aarch64_sve_udiv, DestructiveBinaryCommWithRev, "UDIVR_ZPmZ">;
@@ -2385,8 +2389,8 @@ let Predicates = [HasSVE2] in {

   // SVE2 integer multiply vectors (unpredicated)
   defm MUL_ZZZ   : sve2_int_mul<0b000, "mul", null_frag, AArch64mul_p>;
-  defm SMULH_ZZZ : sve2_int_mul<0b010, "smulh", null_frag>;
-  defm UMULH_ZZZ : sve2_int_mul<0b011, "umulh", null_frag>;
+  defm SMULH_ZZZ : sve2_int_mul<0b010, "smulh", null_frag, AArch64smulh_p>;
+  defm UMULH_ZZZ : sve2_int_mul<0b011, "umulh", null_frag, AArch64umulh_p>;
   defm PMUL_ZZZ  : sve2_int_mul_single<0b001, "pmul", int_aarch64_sve_pmul>;

   // Add patterns for unpredicated version of smulh and umulh.
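
As a rough scalar sketch (not part of the patch, helper names below are purely
illustrative): with MULHS/MULHU legal, a divide by the constant 3 can be
rewritten as a multiply-high by a magic constant plus a cheap fix-up, which is
the sequence the new sve-expand-div.ll test below checks. The magic values
0x55555556 (signed) and 0xAAAAAAAB (unsigned) are the same constants the test
expects to be materialised with mov/movk; the 8-, 16- and 64-bit cases use the
analogous constants for those widths.

#include <cassert>
#include <cstdint>

// Unsigned x/3: take the high 32 bits of x * 0xAAAAAAAB (what UMULH
// produces), then shift right by 1 (the LSR #1 in the test).
static uint32_t udiv3(uint32_t x) {
  uint32_t hi = static_cast<uint32_t>((static_cast<uint64_t>(x) * 0xAAAAAAABu) >> 32);
  return hi >> 1;
}

// Signed x/3: take the high 32 bits of x * 0x55555556 (what SMULH
// produces), then add that value's sign bit (the LSR #31 + ADD in the test).
static int32_t sdiv3(int32_t x) {
  int32_t hi = static_cast<int32_t>((static_cast<int64_t>(x) * 0x55555556LL) >> 32);
  return hi + static_cast<int32_t>(static_cast<uint32_t>(hi) >> 31);
}

int main() {
  for (uint32_t x = 0; x < 100000; ++x)
    assert(udiv3(x) == x / 3);
  for (int32_t x = -100000; x < 100000; ++x)
    assert(sdiv3(x) == x / 3);
  return 0;
}
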
diff --git a/llvm/test/CodeGen/AArch64/sve-expand-div.ll b/llvm/test/CodeGen/AArch64/sve-expand-div.ll new file mode 100644 index 0000000..185054a --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-expand-div.ll @@ -0,0 +1,144 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu < %s | FileCheck %s + +; Check that expensive divides are expanded into a more performant sequence + +; +; SDIV +; + +define @sdiv_i8( %a) #0 { +; CHECK-LABEL: sdiv_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z1.b, #86 // =0x56 +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: smulh z0.b, p0/m, z0.b, z1.b +; CHECK-NEXT: lsr z1.b, z0.b, #7 +; CHECK-NEXT: mov z2.b, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z1.d, z1.d, z2.d +; CHECK-NEXT: add z0.b, z0.b, z1.b +; CHECK-NEXT: ret + %div = sdiv %a, shufflevector ( insertelement ( undef, i8 3, i32 0), undef, zeroinitializer) + ret %div +} + +define @sdiv_i16( %a) #0 { +; CHECK-LABEL: sdiv_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #21846 +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: mov z1.h, w8 +; CHECK-NEXT: smulh z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: lsr z1.h, z0.h, #15 +; CHECK-NEXT: mov z2.h, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z1.d, z1.d, z2.d +; CHECK-NEXT: add z0.h, z0.h, z1.h +; CHECK-NEXT: ret + %div = sdiv %a, shufflevector ( insertelement ( undef, i16 3, i32 0), undef, zeroinitializer) + ret %div +} + +define @sdiv_i32( %a) #0 { +; CHECK-LABEL: sdiv_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #21846 +; CHECK-NEXT: movk w8, #21845, lsl #16 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: mov z1.s, w8 +; CHECK-NEXT: smulh z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: lsr z1.s, z0.s, #31 +; CHECK-NEXT: mov z2.s, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z1.d, z1.d, z2.d +; CHECK-NEXT: add z0.s, z0.s, z1.s +; CHECK-NEXT: ret + %div = sdiv %a, shufflevector ( insertelement ( undef, i32 3, i32 0), undef, zeroinitializer) + ret %div +} + +define @sdiv_i64( %a) #0 { +; CHECK-LABEL: sdiv_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #6148914691236517205 +; CHECK-NEXT: movk x8, #21846 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: mov z1.d, x8 +; CHECK-NEXT: smulh z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: lsr z1.d, z0.d, #63 +; CHECK-NEXT: mov z2.d, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z1.d, z1.d, z2.d +; CHECK-NEXT: add z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %div = sdiv %a, shufflevector ( insertelement ( undef, i64 3, i32 0), undef, zeroinitializer) + ret %div +} + +; +; UDIV +; + +define @udiv_i8( %a) #0 { +; CHECK-LABEL: udiv_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z1.b, #-85 // =0xffffffffffffffab +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: mov z2.b, #1 // =0x1 +; CHECK-NEXT: umulh z1.b, p0/m, z1.b, z0.b +; CHECK-NEXT: lsr z1.b, z1.b, #1 +; CHECK-NEXT: cmpeq p0.b, p0/z, z2.b, #3 +; CHECK-NEXT: sel z0.b, p0, z0.b, z1.b +; CHECK-NEXT: ret + %div = udiv %a, shufflevector ( insertelement ( undef, i8 3, i32 0), undef, zeroinitializer) + ret %div +} + +define @udiv_i16( %a) #0 { +; CHECK-LABEL: udiv_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #-21845 +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: mov z2.h, w8 +; CHECK-NEXT: mov z1.h, #1 // =0x1 +; CHECK-NEXT: umulh z2.h, p0/m, z2.h, z0.h +; CHECK-NEXT: lsr z2.h, z2.h, #1 +; CHECK-NEXT: cmpeq p0.h, p0/z, z1.h, #3 +; CHECK-NEXT: sel z0.h, p0, z0.h, z2.h +; CHECK-NEXT: ret + %div = udiv %a, shufflevector ( insertelement ( undef, i16 3, i32 0), undef, zeroinitializer) + ret %div +} + +define @udiv_i32( %a) #0 { +; CHECK-LABEL: udiv_i32: +; CHECK: // %bb.0: +; 
CHECK-NEXT: mov w8, #43691 +; CHECK-NEXT: movk w8, #43690, lsl #16 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: mov z2.s, w8 +; CHECK-NEXT: mov z1.s, #3 // =0x3 +; CHECK-NEXT: umulh z2.s, p0/m, z2.s, z0.s +; CHECK-NEXT: lsr z2.s, z2.s, #1 +; CHECK-NEXT: cmpeq p0.s, p0/z, z1.s, #1 +; CHECK-NEXT: sel z0.s, p0, z0.s, z2.s +; CHECK-NEXT: ret + %div = udiv %a, shufflevector ( insertelement ( undef, i32 3, i32 0), undef, zeroinitializer) + ret %div +} + +define @udiv_i64( %a) #0 { +; CHECK-LABEL: udiv_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #-6148914691236517206 +; CHECK-NEXT: movk x8, #43691 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: mov z2.d, x8 +; CHECK-NEXT: mov z1.d, #3 // =0x3 +; CHECK-NEXT: umulh z2.d, p0/m, z2.d, z0.d +; CHECK-NEXT: lsr z2.d, z2.d, #1 +; CHECK-NEXT: cmpeq p0.d, p0/z, z1.d, #1 +; CHECK-NEXT: sel z0.d, p0, z0.d, z2.d +; CHECK-NEXT: ret + %div = udiv %a, shufflevector ( insertelement ( undef, i64 3, i32 0), undef, zeroinitializer) + ret %div +} + +attributes #0 = { "target-features"="+sve" } diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll index b702282..1fe10c2 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll @@ -970,7 +970,7 @@ define void @udiv_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 { ; This used to crash because isUnaryPredicate and BuildUDIV don't know how ; a SPLAT_VECTOR of fixed vector type should be handled. -define void @udiv_constantsplat_v8i32(<8 x i32>* %a) #0 { +define void @udiv_constantsplat_v8i32(<8 x i32>* %a) #1 { ; CHECK-LABEL: udiv_constantsplat_v8i32: ; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),8)]] ; CHECK-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] @@ -985,3 +985,4 @@ define void @udiv_constantsplat_v8i32(<8 x i32>* %a) #0 { } attributes #0 = { "target-features"="+sve" } +attributes #1 = { "target-features"="+sve" minsize } diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll new file mode 100644 index 0000000..48bae1b --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll @@ -0,0 +1,1006 @@ +; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE +; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_GE_256 +; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_GE_256 +; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_256 +; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_256 +; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_256 +; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_256 +; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512,VBITS_GE_256 +; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512,VBITS_GE_256 +; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512,VBITS_GE_256 +; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s 
-D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512,VBITS_GE_256 +; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512,VBITS_GE_256 +; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512,VBITS_GE_256 +; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512,VBITS_GE_256 +; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512,VBITS_GE_256 +; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK,VBITS_GE_2048,VBITS_GE_1024,VBITS_GE_512,VBITS_GE_256 + +; VBYTES represents the useful byte size of a vector register from the code +; generator's point of view. It is clamped to power-of-2 values because +; only power-of-2 vector lengths are considered legal, regardless of the +; user specified vector length. + +; This test only tests the legal types for a given vector width, as mulh nodes +; do not get generated for non-legal types. + +target triple = "aarch64-unknown-linux-gnu" + +; Don't use SVE when its registers are no bigger than NEON. +; NO_SVE-NOT: ptrue + +; +; SMULH +; + +; Don't use SVE for 64-bit vectors. +define <8 x i8> @smulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { +; CHECK-LABEL: smulh_v8i8: +; CHECK: smull v0.8h, v0.8b, v1.8b +; CHECK: ushr v1.8h, v0.8h, #8 +; CHECK: umov w8, v1.h[0] +; CHECK: fmov s0, w8 +; CHECK: umov w8, v1.h[1] +; CHECK: mov v0.b[1], w8 +; CHECK: umov w8, v1.h[2] +; CHECK: mov v0.b[2], w8 +; CHECK: umov w8, v1.h[3] +; CHECK: mov v0.b[3], w8 +; CHECK: ret + %insert = insertelement <8 x i16> undef, i16 8, i64 0 + %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer + %1 = sext <8 x i8> %op1 to <8 x i16> + %2 = sext <8 x i8> %op2 to <8 x i16> + %mul = mul <8 x i16> %1, %2 + %shr = lshr <8 x i16> %mul, %splat + %res = trunc <8 x i16> %shr to <8 x i8> + ret <8 x i8> %res +} + +; Don't use SVE for 128-bit vectors. 
+define <16 x i8> @smulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { +; CHECK-LABEL: smulh_v16i8: +; CHECK: smull2 v2.8h, v0.16b, v1.16b +; CHECK: smull v0.8h, v0.8b, v1.8b +; CHECK: uzp2 v0.16b, v0.16b, v2.16b +; CHECK: ret + %insert = insertelement <16 x i16> undef, i16 8, i64 0 + %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer + %1 = sext <16 x i8> %op1 to <16 x i16> + %2 = sext <16 x i8> %op2 to <16 x i16> + %mul = mul <16 x i16> %1, %2 + %shr = lshr <16 x i16> %mul, %splat + %res = trunc <16 x i16> %shr to <16 x i8> + ret <16 x i8> %res +} + +define void @smulh_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { +; CHECK-LABEL: smulh_v32i8: +; VBITS_GE_256: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,32)]] +; VBITS_GE_256-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0] +; VBITS_GE_256-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1] +; VBITS_GE_256: smulh [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b +; VBITS_GE_256: st1b { [[RES]].b }, [[PG]], [x0] +; VBITS_GE_256: ret + %op1 = load <32 x i8>, <32 x i8>* %a + %op2 = load <32 x i8>, <32 x i8>* %b + %insert = insertelement <32 x i16> undef, i16 8, i64 0 + %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer + %1 = sext <32 x i8> %op1 to <32 x i16> + %2 = sext <32 x i8> %op2 to <32 x i16> + %mul = mul <32 x i16> %1, %2 + %shr = lshr <32 x i16> %mul, %splat + %res = trunc <32 x i16> %shr to <32 x i8> + store <32 x i8> %res, <32 x i8>* %a + ret void +} + +define void @smulh_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 { +; CHECK-LABEL: smulh_v64i8: +; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,64)]] +; VBITS_GE_512-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0] +; VBITS_GE_512-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1] +; VBITS_GE_512: smulh [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b +; VBITS_GE_512: st1b { [[RES]].b }, [[PG]], [x0] +; VBITS_GE_512: ret + %op1 = load <64 x i8>, <64 x i8>* %a + %op2 = load <64 x i8>, <64 x i8>* %b + %insert = insertelement <64 x i16> undef, i16 8, i64 0 + %splat = shufflevector <64 x i16> %insert, <64 x i16> undef, <64 x i32> zeroinitializer + %1 = sext <64 x i8> %op1 to <64 x i16> + %2 = sext <64 x i8> %op2 to <64 x i16> + %mul = mul <64 x i16> %1, %2 + %shr = lshr <64 x i16> %mul, %splat + %res = trunc <64 x i16> %shr to <64 x i8> + store <64 x i8> %res, <64 x i8>* %a + ret void +} + +define void @smulh_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 { +; CHECK-LABEL: smulh_v128i8: +; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,128)]] +; VBITS_GE_1024-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0] +; VBITS_GE_1024-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1] +; VBITS_GE_1024: smulh [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b +; VBITS_GE_1024: st1b { [[RES]].b }, [[PG]], [x0] +; VBITS_GE_1024: ret + %op1 = load <128 x i8>, <128 x i8>* %a + %op2 = load <128 x i8>, <128 x i8>* %b + %insert = insertelement <128 x i16> undef, i16 8, i64 0 + %splat = shufflevector <128 x i16> %insert, <128 x i16> undef, <128 x i32> zeroinitializer + %1 = sext <128 x i8> %op1 to <128 x i16> + %2 = sext <128 x i8> %op2 to <128 x i16> + %mul = mul <128 x i16> %1, %2 + %shr = lshr <128 x i16> %mul, %splat + %res = trunc <128 x i16> %shr to <128 x i8> + store <128 x i8> %res, <128 x i8>* %a + ret void +} + +define void @smulh_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 { +; CHECK-LABEL: smulh_v256i8: +; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,256)]] +; VBITS_GE_2048-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0] +; 
VBITS_GE_2048-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1] +; VBITS_GE_2048: smulh [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b +; VBITS_GE_2048: st1b { [[RES]].b }, [[PG]], [x0] +; VBITS_GE_2048: ret + %op1 = load <256 x i8>, <256 x i8>* %a + %op2 = load <256 x i8>, <256 x i8>* %b + %insert = insertelement <256 x i16> undef, i16 8, i64 0 + %splat = shufflevector <256 x i16> %insert, <256 x i16> undef, <256 x i32> zeroinitializer + %1 = sext <256 x i8> %op1 to <256 x i16> + %2 = sext <256 x i8> %op2 to <256 x i16> + %mul = mul <256 x i16> %1, %2 + %shr = lshr <256 x i16> %mul, %splat + %res = trunc <256 x i16> %shr to <256 x i8> + store <256 x i8> %res, <256 x i8>* %a + ret void +} + +; Don't use SVE for 64-bit vectors. +define <4 x i16> @smulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { +; CHECK-LABEL: smulh_v4i16: +; CHECK: smull v0.4s, v0.4h, v1.4h +; CHECK: ushr v0.4s, v0.4s, #16 +; CHECK: mov w8, v0.s[1] +; CHECK: mov w9, v0.s[2] +; CHECK: mov w10, v0.s[3] +; CHECK: mov v0.h[1], w8 +; CHECK: mov v0.h[2], w9 +; CHECK: mov v0.h[3], w10 +; CHECK: ret + %insert = insertelement <4 x i32> undef, i32 16, i64 0 + %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer + %1 = sext <4 x i16> %op1 to <4 x i32> + %2 = sext <4 x i16> %op2 to <4 x i32> + %mul = mul <4 x i32> %1, %2 + %shr = lshr <4 x i32> %mul, %splat + %res = trunc <4 x i32> %shr to <4 x i16> + ret <4 x i16> %res +} + +; Don't use SVE for 128-bit vectors. +define <8 x i16> @smulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { +; CHECK-LABEL: smulh_v8i16: +; CHECK: smull2 v2.4s, v0.8h, v1.8h +; CHECK: smull v0.4s, v0.4h, v1.4h +; CHECK: uzp2 v0.8h, v0.8h, v2.8h +; CHECK: ret + %insert = insertelement <8 x i32> undef, i32 16, i64 0 + %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer + %1 = sext <8 x i16> %op1 to <8 x i32> + %2 = sext <8 x i16> %op2 to <8 x i32> + %mul = mul <8 x i32> %1, %2 + %shr = lshr <8 x i32> %mul, %splat + %res = trunc <8 x i32> %shr to <8 x i16> + ret <8 x i16> %res +} + +define void @smulh_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 { +; CHECK-LABEL: smulh_v16i16: +; VBITS_GE_256: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,16)]] +; VBITS_GE_256-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] +; VBITS_GE_256-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] +; VBITS_GE_256: smulh [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h +; VBITS_GE_256: st1h { [[RES]].h }, [[PG]], [x0] +; VBITS_GE_256: ret + %op1 = load <16 x i16>, <16 x i16>* %a + %op2 = load <16 x i16>, <16 x i16>* %b + %insert = insertelement <16 x i32> undef, i32 16, i64 0 + %splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer + %1 = sext <16 x i16> %op1 to <16 x i32> + %2 = sext <16 x i16> %op2 to <16 x i32> + %mul = mul <16 x i32> %1, %2 + %shr = lshr <16 x i32> %mul, %splat + %res = trunc <16 x i32> %shr to <16 x i16> + store <16 x i16> %res, <16 x i16>* %a + ret void +} + +define void @smulh_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 { +; CHECK-LABEL: smulh_v32i16: +; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,32)]] +; VBITS_GE_512-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] +; VBITS_GE_512-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] +; VBITS_GE_512: smulh [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h +; VBITS_GE_512: st1h { [[RES]].h }, [[PG]], [x0] +; VBITS_GE_512: ret + %op1 = load <32 x i16>, <32 x i16>* %a + %op2 = load <32 x i16>, <32 x i16>* %b + %insert = insertelement <32 x i32> undef, i32 16, i64 0 + %splat = 
shufflevector <32 x i32> %insert, <32 x i32> undef, <32 x i32> zeroinitializer + %1 = sext <32 x i16> %op1 to <32 x i32> + %2 = sext <32 x i16> %op2 to <32 x i32> + %mul = mul <32 x i32> %1, %2 + %shr = lshr <32 x i32> %mul, %splat + %res = trunc <32 x i32> %shr to <32 x i16> + store <32 x i16> %res, <32 x i16>* %a + ret void +} + +define void @smulh_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 { +; CHECK-LABEL: smulh_v64i16: +; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,64)]] +; VBITS_GE_1024-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] +; VBITS_GE_1024-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] +; VBITS_GE_1024: smulh [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h +; VBITS_GE_1024: st1h { [[RES]].h }, [[PG]], [x0] +; VBITS_GE_1024: ret + %op1 = load <64 x i16>, <64 x i16>* %a + %op2 = load <64 x i16>, <64 x i16>* %b + %insert = insertelement <64 x i32> undef, i32 16, i64 0 + %splat = shufflevector <64 x i32> %insert, <64 x i32> undef, <64 x i32> zeroinitializer + %1 = sext <64 x i16> %op1 to <64 x i32> + %2 = sext <64 x i16> %op2 to <64 x i32> + %mul = mul <64 x i32> %1, %2 + %shr = lshr <64 x i32> %mul, %splat + %res = trunc <64 x i32> %shr to <64 x i16> + store <64 x i16> %res, <64 x i16>* %a + ret void +} + +define void @smulh_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 { +; CHECK-LABEL: smulh_v128i16: +; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,128)]] +; VBITS_GE_2048-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] +; VBITS_GE_2048-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] +; VBITS_GE_2048: smulh [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h +; VBITS_GE_2048: st1h { [[RES]].h }, [[PG]], [x0] +; VBITS_GE_2048: ret + %op1 = load <128 x i16>, <128 x i16>* %a + %op2 = load <128 x i16>, <128 x i16>* %b + %insert = insertelement <128 x i32> undef, i32 16, i64 0 + %splat = shufflevector <128 x i32> %insert, <128 x i32> undef, <128 x i32> zeroinitializer + %1 = sext <128 x i16> %op1 to <128 x i32> + %2 = sext <128 x i16> %op2 to <128 x i32> + %mul = mul <128 x i32> %1, %2 + %shr = lshr <128 x i32> %mul, %splat + %res = trunc <128 x i32> %shr to <128 x i16> + store <128 x i16> %res, <128 x i16>* %a + ret void +} + +; Vector i64 multiplications are not legal for NEON so use SVE when available. +define <2 x i32> @smulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 { +; CHECK-LABEL: smulh_v2i32: +; CHECK: sshll v0.2d, v0.2s, #0 +; CHECK: sshll v1.2d, v1.2s, #0 +; CHECK: ptrue p0.d, vl2 +; CHECK: mul z0.d, p0/m, z0.d, z1.d +; CHECK: shrn v0.2s, v0.2d, #32 +; CHECK: ret + %insert = insertelement <2 x i64> undef, i64 32, i64 0 + %splat = shufflevector <2 x i64> %insert, <2 x i64> undef, <2 x i32> zeroinitializer + %1 = sext <2 x i32> %op1 to <2 x i64> + %2 = sext <2 x i32> %op2 to <2 x i64> + %mul = mul <2 x i64> %1, %2 + %shr = lshr <2 x i64> %mul, %splat + %res = trunc <2 x i64> %shr to <2 x i32> + ret <2 x i32> %res +} + +; Don't use SVE for 128-bit vectors. 
+define <4 x i32> @smulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 { +; CHECK-LABEL: smulh_v4i32: +; CHECK: smull2 v2.2d, v0.4s, v1.4s +; CHECK: smull v0.2d, v0.2s, v1.2s +; CHECK: uzp2 v0.4s, v0.4s, v2.4s +; CHECK: ret + %insert = insertelement <4 x i64> undef, i64 32, i64 0 + %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer + %1 = sext <4 x i32> %op1 to <4 x i64> + %2 = sext <4 x i32> %op2 to <4 x i64> + %mul = mul <4 x i64> %1, %2 + %shr = lshr <4 x i64> %mul, %splat + %res = trunc <4 x i64> %shr to <4 x i32> + ret <4 x i32> %res +} + +define void @smulh_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 { +; CHECK-LABEL: smulh_v8i32: +; VBITS_GE_256: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,8)]] +; VBITS_GE_256-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] +; VBITS_GE_256-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] +; VBITS_GE_256: smulh [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s +; VBITS_GE_256: st1w { [[RES]].s }, [[PG]], [x0] +; VBITS_GE_256: ret + %op1 = load <8 x i32>, <8 x i32>* %a + %op2 = load <8 x i32>, <8 x i32>* %b + %insert = insertelement <8 x i64> undef, i64 32, i64 0 + %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer + %1 = sext <8 x i32> %op1 to <8 x i64> + %2 = sext <8 x i32> %op2 to <8 x i64> + %mul = mul <8 x i64> %1, %2 + %shr = lshr <8 x i64> %mul, %splat + %res = trunc <8 x i64> %shr to <8 x i32> + store <8 x i32> %res, <8 x i32>* %a + ret void +} + +define void @smulh_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 { +; CHECK-LABEL: smulh_v16i32: +; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,16)]] +; VBITS_GE_512-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] +; VBITS_GE_512-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] +; VBITS_GE_512: smulh [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s +; VBITS_GE_512: st1w { [[RES]].s }, [[PG]], [x0] +; VBITS_GE_512: ret + %op1 = load <16 x i32>, <16 x i32>* %a + %op2 = load <16 x i32>, <16 x i32>* %b + %insert = insertelement <16 x i64> undef, i64 32, i64 0 + %splat = shufflevector <16 x i64> %insert, <16 x i64> undef, <16 x i32> zeroinitializer + %1 = sext <16 x i32> %op1 to <16 x i64> + %2 = sext <16 x i32> %op2 to <16 x i64> + %mul = mul <16 x i64> %1, %2 + %shr = lshr <16 x i64> %mul, %splat + %res = trunc <16 x i64> %shr to <16 x i32> + store <16 x i32> %res, <16 x i32>* %a + ret void +} + +define void @smulh_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 { +; CHECK-LABEL: smulh_v32i32: +; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,32)]] +; VBITS_GE_1024-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] +; VBITS_GE_1024-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] +; VBITS_GE_1024: smulh [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s +; VBITS_GE_1024: st1w { [[RES]].s }, [[PG]], [x0] +; VBITS_GE_1024: ret + %op1 = load <32 x i32>, <32 x i32>* %a + %op2 = load <32 x i32>, <32 x i32>* %b + %insert = insertelement <32 x i64> undef, i64 32, i64 0 + %splat = shufflevector <32 x i64> %insert, <32 x i64> undef, <32 x i32> zeroinitializer + %1 = sext <32 x i32> %op1 to <32 x i64> + %2 = sext <32 x i32> %op2 to <32 x i64> + %mul = mul <32 x i64> %1, %2 + %shr = lshr <32 x i64> %mul, %splat + %res = trunc <32 x i64> %shr to <32 x i32> + store <32 x i32> %res, <32 x i32>* %a + ret void +} + +define void @smulh_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 { +; CHECK-LABEL: smulh_v64i32: +; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,64)]] +; VBITS_GE_2048-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] +; VBITS_GE_2048-DAG: ld1w { 
[[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] +; VBITS_GE_2048: smulh [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s +; VBITS_GE_2048: st1w { [[RES]].s }, [[PG]], [x0] +; VBITS_GE_2048: ret + %op1 = load <64 x i32>, <64 x i32>* %a + %op2 = load <64 x i32>, <64 x i32>* %b + %insert = insertelement <64 x i64> undef, i64 32, i64 0 + %splat = shufflevector <64 x i64> %insert, <64 x i64> undef, <64 x i32> zeroinitializer + %1 = sext <64 x i32> %op1 to <64 x i64> + %2 = sext <64 x i32> %op2 to <64 x i64> + %mul = mul <64 x i64> %1, %2 + %shr = lshr <64 x i64> %mul, %splat + %res = trunc <64 x i64> %shr to <64 x i32> + store <64 x i32> %res, <64 x i32>* %a + ret void +} + +; Vector i64 multiplications are not legal for NEON so use SVE when available. +define <1 x i64> @smulh_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 { +; CHECK-LABEL: smulh_v1i64: +; CHECK: ptrue p0.d, vl1 +; CHECK: smulh z0.d, p0/m, z0.d, z1.d +; CHECK: ret + %insert = insertelement <1 x i128> undef, i128 64, i128 0 + %splat = shufflevector <1 x i128> %insert, <1 x i128> undef, <1 x i32> zeroinitializer + %1 = sext <1 x i64> %op1 to <1 x i128> + %2 = sext <1 x i64> %op2 to <1 x i128> + %mul = mul <1 x i128> %1, %2 + %shr = lshr <1 x i128> %mul, %splat + %res = trunc <1 x i128> %shr to <1 x i64> + ret <1 x i64> %res +} + +; Vector i64 multiplications are not legal for NEON so use SVE when available. +define <2 x i64> @smulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 { +; CHECK-LABEL: smulh_v2i64: +; CHECK: ptrue p0.d, vl2 +; CHECK: smulh z0.d, p0/m, z0.d, z1.d +; CHECK: ret + %insert = insertelement <2 x i128> undef, i128 64, i128 0 + %splat = shufflevector <2 x i128> %insert, <2 x i128> undef, <2 x i32> zeroinitializer + %1 = sext <2 x i64> %op1 to <2 x i128> + %2 = sext <2 x i64> %op2 to <2 x i128> + %mul = mul <2 x i128> %1, %2 + %shr = lshr <2 x i128> %mul, %splat + %res = trunc <2 x i128> %shr to <2 x i64> + ret <2 x i64> %res +} + +define void @smulh_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 { +; CHECK-LABEL: smulh_v4i64: +; VBITS_GE_256: ptrue [[PG:p[0-9]+]].d, vl[[#min(VBYTES,4)]] +; VBITS_GE_256-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0] +; VBITS_GE_256-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] +; VBITS_GE_256: smulh [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d +; VBITS_GE_256: st1d { [[RES]].d }, [[PG]], [x0] +; VBITS_GE_256: ret + %op1 = load <4 x i64>, <4 x i64>* %a + %op2 = load <4 x i64>, <4 x i64>* %b + %insert = insertelement <4 x i128> undef, i128 64, i128 0 + %splat = shufflevector <4 x i128> %insert, <4 x i128> undef, <4 x i32> zeroinitializer + %1 = sext <4 x i64> %op1 to <4 x i128> + %2 = sext <4 x i64> %op2 to <4 x i128> + %mul = mul <4 x i128> %1, %2 + %shr = lshr <4 x i128> %mul, %splat + %res = trunc <4 x i128> %shr to <4 x i64> + store <4 x i64> %res, <4 x i64>* %a + ret void +} + +define void @smulh_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 { +; CHECK-LABEL: smulh_v8i64: +; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl[[#min(VBYTES,8)]] +; VBITS_GE_512-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0] +; VBITS_GE_512-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] +; VBITS_GE_512: smulh [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d +; VBITS_GE_512: st1d { [[RES]].d }, [[PG]], [x0] +; VBITS_GE_512: ret + %op1 = load <8 x i64>, <8 x i64>* %a + %op2 = load <8 x i64>, <8 x i64>* %b + %insert = insertelement <8 x i128> undef, i128 64, i128 0 + %splat = shufflevector <8 x i128> %insert, <8 x i128> undef, <8 x i32> zeroinitializer + %1 = sext <8 x i64> %op1 to <8 x i128> + %2 = sext <8 x i64> %op2 to <8 x i128> + 
%mul = mul <8 x i128> %1, %2 + %shr = lshr <8 x i128> %mul, %splat + %res = trunc <8 x i128> %shr to <8 x i64> + store <8 x i64> %res, <8 x i64>* %a + ret void +} + +define void @smulh_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 { +; CHECK-LABEL: smulh_v16i64: +; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl[[#min(VBYTES,16)]] +; VBITS_GE_1024-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0] +; VBITS_GE_1024-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] +; VBITS_GE_1024: smulh [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d +; VBITS_GE_1024: st1d { [[RES]].d }, [[PG]], [x0] +; VBITS_GE_1024: ret + %op1 = load <16 x i64>, <16 x i64>* %a + %op2 = load <16 x i64>, <16 x i64>* %b + %insert = insertelement <16 x i128> undef, i128 64, i128 0 + %splat = shufflevector <16 x i128> %insert, <16 x i128> undef, <16 x i32> zeroinitializer + %1 = sext <16 x i64> %op1 to <16 x i128> + %2 = sext <16 x i64> %op2 to <16 x i128> + %mul = mul <16 x i128> %1, %2 + %shr = lshr <16 x i128> %mul, %splat + %res = trunc <16 x i128> %shr to <16 x i64> + store <16 x i64> %res, <16 x i64>* %a + ret void +} + +define void @smulh_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 { +; CHECK-LABEL: smulh_v32i64: +; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl[[#min(VBYTES,32)]] +; VBITS_GE_2048-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0] +; VBITS_GE_2048-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] +; VBITS_GE_2048: smulh [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d +; VBITS_GE_2048: st1d { [[RES]].d }, [[PG]], [x0] +; VBITS_GE_2048: ret + %op1 = load <32 x i64>, <32 x i64>* %a + %op2 = load <32 x i64>, <32 x i64>* %b + %insert = insertelement <32 x i128> undef, i128 64, i128 0 + %splat = shufflevector <32 x i128> %insert, <32 x i128> undef, <32 x i32> zeroinitializer + %1 = sext <32 x i64> %op1 to <32 x i128> + %2 = sext <32 x i64> %op2 to <32 x i128> + %mul = mul <32 x i128> %1, %2 + %shr = lshr <32 x i128> %mul, %splat + %res = trunc <32 x i128> %shr to <32 x i64> + store <32 x i64> %res, <32 x i64>* %a + ret void +} + +; +; UMULH +; + +; Don't use SVE for 64-bit vectors. +define <8 x i8> @umulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { +; CHECK-LABEL: umulh_v8i8: +; CHECK: umull v0.8h, v0.8b, v1.8b +; CHECK: ushr v1.8h, v0.8h, #8 +; CHECK: umov w8, v1.h[0] +; CHECK: fmov s0, w8 +; CHECK: umov w8, v1.h[1] +; CHECK: mov v0.b[1], w8 +; CHECK: umov w8, v1.h[2] +; CHECK: mov v0.b[2], w8 +; CHECK: umov w8, v1.h[3] +; CHECK: mov v0.b[3], w8 +; CHECK: ret + %insert = insertelement <8 x i16> undef, i16 8, i64 0 + %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer + %1 = zext <8 x i8> %op1 to <8 x i16> + %2 = zext <8 x i8> %op2 to <8 x i16> + %mul = mul <8 x i16> %1, %2 + %shr = lshr <8 x i16> %mul, %splat + %res = trunc <8 x i16> %shr to <8 x i8> + ret <8 x i8> %res +} + +; Don't use SVE for 128-bit vectors. 
+define <16 x i8> @umulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { +; CHECK-LABEL: umulh_v16i8: +; CHECK: umull2 v2.8h, v0.16b, v1.16b +; CHECK: umull v0.8h, v0.8b, v1.8b +; CHECK: uzp2 v0.16b, v0.16b, v2.16b +; CHECK: ret + %insert = insertelement <16 x i16> undef, i16 8, i64 0 + %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer + %1 = zext <16 x i8> %op1 to <16 x i16> + %2 = zext <16 x i8> %op2 to <16 x i16> + %mul = mul <16 x i16> %1, %2 + %shr = lshr <16 x i16> %mul, %splat + %res = trunc <16 x i16> %shr to <16 x i8> + ret <16 x i8> %res +} + +define void @umulh_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { +; CHECK-LABEL: umulh_v32i8: +; VBITS_GE_256: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,32)]] +; VBITS_GE_256-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0] +; VBITS_GE_256-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1] +; VBITS_GE_256: umulh [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b +; VBITS_GE_256: st1b { [[RES]].b }, [[PG]], [x0] +; VBITS_GE_256: ret + %op1 = load <32 x i8>, <32 x i8>* %a + %op2 = load <32 x i8>, <32 x i8>* %b + %insert = insertelement <32 x i16> undef, i16 8, i64 0 + %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer + %1 = zext <32 x i8> %op1 to <32 x i16> + %2 = zext <32 x i8> %op2 to <32 x i16> + %mul = mul <32 x i16> %1, %2 + %shr = lshr <32 x i16> %mul, %splat + %res = trunc <32 x i16> %shr to <32 x i8> + store <32 x i8> %res, <32 x i8>* %a + ret void +} + +define void @umulh_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 { +; CHECK-LABEL: umulh_v64i8: +; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,64)]] +; VBITS_GE_512-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0] +; VBITS_GE_512-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1] +; VBITS_GE_512: umulh [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b +; VBITS_GE_512: st1b { [[RES]].b }, [[PG]], [x0] +; VBITS_GE_512: ret + %op1 = load <64 x i8>, <64 x i8>* %a + %op2 = load <64 x i8>, <64 x i8>* %b + %insert = insertelement <64 x i16> undef, i16 8, i64 0 + %splat = shufflevector <64 x i16> %insert, <64 x i16> undef, <64 x i32> zeroinitializer + %1 = zext <64 x i8> %op1 to <64 x i16> + %2 = zext <64 x i8> %op2 to <64 x i16> + %mul = mul <64 x i16> %1, %2 + %shr = lshr <64 x i16> %mul, %splat + %res = trunc <64 x i16> %shr to <64 x i8> + store <64 x i8> %res, <64 x i8>* %a + ret void +} + +define void @umulh_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 { +; CHECK-LABEL: umulh_v128i8: +; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,128)]] +; VBITS_GE_1024-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0] +; VBITS_GE_1024-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1] +; VBITS_GE_1024: umulh [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b +; VBITS_GE_1024: st1b { [[RES]].b }, [[PG]], [x0] +; VBITS_GE_1024: ret + %op1 = load <128 x i8>, <128 x i8>* %a + %op2 = load <128 x i8>, <128 x i8>* %b + %insert = insertelement <128 x i16> undef, i16 8, i64 0 + %splat = shufflevector <128 x i16> %insert, <128 x i16> undef, <128 x i32> zeroinitializer + %1 = zext <128 x i8> %op1 to <128 x i16> + %2 = zext <128 x i8> %op2 to <128 x i16> + %mul = mul <128 x i16> %1, %2 + %shr = lshr <128 x i16> %mul, %splat + %res = trunc <128 x i16> %shr to <128 x i8> + store <128 x i8> %res, <128 x i8>* %a + ret void +} + +define void @umulh_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 { +; CHECK-LABEL: umulh_v256i8: +; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,256)]] +; VBITS_GE_2048-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0] +; 
VBITS_GE_2048-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1] +; VBITS_GE_2048: umulh [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b +; VBITS_GE_2048: st1b { [[RES]].b }, [[PG]], [x0] +; VBITS_GE_2048: ret + %op1 = load <256 x i8>, <256 x i8>* %a + %op2 = load <256 x i8>, <256 x i8>* %b + %insert = insertelement <256 x i16> undef, i16 8, i64 0 + %splat = shufflevector <256 x i16> %insert, <256 x i16> undef, <256 x i32> zeroinitializer + %1 = zext <256 x i8> %op1 to <256 x i16> + %2 = zext <256 x i8> %op2 to <256 x i16> + %mul = mul <256 x i16> %1, %2 + %shr = lshr <256 x i16> %mul, %splat + %res = trunc <256 x i16> %shr to <256 x i8> + store <256 x i8> %res, <256 x i8>* %a + ret void +} + +; Don't use SVE for 64-bit vectors. +define <4 x i16> @umulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { +; CHECK-LABEL: umulh_v4i16: +; CHECK: umull v0.4s, v0.4h, v1.4h +; CHECK: ushr v0.4s, v0.4s, #16 +; CHECK: mov w8, v0.s[1] +; CHECK: mov w9, v0.s[2] +; CHECK: mov w10, v0.s[3] +; CHECK: mov v0.h[1], w8 +; CHECK: mov v0.h[2], w9 +; CHECK: mov v0.h[3], w10 +; CHECK: ret + %insert = insertelement <4 x i32> undef, i32 16, i64 0 + %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer + %1 = zext <4 x i16> %op1 to <4 x i32> + %2 = zext <4 x i16> %op2 to <4 x i32> + %mul = mul <4 x i32> %1, %2 + %shr = lshr <4 x i32> %mul, %splat + %res = trunc <4 x i32> %shr to <4 x i16> + ret <4 x i16> %res +} + +; Don't use SVE for 128-bit vectors. +define <8 x i16> @umulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { +; CHECK-LABEL: umulh_v8i16: +; CHECK: umull2 v2.4s, v0.8h, v1.8h +; CHECK: umull v0.4s, v0.4h, v1.4h +; CHECK: uzp2 v0.8h, v0.8h, v2.8h +; CHECK: ret + %insert = insertelement <8 x i32> undef, i32 16, i64 0 + %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer + %1 = zext <8 x i16> %op1 to <8 x i32> + %2 = zext <8 x i16> %op2 to <8 x i32> + %mul = mul <8 x i32> %1, %2 + %shr = lshr <8 x i32> %mul, %splat + %res = trunc <8 x i32> %shr to <8 x i16> + ret <8 x i16> %res +} + +define void @umulh_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 { +; CHECK-LABEL: umulh_v16i16: +; VBITS_GE_256: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,16)]] +; VBITS_GE_256-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] +; VBITS_GE_256-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] +; VBITS_GE_256: umulh [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h +; VBITS_GE_256: st1h { [[RES]].h }, [[PG]], [x0] +; VBITS_GE_256: ret + %op1 = load <16 x i16>, <16 x i16>* %a + %op2 = load <16 x i16>, <16 x i16>* %b + %insert = insertelement <16 x i32> undef, i32 16, i64 0 + %splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer + %1 = zext <16 x i16> %op1 to <16 x i32> + %2 = zext <16 x i16> %op2 to <16 x i32> + %mul = mul <16 x i32> %1, %2 + %shr = lshr <16 x i32> %mul, %splat + %res = trunc <16 x i32> %shr to <16 x i16> + store <16 x i16> %res, <16 x i16>* %a + ret void +} + +define void @umulh_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 { +; CHECK-LABEL: umulh_v32i16: +; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,32)]] +; VBITS_GE_512-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] +; VBITS_GE_512-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] +; VBITS_GE_512: umulh [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h +; VBITS_GE_512: st1h { [[RES]].h }, [[PG]], [x0] +; VBITS_GE_512: ret + %op1 = load <32 x i16>, <32 x i16>* %a + %op2 = load <32 x i16>, <32 x i16>* %b + %insert = insertelement <32 x i32> undef, i32 16, i64 0 + %splat = 
shufflevector <32 x i32> %insert, <32 x i32> undef, <32 x i32> zeroinitializer + %1 = zext <32 x i16> %op1 to <32 x i32> + %2 = zext <32 x i16> %op2 to <32 x i32> + %mul = mul <32 x i32> %1, %2 + %shr = lshr <32 x i32> %mul, %splat + %res = trunc <32 x i32> %shr to <32 x i16> + store <32 x i16> %res, <32 x i16>* %a + ret void +} + +define void @umulh_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 { +; CHECK-LABEL: umulh_v64i16: +; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,64)]] +; VBITS_GE_1024-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] +; VBITS_GE_1024-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] +; VBITS_GE_1024: umulh [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h +; VBITS_GE_1024: st1h { [[RES]].h }, [[PG]], [x0] +; VBITS_GE_1024: ret + %op1 = load <64 x i16>, <64 x i16>* %a + %op2 = load <64 x i16>, <64 x i16>* %b + %insert = insertelement <64 x i32> undef, i32 16, i64 0 + %splat = shufflevector <64 x i32> %insert, <64 x i32> undef, <64 x i32> zeroinitializer + %1 = zext <64 x i16> %op1 to <64 x i32> + %2 = zext <64 x i16> %op2 to <64 x i32> + %mul = mul <64 x i32> %1, %2 + %shr = lshr <64 x i32> %mul, %splat + %res = trunc <64 x i32> %shr to <64 x i16> + store <64 x i16> %res, <64 x i16>* %a + ret void +} + +define void @umulh_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 { +; CHECK-LABEL: umulh_v128i16: +; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,128)]] +; VBITS_GE_2048-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] +; VBITS_GE_2048-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] +; VBITS_GE_2048: umulh [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h +; VBITS_GE_2048: st1h { [[RES]].h }, [[PG]], [x0] +; VBITS_GE_2048: ret + %op1 = load <128 x i16>, <128 x i16>* %a + %op2 = load <128 x i16>, <128 x i16>* %b + %insert = insertelement <128 x i32> undef, i32 16, i64 0 + %splat = shufflevector <128 x i32> %insert, <128 x i32> undef, <128 x i32> zeroinitializer + %1 = zext <128 x i16> %op1 to <128 x i32> + %2 = zext <128 x i16> %op2 to <128 x i32> + %mul = mul <128 x i32> %1, %2 + %shr = lshr <128 x i32> %mul, %splat + %res = trunc <128 x i32> %shr to <128 x i16> + store <128 x i16> %res, <128 x i16>* %a + ret void +} + +; Vector i64 multiplications are not legal for NEON so use SVE when available. +define <2 x i32> @umulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 { +; CHECK-LABEL: umulh_v2i32: +; CHECK: ushll v0.2d, v0.2s, #0 +; CHECK: ushll v1.2d, v1.2s, #0 +; CHECK: ptrue p0.d, vl2 +; CHECK: mul z0.d, p0/m, z0.d, z1.d +; CHECK: shrn v0.2s, v0.2d, #32 +; CHECK: ret + %insert = insertelement <2 x i64> undef, i64 32, i64 0 + %splat = shufflevector <2 x i64> %insert, <2 x i64> undef, <2 x i32> zeroinitializer + %1 = zext <2 x i32> %op1 to <2 x i64> + %2 = zext <2 x i32> %op2 to <2 x i64> + %mul = mul <2 x i64> %1, %2 + %shr = lshr <2 x i64> %mul, %splat + %res = trunc <2 x i64> %shr to <2 x i32> + ret <2 x i32> %res +} + +; Don't use SVE for 128-bit vectors. 
+define <4 x i32> @umulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 { +; CHECK-LABEL: umulh_v4i32: +; CHECK: umull2 v2.2d, v0.4s, v1.4s +; CHECK: umull v0.2d, v0.2s, v1.2s +; CHECK: uzp2 v0.4s, v0.4s, v2.4s +; CHECK: ret + %insert = insertelement <4 x i64> undef, i64 32, i64 0 + %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer + %1 = zext <4 x i32> %op1 to <4 x i64> + %2 = zext <4 x i32> %op2 to <4 x i64> + %mul = mul <4 x i64> %1, %2 + %shr = lshr <4 x i64> %mul, %splat + %res = trunc <4 x i64> %shr to <4 x i32> + ret <4 x i32> %res +} + +define void @umulh_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 { +; CHECK-LABEL: umulh_v8i32: +; VBITS_GE_256: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,8)]] +; VBITS_GE_256-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] +; VBITS_GE_256-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] +; VBITS_GE_256: umulh [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s +; VBITS_GE_256: st1w { [[RES]].s }, [[PG]], [x0] +; VBITS_GE_256: ret + %op1 = load <8 x i32>, <8 x i32>* %a + %op2 = load <8 x i32>, <8 x i32>* %b + %insert = insertelement <8 x i64> undef, i64 32, i64 0 + %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer + %1 = zext <8 x i32> %op1 to <8 x i64> + %2 = zext <8 x i32> %op2 to <8 x i64> + %mul = mul <8 x i64> %1, %2 + %shr = lshr <8 x i64> %mul, %splat + %res = trunc <8 x i64> %shr to <8 x i32> + store <8 x i32> %res, <8 x i32>* %a + ret void +} + +define void @umulh_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 { +; CHECK-LABEL: umulh_v16i32: +; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,16)]] +; VBITS_GE_512-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] +; VBITS_GE_512-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] +; VBITS_GE_512: umulh [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s +; VBITS_GE_512: st1w { [[RES]].s }, [[PG]], [x0] +; VBITS_GE_512: ret + %op1 = load <16 x i32>, <16 x i32>* %a + %op2 = load <16 x i32>, <16 x i32>* %b + %insert = insertelement <16 x i64> undef, i64 32, i64 0 + %splat = shufflevector <16 x i64> %insert, <16 x i64> undef, <16 x i32> zeroinitializer + %1 = zext <16 x i32> %op1 to <16 x i64> + %2 = zext <16 x i32> %op2 to <16 x i64> + %mul = mul <16 x i64> %1, %2 + %shr = lshr <16 x i64> %mul, %splat + %res = trunc <16 x i64> %shr to <16 x i32> + store <16 x i32> %res, <16 x i32>* %a + ret void +} + +define void @umulh_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 { +; CHECK-LABEL: umulh_v32i32: +; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,32)]] +; VBITS_GE_1024-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] +; VBITS_GE_1024-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] +; VBITS_GE_1024: umulh [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s +; VBITS_GE_1024: st1w { [[RES]].s }, [[PG]], [x0] +; VBITS_GE_1024: ret + %op1 = load <32 x i32>, <32 x i32>* %a + %op2 = load <32 x i32>, <32 x i32>* %b + %insert = insertelement <32 x i64> undef, i64 32, i64 0 + %splat = shufflevector <32 x i64> %insert, <32 x i64> undef, <32 x i32> zeroinitializer + %1 = zext <32 x i32> %op1 to <32 x i64> + %2 = zext <32 x i32> %op2 to <32 x i64> + %mul = mul <32 x i64> %1, %2 + %shr = lshr <32 x i64> %mul, %splat + %res = trunc <32 x i64> %shr to <32 x i32> + store <32 x i32> %res, <32 x i32>* %a + ret void +} + +define void @umulh_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 { +; CHECK-LABEL: umulh_v64i32: +; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,64)]] +; VBITS_GE_2048-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] +; VBITS_GE_2048-DAG: ld1w { 
[[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] +; VBITS_GE_2048: umulh [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s +; VBITS_GE_2048: st1w { [[RES]].s }, [[PG]], [x0] +; VBITS_GE_2048: ret + %op1 = load <64 x i32>, <64 x i32>* %a + %op2 = load <64 x i32>, <64 x i32>* %b + %insert = insertelement <64 x i64> undef, i64 32, i64 0 + %splat = shufflevector <64 x i64> %insert, <64 x i64> undef, <64 x i32> zeroinitializer + %1 = zext <64 x i32> %op1 to <64 x i64> + %2 = zext <64 x i32> %op2 to <64 x i64> + %mul = mul <64 x i64> %1, %2 + %shr = lshr <64 x i64> %mul, %splat + %res = trunc <64 x i64> %shr to <64 x i32> + store <64 x i32> %res, <64 x i32>* %a + ret void +} + +; Vector i64 multiplications are not legal for NEON so use SVE when available. +define <1 x i64> @umulh_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 { +; CHECK-LABEL: umulh_v1i64: +; CHECK: ptrue p0.d, vl1 +; CHECK: umulh z0.d, p0/m, z0.d, z1.d +; CHECK: ret + %insert = insertelement <1 x i128> undef, i128 64, i128 0 + %splat = shufflevector <1 x i128> %insert, <1 x i128> undef, <1 x i32> zeroinitializer + %1 = zext <1 x i64> %op1 to <1 x i128> + %2 = zext <1 x i64> %op2 to <1 x i128> + %mul = mul <1 x i128> %1, %2 + %shr = lshr <1 x i128> %mul, %splat + %res = trunc <1 x i128> %shr to <1 x i64> + ret <1 x i64> %res +} + +; Vector i64 multiplications are not legal for NEON so use SVE when available. +define <2 x i64> @umulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 { +; CHECK-LABEL: umulh_v2i64: +; CHECK: ptrue p0.d, vl2 +; CHECK: umulh z0.d, p0/m, z0.d, z1.d +; CHECK: ret + %insert = insertelement <2 x i128> undef, i128 64, i128 0 + %splat = shufflevector <2 x i128> %insert, <2 x i128> undef, <2 x i32> zeroinitializer + %1 = zext <2 x i64> %op1 to <2 x i128> + %2 = zext <2 x i64> %op2 to <2 x i128> + %mul = mul <2 x i128> %1, %2 + %shr = lshr <2 x i128> %mul, %splat + %res = trunc <2 x i128> %shr to <2 x i64> + ret <2 x i64> %res +} + +define void @umulh_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 { +; CHECK-LABEL: umulh_v4i64: +; VBITS_GE_256: ptrue [[PG:p[0-9]+]].d, vl[[#min(VBYTES,4)]] +; VBITS_GE_256-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0] +; VBITS_GE_256-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] +; VBITS_GE_256: umulh [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d +; VBITS_GE_256: st1d { [[RES]].d }, [[PG]], [x0] +; VBITS_GE_256: ret + %op1 = load <4 x i64>, <4 x i64>* %a + %op2 = load <4 x i64>, <4 x i64>* %b + %insert = insertelement <4 x i128> undef, i128 64, i128 0 + %splat = shufflevector <4 x i128> %insert, <4 x i128> undef, <4 x i32> zeroinitializer + %1 = zext <4 x i64> %op1 to <4 x i128> + %2 = zext <4 x i64> %op2 to <4 x i128> + %mul = mul <4 x i128> %1, %2 + %shr = lshr <4 x i128> %mul, %splat + %res = trunc <4 x i128> %shr to <4 x i64> + store <4 x i64> %res, <4 x i64>* %a + ret void +} + +define void @umulh_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 { +; CHECK-LABEL: umulh_v8i64: +; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl[[#min(VBYTES,8)]] +; VBITS_GE_512-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0] +; VBITS_GE_512-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] +; VBITS_GE_512: umulh [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d +; VBITS_GE_512: st1d { [[RES]].d }, [[PG]], [x0] +; VBITS_GE_512: ret + %op1 = load <8 x i64>, <8 x i64>* %a + %op2 = load <8 x i64>, <8 x i64>* %b + %insert = insertelement <8 x i128> undef, i128 64, i128 0 + %splat = shufflevector <8 x i128> %insert, <8 x i128> undef, <8 x i32> zeroinitializer + %1 = zext <8 x i64> %op1 to <8 x i128> + %2 = zext <8 x i64> %op2 to <8 x i128> + 
%mul = mul <8 x i128> %1, %2 + %shr = lshr <8 x i128> %mul, %splat + %res = trunc <8 x i128> %shr to <8 x i64> + store <8 x i64> %res, <8 x i64>* %a + ret void +} + +define void @umulh_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 { +; CHECK-LABEL: umulh_v16i64: +; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl[[#min(VBYTES,16)]] +; VBITS_GE_1024-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0] +; VBITS_GE_1024-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] +; VBITS_GE_1024: umulh [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d +; VBITS_GE_1024: st1d { [[RES]].d }, [[PG]], [x0] +; VBITS_GE_1024: ret + %op1 = load <16 x i64>, <16 x i64>* %a + %op2 = load <16 x i64>, <16 x i64>* %b + %insert = insertelement <16 x i128> undef, i128 64, i128 0 + %splat = shufflevector <16 x i128> %insert, <16 x i128> undef, <16 x i32> zeroinitializer + %1 = zext <16 x i64> %op1 to <16 x i128> + %2 = zext <16 x i64> %op2 to <16 x i128> + %mul = mul <16 x i128> %1, %2 + %shr = lshr <16 x i128> %mul, %splat + %res = trunc <16 x i128> %shr to <16 x i64> + store <16 x i64> %res, <16 x i64>* %a + ret void +} + +define void @umulh_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 { +; CHECK-LABEL: umulh_v32i64: +; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl[[#min(VBYTES,32)]] +; VBITS_GE_2048-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0] +; VBITS_GE_2048-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] +; VBITS_GE_2048: umulh [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d +; VBITS_GE_2048: st1d { [[RES]].d }, [[PG]], [x0] +; VBITS_GE_2048: ret + %op1 = load <32 x i64>, <32 x i64>* %a + %op2 = load <32 x i64>, <32 x i64>* %b + %insert = insertelement <32 x i128> undef, i128 64, i128 0 + %splat = shufflevector <32 x i128> %insert, <32 x i128> undef, <32 x i32> zeroinitializer + %1 = zext <32 x i64> %op1 to <32 x i128> + %2 = zext <32 x i64> %op2 to <32 x i128> + %mul = mul <32 x i128> %1, %2 + %shr = lshr <32 x i128> %mul, %splat + %res = trunc <32 x i128> %shr to <32 x i64> + store <32 x i64> %res, <32 x i64>* %a + ret void +} +attributes #0 = { "target-features"="+sve" } diff --git a/llvm/test/CodeGen/AArch64/sve-int-arith-imm.ll b/llvm/test/CodeGen/AArch64/sve-int-arith-imm.ll index fba722e..3b1c332 100644 --- a/llvm/test/CodeGen/AArch64/sve-int-arith-imm.ll +++ b/llvm/test/CodeGen/AArch64/sve-int-arith-imm.ll @@ -776,7 +776,7 @@ define @lsr_i64( %a){ ret %lshr } -define @sdiv_const( %a) { +define @sdiv_const( %a) #0 { ; CHECK-LABEL: sdiv_const: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: mov z1.s, #3 // =0x3 @@ -788,7 +788,7 @@ entry: ret %div } -define @udiv_const( %a) { +define @udiv_const( %a) #0 { ; CHECK-LABEL: udiv_const: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: mov z1.s, #3 // =0x3 @@ -799,3 +799,5 @@ entry: %div = udiv %a, shufflevector ( insertelement ( undef, i32 3, i32 0), undef, zeroinitializer) ret %div } + +attributes #0 = { minsize } diff --git a/llvm/test/CodeGen/AArch64/sve-int-mulh-pred.ll b/llvm/test/CodeGen/AArch64/sve-int-mulh-pred.ll new file mode 100644 index 0000000..c636f11 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-int-mulh-pred.ll @@ -0,0 +1,140 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu < %s | FileCheck %s + +; +; SMULH +; + +define @smulh_i8( %a, %b) #0 { +; CHECK-LABEL: smulh_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: smulh z0.b, p0/m, z0.b, z1.b +; CHECK-NEXT: ret + %insert = insertelement undef, i16 8, i64 0 + %splat = shufflevector %insert, undef, zeroinitializer + %1 = sext %a to + %2 = 
sext %b to + %mul = mul %1, %2 + %shr = lshr %mul, %splat + %tr = trunc %shr to + ret %tr +} + +define @smulh_i16( %a, %b) #0 { +; CHECK-LABEL: smulh_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: smulh z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ret + %insert = insertelement undef, i32 16, i64 0 + %splat = shufflevector %insert, undef, zeroinitializer + %1 = sext %a to + %2 = sext %b to + %mul = mul %1, %2 + %shr = lshr %mul, %splat + %tr = trunc %shr to + ret %tr +} + +define @smulh_i32( %a, %b) #0 { +; CHECK-LABEL: smulh_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: smulh z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: ret + %insert = insertelement undef, i64 32, i64 0 + %splat = shufflevector %insert, undef, zeroinitializer + %1 = sext %a to + %2 = sext %b to + %mul = mul %1, %2 + %shr = lshr %mul, %splat + %tr = trunc %shr to + ret %tr +} + +define @smulh_i64( %a, %b) #0 { +; CHECK-LABEL: smulh_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: smulh z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: ret + %insert = insertelement undef, i128 64, i64 0 + %splat = shufflevector %insert, undef, zeroinitializer + %1 = sext %a to + %2 = sext %b to + %mul = mul %1, %2 + %shr = lshr %mul, %splat + %tr = trunc %shr to + ret %tr +} + +; +; UMULH +; + +define @umulh_i8( %a, %b) #0 { +; CHECK-LABEL: umulh_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: umulh z0.b, p0/m, z0.b, z1.b +; CHECK-NEXT: ret + %insert = insertelement undef, i16 8, i64 0 + %splat = shufflevector %insert, undef, zeroinitializer + %1 = zext %a to + %2 = zext %b to + %mul = mul %1, %2 + %shr = lshr %mul, %splat + %tr = trunc %shr to + ret %tr +} + +define @umulh_i16( %a, %b) #0 { +; CHECK-LABEL: umulh_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: umulh z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ret + %insert = insertelement undef, i32 16, i64 0 + %splat = shufflevector %insert, undef, zeroinitializer + %1 = zext %a to + %2 = zext %b to + %mul = mul %1, %2 + %shr = lshr %mul, %splat + %tr = trunc %shr to + ret %tr +} + +define @umulh_i32( %a, %b) #0 { +; CHECK-LABEL: umulh_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: umulh z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: ret + %insert = insertelement undef, i64 32, i64 0 + %splat = shufflevector %insert, undef, zeroinitializer + %1 = zext %a to + %2 = zext %b to + %mul = mul %1, %2 + %shr = lshr %mul, %splat + %tr = trunc %shr to + ret %tr +} + +define @umulh_i64( %a, %b) #0 { +; CHECK-LABEL: umulh_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: umulh z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: ret + %insert = insertelement undef, i128 64, i64 0 + %splat = shufflevector %insert, undef, zeroinitializer + %1 = zext %a to + %2 = zext %b to + %mul = mul %1, %2 + %shr = lshr %mul, %splat + %tr = trunc %shr to + ret %tr +} + +attributes #0 = { "target-features"="+sve" } diff --git a/llvm/test/CodeGen/AArch64/sve2-int-mulh.ll b/llvm/test/CodeGen/AArch64/sve2-int-mulh.ll new file mode 100644 index 0000000..c929a3f --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2-int-mulh.ll @@ -0,0 +1,132 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu < %s | FileCheck %s + +; +; SMULH +; + +define @smulh_i8( %a, %b) #0 { +; CHECK-LABEL: smulh_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: smulh z0.b, z0.b, z1.b +; CHECK-NEXT: ret + %insert = insertelement undef, i16 8, i64 0 + %splat = shufflevector %insert, undef, zeroinitializer + %1 = sext %a to + %2 = 
sext %b to + %mul = mul %1, %2 + %shr = lshr %mul, %splat + %tr = trunc %shr to + ret %tr +} + +define @smulh_i16( %a, %b) #0 { +; CHECK-LABEL: smulh_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: smulh z0.h, z0.h, z1.h +; CHECK-NEXT: ret + %insert = insertelement undef, i32 16, i64 0 + %splat = shufflevector %insert, undef, zeroinitializer + %1 = sext %a to + %2 = sext %b to + %mul = mul %1, %2 + %shr = lshr %mul, %splat + %tr = trunc %shr to + ret %tr +} + +define @smulh_i32( %a, %b) #0 { +; CHECK-LABEL: smulh_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: smulh z0.s, z0.s, z1.s +; CHECK-NEXT: ret + %insert = insertelement undef, i64 32, i64 0 + %splat = shufflevector %insert, undef, zeroinitializer + %1 = sext %a to + %2 = sext %b to + %mul = mul %1, %2 + %shr = lshr %mul, %splat + %tr = trunc %shr to + ret %tr +} + +define @smulh_i64( %a, %b) #0 { +; CHECK-LABEL: smulh_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: smulh z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %insert = insertelement undef, i128 64, i64 0 + %splat = shufflevector %insert, undef, zeroinitializer + %1 = sext %a to + %2 = sext %b to + %mul = mul %1, %2 + %shr = lshr %mul, %splat + %tr = trunc %shr to + ret %tr +} + +; +; UMULH +; + +define @umulh_i8( %a, %b) #0 { +; CHECK-LABEL: umulh_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: umulh z0.b, z0.b, z1.b +; CHECK-NEXT: ret + %insert = insertelement undef, i16 8, i64 0 + %splat = shufflevector %insert, undef, zeroinitializer + %1 = zext %a to + %2 = zext %b to + %mul = mul %1, %2 + %shr = lshr %mul, %splat + %tr = trunc %shr to + ret %tr +} + +define @umulh_i16( %a, %b) #0 { +; CHECK-LABEL: umulh_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: umulh z0.h, z0.h, z1.h +; CHECK-NEXT: ret + %insert = insertelement undef, i32 16, i64 0 + %splat = shufflevector %insert, undef, zeroinitializer + %1 = zext %a to + %2 = zext %b to + %mul = mul %1, %2 + %shr = lshr %mul, %splat + %tr = trunc %shr to + ret %tr +} + +define @umulh_i32( %a, %b) #0 { +; CHECK-LABEL: umulh_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: umulh z0.s, z0.s, z1.s +; CHECK-NEXT: ret + %insert = insertelement undef, i64 32, i64 0 + %splat = shufflevector %insert, undef, zeroinitializer + %1 = zext %a to + %2 = zext %b to + %mul = mul %1, %2 + %shr = lshr %mul, %splat + %tr = trunc %shr to + ret %tr +} + +define @umulh_i64( %a, %b) #0 { +; CHECK-LABEL: umulh_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: umulh z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %insert = insertelement undef, i128 64, i64 0 + %splat = shufflevector %insert, undef, zeroinitializer + %1 = zext %a to + %2 = zext %b to + %mul = mul %1, %2 + %shr = lshr %mul, %splat + %tr = trunc %shr to + ret %tr +} + +attributes #0 = { "target-features"="+sve2" } -- 2.7.4
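
Closing note, not part of the patch: the IR idiom the mulh tests use
(sign- or zero-extend to twice the element width, multiply, shift right by the
element width, truncate) computes exactly the high half of the product, which
is what one smulh/umulh lane returns. A minimal scalar model of a single
64-bit lane, assuming a compiler that provides __int128 (GCC/Clang):

#include <cassert>
#include <cstdint>

// One 64-bit lane of umulh: zero-extend both operands to 128 bits,
// multiply, and keep the top 64 bits of the product.
static uint64_t umulh64(uint64_t a, uint64_t b) {
  return static_cast<uint64_t>((static_cast<unsigned __int128>(a) * b) >> 64);
}

// One 64-bit lane of smulh: the same, but with sign extension.
static int64_t smulh64(int64_t a, int64_t b) {
  return static_cast<int64_t>((static_cast<__int128>(a) * b) >> 64);
}

int main() {
  assert(umulh64(~0ULL, 2) == 1);                           // (2^64-1)*2 -> high half 1
  assert(smulh64(INT64_MIN, 2) == -1);                      // -2^63*2 = -2^64 -> high half -1
  assert(smulh64(INT64_C(1) << 32, INT64_C(1) << 32) == 1); // 2^32*2^32 = 2^64
  return 0;
}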