bool SelectSVEArithImm(SDValue N, MVT VT, SDValue &Imm);
bool SelectSVERegRegAddrMode(SDValue N, unsigned Scale, SDValue &Base,
SDValue &Offset);
+
+ bool SelectAllActivePredicate(SDValue N);
};
} // end anonymous namespace
return false;
}
+
+bool AArch64DAGToDAGISel::SelectAllActivePredicate(SDValue N) {
+ const AArch64TargetLowering *TLI =
+ static_cast<const AArch64TargetLowering *>(getTargetLowering());
+
+ return TLI->isAllActivePredicate(N);
+}
Zero);
}
+static bool isAllActivePredicate(SDValue N) {
+ unsigned NumElts = N.getValueType().getVectorMinNumElements();
+
+ // Look through cast.
+ while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST) {
+ N = N.getOperand(0);
+ // When reinterpreting from a type with fewer elements the "new" elements
+ // are not active, so bail if they're likely to be used.
+ if (N.getValueType().getVectorMinNumElements() < NumElts)
+ return false;
+ }
+
+ // "ptrue p.<ty>, all" can be considered all active when <ty> is the same size
+ // or smaller than the implicit element type represented by N.
+ // NOTE: A larger element count implies a smaller element type.
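+ // For example (mirroring the tests added in this patch), a "ptrue p.b, all"
+ // (16 elements) reinterpreted down to an nxv4i1 predicate still has all 4
+ // implicit lanes active, whereas a "ptrue p.d, all" (2 elements)
+ // reinterpreted to nxv4i1 does not.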
+ if (N.getOpcode() == AArch64ISD::PTRUE &&
+ N.getConstantOperandVal(0) == AArch64SVEPredPattern::all)
+ return N.getValueType().getVectorMinNumElements() >= NumElts;
+
+ return false;
+}
+
// If a merged operation has no inactive lanes we can relax it to a predicated
// or unpredicated operation, which potentially allows better isel (perhaps
// using immediate forms) or relaxing register reuse requirements.
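+// For example, the tests added in this patch show @llvm.aarch64.sve.mul with an
+// all-active predicate being selected as the unpredicated "mul z0.s, z0.s, z1.s"
+// rather than the predicated form.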
SDValue Pg = N->getOperand(1);
// ISD way to specify an all active predicate.
- if ((Pg.getOpcode() == AArch64ISD::PTRUE) &&
- (Pg.getConstantOperandVal(0) == AArch64SVEPredPattern::all))
+ if (isAllActivePredicate(Pg))
return DAG.getNode(PredOpc, SDLoc(N), N->getValueType(0), Pg,
N->getOperand(2), N->getOperand(3));
N->getOperand(1));
case Intrinsic::aarch64_sve_ext:
return LowerSVEIntrinsicEXT(N, DAG);
+ case Intrinsic::aarch64_sve_mul:
+ return convertMergedOpToPredOp(N, AArch64ISD::MUL_PRED, DAG);
+ case Intrinsic::aarch64_sve_smulh:
+ return convertMergedOpToPredOp(N, AArch64ISD::MULHS_PRED, DAG);
+ case Intrinsic::aarch64_sve_umulh:
+ return convertMergedOpToPredOp(N, AArch64ISD::MULHU_PRED, DAG);
case Intrinsic::aarch64_sve_smin:
return convertMergedOpToPredOp(N, AArch64ISD::SMIN_PRED, DAG);
case Intrinsic::aarch64_sve_umin:
return convertMergedOpToPredOp(N, AArch64ISD::UMIN_PRED, DAG);
case Intrinsic::aarch64_sve_lsr:
return convertMergedOpToPredOp(N, AArch64ISD::SRL_PRED, DAG);
case Intrinsic::aarch64_sve_asr:
return convertMergedOpToPredOp(N, AArch64ISD::SRA_PRED, DAG);
+ case Intrinsic::aarch64_sve_fadd:
+ return convertMergedOpToPredOp(N, AArch64ISD::FADD_PRED, DAG);
+ case Intrinsic::aarch64_sve_fsub:
+ return convertMergedOpToPredOp(N, AArch64ISD::FSUB_PRED, DAG);
+ case Intrinsic::aarch64_sve_fmul:
+ return convertMergedOpToPredOp(N, AArch64ISD::FMUL_PRED, DAG);
case Intrinsic::aarch64_sve_cmphs:
if (!N->getOperand(2).getValueType().isFloatingPoint())
return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
return Op;
}
+
+bool AArch64TargetLowering::isAllActivePredicate(SDValue N) const {
+ return ::isAllActivePredicate(N);
+}
return 128;
}
+ bool isAllActivePredicate(SDValue N) const;
+
private:
/// Keep a pointer to the AArch64Subtarget around so that we can
/// make the right decision when generating code for different targets.
defm LSR_ZZI : sve_int_bin_cons_shift_imm_right<0b01, "lsr", AArch64lsr_p>;
defm LSL_ZZI : sve_int_bin_cons_shift_imm_left< 0b11, "lsl", AArch64lsl_p>;
- defm ASR_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b00, "asr">;
- defm LSR_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b01, "lsr">;
- defm LSL_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b11, "lsl">;
+ defm ASR_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b00, "asr", int_aarch64_sve_asr_wide>;
+ defm LSR_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b01, "lsr", int_aarch64_sve_lsr_wide>;
+ defm LSL_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b11, "lsl", int_aarch64_sve_lsl_wide>;
// Predicated shifts
defm ASR_ZPmI : sve_int_bin_pred_shift_imm_right_dup<0b0000, "asr", "ASR_ZPZI", int_aarch64_sve_asr>;
defm UMULH_ZZZ : sve2_int_mul<0b011, "umulh", null_frag, AArch64umulh_p>;
defm PMUL_ZZZ : sve2_int_mul_single<0b001, "pmul", int_aarch64_sve_pmul>;
- // Add patterns for unpredicated version of smulh and umulh.
- def : Pat<(nxv16i8 (int_aarch64_sve_smulh (nxv16i1 (AArch64ptrue 31)), nxv16i8:$Op1, nxv16i8:$Op2)),
- (SMULH_ZZZ_B $Op1, $Op2)>;
- def : Pat<(nxv8i16 (int_aarch64_sve_smulh (nxv8i1 (AArch64ptrue 31)), nxv8i16:$Op1, nxv8i16:$Op2)),
- (SMULH_ZZZ_H $Op1, $Op2)>;
- def : Pat<(nxv4i32 (int_aarch64_sve_smulh (nxv4i1 (AArch64ptrue 31)), nxv4i32:$Op1, nxv4i32:$Op2)),
- (SMULH_ZZZ_S $Op1, $Op2)>;
- def : Pat<(nxv2i64 (int_aarch64_sve_smulh (nxv2i1 (AArch64ptrue 31)), nxv2i64:$Op1, nxv2i64:$Op2)),
- (SMULH_ZZZ_D $Op1, $Op2)>;
- def : Pat<(nxv16i8 (int_aarch64_sve_umulh (nxv16i1 (AArch64ptrue 31)), nxv16i8:$Op1, nxv16i8:$Op2)),
- (UMULH_ZZZ_B $Op1, $Op2)>;
- def : Pat<(nxv8i16 (int_aarch64_sve_umulh (nxv8i1 (AArch64ptrue 31)), nxv8i16:$Op1, nxv8i16:$Op2)),
- (UMULH_ZZZ_H $Op1, $Op2)>;
- def : Pat<(nxv4i32 (int_aarch64_sve_umulh (nxv4i1 (AArch64ptrue 31)), nxv4i32:$Op1, nxv4i32:$Op2)),
- (UMULH_ZZZ_S $Op1, $Op2)>;
- def : Pat<(nxv2i64 (int_aarch64_sve_umulh (nxv2i1 (AArch64ptrue 31)), nxv2i64:$Op1, nxv2i64:$Op2)),
- (UMULH_ZZZ_D $Op1, $Op2)>;
-
// SVE2 complex integer dot product (indexed)
defm CDOT_ZZZI : sve2_cintx_dot_by_indexed_elem<"cdot", int_aarch64_sve_cdot_lane>;
def SVEShiftImmR32 : ComplexPattern<i32, 1, "SelectSVEShiftImm<1, 32, true>", []>;
def SVEShiftImmR64 : ComplexPattern<i32, 1, "SelectSVEShiftImm<1, 64, true>", []>;
+def SVEAllActive : ComplexPattern<untyped, 0, "SelectAllActivePredicate", []>;
+
class SVEExactFPImm<string Suffix, string ValA, string ValB> : AsmOperandClass {
let Name = "SVEExactFPImmOperand" # Suffix;
let DiagnosticType = "Invalid" # Name;
: Pat<(vt (op (vt zprty:$Op1), (vt (AArch64dup (it (cpx i32:$imm, i32:$shift)))))),
(inst $Op1, i32:$imm, i32:$shift)>;
-class SVE_1_Op_Imm_Arith_Pred_Pat<ValueType vt, ValueType pt, SDPatternOperator op,
+class SVE_1_Op_Imm_Arith_All_Active<ValueType vt, ValueType pt, SDPatternOperator op,
ZPRRegOp zprty, ValueType it, ComplexPattern cpx, Instruction inst>
- : Pat<(vt (op (pt (AArch64ptrue 31)), (vt zprty:$Op1), (vt (AArch64dup (it (cpx i32:$imm)))))),
+ : Pat<(vt (op (pt (SVEAllActive)), (vt zprty:$Op1), (vt (AArch64dup (it (cpx i32:$imm)))))),
(inst $Op1, i32:$imm)>;
class SVE_1_Op_Imm_Log_Pat<ValueType vt, SDPatternOperator op, ZPRRegOp zprty,
class SVE_2_Op_Pred_All_Active<ValueType vtd, SDPatternOperator op,
ValueType pt, ValueType vt1, ValueType vt2,
Instruction inst>
-: Pat<(vtd (op (pt (AArch64ptrue 31)), vt1:$Op1, vt2:$Op2)),
+: Pat<(vtd (op (pt (SVEAllActive)), vt1:$Op1, vt2:$Op2)),
(inst $Op1, $Op2)>;
class SVE_3_Op_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1,
class SVE_Shift_DupImm_All_Active_Pat<ValueType vt, SDPatternOperator op,
ValueType pt, ValueType it,
ComplexPattern cast, Instruction inst>
-: Pat<(vt (op (pt (AArch64ptrue 31)), vt:$Rn, (vt (AArch64dup (it (cast i32:$imm)))))),
+: Pat<(vt (op (pt (SVEAllActive)), vt:$Rn, (vt (AArch64dup (it (cast i32:$imm)))))),
(inst $Rn, i32:$imm)>;
//
def _S : sve_int_arith_imm<0b10, { 0b1010, opc }, asm, ZPR32, simm8>;
def _D : sve_int_arith_imm<0b11, { 0b1010, opc }, asm, ZPR64, simm8>;
- def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv16i8, nxv16i1, op, ZPR8, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _B)>;
- def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv8i16, nxv8i1, op, ZPR16, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _H)>;
- def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv4i32, nxv4i1, op, ZPR32, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _S)>;
- def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv2i64, nxv2i1, op, ZPR64, i64, SVEArithSImmPat, !cast<Instruction>(NAME # _D)>;
+ def : SVE_1_Op_Imm_Arith_All_Active<nxv16i8, nxv16i1, op, ZPR8, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _B)>;
+ def : SVE_1_Op_Imm_Arith_All_Active<nxv8i16, nxv8i1, op, ZPR16, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _H)>;
+ def : SVE_1_Op_Imm_Arith_All_Active<nxv4i32, nxv4i1, op, ZPR32, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _S)>;
+ def : SVE_1_Op_Imm_Arith_All_Active<nxv2i64, nxv2i1, op, ZPR64, i64, SVEArithSImmPat, !cast<Instruction>(NAME # _D)>;
}
multiclass sve_int_arith_imm1_unsigned<bits<2> opc, string asm, SDPatternOperator op> {
def _S : sve_int_arith_imm<0b10, { 0b1010, opc }, asm, ZPR32, imm0_255>;
def _D : sve_int_arith_imm<0b11, { 0b1010, opc }, asm, ZPR64, imm0_255>;
- def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv16i8, nxv16i1, op, ZPR8, i32, SVEArithUImm8Pat, !cast<Instruction>(NAME # _B)>;
- def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv8i16, nxv8i1, op, ZPR16, i32, SVEArithUImm16Pat, !cast<Instruction>(NAME # _H)>;
- def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv4i32, nxv4i1, op, ZPR32, i32, SVEArithUImm32Pat, !cast<Instruction>(NAME # _S)>;
- def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv2i64, nxv2i1, op, ZPR64, i64, SVEArithUImm64Pat, !cast<Instruction>(NAME # _D)>;
+ def : SVE_1_Op_Imm_Arith_All_Active<nxv16i8, nxv16i1, op, ZPR8, i32, SVEArithUImm8Pat, !cast<Instruction>(NAME # _B)>;
+ def : SVE_1_Op_Imm_Arith_All_Active<nxv8i16, nxv8i1, op, ZPR16, i32, SVEArithUImm16Pat, !cast<Instruction>(NAME # _H)>;
+ def : SVE_1_Op_Imm_Arith_All_Active<nxv4i32, nxv4i1, op, ZPR32, i32, SVEArithUImm32Pat, !cast<Instruction>(NAME # _S)>;
+ def : SVE_1_Op_Imm_Arith_All_Active<nxv2i64, nxv2i1, op, ZPR64, i64, SVEArithUImm64Pat, !cast<Instruction>(NAME # _D)>;
}
multiclass sve_int_arith_imm2<string asm, SDPatternOperator op> {
def _S : sve_int_arith_imm<0b10, 0b110000, asm, ZPR32, simm8>;
def _D : sve_int_arith_imm<0b11, 0b110000, asm, ZPR64, simm8>;
- def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv16i8, nxv16i1, op, ZPR8, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _B)>;
- def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv8i16, nxv8i1, op, ZPR16, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _H)>;
- def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv4i32, nxv4i1, op, ZPR32, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _S)>;
- def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv2i64, nxv2i1, op, ZPR64, i64, SVEArithSImmPat, !cast<Instruction>(NAME # _D)>;
+ def : SVE_1_Op_Imm_Arith_All_Active<nxv16i8, nxv16i1, op, ZPR8, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _B)>;
+ def : SVE_1_Op_Imm_Arith_All_Active<nxv8i16, nxv8i1, op, ZPR16, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _H)>;
+ def : SVE_1_Op_Imm_Arith_All_Active<nxv4i32, nxv4i1, op, ZPR32, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _S)>;
+ def : SVE_1_Op_Imm_Arith_All_Active<nxv2i64, nxv2i1, op, ZPR64, i64, SVEArithSImmPat, !cast<Instruction>(NAME # _D)>;
}
//===----------------------------------------------------------------------===//
let Inst{4-0} = Zd;
}
-multiclass sve_int_bin_cons_shift_wide<bits<2> opc, string asm> {
+multiclass sve_int_bin_cons_shift_wide<bits<2> opc, string asm, SDPatternOperator op> {
def _B : sve_int_bin_cons_shift_wide<0b00, opc, asm, ZPR8>;
def _H : sve_int_bin_cons_shift_wide<0b01, opc, asm, ZPR16>;
def _S : sve_int_bin_cons_shift_wide<0b10, opc, asm, ZPR32>;
+
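+ // The shift amount operand of the wide forms is always an nxv2i64 vector,
+ // matching the ".d" operand of the asr/lsr/lsl (wide) instructions.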
+ def : SVE_2_Op_Pred_All_Active<nxv16i8, op, nxv16i1, nxv16i8, nxv2i64, !cast<Instruction>(NAME # _B)>;
+ def : SVE_2_Op_Pred_All_Active<nxv8i16, op, nxv8i1, nxv8i16, nxv2i64, !cast<Instruction>(NAME # _H)>;
+ def : SVE_2_Op_Pred_All_Active<nxv4i32, op, nxv4i1, nxv4i32, nxv2i64, !cast<Instruction>(NAME # _S)>;
}
class sve_int_bin_cons_shift_imm<bits<4> tsz8_64, bits<2> opc, string asm,
ret <vscale x 2 x i64> %out
}
+; As smax_i32 but where pg is i8 based and thus compatible for i32.
+define <vscale x 4 x i32> @smax_i32_ptrue_all_b(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: smax_i32_ptrue_all_b:
+; CHECK: smax z0.s, z0.s, #1
+; CHECK-NEXT: ret
+ %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+ %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+ %b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
+ %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.smax.nxv4i32(<vscale x 4 x i1> %pg.s,
+ <vscale x 4 x i32> %a,
+ <vscale x 4 x i32> %b)
+ ret <vscale x 4 x i32> %out
+}
+
+; As smax_i32 but where pg is i16 based and thus compatible for i32.
+define <vscale x 4 x i32> @smax_i32_ptrue_all_h(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: smax_i32_ptrue_all_h:
+; CHECK: smax z0.s, z0.s, #1
+; CHECK-NEXT: ret
+ %pg.h = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+ %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %pg.h)
+ %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+ %b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
+ %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.smax.nxv4i32(<vscale x 4 x i1> %pg.s,
+ <vscale x 4 x i32> %a,
+ <vscale x 4 x i32> %b)
+ ret <vscale x 4 x i32> %out
+}
+
+; As smax_i32 but where pg is i64 based, which is not compatible for i32 and
+; thus inactive lanes are important and the immediate form cannot be used.
+define <vscale x 4 x i32> @smax_i32_ptrue_all_d(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: smax_i32_ptrue_all_d:
+; CHECK-DAG: ptrue [[PG:p[0-9]+]].d
+; CHECK-DAG: mov [[DUP:z[0-9]+]].s, #1
+; CHECK-DAG: smax z0.s, [[PG]]/m, z0.s, [[DUP]].s
+; CHECK-NEXT: ret
+ %pg.d = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+ %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %pg.d)
+ %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+ %b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
+ %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.smax.nxv4i32(<vscale x 4 x i1> %pg.s,
+ <vscale x 4 x i32> %a,
+ <vscale x 4 x i32> %b)
+ ret <vscale x 4 x i32> %out
+}
; SMIN
ret <vscale x 2 x i64> %out
}
+; As smin_i32 but where pg is i8 based and thus compatible for i32.
+define <vscale x 4 x i32> @smin_i32_ptrue_all_b(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: smin_i32_ptrue_all_b:
+; CHECK: smin z0.s, z0.s, #1
+; CHECK-NEXT: ret
+ %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+ %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+ %b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
+ %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.smin.nxv4i32(<vscale x 4 x i1> %pg.s,
+ <vscale x 4 x i32> %a,
+ <vscale x 4 x i32> %b)
+ ret <vscale x 4 x i32> %out
+}
+
+; As smin_i32 but where pg is i16 based and thus compatible for i32.
+define <vscale x 4 x i32> @smin_i32_ptrue_all_h(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: smin_i32_ptrue_all_h:
+; CHECK: smin z0.s, z0.s, #1
+; CHECK-NEXT: ret
+ %pg.h = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+ %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %pg.h)
+ %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+ %b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
+ %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.smin.nxv4i32(<vscale x 4 x i1> %pg.s,
+ <vscale x 4 x i32> %a,
+ <vscale x 4 x i32> %b)
+ ret <vscale x 4 x i32> %out
+}
+
+; As smin_i32 but where pg is i64 based, which is not compatible for i32 and
+; thus inactive lanes are important and the immediate form cannot be used.
+define <vscale x 4 x i32> @smin_i32_ptrue_all_d(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: smin_i32_ptrue_all_d:
+; CHECK-DAG: ptrue [[PG:p[0-9]+]].d
+; CHECK-DAG: mov [[DUP:z[0-9]+]].s, #1
+; CHECK-DAG: smin z0.s, [[PG]]/m, z0.s, [[DUP]].s
+; CHECK-NEXT: ret
+ %pg.d = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+ %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %pg.d)
+ %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+ %b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
+ %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.smin.nxv4i32(<vscale x 4 x i1> %pg.s,
+ <vscale x 4 x i32> %a,
+ <vscale x 4 x i32> %b)
+ ret <vscale x 4 x i32> %out
+}
+
; UMAX
define <vscale x 16 x i8> @umax_i8(<vscale x 16 x i8> %a) {
ret <vscale x 2 x i64> %out
}
+; As umax_i32 but where pg is i8 based and thus compatible for i32.
+define <vscale x 4 x i32> @umax_i32_ptrue_all_b(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: umax_i32_ptrue_all_b:
+; CHECK: umax z0.s, z0.s, #1
+; CHECK-NEXT: ret
+ %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+ %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+ %b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
+ %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.umax.nxv4i32(<vscale x 4 x i1> %pg.s,
+ <vscale x 4 x i32> %a,
+ <vscale x 4 x i32> %b)
+ ret <vscale x 4 x i32> %out
+}
+
+; As umax_i32 but where pg is i16 based and thus compatible for i32.
+define <vscale x 4 x i32> @umax_i32_ptrue_all_h(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: umax_i32_ptrue_all_h:
+; CHECK: umax z0.s, z0.s, #1
+; CHECK-NEXT: ret
+ %pg.h = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+ %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %pg.h)
+ %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+ %b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
+ %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.umax.nxv4i32(<vscale x 4 x i1> %pg.s,
+ <vscale x 4 x i32> %a,
+ <vscale x 4 x i32> %b)
+ ret <vscale x 4 x i32> %out
+}
+
+; As umax_i32 but where pg is i64 based, which is not compatible for i32 and
+; thus inactive lanes are important and the immediate form cannot be used.
+define <vscale x 4 x i32> @umax_i32_ptrue_all_d(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: umax_i32_ptrue_all_d:
+; CHECK-DAG: ptrue [[PG:p[0-9]+]].d
+; CHECK-DAG: mov [[DUP:z[0-9]+]].s, #1
+; CHECK-DAG: umax z0.s, [[PG]]/m, z0.s, [[DUP]].s
+; CHECK-NEXT: ret
+ %pg.d = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+ %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %pg.d)
+ %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+ %b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
+ %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.umax.nxv4i32(<vscale x 4 x i1> %pg.s,
+ <vscale x 4 x i32> %a,
+ <vscale x 4 x i32> %b)
+ ret <vscale x 4 x i32> %out
+}
+
; UMIN
define <vscale x 16 x i8> @umin_i8(<vscale x 16 x i8> %a) {
ret <vscale x 2 x i64> %out
}
+; As umin_i32 but where pg is i8 based and thus compatible for i32.
+define <vscale x 4 x i32> @umin_i32_ptrue_all_b(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: umin_i32_ptrue_all_b:
+; CHECK: umin z0.s, z0.s, #1
+; CHECK-NEXT: ret
+ %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+ %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+ %b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
+ %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.umin.nxv4i32(<vscale x 4 x i1> %pg.s,
+ <vscale x 4 x i32> %a,
+ <vscale x 4 x i32> %b)
+ ret <vscale x 4 x i32> %out
+}
+
+; As umin_i32 but where pg is i16 based and thus compatible for i32.
+define <vscale x 4 x i32> @umin_i32_ptrue_all_h(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: umin_i32_ptrue_all_h:
+; CHECK: umin z0.s, z0.s, #1
+; CHECK-NEXT: ret
+ %pg.h = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+ %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %pg.h)
+ %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+ %b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
+ %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.umin.nxv4i32(<vscale x 4 x i1> %pg.s,
+ <vscale x 4 x i32> %a,
+ <vscale x 4 x i32> %b)
+ ret <vscale x 4 x i32> %out
+}
+
+; As umin_i32 but where pg is i64 based, which is not compatible for i32 and
+; thus inactive lanes are important and the immediate form cannot be used.
+define <vscale x 4 x i32> @umin_i32_ptrue_all_d(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: umin_i32_ptrue_all_d:
+; CHECK-DAG: ptrue [[PG:p[0-9]+]].d
+; CHECK-DAG: mov [[DUP:z[0-9]+]].s, #1
+; CHECK-DAG: umin z0.s, [[PG]]/m, z0.s, [[DUP]].s
+; CHECK-NEXT: ret
+ %pg.d = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+ %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %pg.d)
+ %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+ %b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
+ %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.umin.nxv4i32(<vscale x 4 x i1> %pg.s,
+ <vscale x 4 x i32> %a,
+ <vscale x 4 x i32> %b)
+ ret <vscale x 4 x i32> %out
+}
+
; SQADD
define <vscale x 16 x i8> @sqadd_b_lowimm(<vscale x 16 x i8> %a) {
ret <vscale x 4 x i32> %out
}
+define <vscale x 4 x i32> @uqadd_s_highimm(<vscale x 4 x i32> %a) {
+; CHECK-LABEL: uqadd_s_highimm:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uqadd z0.s, z0.s, #8192 // =0x2000
+; CHECK-NEXT: ret
+ %elt = insertelement <vscale x 4 x i32> undef, i32 8192, i32 0
+ %splat = shufflevector <vscale x 4 x i32> %elt, <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
+ %out = call <vscale x 4 x i32> @llvm.aarch64.sve.uqadd.x.nxv4i32(<vscale x 4 x i32> %a,
+ <vscale x 4 x i32> %splat)
+ ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @uqadd_d_lowimm(<vscale x 2 x i64> %a) {
+; CHECK-LABEL: uqadd_d_lowimm:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uqadd z0.d, z0.d, #255 // =0xff
+; CHECK-NEXT: ret
+ %elt = insertelement <vscale x 2 x i64> undef, i64 255, i32 0
+ %splat = shufflevector <vscale x 2 x i64> %elt, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
+ %out = call <vscale x 2 x i64> @llvm.aarch64.sve.uqadd.x.nxv2i64(<vscale x 2 x i64> %a,
+ <vscale x 2 x i64> %splat)
+ ret <vscale x 2 x i64> %out
+}
+
+define <vscale x 2 x i64> @uqadd_d_highimm(<vscale x 2 x i64> %a) {
+; CHECK-LABEL: uqadd_d_highimm:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uqadd z0.d, z0.d, #65280 // =0xff00
+; CHECK-NEXT: ret
+ %elt = insertelement <vscale x 2 x i64> undef, i64 65280, i32 0
+ %splat = shufflevector <vscale x 2 x i64> %elt, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
+ %out = call <vscale x 2 x i64> @llvm.aarch64.sve.uqadd.x.nxv2i64(<vscale x 2 x i64> %a,
+ <vscale x 2 x i64> %splat)
+ ret <vscale x 2 x i64> %out
+}
+
; UQSUB
define <vscale x 16 x i8> @uqsub_b_lowimm(<vscale x 16 x i8> %a) {
ret <vscale x 2 x i64> %out
}
-
-define <vscale x 4 x i32> @uqadd_s_highimm(<vscale x 4 x i32> %a) {
-; CHECK-LABEL: uqadd_s_highimm:
-; CHECK: // %bb.0:
-; CHECK-NEXT: uqadd z0.s, z0.s, #8192 // =0x2000
-; CHECK-NEXT: ret
- %elt = insertelement <vscale x 4 x i32> undef, i32 8192, i32 0
- %splat = shufflevector <vscale x 4 x i32> %elt, <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
- %out = call <vscale x 4 x i32> @llvm.aarch64.sve.uqadd.x.nxv4i32(<vscale x 4 x i32> %a,
- <vscale x 4 x i32> %splat)
- ret <vscale x 4 x i32> %out
-}
-
-define <vscale x 2 x i64> @uqadd_d_lowimm(<vscale x 2 x i64> %a) {
-; CHECK-LABEL: uqadd_d_lowimm:
-; CHECK: // %bb.0:
-; CHECK-NEXT: uqadd z0.d, z0.d, #255 // =0xff
-; CHECK-NEXT: ret
- %elt = insertelement <vscale x 2 x i64> undef, i64 255, i32 0
- %splat = shufflevector <vscale x 2 x i64> %elt, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
- %out = call <vscale x 2 x i64> @llvm.aarch64.sve.uqadd.x.nxv2i64(<vscale x 2 x i64> %a,
- <vscale x 2 x i64> %splat)
- ret <vscale x 2 x i64> %out
-}
-
-define <vscale x 2 x i64> @uqadd_d_highimm(<vscale x 2 x i64> %a) {
-; CHECK-LABEL: uqadd_d_highimm:
-; CHECK: // %bb.0:
-; CHECK-NEXT: uqadd z0.d, z0.d, #65280 // =0xff00
-; CHECK-NEXT: ret
- %elt = insertelement <vscale x 2 x i64> undef, i64 65280, i32 0
- %splat = shufflevector <vscale x 2 x i64> %elt, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
- %out = call <vscale x 2 x i64> @llvm.aarch64.sve.uqadd.x.nxv2i64(<vscale x 2 x i64> %a,
- <vscale x 2 x i64> %splat)
- ret <vscale x 2 x i64> %out
-}
-
; ASR
define <vscale x 16 x i8> @asr_i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a) {
ret <vscale x 2 x i64> %out
}
+; As lsr_i32 but where pg is i8 based and thus compatible for i32.
+define <vscale x 4 x i32> @lsr_i32_ptrue_all_b(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: lsr_i32_ptrue_all_b:
+; CHECK: lsr z0.s, z0.s, #1
+; CHECK-NEXT: ret
+ %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+ %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+ %b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
+ %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.lsr.nxv4i32(<vscale x 4 x i1> %pg.s,
+ <vscale x 4 x i32> %a,
+ <vscale x 4 x i32> %b)
+ ret <vscale x 4 x i32> %out
+}
+
+; As lsr_i32 but where pg is i16 based and thus compatible for i32.
+define <vscale x 4 x i32> @lsr_i32_ptrue_all_h(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: lsr_i32_ptrue_all_h:
+; CHECK: lsr z0.s, z0.s, #1
+; CHECK-NEXT: ret
+ %pg.h = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+ %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %pg.h)
+ %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+ %b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
+ %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.lsr.nxv4i32(<vscale x 4 x i1> %pg.s,
+ <vscale x 4 x i32> %a,
+ <vscale x 4 x i32> %b)
+ ret <vscale x 4 x i32> %out
+}
+
+; As lsr_i32 but where pg is i64 based, which is not compatible for i32 and
+; thus inactive lanes are important and the unpredicated form cannot be used.
+define <vscale x 4 x i32> @lsr_i32_ptrue_all_d(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: lsr_i32_ptrue_all_d:
+; CHECK-DAG: ptrue [[PG:p[0-9]+]].d
+; CHECK-DAG: lsr z0.s, [[PG]]/m, z0.s, #1
+; CHECK-NEXT: ret
+ %pg.d = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+ %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %pg.d)
+ %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+ %b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
+ %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.lsr.nxv4i32(<vscale x 4 x i1> %pg.s,
+ <vscale x 4 x i32> %a,
+ <vscale x 4 x i32> %b)
+ ret <vscale x 4 x i32> %out
+}
+
+;
+; MUL
+;
+
+; As mul_i32 but where pg is i8 based and thus compatible for i32.
+define <vscale x 4 x i32> @mul_i32_ptrue_all_b(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: mul_i32_ptrue_all_b:
+; CHECK: mul z0.s, z0.s, #1
+; CHECK-NEXT: ret
+ %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+ %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+ %b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
+ %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1> %pg.s,
+ <vscale x 4 x i32> %a,
+ <vscale x 4 x i32> %b)
+ ret <vscale x 4 x i32> %out
+}
+
+; As mul_i32 but where pg is i16 based and thus compatible for i32.
+define <vscale x 4 x i32> @mul_i32_ptrue_all_h(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: mul_i32_ptrue_all_h:
+; CHECK: mul z0.s, z0.s, #1
+; CHECK-NEXT: ret
+ %pg.h = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+ %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %pg.h)
+ %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+ %b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
+ %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1> %pg.s,
+ <vscale x 4 x i32> %a,
+ <vscale x 4 x i32> %b)
+ ret <vscale x 4 x i32> %out
+}
+
+; As mul_i32 but where pg is i64 based, which is not compatible for i32 and
+; thus inactive lanes are important and the immediate form cannot be used.
+define <vscale x 4 x i32> @mul_i32_ptrue_all_d(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: mul_i32_ptrue_all_d:
+; CHECK-DAG: ptrue [[PG:p[0-9]+]].d
+; CHECK-DAG: mov [[DUP:z[0-9]+]].s, #1
+; CHECK-DAG: mul z0.s, [[PG]]/m, z0.s, [[DUP]].s
+; CHECK-NEXT: ret
+ %pg.d = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+ %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %pg.d)
+ %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+ %b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
+ %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1> %pg.s,
+ <vscale x 4 x i32> %a,
+ <vscale x 4 x i32> %b)
+ ret <vscale x 4 x i32> %out
+}
+
declare <vscale x 16 x i8> @llvm.aarch64.sve.sqadd.x.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>)
declare <vscale x 8 x i16> @llvm.aarch64.sve.sqadd.x.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>)
declare <vscale x 4 x i32> @llvm.aarch64.sve.sqadd.x.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
declare <vscale x 4 x i32> @llvm.aarch64.sve.lsr.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)
declare <vscale x 2 x i64> @llvm.aarch64.sve.lsr.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, <vscale x 2 x i64>)
+declare <vscale x 16 x i8> @llvm.aarch64.sve.mul.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>, <vscale x 16 x i8>)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, <vscale x 2 x i64>)
+
+declare <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1>)
+declare <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1>)
+declare <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1>)
+
+declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1>)
+declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1>)
+declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1>)
+
+declare <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32)
+
declare <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 %pattern)
declare <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 %pattern)
declare <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 %pattern)
--- /dev/null
+; RUN: llc < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+;
+; MUL
+;
+
+define <vscale x 16 x i8> @mul_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) #0 {
+; CHECK-LABEL: mul_i8:
+; CHECK: mul z0.b, z0.b, z1.b
+; CHECK-NEXT: ret
+ %pg = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+ %out = call <vscale x 16 x i8> @llvm.aarch64.sve.mul.nxv16i8(<vscale x 16 x i1> %pg,
+ <vscale x 16 x i8> %a,
+ <vscale x 16 x i8> %b)
+ ret <vscale x 16 x i8> %out
+}
+
+define <vscale x 8 x i16> @mul_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) #0 {
+; CHECK-LABEL: mul_i16:
+; CHECK: mul z0.h, z0.h, z1.h
+; CHECK-NEXT: ret
+ %pg = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+ %out = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1> %pg,
+ <vscale x 8 x i16> %a,
+ <vscale x 8 x i16> %b)
+ ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @mul_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #0 {
+; CHECK-LABEL: mul_i32:
+; CHECK: mul z0.s, z0.s, z1.s
+; CHECK-NEXT: ret
+ %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+ %out = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1> %pg,
+ <vscale x 4 x i32> %a,
+ <vscale x 4 x i32> %b)
+ ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @mul_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) #0 {
+; CHECK-LABEL: mul_i64:
+; CHECK: mul z0.d, z0.d, z1.d
+; CHECK-NEXT: ret
+ %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+ %out = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %a,
+ <vscale x 2 x i64> %b)
+ ret <vscale x 2 x i64> %out
+}
+
+;
+; SMULH
+;
+
+define <vscale x 16 x i8> @smulh_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) #0 {
+; CHECK-LABEL: smulh_i8:
+; CHECK: smulh z0.b, z0.b, z1.b
+; CHECK-NEXT: ret
+ %pg = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+ %out = call <vscale x 16 x i8> @llvm.aarch64.sve.smulh.nxv16i8(<vscale x 16 x i1> %pg,
+ <vscale x 16 x i8> %a,
+ <vscale x 16 x i8> %b)
+ ret <vscale x 16 x i8> %out
+}
+
+define <vscale x 8 x i16> @smulh_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) #0 {
+; CHECK-LABEL: smulh_i16:
+; CHECK: smulh z0.h, z0.h, z1.h
+; CHECK-NEXT: ret
+ %pg = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+ %out = call <vscale x 8 x i16> @llvm.aarch64.sve.smulh.nxv8i16(<vscale x 8 x i1> %pg,
+ <vscale x 8 x i16> %a,
+ <vscale x 8 x i16> %b)
+ ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @smulh_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #0 {
+; CHECK-LABEL: smulh_i32:
+; CHECK: smulh z0.s, z0.s, z1.s
+; CHECK-NEXT: ret
+ %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+ %out = call <vscale x 4 x i32> @llvm.aarch64.sve.smulh.nxv4i32(<vscale x 4 x i1> %pg,
+ <vscale x 4 x i32> %a,
+ <vscale x 4 x i32> %b)
+ ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @smulh_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) #0 {
+; CHECK-LABEL: smulh_i64:
+; CHECK: smulh z0.d, z0.d, z1.d
+; CHECK-NEXT: ret
+ %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+ %out = call <vscale x 2 x i64> @llvm.aarch64.sve.smulh.nxv2i64(<vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %a,
+ <vscale x 2 x i64> %b)
+ ret <vscale x 2 x i64> %out
+}
+
+;
+; UMULH
+;
+
+define <vscale x 16 x i8> @umulh_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) #0 {
+; CHECK-LABEL: umulh_i8:
+; CHECK: umulh z0.b, z0.b, z1.b
+; CHECK-NEXT: ret
+ %pg = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+ %out = call <vscale x 16 x i8> @llvm.aarch64.sve.umulh.nxv16i8(<vscale x 16 x i1> %pg,
+ <vscale x 16 x i8> %a,
+ <vscale x 16 x i8> %b)
+ ret <vscale x 16 x i8> %out
+}
+
+define <vscale x 8 x i16> @umulh_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) #0 {
+; CHECK-LABEL: umulh_i16:
+; CHECK: umulh z0.h, z0.h, z1.h
+; CHECK-NEXT: ret
+ %pg = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+ %out = call <vscale x 8 x i16> @llvm.aarch64.sve.umulh.nxv8i16(<vscale x 8 x i1> %pg,
+ <vscale x 8 x i16> %a,
+ <vscale x 8 x i16> %b)
+ ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @umulh_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #0 {
+; CHECK-LABEL: umulh_i32:
+; CHECK: umulh z0.s, z0.s, z1.s
+; CHECK-NEXT: ret
+ %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+ %out = call <vscale x 4 x i32> @llvm.aarch64.sve.umulh.nxv4i32(<vscale x 4 x i1> %pg,
+ <vscale x 4 x i32> %a,
+ <vscale x 4 x i32> %b)
+ ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @umulh_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) #0 {
+; CHECK-LABEL: umulh_i64:
+; CHECK: umulh z0.d, z0.d, z1.d
+; CHECK-NEXT: ret
+ %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+ %out = call <vscale x 2 x i64> @llvm.aarch64.sve.umulh.nxv2i64(<vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %a,
+ <vscale x 2 x i64> %b)
+ ret <vscale x 2 x i64> %out
+}
+
+; As umulh_i32 but where pg is i8 based and thus compatible for i32.
+define <vscale x 4 x i32> @umulh_i32_ptrue_all_b(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #0 {
+; CHECK-LABEL: umulh_i32_ptrue_all_b:
+; CHECK: umulh z0.s, z0.s, z1.s
+; CHECK-NEXT: ret
+ %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+ %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+ %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.umulh.nxv4i32(<vscale x 4 x i1> %pg.s,
+ <vscale x 4 x i32> %a,
+ <vscale x 4 x i32> %b)
+ ret <vscale x 4 x i32> %out
+}
+
+; As umulh_i32 but where pg is i16 based and thus compatible for i32.
+define <vscale x 4 x i32> @umulh_i32_ptrue_all_h(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #0 {
+; CHECK-LABEL: umulh_i32_ptrue_all_h:
+; CHECK: umulh z0.s, z0.s, z1.s
+; CHECK-NEXT: ret
+ %pg.h = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+ %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %pg.h)
+ %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+ %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.umulh.nxv4i32(<vscale x 4 x i1> %pg.s,
+ <vscale x 4 x i32> %a,
+ <vscale x 4 x i32> %b)
+ ret <vscale x 4 x i32> %out
+}
+
+; As umulh_i32 but where pg is i64 based, which is not compatible for i32 and
+; thus inactive lanes are important and the unpredicated form cannot be used.
+define <vscale x 4 x i32> @umulh_i32_ptrue_all_d(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #0 {
+; CHECK-LABEL: umulh_i32_ptrue_all_d:
+; CHECK-DAG: ptrue [[PG:p[0-9]+]].d
+; CHECK-DAG: umulh z0.s, [[PG]]/m, z0.s, z1.s
+; CHECK-NEXT: ret
+ %pg.d = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+ %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %pg.d)
+ %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+ %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.umulh.nxv4i32(<vscale x 4 x i1> %pg.s,
+ <vscale x 4 x i32> %a,
+ <vscale x 4 x i32> %b)
+ ret <vscale x 4 x i32> %out
+}
+
+;
+; ASR (wide)
+;
+
+define <vscale x 16 x i8> @asr_i8(<vscale x 16 x i8> %a, <vscale x 2 x i64> %b) #0 {
+; CHECK-LABEL: asr_i8:
+; CHECK: asr z0.b, z0.b, z1.d
+; CHECK-NEXT: ret
+ %pg = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+ %out = call <vscale x 16 x i8> @llvm.aarch64.sve.asr.wide.nxv16i8(<vscale x 16 x i1> %pg,
+ <vscale x 16 x i8> %a,
+ <vscale x 2 x i64> %b)
+ ret <vscale x 16 x i8> %out
+}
+
+define <vscale x 8 x i16> @asr_i16(<vscale x 8 x i16> %a, <vscale x 2 x i64> %b) #0 {
+; CHECK-LABEL: asr_i16:
+; CHECK: asr z0.h, z0.h, z1.d
+; CHECK-NEXT: ret
+ %pg = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+ %out = call <vscale x 8 x i16> @llvm.aarch64.sve.asr.wide.nxv8i16(<vscale x 8 x i1> %pg,
+ <vscale x 8 x i16> %a,
+ <vscale x 2 x i64> %b)
+ ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @asr_i32(<vscale x 4 x i32> %a, <vscale x 2 x i64> %b) #0 {
+; CHECK-LABEL: asr_i32:
+; CHECK: asr z0.s, z0.s, z1.d
+; CHECK-NEXT: ret
+ %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+ %out = call <vscale x 4 x i32> @llvm.aarch64.sve.asr.wide.nxv4i32(<vscale x 4 x i1> %pg,
+ <vscale x 4 x i32> %a,
+ <vscale x 2 x i64> %b)
+ ret <vscale x 4 x i32> %out
+}
+
+;
+; LSL (wide)
+;
+
+define <vscale x 16 x i8> @lsl_i8(<vscale x 16 x i8> %a, <vscale x 2 x i64> %b) #0 {
+; CHECK-LABEL: lsl_i8:
+; CHECK: lsl z0.b, z0.b, z1.d
+; CHECK-NEXT: ret
+ %pg = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+ %out = call <vscale x 16 x i8> @llvm.aarch64.sve.lsl.wide.nxv16i8(<vscale x 16 x i1> %pg,
+ <vscale x 16 x i8> %a,
+ <vscale x 2 x i64> %b)
+ ret <vscale x 16 x i8> %out
+}
+
+define <vscale x 8 x i16> @lsl_i16(<vscale x 8 x i16> %a, <vscale x 2 x i64> %b) #0 {
+; CHECK-LABEL: lsl_i16:
+; CHECK: lsl z0.h, z0.h, z1.d
+; CHECK-NEXT: ret
+ %pg = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+ %out = call <vscale x 8 x i16> @llvm.aarch64.sve.lsl.wide.nxv8i16(<vscale x 8 x i1> %pg,
+ <vscale x 8 x i16> %a,
+ <vscale x 2 x i64> %b)
+ ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @lsl_i32(<vscale x 4 x i32> %a, <vscale x 2 x i64> %b) #0 {
+; CHECK-LABEL: lsl_i32:
+; CHECK: lsl z0.s, z0.s, z1.d
+; CHECK-NEXT: ret
+ %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+ %out = call <vscale x 4 x i32> @llvm.aarch64.sve.lsl.wide.nxv4i32(<vscale x 4 x i1> %pg,
+ <vscale x 4 x i32> %a,
+ <vscale x 2 x i64> %b)
+ ret <vscale x 4 x i32> %out
+}
+
+;
+; LSR (wide)
+;
+
+define <vscale x 16 x i8> @lsr_i8(<vscale x 16 x i8> %a, <vscale x 2 x i64> %b) #0 {
+; CHECK-LABEL: lsr_i8:
+; CHECK: lsr z0.b, z0.b, z1.d
+; CHECK-NEXT: ret
+ %pg = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+ %out = call <vscale x 16 x i8> @llvm.aarch64.sve.lsr.wide.nxv16i8(<vscale x 16 x i1> %pg,
+ <vscale x 16 x i8> %a,
+ <vscale x 2 x i64> %b)
+ ret <vscale x 16 x i8> %out
+}
+
+define <vscale x 8 x i16> @lsr_i16(<vscale x 8 x i16> %a, <vscale x 2 x i64> %b) #0 {
+; CHECK-LABEL: lsr_i16:
+; CHECK: lsr z0.h, z0.h, z1.d
+; CHECK-NEXT: ret
+ %pg = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+ %out = call <vscale x 8 x i16> @llvm.aarch64.sve.lsr.wide.nxv8i16(<vscale x 8 x i1> %pg,
+ <vscale x 8 x i16> %a,
+ <vscale x 2 x i64> %b)
+ ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @lsr_i32(<vscale x 4 x i32> %a, <vscale x 2 x i64> %b) #0 {
+; CHECK-LABEL: lsr_i32:
+; CHECK: lsr z0.s, z0.s, z1.d
+; CHECK-NEXT: ret
+ %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+ %out = call <vscale x 4 x i32> @llvm.aarch64.sve.lsr.wide.nxv4i32(<vscale x 4 x i1> %pg,
+ <vscale x 4 x i32> %a,
+ <vscale x 2 x i64> %b)
+ ret <vscale x 4 x i32> %out
+}
+
+; As lsr_i32 but where pg is i8 based and thus compatible for i32.
+define <vscale x 4 x i32> @lsr_i32_ptrue_all_b(<vscale x 4 x i32> %a, <vscale x 2 x i64> %b) #0 {
+; CHECK-LABEL: lsr_i32_ptrue_all_b:
+; CHECK: lsr z0.s, z0.s, z1.d
+; CHECK-NEXT: ret
+ %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+ %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+ %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.lsr.wide.nxv4i32(<vscale x 4 x i1> %pg.s,
+ <vscale x 4 x i32> %a,
+ <vscale x 2 x i64> %b)
+ ret <vscale x 4 x i32> %out
+}
+
+; As lsr_i32 but where pg is i16 based and thus compatible for i32.
+define <vscale x 4 x i32> @lsr_i32_ptrue_all_h(<vscale x 4 x i32> %a, <vscale x 2 x i64> %b) #0 {
+; CHECK-LABEL: lsr_i32_ptrue_all_h:
+; CHECK: lsr z0.s, z0.s, z1.d
+; CHECK-NEXT: ret
+ %pg.h = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+ %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %pg.h)
+ %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+ %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.lsr.wide.nxv4i32(<vscale x 4 x i1> %pg.s,
+ <vscale x 4 x i32> %a,
+ <vscale x 2 x i64> %b)
+ ret <vscale x 4 x i32> %out
+}
+
+; As lsr_i32 but where pg is i64 based, which is not compatible for i32 and
+; thus inactive lanes are important and the unpredicated form cannot be used.
+define <vscale x 4 x i32> @lsr_i32_ptrue_all_d(<vscale x 4 x i32> %a, <vscale x 2 x i64> %b) #0 {
+; CHECK-LABEL: lsr_i32_ptrue_all_d:
+; CHECK-DAG: ptrue [[PG:p[0-9]+]].d
+; CHECK-DAG: lsr z0.s, [[PG]]/m, z0.s, z1.d
+; CHECK-NEXT: ret
+ %pg.d = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+ %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %pg.d)
+ %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+ %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.lsr.wide.nxv4i32(<vscale x 4 x i1> %pg.s,
+ <vscale x 4 x i32> %a,
+ <vscale x 2 x i64> %b)
+ ret <vscale x 4 x i32> %out
+}
+
+;
+; FADD
+;
+
+define <vscale x 8 x half> @fadd_half(<vscale x 8 x half> %a, <vscale x 8 x half> %b) #0 {
+; CHECK-LABEL: fadd_half:
+; CHECK: fadd z0.h, z0.h, z1.h
+; CHECK-NEXT: ret
+ %pg = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+ %out = call <vscale x 8 x half> @llvm.aarch64.sve.fadd.nxv8f16(<vscale x 8 x i1> %pg,
+ <vscale x 8 x half> %a,
+ <vscale x 8 x half> %b)
+ ret <vscale x 8 x half> %out
+}
+
+define <vscale x 4 x float> @fadd_float(<vscale x 4 x float> %a, <vscale x 4 x float> %b) #0 {
+; CHECK-LABEL: fadd_float:
+; CHECK: fadd z0.s, z0.s, z1.s
+; CHECK-NEXT: ret
+ %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+ %out = call <vscale x 4 x float> @llvm.aarch64.sve.fadd.nxv4f32(<vscale x 4 x i1> %pg,
+ <vscale x 4 x float> %a,
+ <vscale x 4 x float> %b)
+ ret <vscale x 4 x float> %out
+}
+
+define <vscale x 2 x double> @fadd_double(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #0 {
+; CHECK-LABEL: fadd_double:
+; CHECK: fadd z0.d, z0.d, z1.d
+; CHECK-NEXT: ret
+ %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+ %out = call <vscale x 2 x double> @llvm.aarch64.sve.fadd.nxv2f64(<vscale x 2 x i1> %pg,
+ <vscale x 2 x double> %a,
+ <vscale x 2 x double> %b)
+ ret <vscale x 2 x double> %out
+}
+
+;
+; FSUB
+;
+
+define <vscale x 8 x half> @fsub_half(<vscale x 8 x half> %a, <vscale x 8 x half> %b) #0 {
+; CHECK-LABEL: fsub_half:
+; CHECK: fsub z0.h, z0.h, z1.h
+; CHECK-NEXT: ret
+ %pg = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+ %out = call <vscale x 8 x half> @llvm.aarch64.sve.fsub.nxv8f16(<vscale x 8 x i1> %pg,
+ <vscale x 8 x half> %a,
+ <vscale x 8 x half> %b)
+ ret <vscale x 8 x half> %out
+}
+
+define <vscale x 4 x float> @fsub_float(<vscale x 4 x float> %a, <vscale x 4 x float> %b) #0 {
+; CHECK-LABEL: fsub_float:
+; CHECK: fsub z0.s, z0.s, z1.s
+; CHECK-NEXT: ret
+ %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+ %out = call <vscale x 4 x float> @llvm.aarch64.sve.fsub.nxv4f32(<vscale x 4 x i1> %pg,
+ <vscale x 4 x float> %a,
+ <vscale x 4 x float> %b)
+ ret <vscale x 4 x float> %out
+}
+
+define <vscale x 2 x double> @fsub_double(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #0 {
+; CHECK-LABEL: fsub_double:
+; CHECK: fsub z0.d, z0.d, z1.d
+; CHECK-NEXT: ret
+ %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+ %out = call <vscale x 2 x double> @llvm.aarch64.sve.fsub.nxv2f64(<vscale x 2 x i1> %pg,
+ <vscale x 2 x double> %a,
+ <vscale x 2 x double> %b)
+ ret <vscale x 2 x double> %out
+}
+
+;
+; FMUL
+;
+
+define <vscale x 8 x half> @fmul_half(<vscale x 8 x half> %a, <vscale x 8 x half> %b) #0 {
+; CHECK-LABEL: fmul_half:
+; CHECK: fmul z0.h, z0.h, z1.h
+; CHECK-NEXT: ret
+ %pg = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+ %out = call <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1> %pg,
+ <vscale x 8 x half> %a,
+ <vscale x 8 x half> %b)
+ ret <vscale x 8 x half> %out
+}
+
+define <vscale x 4 x float> @fmul_float(<vscale x 4 x float> %a, <vscale x 4 x float> %b) #0 {
+; CHECK-LABEL: fmul_float:
+; CHECK: fmul z0.s, z0.s, z1.s
+; CHECK-NEXT: ret
+ %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+ %out = call <vscale x 4 x float> @llvm.aarch64.sve.fmul.nxv4f32(<vscale x 4 x i1> %pg,
+ <vscale x 4 x float> %a,
+ <vscale x 4 x float> %b)
+ ret <vscale x 4 x float> %out
+}
+
+define <vscale x 2 x double> @fmul_double(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #0 {
+; CHECK-LABEL: fmul_double:
+; CHECK: fmul z0.d, z0.d, z1.d
+; CHECK-NEXT: ret
+ %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+ %out = call <vscale x 2 x double> @llvm.aarch64.sve.fmul.nxv2f64(<vscale x 2 x i1> %pg,
+ <vscale x 2 x double> %a,
+ <vscale x 2 x double> %b)
+ ret <vscale x 2 x double> %out
+}
+
+declare <vscale x 16 x i8> @llvm.aarch64.sve.mul.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>, <vscale x 16 x i8>)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, <vscale x 2 x i64>)
+
+declare <vscale x 16 x i8> @llvm.aarch64.sve.smulh.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>, <vscale x 16 x i8>)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.smulh.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.smulh.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.smulh.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, <vscale x 2 x i64>)
+
+declare <vscale x 16 x i8> @llvm.aarch64.sve.umulh.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>, <vscale x 16 x i8>)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.umulh.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.umulh.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.umulh.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, <vscale x 2 x i64>)
+
+declare <vscale x 16 x i8> @llvm.aarch64.sve.asr.wide.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>, <vscale x 2 x i64>)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.asr.wide.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 2 x i64>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.asr.wide.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 2 x i64>)
+
+declare <vscale x 16 x i8> @llvm.aarch64.sve.lsl.wide.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>, <vscale x 2 x i64>)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.lsl.wide.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 2 x i64>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.lsl.wide.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 2 x i64>)
+
+declare <vscale x 16 x i8> @llvm.aarch64.sve.lsr.wide.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>, <vscale x 2 x i64>)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.lsr.wide.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 2 x i64>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.lsr.wide.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 2 x i64>)
+
+declare <vscale x 8 x half> @llvm.aarch64.sve.fadd.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
+declare <vscale x 4 x float> @llvm.aarch64.sve.fadd.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.aarch64.sve.fadd.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
+
+declare <vscale x 8 x half> @llvm.aarch64.sve.fsub.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
+declare <vscale x 4 x float> @llvm.aarch64.sve.fsub.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.aarch64.sve.fsub.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
+
+declare <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
+declare <vscale x 4 x float> @llvm.aarch64.sve.fmul.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.aarch64.sve.fmul.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
+
+declare <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1>)
+declare <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1>)
+declare <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1>)
+
+declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1>)
+declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1>)
+declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1>)
+
+declare <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32)
+declare <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32)
+declare <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32)
+declare <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32)
+
+attributes #0 = { "target-features"="+sve2" }