defm FCADD_ZPmZ : sve_fp_fcadd<"fcadd", int_aarch64_sve_fcadd>;
defm FCMLA_ZPmZZ : sve_fp_fcmla<"fcmla", int_aarch64_sve_fcmla>;
- defm FMLA_ZPmZZ : sve_fp_3op_p_zds_a<0b00, "fmla", int_aarch64_sve_fmla>;
- defm FMLS_ZPmZZ : sve_fp_3op_p_zds_a<0b01, "fmls", int_aarch64_sve_fmls>;
- defm FNMLA_ZPmZZ : sve_fp_3op_p_zds_a<0b10, "fnmla", int_aarch64_sve_fnmla>;
- defm FNMLS_ZPmZZ : sve_fp_3op_p_zds_a<0b11, "fnmls", int_aarch64_sve_fnmls>;
-
- defm FMAD_ZPmZZ : sve_fp_3op_p_zds_b<0b00, "fmad", int_aarch64_sve_fmad>;
- defm FMSB_ZPmZZ : sve_fp_3op_p_zds_b<0b01, "fmsb", int_aarch64_sve_fmsb>;
- defm FNMAD_ZPmZZ : sve_fp_3op_p_zds_b<0b10, "fnmad", int_aarch64_sve_fnmad>;
- defm FNMSB_ZPmZZ : sve_fp_3op_p_zds_b<0b11, "fnmsb", int_aarch64_sve_fnmsb>;
-
- // Add patterns for FMA where disabled lanes are undef.
- // FIXME: Implement a pseudo so we can choose a better instruction after
- // regalloc.
- def : Pat<(nxv8f16 (AArch64fma_p nxv8i1:$P, nxv8f16:$Op1, nxv8f16:$Op2, nxv8f16:$Op3)),
- (FMLA_ZPmZZ_H $P, $Op3, $Op1, $Op2)>;
- def : Pat<(nxv4f16 (AArch64fma_p nxv4i1:$P, nxv4f16:$Op1, nxv4f16:$Op2, nxv4f16:$Op3)),
- (FMLA_ZPmZZ_H $P, $Op3, $Op1, $Op2)>;
- def : Pat<(nxv2f16 (AArch64fma_p nxv2i1:$P, nxv2f16:$Op1, nxv2f16:$Op2, nxv2f16:$Op3)),
- (FMLA_ZPmZZ_H $P, $Op3, $Op1, $Op2)>;
- def : Pat<(nxv4f32 (AArch64fma_p nxv4i1:$P, nxv4f32:$Op1, nxv4f32:$Op2, nxv4f32:$Op3)),
- (FMLA_ZPmZZ_S $P, $Op3, $Op1, $Op2)>;
- def : Pat<(nxv2f32 (AArch64fma_p nxv2i1:$P, nxv2f32:$Op1, nxv2f32:$Op2, nxv2f32:$Op3)),
- (FMLA_ZPmZZ_S $P, $Op3, $Op1, $Op2)>;
- def : Pat<(nxv2f64 (AArch64fma_p nxv2i1:$P, nxv2f64:$Op1, nxv2f64:$Op2, nxv2f64:$Op3)),
- (FMLA_ZPmZZ_D $P, $Op3, $Op1, $Op2)>;
+ // Merging (ZPmZZ) FP multiply-accumulate instructions. Each "_a" defm is
+ // also given the name prefix of its unpredicated pseudo ("*_ZPZZZ") and
+ // the name of its multiplicand-destructive counterpart (e.g. FMAD for
+ // FMLA), so the operand-commuted form can be selected when profitable.
+ defm FMLA_ZPmZZ : sve_fp_3op_p_zds_a<0b00, "fmla", "FMLA_ZPZZZ", int_aarch64_sve_fmla, "FMAD_ZPmZZ">;
+ defm FMLS_ZPmZZ : sve_fp_3op_p_zds_a<0b01, "fmls", "FMLS_ZPZZZ", int_aarch64_sve_fmls, "FMSB_ZPmZZ">;
+ defm FNMLA_ZPmZZ : sve_fp_3op_p_zds_a<0b10, "fnmla", "FNMLA_ZPZZZ", int_aarch64_sve_fnmla, "FNMAD_ZPmZZ">;
+ defm FNMLS_ZPmZZ : sve_fp_3op_p_zds_a<0b11, "fnmls", "FNMLS_ZPZZZ", int_aarch64_sve_fnmls, "FNMSB_ZPmZZ">;
+
+ // Multiplicand-destructive forms; isReverseInstr marks each as the
+ // operand-swapped twin of the accumulator-destructive instruction it
+ // names (FMLA/FMLS/FNMLA/FNMLS above).
+ defm FMAD_ZPmZZ : sve_fp_3op_p_zds_b<0b00, "fmad", int_aarch64_sve_fmad, "FMLA_ZPmZZ", /*isReverseInstr*/ 1>;
+ defm FMSB_ZPmZZ : sve_fp_3op_p_zds_b<0b01, "fmsb", int_aarch64_sve_fmsb, "FMLS_ZPmZZ", /*isReverseInstr*/ 1>;
+ defm FNMAD_ZPmZZ : sve_fp_3op_p_zds_b<0b10, "fnmad", int_aarch64_sve_fnmad, "FNMLA_ZPmZZ", /*isReverseInstr*/ 1>;
+ defm FNMSB_ZPmZZ : sve_fp_3op_p_zds_b<0b11, "fnmsb", int_aarch64_sve_fnmsb, "FNMLS_ZPmZZ", /*isReverseInstr*/ 1>;
+
+ // Pseudo instructions selected while the inactive-lane policy is still
+ // open. NOTE(review): these replace the removed FIXME about choosing a
+ // better instruction after regalloc -- presumably expanded to the best
+ // destructive encoding (e.g. FMLA vs. FMAD) late; confirm in the
+ // pseudo-expansion pass.
+ defm FMLA_ZPZZZ : sve_fp_3op_p_zds_zx<int_aarch64_sve_fmla, int_aarch64_sve_fmad>;
+ defm FMLS_ZPZZZ : sve_fp_3op_p_zds_zx<int_aarch64_sve_fmls, int_aarch64_sve_fmsb>;
+ defm FNMLA_ZPZZZ : sve_fp_3op_p_zds_zx<int_aarch64_sve_fnmla, int_aarch64_sve_fnmad>;
+ defm FNMLS_ZPZZZ : sve_fp_3op_p_zds_zx<int_aarch64_sve_fnmls, int_aarch64_sve_fnmsb>;
+
+ // Maps the predicated AArch64fma_p node -- and the combinations of
+ // AArch64fneg_mt applied to its multiplicand and/or addend -- onto the
+ // FMLA/FMLS/FNMLA/FNMLS "_UNDEF" pseudos (disabled lanes undef), and
+ // folds a vselect between an all-active fma and its accumulator into
+ // the merging FMLA/FMLS instructions.
+ // Ty/PredTy: the vector element type and matching predicate type;
+ // Suffix: the instruction-name element-size suffix ("H"/"S"/"D").
+ multiclass fma<ValueType Ty, ValueType PredTy, string Suffix> {
+ // Zd = Za + Zn * Zm
+ def : Pat<(Ty (AArch64fma_p PredTy:$P, Ty:$Zn, Ty:$Zm, Ty:$Za)),
+ (!cast<Instruction>("FMLA_ZPZZZ_UNDEF_"#Suffix) $P, ZPR:$Za, ZPR:$Zn, ZPR:$Zm)>;
+
+ // Zd = Za + -Zn * Zm
+ def : Pat<(Ty (AArch64fma_p PredTy:$P, (AArch64fneg_mt PredTy:$P, Ty:$Zn, (Ty (undef))), Ty:$Zm, Ty:$Za)),
+ (!cast<Instruction>("FMLS_ZPZZZ_UNDEF_"#Suffix) $P, ZPR:$Za, ZPR:$Zn, ZPR:$Zm)>;
+
+ // Zd = -Za + Zn * Zm
+ def : Pat<(Ty (AArch64fma_p PredTy:$P, Ty:$Zn, Ty:$Zm, (AArch64fneg_mt PredTy:$P, Ty:$Za, (Ty (undef))))),
+ (!cast<Instruction>("FNMLS_ZPZZZ_UNDEF_"#Suffix) $P, ZPR:$Za, ZPR:$Zn, ZPR:$Zm)>;
+
+ // Zd = -Za + -Zn * Zm
+ def : Pat<(Ty (AArch64fma_p PredTy:$P, (AArch64fneg_mt PredTy:$P, Ty:$Zn, (Ty (undef))), Ty:$Zm, (AArch64fneg_mt PredTy:$P, Ty:$Za, (Ty (undef))))),
+ (!cast<Instruction>("FNMLA_ZPZZZ_UNDEF_"#Suffix) $P, ZPR:$Za, ZPR:$Zn, ZPR:$Zm)>;
+
+ // Zd = -(Za + Zn * Zm)
+ def : Pat<(AArch64fneg_mt PredTy:$P, (AArch64fma_p PredTy:$P, Ty:$Zn, Ty:$Zm, Ty:$Za), (Ty (undef))),
+ (!cast<Instruction>("FNMLA_ZPZZZ_UNDEF_"#Suffix) $P, ZPR:$Za, ZPR:$Zn, ZPR:$Zm)>;
+
+ // Zda = Zda + Zn * Zm
+ def : Pat<(vselect (PredTy PPR:$Pg), (Ty (AArch64fma_p (PredTy (AArch64ptrue 31)), ZPR:$Zn, ZPR:$Zm, ZPR:$Za)), ZPR:$Za),
+ (!cast<Instruction>("FMLA_ZPmZZ_"#Suffix) PPR:$Pg, ZPR:$Za, ZPR:$Zn, ZPR:$Zm)>;
+
+ // Zda = Zda + -Zn * Zm
+ def : Pat<(vselect (PredTy PPR:$Pg), (Ty (AArch64fma_p (PredTy (AArch64ptrue 31)), (AArch64fneg_mt (PredTy (AArch64ptrue 31)), Ty:$Zn, (Ty (undef))), ZPR:$Zm, ZPR:$Za)), ZPR:$Za),
+ (!cast<Instruction>("FMLS_ZPmZZ_"#Suffix) PPR:$Pg, ZPR:$Za, ZPR:$Zn, ZPR:$Zm)>;
+ }
+
+ // Instantiate for every legal predicate/element-type pairing, including
+ // the unpacked half/float vectors that still use the H/S instructions.
+ defm : fma<nxv8f16, nxv8i1, "H">;
+ defm : fma<nxv4f16, nxv4i1, "H">;
+ defm : fma<nxv2f16, nxv2i1, "H">;
+ defm : fma<nxv4f32, nxv4i1, "S">;
+ defm : fma<nxv2f32, nxv2i1, "S">;
+ defm : fma<nxv2f64, nxv2i1, "D">;
defm FTMAD_ZZI : sve_fp_ftmad<"ftmad", int_aarch64_sve_ftmad_x>;
; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; CHECK-DAG: ld1h { [[OP3:z[0-9]+]].h }, [[PG]]/z, [x2]
-; CHECK: fmla [[OP3]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; CHECK: st1h { [[OP3]].h }, [[PG]], [x0]
+; CHECK: fmad [[OP1]].h, [[PG]]/m, [[OP2]].h, [[OP3]].h
+; CHECK: st1h { [[OP1]].h }, [[PG]], [x0]
; CHECK: ret
%op1 = load <16 x half>, <16 x half>* %a
%op2 = load <16 x half>, <16 x half>* %b
; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; CHECK-DAG: ld1h { [[OP3:z[0-9]+]].h }, [[PG]]/z, [x2]
-; CHECK: fmla [[OP3]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; CHECK: st1h { [[OP3]].h }, [[PG]], [x0]
+; CHECK: fmad [[OP1]].h, [[PG]]/m, [[OP2]].h, [[OP3]].h
+; CHECK: st1h { [[OP1]].h }, [[PG]], [x0]
; CHECK: ret
%op1 = load <32 x half>, <32 x half>* %a
%op2 = load <32 x half>, <32 x half>* %b
; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; CHECK-DAG: ld1h { [[OP3:z[0-9]+]].h }, [[PG]]/z, [x2]
-; CHECK: fmla [[OP3]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; CHECK: st1h { [[OP3]].h }, [[PG]], [x0]
+; CHECK: fmad [[OP1]].h, [[PG]]/m, [[OP2]].h, [[OP3]].h
+; CHECK: st1h { [[OP1]].h }, [[PG]], [x0]
; CHECK: ret
%op1 = load <64 x half>, <64 x half>* %a
%op2 = load <64 x half>, <64 x half>* %b
; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; CHECK-DAG: ld1h { [[OP3:z[0-9]+]].h }, [[PG]]/z, [x2]
-; CHECK: fmla [[OP3]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; CHECK: st1h { [[OP3]].h }, [[PG]], [x0]
+; CHECK: fmad [[OP1]].h, [[PG]]/m, [[OP2]].h, [[OP3]].h
+; CHECK: st1h { [[OP1]].h }, [[PG]], [x0]
; CHECK: ret
%op1 = load <128 x half>, <128 x half>* %a
%op2 = load <128 x half>, <128 x half>* %b
; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; CHECK-DAG: ld1w { [[OP3:z[0-9]+]].s }, [[PG]]/z, [x2]
-; CHECK: fmla [[OP3]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; CHECK: st1w { [[OP3]].s }, [[PG]], [x0]
+; CHECK: fmad [[OP1]].s, [[PG]]/m, [[OP2]].s, [[OP3]].s
+; CHECK: st1w { [[OP1]].s }, [[PG]], [x0]
; CHECK: ret
%op1 = load <8 x float>, <8 x float>* %a
%op2 = load <8 x float>, <8 x float>* %b
; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; CHECK-DAG: ld1w { [[OP3:z[0-9]+]].s }, [[PG]]/z, [x2]
-; CHECK: fmla [[OP3]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; CHECK: st1w { [[OP3]].s }, [[PG]], [x0]
+; CHECK: fmad [[OP1]].s, [[PG]]/m, [[OP2]].s, [[OP3]].s
+; CHECK: st1w { [[OP1]].s }, [[PG]], [x0]
; CHECK: ret
%op1 = load <16 x float>, <16 x float>* %a
%op2 = load <16 x float>, <16 x float>* %b
; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; CHECK-DAG: ld1w { [[OP3:z[0-9]+]].s }, [[PG]]/z, [x2]
-; CHECK: fmla [[OP3]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; CHECK: st1w { [[OP3]].s }, [[PG]], [x0]
+; CHECK: fmad [[OP1]].s, [[PG]]/m, [[OP2]].s, [[OP3]].s
+; CHECK: st1w { [[OP1]].s }, [[PG]], [x0]
; CHECK: ret
%op1 = load <32 x float>, <32 x float>* %a
%op2 = load <32 x float>, <32 x float>* %b
; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; CHECK-DAG: ld1w { [[OP3:z[0-9]+]].s }, [[PG]]/z, [x2]
-; CHECK: fmla [[OP3]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; CHECK: st1w { [[OP3]].s }, [[PG]], [x0]
+; CHECK: fmad [[OP1]].s, [[PG]]/m, [[OP2]].s, [[OP3]].s
+; CHECK: st1w { [[OP1]].s }, [[PG]], [x0]
; CHECK: ret
%op1 = load <64 x float>, <64 x float>* %a
%op2 = load <64 x float>, <64 x float>* %b
; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; CHECK-DAG: ld1d { [[OP3:z[0-9]+]].d }, [[PG]]/z, [x2]
-; CHECK: fmla [[OP3]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[OP3]].d }, [[PG]], [x0]
+; CHECK: fmad [[OP1]].d, [[PG]]/m, [[OP2]].d, [[OP3]].d
+; CHECK: st1d { [[OP1]].d }, [[PG]], [x0]
; CHECK: ret
%op1 = load <4 x double>, <4 x double>* %a
%op2 = load <4 x double>, <4 x double>* %b
; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; CHECK-DAG: ld1d { [[OP3:z[0-9]+]].d }, [[PG]]/z, [x2]
-; CHECK: fmla [[OP3]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[OP3]].d }, [[PG]], [x0]
+; CHECK: fmad [[OP1]].d, [[PG]]/m, [[OP2]].d, [[OP3]].d
+; CHECK: st1d { [[OP1]].d }, [[PG]], [x0]
; CHECK: ret
%op1 = load <8 x double>, <8 x double>* %a
%op2 = load <8 x double>, <8 x double>* %b
; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; CHECK-DAG: ld1d { [[OP3:z[0-9]+]].d }, [[PG]]/z, [x2]
-; CHECK: fmla [[OP3]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[OP3]].d }, [[PG]], [x0]
+; CHECK: fmad [[OP1]].d, [[PG]]/m, [[OP2]].d, [[OP3]].d
+; CHECK: st1d { [[OP1]].d }, [[PG]], [x0]
; CHECK: ret
%op1 = load <16 x double>, <16 x double>* %a
%op2 = load <16 x double>, <16 x double>* %b
; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; CHECK-DAG: ld1d { [[OP3:z[0-9]+]].d }, [[PG]]/z, [x2]
-; CHECK: fmla [[OP3]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[OP3]].d }, [[PG]], [x0]
+; CHECK: fmad [[OP1]].d, [[PG]]/m, [[OP2]].d, [[OP3]].d
+; CHECK: st1d { [[OP1]].d }, [[PG]], [x0]
; CHECK: ret
%op1 = load <32 x double>, <32 x double>* %a
%op2 = load <32 x double>, <32 x double>* %b
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s
+
+define <vscale x 8 x half> @fmla_h_sel(<vscale x 8 x i1> %pred, <vscale x 8 x half> %acc, <vscale x 8 x half> %m1, <vscale x 8 x half> %m2) {
+; CHECK-LABEL: fmla_h_sel:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmla z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 8 x half> %m1, %m2
+ %add = fadd fast <vscale x 8 x half> %acc, %mul
+ %res = select <vscale x 8 x i1> %pred, <vscale x 8 x half> %add, <vscale x 8 x half> %acc
+ ret <vscale x 8 x half> %res
+}
+
+define <vscale x 4 x half> @fmla_hx4_sel(<vscale x 4 x i1> %pred, <vscale x 4 x half> %acc, <vscale x 4 x half> %m1, <vscale x 4 x half> %m2) {
+; CHECK-LABEL: fmla_hx4_sel:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmla z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 4 x half> %m1, %m2
+ %add = fadd fast <vscale x 4 x half> %acc, %mul
+ %res = select <vscale x 4 x i1> %pred, <vscale x 4 x half> %add, <vscale x 4 x half> %acc
+ ret <vscale x 4 x half> %res
+}
+
+define <vscale x 2 x half> @fmla_hx2_sel(<vscale x 2 x i1> %pred, <vscale x 2 x half> %acc, <vscale x 2 x half> %m1, <vscale x 2 x half> %m2) {
+; CHECK-LABEL: fmla_hx2_sel:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmla z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 2 x half> %m1, %m2
+ %add = fadd fast <vscale x 2 x half> %acc, %mul
+ %res = select <vscale x 2 x i1> %pred, <vscale x 2 x half> %add, <vscale x 2 x half> %acc
+ ret <vscale x 2 x half> %res
+}
+
+define <vscale x 4 x float> @fmla_s_sel(<vscale x 4 x i1> %pred, <vscale x 4 x float> %acc, <vscale x 4 x float> %m1, <vscale x 4 x float> %m2) {
+; CHECK-LABEL: fmla_s_sel:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmla z0.s, p0/m, z1.s, z2.s
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 4 x float> %m1, %m2
+ %add = fadd fast <vscale x 4 x float> %acc, %mul
+ %res = select <vscale x 4 x i1> %pred, <vscale x 4 x float> %add, <vscale x 4 x float> %acc
+ ret <vscale x 4 x float> %res
+}
+
+define <vscale x 2 x float> @fmla_sx2_sel(<vscale x 2 x i1> %pred, <vscale x 2 x float> %acc, <vscale x 2 x float> %m1, <vscale x 2 x float> %m2) {
+; CHECK-LABEL: fmla_sx2_sel:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmla z0.s, p0/m, z1.s, z2.s
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 2 x float> %m1, %m2
+ %add = fadd fast <vscale x 2 x float> %acc, %mul
+ %res = select <vscale x 2 x i1> %pred, <vscale x 2 x float> %add, <vscale x 2 x float> %acc
+ ret <vscale x 2 x float> %res
+}
+
+define <vscale x 2 x double> @fmla_d_sel(<vscale x 2 x i1> %pred, <vscale x 2 x double> %acc, <vscale x 2 x double> %m1, <vscale x 2 x double> %m2) {
+; CHECK-LABEL: fmla_d_sel:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmla z0.d, p0/m, z1.d, z2.d
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 2 x double> %m1, %m2
+ %add = fadd fast <vscale x 2 x double> %acc, %mul
+ %res = select <vscale x 2 x i1> %pred, <vscale x 2 x double> %add, <vscale x 2 x double> %acc
+ ret <vscale x 2 x double> %res
+}
+
+define <vscale x 8 x half> @fmls_h_sel(<vscale x 8 x i1> %pred, <vscale x 8 x half> %acc, <vscale x 8 x half> %m1, <vscale x 8 x half> %m2) {
+; CHECK-LABEL: fmls_h_sel:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmls z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 8 x half> %m1, %m2
+ %sub = fsub fast <vscale x 8 x half> %acc, %mul
+ %res = select <vscale x 8 x i1> %pred, <vscale x 8 x half> %sub, <vscale x 8 x half> %acc
+ ret <vscale x 8 x half> %res
+}
+
+define <vscale x 4 x half> @fmls_hx4_sel(<vscale x 4 x i1> %pred, <vscale x 4 x half> %acc, <vscale x 4 x half> %m1, <vscale x 4 x half> %m2) {
+; CHECK-LABEL: fmls_hx4_sel:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmls z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 4 x half> %m1, %m2
+ %sub = fsub fast <vscale x 4 x half> %acc, %mul
+ %res = select <vscale x 4 x i1> %pred, <vscale x 4 x half> %sub, <vscale x 4 x half> %acc
+ ret <vscale x 4 x half> %res
+}
+
+define <vscale x 2 x half> @fmls_hx2_sel(<vscale x 2 x i1> %pred, <vscale x 2 x half> %acc, <vscale x 2 x half> %m1, <vscale x 2 x half> %m2) {
+; CHECK-LABEL: fmls_hx2_sel:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmls z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 2 x half> %m1, %m2
+ %sub = fsub fast <vscale x 2 x half> %acc, %mul
+ %res = select <vscale x 2 x i1> %pred, <vscale x 2 x half> %sub, <vscale x 2 x half> %acc
+ ret <vscale x 2 x half> %res
+}
+
+define <vscale x 4 x float> @fmls_s_sel(<vscale x 4 x i1> %pred, <vscale x 4 x float> %acc, <vscale x 4 x float> %m1, <vscale x 4 x float> %m2) {
+; CHECK-LABEL: fmls_s_sel:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmls z0.s, p0/m, z1.s, z2.s
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 4 x float> %m1, %m2
+ %sub = fsub fast <vscale x 4 x float> %acc, %mul
+ %res = select <vscale x 4 x i1> %pred, <vscale x 4 x float> %sub, <vscale x 4 x float> %acc
+ ret <vscale x 4 x float> %res
+}
+
+define <vscale x 2 x float> @fmls_sx2_sel(<vscale x 2 x i1> %pred, <vscale x 2 x float> %acc, <vscale x 2 x float> %m1, <vscale x 2 x float> %m2) {
+; CHECK-LABEL: fmls_sx2_sel:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmls z0.s, p0/m, z1.s, z2.s
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 2 x float> %m1, %m2
+ %sub = fsub fast <vscale x 2 x float> %acc, %mul
+ %res = select <vscale x 2 x i1> %pred, <vscale x 2 x float> %sub, <vscale x 2 x float> %acc
+ ret <vscale x 2 x float> %res
+}
+
+define <vscale x 2 x double> @fmls_d_sel(<vscale x 2 x i1> %pred, <vscale x 2 x double> %acc, <vscale x 2 x double> %m1, <vscale x 2 x double> %m2) {
+; CHECK-LABEL: fmls_d_sel:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmls z0.d, p0/m, z1.d, z2.d
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 2 x double> %m1, %m2
+ %sub = fsub fast <vscale x 2 x double> %acc, %mul
+ %res = select <vscale x 2 x i1> %pred, <vscale x 2 x double> %sub, <vscale x 2 x double> %acc
+ ret <vscale x 2 x double> %res
+}
+
+define <vscale x 8 x half> @fmad_h(<vscale x 8 x half> %m1, <vscale x 8 x half> %m2, <vscale x 8 x half> %acc) {
+; CHECK-LABEL: fmad_h:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: fmad z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 8 x half> %m1, %m2
+ %res = fadd fast <vscale x 8 x half> %acc, %mul
+ ret <vscale x 8 x half> %res
+}
+
+define <vscale x 4 x half> @fmad_hx4(<vscale x 4 x half> %m1, <vscale x 4 x half> %m2, <vscale x 4 x half> %acc) {
+; CHECK-LABEL: fmad_hx4:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fmad z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 4 x half> %m1, %m2
+ %res = fadd fast <vscale x 4 x half> %acc, %mul
+ ret <vscale x 4 x half> %res
+}
+
+define <vscale x 2 x half> @fmad_hx2(<vscale x 2 x half> %m1, <vscale x 2 x half> %m2, <vscale x 2 x half> %acc) {
+; CHECK-LABEL: fmad_hx2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fmad z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 2 x half> %m1, %m2
+ %res = fadd fast <vscale x 2 x half> %acc, %mul
+ ret <vscale x 2 x half> %res
+}
+
+define <vscale x 4 x float> @fmad_s(<vscale x 4 x float> %m1, <vscale x 4 x float> %m2, <vscale x 4 x float> %acc) {
+; CHECK-LABEL: fmad_s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fmad z0.s, p0/m, z1.s, z2.s
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 4 x float> %m1, %m2
+ %res = fadd fast <vscale x 4 x float> %acc, %mul
+ ret <vscale x 4 x float> %res
+}
+
+define <vscale x 2 x float> @fmad_sx2(<vscale x 2 x float> %m1, <vscale x 2 x float> %m2, <vscale x 2 x float> %acc) {
+; CHECK-LABEL: fmad_sx2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fmad z0.s, p0/m, z1.s, z2.s
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 2 x float> %m1, %m2
+ %res = fadd fast <vscale x 2 x float> %acc, %mul
+ ret <vscale x 2 x float> %res
+}
+
+define <vscale x 2 x double> @fmad_d(<vscale x 2 x double> %m1, <vscale x 2 x double> %m2, <vscale x 2 x double> %acc) {
+; CHECK-LABEL: fmad_d:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fmad z0.d, p0/m, z1.d, z2.d
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 2 x double> %m1, %m2
+ %res = fadd fast <vscale x 2 x double> %acc, %mul
+ ret <vscale x 2 x double> %res
+}
+
+define <vscale x 8 x half> @fmla_h(<vscale x 8 x half> %acc, <vscale x 8 x half> %m1, <vscale x 8 x half> %m2) {
+; CHECK-LABEL: fmla_h:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: fmla z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 8 x half> %m1, %m2
+ %res = fadd fast <vscale x 8 x half> %acc, %mul
+ ret <vscale x 8 x half> %res
+}
+
+define <vscale x 4 x half> @fmla_hx4(<vscale x 4 x half> %acc, <vscale x 4 x half> %m1, <vscale x 4 x half> %m2) {
+; CHECK-LABEL: fmla_hx4:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fmla z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 4 x half> %m1, %m2
+ %res = fadd fast <vscale x 4 x half> %acc, %mul
+ ret <vscale x 4 x half> %res
+}
+
+define <vscale x 2 x half> @fmla_hx2(<vscale x 2 x half> %acc, <vscale x 2 x half> %m1, <vscale x 2 x half> %m2) {
+; CHECK-LABEL: fmla_hx2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fmla z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 2 x half> %m1, %m2
+ %res = fadd fast <vscale x 2 x half> %acc, %mul
+ ret <vscale x 2 x half> %res
+}
+
+define <vscale x 4 x float> @fmla_s(<vscale x 4 x float> %acc, <vscale x 4 x float> %m1, <vscale x 4 x float> %m2) {
+; CHECK-LABEL: fmla_s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fmla z0.s, p0/m, z1.s, z2.s
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 4 x float> %m1, %m2
+ %res = fadd fast <vscale x 4 x float> %acc, %mul
+ ret <vscale x 4 x float> %res
+}
+
+define <vscale x 2 x float> @fmla_sx2(<vscale x 2 x float> %acc, <vscale x 2 x float> %m1, <vscale x 2 x float> %m2) {
+; CHECK-LABEL: fmla_sx2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fmla z0.s, p0/m, z1.s, z2.s
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 2 x float> %m1, %m2
+ %res = fadd fast <vscale x 2 x float> %acc, %mul
+ ret <vscale x 2 x float> %res
+}
+
+define <vscale x 2 x double> @fmla_d(<vscale x 2 x double> %acc, <vscale x 2 x double> %m1, <vscale x 2 x double> %m2) {
+; CHECK-LABEL: fmla_d:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fmla z0.d, p0/m, z1.d, z2.d
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 2 x double> %m1, %m2
+ %res = fadd fast <vscale x 2 x double> %acc, %mul
+ ret <vscale x 2 x double> %res
+}
+
+define <vscale x 8 x half> @fmls_h(<vscale x 8 x half> %acc, <vscale x 8 x half> %m1, <vscale x 8 x half> %m2) {
+; CHECK-LABEL: fmls_h:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: fmls z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 8 x half> %m1, %m2
+ %res = fsub fast <vscale x 8 x half> %acc, %mul
+ ret <vscale x 8 x half> %res
+}
+
+define <vscale x 4 x half> @fmls_hx4(<vscale x 4 x half> %acc, <vscale x 4 x half> %m1, <vscale x 4 x half> %m2) {
+; CHECK-LABEL: fmls_hx4:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fmls z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 4 x half> %m1, %m2
+ %res = fsub fast <vscale x 4 x half> %acc, %mul
+ ret <vscale x 4 x half> %res
+}
+
+define <vscale x 2 x half> @fmls_hx2(<vscale x 2 x half> %acc, <vscale x 2 x half> %m1, <vscale x 2 x half> %m2) {
+; CHECK-LABEL: fmls_hx2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fmls z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 2 x half> %m1, %m2
+ %res = fsub fast <vscale x 2 x half> %acc, %mul
+ ret <vscale x 2 x half> %res
+}
+
+define <vscale x 4 x float> @fmls_s(<vscale x 4 x float> %acc, <vscale x 4 x float> %m1, <vscale x 4 x float> %m2) {
+; CHECK-LABEL: fmls_s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fmls z0.s, p0/m, z1.s, z2.s
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 4 x float> %m1, %m2
+ %res = fsub fast <vscale x 4 x float> %acc, %mul
+ ret <vscale x 4 x float> %res
+}
+
+define <vscale x 2 x float> @fmls_sx2(<vscale x 2 x float> %acc, <vscale x 2 x float> %m1, <vscale x 2 x float> %m2) {
+; CHECK-LABEL: fmls_sx2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fmls z0.s, p0/m, z1.s, z2.s
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 2 x float> %m1, %m2
+ %res = fsub fast <vscale x 2 x float> %acc, %mul
+ ret <vscale x 2 x float> %res
+}
+
+define <vscale x 2 x double> @fmls_d(<vscale x 2 x double> %acc, <vscale x 2 x double> %m1, <vscale x 2 x double> %m2) {
+; CHECK-LABEL: fmls_d:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fmls z0.d, p0/m, z1.d, z2.d
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 2 x double> %m1, %m2
+ %res = fsub fast <vscale x 2 x double> %acc, %mul
+ ret <vscale x 2 x double> %res
+}
+
+define <vscale x 8 x half> @fmsb_h(<vscale x 8 x half> %m1, <vscale x 8 x half> %m2, <vscale x 8 x half> %acc) {
+; CHECK-LABEL: fmsb_h:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: fmsb z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 8 x half> %m1, %m2
+ %res = fsub fast <vscale x 8 x half> %acc, %mul
+ ret <vscale x 8 x half> %res
+}
+
+define <vscale x 4 x half> @fmsb_hx4(<vscale x 4 x half> %m1, <vscale x 4 x half> %m2, <vscale x 4 x half> %acc) {
+; CHECK-LABEL: fmsb_hx4:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fmsb z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 4 x half> %m1, %m2
+ %res = fsub fast <vscale x 4 x half> %acc, %mul
+ ret <vscale x 4 x half> %res
+}
+
+define <vscale x 2 x half> @fmsb_hx2(<vscale x 2 x half> %m1, <vscale x 2 x half> %m2, <vscale x 2 x half> %acc) {
+; CHECK-LABEL: fmsb_hx2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fmsb z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 2 x half> %m1, %m2
+ %res = fsub fast <vscale x 2 x half> %acc, %mul
+ ret <vscale x 2 x half> %res
+}
+
+define <vscale x 4 x float> @fmsb_s(<vscale x 4 x float> %m1, <vscale x 4 x float> %m2, <vscale x 4 x float> %acc) {
+; CHECK-LABEL: fmsb_s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fmsb z0.s, p0/m, z1.s, z2.s
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 4 x float> %m1, %m2
+ %res = fsub fast <vscale x 4 x float> %acc, %mul
+ ret <vscale x 4 x float> %res
+}
+
+define <vscale x 2 x float> @fmsb_sx2(<vscale x 2 x float> %m1, <vscale x 2 x float> %m2, <vscale x 2 x float> %acc) {
+; CHECK-LABEL: fmsb_sx2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fmsb z0.s, p0/m, z1.s, z2.s
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 2 x float> %m1, %m2
+ %res = fsub fast <vscale x 2 x float> %acc, %mul
+ ret <vscale x 2 x float> %res
+}
+
+define <vscale x 2 x double> @fmsb_d(<vscale x 2 x double> %m1, <vscale x 2 x double> %m2, <vscale x 2 x double> %acc) {
+; CHECK-LABEL: fmsb_d:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fmsb z0.d, p0/m, z1.d, z2.d
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 2 x double> %m1, %m2
+ %res = fsub fast <vscale x 2 x double> %acc, %mul
+ ret <vscale x 2 x double> %res
+}
+
+define <vscale x 8 x half> @fnmad_h(<vscale x 8 x half> %m1, <vscale x 8 x half> %m2, <vscale x 8 x half> %acc) {
+; CHECK-LABEL: fnmad_h:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: fnmad z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT: ret
+ %neg_m1 = fneg fast <vscale x 8 x half> %m1
+ %mul = fmul fast <vscale x 8 x half> %neg_m1, %m2
+ %res = fsub fast <vscale x 8 x half> %mul, %acc
+ ret <vscale x 8 x half> %res
+}
+
+define <vscale x 4 x half> @fnmad_hx4(<vscale x 4 x half> %m1, <vscale x 4 x half> %m2, <vscale x 4 x half> %acc) {
+; CHECK-LABEL: fnmad_hx4:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fnmad z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT: ret
+ %neg_m1 = fneg fast <vscale x 4 x half> %m1
+ %mul = fmul fast <vscale x 4 x half> %neg_m1, %m2
+ %res = fsub fast <vscale x 4 x half> %mul, %acc
+ ret <vscale x 4 x half> %res
+}
+
+define <vscale x 2 x half> @fnmad_hx2(<vscale x 2 x half> %m1, <vscale x 2 x half> %m2, <vscale x 2 x half> %acc) {
+; CHECK-LABEL: fnmad_hx2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fnmad z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT: ret
+ %neg_m1 = fneg fast <vscale x 2 x half> %m1
+ %mul = fmul fast <vscale x 2 x half> %neg_m1, %m2
+ %res = fsub fast <vscale x 2 x half> %mul, %acc
+ ret <vscale x 2 x half> %res
+}
+
+define <vscale x 4 x float> @fnmad_s(<vscale x 4 x float> %m1, <vscale x 4 x float> %m2, <vscale x 4 x float> %acc) {
+; CHECK-LABEL: fnmad_s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fnmad z0.s, p0/m, z1.s, z2.s
+; CHECK-NEXT: ret
+ %neg_m1 = fneg fast <vscale x 4 x float> %m1
+ %mul = fmul fast <vscale x 4 x float> %neg_m1, %m2
+ %res = fsub fast <vscale x 4 x float> %mul, %acc
+ ret <vscale x 4 x float> %res
+}
+
+define <vscale x 2 x float> @fnmad_sx2(<vscale x 2 x float> %m1, <vscale x 2 x float> %m2, <vscale x 2 x float> %acc) {
+; CHECK-LABEL: fnmad_sx2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fnmad z0.s, p0/m, z1.s, z2.s
+; CHECK-NEXT: ret
+ %neg_m1 = fneg fast <vscale x 2 x float> %m1
+ %mul = fmul fast <vscale x 2 x float> %neg_m1, %m2
+ %res = fsub fast <vscale x 2 x float> %mul, %acc
+ ret <vscale x 2 x float> %res
+}
+
+define <vscale x 2 x double> @fnmad_d(<vscale x 2 x double> %m1, <vscale x 2 x double> %m2, <vscale x 2 x double> %acc) {
+; CHECK-LABEL: fnmad_d:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fnmad z0.d, p0/m, z1.d, z2.d
+; CHECK-NEXT: ret
+ %neg_m1 = fneg fast <vscale x 2 x double> %m1
+ %mul = fmul fast <vscale x 2 x double> %neg_m1, %m2
+ %res = fsub fast <vscale x 2 x double> %mul, %acc
+ ret <vscale x 2 x double> %res
+}
+
+define <vscale x 8 x half> @fnmla_h(<vscale x 8 x half> %acc, <vscale x 8 x half> %m1, <vscale x 8 x half> %m2) {
+; CHECK-LABEL: fnmla_h:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: fnmla z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT: ret
+ %neg_m1 = fneg fast <vscale x 8 x half> %m1
+ %mul = fmul fast <vscale x 8 x half> %neg_m1, %m2
+ %res = fsub fast <vscale x 8 x half> %mul, %acc
+ ret <vscale x 8 x half> %res
+}
+
+define <vscale x 4 x half> @fnmla_hx4(<vscale x 4 x half> %acc, <vscale x 4 x half> %m1, <vscale x 4 x half> %m2) {
+; CHECK-LABEL: fnmla_hx4:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fnmla z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT: ret
+ %neg_m1 = fneg fast <vscale x 4 x half> %m1
+ %mul = fmul fast <vscale x 4 x half> %neg_m1, %m2
+ %res = fsub fast <vscale x 4 x half> %mul, %acc
+ ret <vscale x 4 x half> %res
+}
+
+define <vscale x 2 x half> @fnmla_hx2(<vscale x 2 x half> %acc, <vscale x 2 x half> %m1, <vscale x 2 x half> %m2) {
+; CHECK-LABEL: fnmla_hx2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fnmla z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT: ret
+ %neg_m1 = fneg fast <vscale x 2 x half> %m1
+ %mul = fmul fast <vscale x 2 x half> %neg_m1, %m2
+ %res = fsub fast <vscale x 2 x half> %mul, %acc
+ ret <vscale x 2 x half> %res
+}
+
+define <vscale x 4 x float> @fnmla_s(<vscale x 4 x float> %acc, <vscale x 4 x float> %m1, <vscale x 4 x float> %m2) {
+; CHECK-LABEL: fnmla_s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fnmla z0.s, p0/m, z1.s, z2.s
+; CHECK-NEXT: ret
+ %neg_m1 = fneg fast <vscale x 4 x float> %m1
+ %mul = fmul fast <vscale x 4 x float> %neg_m1, %m2
+ %res = fsub fast <vscale x 4 x float> %mul, %acc
+ ret <vscale x 4 x float> %res
+}
+
+define <vscale x 2 x float> @fnmla_sx2(<vscale x 2 x float> %acc, <vscale x 2 x float> %m1, <vscale x 2 x float> %m2) {
+; CHECK-LABEL: fnmla_sx2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fnmla z0.s, p0/m, z1.s, z2.s
+; CHECK-NEXT: ret
+ %neg_m1 = fneg fast <vscale x 2 x float> %m1
+ %mul = fmul fast <vscale x 2 x float> %neg_m1, %m2
+ %res = fsub fast <vscale x 2 x float> %mul, %acc
+ ret <vscale x 2 x float> %res
+}
+
+define <vscale x 2 x double> @fnmla_d(<vscale x 2 x double> %acc, <vscale x 2 x double> %m1, <vscale x 2 x double> %m2) {
+; CHECK-LABEL: fnmla_d:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fnmla z0.d, p0/m, z1.d, z2.d
+; CHECK-NEXT: ret
+ %neg_m1 = fneg fast <vscale x 2 x double> %m1
+ %mul = fmul fast <vscale x 2 x double> %neg_m1, %m2
+ %res = fsub fast <vscale x 2 x double> %mul, %acc
+ ret <vscale x 2 x double> %res
+}
+
+define <vscale x 8 x half> @fnmla_h_reversed(<vscale x 8 x half> %acc, <vscale x 8 x half> %m1, <vscale x 8 x half> %m2) {
+; CHECK-LABEL: fnmla_h_reversed:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: fnmla z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 8 x half> %m1, %m2
+ %add = fadd fast <vscale x 8 x half> %mul, %acc
+ %res = fneg fast <vscale x 8 x half> %add
+ ret <vscale x 8 x half> %res
+}
+
+define <vscale x 4 x half> @fnmla_hx4_reversed(<vscale x 4 x half> %acc, <vscale x 4 x half> %m1, <vscale x 4 x half> %m2) {
+; CHECK-LABEL: fnmla_hx4_reversed:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fnmla z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 4 x half> %m1, %m2
+ %add = fadd fast <vscale x 4 x half> %mul, %acc
+ %res = fneg fast <vscale x 4 x half> %add
+ ret <vscale x 4 x half> %res
+}
+
+; Commuted FNMLA form on unpacked nxv2f16; predicate built at the .d container size.
+define <vscale x 2 x half> @fnmla_hx2_reversed(<vscale x 2 x half> %acc, <vscale x 2 x half> %m1, <vscale x 2 x half> %m2) {
+; CHECK-LABEL: fnmla_hx2_reversed:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fnmla z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT: ret
+  %mul = fmul fast <vscale x 2 x half> %m1, %m2
+  %add = fadd fast <vscale x 2 x half> %mul, %acc
+  %res = fneg fast <vscale x 2 x half> %add
+  ret <vscale x 2 x half> %res
+}
+
+; Commuted FNMLA form on full-width nxv4f32.
+define <vscale x 4 x float> @fnmla_s_reversed(<vscale x 4 x float> %acc, <vscale x 4 x float> %m1, <vscale x 4 x float> %m2) {
+; CHECK-LABEL: fnmla_s_reversed:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fnmla z0.s, p0/m, z1.s, z2.s
+; CHECK-NEXT: ret
+  %mul = fmul fast <vscale x 4 x float> %m1, %m2
+  %add = fadd fast <vscale x 4 x float> %mul, %acc
+  %res = fneg fast <vscale x 4 x float> %add
+  ret <vscale x 4 x float> %res
+}
+
+; Commuted FNMLA form on unpacked nxv2f32; predicate built at the .d container size.
+define <vscale x 2 x float> @fnmla_sx2_reversed(<vscale x 2 x float> %acc, <vscale x 2 x float> %m1, <vscale x 2 x float> %m2) {
+; CHECK-LABEL: fnmla_sx2_reversed:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fnmla z0.s, p0/m, z1.s, z2.s
+; CHECK-NEXT: ret
+  %mul = fmul fast <vscale x 2 x float> %m1, %m2
+  %add = fadd fast <vscale x 2 x float> %mul, %acc
+  %res = fneg fast <vscale x 2 x float> %add
+  ret <vscale x 2 x float> %res
+}
+
+; Commuted FNMLA form on full-width nxv2f64.
+define <vscale x 2 x double> @fnmla_d_reversed(<vscale x 2 x double> %acc, <vscale x 2 x double> %m1, <vscale x 2 x double> %m2) {
+; CHECK-LABEL: fnmla_d_reversed:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fnmla z0.d, p0/m, z1.d, z2.d
+; CHECK-NEXT: ret
+  %mul = fmul fast <vscale x 2 x double> %m1, %m2
+  %add = fadd fast <vscale x 2 x double> %mul, %acc
+  %res = fneg fast <vscale x 2 x double> %add
+  ret <vscale x 2 x double> %res
+}
+
+; (m1 * m2) - acc with acc arriving first (in z0) selects the
+; accumulator-destructive FNMLS (full-width nxv8f16).
+define <vscale x 8 x half> @fnmls_h(<vscale x 8 x half> %acc, <vscale x 8 x half> %m1, <vscale x 8 x half> %m2) {
+; CHECK-LABEL: fnmls_h:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: fnmls z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT: ret
+  %mul = fmul fast <vscale x 8 x half> %m1, %m2
+  %res = fsub fast <vscale x 8 x half> %mul, %acc
+  ret <vscale x 8 x half> %res
+}
+
+; FNMLS on unpacked nxv4f16; predicate built at the .s container size.
+define <vscale x 4 x half> @fnmls_hx4(<vscale x 4 x half> %acc, <vscale x 4 x half> %m1, <vscale x 4 x half> %m2) {
+; CHECK-LABEL: fnmls_hx4:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fnmls z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT: ret
+  %mul = fmul fast <vscale x 4 x half> %m1, %m2
+  %res = fsub fast <vscale x 4 x half> %mul, %acc
+  ret <vscale x 4 x half> %res
+}
+
+; FNMLS on unpacked nxv2f16; predicate built at the .d container size.
+define <vscale x 2 x half> @fnmls_hx2(<vscale x 2 x half> %acc, <vscale x 2 x half> %m1, <vscale x 2 x half> %m2) {
+; CHECK-LABEL: fnmls_hx2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fnmls z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT: ret
+  %mul = fmul fast <vscale x 2 x half> %m1, %m2
+  %res = fsub fast <vscale x 2 x half> %mul, %acc
+  ret <vscale x 2 x half> %res
+}
+
+; FNMLS on full-width nxv4f32.
+define <vscale x 4 x float> @fnmls_s(<vscale x 4 x float> %acc, <vscale x 4 x float> %m1, <vscale x 4 x float> %m2) {
+; CHECK-LABEL: fnmls_s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fnmls z0.s, p0/m, z1.s, z2.s
+; CHECK-NEXT: ret
+  %mul = fmul fast <vscale x 4 x float> %m1, %m2
+  %res = fsub fast <vscale x 4 x float> %mul, %acc
+  ret <vscale x 4 x float> %res
+}
+
+; FNMLS on unpacked nxv2f32; predicate built at the .d container size.
+define <vscale x 2 x float> @fnmls_sx2(<vscale x 2 x float> %acc, <vscale x 2 x float> %m1, <vscale x 2 x float> %m2) {
+; CHECK-LABEL: fnmls_sx2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fnmls z0.s, p0/m, z1.s, z2.s
+; CHECK-NEXT: ret
+  %mul = fmul fast <vscale x 2 x float> %m1, %m2
+  %res = fsub fast <vscale x 2 x float> %mul, %acc
+  ret <vscale x 2 x float> %res
+}
+
+; FNMLS on full-width nxv2f64.
+define <vscale x 2 x double> @fnmls_d(<vscale x 2 x double> %acc, <vscale x 2 x double> %m1, <vscale x 2 x double> %m2) {
+; CHECK-LABEL: fnmls_d:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fnmls z0.d, p0/m, z1.d, z2.d
+; CHECK-NEXT: ret
+  %mul = fmul fast <vscale x 2 x double> %m1, %m2
+  %res = fsub fast <vscale x 2 x double> %mul, %acc
+  ret <vscale x 2 x double> %res
+}
+
+; Same (m1 * m2) - acc computation as the fnmls tests, but acc arrives last
+; (in z2), so the multiplicand-destructive FNMSB form is chosen instead.
+define <vscale x 8 x half> @fnmsb_h(<vscale x 8 x half> %m1, <vscale x 8 x half> %m2, <vscale x 8 x half> %acc) {
+; CHECK-LABEL: fnmsb_h:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: fnmsb z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT: ret
+  %mul = fmul fast <vscale x 8 x half> %m1, %m2
+  %res = fsub fast <vscale x 8 x half> %mul, %acc
+  ret <vscale x 8 x half> %res
+}
+
+; FNMSB on unpacked nxv4f16; predicate built at the .s container size.
+define <vscale x 4 x half> @fnmsb_hx4(<vscale x 4 x half> %m1, <vscale x 4 x half> %m2, <vscale x 4 x half> %acc) {
+; CHECK-LABEL: fnmsb_hx4:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fnmsb z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT: ret
+  %mul = fmul fast <vscale x 4 x half> %m1, %m2
+  %res = fsub fast <vscale x 4 x half> %mul, %acc
+  ret <vscale x 4 x half> %res
+}
+
+; FNMSB on unpacked nxv2f16; predicate built at the .d container size.
+define <vscale x 2 x half> @fnmsb_hx2(<vscale x 2 x half> %m1, <vscale x 2 x half> %m2, <vscale x 2 x half> %acc) {
+; CHECK-LABEL: fnmsb_hx2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fnmsb z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT: ret
+  %mul = fmul fast <vscale x 2 x half> %m1, %m2
+  %res = fsub fast <vscale x 2 x half> %mul, %acc
+  ret <vscale x 2 x half> %res
+}
+
+; FNMSB on full-width nxv4f32.
+define <vscale x 4 x float> @fnmsb_s(<vscale x 4 x float> %m1, <vscale x 4 x float> %m2, <vscale x 4 x float> %acc) {
+; CHECK-LABEL: fnmsb_s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fnmsb z0.s, p0/m, z1.s, z2.s
+; CHECK-NEXT: ret
+  %mul = fmul fast <vscale x 4 x float> %m1, %m2
+  %res = fsub fast <vscale x 4 x float> %mul, %acc
+  ret <vscale x 4 x float> %res
+}
+
+; FNMSB on unpacked nxv2f32; predicate built at the .d container size.
+define <vscale x 2 x float> @fnmsb_sx2(<vscale x 2 x float> %m1, <vscale x 2 x float> %m2, <vscale x 2 x float> %acc) {
+; CHECK-LABEL: fnmsb_sx2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fnmsb z0.s, p0/m, z1.s, z2.s
+; CHECK-NEXT: ret
+  %mul = fmul fast <vscale x 2 x float> %m1, %m2
+  %res = fsub fast <vscale x 2 x float> %mul, %acc
+  ret <vscale x 2 x float> %res
+}
+
+; FNMSB on full-width nxv2f64.
+define <vscale x 2 x double> @fnmsb_d(<vscale x 2 x double> %m1, <vscale x 2 x double> %m2, <vscale x 2 x double> %acc) {
+; CHECK-LABEL: fnmsb_d:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fnmsb z0.d, p0/m, z1.d, z2.d
+; CHECK-NEXT: ret
+  %mul = fmul fast <vscale x 2 x double> %m1, %m2
+  %res = fsub fast <vscale x 2 x double> %mul, %acc
+  ret <vscale x 2 x double> %res
+}