From 8bad8a43c339729bf722d519c3a25708a54bc205 Mon Sep 17 00:00:00 2001 From: Bradley Smith Date: Mon, 8 Feb 2021 16:52:19 +0000 Subject: [PATCH] [AArch64][SVE] Add patterns to generate FMLA/FMLS/FNMLA/FNMLS/FMAD Adjust generateFMAsInMachineCombiner to return false if SVE is present in order to combine fmul+fadd into fma. Also add new pseudo instructions so as to select the most appropriate of FMLA/FMAD depending on register allocation. Depends on D96599 Differential Revision: https://reviews.llvm.org/D96424 --- llvm/include/llvm/CodeGen/SelectionDAGTargetInfo.h | 6 - llvm/include/llvm/CodeGen/TargetLowering.h | 7 + llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 4 +- .../Target/AArch64/AArch64ExpandPseudoInsts.cpp | 28 +- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 5 + llvm/lib/Target/AArch64/AArch64ISelLowering.h | 3 + llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td | 76 ++- .../lib/Target/AArch64/AArch64SelectionDAGInfo.cpp | 4 - llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h | 1 - llvm/lib/Target/AArch64/SVEInstrFormats.td | 41 +- .../CodeGen/AArch64/sve-fixed-length-fp-arith.ll | 48 +- llvm/test/CodeGen/AArch64/sve-fp-combine.ll | 746 +++++++++++++++++++++ llvm/test/CodeGen/AArch64/sve-fp.ll | 21 +- 13 files changed, 903 insertions(+), 87 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/sve-fp-combine.ll diff --git a/llvm/include/llvm/CodeGen/SelectionDAGTargetInfo.h b/llvm/include/llvm/CodeGen/SelectionDAGTargetInfo.h index 78f6fc6..722c327 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAGTargetInfo.h +++ b/llvm/include/llvm/CodeGen/SelectionDAGTargetInfo.h @@ -155,12 +155,6 @@ public: return SDValue(); } - // Return true when the decision to generate FMA's (or FMS, FMLA etc) rather - // than FMUL and ADD is delegated to the machine combiner. - virtual bool generateFMAsInMachineCombiner(CodeGenOpt::Level OptLevel) const { - return false; - } - // Return true if the DAG Combiner should disable generic combines. virtual bool disableGenericCombines(CodeGenOpt::Level OptLevel) const { return false; diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 002b526..5d090f2 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -2711,6 +2711,13 @@ public: return isOperationLegal(ISD::FMAD, N->getValueType(0)); } + // Return true when the decision to generate FMA's (or FMS, FMLA etc) rather + // than FMUL and ADD is delegated to the machine combiner. + virtual bool generateFMAsInMachineCombiner(EVT VT, + CodeGenOpt::Level OptLevel) const { + return false; + } + /// Return true if it's profitable to narrow operations of type VT1 to /// VT2. e.g. on x86, it's profitable to narrow from i32 to i8 but not from /// i32 to i16. diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 6a04ba7..737997a 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -12618,7 +12618,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { if (!AllowFusionGlobally && !isContractable(N)) return SDValue(); - if (STI && STI->generateFMAsInMachineCombiner(OptLevel)) + if (TLI.generateFMAsInMachineCombiner(VT, OptLevel)) return SDValue(); // Always prefer FMAD to FMA for precision. 
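As an aside on the two hunks above: the hook moves from SelectionDAGTargetInfo to TargetLowering and gains an EVT parameter, so the answer can now differ per value type. Below is a minimal standalone C++ sketch of the decision the AArch64 override makes; OptLevel and VT here are stand-ins for CodeGenOpt::Level and EVT, not the real LLVM types.

  #include <cstdio>

  // Stand-in for CodeGenOpt::Level.
  enum class OptLevel { None, Less, Default, Aggressive };

  // Stand-in for EVT; only the query the predicate needs.
  struct VT {
    bool Scalable; // true for SVE types such as nxv4f32
    bool isScalableVector() const { return Scalable; }
  };

  // Mirrors the intent of the new AArch64 override: defer fmul+fadd fusion
  // to the machine combiner only for fixed-width types at aggressive opt
  // levels; scalable types return false, so DAGCombiner fuses them itself.
  static bool generateFMAsInMachineCombiner(VT Ty, OptLevel OL) {
    return OL >= OptLevel::Aggressive && !Ty.isScalableVector();
  }

  int main() {
    std::printf("fixed, -O3:    %d\n",
                generateFMAsInMachineCombiner({false}, OptLevel::Aggressive)); // 1
    std::printf("scalable, -O3: %d\n",
                generateFMAsInMachineCombiner({true}, OptLevel::Aggressive));  // 0
    std::printf("scalable, -O1: %d\n",
                generateFMAsInMachineCombiner({true}, OptLevel::Less));        // 0
  }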
@@ -12827,7 +12827,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { if (!AllowFusionGlobally && !isContractable(N)) return SDValue(); - if (STI && STI->generateFMAsInMachineCombiner(OptLevel)) + if (TLI.generateFMAsInMachineCombiner(VT, OptLevel)) return SDValue(); // Always prefer FMAD to FMA for precision. diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index f72d3cf..c96777e 100644 --- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -406,7 +406,7 @@ bool AArch64ExpandPseudo::expand_DestructiveOp( assert(DstReg != MI.getOperand(3).getReg()); bool UseRev = false; - unsigned PredIdx, DOPIdx, SrcIdx; + unsigned PredIdx, DOPIdx, SrcIdx, Src2Idx; switch (DType) { case AArch64::DestructiveBinaryComm: case AArch64::DestructiveBinaryCommWithRev: @@ -420,7 +420,19 @@ bool AArch64ExpandPseudo::expand_DestructiveOp( case AArch64::DestructiveBinary: case AArch64::DestructiveBinaryImm: std::tie(PredIdx, DOPIdx, SrcIdx) = std::make_tuple(1, 2, 3); - break; + break; + case AArch64::DestructiveTernaryCommWithRev: + std::tie(PredIdx, DOPIdx, SrcIdx, Src2Idx) = std::make_tuple(1, 2, 3, 4); + if (DstReg == MI.getOperand(3).getReg()) { + // FMLA Zd, Pg, Za, Zd, Zm ==> FMAD Zdn, Pg, Zm, Za + std::tie(PredIdx, DOPIdx, SrcIdx, Src2Idx) = std::make_tuple(1, 3, 4, 2); + UseRev = true; + } else if (DstReg == MI.getOperand(4).getReg()) { + // FMLA Zd, Pg, Za, Zm, Zd ==> FMAD Zdn, Pg, Zm, Za + std::tie(PredIdx, DOPIdx, SrcIdx, Src2Idx) = std::make_tuple(1, 4, 3, 2); + UseRev = true; + } + break; default: llvm_unreachable("Unsupported Destructive Operand type"); } @@ -440,6 +452,12 @@ bool AArch64ExpandPseudo::expand_DestructiveOp( case AArch64::DestructiveBinaryImm: DOPRegIsUnique = true; break; + case AArch64::DestructiveTernaryCommWithRev: + DOPRegIsUnique = + DstReg != MI.getOperand(DOPIdx).getReg() || + (MI.getOperand(DOPIdx).getReg() != MI.getOperand(SrcIdx).getReg() && + MI.getOperand(DOPIdx).getReg() != MI.getOperand(Src2Idx).getReg()); + break; } #endif @@ -522,6 +540,12 @@ bool AArch64ExpandPseudo::expand_DestructiveOp( .addReg(MI.getOperand(DOPIdx).getReg(), RegState::Kill) .add(MI.getOperand(SrcIdx)); break; + case AArch64::DestructiveTernaryCommWithRev: + DOP.add(MI.getOperand(PredIdx)) + .addReg(MI.getOperand(DOPIdx).getReg(), RegState::Kill) + .add(MI.getOperand(SrcIdx)) + .add(MI.getOperand(Src2Idx)); + break; } if (PRFX) { diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 710c9d1..624f89b 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -11592,6 +11592,11 @@ bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(const Function &F, } } +bool AArch64TargetLowering::generateFMAsInMachineCombiner( + EVT VT, CodeGenOpt::Level OptLevel) const { + return (OptLevel >= CodeGenOpt::Aggressive) && !VT.isScalableVector(); +} + const MCPhysReg * AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const { // LR is a callee-save register, but we must treat it as clobbered by any call diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index c9c7b6f..8db2c23 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -595,6 +595,9 @@ public: EVT VT) const override; bool isFMAFasterThanFMulAndFAdd(const Function &F, 
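To make the DestructiveTernaryCommWithRev expansion above concrete: the pseudo's operands are (Zd, Pg, Za, Zn, Zm), and the operand indices are permuted whenever the register allocator put the destination on one of the multiplicands, so the reversed FMAD form can be emitted instead of FMLA. The following is a self-contained C++ toy model of that index selection; pickOperandOrder and the plain operand array are illustrative stand-ins for the real MachineInstr logic, not code from the patch.

  #include <cassert>

  // Toy operand layout of the ternary pseudo, as in the patch:
  //   0: Zd (def)   1: Pg   2: Za (accumulator)   3: Zn   4: Zm
  struct Indices { unsigned Pred, DOP, Src, Src2; bool UseRev; };

  // If Zd was allocated to Zn or Zm, use the FMAD-style (reversed) operand
  // order so the emitted instruction is still destructive on its destination.
  static Indices pickOperandOrder(const unsigned Ops[5]) {
    unsigned DstReg = Ops[0];
    if (DstReg == Ops[3]) // FMLA Zd, Pg, Za, Zd, Zm ==> FMAD Zdn, Pg, Zm, Za
      return {1, 3, 4, 2, true};
    if (DstReg == Ops[4]) // FMLA Zd, Pg, Za, Zm, Zd ==> FMAD Zdn, Pg, Zm, Za
      return {1, 4, 3, 2, true};
    return {1, 2, 3, 4, false}; // otherwise keep the accumulating FMLA order
  }

  int main() {
    unsigned DstOnZn[5] = {7, 0, 5, 7, 6};  // Zd == Zn: expect the reversed form
    assert(pickOperandOrder(DstOnZn).UseRev && pickOperandOrder(DstOnZn).DOP == 3);
    unsigned Distinct[5] = {7, 0, 5, 8, 6}; // all distinct: plain FMLA order
    assert(!pickOperandOrder(Distinct).UseRev);
    return 0;
  }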
Type *Ty) const override; + bool generateFMAsInMachineCombiner(EVT VT, + CodeGenOpt::Level OptLevel) const override; + const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override; /// Returns false if N is a bit extraction pattern of (X >> C) & Mask. diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 95ea929..1f11d82 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -458,31 +458,57 @@ let Predicates = [HasSVE] in { defm FCADD_ZPmZ : sve_fp_fcadd<"fcadd", int_aarch64_sve_fcadd>; defm FCMLA_ZPmZZ : sve_fp_fcmla<"fcmla", int_aarch64_sve_fcmla>; - defm FMLA_ZPmZZ : sve_fp_3op_p_zds_a<0b00, "fmla", int_aarch64_sve_fmla>; - defm FMLS_ZPmZZ : sve_fp_3op_p_zds_a<0b01, "fmls", int_aarch64_sve_fmls>; - defm FNMLA_ZPmZZ : sve_fp_3op_p_zds_a<0b10, "fnmla", int_aarch64_sve_fnmla>; - defm FNMLS_ZPmZZ : sve_fp_3op_p_zds_a<0b11, "fnmls", int_aarch64_sve_fnmls>; - - defm FMAD_ZPmZZ : sve_fp_3op_p_zds_b<0b00, "fmad", int_aarch64_sve_fmad>; - defm FMSB_ZPmZZ : sve_fp_3op_p_zds_b<0b01, "fmsb", int_aarch64_sve_fmsb>; - defm FNMAD_ZPmZZ : sve_fp_3op_p_zds_b<0b10, "fnmad", int_aarch64_sve_fnmad>; - defm FNMSB_ZPmZZ : sve_fp_3op_p_zds_b<0b11, "fnmsb", int_aarch64_sve_fnmsb>; - - // Add patterns for FMA where disabled lanes are undef. - // FIXME: Implement a pseudo so we can choose a better instruction after - // regalloc. - def : Pat<(nxv8f16 (AArch64fma_p nxv8i1:$P, nxv8f16:$Op1, nxv8f16:$Op2, nxv8f16:$Op3)), - (FMLA_ZPmZZ_H $P, $Op3, $Op1, $Op2)>; - def : Pat<(nxv4f16 (AArch64fma_p nxv4i1:$P, nxv4f16:$Op1, nxv4f16:$Op2, nxv4f16:$Op3)), - (FMLA_ZPmZZ_H $P, $Op3, $Op1, $Op2)>; - def : Pat<(nxv2f16 (AArch64fma_p nxv2i1:$P, nxv2f16:$Op1, nxv2f16:$Op2, nxv2f16:$Op3)), - (FMLA_ZPmZZ_H $P, $Op3, $Op1, $Op2)>; - def : Pat<(nxv4f32 (AArch64fma_p nxv4i1:$P, nxv4f32:$Op1, nxv4f32:$Op2, nxv4f32:$Op3)), - (FMLA_ZPmZZ_S $P, $Op3, $Op1, $Op2)>; - def : Pat<(nxv2f32 (AArch64fma_p nxv2i1:$P, nxv2f32:$Op1, nxv2f32:$Op2, nxv2f32:$Op3)), - (FMLA_ZPmZZ_S $P, $Op3, $Op1, $Op2)>; - def : Pat<(nxv2f64 (AArch64fma_p nxv2i1:$P, nxv2f64:$Op1, nxv2f64:$Op2, nxv2f64:$Op3)), - (FMLA_ZPmZZ_D $P, $Op3, $Op1, $Op2)>; + defm FMLA_ZPmZZ : sve_fp_3op_p_zds_a<0b00, "fmla", "FMLA_ZPZZZ", int_aarch64_sve_fmla, "FMAD_ZPmZZ">; + defm FMLS_ZPmZZ : sve_fp_3op_p_zds_a<0b01, "fmls", "FMLS_ZPZZZ", int_aarch64_sve_fmls, "FMSB_ZPmZZ">; + defm FNMLA_ZPmZZ : sve_fp_3op_p_zds_a<0b10, "fnmla", "FNMLA_ZPZZZ", int_aarch64_sve_fnmla, "FNMAD_ZPmZZ">; + defm FNMLS_ZPmZZ : sve_fp_3op_p_zds_a<0b11, "fnmls", "FNMLS_ZPZZZ", int_aarch64_sve_fnmls, "FNMSB_ZPmZZ">; + + defm FMAD_ZPmZZ : sve_fp_3op_p_zds_b<0b00, "fmad", int_aarch64_sve_fmad, "FMLA_ZPmZZ", /*isReverseInstr*/ 1>; + defm FMSB_ZPmZZ : sve_fp_3op_p_zds_b<0b01, "fmsb", int_aarch64_sve_fmsb, "FMLS_ZPmZZ", /*isReverseInstr*/ 1>; + defm FNMAD_ZPmZZ : sve_fp_3op_p_zds_b<0b10, "fnmad", int_aarch64_sve_fnmad, "FNMLA_ZPmZZ", /*isReverseInstr*/ 1>; + defm FNMSB_ZPmZZ : sve_fp_3op_p_zds_b<0b11, "fnmsb", int_aarch64_sve_fnmsb, "FNMLS_ZPmZZ", /*isReverseInstr*/ 1>; + + defm FMLA_ZPZZZ : sve_fp_3op_p_zds_zx; + defm FMLS_ZPZZZ : sve_fp_3op_p_zds_zx; + defm FNMLA_ZPZZZ : sve_fp_3op_p_zds_zx; + defm FNMLS_ZPZZZ : sve_fp_3op_p_zds_zx; + + multiclass fma { + // Zd = Za + Zn * Zm + def : Pat<(Ty (AArch64fma_p PredTy:$P, Ty:$Zn, Ty:$Zm, Ty:$Za)), + (!cast("FMLA_ZPZZZ_UNDEF_"#Suffix) $P, ZPR:$Za, ZPR:$Zn, ZPR:$Zm)>; + + // Zd = Za + -Zn * Zm + def : Pat<(Ty (AArch64fma_p PredTy:$P, 
(AArch64fneg_mt PredTy:$P, Ty:$Zn, (Ty (undef))), Ty:$Zm, Ty:$Za)), + (!cast("FMLS_ZPZZZ_UNDEF_"#Suffix) $P, ZPR:$Za, ZPR:$Zn, ZPR:$Zm)>; + + // Zd = -Za + Zn * Zm + def : Pat<(Ty (AArch64fma_p PredTy:$P, Ty:$Zn, Ty:$Zm, (AArch64fneg_mt PredTy:$P, Ty:$Za, (Ty (undef))))), + (!cast("FNMLS_ZPZZZ_UNDEF_"#Suffix) $P, ZPR:$Za, ZPR:$Zn, ZPR:$Zm)>; + + // Zd = -Za + -Zn * Zm + def : Pat<(Ty (AArch64fma_p PredTy:$P, (AArch64fneg_mt PredTy:$P, Ty:$Zn, (Ty (undef))), Ty:$Zm, (AArch64fneg_mt PredTy:$P, Ty:$Za, (Ty (undef))))), + (!cast("FNMLA_ZPZZZ_UNDEF_"#Suffix) $P, ZPR:$Za, ZPR:$Zn, ZPR:$Zm)>; + + // Zd = -(Za + Zn * Zm) + def : Pat<(AArch64fneg_mt PredTy:$P, (AArch64fma_p PredTy:$P, Ty:$Zn, Ty:$Zm, Ty:$Za), (Ty (undef))), + (!cast("FNMLA_ZPZZZ_UNDEF_"#Suffix) $P, ZPR:$Za, ZPR:$Zn, ZPR:$Zm)>; + + // Zda = Zda + Zn * Zm + def : Pat<(vselect (PredTy PPR:$Pg), (Ty (AArch64fma_p (PredTy (AArch64ptrue 31)), ZPR:$Zn, ZPR:$Zm, ZPR:$Za)), ZPR:$Za), + (!cast("FMLA_ZPmZZ_"#Suffix) PPR:$Pg, ZPR:$Za, ZPR:$Zn, ZPR:$Zm)>; + + // Zda = Zda + -Zn * Zm + def : Pat<(vselect (PredTy PPR:$Pg), (Ty (AArch64fma_p (PredTy (AArch64ptrue 31)), (AArch64fneg_mt (PredTy (AArch64ptrue 31)), Ty:$Zn, (Ty (undef))), ZPR:$Zm, ZPR:$Za)), ZPR:$Za), + (!cast("FMLS_ZPmZZ_"#Suffix) PPR:$Pg, ZPR:$Za, ZPR:$Zn, ZPR:$Zm)>; + } + + defm : fma; + defm : fma; + defm : fma; + defm : fma; + defm : fma; + defm : fma; defm FTMAD_ZZI : sve_fp_ftmad<"ftmad", int_aarch64_sve_ftmad_x>; diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp index a5bc366..3eb4c04 100644 --- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp @@ -52,10 +52,6 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset( } return SDValue(); } -bool AArch64SelectionDAGInfo::generateFMAsInMachineCombiner( - CodeGenOpt::Level OptLevel) const { - return OptLevel >= CodeGenOpt::Aggressive; -} static const int kSetTagLoopThreshold = 176; diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h index d94fd84..7d53bd4 100644 --- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h +++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h @@ -28,7 +28,6 @@ public: SDValue Chain, SDValue Op1, SDValue Op2, MachinePointerInfo DstPtrInfo, bool ZeroData) const override; - bool generateFMAsInMachineCombiner(CodeGenOpt::Level OptLevel) const override; }; } diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index 9e7ff1c..178c83b 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -491,6 +491,13 @@ let hasNoSchedulingInfo = 1 in { Pseudo<(outs zprty:$Zd), (ins PPR3bAny:$Pg, zprty:$Zs1, immty:$imm), []> { let FalseLanes = flags; } + + class PredThreeOpPseudo + : SVEPseudo2Instr, + Pseudo<(outs zprty:$Zd), (ins PPR3bAny:$Pg, zprty:$Zs1, zprty:$Zs2, zprty:$Zs3), []> { + let FalseLanes = flags; + } } //===----------------------------------------------------------------------===// @@ -1762,14 +1769,20 @@ class sve_fp_3op_p_zds_a sz, bits<2> opc, string asm, ZPRRegOp zprty> let Inst{4-0} = Zda; let Constraints = "$Zda = $_Zda"; - let DestructiveInstType = DestructiveOther; let ElementSize = zprty.ElementSize; } -multiclass sve_fp_3op_p_zds_a opc, string asm, SDPatternOperator op> { - def _H : sve_fp_3op_p_zds_a<0b01, opc, asm, ZPR16>; - def _S : sve_fp_3op_p_zds_a<0b10, opc, asm, ZPR32>; - def _D : 
sve_fp_3op_p_zds_a<0b11, opc, asm, ZPR64>; +multiclass sve_fp_3op_p_zds_a opc, string asm, string Ps, + SDPatternOperator op, string revname, + bit isReverseInstr=0> { + let DestructiveInstType = DestructiveTernaryCommWithRev in { + def _H : sve_fp_3op_p_zds_a<0b01, opc, asm, ZPR16>, + SVEPseudo2Instr, SVEInstr2Rev; + def _S : sve_fp_3op_p_zds_a<0b10, opc, asm, ZPR32>, + SVEPseudo2Instr, SVEInstr2Rev; + def _D : sve_fp_3op_p_zds_a<0b11, opc, asm, ZPR64>, + SVEPseudo2Instr, SVEInstr2Rev; + } def : SVE_4_Op_Pat(NAME # _H)>; def : SVE_4_Op_Pat(NAME # _S)>; @@ -1801,16 +1814,26 @@ class sve_fp_3op_p_zds_b sz, bits<2> opc, string asm, let ElementSize = zprty.ElementSize; } -multiclass sve_fp_3op_p_zds_b opc, string asm, SDPatternOperator op> { - def _H : sve_fp_3op_p_zds_b<0b01, opc, asm, ZPR16>; - def _S : sve_fp_3op_p_zds_b<0b10, opc, asm, ZPR32>; - def _D : sve_fp_3op_p_zds_b<0b11, opc, asm, ZPR64>; +multiclass sve_fp_3op_p_zds_b opc, string asm, SDPatternOperator op, + string revname, bit isReverseInstr> { + def _H : sve_fp_3op_p_zds_b<0b01, opc, asm, ZPR16>, + SVEInstr2Rev; + def _S : sve_fp_3op_p_zds_b<0b10, opc, asm, ZPR32>, + SVEInstr2Rev; + def _D : sve_fp_3op_p_zds_b<0b11, opc, asm, ZPR64>, + SVEInstr2Rev; def : SVE_4_Op_Pat(NAME # _H)>; def : SVE_4_Op_Pat(NAME # _S)>; def : SVE_4_Op_Pat(NAME # _D)>; } +multiclass sve_fp_3op_p_zds_zx { + def _UNDEF_H : PredThreeOpPseudo; + def _UNDEF_S : PredThreeOpPseudo; + def _UNDEF_D : PredThreeOpPseudo; +} + //===----------------------------------------------------------------------===// // SVE Floating Point Multiply-Add - Indexed Group //===----------------------------------------------------------------------===// diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-arith.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-arith.ll index 407b527..fdd0acd 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-arith.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-arith.ll @@ -572,8 +572,8 @@ define void @fma_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x half>* %c) #0 { ; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] ; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] ; CHECK-DAG: ld1h { [[OP3:z[0-9]+]].h }, [[PG]]/z, [x2] -; CHECK: fmla [[OP3]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h -; CHECK: st1h { [[OP3]].h }, [[PG]], [x0] +; CHECK: fmad [[OP1]].h, [[PG]]/m, [[OP2]].h, [[OP3]].h +; CHECK: st1h { [[OP1]].h }, [[PG]], [x0] ; CHECK: ret %op1 = load <16 x half>, <16 x half>* %a %op2 = load <16 x half>, <16 x half>* %b @@ -589,8 +589,8 @@ define void @fma_v32f16(<32 x half>* %a, <32 x half>* %b, <32 x half>* %c) #0 { ; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] ; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] ; CHECK-DAG: ld1h { [[OP3:z[0-9]+]].h }, [[PG]]/z, [x2] -; CHECK: fmla [[OP3]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h -; CHECK: st1h { [[OP3]].h }, [[PG]], [x0] +; CHECK: fmad [[OP1]].h, [[PG]]/m, [[OP2]].h, [[OP3]].h +; CHECK: st1h { [[OP1]].h }, [[PG]], [x0] ; CHECK: ret %op1 = load <32 x half>, <32 x half>* %a %op2 = load <32 x half>, <32 x half>* %b @@ -606,8 +606,8 @@ define void @fma_v64f16(<64 x half>* %a, <64 x half>* %b, <64 x half>* %c) #0 { ; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] ; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] ; CHECK-DAG: ld1h { [[OP3:z[0-9]+]].h }, [[PG]]/z, [x2] -; CHECK: fmla [[OP3]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h -; CHECK: st1h { [[OP3]].h }, [[PG]], [x0] +; CHECK: fmad [[OP1]].h, [[PG]]/m, [[OP2]].h, [[OP3]].h +; CHECK: st1h { [[OP1]].h }, [[PG]], 
[x0] ; CHECK: ret %op1 = load <64 x half>, <64 x half>* %a %op2 = load <64 x half>, <64 x half>* %b @@ -623,8 +623,8 @@ define void @fma_v128f16(<128 x half>* %a, <128 x half>* %b, <128 x half>* %c) # ; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] ; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] ; CHECK-DAG: ld1h { [[OP3:z[0-9]+]].h }, [[PG]]/z, [x2] -; CHECK: fmla [[OP3]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h -; CHECK: st1h { [[OP3]].h }, [[PG]], [x0] +; CHECK: fmad [[OP1]].h, [[PG]]/m, [[OP2]].h, [[OP3]].h +; CHECK: st1h { [[OP1]].h }, [[PG]], [x0] ; CHECK: ret %op1 = load <128 x half>, <128 x half>* %a %op2 = load <128 x half>, <128 x half>* %b @@ -658,8 +658,8 @@ define void @fma_v8f32(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) #0 { ; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] ; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] ; CHECK-DAG: ld1w { [[OP3:z[0-9]+]].s }, [[PG]]/z, [x2] -; CHECK: fmla [[OP3]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s -; CHECK: st1w { [[OP3]].s }, [[PG]], [x0] +; CHECK: fmad [[OP1]].s, [[PG]]/m, [[OP2]].s, [[OP3]].s +; CHECK: st1w { [[OP1]].s }, [[PG]], [x0] ; CHECK: ret %op1 = load <8 x float>, <8 x float>* %a %op2 = load <8 x float>, <8 x float>* %b @@ -675,8 +675,8 @@ define void @fma_v16f32(<16 x float>* %a, <16 x float>* %b, <16 x float>* %c) #0 ; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] ; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] ; CHECK-DAG: ld1w { [[OP3:z[0-9]+]].s }, [[PG]]/z, [x2] -; CHECK: fmla [[OP3]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s -; CHECK: st1w { [[OP3]].s }, [[PG]], [x0] +; CHECK: fmad [[OP1]].s, [[PG]]/m, [[OP2]].s, [[OP3]].s +; CHECK: st1w { [[OP1]].s }, [[PG]], [x0] ; CHECK: ret %op1 = load <16 x float>, <16 x float>* %a %op2 = load <16 x float>, <16 x float>* %b @@ -692,8 +692,8 @@ define void @fma_v32f32(<32 x float>* %a, <32 x float>* %b, <32 x float>* %c) #0 ; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] ; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] ; CHECK-DAG: ld1w { [[OP3:z[0-9]+]].s }, [[PG]]/z, [x2] -; CHECK: fmla [[OP3]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s -; CHECK: st1w { [[OP3]].s }, [[PG]], [x0] +; CHECK: fmad [[OP1]].s, [[PG]]/m, [[OP2]].s, [[OP3]].s +; CHECK: st1w { [[OP1]].s }, [[PG]], [x0] ; CHECK: ret %op1 = load <32 x float>, <32 x float>* %a %op2 = load <32 x float>, <32 x float>* %b @@ -709,8 +709,8 @@ define void @fma_v64f32(<64 x float>* %a, <64 x float>* %b, <64 x float>* %c) #0 ; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] ; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] ; CHECK-DAG: ld1w { [[OP3:z[0-9]+]].s }, [[PG]]/z, [x2] -; CHECK: fmla [[OP3]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s -; CHECK: st1w { [[OP3]].s }, [[PG]], [x0] +; CHECK: fmad [[OP1]].s, [[PG]]/m, [[OP2]].s, [[OP3]].s +; CHECK: st1w { [[OP1]].s }, [[PG]], [x0] ; CHECK: ret %op1 = load <64 x float>, <64 x float>* %a %op2 = load <64 x float>, <64 x float>* %b @@ -744,8 +744,8 @@ define void @fma_v4f64(<4 x double>* %a, <4 x double>* %b, <4 x double>* %c) #0 ; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0] ; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] ; CHECK-DAG: ld1d { [[OP3:z[0-9]+]].d }, [[PG]]/z, [x2] -; CHECK: fmla [[OP3]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d -; CHECK: st1d { [[OP3]].d }, [[PG]], [x0] +; CHECK: fmad [[OP1]].d, [[PG]]/m, [[OP2]].d, [[OP3]].d +; CHECK: st1d { [[OP1]].d }, [[PG]], [x0] ; CHECK: ret %op1 = load <4 x double>, <4 x double>* %a %op2 = load <4 x double>, <4 x double>* %b @@ -761,8 +761,8 @@ define void @fma_v8f64(<8 x 
double>* %a, <8 x double>* %b, <8 x double>* %c) #0 ; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0] ; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] ; CHECK-DAG: ld1d { [[OP3:z[0-9]+]].d }, [[PG]]/z, [x2] -; CHECK: fmla [[OP3]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d -; CHECK: st1d { [[OP3]].d }, [[PG]], [x0] +; CHECK: fmad [[OP1]].d, [[PG]]/m, [[OP2]].d, [[OP3]].d +; CHECK: st1d { [[OP1]].d }, [[PG]], [x0] ; CHECK: ret %op1 = load <8 x double>, <8 x double>* %a %op2 = load <8 x double>, <8 x double>* %b @@ -778,8 +778,8 @@ define void @fma_v16f64(<16 x double>* %a, <16 x double>* %b, <16 x double>* %c) ; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0] ; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] ; CHECK-DAG: ld1d { [[OP3:z[0-9]+]].d }, [[PG]]/z, [x2] -; CHECK: fmla [[OP3]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d -; CHECK: st1d { [[OP3]].d }, [[PG]], [x0] +; CHECK: fmad [[OP1]].d, [[PG]]/m, [[OP2]].d, [[OP3]].d +; CHECK: st1d { [[OP1]].d }, [[PG]], [x0] ; CHECK: ret %op1 = load <16 x double>, <16 x double>* %a %op2 = load <16 x double>, <16 x double>* %b @@ -795,8 +795,8 @@ define void @fma_v32f64(<32 x double>* %a, <32 x double>* %b, <32 x double>* %c) ; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0] ; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] ; CHECK-DAG: ld1d { [[OP3:z[0-9]+]].d }, [[PG]]/z, [x2] -; CHECK: fmla [[OP3]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d -; CHECK: st1d { [[OP3]].d }, [[PG]], [x0] +; CHECK: fmad [[OP1]].d, [[PG]]/m, [[OP2]].d, [[OP3]].d +; CHECK: st1d { [[OP1]].d }, [[PG]], [x0] ; CHECK: ret %op1 = load <32 x double>, <32 x double>* %a %op2 = load <32 x double>, <32 x double>* %b diff --git a/llvm/test/CodeGen/AArch64/sve-fp-combine.ll b/llvm/test/CodeGen/AArch64/sve-fp-combine.ll new file mode 100644 index 0000000..6fcf45d --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-fp-combine.ll @@ -0,0 +1,746 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s + +define @fmla_h_sel( %pred, %acc, %m1, %m2) { +; CHECK-LABEL: fmla_h_sel: +; CHECK: // %bb.0: +; CHECK-NEXT: fmla z0.h, p0/m, z1.h, z2.h +; CHECK-NEXT: ret + %mul = fmul fast %m1, %m2 + %add = fadd fast %acc, %mul + %res = select %pred, %add, %acc + ret %res +} + +define @fmla_hx4_sel( %pred, %acc, %m1, %m2) { +; CHECK-LABEL: fmla_hx4_sel: +; CHECK: // %bb.0: +; CHECK-NEXT: fmla z0.h, p0/m, z1.h, z2.h +; CHECK-NEXT: ret + %mul = fmul fast %m1, %m2 + %add = fadd fast %acc, %mul + %res = select %pred, %add, %acc + ret %res +} + +define @fmla_hx2_sel( %pred, %acc, %m1, %m2) { +; CHECK-LABEL: fmla_hx2_sel: +; CHECK: // %bb.0: +; CHECK-NEXT: fmla z0.h, p0/m, z1.h, z2.h +; CHECK-NEXT: ret + %mul = fmul fast %m1, %m2 + %add = fadd fast %acc, %mul + %res = select %pred, %add, %acc + ret %res +} + +define @fmla_s_sel( %pred, %acc, %m1, %m2) { +; CHECK-LABEL: fmla_s_sel: +; CHECK: // %bb.0: +; CHECK-NEXT: fmla z0.s, p0/m, z1.s, z2.s +; CHECK-NEXT: ret + %mul = fmul fast %m1, %m2 + %add = fadd fast %acc, %mul + %res = select %pred, %add, %acc + ret %res +} + +define @fmla_sx2_sel( %pred, %acc, %m1, %m2) { +; CHECK-LABEL: fmla_sx2_sel: +; CHECK: // %bb.0: +; CHECK-NEXT: fmla z0.s, p0/m, z1.s, z2.s +; CHECK-NEXT: ret + %mul = fmul fast %m1, %m2 + %add = fadd fast %acc, %mul + %res = select %pred, %add, %acc + ret %res +} + +define @fmla_d_sel( %pred, %acc, %m1, %m2) { +; CHECK-LABEL: fmla_d_sel: +; CHECK: // %bb.0: +; CHECK-NEXT: fmla z0.d, p0/m, z1.d, z2.d +; CHECK-NEXT: ret + %mul = fmul fast 
%m1, %m2 + %add = fadd fast %acc, %mul + %res = select %pred, %add, %acc + ret %res +} + +define @fmls_h_sel( %pred, %acc, %m1, %m2) { +; CHECK-LABEL: fmls_h_sel: +; CHECK: // %bb.0: +; CHECK-NEXT: fmls z0.h, p0/m, z1.h, z2.h +; CHECK-NEXT: ret + %mul = fmul fast %m1, %m2 + %sub = fsub fast %acc, %mul + %res = select %pred, %sub, %acc + ret %res +} + +define @fmls_hx4_sel( %pred, %acc, %m1, %m2) { +; CHECK-LABEL: fmls_hx4_sel: +; CHECK: // %bb.0: +; CHECK-NEXT: fmls z0.h, p0/m, z1.h, z2.h +; CHECK-NEXT: ret + %mul = fmul fast %m1, %m2 + %sub = fsub fast %acc, %mul + %res = select %pred, %sub, %acc + ret %res +} + +define @fmls_hx2_sel( %pred, %acc, %m1, %m2) { +; CHECK-LABEL: fmls_hx2_sel: +; CHECK: // %bb.0: +; CHECK-NEXT: fmls z0.h, p0/m, z1.h, z2.h +; CHECK-NEXT: ret + %mul = fmul fast %m1, %m2 + %sub = fsub fast %acc, %mul + %res = select %pred, %sub, %acc + ret %res +} + +define @fmls_s_sel( %pred, %acc, %m1, %m2) { +; CHECK-LABEL: fmls_s_sel: +; CHECK: // %bb.0: +; CHECK-NEXT: fmls z0.s, p0/m, z1.s, z2.s +; CHECK-NEXT: ret + %mul = fmul fast %m1, %m2 + %sub = fsub fast %acc, %mul + %res = select %pred, %sub, %acc + ret %res +} + +define @fmls_sx2_sel( %pred, %acc, %m1, %m2) { +; CHECK-LABEL: fmls_sx2_sel: +; CHECK: // %bb.0: +; CHECK-NEXT: fmls z0.s, p0/m, z1.s, z2.s +; CHECK-NEXT: ret + %mul = fmul fast %m1, %m2 + %sub = fsub fast %acc, %mul + %res = select %pred, %sub, %acc + ret %res +} + +define @fmls_d_sel( %pred, %acc, %m1, %m2) { +; CHECK-LABEL: fmls_d_sel: +; CHECK: // %bb.0: +; CHECK-NEXT: fmls z0.d, p0/m, z1.d, z2.d +; CHECK-NEXT: ret + %mul = fmul fast %m1, %m2 + %sub = fsub fast %acc, %mul + %res = select %pred, %sub, %acc + ret %res +} + +define @fmad_h( %m1, %m2, %acc) { +; CHECK-LABEL: fmad_h: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: fmad z0.h, p0/m, z1.h, z2.h +; CHECK-NEXT: ret + %mul = fmul fast %m1, %m2 + %res = fadd fast %acc, %mul + ret %res +} + +define @fmad_hx4( %m1, %m2, %acc) { +; CHECK-LABEL: fmad_hx4: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fmad z0.h, p0/m, z1.h, z2.h +; CHECK-NEXT: ret + %mul = fmul fast %m1, %m2 + %res = fadd fast %acc, %mul + ret %res +} + +define @fmad_hx2( %m1, %m2, %acc) { +; CHECK-LABEL: fmad_hx2: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fmad z0.h, p0/m, z1.h, z2.h +; CHECK-NEXT: ret + %mul = fmul fast %m1, %m2 + %res = fadd fast %acc, %mul + ret %res +} + +define @fmad_s( %m1, %m2, %acc) { +; CHECK-LABEL: fmad_s: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fmad z0.s, p0/m, z1.s, z2.s +; CHECK-NEXT: ret + %mul = fmul fast %m1, %m2 + %res = fadd fast %acc, %mul + ret %res +} + +define @fmad_sx2( %m1, %m2, %acc) { +; CHECK-LABEL: fmad_sx2: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fmad z0.s, p0/m, z1.s, z2.s +; CHECK-NEXT: ret + %mul = fmul fast %m1, %m2 + %res = fadd fast %acc, %mul + ret %res +} + +define @fmad_d( %m1, %m2, %acc) { +; CHECK-LABEL: fmad_d: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fmad z0.d, p0/m, z1.d, z2.d +; CHECK-NEXT: ret + %mul = fmul fast %m1, %m2 + %res = fadd fast %acc, %mul + ret %res +} + +define @fmla_h( %acc, %m1, %m2) { +; CHECK-LABEL: fmla_h: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: fmla z0.h, p0/m, z1.h, z2.h +; CHECK-NEXT: ret + %mul = fmul fast %m1, %m2 + %res = fadd fast %acc, %mul + ret %res +} + +define @fmla_hx4( %acc, %m1, %m2) { +; CHECK-LABEL: fmla_hx4: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fmla z0.h, p0/m, z1.h, z2.h +; CHECK-NEXT: ret 
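The *_sel tests in this file exercise the new vselect patterns: an all-lanes multiply-add whose result is then selected against the accumulator computes the same vector as a single merging FMLA, because inactive lanes end up holding the accumulator either way. A lane-wise C++ reference model of that equivalence follows; the fixed-size float arrays and the helper names fmaThenSelect/mergingFmla are stand-ins for one SVE vector and are not from the patch.

  #include <cassert>
  #include <cstddef>

  constexpr std::size_t N = 8; // stand-in for one SVE vector's worth of lanes

  // What the IR does: a ptrue-predicated (all-lanes) fused multiply-add,
  // followed by select(pred, fma, acc).
  static void fmaThenSelect(const bool *Pred, const float *Acc, const float *M1,
                            const float *M2, float *Out) {
    float Tmp[N];
    for (std::size_t I = 0; I != N; ++I)
      Tmp[I] = Acc[I] + M1[I] * M2[I];
    for (std::size_t I = 0; I != N; ++I)
      Out[I] = Pred[I] ? Tmp[I] : Acc[I];
  }

  // What the pattern selects: one merging FMLA, where lanes with a false
  // predicate simply keep the accumulator value.
  static void mergingFmla(const bool *Pred, float *Acc, const float *M1,
                          const float *M2) {
    for (std::size_t I = 0; I != N; ++I)
      if (Pred[I])
        Acc[I] += M1[I] * M2[I];
  }

  int main() {
    bool Pred[N] = {true, false, true, true, false, false, true, false};
    float Acc[N], Ref[N], M1[N], M2[N];
    for (std::size_t I = 0; I != N; ++I) {
      Acc[I] = float(I);
      M1[I] = 0.5f * float(I + 1);
      M2[I] = 2.0f;
    }
    fmaThenSelect(Pred, Acc, M1, M2, Ref);
    mergingFmla(Pred, Acc, M1, M2);
    for (std::size_t I = 0; I != N; ++I)
      assert(Acc[I] == Ref[I]); // identical on every lane, active or not
    return 0;
  }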
+ %mul = fmul fast %m1, %m2 + %res = fadd fast %acc, %mul + ret %res +} + +define @fmla_hx2( %acc, %m1, %m2) { +; CHECK-LABEL: fmla_hx2: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fmla z0.h, p0/m, z1.h, z2.h +; CHECK-NEXT: ret + %mul = fmul fast %m1, %m2 + %res = fadd fast %acc, %mul + ret %res +} + +define @fmla_s( %acc, %m1, %m2) { +; CHECK-LABEL: fmla_s: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fmla z0.s, p0/m, z1.s, z2.s +; CHECK-NEXT: ret + %mul = fmul fast %m1, %m2 + %res = fadd fast %acc, %mul + ret %res +} + +define @fmla_sx2( %acc, %m1, %m2) { +; CHECK-LABEL: fmla_sx2: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fmla z0.s, p0/m, z1.s, z2.s +; CHECK-NEXT: ret + %mul = fmul fast %m1, %m2 + %res = fadd fast %acc, %mul + ret %res +} + +define @fmla_d( %acc, %m1, %m2) { +; CHECK-LABEL: fmla_d: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fmla z0.d, p0/m, z1.d, z2.d +; CHECK-NEXT: ret + %mul = fmul fast %m1, %m2 + %res = fadd fast %acc, %mul + ret %res +} + +define @fmls_h( %acc, %m1, %m2) { +; CHECK-LABEL: fmls_h: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: fmls z0.h, p0/m, z1.h, z2.h +; CHECK-NEXT: ret + %mul = fmul fast %m1, %m2 + %res = fsub fast %acc, %mul + ret %res +} + +define @fmls_hx4( %acc, %m1, %m2) { +; CHECK-LABEL: fmls_hx4: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fmls z0.h, p0/m, z1.h, z2.h +; CHECK-NEXT: ret + %mul = fmul fast %m1, %m2 + %res = fsub fast %acc, %mul + ret %res +} + +define @fmls_hx2( %acc, %m1, %m2) { +; CHECK-LABEL: fmls_hx2: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fmls z0.h, p0/m, z1.h, z2.h +; CHECK-NEXT: ret + %mul = fmul fast %m1, %m2 + %res = fsub fast %acc, %mul + ret %res +} + +define @fmls_s( %acc, %m1, %m2) { +; CHECK-LABEL: fmls_s: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fmls z0.s, p0/m, z1.s, z2.s +; CHECK-NEXT: ret + %mul = fmul fast %m1, %m2 + %res = fsub fast %acc, %mul + ret %res +} + +define @fmls_sx2( %acc, %m1, %m2) { +; CHECK-LABEL: fmls_sx2: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fmls z0.s, p0/m, z1.s, z2.s +; CHECK-NEXT: ret + %mul = fmul fast %m1, %m2 + %res = fsub fast %acc, %mul + ret %res +} + +define @fmls_d( %acc, %m1, %m2) { +; CHECK-LABEL: fmls_d: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fmls z0.d, p0/m, z1.d, z2.d +; CHECK-NEXT: ret + %mul = fmul fast %m1, %m2 + %res = fsub fast %acc, %mul + ret %res +} + +define @fmsb_h( %m1, %m2, %acc) { +; CHECK-LABEL: fmsb_h: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: fmsb z0.h, p0/m, z1.h, z2.h +; CHECK-NEXT: ret + %mul = fmul fast %m1, %m2 + %res = fsub fast %acc, %mul + ret %res +} + +define @fmsb_hx4( %m1, %m2, %acc) { +; CHECK-LABEL: fmsb_hx4: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fmsb z0.h, p0/m, z1.h, z2.h +; CHECK-NEXT: ret + %mul = fmul fast %m1, %m2 + %res = fsub fast %acc, %mul + ret %res +} + +define @fmsb_hx2( %m1, %m2, %acc) { +; CHECK-LABEL: fmsb_hx2: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fmsb z0.h, p0/m, z1.h, z2.h +; CHECK-NEXT: ret + %mul = fmul fast %m1, %m2 + %res = fsub fast %acc, %mul + ret %res +} + +define @fmsb_s( %m1, %m2, %acc) { +; CHECK-LABEL: fmsb_s: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fmsb z0.s, p0/m, z1.s, z2.s +; CHECK-NEXT: ret + %mul = fmul fast %m1, %m2 + %res = fsub fast %acc, %mul + ret %res +} + +define @fmsb_sx2( %m1, %m2, %acc) { +; CHECK-LABEL: fmsb_sx2: +; CHECK: // 
%bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fmsb z0.s, p0/m, z1.s, z2.s +; CHECK-NEXT: ret + %mul = fmul fast %m1, %m2 + %res = fsub fast %acc, %mul + ret %res +} + +define @fmsb_d( %m1, %m2, %acc) { +; CHECK-LABEL: fmsb_d: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fmsb z0.d, p0/m, z1.d, z2.d +; CHECK-NEXT: ret + %mul = fmul fast %m1, %m2 + %res = fsub fast %acc, %mul + ret %res +} + +define @fnmad_h( %m1, %m2, %acc) { +; CHECK-LABEL: fnmad_h: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: fnmad z0.h, p0/m, z1.h, z2.h +; CHECK-NEXT: ret + %neg_m1 = fneg fast %m1 + %mul = fmul fast %neg_m1, %m2 + %res = fsub fast %mul, %acc + ret %res +} + +define @fnmad_hx4( %m1, %m2, %acc) { +; CHECK-LABEL: fnmad_hx4: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fnmad z0.h, p0/m, z1.h, z2.h +; CHECK-NEXT: ret + %neg_m1 = fneg fast %m1 + %mul = fmul fast %neg_m1, %m2 + %res = fsub fast %mul, %acc + ret %res +} + +define @fnmad_hx2( %m1, %m2, %acc) { +; CHECK-LABEL: fnmad_hx2: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fnmad z0.h, p0/m, z1.h, z2.h +; CHECK-NEXT: ret + %neg_m1 = fneg fast %m1 + %mul = fmul fast %neg_m1, %m2 + %res = fsub fast %mul, %acc + ret %res +} + +define @fnmad_s( %m1, %m2, %acc) { +; CHECK-LABEL: fnmad_s: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fnmad z0.s, p0/m, z1.s, z2.s +; CHECK-NEXT: ret + %neg_m1 = fneg fast %m1 + %mul = fmul fast %neg_m1, %m2 + %res = fsub fast %mul, %acc + ret %res +} + +define @fnmad_sx2( %m1, %m2, %acc) { +; CHECK-LABEL: fnmad_sx2: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fnmad z0.s, p0/m, z1.s, z2.s +; CHECK-NEXT: ret + %neg_m1 = fneg fast %m1 + %mul = fmul fast %neg_m1, %m2 + %res = fsub fast %mul, %acc + ret %res +} + +define @fnmad_d( %m1, %m2, %acc) { +; CHECK-LABEL: fnmad_d: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fnmad z0.d, p0/m, z1.d, z2.d +; CHECK-NEXT: ret + %neg_m1 = fneg fast %m1 + %mul = fmul fast %neg_m1, %m2 + %res = fsub fast %mul, %acc + ret %res +} + +define @fnmla_h( %acc, %m1, %m2) { +; CHECK-LABEL: fnmla_h: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: fnmla z0.h, p0/m, z1.h, z2.h +; CHECK-NEXT: ret + %neg_m1 = fneg fast %m1 + %mul = fmul fast %neg_m1, %m2 + %res = fsub fast %mul, %acc + ret %res +} + +define @fnmla_hx4( %acc, %m1, %m2) { +; CHECK-LABEL: fnmla_hx4: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fnmla z0.h, p0/m, z1.h, z2.h +; CHECK-NEXT: ret + %neg_m1 = fneg fast %m1 + %mul = fmul fast %neg_m1, %m2 + %res = fsub fast %mul, %acc + ret %res +} + +define @fnmla_hx2( %acc, %m1, %m2) { +; CHECK-LABEL: fnmla_hx2: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fnmla z0.h, p0/m, z1.h, z2.h +; CHECK-NEXT: ret + %neg_m1 = fneg fast %m1 + %mul = fmul fast %neg_m1, %m2 + %res = fsub fast %mul, %acc + ret %res +} + +define @fnmla_s( %acc, %m1, %m2) { +; CHECK-LABEL: fnmla_s: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fnmla z0.s, p0/m, z1.s, z2.s +; CHECK-NEXT: ret + %neg_m1 = fneg fast %m1 + %mul = fmul fast %neg_m1, %m2 + %res = fsub fast %mul, %acc + ret %res +} + +define @fnmla_sx2( %acc, %m1, %m2) { +; CHECK-LABEL: fnmla_sx2: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fnmla z0.s, p0/m, z1.s, z2.s +; CHECK-NEXT: ret + %neg_m1 = fneg fast %m1 + %mul = fmul fast %neg_m1, %m2 + %res = fsub fast %mul, %acc + ret %res +} + +define @fnmla_d( %acc, %m1, %m2) { +; CHECK-LABEL: fnmla_d: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d 
+; CHECK-NEXT: fnmla z0.d, p0/m, z1.d, z2.d +; CHECK-NEXT: ret + %neg_m1 = fneg fast %m1 + %mul = fmul fast %neg_m1, %m2 + %res = fsub fast %mul, %acc + ret %res +} + +define @fnmla_h_reversed( %acc, %m1, %m2) { +; CHECK-LABEL: fnmla_h_reversed: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: fnmla z0.h, p0/m, z1.h, z2.h +; CHECK-NEXT: ret + %mul = fmul fast %m1, %m2 + %add = fadd fast %mul, %acc + %res = fneg fast %add + ret %res +} + +define @fnmla_hx4_reversed( %acc, %m1, %m2) { +; CHECK-LABEL: fnmla_hx4_reversed: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fnmla z0.h, p0/m, z1.h, z2.h +; CHECK-NEXT: ret + %mul = fmul fast %m1, %m2 + %add = fadd fast %mul, %acc + %res = fneg fast %add + ret %res +} + +define @fnmla_hx2_reversed( %acc, %m1, %m2) { +; CHECK-LABEL: fnmla_hx2_reversed: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fnmla z0.h, p0/m, z1.h, z2.h +; CHECK-NEXT: ret + %mul = fmul fast %m1, %m2 + %add = fadd fast %mul, %acc + %res = fneg fast %add + ret %res +} + +define @fnmla_s_reversed( %acc, %m1, %m2) { +; CHECK-LABEL: fnmla_s_reversed: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fnmla z0.s, p0/m, z1.s, z2.s +; CHECK-NEXT: ret + %mul = fmul fast %m1, %m2 + %add = fadd fast %mul, %acc + %res = fneg fast %add + ret %res +} + +define @fnmla_sx2_reversed( %acc, %m1, %m2) { +; CHECK-LABEL: fnmla_sx2_reversed: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fnmla z0.s, p0/m, z1.s, z2.s +; CHECK-NEXT: ret + %mul = fmul fast %m1, %m2 + %add = fadd fast %mul, %acc + %res = fneg fast %add + ret %res +} + +define @fnmla_d_reversed( %acc, %m1, %m2) { +; CHECK-LABEL: fnmla_d_reversed: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fnmla z0.d, p0/m, z1.d, z2.d +; CHECK-NEXT: ret + %mul = fmul fast %m1, %m2 + %add = fadd fast %mul, %acc + %res = fneg fast %add + ret %res +} + +define @fnmls_h( %acc, %m1, %m2) { +; CHECK-LABEL: fnmls_h: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: fnmls z0.h, p0/m, z1.h, z2.h +; CHECK-NEXT: ret + %mul = fmul fast %m1, %m2 + %res = fsub fast %mul, %acc + ret %res +} + +define @fnmls_hx4( %acc, %m1, %m2) { +; CHECK-LABEL: fnmls_hx4: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fnmls z0.h, p0/m, z1.h, z2.h +; CHECK-NEXT: ret + %mul = fmul fast %m1, %m2 + %res = fsub fast %mul, %acc + ret %res +} + +define @fnmls_hx2( %acc, %m1, %m2) { +; CHECK-LABEL: fnmls_hx2: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fnmls z0.h, p0/m, z1.h, z2.h +; CHECK-NEXT: ret + %mul = fmul fast %m1, %m2 + %res = fsub fast %mul, %acc + ret %res +} + +define @fnmls_s( %acc, %m1, %m2) { +; CHECK-LABEL: fnmls_s: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fnmls z0.s, p0/m, z1.s, z2.s +; CHECK-NEXT: ret + %mul = fmul fast %m1, %m2 + %res = fsub fast %mul, %acc + ret %res +} + +define @fnmls_sx2( %acc, %m1, %m2) { +; CHECK-LABEL: fnmls_sx2: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fnmls z0.s, p0/m, z1.s, z2.s +; CHECK-NEXT: ret + %mul = fmul fast %m1, %m2 + %res = fsub fast %mul, %acc + ret %res +} + +define @fnmls_d( %acc, %m1, %m2) { +; CHECK-LABEL: fnmls_d: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fnmls z0.d, p0/m, z1.d, z2.d +; CHECK-NEXT: ret + %mul = fmul fast %m1, %m2 + %res = fsub fast %mul, %acc + ret %res +} + +define @fnmsb_h( %m1, %m2, %acc) { +; CHECK-LABEL: fnmsb_h: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: fnmsb z0.h, p0/m, z1.h, z2.h +; CHECK-NEXT: ret + %mul = fmul 
fast %m1, %m2 + %res = fsub fast %mul, %acc + ret %res +} + +define @fnmsb_hx4( %m1, %m2, %acc) { +; CHECK-LABEL: fnmsb_hx4: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fnmsb z0.h, p0/m, z1.h, z2.h +; CHECK-NEXT: ret + %mul = fmul fast %m1, %m2 + %res = fsub fast %mul, %acc + ret %res +} + +define @fnmsb_hx2( %m1, %m2, %acc) { +; CHECK-LABEL: fnmsb_hx2: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fnmsb z0.h, p0/m, z1.h, z2.h +; CHECK-NEXT: ret + %mul = fmul fast %m1, %m2 + %res = fsub fast %mul, %acc + ret %res +} + +define @fnmsb_s( %m1, %m2, %acc) { +; CHECK-LABEL: fnmsb_s: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fnmsb z0.s, p0/m, z1.s, z2.s +; CHECK-NEXT: ret + %mul = fmul fast %m1, %m2 + %res = fsub fast %mul, %acc + ret %res +} + +define @fnmsb_sx2( %m1, %m2, %acc) { +; CHECK-LABEL: fnmsb_sx2: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fnmsb z0.s, p0/m, z1.s, z2.s +; CHECK-NEXT: ret + %mul = fmul fast %m1, %m2 + %res = fsub fast %mul, %acc + ret %res +} + +define @fnmsb_d( %m1, %m2, %acc) { +; CHECK-LABEL: fnmsb_d: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fnmsb z0.d, p0/m, z1.d, z2.d +; CHECK-NEXT: ret + %mul = fmul fast %m1, %m2 + %res = fsub fast %mul, %acc + ret %res +} diff --git a/llvm/test/CodeGen/AArch64/sve-fp.ll b/llvm/test/CodeGen/AArch64/sve-fp.ll index 2c1f8df..b441cb7 100644 --- a/llvm/test/CodeGen/AArch64/sve-fp.ll +++ b/llvm/test/CodeGen/AArch64/sve-fp.ll @@ -240,8 +240,7 @@ define @fma_nxv8f16( %a, @llvm.fma.nxv8f16( %a, %b, %c) ret %r @@ -251,8 +250,7 @@ define @fma_nxv4f16( %a, @llvm.fma.nxv4f16( %a, %b, %c) ret %r @@ -262,8 +260,7 @@ define @fma_nxv2f16( %a, @llvm.fma.nxv2f16( %a, %b, %c) ret %r @@ -273,8 +270,7 @@ define @fma_nxv4f32( %a, @llvm.fma.nxv4f32( %a, %b, %c) ret %r @@ -284,8 +280,7 @@ define @fma_nxv2f32( %a, @llvm.fma.nxv2f32( %a, %b, %c) ret %r @@ -295,8 +290,7 @@ define @fma_nxv2f64_1( %a, @llvm.fma.nxv2f64( %a, %b, %c) ret %r @@ -306,8 +300,7 @@ define @fma_nxv2f64_2( %a, @llvm.fma.nxv2f64( %b, %a, %c) ret %r -- 2.7.4
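Two asides on the negated-form tests in sve-fp-combine.ll above. First, the fnmla_*_reversed functions depend on the algebraic identity behind the last FNMLA pattern: negating the whole multiply-add equals negating both the accumulator and one multiplicand. Here is a short C++ check of the identities the fma multiclass patterns encode; fmaStep is an illustrative helper, and the sample values are exactly representable, so every expression below evaluates exactly and the asserts hold bit-for-bit.

  #include <cassert>

  // One multiply-add step: acc + n*m, matching AArch64fma_p's
  // (n, m, acc) operand order.
  static double fmaStep(double N, double M, double A) { return A + N * M; }

  int main() {
    const double N = 1.25, M = -3.0, A = 0.75; // exactly representable

    // FMLS:  acc + (-n)*m   ==  acc - n*m
    assert(fmaStep(-N, M, A) == A - N * M);

    // FNMLS: (-acc) + n*m   ==  n*m - acc
    assert(fmaStep(N, M, -A) == N * M - A);

    // FNMLA: (-acc) + (-n)*m  ==  -(acc + n*m); the right-hand side is the
    // shape the fnmla_*_reversed tests use (fneg of the whole fma).
    assert(fmaStep(-N, M, -A) == -(A + N * M));
    return 0;
  }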
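Second, on why the new _UNDEF pseudos can defer the FMLA/FMAD choice to register allocation at all: with inactive lanes undefined, the accumulator-destructive FMLA and the multiplicand-destructive FMAD compute identical active lanes, so a valid instruction exists whichever input the destination register lands on. A lane-wise C++ sketch of that equivalence follows; the scalar loops are illustrative models of the two instructions, not their authoritative semantics.

  #include <cassert>
  #include <cstddef>

  constexpr std::size_t N = 4;

  // Model of FMLA Zda, Pg/M, Zn, Zm : Zda += Zn * Zm  (destructive on Zda).
  static void fmla(const bool *Pg, float *Zda, const float *Zn, const float *Zm) {
    for (std::size_t I = 0; I != N; ++I)
      if (Pg[I]) Zda[I] += Zn[I] * Zm[I];
  }

  // Model of FMAD Zdn, Pg/M, Zm, Za : Zdn = Zdn * Zm + Za  (destructive on Zdn).
  static void fmad(const bool *Pg, float *Zdn, const float *Zm, const float *Za) {
    for (std::size_t I = 0; I != N; ++I)
      if (Pg[I]) Zdn[I] = Zdn[I] * Zm[I] + Za[I];
  }

  int main() {
    bool Pg[N] = {true, true, false, true};
    float Zn[N] = {1, 2, 3, 4}, Zm[N] = {5, 6, 7, 8}, Za[N] = {9, 10, 11, 12};

    float A[N], B[N]; // A: dest shares the accumulator; B: dest shares Zn
    for (std::size_t I = 0; I != N; ++I) { A[I] = Za[I]; B[I] = Zn[I]; }

    fmla(Pg, A, Zn, Zm);
    fmad(Pg, B, Zm, Za);

    // Active lanes agree (both compute Za + Zn*Zm); only the inactive lane 2
    // differs, and the _UNDEF pseudos declare those lanes dead anyway.
    for (std::size_t I = 0; I != N; ++I)
      if (Pg[I]) assert(A[I] == B[I]);
    return 0;
  }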